diff --git a/cachelib/allocator/CacheAllocator-inl.h b/cachelib/allocator/CacheAllocator-inl.h
index 46b903c22f..1b494d15bb 100644
--- a/cachelib/allocator/CacheAllocator-inl.h
+++ b/cachelib/allocator/CacheAllocator-inl.h
@@ -125,6 +125,7 @@ ShmSegmentOpts CacheAllocator::createShmCacheOpts(TierId tid) {
   ShmSegmentOpts opts;
   opts.alignment = sizeof(Slab);
   opts.typeOpts = memoryTierConfigs[tid].getShmTypeOpts();
+  opts.memBindNumaNodes = memoryTierConfigs[tid].getMemBind();
   if (auto *v = std::get_if<PosixSysVSegmentOpts>(&opts.typeOpts)) {
     v->usePosix = config_.usePosixShm;
   }
diff --git a/cachelib/allocator/MemoryTierCacheConfig.h b/cachelib/allocator/MemoryTierCacheConfig.h
index ae07a92516..662983ea84 100644
--- a/cachelib/allocator/MemoryTierCacheConfig.h
+++ b/cachelib/allocator/MemoryTierCacheConfig.h
@@ -53,6 +53,16 @@ class MemoryTierCacheConfig {
 
   size_t getRatio() const noexcept { return ratio; }
 
+  // Allocate memory only from the specified NUMA nodes
+  MemoryTierCacheConfig& setMemBind(const std::vector<size_t>& _numaNodes) {
+    numaNodes = _numaNodes;
+    return *this;
+  }
+
+  std::vector<size_t> getMemBind() const {
+    return numaNodes;
+  }
+
   size_t calculateTierSize(size_t totalCacheSize, size_t partitionNum) const {
     // TODO: Call this method when tiers are enabled in allocator
     //       to calculate tier sizes in bytes.
@@ -82,6 +92,9 @@ class MemoryTierCacheConfig {
   // Options specific to shm type
   ShmTypeOpts shmOpts;
 
+  // NUMA node(s) to bind the tier to; empty means no binding
+  std::vector<size_t> numaNodes;
+
   MemoryTierCacheConfig() = default;
 };
 } // namespace cachelib
diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp
index 90ef34be41..d378522b22 100644
--- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp
+++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.cpp
@@ -23,9 +23,11 @@ namespace tests {
 using LruAllocatorMemoryTiersTest = AllocatorMemoryTiersTest<LruAllocator>;
 
 // TODO(MEMORY_TIER): add more tests with different eviction policies
-TEST_F(LruAllocatorMemoryTiersTest, MultiTiersInvalid) { this->testMultiTiersInvalid(); }
-TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValid) { this->testMultiTiersValid(); }
+TEST_F(LruAllocatorMemoryTiersTest, MultiTiersFromFileInvalid) { this->testMultiTiersFromFileInvalid(); }
+TEST_F(LruAllocatorMemoryTiersTest, MultiTiersFromFileValid) { this->testMultiTiersFromFileValid(); }
 TEST_F(LruAllocatorMemoryTiersTest, MultiTiersValidMixed) { this->testMultiTiersValidMixed(); }
+TEST_F(LruAllocatorMemoryTiersTest, MultiTiersNumaBindingsSysVValid) { this->testMultiTiersNumaBindingsSysVValid(); }
+TEST_F(LruAllocatorMemoryTiersTest, MultiTiersNumaBindingsPosixValid) { this->testMultiTiersNumaBindingsPosixValid(); }
 } // end of namespace tests
 } // end of namespace cachelib
diff --git a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h
index dba8cfd2dd..16e1f88728 100644
--- a/cachelib/allocator/tests/AllocatorMemoryTiersTest.h
+++ b/cachelib/allocator/tests/AllocatorMemoryTiersTest.h
@@ -27,7 +27,7 @@ namespace tests {
 template <typename AllocatorT>
 class AllocatorMemoryTiersTest : public AllocatorTest<AllocatorT> {
  public:
-  void testMultiTiersInvalid() {
+  void testMultiTiersFromFileInvalid() {
     typename AllocatorT::Config config;
     config.setCacheSize(100 * Slab::kSize);
config.configureMemoryTiers({ @@ -42,7 +42,7 @@ class AllocatorMemoryTiersTest : public AllocatorTest { std::invalid_argument); } - void testMultiTiersValid() { + void testMultiTiersFromFileValid() { typename AllocatorT::Config config; config.setCacheSize(100 * Slab::kSize); config.enableCachePersistence("/tmp"); @@ -83,6 +83,47 @@ class AllocatorMemoryTiersTest : public AllocatorTest { ASSERT(handle != nullptr); ASSERT_NO_THROW(alloc->insertOrReplace(handle)); } + + void testMultiTiersNumaBindingsSysVValid() { + typename AllocatorT::Config config; + config.setCacheSize(100 * Slab::kSize); + config.enableCachePersistence("/tmp"); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind({0}), + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind({0}) + }); + + auto alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + + auto pool = alloc->addPool("default", alloc->getCacheMemoryStats().cacheSize); + auto handle = alloc->allocate(pool, "key", std::string("value").size()); + ASSERT(handle != nullptr); + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } + + void testMultiTiersNumaBindingsPosixValid() { + typename AllocatorT::Config config; + config.setCacheSize(100 * Slab::kSize); + config.enableCachePersistence("/tmp"); + config.usePosixForShm(); + config.configureMemoryTiers({ + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind({0}), + MemoryTierCacheConfig::fromShm() + .setRatio(1).setMemBind({0}) + }); + + auto alloc = std::make_unique(AllocatorT::SharedMemNew, config); + ASSERT(alloc != nullptr); + + auto pool = alloc->addPool("default", alloc->getCacheMemoryStats().cacheSize); + auto handle = alloc->allocate(pool, "key", std::string("value").size()); + ASSERT(handle != nullptr); + ASSERT_NO_THROW(alloc->insertOrReplace(handle)); + } }; } // namespace tests } // namespace cachelib diff --git a/cachelib/cachebench/CMakeLists.txt b/cachelib/cachebench/CMakeLists.txt index 
1a1063104c..f935e6e706 100644
--- a/cachelib/cachebench/CMakeLists.txt
+++ b/cachelib/cachebench/CMakeLists.txt
@@ -89,5 +89,6 @@ if (BUILD_TESTS)
   add_test (consistency/tests/ValueHistoryTest.cpp)
   add_test (consistency/tests/ValueTrackerTest.cpp)
   add_test (util/tests/NandWritesTest.cpp)
+  add_test (util/tests/MemoryTierConfigTest.cpp)
   add_test (cache/tests/TimeStampTickerTest.cpp)
 endif()
diff --git a/cachelib/cachebench/util/CacheConfig.cpp b/cachelib/cachebench/util/CacheConfig.cpp
index f12992dd9e..29cd9cb6a3 100644
--- a/cachelib/cachebench/util/CacheConfig.cpp
+++ b/cachelib/cachebench/util/CacheConfig.cpp
@@ -137,8 +137,51 @@ std::shared_ptr<RebalanceStrategy> CacheConfig::getRebalanceStrategy() const {
 MemoryTierConfig::MemoryTierConfig(const folly::dynamic& configJson) {
   JSONSetVal(configJson, file);
   JSONSetVal(configJson, ratio);
+  JSONSetVal(configJson, memBindNodes);
 
-  checkCorrectSize();
+  checkCorrectSize();
+}
+
+std::vector<size_t> MemoryTierConfig::parseNumaNodes() {
+  std::vector<size_t> numaNodes;
+
+  std::vector<folly::StringPiece> tokens;
+  folly::split(",", memBindNodes, tokens, true /*ignore empty*/);
+  for (const auto& token : tokens) {
+    if (token.startsWith("!")) {
+      throw std::invalid_argument(folly::sformat(
+          "invalid NUMA nodes binding in memory tier config: {} "
+          "inverse !N or !N-N is not supported "
+          "nodes may be specified as N,N,N or N-N or N,N-N or N-N,N-N and so forth.",
+          token));
+    } else if (token.startsWith("+")) {
+      throw std::invalid_argument(folly::sformat(
+          "invalid NUMA nodes binding in memory tier config: {} "
+          "relative nodes are not supported. "
+          "nodes may be specified as N,N,N or N-N or N,N-N or N-N,N-N and so forth.",
+          token));
+    } else if (token.contains("-")) {
+      size_t begin, end;
+      if (folly::split("-", token, begin, end) && begin <= end) {
+        while (begin <= end) {
+          numaNodes.push_back(begin++);
+        }
+      } else {
+        throw std::invalid_argument(folly::sformat(
+            "invalid NUMA nodes binding in memory tier config: {} "
+            "Invalid range format. "
+            "nodes may be specified as N,N,N or N-N or N,N-N or N-N,N-N and so forth.",
+            token));
+      }
+    } else {
+      numaNodes.push_back(folly::to<size_t>(token));
+    }
+  }
+
+  return numaNodes;
 }
 
 } // namespace cachebench
diff --git a/cachelib/cachebench/util/CacheConfig.h b/cachelib/cachebench/util/CacheConfig.h
index b7829e28c7..7a8c9020b0 100644
--- a/cachelib/cachebench/util/CacheConfig.h
+++ b/cachelib/cachebench/util/CacheConfig.h
@@ -48,11 +48,13 @@ struct MemoryTierConfig : public JSONConfig {
   MemoryTierCacheConfig getMemoryTierCacheConfig() {
     MemoryTierCacheConfig config = memoryTierCacheConfigFromSource();
     config.setRatio(ratio);
+    config.setMemBind(parseNumaNodes());
     return config;
   }
 
   std::string file{""};
   size_t ratio{0};
+  std::string memBindNodes{""};
 
  private:
   MemoryTierCacheConfig memoryTierCacheConfigFromSource() {
@@ -62,6 +64,8 @@ struct MemoryTierConfig : public JSONConfig {
       return MemoryTierCacheConfig::fromFile(file);
     }
   }
+
+  std::vector<size_t> parseNumaNodes();
 };
 
 struct CacheConfig : public JSONConfig {
diff --git a/cachelib/cachebench/util/tests/MemoryTierConfigTest.cpp b/cachelib/cachebench/util/tests/MemoryTierConfigTest.cpp
new file mode 100644
index 0000000000..afd2bf80ad
--- /dev/null
+++ b/cachelib/cachebench/util/tests/MemoryTierConfigTest.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright 2022-present Facebook. All Rights Reserved.
+
+#include <gtest/gtest.h>
+
+#include <folly/dynamic.h>
+#include <folly/json.h>
+
+#include "cachelib/cachebench/util/CacheConfig.h"
+
+namespace facebook {
+namespace cachelib {
+namespace cachebench {
+
+TEST(MemoryTierConfigTest, MemBind_SingleNumaNode) {
+  const std::string configString =
+      "{"
+      "  \"ratio\": 1,"
+      "  \"memBindNodes\": 1"
+      "}";
+
+  const std::vector<size_t> expectedNumaNodes = {1};
+
+  auto configJson = folly::parseJson(folly::json::stripComments(configString));
+
+  MemoryTierConfig memoryTierConfig(configJson);
+  MemoryTierCacheConfig tierCacheConfig = memoryTierConfig.getMemoryTierCacheConfig();
+
+  const auto parsedNumaNodes = tierCacheConfig.getMemBind();
+  ASSERT_EQ(expectedNumaNodes, parsedNumaNodes);
+}
+
+TEST(MemoryTierConfigTest, MemBind_RangeNumaNodes) {
+  const std::string configString =
+      "{"
+      "  \"ratio\": 1,"
+      "  \"memBindNodes\": \"0-2\""
+      "}";
+
+  const std::vector<size_t> expectedNumaNodes = {0, 1, 2};
+
+  auto configJson = folly::parseJson(folly::json::stripComments(configString));
+
+  MemoryTierConfig memoryTierConfig(configJson);
+  MemoryTierCacheConfig tierCacheConfig = memoryTierConfig.getMemoryTierCacheConfig();
+
+  const auto parsedNumaNodes = tierCacheConfig.getMemBind();
+  ASSERT_EQ(expectedNumaNodes, parsedNumaNodes);
+}
+
+TEST(MemoryTierConfigTest, MemBind_SingleAndRangeNumaNodes) {
+  const std::string configString =
+      "{"
+      "  \"ratio\": 1,"
+      "  \"memBindNodes\": \"0,2-5\""
+      "}";
+
+  const std::vector<size_t> expectedNumaNodes = {0, 2, 3, 4, 5};
+
+  auto configJson = folly::parseJson(folly::json::stripComments(configString));
+
+  MemoryTierConfig memoryTierConfig(configJson);
+  MemoryTierCacheConfig tierCacheConfig = memoryTierConfig.getMemoryTierCacheConfig();
+
+  const auto parsedNumaNodes = tierCacheConfig.getMemBind();
+  ASSERT_EQ(expectedNumaNodes, parsedNumaNodes);
+}
+
+} // namespace cachebench
+} // namespace cachelib
+} // namespace facebook
diff --git a/cachelib/shm/CMakeLists.txt b/cachelib/shm/CMakeLists.txt
index 4f97c0e763..83a798949c 100644
--- a/cachelib/shm/CMakeLists.txt
+++ b/cachelib/shm/CMakeLists.txt
@@ -25,6 +25,7 @@ add_library (cachelib_shm
 add_dependencies(cachelib_shm thrift_generated_files)
 target_link_libraries(cachelib_shm PUBLIC
   cachelib_common
+  numa
   )
 
 install(TARGETS cachelib_shm
diff --git a/cachelib/shm/PosixShmSegment.cpp b/cachelib/shm/PosixShmSegment.cpp
index 027fee8bb8..1bdeec253d 100644
--- a/cachelib/shm/PosixShmSegment.cpp
+++ b/cachelib/shm/PosixShmSegment.cpp
@@ -21,6 +21,8 @@
 #include
 #include
 #include
+#include <numa.h>
+#include <numaif.h>
 
 #include "cachelib/common/Utils.h"
 
@@ -176,6 +178,7 @@ void* PosixShmSegment::mapAddress(void* addr) const {
     util::throwSystemError(EINVAL, "Address already mapped");
   }
   XDCHECK(retAddr == addr || addr == nullptr);
+  memBind(retAddr);
   return retAddr;
 }
 
@@ -183,6 +186,44 @@ void PosixShmSegment::unMap(void* addr) const {
   detail::munmapImpl(addr, getSize());
 }
 
+// Touch every page of [addr, addr+size) so physical pages are allocated
+// while the desired memory policy is in effect.
+static void forcePageAllocation(void* addr, size_t size, size_t pageSize) {
+  for (volatile char* curAddr = (char*)addr; curAddr < (char*)addr + size;
+       curAddr += pageSize) {
+    *curAddr = *curAddr;
+  }
+}
+
+void PosixShmSegment::memBind(void* addr) const {
+  if (opts_.memBindNumaNodes.empty()) {
+    return;
+  }
+
+  struct bitmask* oldNodeMask = numa_allocate_nodemask();
+  int oldMode = 0;
+  struct bitmask* nodesMask = numa_allocate_nodemask();
+  auto guard = folly::makeGuard(
+      [&] { numa_bitmask_free(nodesMask); numa_bitmask_free(oldNodeMask); });
+
+  for (auto node : opts_.memBindNumaNodes) {
+    numa_bitmask_setbit(nodesMask, node);
+  }
+
+  // mbind() cannot be used because mmap was called with MAP_SHARED flag
+  // But we can set memory policy for current thread and force page allocation.
+  // The following logic is used:
+  // 1. Remember current memory policy for the current thread
+  // 2. Set new memory policy as specified by config
+  // 3. Force page allocation by touching every page in the segment
+  // 4. Restore memory policy
+
+  // Remember current memory policy
+  get_mempolicy(&oldMode, oldNodeMask->maskp, oldNodeMask->size, nullptr, 0);
+
+  // Set memory bindings
+  set_mempolicy(MPOL_BIND, nodesMask->maskp, nodesMask->size);
+
+  forcePageAllocation(addr, getSize(), detail::getPageSize(opts_.pageSize));
+
+  // Restore the thread's previous memory policy using the saved mask
+  // (restoring with nodesMask would clobber the original policy's node set)
+  set_mempolicy(oldMode, oldNodeMask->maskp, oldNodeMask->size);
+}
+
 std::string PosixShmSegment::createKeyForName(
     const std::string& name) noexcept {
   // ensure that the slash is always there in the head. repetitive
diff --git a/cachelib/shm/PosixShmSegment.h b/cachelib/shm/PosixShmSegment.h
index 6aaeb004e7..bf43b2ca55 100644
--- a/cachelib/shm/PosixShmSegment.h
+++ b/cachelib/shm/PosixShmSegment.h
@@ -108,6 +108,8 @@ class PosixShmSegment : public ShmBase {
   void createReferenceMapping();
   void deleteReferenceMapping() const;
 
+  void memBind(void* addr) const;
+
   // file descriptor associated with the shm. This has FD_CLOEXEC set
   // and once opened, we close this only on destruction of this object
   int fd_{kInvalidFD};
diff --git a/cachelib/shm/ShmCommon.h b/cachelib/shm/ShmCommon.h
index 0998f2f951..8ed5202b62 100644
--- a/cachelib/shm/ShmCommon.h
+++ b/cachelib/shm/ShmCommon.h
@@ -93,6 +93,7 @@ struct ShmSegmentOpts {
   PageSizeT pageSize{PageSizeT::NORMAL};
   bool readOnly{false};
   size_t alignment{1}; // alignment for mapping.
+  std::vector<size_t> memBindNumaNodes;
 
   // opts specific to segment type
   ShmTypeOpts typeOpts{PosixSysVSegmentOpts(false)};
diff --git a/cachelib/shm/SysVShmSegment.cpp b/cachelib/shm/SysVShmSegment.cpp
index e13d605aa5..8b13246ded 100644
--- a/cachelib/shm/SysVShmSegment.cpp
+++ b/cachelib/shm/SysVShmSegment.cpp
@@ -18,8 +18,11 @@
 #include
 #include
+#include <numa.h>
 #include
 #include
+#include <numaif.h>
+#include <folly/ScopeGuard.h>
 
 #include "cachelib/common/Utils.h"
 
@@ -184,6 +187,50 @@ void shmCtlImpl(int shmid, int cmd, shmid_ds* buf) {
   }
 }
 
+void mbindImpl(void* addr,
+               unsigned long len,
+               int mode,
+               const std::vector<size_t>& memBindNumaNodes,
+               unsigned int flags) {
+  struct bitmask* nodesMask = numa_allocate_nodemask();
+  auto guard = folly::makeGuard([&] { numa_bitmask_free(nodesMask); });
+
+  for (auto node : memBindNumaNodes) {
+    numa_bitmask_setbit(nodesMask, node);
+  }
+
+  long ret = mbind(addr, len, mode, nodesMask->maskp, nodesMask->size, flags);
+  if (ret == 0) {
+    return;
+  }
+
+  switch (errno) {
+  case EFAULT:
+    util::throwSystemError(
+        errno, "Invalid memory range when binding segment to NUMA node(s)");
+    break;
+  case EINVAL:
+    util::throwSystemError(
+        errno, "Invalid parameters when bind segment to NUMA node(s)");
+    break;
+  case EIO:
+    if (flags & MPOL_MF_STRICT) {
+      util::throwSystemError(
+          errno,
+          "Segment already allocated on another NUMA node that does not follow the policy.");
+    }
+    if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+      util::throwSystemError(
+          errno,
+          "Segment already allocated but kernel was unable to move it to specified NUMA node(s).");
+    }
+    util::throwSystemError(errno, "Unexpected EIO failure from mbind()");
+    break;
+  case ENOMEM:
+    util::throwSystemError(
+        errno, "Could not bind memory. Insufficient kernel memory was available");
+    break;
+  case EPERM:
+    if (flags & MPOL_MF_MOVE_ALL) {
+      util::throwSystemError(
+          errno,
+          "Process does not have the CAP_SYS_NICE privilege to bind segment with MPOL_MF_MOVE_ALL flag");
+    }
+    util::throwSystemError(errno, "Unexpected EPERM failure from mbind()");
+    break;
+  default:
+    XDCHECK(false);
+    util::throwSystemError(errno, "Unexpected failure from mbind()");
+  }
+}
+
 } // namespace detail
 
 void ensureSizeforHugePage(size_t size) {
@@ -270,11 +317,17 @@ void* SysVShmSegment::mapAddress(void* addr) const {
 
   void* retAddr = detail::shmAttachImpl(shmid_, addr, shmFlags);
   XDCHECK(retAddr == addr || addr == nullptr);
+  memBind(retAddr);
   return retAddr;
 }
 
 void SysVShmSegment::unMap(void* addr) const { detail::shmDtImpl(addr); }
 
+void SysVShmSegment::memBind(void* addr) const {
+  if (opts_.memBindNumaNodes.empty()) {
+    return;
+  }
+  detail::mbindImpl(addr, getSize(), MPOL_BIND, opts_.memBindNumaNodes, 0);
+}
+
 void SysVShmSegment::markForRemoval() {
   if (isMarkedForRemoval()) {
     return;
diff --git a/cachelib/shm/SysVShmSegment.h b/cachelib/shm/SysVShmSegment.h
index fcebe03eb1..5a57215508 100644
--- a/cachelib/shm/SysVShmSegment.h
+++ b/cachelib/shm/SysVShmSegment.h
@@ -100,6 +100,7 @@ class SysVShmSegment : public ShmBase {
   void lockPagesInMemory() const;
   void createReferenceMapping();
   void deleteReferenceMapping() const;
+  void memBind(void* addr) const;
 
   // the key identifier for the shared memory
   KeyType key_{kInvalidKey};
diff --git a/contrib/prerequisites-centos8.sh b/contrib/prerequisites-centos8.sh
index 7e6cfad1d8..26be9201b3 100755
--- a/contrib/prerequisites-centos8.sh
+++ b/contrib/prerequisites-centos8.sh
@@ -57,7 +57,8 @@ sudo dnf --enablerepo="$POWERTOOLS_REPO" install -y \
   libsodium-static \
   libdwarf-static \
   boost-static \
-  double-conversion-static
+  double-conversion-static \
+  numactl-devel
 
 #Do not install these from OS packages - they are typically outdated.
 #gflags-devel \