From 4b951ef459415adbab70b1b685590d60a8dfd29e Mon Sep 17 00:00:00 2001 From: Steven Wu Date: Mon, 20 Oct 2025 14:41:11 -0700 Subject: [PATCH] [CAS] Add OnDiskCAS Add OnDiskCAS abstraction, that implements ObjectStore and ActionCache interface using OnDiskGraphDB and OnDiskKeyValueDB. Reviewers: Pull Request: https://github.com/llvm/llvm-project/pull/114103 --- llvm/include/llvm/CAS/ActionCache.h | 10 + .../llvm/CAS/BuiltinUnifiedCASDatabases.h | 59 ++ llvm/include/llvm/CAS/ObjectStore.h | 49 +- llvm/include/llvm/CAS/UnifiedOnDiskCache.h | 191 +++++ llvm/lib/CAS/ActionCaches.cpp | 156 +++++ llvm/lib/CAS/BuiltinCAS.cpp | 14 +- llvm/lib/CAS/BuiltinCAS.h | 25 +- llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp | 38 + llvm/lib/CAS/CMakeLists.txt | 3 + llvm/lib/CAS/InMemoryCAS.cpp | 8 + llvm/lib/CAS/ObjectStore.cpp | 93 ++- llvm/lib/CAS/OnDiskCAS.cpp | 228 ++++++ llvm/lib/CAS/UnifiedOnDiskCache.cpp | 655 ++++++++++++++++++ llvm/unittests/CAS/ActionCacheTest.cpp | 6 +- .../CAS/BuiltinUnifiedCASDatabasesTest.cpp | 67 ++ llvm/unittests/CAS/CASTestConfig.cpp | 23 +- llvm/unittests/CAS/CASTestConfig.h | 44 +- llvm/unittests/CAS/CMakeLists.txt | 2 + llvm/unittests/CAS/ObjectStoreTest.cpp | 134 +++- llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp | 191 +++++ 20 files changed, 1967 insertions(+), 29 deletions(-) create mode 100644 llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h create mode 100644 llvm/include/llvm/CAS/UnifiedOnDiskCache.h create mode 100644 llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp create mode 100644 llvm/lib/CAS/OnDiskCAS.cpp create mode 100644 llvm/lib/CAS/UnifiedOnDiskCache.cpp create mode 100644 llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp create mode 100644 llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp diff --git a/llvm/include/llvm/CAS/ActionCache.h b/llvm/include/llvm/CAS/ActionCache.h index 69ee4dde1974a..781ad81001368 100644 --- a/llvm/include/llvm/CAS/ActionCache.h +++ b/llvm/include/llvm/CAS/ActionCache.h @@ -75,6 +75,9 @@ class ActionCache { 
CanBeDistributed); } + /// Validate the ActionCache contents. + virtual Error validate() const = 0; + virtual ~ActionCache() = default; protected: @@ -97,6 +100,13 @@ class ActionCache { /// Create an action cache in memory. std::unique_ptr createInMemoryActionCache(); +/// Get a reasonable default on-disk path for a persistent ActionCache for the +/// current user. +std::string getDefaultOnDiskActionCachePath(); + +/// Create an action cache on disk. +Expected> createOnDiskActionCache(StringRef Path); + } // end namespace llvm::cas #endif // LLVM_CAS_ACTIONCACHE_H diff --git a/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h new file mode 100644 index 0000000000000..6c31a82ff9db0 --- /dev/null +++ b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H +#define LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H + +#include "llvm/Support/Error.h" + +namespace llvm::cas { + +class ActionCache; +class ObjectStore; + +/// Create on-disk \c ObjectStore and \c ActionCache instances based on +/// \c ondisk::UnifiedOnDiskCache, with built-in hashing. +Expected, std::unique_ptr>> +createOnDiskUnifiedCASDatabases(StringRef Path); + +/// Represents the result of validating the contents using +/// \c validateOnDiskUnifiedCASDatabasesIfNeeded. +/// +/// Note: invalid results are handled as an \c Error. +enum class ValidationResult { + /// The data is already valid. + Valid, + /// The data was invalid, but was recovered. + Recovered, + /// Validation was skipped, as it was not needed. 
+ Skipped, +}; + +/// Validate the data in \p Path, if needed to ensure correctness. +/// +/// \param Path directory for the on-disk database. +/// \param CheckHash Whether to validate hashes match the data. +/// \param AllowRecovery Whether to automatically recover from invalid data by +/// marking the files for garbage collection. +/// \param ForceValidation Whether to force validation to occur even if it +/// should not be necessary. +/// \param LLVMCasBinary If provided, validation is performed out-of-process +/// using the given \c llvm-cas executable which protects against crashes +/// during validation. Otherwise validation is performed in-process. +/// +/// \returns \c Valid if the data is already valid, \c Recovered if data +/// was invalid but has been cleared, \c Skipped if validation is not needed, +/// or an \c Error if validation cannot be performed or if the data is left +/// in an invalid state because \p AllowRecovery is false. +Expected validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional LLVMCasBinary); + +} // namespace llvm::cas + +#endif // LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H diff --git a/llvm/include/llvm/CAS/ObjectStore.h b/llvm/include/llvm/CAS/ObjectStore.h index 6db5dd3904095..febfff815e86e 100644 --- a/llvm/include/llvm/CAS/ObjectStore.h +++ b/llvm/include/llvm/CAS/ObjectStore.h @@ -111,7 +111,10 @@ class ObjectStore { virtual Expected isMaterialized(ObjectRef Ref) const = 0; /// Validate the underlying object referred by CASID. - virtual Error validate(const CASID &ID) = 0; + virtual Error validateObject(const CASID &ID) = 0; + + /// Validate the entire ObjectStore. + virtual Error validate(bool CheckHash) const = 0; protected: /// Load the object referenced by \p Ref. @@ -215,9 +218,39 @@ class ObjectStore { return Data.size(); } + /// Set the size for limiting growth of on-disk storage. This has an effect + /// for when the instance is closed. 
+ /// + /// Implementations may not have this implemented. + virtual Error setSizeLimit(std::optional SizeLimit) { + return Error::success(); + } + + /// \returns the storage size of the on-disk CAS data. + /// + /// Implementations that don't have an implementation for this should return + /// \p std::nullopt. + virtual Expected> getStorageSize() const { + return std::nullopt; + } + + /// Prune local storage to reduce its size according to the desired size + /// limit. Pruning can happen concurrently with other operations. + /// + /// Implementations may not have this implemented. + virtual Error pruneStorageData() { return Error::success(); } + /// Validate the whole node tree. Error validateTree(ObjectRef Ref); + /// Import object from another CAS. This will import the full tree from the + /// other CAS. + Expected importObject(ObjectStore &Upstream, ObjectRef Other); + + /// Print the ObjectStore internals for debugging purposes. + virtual void print(raw_ostream &) const {} + void dump() const; + /// Get CASContext const CASContext &getContext() const { return Context; } @@ -292,6 +325,20 @@ class ObjectProxy { std::unique_ptr createInMemoryCAS(); +/// \returns true if \c LLVM_ENABLE_ONDISK_CAS configuration was enabled. +bool isOnDiskCASEnabled(); + +/// Gets or creates a persistent on-disk path at \p Path. +Expected> createOnDiskCAS(const Twine &Path); + +/// Set \p Path to a reasonable default on-disk path for a persistent CAS for +/// the current user. +Error getDefaultOnDiskCASPath(SmallVectorImpl &Path); + +/// Get a reasonable default on-disk path for a persistent CAS for the current +/// user. 
+llvm::Expected getDefaultOnDiskCASPath(); + } // namespace cas } // namespace llvm diff --git a/llvm/include/llvm/CAS/UnifiedOnDiskCache.h b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h new file mode 100644 index 0000000000000..fa9d2fcfdb4e4 --- /dev/null +++ b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h @@ -0,0 +1,191 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_UNIFIEDONDISKCACHE_H +#define LLVM_CAS_UNIFIEDONDISKCACHE_H + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include + +namespace llvm::cas::ondisk { + +class OnDiskKeyValueDB; + +/// A unified CAS nodes and key-value database, using on-disk storage for both. +/// It manages storage growth and provides APIs for garbage collection. +/// +/// High-level properties: +/// * While \p UnifiedOnDiskCache is open on a directory, by any process, the +/// storage size in that directory will keep growing unrestricted. For data to +/// become eligible for garbage-collection there should be no open instances +/// of \p UnifiedOnDiskCache for that directory, by any process. +/// * Garbage-collection needs to be triggered explicitly by the client. It can +/// be triggered on a directory concurrently, at any time and by any process, +/// without affecting any active readers/writers, in the same process or other +/// processes. +/// +/// Usage patterns should be that an instance of \p UnifiedOnDiskCache is open +/// for a limited period of time, e.g. for the duration of a build operation. 
+/// For long-lived processes that need periodic access to a +/// \p UnifiedOnDiskCache, the client should devise a scheme where access is +/// performed within some defined period. For example, if a service is designed +/// to continuously wait for requests that access a \p UnifiedOnDiskCache, it +/// could keep the instance alive while new requests are coming in but close it +/// after a time period in which there are no new requests. +class UnifiedOnDiskCache { +public: + /// The \p OnDiskGraphDB instance for the open directory. + OnDiskGraphDB &getGraphDB() { return *PrimaryGraphDB; } + + /// Associate an \p ObjectID, of the \p OnDiskGraphDB instance, with a key. + /// + /// \param Key the hash bytes for the key. + /// \param Value the \p ObjectID value. + /// + /// \returns the \p ObjectID associated with the \p Key. It may be different + /// than \p Value if another value was already associated with this key. + Expected KVPut(ArrayRef Key, ObjectID Value); + + /// Associate an \p ObjectID, of the \p OnDiskGraphDB instance, with a key. + /// An \p ObjectID as a key is equivalent to its digest bytes. + /// + /// \param Key the \p ObjectID for the key. + /// \param Value the \p ObjectID value. + /// + /// \returns the \p ObjectID associated with the \p Key. It may be different + /// than \p Value if another value was already associated with this key. + Expected KVPut(ObjectID Key, ObjectID Value); + + /// \returns the \p ObjectID, of the \p OnDiskGraphDB instance, associated + /// with the \p Key, or \p std::nullopt if the key does not exist. + Expected> KVGet(ArrayRef Key); + + /// Open a \p UnifiedOnDiskCache instance for a directory. + /// + /// \param Path directory for the on-disk database. The directory will be + /// created if it doesn't exist. + /// \param SizeLimit Optional size for limiting growth. This has an effect for + /// when the instance is closed. + /// \param HashName Identifier name for the hashing algorithm that is going to + /// be used. 
+ /// \param HashByteSize Size for the object digest hash bytes. + /// \param FaultInPolicy Controls how nodes are copied to primary store. This + /// is recorded at creation time and subsequent opens need to pass the same + /// policy otherwise the \p open will fail. + static Expected> + open(StringRef Path, std::optional SizeLimit, StringRef HashName, + unsigned HashByteSize, + OnDiskGraphDB::FaultInPolicy FaultInPolicy = + OnDiskGraphDB::FaultInPolicy::FullTree); + + /// Validate the data in \p Path, if needed to ensure correctness. + /// + /// Note: if invalid data is detected and \p AllowRecovery is true, then + /// recovery requires exclusive access to the CAS and it is an error to + /// attempt recovery if there is concurrent use of the CAS. + /// + /// \param Path directory for the on-disk database. + /// \param HashName Identifier name for the hashing algorithm that is going to + /// be used. + /// \param HashByteSize Size for the object digest hash bytes. + /// \param CheckHash Whether to validate hashes match the data. + /// \param AllowRecovery Whether to automatically recover from invalid data by + /// marking the files for garbage collection. + /// \param ForceValidation Whether to force validation to occur even if it + /// should not be necessary. + /// \param LLVMCasBinary If provided, validation is performed out-of-process + /// using the given \c llvm-cas executable which protects against crashes + /// during validation. Otherwise validation is performed in-process. + /// + /// \returns \c Valid if the data is already valid, \c Recovered if data + /// was invalid but has been cleared, \c Skipped if validation is not needed, + /// or an \c Error if validation cannot be performed or if the data is left + /// in an invalid state because \p AllowRecovery is false. 
+ static Expected + validateIfNeeded(StringRef Path, StringRef HashName, unsigned HashByteSize, + bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional LLVMCasBinary); + + /// This is called implicitly at destruction time, so it is not required for a + /// client to call this. After calling \p close the only method that is valid + /// to call is \p needsGarbageCollection. + /// + /// \param CheckSizeLimit if true it will check whether the primary store has + /// exceeded its intended size limit. If false the check is skipped even if a + /// \p SizeLimit was passed to the \p open call. + Error close(bool CheckSizeLimit = true); + + /// Set the size for limiting growth. This has an effect for when the instance + /// is closed. + void setSizeLimit(std::optional SizeLimit); + + /// \returns the storage size of the cache data. + uint64_t getStorageSize() const; + + /// \returns whether the primary store has exceeded the intended size limit. + /// This can return false even if the overall size of the opened directory is + /// over the \p SizeLimit passed to \p open. To know whether garbage + /// collection needs to be triggered or not, call \p needsGarbageCollection. + bool hasExceededSizeLimit() const; + + /// \returns whether there are unused data that can be deleted using a + /// \p collectGarbage call. + bool needsGarbageCollection() const { return NeedsGarbageCollection; } + + /// Remove any unused data from the directory at \p Path. If there are no such + /// data the operation is a no-op. + /// + /// This can be called concurrently, regardless of whether there is an open + /// \p UnifiedOnDiskCache instance or not; it has no effect on readers/writers + /// in the same process or other processes. + /// + /// It is recommended that garbage-collection is triggered concurrently in the + /// background, so that it has minimal effect on the workload of the process. 
+ static Error collectGarbage(StringRef Path); + + /// Remove unused data from the current UnifiedOnDiskCache. + Error collectGarbage(); + + /// Validate the key value databases. + Error validateActionCache(); + + /// Get the upstream OnDiskGraphDB if exists. + /// + /// \returns upstream database or nullptr if upstream database doesn't exist. + OnDiskGraphDB *getUpstreamGraphDB() const { return UpstreamGraphDB; } + + ~UnifiedOnDiskCache(); + +private: + UnifiedOnDiskCache(); + + Expected> + faultInFromUpstreamKV(ArrayRef Key); + + /// \returns the storage size of the primary directory. + uint64_t getPrimaryStorageSize() const; + + std::string RootPath; + std::atomic SizeLimit; + + int LockFD = -1; + + std::atomic NeedsGarbageCollection; + std::string PrimaryDBDir; + + OnDiskGraphDB *UpstreamGraphDB = nullptr; + std::unique_ptr PrimaryGraphDB; + + std::unique_ptr UpstreamKVDB; + std::unique_ptr PrimaryKVDB; +}; + +} // namespace llvm::cas::ondisk + +#endif // LLVM_CAS_UNIFIEDONDISKCACHE_H diff --git a/llvm/lib/CAS/ActionCaches.cpp b/llvm/lib/CAS/ActionCaches.cpp index 571c5b3ca5b4b..dac50e0740c5c 100644 --- a/llvm/lib/CAS/ActionCaches.cpp +++ b/llvm/lib/CAS/ActionCaches.cpp @@ -13,7 +13,11 @@ #include "BuiltinCAS.h" #include "llvm/ADT/TrieRawHashMap.h" #include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Config/llvm-config.h" #include "llvm/Support/BLAKE3.h" +#include "llvm/Support/Path.h" #define DEBUG_TYPE "cas-action-caches" @@ -47,12 +51,51 @@ class InMemoryActionCache final : public ActionCache { Expected> getImpl(ArrayRef ActionKey, bool CanBeDistributed) const final; + Error validate() const final { + return createStringError("InMemoryActionCache doesn't support validate()"); + } + private: using DataT = CacheEntry; using InMemoryCacheT = ThreadSafeTrieRawHashMap; InMemoryCacheT Cache; }; + +class OnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef 
ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected> getImpl(ArrayRef ActionKey, + bool CanBeDistributed) const final; + + static Expected> create(StringRef Path); + + Error validate() const final; + +private: + static StringRef getHashName() { return "BLAKE3"; } + + OnDiskActionCache(std::unique_ptr DB); + + std::unique_ptr DB; + using DataT = CacheEntry; +}; + +class UnifiedOnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected> getImpl(ArrayRef ActionKey, + bool CanBeDistributed) const final; + + UnifiedOnDiskActionCache(std::shared_ptr UniDB); + + Error validate() const final; + +private: + std::shared_ptr UniDB; +}; } // end namespace static Error createResultCachePoisonedError(ArrayRef KeyHash, @@ -92,10 +135,123 @@ Error InMemoryActionCache::putImpl(ArrayRef Key, const CASID &Result, Observed.getValue()); } +static constexpr StringLiteral DefaultName = "actioncache"; + namespace llvm::cas { +std::string getDefaultOnDiskActionCachePath() { + SmallString<128> Path; + if (!llvm::sys::path::cache_directory(Path)) + report_fatal_error("cannot get default cache directory"); + llvm::sys::path::append(Path, builtin::DefaultDir, DefaultName); + return Path.str().str(); +} + std::unique_ptr createInMemoryActionCache() { return std::make_unique(); } } // namespace llvm::cas + +OnDiskActionCache::OnDiskActionCache( + std::unique_ptr DB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + DB(std::move(DB)) {} + +Expected> +OnDiskActionCache::create(StringRef AbsPath) { + std::unique_ptr DB; + if (Error E = ondisk::OnDiskKeyValueDB::open(AbsPath, getHashName(), + sizeof(HashType), getHashName(), + sizeof(DataT)) + .moveInto(DB)) + return std::move(E); + return std::unique_ptr( + new OnDiskActionCache(std::move(DB))); +} + +Expected> +OnDiskActionCache::getImpl(ArrayRef Key, + bool /*CanBeDistributed*/) const { + std::optional> Val; + 
if (Error E = DB->get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + return CASID::create(&getContext(), toStringRef(*Val)); +} + +Error OnDiskActionCache::putImpl(ArrayRef Key, const CASID &Result, + bool /*CanBeDistributed*/) { + auto ResultHash = Result.getHash(); + ArrayRef Expected((const char *)ResultHash.data(), ResultHash.size()); + ArrayRef Observed; + if (Error E = DB->put(Key, Expected).moveInto(Observed)) + return E; + + if (Expected == Observed) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, + ArrayRef((const uint8_t *)Observed.data(), Observed.size())); +} + +Error OnDiskActionCache::validate() const { + // FIXME: without the matching CAS there is nothing we can check about the + // cached values. The hash size is already validated by the DB validator. + return DB->validate(nullptr); +} + +UnifiedOnDiskActionCache::UnifiedOnDiskActionCache( + std::shared_ptr UniDB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + UniDB(std::move(UniDB)) {} + +Expected> +UnifiedOnDiskActionCache::getImpl(ArrayRef Key, + bool /*CanBeDistributed*/) const { + std::optional Val; + if (Error E = UniDB->KVGet(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + return CASID::create(&getContext(), + toStringRef(UniDB->getGraphDB().getDigest(*Val))); +} + +Error UnifiedOnDiskActionCache::putImpl(ArrayRef Key, + const CASID &Result, + bool /*CanBeDistributed*/) { + auto Expected = UniDB->getGraphDB().getReference(Result.getHash()); + if (LLVM_UNLIKELY(!Expected)) + return Expected.takeError(); + std::optional Observed; + if (Error E = UniDB->KVPut(Key, *Expected).moveInto(Observed)) + return E; + + if (*Expected == Observed) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, UniDB->getGraphDB().getDigest(*Observed)); +} + +Error UnifiedOnDiskActionCache::validate() const { + return 
UniDB->validateActionCache(); +} + +Expected> +cas::createOnDiskActionCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return OnDiskActionCache::create(Path); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} + +std::unique_ptr +cas::builtin::createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr UniDB) { + return std::make_unique(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp index 73646ad2c3528..e9bc6d8beed4e 100644 --- a/llvm/lib/CAS/BuiltinCAS.cpp +++ b/llvm/lib/CAS/BuiltinCAS.cpp @@ -9,6 +9,7 @@ #include "BuiltinCAS.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Process.h" using namespace llvm; @@ -68,7 +69,7 @@ Expected BuiltinCAS::store(ArrayRef Refs, Refs, Data); } -Error BuiltinCAS::validate(const CASID &ID) { +Error BuiltinCAS::validateObject(const CASID &ID) { auto Ref = getReference(ID); if (!Ref) return createUnknownObjectError(ID); @@ -92,3 +93,14 @@ Error BuiltinCAS::validate(const CASID &ID) { return Error::success(); } + +Expected> +cas::builtin::createBuiltinUnifiedOnDiskCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::open(Path, /*SizeLimit=*/std::nullopt, + BuiltinCASContext::getHashName(), + sizeof(HashType)); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h index 3b5374d5e1850..4d2de66cf636f 100644 --- a/llvm/lib/CAS/BuiltinCAS.h +++ b/llvm/lib/CAS/BuiltinCAS.h @@ -1,4 +1,4 @@ -//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -15,6 +15,9 @@ namespace llvm::cas { class ActionCache; +namespace ondisk { +class UnifiedOnDiskCache; +} // namespace ondisk namespace builtin { /// Common base class for builtin CAS implementations using the same CASContext. @@ -65,9 +68,27 @@ class BuiltinCAS : public ObjectStore { "corrupt storage"); } - Error validate(const CASID &ID) final; + Error validateObject(const CASID &ID) final; }; +/// Create a \p UnifiedOnDiskCache instance that uses \p BLAKE3 hashing. +Expected> +createBuiltinUnifiedOnDiskCache(StringRef Path); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr UniDB); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr UniDB); + +// FIXME: Proxy not portable. Maybe also error-prone? +constexpr StringLiteral DefaultDirProxy = "/^llvm::cas::builtin::default"; +constexpr StringLiteral DefaultDir = "llvm.cas.builtin.default"; + } // end namespace builtin } // end namespace llvm::cas diff --git a/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp new file mode 100644 index 0000000000000..f3f6fa043bc52 --- /dev/null +++ b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "BuiltinCAS.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" + +using namespace llvm; +using namespace llvm::cas; + +Expected, std::unique_ptr>> +cas::createOnDiskUnifiedCASDatabases(StringRef Path) { + std::shared_ptr UniDB; + if (Error E = builtin::createBuiltinUnifiedOnDiskCache(Path).moveInto(UniDB)) + return std::move(E); + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + auto AC = builtin::createActionCacheFromUnifiedOnDiskCache(std::move(UniDB)); + return std::make_pair(std::move(CAS), std::move(AC)); +} + +Expected cas::validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional LLVMCasBinary) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::validateIfNeeded( + Path, builtin::BuiltinCASContext::getHashName(), + sizeof(builtin::HashType), CheckHash, AllowRecovery, ForceValidation, + LLVMCasBinary); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt index a2f8c49e50145..aad77dce370d8 100644 --- a/llvm/lib/CAS/CMakeLists.txt +++ b/llvm/lib/CAS/CMakeLists.txt @@ -2,15 +2,18 @@ add_llvm_component_library(LLVMCAS ActionCache.cpp ActionCaches.cpp BuiltinCAS.cpp + BuiltinUnifiedCASDatabases.cpp DatabaseFile.cpp InMemoryCAS.cpp MappedFileRegionArena.cpp ObjectStore.cpp + OnDiskCAS.cpp OnDiskCommon.cpp OnDiskDataAllocator.cpp OnDiskGraphDB.cpp OnDiskKeyValueDB.cpp OnDiskTrieRawHashMap.cpp + UnifiedOnDiskCache.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp index c63ee70de0849..2d4eedd5bdc8f 100644 --- 
a/llvm/lib/CAS/InMemoryCAS.cpp +++ b/llvm/lib/CAS/InMemoryCAS.cpp @@ -233,6 +233,12 @@ class InMemoryCAS : public BuiltinCAS { return cast(asInMemoryObject(Node)).getData(); } + void print(raw_ostream &OS) const final; + + Error validate(bool CheckHash) const final { + return createStringError("InMemoryCAS doesn't support validate()"); + } + InMemoryCAS() = default; private: @@ -271,6 +277,8 @@ ArrayRef InMemoryObject::getRefs() const { return cast(this)->getRefsImpl(); } +void InMemoryCAS::print(raw_ostream &OS) const {} + Expected InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef ComputedHash, sys::fs::mapped_file_region Map) { diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp index e0be50bbe013a..3110577e03774 100644 --- a/llvm/lib/CAS/ObjectStore.cpp +++ b/llvm/lib/CAS/ObjectStore.cpp @@ -1,4 +1,4 @@ -//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -12,7 +12,7 @@ #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" -#include +#include using namespace llvm; using namespace llvm::cas; @@ -21,6 +21,7 @@ void CASContext::anchor() {} void ObjectStore::anchor() {} LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void ObjectStore::dump() const { print(dbgs()); } LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); } LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); } @@ -141,7 +142,7 @@ Error ObjectStore::validateTree(ObjectRef Root) { auto [I, Inserted] = ValidatedRefs.insert(Ref); if (!Inserted) continue; // already validated. 
- if (Error E = validate(getID(Ref))) + if (Error E = validateObject(getID(Ref))) return E; Expected Obj = load(Ref); if (!Obj) @@ -155,6 +156,92 @@ Error ObjectStore::validateTree(ObjectRef Root) { return Error::success(); } +Expected ObjectStore::importObject(ObjectStore &Upstream, + ObjectRef Other) { + // Copy the full CAS tree from upstream with depth-first ordering to ensure + // all the child nodes are available in downstream CAS before inserting + // current object. This uses a similar algorithm as + // `OnDiskGraphDB::importFullTree` but doesn't assume the upstream CAS schema + // so it can be used to import from any other ObjectStore regardless of the + // CAS schema. + + // There is no work to do if importing from self. + if (this == &Upstream) + return Other; + + /// Keeps track of the state of visitation for current node and all of its + /// parents. Upstream Cursor holds information only from upstream CAS. + struct UpstreamCursor { + ObjectRef Ref; + ObjectHandle Node; + size_t RefsCount; + std::deque Refs; + }; + SmallVector CursorStack; + /// PrimaryRefStack holds the ObjectRefs of the current CAS, for nodes either + /// just stored in the CAS or nodes that already exist in the current CAS. + SmallVector PrimaryRefStack; + /// A map from upstream ObjectRef to current ObjectRef. + llvm::DenseMap CreatedObjects; + + auto enqueueNode = [&](ObjectRef Ref, ObjectHandle Node) { + unsigned NumRefs = Upstream.getNumRefs(Node); + std::deque Refs; + for (unsigned I = 0; I < NumRefs; ++I) + Refs.push_back(Upstream.readRef(Node, I)); + + CursorStack.push_back({Ref, Node, NumRefs, std::move(Refs)}); + }; + + auto UpstreamHandle = Upstream.load(Other); + if (!UpstreamHandle) + return UpstreamHandle.takeError(); + enqueueNode(Other, *UpstreamHandle); + + while (!CursorStack.empty()) { + UpstreamCursor &Cur = CursorStack.back(); + if (Cur.Refs.empty()) { + // Copy the node data into the primary store. 
+ // The top Cur.RefsCount entries of PrimaryRefStack are the refs that + // were pushed while visiting the current node's references. + assert(PrimaryRefStack.size() >= Cur.RefsCount); + auto Refs = ArrayRef(PrimaryRefStack) + .slice(PrimaryRefStack.size() - Cur.RefsCount); + auto NewNode = store(Refs, Upstream.getData(Cur.Node)); + if (!NewNode) + return NewNode.takeError(); + + // Replace the refs consumed by the current node with the new node's ref. + PrimaryRefStack.truncate(PrimaryRefStack.size() - Cur.RefsCount); + PrimaryRefStack.push_back(*NewNode); + // Record the mapping before popping: Cur dangles after pop_back(). + CreatedObjects.try_emplace(Cur.Ref, *NewNode); + CursorStack.pop_back(); + continue; + } + + // Check if the node exists already. + auto CurrentID = Cur.Refs.front(); + Cur.Refs.pop_front(); + auto Ref = CreatedObjects.find(CurrentID); + if (Ref != CreatedObjects.end()) { + // If exists already, just need to enqueue the primary node. + PrimaryRefStack.push_back(Ref->second); + continue; + } + + // Load child. + auto PrimaryID = Upstream.load(CurrentID); + if (LLVM_UNLIKELY(!PrimaryID)) + return PrimaryID.takeError(); + + enqueueNode(CurrentID, *PrimaryID); + } + + assert(PrimaryRefStack.size() == 1); + return PrimaryRefStack.front(); +} + std::unique_ptr ObjectProxy::getMemoryBuffer(StringRef Name, bool RequiresNullTerminator) const { diff --git a/llvm/lib/CAS/OnDiskCAS.cpp b/llvm/lib/CAS/OnDiskCAS.cpp new file mode 100644 index 0000000000000..035722459236a --- /dev/null +++ b/llvm/lib/CAS/OnDiskCAS.cpp @@ -0,0 +1,228 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BuiltinCAS.h"
+#include "llvm/CAS/BuiltinCASContext.h"
+#include "llvm/CAS/BuiltinObjectHasher.h"
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include "llvm/CAS/UnifiedOnDiskCache.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/Path.h"
+
+using namespace llvm;
+using namespace llvm::cas;
+using namespace llvm::cas::builtin;
+
+namespace {
+
+/// ObjectStore implementation on top of an \c ondisk::OnDiskGraphDB.
+///
+/// An instance is either standalone and owns its graph database (created by
+/// \c open, \c UnifiedDB is null), or it is a view of the primary graph
+/// database of a shared \c ondisk::UnifiedOnDiskCache (\c OwnedDB is null).
+/// \c DB points at the graph database in both cases.
+class OnDiskCAS : public BuiltinCAS {
+public:
+  Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash,
+                                ArrayRef<ObjectRef> Refs,
+                                ArrayRef<char> Data) final;
+
+  Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) final;
+
+  CASID getID(ObjectRef Ref) const final;
+
+  std::optional<ObjectRef> getReference(const CASID &ID) const final;
+
+  Expected<bool> isMaterialized(ObjectRef Ref) const final;
+
+  ArrayRef<char> getDataConst(ObjectHandle Node) const final;
+
+  void print(raw_ostream &OS) const final;
+  Error validate(bool CheckHash) const final;
+
+  /// Open a standalone store on the graph database at \p Path.
+  static Expected<std::unique_ptr<OnDiskCAS>> open(StringRef Path);
+
+  /// Create a store that uses the graph database of \p UniDB and keeps
+  /// \p UniDB alive for as long as the store exists.
+  OnDiskCAS(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB)
+      : UnifiedDB(std::move(UniDB)), DB(&UnifiedDB->getGraphDB()) {}
+
+private:
+  // Conversions between the opaque handles/refs of the ObjectStore interface
+  // and the ones of the on-disk graph database.
+  ObjectHandle convertHandle(ondisk::ObjectHandle Node) const {
+    return makeObjectHandle(Node.getOpaqueData());
+  }
+
+  ondisk::ObjectHandle convertHandle(ObjectHandle Node) const {
+    return ondisk::ObjectHandle(Node.getInternalRef(*this));
+  }
+
+  ObjectRef convertRef(ondisk::ObjectID Ref) const {
+    return makeObjectRef(Ref.getOpaqueData());
+  }
+
+  ondisk::ObjectID convertRef(ObjectRef Ref) const {
+    return ondisk::ObjectID::fromOpaqueData(Ref.getInternalRef(*this));
+  }
+
+  size_t getNumRefs(ObjectHandle Node) const final {
+    auto RefsRange = DB->getObjectRefs(convertHandle(Node));
+    return std::distance(RefsRange.begin(), RefsRange.end());
+  }
+  ObjectRef readRef(ObjectHandle Node, size_t I) const final {
+    auto RefsRange = DB->getObjectRefs(convertHandle(Node));
+    return convertRef(RefsRange.begin()[I]);
+  }
+  Error forEachRef(ObjectHandle Node,
+                   function_ref<Error(ObjectRef)> Callback) const final;
+
+  Error setSizeLimit(std::optional<uint64_t> SizeLimit) final;
+  Expected<std::optional<uint64_t>> getStorageSize() const final;
+  Error pruneStorageData() final;
+
+  /// Create a standalone store that owns \p GraphDB.
+  OnDiskCAS(std::unique_ptr<ondisk::OnDiskGraphDB> GraphDB)
+      : OwnedDB(std::move(GraphDB)), DB(OwnedDB.get()) {}
+
+  // Members are initialized in declaration order, so \c DB can be derived from
+  // either owner in the constructors above.
+  std::unique_ptr<ondisk::OnDiskGraphDB> OwnedDB;
+  std::shared_ptr<ondisk::UnifiedOnDiskCache> UnifiedDB;
+  ondisk::OnDiskGraphDB *DB;
+};
+
+} // end anonymous namespace
+
+void OnDiskCAS::print(raw_ostream &OS) const { DB->print(OS); }
+
+/// Validate the graph database and, when backed by a unified cache that has
+/// one, the upstream graph database as well. Object hashes are recomputed with
+/// the builtin hasher when \p CheckHash is set.
+Error OnDiskCAS::validate(bool CheckHash) const {
+  auto Hasher = [](ArrayRef<ArrayRef<uint8_t>> Refs, ArrayRef<char> Data,
+                   SmallVectorImpl<uint8_t> &Result) {
+    auto Hash = BuiltinObjectHasher<llvm::cas::builtin::HasherT>::hashObject(
+        Refs, Data);
+    Result.assign(Hash.begin(), Hash.end());
+  };
+
+  if (auto E = DB->validate(CheckHash, Hasher))
+    return E;
+  if (UnifiedDB && UnifiedDB->getUpstreamGraphDB())
+    return UnifiedDB->getUpstreamGraphDB()->validate(CheckHash, Hasher);
+
+  return Error::success();
+}
+
+CASID OnDiskCAS::getID(ObjectRef Ref) const {
+  ArrayRef<uint8_t> Hash = DB->getDigest(convertRef(Ref));
+  return CASID::create(&getContext(), toStringRef(Hash));
+}
+
+std::optional<ObjectRef> OnDiskCAS::getReference(const CASID &ID) const {
+  std::optional<ondisk::ObjectID> ObjID =
+      DB->getExistingReference(ID.getHash());
+  if (!ObjID)
+    return std::nullopt;
+  return convertRef(*ObjID);
+}
+
+Expected<bool> OnDiskCAS::isMaterialized(ObjectRef ExternalRef) const {
+  return DB->isMaterialized(convertRef(ExternalRef));
+}
+
+ArrayRef<char> OnDiskCAS::getDataConst(ObjectHandle Node) const {
+  return DB->getObjectData(convertHandle(Node));
+}
+
+/// \returns the handle for \p ExternalRef, \c std::nullopt if the database has
+/// no object for it, or the error reported by the database.
+Expected<std::optional<ObjectHandle>>
+OnDiskCAS::loadIfExists(ObjectRef ExternalRef) {
+  Expected<std::optional<ondisk::ObjectHandle>> ObjHnd =
+      DB->load(convertRef(ExternalRef));
+  if (!ObjHnd)
+    return ObjHnd.takeError();
+  if (!*ObjHnd)
+    return std::nullopt;
+  return convertHandle(**ObjHnd);
+}
+
+/// Store an object whose hash, \p ComputedHash, was already computed by the
+/// caller from \p Refs and \p Data.
+Expected<ObjectRef> OnDiskCAS::storeImpl(ArrayRef<uint8_t> ComputedHash,
+                                         ArrayRef<ObjectRef> Refs,
+                                         ArrayRef<char> Data) {
+  SmallVector<ondisk::ObjectID, 64> IDs;
+  IDs.reserve(Refs.size());
+  for (ObjectRef Ref : Refs) {
+    IDs.push_back(convertRef(Ref));
+  }
+
+  auto StoredID = DB->getReference(ComputedHash);
+  if (LLVM_UNLIKELY(!StoredID))
+    return StoredID.takeError();
+  if (Error E = DB->store(*StoredID, IDs, Data))
+    return std::move(E);
+  return convertRef(*StoredID);
+}
+
+Error OnDiskCAS::forEachRef(ObjectHandle Node,
+                            function_ref<Error(ObjectRef)> Callback) const {
+  auto RefsRange = DB->getObjectRefs(convertHandle(Node));
+  for (ondisk::ObjectID Ref : RefsRange) {
+    if (Error E = Callback(convertRef(Ref)))
+      return E;
+  }
+  return Error::success();
+}
+
+Error OnDiskCAS::setSizeLimit(std::optional<uint64_t> SizeLimit) {
+  // A standalone store (see \c open) has no UnifiedOnDiskCache to carry the
+  // limit, so there is nothing to configure; do not dereference null.
+  if (!UnifiedDB)
+    return Error::success();
+  UnifiedDB->setSizeLimit(SizeLimit);
+  return Error::success();
+}
+
+Expected<std::optional<uint64_t>> OnDiskCAS::getStorageSize() const {
+  // A standalone store only has its own graph database to report on.
+  if (!UnifiedDB)
+    return DB->getStorageSize();
+  return UnifiedDB->getStorageSize();
+}
+
+Error OnDiskCAS::pruneStorageData() {
+  // Garbage collection is a feature of the unified cache directory layout; a
+  // standalone store has nothing to prune.
+  if (!UnifiedDB)
+    return Error::success();
+  return UnifiedDB->collectGarbage();
+}
+
+Expected<std::unique_ptr<OnDiskCAS>> OnDiskCAS::open(StringRef AbsPath) {
+  Expected<std::unique_ptr<ondisk::OnDiskGraphDB>> DB =
+      ondisk::OnDiskGraphDB::open(AbsPath, BuiltinCASContext::getHashName(),
+                                  sizeof(HashType));
+  if (!DB)
+    return DB.takeError();
+  return std::unique_ptr<OnDiskCAS>(new OnDiskCAS(std::move(*DB)));
+}
+
+bool cas::isOnDiskCASEnabled() {
+#if LLVM_ENABLE_ONDISK_CAS
+  return true;
+#else
+  return false;
+#endif
+}
+
+/// Create a standalone on-disk ObjectStore at \p Path. Fails with an error
+/// when the on-disk CAS is compiled out.
+Expected<std::unique_ptr<ObjectStore>> cas::createOnDiskCAS(const Twine &Path) {
+#if LLVM_ENABLE_ONDISK_CAS
+  // FIXME: An absolute path isn't really good enough. Should open a directory
+  // and use openat() for files underneath.
+  SmallString<256> AbsPath;
+  Path.toVector(AbsPath);
+  sys::fs::make_absolute(AbsPath);
+
+  return OnDiskCAS::open(AbsPath);
+#else
+  return createStringError(inconvertibleErrorCode(), "OnDiskCAS is disabled");
+#endif /* LLVM_ENABLE_ONDISK_CAS */
+}
+
+std::unique_ptr<ObjectStore>
+cas::builtin::createObjectStoreFromUnifiedOnDiskCache(
+    std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) {
+  return std::make_unique<OnDiskCAS>(std::move(UniDB));
+}
+
+static constexpr StringLiteral DefaultName = "cas";
+
+/// Write the default on-disk CAS location, a sub-directory of the user's cache
+/// directory, into \p Path.
+Error cas::getDefaultOnDiskCASPath(SmallVectorImpl<char> &Path) {
+  if (!llvm::sys::path::cache_directory(Path))
+    return createStringError("cache directory is not available");
+  llvm::sys::path::append(Path, DefaultDir, DefaultName);
+  return Error::success();
+}
+
+Expected<std::string> cas::getDefaultOnDiskCASPath() {
+  SmallString<128> Path;
+  if (auto E = getDefaultOnDiskCASPath(Path))
+    return std::move(E);
+  return Path.str().str();
+}
diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp
new file mode 100644
index 0000000000000..acf327d57cfe0
--- /dev/null
+++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp
@@ -0,0 +1,655 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Encapsulates \p OnDiskGraphDB and \p OnDiskKeyValueDB instances within one
+/// directory while also restricting storage growth with a scheme of chaining
+/// the two most recent directories (primary & upstream), where the primary
+/// "faults-in" data from the upstream one. When the primary (most recent)
+/// directory exceeds its intended limit a new empty directory becomes the
+/// primary one.
+/// +/// Within the top-level directory (the path that \p UnifiedOnDiskCache::open +/// receives) there are directories named like this: +/// +/// 'v.' +/// 'v..' +/// ... +/// +/// 'version' is the version integer for this \p UnifiedOnDiskCache's scheme and +/// the part after the dot is an increasing integer. The primary directory is +/// the one with the highest integer and the upstream one is the directory +/// before it. For example, if the sub-directories contained are: +/// +/// 'v1.5', 'v1.6', 'v1.7', 'v1.8' +/// +/// Then the primary one is 'v1.8', the upstream one is 'v1.7', and the rest are +/// unused directories that can be safely deleted at any time and by any +/// process. +/// +/// Contained within the top-level directory is a file named "lock" which is +/// used for processes to take shared or exclusive locks for the contents of the +/// top directory. While a \p UnifiedOnDiskCache is open it keeps a shared lock +/// for the top-level directory; when it closes, if the primary sub-directory +/// exceeded its limit, it attempts to get an exclusive lock in order to create +/// a new empty primary directory; if it can't get the exclusive lock it gives +/// up and lets the next \p UnifiedOnDiskCache instance that closes to attempt +/// again. +/// +/// The downside of this scheme is that while \p UnifiedOnDiskCache is open on a +/// directory, by any process, the storage size in that directory will keep +/// growing unrestricted. But the major benefit is that garbage-collection can +/// be triggered on a directory concurrently, at any time and by any process, +/// without affecting any active readers/writers in the same process or other +/// processes. +/// +/// The \c UnifiedOnDiskCache also provides validation and recovery on top of +/// the underlying on-disk storage. The low-level storage is designed to remain +/// coherent across regular process crashes, but may be invalid after power loss +/// or similar system failures. 
\c UnifiedOnDiskCache::validateIfNeeded allows +/// validating the contents once per boot and can recover by marking invalid +/// data for garbage collection. +/// +/// The data recovery described above requires exclusive access to the CAS, and +/// it is an error to attempt recovery if the CAS is open in any process/thread. +/// In order to maximize backwards compatibility with tools that do not perform +/// validation before opening the CAS, we do not attempt to get exclusive access +/// until recovery is actually performed, meaning as long as the data is valid +/// it will not conflict with concurrent use. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "BuiltinCAS.h" +#include "OnDiskCommon.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/raw_ostream.h" +#include + +#if __has_include() +#include +#endif + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +/// FIXME: When the version of \p DBDirPrefix is bumped up we need to figure out +/// how to handle the leftover sub-directories of the previous version, within +/// the \p UnifiedOnDiskCache::collectGarbage function. 
+static constexpr StringLiteral DBDirPrefix = "v1.";
+
+static constexpr StringLiteral ValidationFilename = "v1.validation";
+static constexpr StringLiteral CorruptPrefix = "corrupt.";
+
+/// Associate the digest of \p Key in the primary graph database with \p Value.
+Expected<ObjectID> UnifiedOnDiskCache::KVPut(ObjectID Key, ObjectID Value) {
+  return KVPut(PrimaryGraphDB->getDigest(Key), Value);
+}
+
+/// Store \p Value under \p Key in the primary key-value database.
+///
+/// \returns the ObjectID decoded from the bytes that the database returns for
+/// \p Key after the insertion attempt.
+Expected<ObjectID> UnifiedOnDiskCache::KVPut(ArrayRef<uint8_t> Key,
+                                             ObjectID Value) {
+  static_assert(sizeof(Value.getOpaqueData()) == sizeof(uint64_t),
+                "unexpected return opaque type");
+  // Values are persisted as 64-bit little-endian integers.
+  std::array<char, sizeof(uint64_t)> ValBytes;
+  support::endian::write64le(ValBytes.data(), Value.getOpaqueData());
+  Expected<ArrayRef<char>> Existing = PrimaryKVDB->put(Key, ValBytes);
+  if (!Existing)
+    return Existing.takeError();
+  assert(Existing->size() == sizeof(uint64_t));
+  return ObjectID::fromOpaqueData(support::endian::read64le(Existing->data()));
+}
+
+/// Look up \p Key in the primary key-value database, falling back to
+/// faulting the entry in from the upstream database when there is one.
+///
+/// \returns \c std::nullopt when neither database has the key.
+Expected<std::optional<ObjectID>>
+UnifiedOnDiskCache::KVGet(ArrayRef<uint8_t> Key) {
+  std::optional<ArrayRef<char>> Value;
+  if (Error E = PrimaryKVDB->get(Key).moveInto(Value))
+    return std::move(E);
+  if (!Value) {
+    if (UpstreamKVDB)
+      return faultInFromUpstreamKV(Key);
+    return std::nullopt;
+  }
+  assert(Value->size() == sizeof(uint64_t));
+  return ObjectID::fromOpaqueData(support::endian::read64le(Value->data()));
+}
+
+/// Look up \p Key in the upstream key-value database and, when present, copy
+/// the entry into the primary one.
+Expected<std::optional<ObjectID>>
+UnifiedOnDiskCache::faultInFromUpstreamKV(ArrayRef<uint8_t> Key) {
+  assert(UpstreamGraphDB);
+  assert(UpstreamKVDB);
+
+  std::optional<ArrayRef<char>> UpstreamValue;
+  if (Error E = UpstreamKVDB->get(Key).moveInto(UpstreamValue))
+    return std::move(E);
+  if (!UpstreamValue)
+    return std::nullopt;
+
+  // The value is the \p ObjectID in the context of the upstream
+  // \p OnDiskGraphDB instance. Translate it to the context of the primary
+  // \p OnDiskGraphDB instance.
+  assert(UpstreamValue->size() == sizeof(uint64_t));
+  ObjectID UpstreamID = ObjectID::fromOpaqueData(
+      support::endian::read64le(UpstreamValue->data()));
+  auto PrimaryID =
+      PrimaryGraphDB->getReference(UpstreamGraphDB->getDigest(UpstreamID));
+  if (LLVM_UNLIKELY(!PrimaryID))
+    return PrimaryID.takeError();
+  return KVPut(Key, *PrimaryID);
+}
+
+/// Validate the primary key-value database and, if present, the upstream one.
+/// In addition to the database's own checks, every stored value must decode to
+/// a non-zero ObjectID.
+Error UnifiedOnDiskCache::validateActionCache() {
+  auto ValidateRef = [&](FileOffset Offset, ArrayRef<char> Value) -> Error {
+    assert(Value.size() == sizeof(uint64_t) && "should be validated already");
+    auto ID = ObjectID::fromOpaqueData(support::endian::read64le(Value.data()));
+    auto formatError = [&](const Twine &Msg) {
+      // Print the full 64-bit offset; narrowing it would report a wrong
+      // location for records beyond 4GiB.
+      return createStringError(
+          llvm::errc::illegal_byte_sequence,
+          "bad record at 0x" +
+              utohexstr(static_cast<uint64_t>(Offset.get()),
+                        /*LowerCase=*/true) +
+              ": " + Msg.str());
+    };
+    if (ID.getOpaqueData() == 0)
+      return formatError("zero is not a valid ref");
+    return Error::success();
+  };
+  if (Error E = PrimaryKVDB->validate(ValidateRef))
+    return E;
+  if (UpstreamKVDB)
+    return UpstreamKVDB->validate(ValidateRef);
+  return Error::success();
+}
+
+/// \returns all the 'v<version>.<x>' names of sub-directories, sorted with
+/// ascending order of the integer after the dot. Corrupt directories, if
+/// included, will come first.
+static Error getAllDBDirs(StringRef Path, SmallVectorImpl<std::string> &DBDirs,
+                          bool IncludeCorrupt = false) {
+  struct DBDir {
+    uint64_t Order;
+    std::string Name;
+  };
+  SmallVector<DBDir> FoundDBDirs;
+
+  std::error_code EC;
+  for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE;
+       DirI.increment(EC)) {
+    if (DirI->type() != sys::fs::file_type::directory_file)
+      continue;
+    StringRef SubDir = sys::path::filename(DirI->path());
+    if (IncludeCorrupt && SubDir.starts_with(CorruptPrefix)) {
+      // Order 0 makes corrupt directories sort before the versioned ones.
+      FoundDBDirs.push_back({0, std::string(SubDir)});
+      continue;
+    }
+    if (!SubDir.starts_with(DBDirPrefix))
+      continue;
+    uint64_t Order;
+    if (SubDir.substr(DBDirPrefix.size()).getAsInteger(10, Order))
+      return createStringError(inconvertibleErrorCode(),
+                               "unexpected directory " + DirI->path());
+    FoundDBDirs.push_back({Order, std::string(SubDir)});
+  }
+  if (EC)
+    return createFileError(Path, EC);
+
+  // The comparator has to be a strict weak ordering ('<', never '<='): equal
+  // keys do occur, since every corrupt directory has order 0. Ties are broken
+  // by name to keep the result deterministic.
+  llvm::sort(FoundDBDirs, [](const DBDir &LHS, const DBDir &RHS) -> bool {
+    if (LHS.Order != RHS.Order)
+      return LHS.Order < RHS.Order;
+    return LHS.Name < RHS.Name;
+  });
+  for (DBDir &Dir : FoundDBDirs)
+    DBDirs.push_back(std::move(Dir.Name));
+  return Error::success();
+}
+
+/// Collect into \p DBDirs the sub-directories of \p Path that are no longer in
+/// use: all corrupt directories plus every versioned directory except the two
+/// most recent ones.
+static Error getAllGarbageDirs(StringRef Path,
+                               SmallVectorImpl<std::string> &DBDirs) {
+  if (Error E = getAllDBDirs(Path, DBDirs, /*IncludeCorrupt=*/true))
+    return E;
+
+  // FIXME: When the version of \p DBDirPrefix is bumped up we need to figure
+  // out how to handle the leftover sub-directories of the previous version.
+
+  // Drop up to two entries from the end of the sorted list; they are the
+  // directories still in use. Corrupt directories sort first, so reaching one
+  // means there are no more versioned directories to keep.
+  for (unsigned Keep = 2; Keep > 0 && !DBDirs.empty(); --Keep) {
+    StringRef Back(DBDirs.back());
+    if (Back.starts_with(CorruptPrefix))
+      break;
+    DBDirs.pop_back();
+  }
+  return Error::success();
+}
+
+/// Given a sub-directory named 'v<version>.<x>', it outputs the
+/// 'v<version>.<x+1>' name.
+static void getNextDBDirName(StringRef DBDir, llvm::raw_ostream &OS) {
+  assert(DBDir.starts_with(DBDirPrefix));
+  uint64_t Count;
+  bool Failed = DBDir.substr(DBDirPrefix.size()).getAsInteger(10, Count);
+  assert(!Failed);
+  (void)Failed;
+  OS << DBDirPrefix << Count + 1;
+}
+
+/// Validate the CAS at \p RootPath by running \p LLVMCasBinary as a child
+/// process. A non-zero exit is reported as an error that includes whatever the
+/// child wrote to stderr.
+static Error validateOutOfProcess(StringRef LLVMCasBinary, StringRef RootPath,
+                                  bool CheckHash) {
+  SmallVector<StringRef> Args{LLVMCasBinary, "-cas", RootPath, "-validate"};
+  if (CheckHash)
+    Args.push_back("-check-hash");
+
+  // The child's stderr is captured in a temporary file that is removed when
+  // this function returns.
+  llvm::SmallString<128> StdErrPath;
+  int StdErrFD = -1;
+  if (std::error_code EC = sys::fs::createTemporaryFile(
+          "llvm-cas-validate-stderr", "txt", StdErrFD, StdErrPath,
+          llvm::sys::fs::OF_Text))
+    return createStringError(EC, "failed to create temporary file");
+  FileRemover OutputRemover(StdErrPath.c_str());
+
+  std::optional<llvm::StringRef> Redirects[] = {
+      {""}, // stdin = /dev/null
+      {""}, // stdout = /dev/null
+      StdErrPath.str(),
+  };
+
+  std::string ErrMsg;
+  int Result =
+      sys::ExecuteAndWait(LLVMCasBinary, Args, /*Env=*/std::nullopt, Redirects,
+                          /*SecondsToWait=*/120, /*MemoryLimit=*/0, &ErrMsg);
+
+  if (Result == -1)
+    return createStringError("failed to exec " + join(Args, " ") + ": " +
+                             ErrMsg);
+  if (Result != 0) {
+    llvm::SmallString<64> Err("cas contents invalid");
+    if (!ErrMsg.empty()) {
+      Err += ": ";
+      Err += ErrMsg;
+    }
+    auto StdErrBuf = MemoryBuffer::getFile(StdErrPath.c_str());
+    if (StdErrBuf && !(*StdErrBuf)->getBuffer().empty()) {
+      Err += ": ";
+      Err += (*StdErrBuf)->getBuffer();
+    }
+    return createStringError(Err);
+  }
+  return Error::success();
+}
+
+/// Validate the CAS at \p RootPath in the current process: open it, validate
+/// the object store and then the action cache.
+static Error validateInProcess(StringRef RootPath, StringRef HashName,
+                               unsigned HashByteSize, bool CheckHash) {
+  std::shared_ptr<UnifiedOnDiskCache> UniDB;
+  if (Error E = UnifiedOnDiskCache::open(RootPath, std::nullopt, HashName,
+                                         HashByteSize)
+                    .moveInto(UniDB))
+    return E;
+  auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB);
+  if (Error E = CAS->validate(CheckHash))
+    return E;
+  if (Error E = UniDB->validateActionCache())
+    return E;
+  return Error::success();
+}
+
+/// \returns a value that identifies the current boot of the system. It is only
+/// ever compared for equality with the value recorded by a previous run.
+static Expected<uint64_t> getBootTime() {
+#if __has_include(<sys/sysctl.h>) && defined(KERN_BOOTTIME)
+  struct timeval TV;
+  size_t TVLen = sizeof(TV);
+  int KernBoot[2] = {CTL_KERN, KERN_BOOTTIME};
+  if (sysctl(KernBoot, 2, &TV, &TVLen, nullptr, 0) < 0)
+    return createStringError(llvm::errnoAsErrorCode(),
+                             "failed to get boottime");
+  if (TVLen != sizeof(TV))
+    return createStringError("sysctl kern.boottime unexpected format");
+  return TV.tv_sec;
+#elif defined(__linux__)
+  // Use the mtime for /proc, which is recreated during system boot.
+  // We could also read /proc/stat and search for 'btime'.
+  sys::fs::file_status Status;
+  if (std::error_code EC = sys::fs::status("/proc", Status))
+    return createFileError("/proc", EC);
+  return Status.getLastModificationTime().time_since_epoch().count();
+#else
+  llvm::report_fatal_error("unimplemented");
+#endif
+}
+
+/// Validate the CAS at \p RootPath unless it was already validated since the
+/// current boot (or \p ForceValidation is set).
+///
+/// The boot time of the last validation is kept in the validation file inside
+/// \p RootPath, which is held under an exclusive lock for the duration of the
+/// call. If validation fails and \p AllowRecovery is set, every database
+/// directory is renamed with the corrupt prefix so that it is picked up by
+/// garbage collection; this requires that no other user holds the "lock" file.
+///
+/// \returns \c Skipped, \c Valid or \c Recovered on success.
+Expected<ValidationResult>
+UnifiedOnDiskCache::validateIfNeeded(StringRef RootPath, StringRef HashName,
+                                     unsigned HashByteSize, bool CheckHash,
+                                     bool AllowRecovery, bool ForceValidation,
+                                     std::optional<StringRef> LLVMCasBinary) {
+  if (std::error_code EC = sys::fs::create_directories(RootPath))
+    return createFileError(RootPath, EC);
+
+  SmallString<256> PathBuf(RootPath);
+  sys::path::append(PathBuf, ValidationFilename);
+  int FD = -1;
+  if (std::error_code EC = sys::fs::openFileForReadWrite(
+          PathBuf, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+    return createFileError(PathBuf, EC);
+  assert(FD != -1);
+
+  sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD);
+  auto CloseFile = make_scope_exit([&]() { sys::fs::closeFile(File); });
+
+  if (std::error_code EC = lockFileThreadSafe(FD, sys::fs::LockKind::Exclusive))
+    return createFileError(PathBuf, EC);
+  auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(FD); });
+
+  SmallString<8> Bytes;
+  if (Error E = sys::fs::readNativeFileToEOF(File, Bytes))
+    return createFileError(PathBuf, std::move(E));
+
+  // An empty file means no validation was recorded yet.
+  uint64_t ValidationBootTime = 0;
+  if (!Bytes.empty() &&
+      StringRef(Bytes).trim().getAsInteger(10, ValidationBootTime))
+    return createFileError(PathBuf, errc::illegal_byte_sequence,
+                           "expected integer");
+
+  // Cached for the lifetime of the process.
+  // NOTE(review): the cache is an unsynchronized function-local static;
+  // confirm callers do not race on the first call.
+  static uint64_t BootTime = 0;
+  if (BootTime == 0)
+    if (Error E = getBootTime().moveInto(BootTime))
+      return std::move(E);
+
+  if (ValidationBootTime == BootTime && !ForceValidation)
+    return ValidationResult::Skipped;
+
+  // Validate!
+  bool NeedsRecovery = false;
+  Error E =
+      LLVMCasBinary
+          ? validateOutOfProcess(*LLVMCasBinary, RootPath, CheckHash)
+          : validateInProcess(RootPath, HashName, HashByteSize, CheckHash);
+  if (E) {
+    if (AllowRecovery) {
+      consumeError(std::move(E));
+      NeedsRecovery = true;
+    } else {
+      return std::move(E);
+    }
+  }
+
+  if (NeedsRecovery) {
+    sys::path::remove_filename(PathBuf);
+    sys::path::append(PathBuf, "lock");
+
+    int LockFD = -1;
+    if (std::error_code EC = sys::fs::openFileForReadWrite(
+            PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+      return createFileError(PathBuf, EC);
+    sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+    auto CloseLock = make_scope_exit([&]() { sys::fs::closeFile(LockFile); });
+    if (std::error_code EC = tryLockFileThreadSafe(LockFD)) {
+      if (EC == std::errc::no_lock_available)
+        return createFileError(
+            PathBuf, EC,
+            "CAS validation requires exclusive access but CAS was in use");
+      return createFileError(PathBuf, EC);
+    }
+    auto UnlockDirLock =
+        make_scope_exit([&]() { unlockFileThreadSafe(LockFD); });
+
+    SmallVector<std::string, 4> DBDirs;
+    if (Error DirsErr = getAllDBDirs(RootPath, DBDirs))
+      return std::move(DirsErr);
+
+    // Move every database directory out of the way under a fresh
+    // '<CorruptPrefix><attempt>.<name>' name.
+    for (StringRef DBDir : DBDirs) {
+      sys::path::remove_filename(PathBuf);
+      sys::path::append(PathBuf, DBDir);
+      std::error_code EC;
+      int Attempt = 0, MaxAttempts = 100;
+      SmallString<128> GCPath;
+      for (; Attempt < MaxAttempts; ++Attempt) {
+        GCPath.assign(RootPath);
+        sys::path::append(GCPath, CorruptPrefix + std::to_string(Attempt) +
+                                      "." + DBDir);
+        EC = sys::fs::rename(PathBuf, GCPath);
+        // Darwin uses ENOTEMPTY. Linux may return either ENOTEMPTY or EEXIST.
+        if (EC != errc::directory_not_empty && EC != errc::file_exists)
+          break;
+      }
+      if (Attempt == MaxAttempts)
+        return createStringError(
+            EC, "rename " + PathBuf +
+                    " failed: too many CAS directories awaiting pruning");
+      if (EC)
+        return createStringError(EC, "rename " + PathBuf + " to " + GCPath +
+                                         " failed: " + EC.message());
+    }
+  }
+
+  if (ValidationBootTime != BootTime) {
+    // Fix filename in case we have error to report.
+    sys::path::remove_filename(PathBuf);
+    sys::path::append(PathBuf, ValidationFilename);
+    if (std::error_code EC = sys::fs::resize_file(FD, 0))
+      return createFileError(PathBuf, EC);
+    raw_fd_ostream OS(FD, /*shouldClose=*/false);
+    OS.seek(0); // resize does not reset position
+    OS << BootTime << '\n';
+    if (OS.has_error())
+      return createFileError(PathBuf, OS.error());
+  }
+
+  return NeedsRecovery ? ValidationResult::Recovered : ValidationResult::Valid;
+}
+
+/// Open the unified cache rooted at \p RootPath, creating it if needed.
+///
+/// A shared lock on the "lock" file inside \p RootPath is taken and kept by
+/// the returned instance until it is closed. On failure the lock file is
+/// unlocked and closed again before the error is returned.
+Expected<std::unique_ptr<UnifiedOnDiskCache>>
+UnifiedOnDiskCache::open(StringRef RootPath, std::optional<uint64_t> SizeLimit,
+                         StringRef HashName, unsigned HashByteSize,
+                         OnDiskGraphDB::FaultInPolicy FaultInPolicy) {
+  if (std::error_code EC = sys::fs::create_directories(RootPath))
+    return createFileError(RootPath, EC);
+
+  SmallString<256> PathBuf(RootPath);
+  sys::path::append(PathBuf, "lock");
+  int LockFD = -1;
+  if (std::error_code EC = sys::fs::openFileForReadWrite(
+          PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None))
+    return createFileError(PathBuf, EC);
+  assert(LockFD != -1);
+
+  // Until ownership of \c LockFD moves into the returned instance, make sure
+  // that every error return releases the lock and the descriptor.
+  bool Locked = false;
+  auto ReleaseLockOnError = make_scope_exit([&]() {
+    if (Locked)
+      (void)unlockFileThreadSafe(LockFD);
+    sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+    sys::fs::closeFile(LockFile);
+  });
+
+  // Locking the directory using shared lock, which will prevent other processes
+  // from creating a new chain (essentially while a \p UnifiedOnDiskCache
+  // instance holds a shared lock the storage for the primary directory will
+  // grow unrestricted).
+  if (std::error_code EC =
+          lockFileThreadSafe(LockFD, sys::fs::LockKind::Shared))
+    return createFileError(PathBuf, EC);
+  Locked = true;
+
+  SmallVector<std::string, 4> DBDirs;
+  if (Error E = getAllDBDirs(RootPath, DBDirs))
+    return std::move(E);
+  if (DBDirs.empty())
+    DBDirs.push_back((Twine(DBDirPrefix) + "1").str());
+
+  assert(!DBDirs.empty());
+
+  /// If there is only one directory open databases on it. If there are 2 or
+  /// more directories, get the most recent directories and chain them, with the
+  /// most recent being the primary one. The remaining directories are unused
+  /// data that can be garbage-collected.
+  std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB;
+  std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB;
+  if (DBDirs.size() > 1) {
+    StringRef UpstreamDir = *(DBDirs.end() - 2);
+    PathBuf = RootPath;
+    sys::path::append(PathBuf, UpstreamDir);
+    if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize,
+                                      /*UpstreamDB=*/nullptr, FaultInPolicy)
+                      .moveInto(UpstreamGraphDB))
+      return std::move(E);
+    if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize,
+                                         /*ValueName=*/"objectid",
+                                         /*ValueSize=*/sizeof(uint64_t))
+                      .moveInto(UpstreamKVDB))
+      return std::move(E);
+  }
+  // Keep a non-owning pointer; ownership of the upstream graph database moves
+  // into the primary one below.
+  OnDiskGraphDB *UpstreamGraphDBPtr = UpstreamGraphDB.get();
+
+  StringRef PrimaryDir = *(DBDirs.end() - 1);
+  PathBuf = RootPath;
+  sys::path::append(PathBuf, PrimaryDir);
+  std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB;
+  if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize,
+                                    std::move(UpstreamGraphDB), FaultInPolicy)
+                    .moveInto(PrimaryGraphDB))
+    return std::move(E);
+  std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB;
+  // \p UnifiedOnDiskCache does manual chaining for key-value requests,
+  // including an extra translation step of the value during fault-in.
+  if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize,
+                                       /*ValueName=*/"objectid",
+                                       /*ValueSize=*/sizeof(uint64_t))
+                    .moveInto(PrimaryKVDB))
+    return std::move(E);
+
+  auto UniDB = std::unique_ptr<UnifiedOnDiskCache>(new UnifiedOnDiskCache());
+  UniDB->RootPath = RootPath;
+  UniDB->SizeLimit = SizeLimit.value_or(0);
+  UniDB->LockFD = LockFD;
+  // From here on the instance unlocks and closes \c LockFD in close().
+  ReleaseLockOnError.release();
+  UniDB->NeedsGarbageCollection = DBDirs.size() > 2;
+  UniDB->PrimaryDBDir = PrimaryDir;
+  UniDB->UpstreamGraphDB = UpstreamGraphDBPtr;
+  UniDB->PrimaryGraphDB = std::move(PrimaryGraphDB);
+  UniDB->UpstreamKVDB = std::move(UpstreamKVDB);
+  UniDB->PrimaryKVDB = std::move(PrimaryKVDB);
+
+  return std::move(UniDB);
+}
+
+/// Set the size limit; \c std::nullopt is stored as 0, which
+/// \c hasExceededSizeLimit treats as "no limit".
+void UnifiedOnDiskCache::setSizeLimit(std::optional<uint64_t> SizeLimit) {
+  this->SizeLimit = SizeLimit.value_or(0);
+}
+
+/// \returns the combined storage size of the primary and upstream databases.
+uint64_t UnifiedOnDiskCache::getStorageSize() const {
+  uint64_t TotalSize = getPrimaryStorageSize();
+  if (UpstreamGraphDB)
+    TotalSize += UpstreamGraphDB->getStorageSize();
+  if (UpstreamKVDB)
+    TotalSize += UpstreamKVDB->getStorageSize();
+  return TotalSize;
+}
+
+uint64_t UnifiedOnDiskCache::getPrimaryStorageSize() const {
+  return PrimaryGraphDB->getStorageSize() + PrimaryKVDB->getStorageSize();
+}
+
+bool UnifiedOnDiskCache::hasExceededSizeLimit() const {
+  uint64_t CurSizeLimit = SizeLimit;
+  if (!CurSizeLimit)
+    return false;
+
+  // If the hard limit is beyond 85%, declare above limit and request clean up.
+  unsigned CurrentPercent =
+      std::max(PrimaryGraphDB->getHardStorageLimitUtilization(),
+               PrimaryKVDB->getHardStorageLimitUtilization());
+  if (CurrentPercent > 85)
+    return true;
+
+  // We allow each of the directories in the chain to reach up to half the
+  // intended size limit. Check whether the primary directory has exceeded half
+  // the limit or not, in order to decide whether we need to start a new chain.
+  //
+  // We could check the size limit against the sum of sizes of both the primary
+  // and upstream directories but then if the upstream is significantly larger
+  // than the intended limit, it would trigger a new chain to be created before
+  // the primary has reached its own limit. Essentially in such situation we
+  // prefer reclaiming the storage later in order to have more consistent cache
+  // hits behavior.
+  return (CurSizeLimit / 2) < getPrimaryStorageSize();
+}
+
+/// Close the databases and release the shared lock. When \p CheckSizeLimit is
+/// set and the primary directory exceeded its limit, additionally try to
+/// create the next primary directory under an exclusive lock.
+Error UnifiedOnDiskCache::close(bool CheckSizeLimit) {
+  if (LockFD == -1)
+    return Error::success(); // already closed.
+  auto _1 = make_scope_exit([&]() {
+    assert(LockFD >= 0);
+    sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD);
+    sys::fs::closeFile(LockFile);
+    LockFD = -1;
+  });
+
+  // The size has to be sampled before the databases are released.
+  bool ExceededSizeLimit = CheckSizeLimit ? hasExceededSizeLimit() : false;
+  PrimaryKVDB.reset();
+  UpstreamKVDB.reset();
+  PrimaryGraphDB.reset();
+  UpstreamGraphDB = nullptr;
+  if (std::error_code EC = unlockFileThreadSafe(LockFD))
+    return createFileError(RootPath, EC);
+
+  if (!ExceededSizeLimit)
+    return Error::success();
+
+  // The primary directory exceeded its intended size limit. Try to get an
+  // exclusive lock in order to create a new primary directory for next time
+  // this \p UnifiedOnDiskCache path is opened.
+
+  if (std::error_code EC = tryLockFileThreadSafe(
+          LockFD, std::chrono::milliseconds(0), sys::fs::LockKind::Exclusive)) {
+    if (EC == errc::no_lock_available)
+      return Error::success(); // couldn't get exclusive lock, give up.
+    return createFileError(RootPath, EC);
+  }
+  auto _2 = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); });
+
+  // Managed to get an exclusive lock which means there are no other open
+  // \p UnifiedOnDiskCache instances for the same path, so we can safely start a
+  // new primary directory. To start a new primary directory we just have to
+  // create a new empty directory with the next consecutive index; since this is
+  // an atomic operation we will leave the top-level directory in a consistent
+  // state even if the process dies during this code-path.
+
+  SmallString<256> PathBuf(RootPath);
+  raw_svector_ostream OS(PathBuf);
+  OS << sys::path::get_separator();
+  getNextDBDirName(PrimaryDBDir, OS);
+  if (std::error_code EC = sys::fs::create_directory(PathBuf))
+    return createFileError(PathBuf, EC);
+
+  NeedsGarbageCollection = true;
+  return Error::success();
+}
+
+UnifiedOnDiskCache::UnifiedOnDiskCache() = default;
+
+// Errors from closing cannot be reported from a destructor; they are dropped.
+UnifiedOnDiskCache::~UnifiedOnDiskCache() { consumeError(close()); }
+
+/// Delete every unused sub-directory of \p Path, stopping at the first
+/// directory that fails to be removed.
+Error UnifiedOnDiskCache::collectGarbage(StringRef Path) {
+  SmallVector<std::string, 4> DBDirs;
+  if (Error E = getAllGarbageDirs(Path, DBDirs))
+    return E;
+
+  SmallString<256> PathBuf(Path);
+  for (StringRef UnusedSubDir : DBDirs) {
+    sys::path::append(PathBuf, UnusedSubDir);
+    if (std::error_code EC = sys::fs::remove_directories(PathBuf))
+      return createFileError(PathBuf, EC);
+    sys::path::remove_filename(PathBuf);
+  }
+  return Error::success();
+}
+
+Error UnifiedOnDiskCache::collectGarbage() { return collectGarbage(RootPath); }
diff --git a/llvm/unittests/CAS/ActionCacheTest.cpp b/llvm/unittests/CAS/ActionCacheTest.cpp
index db67e30ca203b..692da230b6e09 100644
--- a/llvm/unittests/CAS/ActionCacheTest.cpp
+++ b/llvm/unittests/CAS/ActionCacheTest.cpp
@@ -21,7 +21,7 @@ using namespace llvm;
 using namespace llvm::cas;
 
 TEST_P(CASTest, ActionCacheHit) {
-  std::shared_ptr CAS = createObjectStore();
+  std::unique_ptr CAS = createObjectStore();
   std::unique_ptr Cache = createActionCache();
 
   std::optional ID;
@@ -36,7 +36,7 @@ TEST_P(CASTest, ActionCacheHit) {
 }
 
 TEST_P(CASTest, ActionCacheMiss) {
-  std::shared_ptr CAS = createObjectStore();
+  std::unique_ptr CAS = createObjectStore();
   std::unique_ptr Cache = createActionCache();
 
   std::optional ID1, ID2;
@@ -59,7 +59,7 @@ TEST_P(CASTest,
ActionCacheMiss) { } TEST_P(CASTest, ActionCacheRewrite) { - std::shared_ptr CAS = createObjectStore(); + std::unique_ptr CAS = createObjectStore(); std::unique_ptr Cache = createActionCache(); std::optional ID1, ID2; diff --git a/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp b/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp new file mode 100644 index 0000000000000..19522e9372d85 --- /dev/null +++ b/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "CASTestConfig.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::cas; + +TEST_F(OnDiskCASTest, UnifiedCASMaterializationCheckPreventsGarbageCollection) { + unittest::TempDir Temp("on-disk-unified-cas", /*Unique=*/true); + + auto WithCAS = [&](llvm::function_ref Action) { + std::pair, std::unique_ptr> DBs; + ASSERT_THAT_ERROR( + createOnDiskUnifiedCASDatabases(Temp.path()).moveInto(DBs), + Succeeded()); + ObjectStore &CAS = *DBs.first; + ASSERT_THAT_ERROR(CAS.setSizeLimit(1), Succeeded()); + Action(CAS); + }; + + std::optional ID; + + // Create an object in the CAS. + WithCAS([&ID](ObjectStore &CAS) { + std::optional Ref; + ASSERT_THAT_ERROR(CAS.store({}, "blah").moveInto(Ref), Succeeded()); + ASSERT_TRUE(Ref.has_value()); + + ID = CAS.getID(*Ref); + }); + + // Check materialization and prune the storage. 
+ WithCAS([&ID](ObjectStore &CAS) { + std::optional Ref = CAS.getReference(*ID); + ASSERT_TRUE(Ref.has_value()); + + std::optional IsMaterialized; + ASSERT_THAT_ERROR(CAS.isMaterialized(*Ref).moveInto(IsMaterialized), + Succeeded()); + ASSERT_TRUE(IsMaterialized); + + ASSERT_THAT_ERROR(CAS.pruneStorageData(), Succeeded()); + }); + + // Verify that the previous materialization check kept the object in the CAS. + WithCAS([&ID](ObjectStore &CAS) { + std::optional Ref = CAS.getReference(*ID); + ASSERT_TRUE(Ref.has_value()); + + std::optional IsMaterialized; + ASSERT_THAT_ERROR(CAS.isMaterialized(*Ref).moveInto(IsMaterialized), + Succeeded()); + ASSERT_TRUE(IsMaterialized); + }); +} diff --git a/llvm/unittests/CAS/CASTestConfig.cpp b/llvm/unittests/CAS/CASTestConfig.cpp index 91d0970367ac3..dc1578a50541c 100644 --- a/llvm/unittests/CAS/CASTestConfig.cpp +++ b/llvm/unittests/CAS/CASTestConfig.cpp @@ -8,13 +8,20 @@ #include "CASTestConfig.h" #include "llvm/CAS/ObjectStore.h" +#include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" using namespace llvm; using namespace llvm::cas; +namespace llvm::unittest::cas { +void MockEnv::anchor() {} +MockEnv::~MockEnv() {} +} // namespace llvm::unittest::cas + static CASTestingEnv createInMemory(int I) { - return CASTestingEnv{createInMemoryCAS(), createInMemoryActionCache()}; + return CASTestingEnv{createInMemoryCAS(), createInMemoryActionCache(), + nullptr, std::nullopt}; } INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest, @@ -22,7 +29,7 @@ INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest, #if LLVM_ENABLE_ONDISK_CAS namespace llvm::cas::ondisk { -extern void setMaxMappingSize(uint64_t Size); +void setMaxMappingSize(uint64_t Size); } // namespace llvm::cas::ondisk void setMaxOnDiskCASMappingSize() { @@ -30,6 +37,18 @@ void setMaxOnDiskCASMappingSize() { std::call_once( Flag, [] { llvm::cas::ondisk::setMaxMappingSize(100 * 1024 * 1024); }); } + +CASTestingEnv createOnDisk(int I) { + unittest::TempDir Temp("on-disk-cas", 
/*Unique=*/true); + std::unique_ptr CAS; + EXPECT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + std::unique_ptr Cache; + EXPECT_THAT_ERROR(createOnDiskActionCache(Temp.path()).moveInto(Cache), + Succeeded()); + return CASTestingEnv{std::move(CAS), std::move(Cache), nullptr, + std::move(Temp)}; +} +INSTANTIATE_TEST_SUITE_P(OnDiskCAS, CASTest, ::testing::Values(createOnDisk)); #else void setMaxOnDiskCASMappingSize() {} #endif /* LLVM_ENABLE_ONDISK_CAS */ diff --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h index c08968b95b9cc..27033c93d57bb 100644 --- a/llvm/unittests/CAS/CASTestConfig.h +++ b/llvm/unittests/CAS/CASTestConfig.h @@ -6,16 +6,29 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_UNITTESTS_CASTESTCONFIG_H +#define LLVM_UNITTESTS_CASTESTCONFIG_H + #include "llvm/CAS/ActionCache.h" #include "llvm/CAS/ObjectStore.h" +#include "llvm/Testing/Support/SupportHelpers.h" #include "gtest/gtest.h" +#include -#ifndef LLVM_UNITTESTS_CASTESTCONFIG_H -#define LLVM_UNITTESTS_CASTESTCONFIG_H +namespace llvm::unittest::cas { +class MockEnv { + void anchor(); + +public: + virtual ~MockEnv(); +}; +} // namespace llvm::unittest::cas struct CASTestingEnv { std::unique_ptr CAS; std::unique_ptr Cache; + std::unique_ptr Env; + std::optional Temp; }; void setMaxOnDiskCASMappingSize(); @@ -24,26 +37,49 @@ void setMaxOnDiskCASMappingSize(); class OnDiskCASTest : public ::testing::Test { protected: void SetUp() override { +#if !LLVM_ENABLE_ONDISK_CAS + GTEST_SKIP() << "OnDiskCAS is not enabled"; +#endif // Use a smaller database size for testing to conserve disk space. setMaxOnDiskCASMappingSize(); } }; +// Parameterized test fixture for ObjectStore and ActionCache tests. 
class CASTest : public testing::TestWithParam> { protected: std::optional NextCASIndex; + llvm::SmallVector Dirs; + + llvm::SmallVector> Envs; + std::unique_ptr createObjectStore() { auto TD = GetParam()(++(*NextCASIndex)); + if (TD.Temp) + Dirs.push_back(std::move(*TD.Temp)); + if (TD.Env) + Envs.emplace_back(std::move(TD.Env)); return std::move(TD.CAS); } std::unique_ptr createActionCache() { auto TD = GetParam()(++(*NextCASIndex)); + if (TD.Temp) + Dirs.push_back(std::move(*TD.Temp)); + if (TD.Env) + Envs.emplace_back(std::move(TD.Env)); return std::move(TD.Cache); } - void SetUp() { NextCASIndex = 0; } - void TearDown() { NextCASIndex = std::nullopt; } + void SetUp() { + NextCASIndex = 0; + setMaxOnDiskCASMappingSize(); + } + void TearDown() { + NextCASIndex = std::nullopt; + Dirs.clear(); + Envs.clear(); + } }; #endif diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt index da469f7fccb5a..91e49be770745 100644 --- a/llvm/unittests/CAS/CMakeLists.txt +++ b/llvm/unittests/CAS/CMakeLists.txt @@ -1,9 +1,11 @@ set(ONDISK_CAS_TEST_SOURCES + BuiltinUnifiedCASDatabasesTest.cpp OnDiskGraphDBTest.cpp OnDiskDataAllocatorTest.cpp OnDiskKeyValueDBTest.cpp OnDiskTrieRawHashMapTest.cpp ProgramTest.cpp + UnifiedOnDiskCacheTest.cpp ) set(LLVM_OPTIONAL_SOURCES diff --git a/llvm/unittests/CAS/ObjectStoreTest.cpp b/llvm/unittests/CAS/ObjectStoreTest.cpp index 54083fdb408f6..b43ae33d74127 100644 --- a/llvm/unittests/CAS/ObjectStoreTest.cpp +++ b/llvm/unittests/CAS/ObjectStoreTest.cpp @@ -1,4 +1,4 @@ -//===- ObjectStoreTest.cpp ------------------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -76,7 +76,7 @@ multiline text multiline text multiline text multiline text multiline text)", // Run validation on all CASIDs. 
for (int I = 0, E = IDs.size(); I != E; ++I) - ASSERT_THAT_ERROR(CAS1->validate(IDs[I]), Succeeded()); + ASSERT_THAT_ERROR(CAS1->validateObject(IDs[I]), Succeeded()); // Check that the blobs can be retrieved multiple times. for (int I = 0, E = IDs.size(); I != E; ++I) { @@ -120,15 +120,15 @@ TEST_P(CASTest, BlobsBig) { std::optional ID2; ASSERT_THAT_ERROR(CAS->createProxy({}, String1).moveInto(ID1), Succeeded()); ASSERT_THAT_ERROR(CAS->createProxy({}, String1).moveInto(ID2), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID2), Succeeded()); ASSERT_EQ(ID1, ID2); String1.append(String2); ASSERT_THAT_ERROR(CAS->createProxy({}, String2).moveInto(ID1), Succeeded()); ASSERT_THAT_ERROR(CAS->createProxy({}, String2).moveInto(ID2), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID2), Succeeded()); ASSERT_EQ(ID1, ID2); String2.append(String1); } @@ -176,10 +176,11 @@ multiline text multiline text multiline text multiline text multiline text)", // Check basic printing of IDs. 
IDs.push_back(CAS1->getID(*Node)); - auto ID = CAS1->getID(Nodes.back()); - EXPECT_EQ(ID.toString(), IDs.back().toString()); - EXPECT_EQ(*Node, Nodes.back()); - EXPECT_EQ(ID, IDs.back()); + EXPECT_EQ(IDs.back().toString(), IDs.back().toString()); + EXPECT_EQ(Nodes.front(), Nodes.front()); + EXPECT_EQ(Nodes.back(), Nodes.back()); + EXPECT_EQ(IDs.front(), IDs.front()); + EXPECT_EQ(IDs.back(), IDs.back()); if (Nodes.size() <= 1) continue; EXPECT_NE(Nodes.front(), Nodes.back()); @@ -266,7 +267,7 @@ TEST_P(CASTest, NodesBig) { } for (auto ID : CreatedNodes) - ASSERT_THAT_ERROR(CAS->validate(CAS->getID(ID)), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(CAS->getID(ID)), Succeeded()); } #if LLVM_ENABLE_THREADS @@ -332,17 +333,124 @@ static void testBlobsParallel1(ObjectStore &CAS, uint64_t BlobSize) { } TEST_P(CASTest, BlobsParallel) { - std::shared_ptr CAS = createObjectStore(); + std::unique_ptr CAS = createObjectStore(); uint64_t Size = 1ULL * 1024; ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size)); } #ifdef EXPENSIVE_CHECKS TEST_P(CASTest, BlobsBigParallel) { - std::shared_ptr CAS = createObjectStore(); + std::unique_ptr CAS = createObjectStore(); // 100k is large enough to be standalone files in our on-disk cas. uint64_t Size = 100ULL * 1024; ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size)); } #endif // EXPENSIVE_CHECKS + +#ifndef _WIN32 // create_link won't work for directories on Windows +TEST_F(OnDiskCASTest, OnDiskCASBlobsParallelMultiCAS) { + // This test intentionally uses symlinked paths to the same CAS to subvert the + // shared memory mappings that would normally be created within a single + // process. This breaks the lock file guarantees, so we must be careful not + // to create or destroy the CAS objects concurrently, which is when the locks + // are normally important. 
+ unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")), + std::error_code()); + + std::unique_ptr CAS1, CAS2, CAS3, CAS4; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4), + Succeeded()); + + uint64_t Size = 1ULL * 1024; + ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size)); +} + +TEST_F(OnDiskCASTest, OnDiskCASBlobsBigParallelMultiCAS) { + // See comment in BlobsParallelMultiCAS. + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")), + std::error_code()); + + std::unique_ptr CAS1, CAS2, CAS3, CAS4; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4), + Succeeded()); + + // 100k is large enough to be standalone files in our on-disk cas. 
+ uint64_t Size = 100ULL * 1024; + ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size)); +} +#endif // _WIN32 #endif // LLVM_ENABLE_THREADS + +TEST_F(OnDiskCASTest, OnDiskCASDiskSize) { + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + std::unique_ptr CAS; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + + uint64_t MaxSize = 100 * 1024 * 1024; + + // Check that we map the files to the correct size. + auto CheckFileSizes = [&](bool Mapped) { + bool FoundIndex = false, FoundData = false; + std::error_code EC; + for (sys::fs::directory_iterator I(Temp.path(), EC), E; I != E && !EC; + I.increment(EC)) { + StringRef Filename = sys::path::filename(I->path()); + if (Filename.starts_with("index.") && !Filename.ends_with(".shared")) { + FoundIndex = true; + ASSERT_TRUE(I->status()); + if (Mapped) + EXPECT_EQ(I->status()->getSize(), MaxSize); + else + EXPECT_LT(I->status()->getSize(), MaxSize); + } + if (Filename.starts_with("data.") && !Filename.ends_with(".shared")) { + FoundData = true; + ASSERT_TRUE(I->status()); + if (Mapped) + EXPECT_EQ(I->status()->getSize(), MaxSize); + else + EXPECT_LT(I->status()->getSize(), MaxSize); + } + } + ASSERT_TRUE(FoundIndex); + ASSERT_TRUE(FoundData); + }; + + // Check that we have the full mapping size when the CAS is open. + CheckFileSizes(/*Mapped=*/true); + CAS.reset(); + // Check that the CAS is shrunk to a smaller size. + CheckFileSizes(/*Mapped=*/false); + + // Repeat the checks when starting from an existing CAS. 
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + CheckFileSizes(/*Mapped=*/true); + CAS.reset(); + CheckFileSizes(/*Mapped=*/false); +} diff --git a/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp new file mode 100644 index 0000000000000..e25288a26eb92 --- /dev/null +++ b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp @@ -0,0 +1,191 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "CASTestConfig.h" +#include "OnDiskCommonUtils.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; +using namespace llvm::unittest::cas; + +/// Visits all the files of a directory recursively and returns the sum of their +/// sizes. 
+static Expected countFileSizes(StringRef Path) { + size_t TotalSize = 0; + std::error_code EC; + for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE; + DirI.increment(EC)) { + if (DirI->type() == sys::fs::file_type::directory_file) { + Expected Subsize = countFileSizes(DirI->path()); + if (!Subsize) + return Subsize.takeError(); + TotalSize += *Subsize; + continue; + } + ErrorOr Stat = DirI->status(); + if (!Stat) + return createFileError(DirI->path(), Stat.getError()); + TotalSize += Stat->getSize(); + } + if (EC) + return createFileError(Path, EC); + return TotalSize; +} + +TEST_F(OnDiskCASTest, UnifiedOnDiskCacheTest) { + unittest::TempDir Temp("ondisk-unified", /*Unique=*/true); + std::unique_ptr UniDB; + + const uint64_t SizeLimit = 1024ull * 64; + auto reopenDB = [&]() { + UniDB.reset(); + ASSERT_THAT_ERROR(UnifiedOnDiskCache::open(Temp.path(), SizeLimit, "blake3", + sizeof(HashType)) + .moveInto(UniDB), + Succeeded()); + }; + + reopenDB(); + + HashType RootHash; + HashType OtherHash; + HashType Key1Hash; + HashType Key2Hash; + { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional ID1; + ASSERT_THAT_ERROR(store(DB, "1", {}).moveInto(ID1), Succeeded()); + std::optional ID2; + ASSERT_THAT_ERROR(store(DB, "2", {}).moveInto(ID2), Succeeded()); + std::optional IDRoot; + ASSERT_THAT_ERROR(store(DB, "root", {*ID1, *ID2}).moveInto(IDRoot), + Succeeded()); + ArrayRef Digest = DB.getDigest(*IDRoot); + ASSERT_EQ(Digest.size(), RootHash.size()); + llvm::copy(Digest, RootHash.data()); + + std::optional IDOther; + ASSERT_THAT_ERROR(store(DB, "other", {}).moveInto(IDOther), Succeeded()); + Digest = DB.getDigest(*IDOther); + ASSERT_EQ(Digest.size(), OtherHash.size()); + llvm::copy(Digest, OtherHash.data()); + + Key1Hash = digest("key1"); + std::optional Val; + ASSERT_THAT_ERROR(UniDB->KVPut(Key1Hash, *IDRoot).moveInto(Val), + Succeeded()); + EXPECT_EQ(IDRoot, Val); + + Key2Hash = digest("key2"); + std::optional KeyID; + 
ASSERT_THAT_ERROR(DB.getReference(Key2Hash).moveInto(KeyID), Succeeded()); + ASSERT_THAT_ERROR(UniDB->KVPut(*KeyID, *ID1).moveInto(Val), Succeeded()); + } + + auto checkTree = [&](const HashType &Digest, StringRef ExpectedTree) { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional ID; + ASSERT_THAT_ERROR(DB.getReference(Digest).moveInto(ID), Succeeded()); + std::string PrintedTree; + raw_string_ostream OS(PrintedTree); + ASSERT_THAT_ERROR(printTree(DB, *ID, OS), Succeeded()); + EXPECT_EQ(PrintedTree, ExpectedTree); + }; + auto checkRootTree = [&]() { + return checkTree(RootHash, "root\n 1\n 2\n"); + }; + + auto checkKey = [&](const HashType &Key, StringRef ExpectedData) { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional Val; + ASSERT_THAT_ERROR(UniDB->KVGet(Key).moveInto(Val), Succeeded()); + ASSERT_TRUE(Val.has_value()); + std::optional Obj; + ASSERT_THAT_ERROR(DB.load(*Val).moveInto(Obj), Succeeded()); + EXPECT_EQ(toStringRef(DB.getObjectData(*Obj)), ExpectedData); + }; + + checkRootTree(); + checkTree(OtherHash, "other\n"); + checkKey(Key1Hash, "root"); + checkKey(Key2Hash, "1"); + + auto storeBigObject = [&](unsigned Index) { + SmallString<1000> Buf; + Buf.append(970, 'a'); + raw_svector_ostream(Buf) << Index; + std::optional ID; + ASSERT_THAT_ERROR(store(UniDB->getGraphDB(), Buf, {}).moveInto(ID), + Succeeded()); + }; + + uint64_t PrevStoreSize = UniDB->getStorageSize(); + unsigned Index = 0; + while (!UniDB->hasExceededSizeLimit()) { + storeBigObject(Index++); + } + EXPECT_GT(UniDB->getStorageSize(), PrevStoreSize); + UniDB->setSizeLimit(SizeLimit * 2); + EXPECT_FALSE(UniDB->hasExceededSizeLimit()); + UniDB->setSizeLimit(SizeLimit); + EXPECT_TRUE(UniDB->hasExceededSizeLimit()); + + reopenDB(); + + EXPECT_FALSE(UniDB->hasExceededSizeLimit()); + EXPECT_FALSE(UniDB->needsGarbageCollection()); + + checkRootTree(); + checkKey(Key1Hash, "root"); + + while (!UniDB->hasExceededSizeLimit()) { + storeBigObject(Index++); + } + PrevStoreSize = 
UniDB->getStorageSize(); + ASSERT_THAT_ERROR(UniDB->close(), Succeeded()); + EXPECT_TRUE(UniDB->needsGarbageCollection()); + + reopenDB(); + EXPECT_TRUE(UniDB->needsGarbageCollection()); + + std::optional DirSizeBefore; + ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeBefore), + Succeeded()); + + ASSERT_THAT_ERROR(UnifiedOnDiskCache::collectGarbage(Temp.path()), + Succeeded()); + + std::optional DirSizeAfter; + ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeAfter), + Succeeded()); + EXPECT_LT(*DirSizeAfter, *DirSizeBefore); + + reopenDB(); + EXPECT_FALSE(UniDB->needsGarbageCollection()); + + checkRootTree(); + checkKey(Key1Hash, "root"); + + EXPECT_LT(UniDB->getStorageSize(), PrevStoreSize); + + // 'Other' tree and 'Key2' got garbage-collected. + { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional ID; + ASSERT_THAT_ERROR(DB.getReference(OtherHash).moveInto(ID), Succeeded()); + EXPECT_FALSE(DB.containsObject(*ID)); + std::optional Val; + ASSERT_THAT_ERROR(UniDB->KVGet(Key2Hash).moveInto(Val), Succeeded()); + EXPECT_FALSE(Val.has_value()); + } +}