diff --git a/llvm/include/llvm/CAS/FileOffset.h b/llvm/include/llvm/CAS/FileOffset.h new file mode 100644 index 0000000000000..21d045e8c9d78 --- /dev/null +++ b/llvm/include/llvm/CAS/FileOffset.h @@ -0,0 +1,39 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file declares interface for FileOffset that represent stored data at an +/// offset from the beginning of a file. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_FILEOFFSET_H +#define LLVM_CAS_FILEOFFSET_H + +#include + +namespace llvm::cas { + +/// FileOffset is a wrapper around `uint64_t` to represent the offset of data +/// from the beginning of the file. +class FileOffset { +public: + uint64_t get() const { return Offset; } + + explicit operator bool() const { return Offset; } + + FileOffset() = default; + explicit FileOffset(uint64_t Offset) : Offset(Offset) {} + +private: + uint64_t Offset = 0; +}; + +} // namespace llvm::cas + +#endif // LLVM_CAS_FILEOFFSET_H diff --git a/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h b/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h new file mode 100644 index 0000000000000..5e41bf6ab571e --- /dev/null +++ b/llvm/include/llvm/CAS/OnDiskTrieRawHashMap.h @@ -0,0 +1,236 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file declares interface for OnDiskTrieRawHashMap, a thread-safe and +/// (mostly) lock-free hash map stored as trie and backed by persistent files on +/// disk. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_ONDISKTRIERAWHASHMAP_H +#define LLVM_CAS_ONDISKTRIERAWHASHMAP_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/FileOffset.h" +#include "llvm/Support/Error.h" +#include + +namespace llvm { + +class raw_ostream; + +namespace cas { + +/// OnDiskTrieRawHashMap is a persistent trie data structure used as hash maps. +/// The keys are fixed length, and are expected to be binary hashes with a +/// normal distribution. +/// +/// - Thread-safety is achieved through the use of atomics within a shared +/// memory mapping. Atomic access does not work on networked filesystems. +/// - Filesystem locks are used, but only sparingly: +/// - during initialization, for creating / opening an existing store; +/// - for the lifetime of the instance, a shared/reader lock is held +/// - during destruction, if there are no concurrent readers, to shrink the +/// files to their minimum size. +/// - Path is used as a directory: +/// - "index" stores the root trie and subtries. +/// - "data" stores (most of) the entries, like a bump-ptr-allocator. +/// - Large entries are stored externally in a file named by the key. +/// - Code is system-dependent and binary format itself is not portable. These +/// are not artifacts that can/should be moved between different systems; they +/// are only appropriate for local storage. +class OnDiskTrieRawHashMap { +public: + LLVM_DUMP_METHOD void dump() const; + void + print(raw_ostream &OS, + function_ref)> PrintRecordData = nullptr) const; + +public: + /// Const value proxy to access the records stored in TrieRawHashMap. + struct ConstValueProxy { + ConstValueProxy() = default; + ConstValueProxy(ArrayRef Hash, ArrayRef Data) + : Hash(Hash), Data(Data) {} + ConstValueProxy(ArrayRef Hash, StringRef Data) + : Hash(Hash), Data(Data.begin(), Data.size()) {} + + ArrayRef Hash; + ArrayRef Data; + }; + + /// Value proxy to access the records stored in TrieRawHashMap. + struct ValueProxy { + operator ConstValueProxy() const { return ConstValueProxy(Hash, Data); } + + ValueProxy() = default; + ValueProxy(ArrayRef Hash, MutableArrayRef Data) + : Hash(Hash), Data(Data) {} + + ArrayRef Hash; + MutableArrayRef Data; + }; + + /// Validate the trie data structure. + /// + /// Callback receives the file offset to the data entry and the data stored. + Error validate( + function_ref RecordVerifier) const; + + /// Check the valid range of file offset for OnDiskTrieRawHashMap. + static bool validOffset(FileOffset Offset) { + return Offset.get() < (1LL << 48); + } + +public: + /// Template class to implement a `pointer` type into the trie data structure. + /// + /// It provides pointer-like operation, e.g., dereference to get underlying + /// data. It also reserves the top 16 bits of the pointer value, which can be + /// used to pack additional information if needed. + template class PointerImpl { + public: + FileOffset getOffset() const { + return FileOffset(OffsetLow32 | (uint64_t)OffsetHigh16 << 32); + } + + explicit operator bool() const { return IsValue; } + + const ProxyT &operator*() const { + assert(IsValue); + return Value; + } + const ProxyT *operator->() const { + assert(IsValue); + return &Value; + } + + PointerImpl() = default; + + protected: + PointerImpl(ProxyT Value, FileOffset Offset, bool IsValue = true) + : Value(Value), OffsetLow32((uint64_t)Offset.get()), + OffsetHigh16((uint64_t)Offset.get() >> 32), IsValue(IsValue) { + if (IsValue) + assert(validOffset(Offset)); + } + + ProxyT Value; + uint32_t OffsetLow32 = 0; + uint16_t OffsetHigh16 = 0; + + // True if points to a value (not a "nullptr"). Use an extra field because + // 0 can be a valid offset. + bool IsValue = false; + }; + + class pointer; + class const_pointer : public PointerImpl { + public: + const_pointer() = default; + + private: + friend class pointer; + friend class OnDiskTrieRawHashMap; + using const_pointer::PointerImpl::PointerImpl; + }; + + class pointer : public PointerImpl { + public: + operator const_pointer() const { + return const_pointer(Value, getOffset(), IsValue); + } + + pointer() = default; + + private: + friend class OnDiskTrieRawHashMap; + using pointer::PointerImpl::PointerImpl; + }; + + /// Find the value from hash. + /// + /// \returns pointer to the value if exists, otherwise returns a non-value + /// pointer that evaluates to `false` when convert to boolean. + const_pointer find(ArrayRef Hash) const; + + /// Helper function to recover a pointer into the trie from file offset. + Expected recoverFromFileOffset(FileOffset Offset) const; + + using LazyInsertOnConstructCB = + function_ref; + using LazyInsertOnLeakCB = + function_ref; + + /// Insert lazily. + /// + /// \p OnConstruct is called when ready to insert a value, after allocating + /// space for the data. It is called at most once. + /// + /// \p OnLeak is called only if \p OnConstruct has been called and a race + /// occurred before insertion, causing the tentative offset and data to be + /// abandoned. This allows clients to clean up other results or update any + /// references. + /// + /// NOTE: Does *not* guarantee that \p OnConstruct is only called on success. + /// The in-memory \a TrieRawHashMap uses LazyAtomicPointer to synchronize + /// simultaneous writes, but that seems dangerous to use in a memory-mapped + /// file in case a process crashes in the busy state. + Expected insertLazy(ArrayRef Hash, + LazyInsertOnConstructCB OnConstruct = nullptr, + LazyInsertOnLeakCB OnLeak = nullptr); + + Expected insert(const ConstValueProxy &Value) { + return insertLazy(Value.Hash, [&](FileOffset, ValueProxy Allocated) { + assert(Allocated.Hash == Value.Hash); + assert(Allocated.Data.size() == Value.Data.size()); + llvm::copy(Value.Data, Allocated.Data.begin()); + }); + } + + size_t size() const; + size_t capacity() const; + + /// Gets or creates a file at \p Path with a hash-mapped trie named \p + /// TrieName. The hash size is \p NumHashBits (in bits) and the records store + /// data of size \p DataSize (in bytes). + /// + /// \p MaxFileSize controls the maximum file size to support, limiting the + /// size of the \a mapped_file_region. \p NewFileInitialSize is the starting + /// size if a new file is created. + /// + /// \p NewTableNumRootBits and \p NewTableNumSubtrieBits are hints to + /// configure the trie, if it doesn't already exist. + /// + /// \pre NumHashBits is a multiple of 8 (byte-aligned). + static Expected + create(const Twine &Path, const Twine &TrieName, size_t NumHashBits, + uint64_t DataSize, uint64_t MaxFileSize, + std::optional NewFileInitialSize, + std::optional NewTableNumRootBits = std::nullopt, + std::optional NewTableNumSubtrieBits = std::nullopt); + + OnDiskTrieRawHashMap(OnDiskTrieRawHashMap &&RHS); + OnDiskTrieRawHashMap &operator=(OnDiskTrieRawHashMap &&RHS); + ~OnDiskTrieRawHashMap(); + +private: + struct ImplType; + explicit OnDiskTrieRawHashMap(std::unique_ptr Impl); + std::unique_ptr Impl; +}; + +} // namespace cas +} // namespace llvm + +#endif // LLVM_CAS_ONDISKTRIERAWHASHMAP_H diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt index 6ed724bc2fd76..cc866f25f3240 100644 --- a/llvm/lib/CAS/CMakeLists.txt +++ b/llvm/lib/CAS/CMakeLists.txt @@ -2,10 +2,12 @@ add_llvm_component_library(LLVMCAS ActionCache.cpp ActionCaches.cpp BuiltinCAS.cpp + DatabaseFile.cpp InMemoryCAS.cpp MappedFileRegionArena.cpp ObjectStore.cpp OnDiskCommon.cpp + OnDiskTrieRawHashMap.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS diff --git a/llvm/lib/CAS/DatabaseFile.cpp b/llvm/lib/CAS/DatabaseFile.cpp new file mode 100644 index 0000000000000..db8ce1dc5bb14 --- /dev/null +++ b/llvm/lib/CAS/DatabaseFile.cpp @@ -0,0 +1,123 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file This file implements the common abstractions for CAS database file. +/// +//===----------------------------------------------------------------------===// + +#include "DatabaseFile.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +Error ondisk::createTableConfigError(std::errc ErrC, StringRef Path, + StringRef TableName, const Twine &Msg) { + return createStringError(make_error_code(ErrC), + Path + "[" + TableName + "]: " + Msg); +} + +Error ondisk::checkTable(StringRef Label, size_t Expected, size_t Observed, + StringRef Path, StringRef TrieName) { + if (Expected == Observed) + return Error::success(); + return createTableConfigError(std::errc::invalid_argument, Path, TrieName, + "mismatched " + Label + + " (expected: " + Twine(Expected) + + ", observed: " + Twine(Observed) + ")"); +} + +Expected +DatabaseFile::create(const Twine &Path, uint64_t Capacity, + function_ref NewDBConstructor) { + // Constructor for if the file doesn't exist. + auto NewFileConstructor = [&](MappedFileRegionArena &Alloc) -> Error { + if (Alloc.capacity() < + sizeof(Header) + sizeof(MappedFileRegionArena::Header)) + return createTableConfigError(std::errc::argument_out_of_domain, + Path.str(), "datafile", + "Allocator too small for header"); + (void)new (Alloc.data()) Header{getMagic(), getVersion(), {0}}; + DatabaseFile DB(Alloc); + return NewDBConstructor(DB); + }; + + // Get or create the file. + MappedFileRegionArena Alloc; + if (Error E = MappedFileRegionArena::create(Path, Capacity, sizeof(Header), + NewFileConstructor) + .moveInto(Alloc)) + return std::move(E); + + return DatabaseFile::get( + std::make_unique(std::move(Alloc))); +} + +Error DatabaseFile::addTable(TableHandle Table) { + assert(Table); + assert(&Table.getRegion() == &getRegion()); + int64_t ExistingRootOffset = 0; + const int64_t NewOffset = + reinterpret_cast(&Table.getHeader()) - getRegion().data(); + if (H->RootTableOffset.compare_exchange_strong(ExistingRootOffset, NewOffset)) + return Error::success(); + + // Silently ignore attempts to set the root to itself. + if (ExistingRootOffset == NewOffset) + return Error::success(); + + // Return an proper error message. + TableHandle Root(getRegion(), ExistingRootOffset); + if (Root.getName() == Table.getName()) + return createStringError( + make_error_code(std::errc::not_supported), + "collision with existing table of the same name '" + Table.getName() + + "'"); + + return createStringError(make_error_code(std::errc::not_supported), + "cannot add new table '" + Table.getName() + + "'" + " to existing root '" + + Root.getName() + "'"); +} + +std::optional DatabaseFile::findTable(StringRef Name) { + int64_t RootTableOffset = H->RootTableOffset.load(); + if (!RootTableOffset) + return std::nullopt; + + TableHandle Root(getRegion(), RootTableOffset); + if (Root.getName() == Name) + return Root; + + return std::nullopt; +} + +Error DatabaseFile::validate(MappedFileRegion &Region) { + if (Region.size() < sizeof(Header)) + return createStringError(std::errc::invalid_argument, + "database: missing header"); + + // Check the magic and version. + auto *H = reinterpret_cast
(Region.data()); + if (H->Magic != getMagic()) + return createStringError(std::errc::invalid_argument, + "database: bad magic"); + if (H->Version != getVersion()) + return createStringError(std::errc::invalid_argument, + "database: wrong version"); + + auto *MFH = reinterpret_cast(Region.data() + + sizeof(Header)); + // Check the bump-ptr, which should point past the header. + if (MFH->BumpPtr.load() < (int64_t)sizeof(Header)) + return createStringError(std::errc::invalid_argument, + "database: corrupt bump-ptr"); + + return Error::success(); +} diff --git a/llvm/lib/CAS/DatabaseFile.h b/llvm/lib/CAS/DatabaseFile.h new file mode 100644 index 0000000000000..609e5f1357190 --- /dev/null +++ b/llvm/lib/CAS/DatabaseFile.h @@ -0,0 +1,153 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file declares the common interface for a DatabaseFile that is used to +/// implement OnDiskCAS. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_CAS_DATABASEFILE_H +#define LLVM_LIB_CAS_DATABASEFILE_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/MappedFileRegionArena.h" +#include "llvm/Support/Error.h" + +namespace llvm::cas::ondisk { + +using MappedFileRegion = MappedFileRegionArena::RegionT; + +/// Generic handle for a table. +/// +/// Generic table header layout: +/// - 2-bytes: TableKind +/// - 2-bytes: TableNameSize +/// - 4-bytes: TableNameRelOffset (relative to header) +class TableHandle { +public: + enum class TableKind : uint16_t { + TrieRawHashMap = 1, + DataAllocator = 2, + }; + struct Header { + TableKind Kind; + uint16_t NameSize; + int32_t NameRelOffset; ///< Relative to Header. + }; + + explicit operator bool() const { return H; } + const Header &getHeader() const { return *H; } + MappedFileRegion &getRegion() const { return *Region; } + + template static void check() { + static_assert( + std::is_same::value, + "T::GenericHeader should be of type TableHandle::Header"); + static_assert(offsetof(typename T::Header, GenericHeader) == 0, + "T::GenericHeader must be the head of T::Header"); + } + template bool is() const { return T::Kind == H->Kind; } + template T dyn_cast() const { + check(); + if (is()) + return T(*Region, *reinterpret_cast(H)); + return T(); + } + template T cast() const { + assert(is()); + return dyn_cast(); + } + + StringRef getName() const { + auto *Begin = reinterpret_cast(H) + H->NameRelOffset; + return StringRef(Begin, H->NameSize); + } + + TableHandle() = default; + TableHandle(MappedFileRegion &Region, Header &H) : Region(&Region), H(&H) {} + TableHandle(MappedFileRegion &Region, intptr_t HeaderOffset) + : TableHandle(Region, + *reinterpret_cast
(Region.data() + HeaderOffset)) { + } + +private: + MappedFileRegion *Region = nullptr; + Header *H = nullptr; +}; + +/// Encapsulate a database file, which: +/// - Sets/checks magic. +/// - Sets/checks version. +/// - Points at an arbitrary root table. +/// - Sets up a MappedFileRegionArena for allocation. +/// +/// Top-level layout: +/// - 4-bytes: Magic +/// - 4-bytes: Version +/// - 8-bytes: RootTableOffset (16-bits: Kind; 48-bits: Offset) +/// - 8-bytes: BumpPtr from MappedFileRegionArena +class DatabaseFile { +public: + static constexpr uint32_t getMagic() { return 0xDA7ABA53UL; } + static constexpr uint32_t getVersion() { return 1UL; } + struct Header { + uint32_t Magic; + uint32_t Version; + std::atomic RootTableOffset; + }; + + const Header &getHeader() { return *H; } + MappedFileRegionArena &getAlloc() { return Alloc; } + MappedFileRegion &getRegion() { return Alloc.getRegion(); } + + /// Add a table. This is currently not thread safe and should be called inside + /// NewDBConstructor. + Error addTable(TableHandle Table); + + /// Find a table. May return null. + std::optional findTable(StringRef Name); + + /// Create the DatabaseFile at Path with Capacity. + static Expected + create(const Twine &Path, uint64_t Capacity, + function_ref NewDBConstructor); + + size_t size() const { return Alloc.size(); } + +private: + static Expected + get(std::unique_ptr Alloc) { + if (Error E = validate(Alloc->getRegion())) + return std::move(E); + return DatabaseFile(std::move(Alloc)); + } + + static Error validate(MappedFileRegion &Region); + + DatabaseFile(MappedFileRegionArena &Alloc) + : H(reinterpret_cast
(Alloc.data())), Alloc(Alloc) {} + DatabaseFile(std::unique_ptr Alloc) + : DatabaseFile(*Alloc) { + OwnedAlloc = std::move(Alloc); + } + + Header *H = nullptr; + MappedFileRegionArena &Alloc; + std::unique_ptr OwnedAlloc; +}; + +Error createTableConfigError(std::errc ErrC, StringRef Path, + StringRef TableName, const Twine &Msg); + +Error checkTable(StringRef Label, size_t Expected, size_t Observed, + StringRef Path, StringRef TrieName); + +} // namespace llvm::cas::ondisk + +#endif diff --git a/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp new file mode 100644 index 0000000000000..3000c0f0e46f1 --- /dev/null +++ b/llvm/lib/CAS/OnDiskTrieRawHashMap.cpp @@ -0,0 +1,1178 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file Implements OnDiskTrieRawHashMap. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/OnDiskTrieRawHashMap.h" +#include "DatabaseFile.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/TrieHashIndexGenerator.h" +#include "llvm/CAS/MappedFileRegionArena.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/Support/ThreadPool.h" +#include "llvm/Support/Threading.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +#if LLVM_ENABLE_ONDISK_CAS + +//===----------------------------------------------------------------------===// +// TrieRawHashMap data structures. +//===----------------------------------------------------------------------===// + +namespace { + +class SubtrieHandle; +class TrieRawHashMapHandle; +class TrieVisitor; + +/// A value stored in the slots inside a SubTrie. A stored value can either be a +/// subtrie (encoded after negation) which is the file offset to another +/// subtrie, or it can be a fileset to a DataRecord. +class SubtrieSlotValue { +public: + explicit operator bool() const { return !isEmpty(); } + bool isEmpty() const { return !Offset; } + bool isData() const { return Offset > 0; } + bool isSubtrie() const { return Offset < 0; } + uint64_t asData() const { + assert(isData()); + return Offset; + } + uint64_t asSubtrie() const { + assert(isSubtrie()); + return -Offset; + } + + FileOffset asSubtrieFileOffset() const { return FileOffset(asSubtrie()); } + + FileOffset asDataFileOffset() const { return FileOffset(asData()); } + + int64_t getRawOffset() const { return Offset; } + + static SubtrieSlotValue getDataOffset(int64_t Offset) { + return SubtrieSlotValue(Offset); + } + + static SubtrieSlotValue getSubtrieOffset(int64_t Offset) { + return SubtrieSlotValue(-Offset); + } + + static SubtrieSlotValue getDataOffset(FileOffset Offset) { + return getDataOffset(Offset.get()); + } + + static SubtrieSlotValue getSubtrieOffset(FileOffset Offset) { + return getDataOffset(Offset.get()); + } + + static SubtrieSlotValue getFromSlot(std::atomic &Slot) { + return SubtrieSlotValue(Slot.load()); + } + + SubtrieSlotValue() = default; + +private: + friend class SubtrieHandle; + explicit SubtrieSlotValue(int64_t Offset) : Offset(Offset) {} + int64_t Offset = 0; +}; + +/// Subtrie layout: +/// - 2-bytes: StartBit +/// - 1-bytes: NumBits=lg(num-slots) +/// - 5-bytes: 0-pad +/// - +class SubtrieHandle { +public: + struct Header { + /// The bit this subtrie starts on. + uint16_t StartBit; + + /// The number of bits this subtrie handles. It has 2^NumBits slots. + uint8_t NumBits; + + /// 0-pad to 8B. + uint8_t ZeroPad1B; + uint32_t ZeroPad4B; + }; + + /// Slot storage: + /// - zero: Empty + /// - positive: RecordOffset + /// - negative: SubtrieOffset + using SlotT = std::atomic; + + static int64_t getSlotsSize(uint32_t NumBits) { + return sizeof(int64_t) * (1u << NumBits); + } + + static int64_t getSize(uint32_t NumBits) { + return sizeof(SubtrieHandle::Header) + getSlotsSize(NumBits); + } + + int64_t getSize() const { return getSize(H->NumBits); } + size_t getNumSlots() const { return Slots.size(); } + + SubtrieSlotValue load(size_t I) const { + return SubtrieSlotValue(Slots[I].load()); + } + void store(size_t I, SubtrieSlotValue V) { + return Slots[I].store(V.getRawOffset()); + } + + void printHash(raw_ostream &OS, ArrayRef Bytes) const; + + /// Return None on success, or the existing offset on failure. + bool compare_exchange_strong(size_t I, SubtrieSlotValue &Expected, + SubtrieSlotValue New) { + return Slots[I].compare_exchange_strong(Expected.Offset, New.Offset); + } + + /// Sink \p V from \p I in this subtrie down to \p NewI in a new subtrie with + /// \p NumSubtrieBits. + /// + /// \p UnusedSubtrie maintains a 1-item "free" list of unused subtries. If a + /// new subtrie is created that isn't used because of a lost race, then it If + /// it's already valid, it should be used instead of allocating a new one. + /// should be returned as an out parameter to be passed back in the future. + /// If it's already valid, it should be used instead of allocating a new one. + /// + /// Returns the subtrie that now lives at \p I. + Expected sink(size_t I, SubtrieSlotValue V, + MappedFileRegionArena &Alloc, + size_t NumSubtrieBits, + SubtrieHandle &UnusedSubtrie, size_t NewI); + + /// Only safe if the subtrie is empty. + void reinitialize(uint32_t StartBit, uint32_t NumBits); + + SubtrieSlotValue getOffset() const { + return SubtrieSlotValue::getSubtrieOffset( + reinterpret_cast(H) - Region->data()); + } + + FileOffset getFileOffset() const { return getOffset().asSubtrieFileOffset(); } + + explicit operator bool() const { return H; } + + Header &getHeader() const { return *H; } + uint32_t getStartBit() const { return H->StartBit; } + uint32_t getNumBits() const { return H->NumBits; } + + static Expected create(MappedFileRegionArena &Alloc, + uint32_t StartBit, uint32_t NumBits); + + static SubtrieHandle getFromFileOffset(MappedFileRegion &Region, + FileOffset Offset) { + return SubtrieHandle(Region, SubtrieSlotValue::getSubtrieOffset(Offset)); + } + + SubtrieHandle() = default; + SubtrieHandle(MappedFileRegion &Region, Header &H) + : Region(&Region), H(&H), Slots(getSlots(H)) {} + SubtrieHandle(MappedFileRegion &Region, SubtrieSlotValue Offset) + : SubtrieHandle(Region, *reinterpret_cast
( + Region.data() + Offset.asSubtrie())) {} + +private: + MappedFileRegion *Region = nullptr; + Header *H = nullptr; + MutableArrayRef Slots; + + static MutableArrayRef getSlots(Header &H) { + return MutableArrayRef(reinterpret_cast(&H + 1), 1u << H.NumBits); + } +}; + +/// Handle for a TrieRawHashMap table. +/// +/// TrieRawHashMap table layout: +/// - [8-bytes: Generic table header] +/// - 1-byte: NumSubtrieBits +/// - 1-byte: Flags (not used yet) +/// - 2-bytes: NumHashBits +/// - 4-bytes: RecordDataSize (in bytes) +/// - 8-bytes: RootTrieOffset +/// - 8-bytes: AllocatorOffset (reserved for implementing free lists) +/// - '\0' +/// +/// Record layout: +/// - +/// - +class TrieRawHashMapHandle { +public: + static constexpr TableHandle::TableKind Kind = + TableHandle::TableKind::TrieRawHashMap; + + struct Header { + TableHandle::Header GenericHeader; + uint8_t NumSubtrieBits; + uint8_t Flags; ///< None used yet. + uint16_t NumHashBits; + uint32_t RecordDataSize; + std::atomic RootTrieOffset; + std::atomic AllocatorOffset; + }; + + operator TableHandle() const { + if (!H) + return TableHandle(); + return TableHandle(*Region, H->GenericHeader); + } + + struct RecordData { + OnDiskTrieRawHashMap::ValueProxy Proxy; + SubtrieSlotValue Offset; + FileOffset getFileOffset() const { return Offset.asDataFileOffset(); } + }; + + enum Limits : size_t { + /// Seems like 65528 hash bits ought to be enough. + MaxNumHashBytes = UINT16_MAX >> 3, + MaxNumHashBits = MaxNumHashBytes << 3, + + /// 2^16 bits in a trie is 65536 slots. This restricts us to a 16-bit + /// index. This many slots is suspicously large anyway. + MaxNumRootBits = 16, + + /// 2^10 bits in a trie is 1024 slots. This many slots seems suspiciously + /// large for subtries. + MaxNumSubtrieBits = 10, + }; + + static constexpr size_t getNumHashBytes(size_t NumHashBits) { + assert(NumHashBits % 8 == 0); + return NumHashBits / 8; + } + static constexpr size_t getRecordSize(size_t RecordDataSize, + size_t NumHashBits) { + return RecordDataSize + getNumHashBytes(NumHashBits); + } + + RecordData getRecord(SubtrieSlotValue Offset); + Expected createRecord(MappedFileRegionArena &Alloc, + ArrayRef Hash); + + explicit operator bool() const { return H; } + const Header &getHeader() const { return *H; } + SubtrieHandle getRoot() const; + Expected getOrCreateRoot(MappedFileRegionArena &Alloc); + MappedFileRegion &getRegion() const { return *Region; } + + size_t getFlags() const { return H->Flags; } + uint64_t getNumSubtrieBits() const { return H->NumSubtrieBits; } + uint64_t getNumHashBits() const { return H->NumHashBits; } + size_t getNumHashBytes() const { return getNumHashBytes(H->NumHashBits); } + size_t getRecordDataSize() const { return H->RecordDataSize; } + size_t getRecordSize() const { + return getRecordSize(H->RecordDataSize, H->NumHashBits); + } + + TrieHashIndexGenerator getIndexGen(SubtrieHandle Root, + ArrayRef Hash) { + assert(Root.getStartBit() == 0); + assert(getNumHashBytes() == Hash.size()); + assert(getNumHashBits() == Hash.size() * 8); + return TrieHashIndexGenerator{Root.getNumBits(), getNumSubtrieBits(), Hash}; + } + + static Expected + create(MappedFileRegionArena &Alloc, StringRef Name, + std::optional NumRootBits, uint64_t NumSubtrieBits, + uint64_t NumHashBits, uint64_t RecordDataSize); + + void + print(raw_ostream &OS, + function_ref)> PrintRecordData = nullptr) const; + + Error validate( + function_ref + RecordVerifier) const; + TrieRawHashMapHandle() = default; + TrieRawHashMapHandle(MappedFileRegion &Region, Header &H) + : Region(&Region), H(&H) {} + TrieRawHashMapHandle(MappedFileRegion &Region, intptr_t HeaderOffset) + : TrieRawHashMapHandle( + Region, *reinterpret_cast
(Region.data() + HeaderOffset)) { + } + +private: + MappedFileRegion *Region = nullptr; + Header *H = nullptr; +}; + +} // end anonymous namespace + +struct OnDiskTrieRawHashMap::ImplType { + DatabaseFile File; + TrieRawHashMapHandle Trie; +}; + +Expected SubtrieHandle::create(MappedFileRegionArena &Alloc, + uint32_t StartBit, + uint32_t NumBits) { + assert(StartBit <= TrieRawHashMapHandle::MaxNumHashBits); + assert(NumBits <= UINT8_MAX); + assert(NumBits <= TrieRawHashMapHandle::MaxNumRootBits); + + auto Mem = Alloc.allocate(getSize(NumBits)); + if (LLVM_UNLIKELY(!Mem)) + return Mem.takeError(); + auto *H = + new (*Mem) SubtrieHandle::Header{(uint16_t)StartBit, (uint8_t)NumBits, + /*ZeroPad1B=*/0, /*ZeroPad4B=*/0}; + SubtrieHandle S(Alloc.getRegion(), *H); + for (auto I = S.Slots.begin(), E = S.Slots.end(); I != E; ++I) + new (I) SlotT(0); + return S; +} + +SubtrieHandle TrieRawHashMapHandle::getRoot() const { + if (int64_t Root = H->RootTrieOffset) + return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Root)); + return SubtrieHandle(); +} + +Expected +TrieRawHashMapHandle::getOrCreateRoot(MappedFileRegionArena &Alloc) { + assert(&Alloc.getRegion() == &getRegion()); + if (SubtrieHandle Root = getRoot()) + return Root; + + int64_t Race = 0; + auto LazyRoot = SubtrieHandle::create(Alloc, 0, H->NumSubtrieBits); + if (LLVM_UNLIKELY(!LazyRoot)) + return LazyRoot.takeError(); + if (H->RootTrieOffset.compare_exchange_strong( + Race, LazyRoot->getOffset().asSubtrie())) + return *LazyRoot; + + // There was a race. Return the other root. + // + // TODO: Avoid leaking the lazy root by storing it in an allocator. + return SubtrieHandle(getRegion(), SubtrieSlotValue::getSubtrieOffset(Race)); +} + +Expected +TrieRawHashMapHandle::create(MappedFileRegionArena &Alloc, StringRef Name, + std::optional NumRootBits, + uint64_t NumSubtrieBits, uint64_t NumHashBits, + uint64_t RecordDataSize) { + // Allocate. + auto Offset = Alloc.allocateOffset(sizeof(Header) + Name.size() + 1); + if (LLVM_UNLIKELY(!Offset)) + return Offset.takeError(); + + // Construct the header and the name. + assert(Name.size() <= UINT16_MAX && "Expected smaller table name"); + assert(NumSubtrieBits <= UINT8_MAX && "Expected valid subtrie bits"); + assert(NumHashBits <= UINT16_MAX && "Expected valid hash size"); + assert(RecordDataSize <= UINT32_MAX && "Expected smaller table name"); + auto *H = new (Alloc.getRegion().data() + *Offset) + Header{{TableHandle::TableKind::TrieRawHashMap, (uint16_t)Name.size(), + (uint32_t)sizeof(Header)}, + (uint8_t)NumSubtrieBits, + /*Flags=*/0, + (uint16_t)NumHashBits, + (uint32_t)RecordDataSize, + /*RootTrieOffset=*/{0}, + /*AllocatorOffset=*/{0}}; + char *NameStorage = reinterpret_cast(H + 1); + llvm::copy(Name, NameStorage); + NameStorage[Name.size()] = 0; + + // Construct a root trie, if requested. + TrieRawHashMapHandle Trie(Alloc.getRegion(), *H); + auto Sub = SubtrieHandle::create(Alloc, 0, *NumRootBits); + if (LLVM_UNLIKELY(!Sub)) + return Sub.takeError(); + if (NumRootBits) + H->RootTrieOffset = Sub->getOffset().asSubtrie(); + return Trie; +} + +TrieRawHashMapHandle::RecordData +TrieRawHashMapHandle::getRecord(SubtrieSlotValue Offset) { + char *Begin = Region->data() + Offset.asData(); + OnDiskTrieRawHashMap::ValueProxy Proxy; + Proxy.Data = MutableArrayRef(Begin, getRecordDataSize()); + Proxy.Hash = ArrayRef(reinterpret_cast(Proxy.Data.end()), + getNumHashBytes()); + return RecordData{Proxy, Offset}; +} + +Expected +TrieRawHashMapHandle::createRecord(MappedFileRegionArena &Alloc, + ArrayRef Hash) { + assert(&Alloc.getRegion() == Region); + assert(Hash.size() == getNumHashBytes()); + auto Offset = Alloc.allocateOffset(getRecordSize()); + if (LLVM_UNLIKELY(!Offset)) + return Offset.takeError(); + + RecordData Record = getRecord(SubtrieSlotValue::getDataOffset(*Offset)); + llvm::copy(Hash, const_cast(Record.Proxy.Hash.begin())); + return Record; +} + +Expected +OnDiskTrieRawHashMap::recoverFromFileOffset(FileOffset Offset) const { + // Check alignment. + if (!isAligned(MappedFileRegionArena::getAlign(), Offset.get())) + return createStringError(make_error_code(std::errc::protocol_error), + "unaligned file offset at 0x" + + utohexstr(Offset.get(), /*LowerCase=*/true)); + + // Check bounds. + // + // Note: There's no potential overflow when using \c uint64_t because Offset + // is in valid offset range and the record size is in \c [0,UINT32_MAX]. + if (!validOffset(Offset) || + Offset.get() + Impl->Trie.getRecordSize() > Impl->File.getAlloc().size()) + return createStringError(make_error_code(std::errc::protocol_error), + "file offset too large: 0x" + + utohexstr(Offset.get(), /*LowerCase=*/true)); + + // Looks okay... + TrieRawHashMapHandle::RecordData D = + Impl->Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset)); + return const_pointer(D.Proxy, D.getFileOffset()); +} + +OnDiskTrieRawHashMap::const_pointer +OnDiskTrieRawHashMap::find(ArrayRef Hash) const { + TrieRawHashMapHandle Trie = Impl->Trie; + assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash"); + + SubtrieHandle S = Trie.getRoot(); + if (!S) + return const_pointer(); + + TrieHashIndexGenerator IndexGen = Trie.getIndexGen(S, Hash); + size_t Index = IndexGen.next(); + for (;;) { + // Try to set the content. + SubtrieSlotValue V = S.load(Index); + if (!V) + return const_pointer(); + + // Check for an exact match. + if (V.isData()) { + TrieRawHashMapHandle::RecordData D = Trie.getRecord(V); + return D.Proxy.Hash == Hash ? const_pointer(D.Proxy, D.getFileOffset()) + : const_pointer(); + } + + Index = IndexGen.next(); + S = SubtrieHandle(Trie.getRegion(), V); + } +} + +/// Only safe if the subtrie is empty. +void SubtrieHandle::reinitialize(uint32_t StartBit, uint32_t NumBits) { + assert(StartBit > H->StartBit); + assert(NumBits <= H->NumBits); + // Ideally would also assert that all slots are empty, but that's expensive. + + H->StartBit = StartBit; + H->NumBits = NumBits; +} + +Expected +OnDiskTrieRawHashMap::insertLazy(ArrayRef Hash, + LazyInsertOnConstructCB OnConstruct, + LazyInsertOnLeakCB OnLeak) { + TrieRawHashMapHandle Trie = Impl->Trie; + assert(Hash.size() == Trie.getNumHashBytes() && "Invalid hash"); + + MappedFileRegionArena &Alloc = Impl->File.getAlloc(); + std::optional S; + auto Err = Trie.getOrCreateRoot(Alloc).moveInto(S); + if (LLVM_UNLIKELY(Err)) + return std::move(Err); + + TrieHashIndexGenerator IndexGen = Trie.getIndexGen(*S, Hash); + size_t Index = IndexGen.next(); + + // Walk through the hash bytes and insert into correct trie position. + std::optional NewRecord; + SubtrieHandle UnusedSubtrie; + for (;;) { + SubtrieSlotValue Existing = S->load(Index); + + // Try to set it, if it's empty. + if (!Existing) { + if (!NewRecord) { + auto Err = Trie.createRecord(Alloc, Hash).moveInto(NewRecord); + if (LLVM_UNLIKELY(Err)) + return std::move(Err); + if (OnConstruct) + OnConstruct(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy); + } + + if (S->compare_exchange_strong(Index, Existing, NewRecord->Offset)) + return pointer(NewRecord->Proxy, NewRecord->Offset.asDataFileOffset()); + + // Race means that Existing is no longer empty; fall through... + } + + if (Existing.isSubtrie()) { + S = SubtrieHandle(Trie.getRegion(), Existing); + Index = IndexGen.next(); + continue; + } + + // Check for an exact match. + TrieRawHashMapHandle::RecordData ExistingRecord = Trie.getRecord(Existing); + if (ExistingRecord.Proxy.Hash == Hash) { + if (NewRecord && OnLeak) + OnLeak(NewRecord->Offset.asDataFileOffset(), NewRecord->Proxy, + ExistingRecord.Offset.asDataFileOffset(), ExistingRecord.Proxy); + return pointer(ExistingRecord.Proxy, + ExistingRecord.Offset.asDataFileOffset()); + } + + // Sink the existing content as long as the indexes match. + for (;;) { + size_t NextIndex = IndexGen.next(); + size_t NewIndexForExistingContent = + IndexGen.getCollidingBits(ExistingRecord.Proxy.Hash); + + auto Err = S->sink(Index, Existing, Alloc, IndexGen.getNumBits(), + UnusedSubtrie, NewIndexForExistingContent) + .moveInto(S); + if (LLVM_UNLIKELY(Err)) + return std::move(Err); + Index = NextIndex; + + // Found the difference. + if (NextIndex != NewIndexForExistingContent) + break; + } + } +} + +Expected SubtrieHandle::sink(size_t I, SubtrieSlotValue V, + MappedFileRegionArena &Alloc, + size_t NumSubtrieBits, + SubtrieHandle &UnusedSubtrie, + size_t NewI) { + std::optional NewS; + if (UnusedSubtrie) { + // Steal UnusedSubtrie and initialize it. + NewS.emplace(); + std::swap(*NewS, UnusedSubtrie); + NewS->reinitialize(getStartBit() + getNumBits(), NumSubtrieBits); + } else { + // Allocate a new, empty subtrie. + auto Err = SubtrieHandle::create(Alloc, getStartBit() + getNumBits(), + NumSubtrieBits) + .moveInto(NewS); + if (LLVM_UNLIKELY(Err)) + return std::move(Err); + } + + NewS->store(NewI, V); + if (compare_exchange_strong(I, V, NewS->getOffset())) + return *NewS; // Success! + + // Raced. + assert(V.isSubtrie() && "Expected racing sink() to add a subtrie"); + + // Wipe out the new slot so NewS can be reused and set the out parameter. + NewS->store(NewI, SubtrieSlotValue()); + UnusedSubtrie = *NewS; + + // Return the subtrie added by the concurrent sink() call. + return SubtrieHandle(Alloc.getRegion(), V); +} + +void OnDiskTrieRawHashMap::print( + raw_ostream &OS, function_ref)> PrintRecordData) const { + Impl->Trie.print(OS, PrintRecordData); +} + +Error OnDiskTrieRawHashMap::validate( + function_ref RecordVerifier) const { + return Impl->Trie.validate(RecordVerifier); +} + +// Helper function that prints hexdigit and have a sub-byte starting position. +static void printHexDigits(raw_ostream &OS, ArrayRef Bytes, + size_t StartBit, size_t NumBits) { + assert(StartBit % 4 == 0); + assert(NumBits % 4 == 0); + for (size_t I = StartBit, E = StartBit + NumBits; I != E; I += 4) { + uint8_t HexPair = Bytes[I / 8]; + uint8_t HexDigit = I % 8 == 0 ? HexPair >> 4 : HexPair & 0xf; + OS << hexdigit(HexDigit, /*LowerCase=*/true); + } +} + +static void printBits(raw_ostream &OS, ArrayRef Bytes, size_t StartBit, + size_t NumBits) { + assert(StartBit + NumBits <= Bytes.size() * 8u); + for (size_t I = StartBit, E = StartBit + NumBits; I != E; ++I) { + uint8_t Byte = Bytes[I / 8]; + size_t ByteOffset = I % 8; + if (size_t ByteShift = 8 - ByteOffset - 1) + Byte >>= ByteShift; + OS << (Byte & 0x1 ? '1' : '0'); + } +} + +void SubtrieHandle::printHash(raw_ostream &OS, ArrayRef Bytes) const { + // afb[1c:00*01110*0]def + size_t EndBit = getStartBit() + getNumBits(); + size_t HashEndBit = Bytes.size() * 8u; + + size_t FirstBinaryBit = getStartBit() & ~0x3u; + printHexDigits(OS, Bytes, 0, FirstBinaryBit); + + size_t LastBinaryBit = (EndBit + 3u) & ~0x3u; + OS << "["; + printBits(OS, Bytes, FirstBinaryBit, LastBinaryBit - FirstBinaryBit); + OS << "]"; + + printHexDigits(OS, Bytes, LastBinaryBit, HashEndBit - LastBinaryBit); +} + +static void appendIndexBits(std::string &Prefix, size_t Index, + size_t NumSlots) { + std::string Bits; + for (size_t NumBits = 1u; NumBits < NumSlots; NumBits <<= 1) { + Bits.push_back('0' + (Index & 0x1)); + Index >>= 1; + } + for (char Ch : llvm::reverse(Bits)) + Prefix += Ch; +} + +static void printPrefix(raw_ostream &OS, StringRef Prefix) { + while (Prefix.size() >= 4) { + uint8_t Digit; + bool ErrorParsingBinary = Prefix.take_front(4).getAsInteger(2, Digit); + assert(!ErrorParsingBinary); + (void)ErrorParsingBinary; + OS << hexdigit(Digit, /*LowerCase=*/true); + Prefix = Prefix.drop_front(4); + } + if (!Prefix.empty()) + OS << "[" << Prefix << "]"; +} + +LLVM_DUMP_METHOD void OnDiskTrieRawHashMap::dump() const { print(dbgs()); } + +static Expected checkParameter(StringRef Label, size_t Max, + std::optional Value, + std::optional Default, + StringRef Path, StringRef TableName) { + assert(Value || Default); + assert(!Default || *Default <= Max); + if (!Value) + return *Default; + + if (*Value <= Max) + return *Value; + return createTableConfigError( + std::errc::argument_out_of_domain, Path, TableName, + "invalid " + Label + ": " + Twine(*Value) + " (max: " + Twine(Max) + ")"); +} + +size_t OnDiskTrieRawHashMap::size() const { return Impl->File.size(); } +size_t OnDiskTrieRawHashMap::capacity() const { + return Impl->File.getRegion().size(); +} + +Expected +OnDiskTrieRawHashMap::create(const Twine &PathTwine, const Twine &TrieNameTwine, + size_t NumHashBits, uint64_t DataSize, + uint64_t MaxFileSize, + std::optional NewFileInitialSize, + std::optional NewTableNumRootBits, + std::optional NewTableNumSubtrieBits) { + SmallString<128> PathStorage; + StringRef Path = PathTwine.toStringRef(PathStorage); + SmallString<128> TrieNameStorage; + StringRef TrieName = TrieNameTwine.toStringRef(TrieNameStorage); + + constexpr size_t DefaultNumRootBits = 10; + constexpr size_t DefaultNumSubtrieBits = 6; + + size_t NumRootBits; + if (Error E = checkParameter( + "root bits", TrieRawHashMapHandle::MaxNumRootBits, + NewTableNumRootBits, DefaultNumRootBits, Path, TrieName) + .moveInto(NumRootBits)) + return std::move(E); + + size_t NumSubtrieBits; + if (Error E = checkParameter("subtrie bits", + TrieRawHashMapHandle::MaxNumSubtrieBits, + NewTableNumSubtrieBits, DefaultNumSubtrieBits, + Path, TrieName) + .moveInto(NumSubtrieBits)) + return std::move(E); + + size_t NumHashBytes = NumHashBits >> 3; + if (Error E = + checkParameter("hash size", TrieRawHashMapHandle::MaxNumHashBits, + NumHashBits, std::nullopt, Path, TrieName) + .takeError()) + return std::move(E); + assert(NumHashBits == NumHashBytes << 3 && + "Expected hash size to be byte-aligned"); + if (NumHashBits != NumHashBytes << 3) + return createTableConfigError( + std::errc::argument_out_of_domain, Path, TrieName, + "invalid hash size: " + Twine(NumHashBits) + " (not byte-aligned)"); + + // Constructor for if the file doesn't exist. + auto NewDBConstructor = [&](DatabaseFile &DB) -> Error { + auto Trie = + TrieRawHashMapHandle::create(DB.getAlloc(), TrieName, NumRootBits, + NumSubtrieBits, NumHashBits, DataSize); + if (LLVM_UNLIKELY(!Trie)) + return Trie.takeError(); + + return DB.addTable(*Trie); + }; + + // Get or create the file. + Expected File = + DatabaseFile::create(Path, MaxFileSize, NewDBConstructor); + if (!File) + return File.takeError(); + + // Find the trie and validate it. + std::optional Table = File->findTable(TrieName); + if (!Table) + return createTableConfigError(std::errc::argument_out_of_domain, Path, + TrieName, "table not found"); + if (Error E = checkTable("table kind", (size_t)TrieRawHashMapHandle::Kind, + (size_t)Table->getHeader().Kind, Path, TrieName)) + return std::move(E); + auto Trie = Table->cast(); + assert(Trie && "Already checked the kind"); + + // Check the hash and data size. + if (Error E = checkTable("hash size", NumHashBits, Trie.getNumHashBits(), + Path, TrieName)) + return std::move(E); + if (Error E = checkTable("data size", DataSize, Trie.getRecordDataSize(), + Path, TrieName)) + return std::move(E); + + // No flags supported right now. Either corrupt, or coming from a future + // writer. + if (size_t Flags = Trie.getFlags()) + return createTableConfigError(std::errc::invalid_argument, Path, TrieName, + "unsupported flags: " + Twine(Flags)); + + // Success. + OnDiskTrieRawHashMap::ImplType Impl{DatabaseFile(std::move(*File)), Trie}; + return OnDiskTrieRawHashMap(std::make_unique(std::move(Impl))); +} + +static Error createInvalidTrieError(uint64_t Offset, const Twine &Msg) { + return createStringError(make_error_code(std::errc::protocol_error), + "invalid trie at 0x" + + utohexstr(Offset, /*LowerCase=*/true) + ": " + + Msg); +} + +//===----------------------------------------------------------------------===// +// TrieVisitor data structures. +//===----------------------------------------------------------------------===// + +namespace { +/// A multi-threaded vistior to traverse the Trie. +/// +/// TODO: add more sanity checks that isn't just plain data corruption. For +/// example, some ill-formed data can be constructed to form a cycle using +/// Sub-Tries and it can lead to inifinite loop when visiting (or inserting +/// data). +class TrieVisitor { +public: + TrieVisitor(TrieRawHashMapHandle Trie, unsigned ThreadCount = 0, + unsigned ErrorLimit = 50) + : Trie(Trie), ErrorLimit(ErrorLimit), + Threads(hardware_concurrency(ThreadCount)) {} + virtual ~TrieVisitor() = default; + Error visit(); + +private: + // Virtual method to implement the action when visiting a sub-trie. + virtual Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) { + return Error::success(); + } + + // Virtual method to implement the action when visiting a slot in a trie node. + virtual Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix, + SubtrieSlotValue Slot) { + return Error::success(); + } + +protected: + TrieRawHashMapHandle Trie; + +private: + Error traverseTrieNode(SubtrieHandle Node, StringRef Prefix); + + Error validateSubTrie(SubtrieHandle Node, bool IsRoot); + + // Helper function to capture errors when visiting the trie nodes. + void addError(Error NewError) { + assert(NewError && "not an error"); + std::lock_guard ErrorLock(Lock); + if (NumError >= ErrorLimit) { + // Too many errors. + consumeError(std::move(NewError)); + return; + } + + if (Err) + Err = joinErrors(std::move(*Err), std::move(NewError)); + else + Err = std::move(NewError); + NumError++; + } + + bool tooManyErrors() { + std::lock_guard ErrorLock(Lock); + return (bool)Err && NumError >= ErrorLimit; + } + + const unsigned ErrorLimit; + std::optional Err; + unsigned NumError = 0; + std::mutex Lock; + DefaultThreadPool Threads; +}; + +/// A visitor that traverse and print the Trie. +class TriePrinter : public TrieVisitor { +public: + TriePrinter(TrieRawHashMapHandle Trie, raw_ostream &OS, + function_ref)> PrintRecordData) + : TrieVisitor(Trie, /*ThreadCount=*/1), OS(OS), + PrintRecordData(PrintRecordData) {} + + Error printRecords() { + if (Records.empty()) + return Error::success(); + + OS << "records\n"; + llvm::sort(Records); + for (int64_t Offset : Records) { + TrieRawHashMapHandle::RecordData Record = + Trie.getRecord(SubtrieSlotValue::getDataOffset(Offset)); + if (auto Err = printRecord(Record)) + return Err; + } + return Error::success(); + } + + Error printRecord(TrieRawHashMapHandle::RecordData &Record) { + OS << "- addr=" << (void *)Record.getFileOffset().get() << " "; + if (PrintRecordData) { + PrintRecordData(Record.Proxy.Data); + } else { + OS << "bytes="; + ArrayRef Data( + reinterpret_cast(Record.Proxy.Data.data()), + Record.Proxy.Data.size()); + printHexDigits(OS, Data, 0, Data.size() * 8); + } + OS << "\n"; + return Error::success(); + } + + Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) override { + if (Prefix.empty()) { + OS << "root"; + } else { + OS << "subtrie="; + printPrefix(OS, Prefix); + } + + OS << " addr=" + << (void *)(reinterpret_cast(&SubTrie.getHeader()) - + Trie.getRegion().data()); + OS << " num-slots=" << SubTrie.getNumSlots() << "\n"; + return Error::success(); + } + + Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix, + SubtrieSlotValue Slot) override { + OS << "- index="; + for (size_t Pad : {10, 100, 1000}) + if (I < Pad && Subtrie.getNumSlots() >= Pad) + OS << "0"; + OS << I << " "; + if (Slot.isSubtrie()) { + OS << "addr=" << (void *)Slot.asSubtrie(); + OS << " subtrie="; + printPrefix(OS, Prefix); + OS << "\n"; + return Error::success(); + } + TrieRawHashMapHandle::RecordData Record = Trie.getRecord(Slot); + OS << "addr=" << (void *)Record.getFileOffset().get(); + OS << " content="; + Subtrie.printHash(OS, Record.Proxy.Hash); + OS << "\n"; + Records.push_back(Slot.asData()); + return Error::success(); + } + +private: + raw_ostream &OS; + function_ref)> PrintRecordData; + SmallVector Records; +}; + +/// TrieVerifier that adds additional verification on top of the basic visitor. +class TrieVerifier : public TrieVisitor { +public: + TrieVerifier( + TrieRawHashMapHandle Trie, + function_ref + RecordVerifier) + : TrieVisitor(Trie), RecordVerifier(RecordVerifier) {} + +private: + Error visitSubTrie(StringRef Prefix, SubtrieHandle SubTrie) final { + return Error::success(); + } + + Error visitSlot(unsigned I, SubtrieHandle Subtrie, StringRef Prefix, + SubtrieSlotValue Slot) final { + if (RecordVerifier && Slot.isData()) { + if (!isAligned(MappedFileRegionArena::getAlign(), Slot.asData())) + return createInvalidTrieError(Slot.asData(), "mis-aligned data entry"); + + TrieRawHashMapHandle::RecordData Record = + Trie.getRecord(SubtrieSlotValue::getDataOffset(Slot.asData())); + return RecordVerifier(Slot.asDataFileOffset(), + OnDiskTrieRawHashMap::ConstValueProxy{ + Record.Proxy.Hash, Record.Proxy.Data}); + } + return Error::success(); + } + + function_ref + RecordVerifier; +}; +} // namespace + +Error TrieVisitor::visit() { + auto Root = Trie.getRoot(); + if (!Root) + return Error::success(); + + if (auto Err = validateSubTrie(Root, /*IsRoot=*/true)) + return Err; + + if (auto Err = visitSubTrie("", Root)) + return Err; + + SmallVector Subs; + SmallVector Prefixes; + const size_t NumSlots = Root.getNumSlots(); + for (size_t I = 0, E = NumSlots; I != E; ++I) { + SubtrieSlotValue Slot = Root.load(I); + if (!Slot) + continue; + uint64_t Offset = Slot.isSubtrie() ? Slot.asSubtrie() : Slot.asData(); + if (Offset >= (uint64_t)Trie.getRegion().size()) + return createInvalidTrieError(Offset, "slot points out of bound"); + std::string SubtriePrefix; + appendIndexBits(SubtriePrefix, I, NumSlots); + if (Slot.isSubtrie()) { + SubtrieHandle S(Trie.getRegion(), Slot); + Subs.push_back(S); + Prefixes.push_back(SubtriePrefix); + } + if (auto Err = visitSlot(I, Root, SubtriePrefix, Slot)) + return Err; + } + + for (size_t I = 0, E = Subs.size(); I != E; ++I) { + Threads.async( + [&](unsigned Idx) { + // Don't run if there is an error already. + if (tooManyErrors()) + return; + if (auto Err = traverseTrieNode(Subs[Idx], Prefixes[Idx])) + addError(std::move(Err)); + }, + I); + } + + Threads.wait(); + if (Err) + return std::move(*Err); + return Error::success(); +} + +Error TrieVisitor::validateSubTrie(SubtrieHandle Node, bool IsRoot) { + char *Addr = reinterpret_cast(&Node.getHeader()); + const int64_t Offset = Node.getFileOffset().get(); + if (Addr + Node.getSize() >= + Trie.getRegion().data() + Trie.getRegion().size()) + return createInvalidTrieError(Offset, "subtrie node spans out of bound"); + + if (!IsRoot && + Node.getStartBit() + Node.getNumBits() > Trie.getNumHashBits()) { + return createInvalidTrieError(Offset, + "subtrie represents too many hash bits"); + } + + if (IsRoot) { + if (Node.getStartBit() != 0) + return createInvalidTrieError(Offset, + "root node doesn't start at 0 index"); + + return Error::success(); + } + + if (Node.getNumBits() > Trie.getNumSubtrieBits()) + return createInvalidTrieError(Offset, "subtrie has wrong number of slots"); + + return Error::success(); +} + +Error TrieVisitor::traverseTrieNode(SubtrieHandle Node, StringRef Prefix) { + if (auto Err = validateSubTrie(Node, /*IsRoot=*/false)) + return Err; + + if (auto Err = visitSubTrie(Prefix, Node)) + return Err; + + SmallVector Subs; + SmallVector Prefixes; + const size_t NumSlots = Node.getNumSlots(); + for (size_t I = 0, E = NumSlots; I != E; ++I) { + SubtrieSlotValue Slot = Node.load(I); + if (!Slot) + continue; + uint64_t Offset = Slot.isSubtrie() ? Slot.asSubtrie() : Slot.asData(); + if (Offset >= (uint64_t)Trie.getRegion().size()) + return createInvalidTrieError(Offset, "slot points out of bound"); + std::string SubtriePrefix = Prefix.str(); + appendIndexBits(SubtriePrefix, I, NumSlots); + if (Slot.isSubtrie()) { + SubtrieHandle S(Trie.getRegion(), Slot); + Subs.push_back(S); + Prefixes.push_back(SubtriePrefix); + } + if (auto Err = visitSlot(I, Node, SubtriePrefix, Slot)) + return Err; + } + for (size_t I = 0, E = Subs.size(); I != E; ++I) + if (auto Err = traverseTrieNode(Subs[I], Prefixes[I])) + return Err; + + return Error::success(); +} + +void TrieRawHashMapHandle::print( + raw_ostream &OS, function_ref)> PrintRecordData) const { + OS << "hash-num-bits=" << getNumHashBits() + << " hash-size=" << getNumHashBytes() + << " record-data-size=" << getRecordDataSize() << "\n"; + + TriePrinter Printer(*this, OS, PrintRecordData); + if (auto Err = Printer.visit()) + OS << "error: " << toString(std::move(Err)) << "\n"; + + if (auto Err = Printer.printRecords()) + OS << "error: " << toString(std::move(Err)) << "\n"; + + return; +} + +Error TrieRawHashMapHandle::validate( + function_ref + RecordVerifier) const { + // Use the base TrieVisitor to identify the errors inside trie first. + TrieVisitor BasicVerifier(*this); + if (auto Err = BasicVerifier.visit()) + return Err; + + // If the trie data structure is sound, do a second pass to verify data and + // verifier function can assume the index is correct. However, there can be + // newly added bad entries that can still produce error. + TrieVerifier Verifier(*this, RecordVerifier); + return Verifier.visit(); +} + +#else // !LLVM_ENABLE_ONDISK_CAS + +struct OnDiskTrieRawHashMap::ImplType {}; + +Expected +OnDiskTrieRawHashMap::create(const Twine &PathTwine, const Twine &TrieNameTwine, + size_t NumHashBits, uint64_t DataSize, + uint64_t MaxFileSize, + std::optional NewFileInitialSize, + std::optional NewTableNumRootBits, + std::optional NewTableNumSubtrieBits) { + return createStringError(make_error_code(std::errc::not_supported), + "OnDiskTrieRawHashMap is not supported"); +} + +Expected +OnDiskTrieRawHashMap::insertLazy(ArrayRef Hash, + LazyInsertOnConstructCB OnConstruct, + LazyInsertOnLeakCB OnLeak) { + return createStringError(make_error_code(std::errc::not_supported), + "OnDiskTrieRawHashMap is not supported"); +} + +Expected +OnDiskTrieRawHashMap::recoverFromFileOffset(FileOffset Offset) const { + return createStringError(make_error_code(std::errc::not_supported), + "OnDiskTrieRawHashMap is not supported"); +} + +OnDiskTrieRawHashMap::const_pointer +OnDiskTrieRawHashMap::find(ArrayRef Hash) const { + return const_pointer(); +} + +void OnDiskTrieRawHashMap::print( + raw_ostream &OS, function_ref)> PrintRecordData) const { +} + +Error OnDiskTrieRawHashMap::validate( + function_ref + RecordVerifier) const { + return createStringError(make_error_code(std::errc::not_supported), + "OnDiskTrieRawHashMap is not supported"); +} + +size_t OnDiskTrieRawHashMap::size() const { return 0; } +size_t OnDiskTrieRawHashMap::capacity() const { return 0; } + +#endif // LLVM_ENABLE_ONDISK_CAS + +OnDiskTrieRawHashMap::OnDiskTrieRawHashMap(std::unique_ptr Impl) + : Impl(std::move(Impl)) {} +OnDiskTrieRawHashMap::OnDiskTrieRawHashMap(OnDiskTrieRawHashMap &&RHS) = + default; +OnDiskTrieRawHashMap & +OnDiskTrieRawHashMap::operator=(OnDiskTrieRawHashMap &&RHS) = default; +OnDiskTrieRawHashMap::~OnDiskTrieRawHashMap() = default; diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt index ab709e30369bf..0f8fcb9e98954 100644 --- a/llvm/unittests/CAS/CMakeLists.txt +++ b/llvm/unittests/CAS/CMakeLists.txt @@ -1,7 +1,3 @@ -if (LLVM_ENABLE_ONDISK_CAS) - add_definitions(-DLLVM_ENABLE_ONDISK_CAS=1) -endif() - set(LLVM_LINK_COMPONENTS Support CAS @@ -12,6 +8,7 @@ add_llvm_unittest(CASTests ActionCacheTest.cpp CASTestConfig.cpp ObjectStoreTest.cpp + OnDiskTrieRawHashMapTest.cpp ProgramTest.cpp ) diff --git a/llvm/unittests/CAS/OnDiskTrieRawHashMapTest.cpp b/llvm/unittests/CAS/OnDiskTrieRawHashMapTest.cpp new file mode 100644 index 0000000000000..7bedfe4b29e30 --- /dev/null +++ b/llvm/unittests/CAS/OnDiskTrieRawHashMapTest.cpp @@ -0,0 +1,220 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/OnDiskTrieRawHashMap.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +#if LLVM_ENABLE_ONDISK_CAS +using namespace llvm; +using namespace llvm::cas; + +namespace { + +struct OnDiskTrieRawHashMapTestFixture + : public ::testing::TestWithParam { + static constexpr size_t MB = 1024u * 1024u; + static constexpr size_t DataSize = 8; // Multiple of 8B. + + std::optional Temp; + size_t NumHashBytes; + + void SetUp() override { + Temp.emplace("trie-raw-hash-map", /*Unique=*/true); + NumHashBytes = GetParam(); + } + void TearDown() override { Temp.reset(); } + + Expected createTrie() { + size_t NumHashBits = NumHashBytes * 8; + return OnDiskTrieRawHashMap::create( + Temp->path((Twine(NumHashBytes) + "B").str()), "index", + /*NumHashBits=*/NumHashBits, DataSize, /*MaxFileSize=*/MB, + /*NewInitialFileSize=*/std::nullopt); + } +}; + +// Create tries with various sizes of hash and with data. +TEST_P(OnDiskTrieRawHashMapTestFixture, General) { + std::optional Trie1; + ASSERT_THAT_ERROR(createTrie().moveInto(Trie1), Succeeded()); + std::optional Trie2; + ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded()); + + uint8_t Hash0Bytes[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + uint8_t Hash1Bytes[8] = {1, 0, 0, 0, 0, 0, 0, 0}; + auto Hash0 = ArrayRef(Hash0Bytes).take_front(NumHashBytes); + auto Hash1 = ArrayRef(Hash1Bytes).take_front(NumHashBytes); + constexpr StringLiteral Data0v1Bytes = "data0.v1"; + constexpr StringLiteral Data0v2Bytes = "data0.v2"; + constexpr StringLiteral Data1Bytes = "data1..."; + static_assert(Data0v1Bytes.size() == DataSize, "math error"); + static_assert(Data0v2Bytes.size() == DataSize, "math error"); + static_assert(Data1Bytes.size() == DataSize, "math error"); + ArrayRef Data0v1 = ArrayRef(Data0v1Bytes.data(), Data0v1Bytes.size()); + ArrayRef Data0v2 = ArrayRef(Data0v2Bytes.data(), Data0v2Bytes.size()); + ArrayRef Data1 = ArrayRef(Data1Bytes.data(), Data1Bytes.size()); + + // Lookup when trie is empty. + EXPECT_FALSE(Trie1->find(Hash0)); + + // Insert. + std::optional Offset; + std::optional> Data; + { + std::optional Insertion; + ASSERT_THAT_ERROR(Trie1->insert({Hash0, Data0v1}).moveInto(Insertion), + Succeeded()); + EXPECT_EQ(Hash0, (*Insertion)->Hash); + EXPECT_EQ(Data0v1, (*Insertion)->Data); + EXPECT_TRUE(isAddrAligned(Align(8), (*Insertion)->Data.data())); + + Offset = Insertion->getOffset(); + Data = (*Insertion)->Data; + } + + // Find. + { + auto Lookup = Trie1->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v1, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Find in a different instance of the same on-disk trie that existed + // before the insertion. + { + auto Lookup = Trie2->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v1, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Create a new instance and check that too. + Trie2.reset(); + ASSERT_THAT_ERROR(createTrie().moveInto(Trie2), Succeeded()); + { + auto Lookup = Trie2->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v1, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Change the data. + llvm::copy(Data0v2, Data->data()); + { + auto Lookup = Trie2->find(Hash0); + ASSERT_TRUE(Lookup); + EXPECT_EQ(Hash0, Lookup->Hash); + EXPECT_EQ(Data0v2, Lookup->Data); + EXPECT_EQ(Offset->get(), Lookup.getOffset().get()); + } + + // Find different hash. + EXPECT_FALSE(Trie1->find(Hash1)); + EXPECT_FALSE(Trie2->find(Hash1)); + + // Recover from an offset. + { + OnDiskTrieRawHashMap::const_pointer Recovered; + ASSERT_THAT_ERROR(Trie1->recoverFromFileOffset(*Offset).moveInto(Recovered), + Succeeded()); + ASSERT_TRUE(Recovered); + EXPECT_EQ(Offset->get(), Recovered.getOffset().get()); + EXPECT_EQ(Hash0, Recovered->Hash); + EXPECT_EQ(Data0v2, Recovered->Data); + } + + // Recover from a bad offset. + { + FileOffset BadOffset(1); + OnDiskTrieRawHashMap::const_pointer Recovered; + ASSERT_THAT_ERROR( + Trie1->recoverFromFileOffset(BadOffset).moveInto(Recovered), Failed()); + } + + // Insert another thing. + { + std::optional Insertion; + ASSERT_THAT_ERROR(Trie1->insert({Hash1, Data1}).moveInto(Insertion), + Succeeded()); + EXPECT_EQ(Hash1, (*Insertion)->Hash); + EXPECT_EQ(Data1, (*Insertion)->Data); + EXPECT_TRUE(isAddrAligned(Align(8), (*Insertion)->Data.data())); + + EXPECT_NE(Offset->get(), Insertion->getOffset().get()); + } + + // Validate. + { + auto RecordVerify = + [&](FileOffset Offset, + OnDiskTrieRawHashMap::ConstValueProxy Proxy) -> Error { + if (Proxy.Hash.size() != NumHashBytes) + return createStringError("wrong hash size"); + if (Proxy.Data.size() != DataSize) + return createStringError("wrong data size"); + + return Error::success(); + }; + ASSERT_THAT_ERROR(Trie1->validate(RecordVerify), Succeeded()); + ASSERT_THAT_ERROR(Trie2->validate(RecordVerify), Succeeded()); + } + + // Size and capacity. + { + EXPECT_EQ(Trie1->capacity(), MB); + EXPECT_EQ(Trie2->capacity(), MB); + EXPECT_LE(Trie1->size(), MB); + EXPECT_LE(Trie2->size(), MB); + } +} + +INSTANTIATE_TEST_SUITE_P(OnDiskTrieRawHashMapTest, + OnDiskTrieRawHashMapTestFixture, + ::testing::Values(1, 2, 4, 8)); + +TEST(OnDiskTrieRawHashMapTest, OutOfSpace) { + unittest::TempDir Temp("trie-raw-hash-map", /*Unique=*/true); + std::optional Trie; + + // Too small to create header. + ASSERT_THAT_ERROR(OnDiskTrieRawHashMap::create( + Temp.path("NoSpace1").str(), "index", + /*NumHashBits=*/8, /*DataSize=*/8, /*MaxFileSize=*/8, + /*NewInitialFileSize=*/std::nullopt) + .moveInto(Trie), + Failed()); + + // Just enough for root node but not enough for any insertion. + ASSERT_THAT_ERROR(OnDiskTrieRawHashMap::create( + Temp.path("NoSpace2").str(), "index", + /*NumHashBits=*/8, /*DataSize=*/8, /*MaxFileSize=*/118, + /*NewInitialFileSize=*/std::nullopt, + /*NewTableNumRootBits=*/1, /*NewTableNumSubtrieBits=*/1) + .moveInto(Trie), + Succeeded()); + uint8_t Hash0Bytes[1] = {0}; + auto Hash0 = ArrayRef(Hash0Bytes); + constexpr StringLiteral Data0v1Bytes = "data0.v1"; + ArrayRef Data0v1 = ArrayRef(Data0v1Bytes.data(), Data0v1Bytes.size()); + std::optional Insertion; + ASSERT_THAT_ERROR(Trie->insert({Hash0, Data0v1}).moveInto(Insertion), + Failed()); +} + +} // namespace + +#endif // LLVM_ENABLE_ONDISK_CAS