diff --git a/.dockerignore b/.dockerignore index daa6ce8..cf910f1 100644 --- a/.dockerignore +++ b/.dockerignore @@ -11,3 +11,6 @@ dist # Don't put Mac stuff in the docker build env cocoa + +c\+\+/build + diff --git a/.gitignore b/.gitignore index 5a47da1..e24994e 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ eggs build dist *~ +test_build diff --git a/ProtobagCocoa.podspec.json b/ProtobagCocoa.podspec.json index 43df0c0..744fdf0 100644 --- a/ProtobagCocoa.podspec.json +++ b/ProtobagCocoa.podspec.json @@ -1,6 +1,6 @@ { "name": "ProtobagCocoa", - "version": "0.0.2", + "version": "0.0.3", "summary": "Protobag: an archive of string-serialized Protobufs", "homepage": "https://github.com/StandardCyborg/protobag", "license": "Apache 2", @@ -10,7 +10,7 @@ "cocoapods_version": ">= 1.0", "source": { "git": "git@github.com:StandardCyborg/protobag.git", - "tag": "v0.0.2" + "tag": "v0.0.3" }, "public_header_files": [ "c++/protobag/**/*.{hpp,h}" diff --git a/README.md b/README.md index 7c0a772..06388d4 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,17 @@ -# Protobag: A bag o' String-serialized Protobuf Messages +# Protobag: A bag o' Serialized Protobuf Messages _With built-in support for time-series data_ [![Build Status](https://circleci.com/gh/StandardCyborg/protobag.svg?style=svg&circle-token=ed56e2ec32789fa3e5f664bc8ea73c55e119de4b)](https://app.circleci.com/pipelines/github/StandardCyborg/protobag) +## Quickstart & Demo + +See [this python noteboook](examples/notebook-demo/protobag-demo-full.ipynb) +for a demo of key features. + +Or you can drop into a Protobag development shell using a clone of this repo +and Docker; FMI see: +`./pb-dev --help` + ## Summary [Protobuf](https://github.com/protocolbuffers/protobuf) is a popular data @@ -60,31 +69,100 @@ wrappers over `libarchive`. See [ArchiveUtil](c++/protobag/protobag/ArchiveUtil.hpp). 
+## Development + + +## Discussion of Key Features + +### Protobag indexes Protobuf message Descriptors + +By default, `protobag` not only saves those messages but also +**indexes Protobuf message descriptors** so that your `protobag` readers don't +need your proto schemas to decode your messages. + +#### Wat? +In order to deserialize a Protobuf message, typically you need +`protoc`-generated code for that message type (and you need `protoc`-generated +code for your specific programming language). This `protoc`-generated code is +engineered for efficiency and provides a clean API for accessing message +attributes. But what if you don't have that `protoc`-generated code? Or you +don't even have the `.proto` message definitions to generate such code? + +In Protobuf version 3.x, the authors added official support for +[the self-describing message paradigm](https://developers.google.com/protocol-buffers/docs/techniques). +Now a user can serialize not just a message but Protobuf Descriptor data that +describes the message schema and enables deserialzing the message +*without protoc-generated code*-- all you need is the `protobuf` library itself. +(This is a core feature of other serialization libraries +[like Avro](http://avro.apache.org/docs/1.6.1/)). + +Note: dynamic message decoding is slower than using `protoc`-generated code. +Furthermore, the `protoc`-generated code makes defensive programming a bit +easier. You probably want to use the `protoc`-generated code for your +messages if you can. +#### Protobag enables all messages to be self-describing messages +While Protobuf includes tools for using self-describing messages, the feature +isn't simply a toggle in your `.proto` file, and the API is a bit complicated +(because Google claims they don't use it much internally). -TODO: quickstart and stuff +`protobag` automatically indexes the Protobuf Descriptor data for your messages +at write time. (And you can disable this indexing if so desired). 
At read +time, `protobag` automatically uses this indexed Descriptor data if the user +reading your `protobag` file lacks the needed `protoc`-generated code to +deserialize a message. -TODO: bag index doc +What if a message type evolves? `protobag` indexes each distinct message type +for each write session. If you change your schema for a message type between +write sessions, `protobag` will have indexed both schemas and will use the +proper one for dynamic deserialization. -TODO: "treat as a map" API +#### For More Detail +For Python, see: + * `protobag.build_fds_for_msg()` -- This method collects the descriptor data + needed for any Protobuf Message instance or class. + * `protobag.DynamicMessageFactory::dynamic_decode()` -- This method uses + standard Protobuf APIs to deserialize messages given only Protobuf + Descriptor data. -coming soon +For C++, see: + * `BagIndexBuilder::DescriptorIndexer::Observe()` -- This method collects the + descriptor data needed for any Protobuf Message instance or class. + * `DynamicMsgFactory` -- This utility uses uses standard Protobuf APIs to + deserialize messages given only Protobuf Descriptor data. + +## Cocoa Pods + +You can integrate Protobag into an iOS or OSX application using the CocoaPod `ProtobagCocoa.podspec.json` +podspec included in this repo. Protobag is explicitly designed to be cross-platform (and utilize only C++ +features friendly to iOS) to facilitate such interoperability. + +Note: before pushing, be sure to edit the "version" field of the `ProtobagCocoa.podspec.json` file +to match the version you're pushing. ``` -indocker % cd /opt/protobag/cxx -indocker % mkdir -p build && cd build -indocker % cmake -DCMAKE_BUILD_TYPE=DEBUG .. -indocker % make -j `nproc` && ./protobag_test --gtest_filter=DemoTest* + pod repo push SCCocoaPods ProtobagCocoa.podspec.json --use-libraries --verbose --allow-warnings ``` +## C++ Build + +Use the existing CMake-based build system. 
+ +In c++ subdir: ``` - pod repo push SCCocoaPods ProtobagCocoa.podspec.json --use-libraries --verbose --allow-warnings +mkdir build && cd build +cmake .. +make -j +make test ``` +## Python Build -in python subdir: +The Python library includes a wheel that leverages the above C++ CMake build system. + +In python subdir: ``` python3 setup.py bdist_wheel ``` -for both linux and xcode + diff --git a/c++/protobag/protobag/BagIndexBuilder.cpp b/c++/protobag/protobag/BagIndexBuilder.cpp index e3ab9a6..a0165db 100644 --- a/c++/protobag/protobag/BagIndexBuilder.cpp +++ b/c++/protobag/protobag/BagIndexBuilder.cpp @@ -5,8 +5,6 @@ #include #include - #include - #include #include @@ -156,12 +154,6 @@ BagIndex_TopicStats &BagIndexBuilder::GetMutableStats(const std::string &topic) return topic_to_stats[topic]; } -// uint64_t BagIndexBuilder::GetNextFilenum(const std::string &topic) { -// const auto &stats = GetMutableStats(topic); -// return stats.n_messages() + 1; -// } - - void BagIndexBuilder::Observe( const Entry &entry, const std::string &final_entryname) { @@ -169,27 +161,29 @@ void BagIndexBuilder::Observe( final_entryname.empty() ? 
entry.entryname : final_entryname; if (_do_timeseries_indexing) { - const auto &maybe_tt = entry.GetTopicTime(); - if (maybe_tt.has_value()) { - TopicTime tt = *maybe_tt; - tt.set_entryname(entryname); - - { - auto &stats = GetMutableStats(tt.topic()); - stats.set_n_messages(stats.n_messages() + 1); - } + if (entry.IsStampedMessage()) { + const auto &maybe_tt = entry.GetTopicTime(); + if (maybe_tt.has_value()) { + TopicTime tt = *maybe_tt; + tt.set_entryname(entryname); + + { + auto &stats = GetMutableStats(tt.topic()); + stats.set_n_messages(stats.n_messages() + 1); + } - { - if (!_tto) { - _tto.reset(new TopicTimeOrderer()); + { + if (!_tto) { + _tto.reset(new TopicTimeOrderer()); + } + _tto->Observe(tt); } - _tto->Observe(tt); - } - { - const auto &t = tt.timestamp(); - *_index.mutable_start() = std::min(_index.start(), t); - *_index.mutable_end() = std::max(_index.end(), t); + { + const auto &t = tt.timestamp(); + *_index.mutable_start() = std::min(_index.start(), t); + *_index.mutable_end() = std::max(_index.end(), t); + } } } } diff --git a/c++/protobag/protobag/Entry.cpp b/c++/protobag/protobag/Entry.cpp index 0d4b4ae..7430a77 100644 --- a/c++/protobag/protobag/Entry.cpp +++ b/c++/protobag/protobag/Entry.cpp @@ -54,7 +54,18 @@ std::string Entry::ToString() const { // } bool MaybeEntry::IsNotFound() const { - return error == archive::Archive::ReadStatus::EntryNotFound().error; + static const std::string kIsNotFoundPrefix = + archive::Archive::ReadStatus::EntryNotFound().error + ": "; + return error.find(kIsNotFoundPrefix) == 0; +} + +MaybeEntry MaybeEntry::NotFound(const std::string &entryname) { + MaybeEntry m; + m.error = fmt::format( + "{}: {}", + archive::Archive::ReadStatus::EntryNotFound().error, + entryname); + return m; } std::string GetTopicFromEntryname(const std::string &entryname) { diff --git a/c++/protobag/protobag/Entry.hpp b/c++/protobag/protobag/Entry.hpp index 211fe7c..0352a41 100644 --- a/c++/protobag/protobag/Entry.hpp +++ 
b/c++/protobag/protobag/Entry.hpp @@ -127,6 +127,7 @@ struct Entry { // == Raw Mode ================================ + // Create a raw entry from a Protobuf message instance (force-skips indexing) template static Result CreateRaw( const std::string &entryname, @@ -149,6 +150,7 @@ struct Entry { }; } + // Create a raw entry from raw bytes static Entry CreateRawFromBytes( const std::string &entryname, std::string &&raw_msg_contents) { @@ -249,7 +251,7 @@ struct Entry { return IsA() || ( // An unpacked StampedDatum is OK too - GetTopicTime().has_value()); + HasTopic()); } bool IsRaw() const { @@ -257,6 +259,7 @@ struct Entry { } std::optional GetTopicTime() const; + bool HasTopic() const; template Result GetAs(bool validate_type_url = true) const { @@ -306,6 +309,8 @@ struct MaybeEntry : public Result { // See Archive::ReadStatus for definition; this can be an acceptible error bool IsNotFound() const; + static MaybeEntry NotFound(const std::string &entryname); + static MaybeEntry Err(const std::string &s) { MaybeEntry m; m.error = s; return m; } @@ -361,4 +366,9 @@ inline std::optional Entry::GetTopicTime() const { } } +inline bool Entry::HasTopic() const { + auto maybe_tt = GetTopicTime(); + return maybe_tt.has_value() && !maybe_tt->topic().empty(); +} + } /* namespace protobag */ diff --git a/c++/protobag/protobag/ReadSession.cpp b/c++/protobag/protobag/ReadSession.cpp index f0c3b86..97233c5 100644 --- a/c++/protobag/protobag/ReadSession.cpp +++ b/c++/protobag/protobag/ReadSession.cpp @@ -39,7 +39,7 @@ MaybeEntry ReadSession::ReadEntryFrom( const auto maybe_bytes = archive->ReadAsStr(entryname); if (maybe_bytes.IsEntryNotFound()) { - return MaybeEntry::Err(maybe_bytes.error); + return MaybeEntry::NotFound(entryname); } else if (!maybe_bytes.IsOk()) { return MaybeEntry::Err( fmt::format("Read error for {}: {}", entryname, maybe_bytes.error)); @@ -56,7 +56,7 @@ MaybeEntry ReadSession::ReadEntryFrom( auto maybe_any = PBFactory::LoadFromContainer(*maybe_bytes.value); - 
// do we need to handle text format separately ? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // TODO maybe handle text format separately ? if (!maybe_any.IsOk()) { return MaybeEntry::Err(fmt::format( "Could not read protobuf from {}: {}", entryname, maybe_any.error)); @@ -103,7 +103,7 @@ MaybeEntry ReadSession::GetNext() { _archive, entryname, _plan.raw_mode, _spec.unpack_stamped_messages); if (maybe_entry.IsNotFound()) { if (_plan.require_all) { - return MaybeEntry::Err(fmt::format("Entry not found: {}", entryname)); + return MaybeEntry::NotFound(entryname); } else { return GetNext(); } @@ -126,6 +126,21 @@ Result ReadSession::GetIndex(const std::string &path) { return ReadLatestIndex(rp->_archive); } +Result> ReadSession::GetAllTopics(const std::string &path) { + auto maybe_index = GetIndex(path); + if (!maybe_index.IsOk()) { + return {.error = maybe_index.error}; + } + + const BagIndex &index = *maybe_index.value; + std::vector topics; + topics.reserve(index.topic_to_stats_size()); + for (const auto &entry : index.topic_to_stats()) { + topics.push_back(entry.first); + } + return {.value = topics}; +} + Result ReadSession::ReadLatestIndex(archive::Archive::Ptr archive) { if (!archive) { return {.error = "No archive to read"}; @@ -171,11 +186,12 @@ Result ReadSession::GetEntriesToRead( return {.error = "No archive to read"}; } - auto maybe_index = ReadLatestIndex(archive); // TODO support multiple indices ~~~~~~~~~~~~~~~~ + auto maybe_index = ReadLatestIndex(archive); // TODO support multiple indices if (!maybe_index.IsOk()) { + // TODO: support reindexing // // Then create one! 
// maybe_index = GetReindexed(archive); - return {.error = "Unindexed protobag not supported right now"}; // ~~~~~~~~~~~~~~~~~~~~~~~~~ + return {.error = "Unindexed protobag not currently supported"}; } if (!maybe_index.IsOk()) { @@ -291,12 +307,13 @@ Result ReadSession::GetEntriesToRead( if (window.has_end() && (window.end() < tt.timestamp())) { continue; } -// std::cout << "entries_to_read: " << tt.entryname() << std::endl; + entries_to_read.push(tt.entryname()); } return {.value = ReadPlan{ .entries_to_read = entries_to_read, - .require_all = false, // TODO should we report if index and archive don't match? ~~~~~~~~~~~~~~ + .require_all = false, + // TODO should we report if index and archive don't match? .raw_mode = false, }}; diff --git a/c++/protobag/protobag/ReadSession.hpp b/c++/protobag/protobag/ReadSession.hpp index 2bfb013..2f12aaa 100644 --- a/c++/protobag/protobag/ReadSession.hpp +++ b/c++/protobag/protobag/ReadSession.hpp @@ -21,7 +21,8 @@ class ReadSession final { Selection selection; bool unpack_stamped_messages; - // NB: for now we *only* support time-ordered reads for stamped entries. non-stamped are not ordered + // NB: for now we *only* support time-ordered reads for stamped entries. + // Non-stamped are not ordered. static Spec ReadAllFromPath(const std::string &path) { Selection sel; @@ -41,15 +42,16 @@ class ReadSession final { MaybeEntry GetNext(); - // TODO: begin() end() interface? see do-while loop in demo - // TODO TypeResolver halper ? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // Utilities // Read just the index from `path` static Result GetIndex(const std::string &path); + // Get a list of all the topics from `path` (if the archive at `path` + // has any time-series data). NB: Ignores the protobag index. + static Result> GetAllTopics(const std::string &path); + protected: Spec _spec; archive::Archive::Ptr _archive; @@ -62,11 +64,6 @@ class ReadSession final { }; ReadPlan _plan; - // maybe move these and make public ? 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~` - // static Result GetReindexed(archive::Archive::Ptr archive); - - - static MaybeEntry ReadEntryFrom( archive::Archive::Ptr archive, const std::string &entryname, diff --git a/c++/protobag/protobag/Utils/TimeSync.cpp b/c++/protobag/protobag/Utils/TimeSync.cpp index 6b56ba4..c305f29 100644 --- a/c++/protobag/protobag/Utils/TimeSync.cpp +++ b/c++/protobag/protobag/Utils/TimeSync.cpp @@ -12,7 +12,6 @@ #include "protobag/Utils/IterProducts.hpp" #include "protobag/Utils/TopicTime.hpp" - // #include using Duration = ::google::protobuf::Duration; using Timestamp = ::google::protobuf::Timestamp; @@ -71,15 +70,6 @@ std::vector FindMinCostBundle( const std::vector> &all_q_stamps, ::google::protobuf::Duration max_slop) { -// std::cout << "FINDMINCOST all_q_stamps start" << std::endl; -// for (size_t q = 0; q < all_q_stamps.size(); ++q) { -// std::cout << "q " << q << std::endl; -// for (const auto &t : all_q_stamps[q]) { -// std::cout << t << std::endl; -// } -// } -// std::cout << "FINDMINCOST all_q_stamps end" << std::endl; - // Compute and return the total duration of the stamps indicated at `indices` auto TotalDuration = [&](const std::vector &indices) -> Duration { Timestamp start = MaxTimestamp(); @@ -157,7 +147,6 @@ struct MaxSlopTimeSync::Impl { return; } const TopicTime &tt = *maybeTT; -// std::cout << "tt.topic() " << tt.topic() << std::endl; if (topic_to_q.find(tt.topic()) != topic_to_q.end()) { auto &topic_q = topic_to_q[tt.topic()]; if (topic_q.Size() >= spec.max_queue_size) { @@ -165,13 +154,6 @@ struct MaxSlopTimeSync::Impl { } topic_q.Push(tt.timestamp(), std::move(entry)); } -// std::cout << "enqueued" << std::endl; - -// std::cout << "queue sizes:" << std::endl; -// for (const auto &tq : topic_to_q) { -// std::cout << tq.first << " " << tq.second.Size() << std::endl; -// } - } MaybeBundle TryGetNext() { @@ -182,7 +164,6 @@ struct MaxSlopTimeSync::Impl { for (const auto &tq : topic_to_q) { if 
(tq.second.IsEmpty()) { return kNoBundle; -// std::cout << "queue is empty " << tq.first << std::endl; } } @@ -197,9 +178,7 @@ struct MaxSlopTimeSync::Impl { for (const auto &topic : topics_ordered) { all_q_stamps.push_back(topic_to_q[topic].GetTimestamps()); } -// std::cout << "start find min cost" << std::endl; auto maybe_bundle_ts = FindMinCostBundle(all_q_stamps, spec.max_slop); -// std::cout << "end find min cost" << std::endl; if (maybe_bundle_ts.empty()) { return kNoBundle; } else { @@ -262,7 +241,6 @@ MaybeBundle MaxSlopTimeSync::GetNext() { bool reading = true; while (reading) { auto maybe_next_entry = rs.GetNext(); -// std::cout << "maybe_next_entry: " << (maybe_next_entry.error.empty() ? maybe_next_entry.value->ToString() : maybe_next_entry.error) << std::endl; if (!maybe_next_entry.IsOk()) { reading = false; return MaybeBundle::Err(maybe_next_entry.error); @@ -270,7 +248,6 @@ MaybeBundle MaxSlopTimeSync::GetNext() { _impl->Enqueue(std::move(*maybe_next_entry.value)); auto maybe_next_bundle = _impl->TryGetNext(); -// std::cout << "maybe_next_bundle: " << maybe_next_bundle.error << std::endl; if (maybe_next_bundle.IsOk()) { return maybe_next_bundle; } // else continue reading; maybe we'll get a bundle next time diff --git a/c++/protobag/protobag/Utils/TimeSync.hpp b/c++/protobag/protobag/Utils/TimeSync.hpp index 9ab3483..f5f0915 100644 --- a/c++/protobag/protobag/Utils/TimeSync.hpp +++ b/c++/protobag/protobag/Utils/TimeSync.hpp @@ -32,6 +32,7 @@ struct MaybeBundle : Result { } }; +// Base interace to a Time Synchronization algorithm. class TimeSync { public: typedef std::shared_ptr Ptr; @@ -52,10 +53,10 @@ class TimeSync { // Approximately synchronizes messages from given topics as follows: -// * Waits until there is at least one StampedMessage for every topic +// * Waits until there is at least one StampedMessage for every topic (and +// ignores entries that lack topci/timestamp data) // * Look at all possible bundlings of messages receieved thus far ... 
-// * Discard any bundling with total time difference greater than -// `max_slop` +// * Discard any bundle with total time difference greater than `max_slop` // * Emit the bundle with minimal total time difference and dequeue emitted // messages // * Continue until source ReadSession exhausted @@ -83,7 +84,7 @@ class MaxSlopTimeSync final : public TimeSync { struct Spec { std::vector topics; ::google::protobuf::Duration max_slop; - size_t max_queue_size = 1; + size_t max_queue_size = 1; // Recall: max queue size *per topic* // static WithMaxSlop(float max_slop_sec) { // Specs s; diff --git a/c++/protobag/protobag/archive/Archive.hpp b/c++/protobag/protobag/archive/Archive.hpp index f89d858..88e0ce4 100644 --- a/c++/protobag/protobag/archive/Archive.hpp +++ b/c++/protobag/protobag/archive/Archive.hpp @@ -26,11 +26,11 @@ class Archive { struct Spec { // clang-format off std::string mode; - // Choices: "read", "write" (which also means append) + // Choices: "read", "write" ("append" not yet tested / supported) std::string path; - // TODO a path ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // A local path for the archive // Special values: - // "" - Generate a + // "" - Generate (and write to) a temporary file std::string format; // Choices: // "memory" - Simply use an in-memory hashmap to store all archive @@ -65,9 +65,6 @@ class Archive { struct ReadStatus : public Result { static ReadStatus EntryNotFound() { return Err("EntryNotFound"); } bool IsEntryNotFound() const { return error == "EntryNotFound"; } - - // static ReadStatus EndOfArchive() { return Err("EndOfArchive"); } - // bool IsEndOfArchive() const { return error == "EndOfArchive"; } static ReadStatus Err(const std::string &s) { ReadStatus st; st.error = s; return st; @@ -88,6 +85,7 @@ class Archive { // TODO: bulk reads of several entries, probably be faster + // Writing ------------------------------------------------------------------ virtual OkOrErr Write( const std::string 
&entryname, const std::string &data) { diff --git a/c++/protobag/protobag/archive/DirectoryArchive.cpp b/c++/protobag/protobag/archive/DirectoryArchive.cpp index b8de723..ad3dd5b 100644 --- a/c++/protobag/protobag/archive/DirectoryArchive.cpp +++ b/c++/protobag/protobag/archive/DirectoryArchive.cpp @@ -79,16 +79,20 @@ OkOrErr DirectoryArchive::Write( fs::path entry_path = fs::path(_spec.path) / entry_path_rel; fs::create_directories(entry_path.parent_path()); + // Write! { std::ofstream out(entry_path, std::ios::binary); out << data; } + // Did that work? if (fs::is_regular_file(entry_path)) { return kOK; } else { return OkOrErr::Err( - fmt::format("Failed to write {} in {}", entryname, ToString())); + fmt::format( + "Failed to write entryname: {} entry_path: {} {}", + entryname, entry_path.u8string(), ToString())); } } diff --git a/c++/protobag_native/protobag_native.cpp b/c++/protobag_native/protobag_native.cpp index f4c2d0e..c51d824 100644 --- a/c++/protobag_native/protobag_native.cpp +++ b/c++/protobag_native/protobag_native.cpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #ifndef PROTOBAG_VERSION #define PROTOBAG_VERSION "unknown" @@ -77,7 +77,8 @@ struct native_entry final { }; -class Reader final { + +class PyReader final { public: void Start(const std::string &path, const std::string &sel_pb_bytes) { auto maybe_sel = PBFactory::LoadFromContainer(sel_pb_bytes); @@ -105,15 +106,21 @@ class Reader final { _read_sess = *maybe_rp.value; } - native_entry Next() { + std::optional GetNext() { if (!_read_sess) { throw std::runtime_error("Invalid read session"); } auto &reader = *_read_sess; auto maybe_entry = reader.GetNext(); - if (maybe_entry.IsEndOfSequence()) { - throw pybind11::stop_iteration(); + if (maybe_entry.IsEndOfSequence() || maybe_entry.IsNotFound()) { + // NB: We use this exception instead of pybind11::stop_iteration due + // to a bug in pybind related to libc++. 
FMI see: + // * https://gitter.im/pybind/Lobby?at=5f18cfc9361e295cf01fd21a + // * (This fix appears to still have a bug) + // https://github.com/pybind/pybind11/pull/949 + return std::nullopt; + } else if (!maybe_entry.IsOk()) { throw std::runtime_error(maybe_entry.error); } @@ -141,12 +148,86 @@ class Reader final { return *maybe_str.value; } + static std::vector GetAllTopics(const std::string &path) { + auto maybe_topics = ReadSession::GetAllTopics(path); + if (!maybe_topics.IsOk()) { + throw std::runtime_error( + fmt::format("Failed to read topics from {}: {}", + path, maybe_topics.error)); + } + return *maybe_topics.value; + } + + ReadSession::Ptr GetSession() const { return _read_sess; } + protected: ReadSession::Ptr _read_sess; }; -class Writer final { + +class PyTimeSyncBase { +public: + virtual ~PyTimeSyncBase() { } + virtual std::optional> GetNext() { + if (!_sync) { + throw std::runtime_error("Invalid synchronizer"); + } + + MaybeBundle next = _sync->GetNext(); + if (next.IsEndOfSequence()) { + // NB: We use this exception instead of pybind11::stop_iteration due + // to a bug in pybind related to libc++. 
FMI see: + // * https://gitter.im/pybind/Lobby?at=5f18cfc9361e295cf01fd21a + // * (This fix appears to still have a bug) + // https://github.com/pybind/pybind11/pull/949 + return std::nullopt; + + } else if (!next.IsOk()) { + throw std::runtime_error(next.error); + } + + std::list nbundle; + for (const auto &entry : *next.value) { + nbundle.push_back(native_entry::FromEntry(entry)); + } + return nbundle; + } + +protected: + TimeSync::Ptr _sync; +}; + +class PyMaxSlopTimeSync : public PyTimeSyncBase { +public: + void Start( + const PyReader &reader, + const MaxSlopTimeSync::Spec &spec) { + + auto read_sess = reader.GetSession(); + if (!read_sess) { + throw std::runtime_error("Invalid read session"); + } + + _spec = spec; + auto maybe_sync = MaxSlopTimeSync::Create(read_sess, spec); + if (!maybe_sync.IsOk()) { + throw std::runtime_error(fmt::format( + "Failed to create MaxSlopTimeSync: {}", maybe_sync.error)); + } + + _sync = *maybe_sync.value; + } + + MaxSlopTimeSync::Spec GetSpec() const { return _spec; } + +protected: + MaxSlopTimeSync::Spec _spec; +}; + + + +class PyWriter final { public: void Start(WriteSession::Spec s) { s.archive_spec.mode = "write"; @@ -259,6 +340,8 @@ PYBIND11_MODULE(protobag_native, m) { m.def("get_version", []() { return std::string(PROTOBAG_VERSION);}); + + /// native_entry py::class_(m, "native_entry", "Handle to a native entry") .def(py::init<>()) .def_readwrite("entryname", &native_entry::entryname) @@ -269,17 +352,68 @@ PYBIND11_MODULE(protobag_native, m) { .def_readwrite("sec", &native_entry::sec) .def_readwrite("nanos", &native_entry::nanos); - py::class_(m, "Reader", "Handle to a Protobag ReadSession") + + /// Reading + py::class_(m, "PyReader", "Handle to a Protobag ReadSession") .def(py::init<>(), "Create a null session") - .def("start", &Reader::Start, "Begin reading the given Selection") - .def("__iter__", [](Reader &r) -> Reader& { return r; }) - .def("next", &Reader::Next, "Generator interface: emit the next entry") - 
.def("__next__", &Reader::Next, "Generator interface: emit the next entry") + .def("start", &PyReader::Start, "Begin reading the given Selection") + .def( + "get_next", + &PyReader::GetNext, + "Get next item or None for end of sequence") .def_static( "get_index", - &Reader::GetIndex, - "Get the (string-serialized) BagIndex for the bag at the given path"); + &PyReader::GetIndex, + "Get the (string-serialized) BagIndex for the bag at the given path") + .def_static( + "get_topics", + &PyReader::GetAllTopics, + "Get the list of topics (for any time-series data) in the bag " + "at the given path"); + + + /// TimeSync + py::class_( + m, "MaxSlopTimeSyncSpec", "Spec for a MaxSlopTimeSync") + .def(py::init<>()) + .def_readwrite( + "topics", &MaxSlopTimeSync::Spec::topics, "Synchronize these topics") + .def("set_max_slop", + [](MaxSlopTimeSync::Spec &s, int64_t sec, int32_t nanos) { + s.max_slop.set_seconds(sec); s.max_slop.set_nanos(nanos); + }, + py::arg("seconds"), + py::arg("nanos"), + "Discard any bundle with total time difference greater than `max_slop`") + .def("get_max_slop", + [](MaxSlopTimeSync::Spec &s) { + py::dict d; + d["seconds"] = s.max_slop.seconds(); + d["nanos"] = s.max_slop.nanos(); + return d; + }) + .def_readwrite( + "max_queue_size", + &MaxSlopTimeSync::Spec::max_queue_size, + "Buffer at most this many messages per topic"); + + py::class_( + m, "PyMaxSlopTimeSync", + "Approximately synchronize two or more StampedMessage topics using " + "a max slop algorithm. FMI see docs for `protobag::MaxSlopTimeSync`. " + "Non-timestamped entries read during synchronization are dropped and " + "ignored. 
") + .def(py::init<>()) + .def( + "start", &PyMaxSlopTimeSync::Start, + "Begin synchronizing the given reader") + .def( + "get_next", + &PyMaxSlopTimeSync::GetNext, + "Get next bundle or None for end of sequence"); + + /// Writing py::class_(m, "WriterSpec", "Spec for a WriteSession") .def(py::init<>()) .def_readwrite( @@ -299,14 +433,14 @@ PYBIND11_MODULE(protobag_native, m) { }, "Write in this format"); - py::class_(m, "Writer", "Handle to a Protobag WriteSession") + py::class_(m, "PyWriter", "Handle to a Protobag WriteSession") .def(py::init<>(), "Create a null session") - .def("start", &Writer::Start, "Begin writing given a WriteSession::Spec") - .def("close", &Writer::Close, "End writing session") - .def("write_raw", &Writer::WriteRaw, "Write the given raw bytes") + .def("start", &PyWriter::Start, "Begin writing given a WriteSession::Spec") + .def("close", &PyWriter::Close, "End writing session") + .def("write_raw", &PyWriter::WriteRaw, "Write the given raw bytes") .def( "write_msg", - &Writer::WriteMsg, + &PyWriter::WriteMsg, py::arg("entryname"), py::arg("type_url"), py::arg("msg_bytes"), @@ -314,7 +448,7 @@ PYBIND11_MODULE(protobag_native, m) { "Write the given message") .def( "write_stamped_msg", - &Writer::WriteStampedMsg, + &PyWriter::WriteStampedMsg, py::arg("topic"), py::arg("sec"), py::arg("nanos"), diff --git a/c++/protobag_test/protobag/DemoTest.cpp b/c++/protobag_test/protobag/DemoTest.cpp index 3243fde..70cb18c 100644 --- a/c++/protobag_test/protobag/DemoTest.cpp +++ b/c++/protobag_test/protobag/DemoTest.cpp @@ -132,15 +132,6 @@ TEST(DemoTest, TestDemo) { "Read entry:" << std::endl << current.ToString()); - // TODO show how to decode, also with dynamic decode! ~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // if (current.topic == "/topic1") { - // LOG("Read a string msg: " << UnpackedToPBTxt(current.stamped_msg)); - // } else if (current.topic == "/topic2") { - // LOG("Read an int msg: " << UnpackedToPBTxt(current.stamped_msg)); - // } else { - // LOG("Got ??? 
" << PBToString(current.stamped_msg)); - // } - LOG(""); } while(still_reading); @@ -151,9 +142,10 @@ TEST(DemoTest, TestDemo) { if (!maybe_index.IsOk()) { throw std::runtime_error(maybe_index.error); } - LOG( - "Protobag Index:" << std::endl << - PBToString(*maybe_index.value)); + // This is super noisy + // LOG( + // "Protobag Index:" << std::endl << + // PBToString(*maybe_index.value)); } } @@ -168,144 +160,118 @@ TEST(DemoTest, TestDemo) { // #include #include -// // https://github.com/protocolbuffers/protobuf/blob/7bff8393cab939bfbb9b5c69b3fe76b4d83c41ee/src/google/protobuf/util/json_util.cc#L217 -// namespace detail { -// using namespace google::protobuf; -// const char* kTypeUrlPrefix = "type.googleapis.com"; -// util::TypeResolver* generated_type_resolver_ = NULL; -// ::google::protobuf::internal::once_flag generated_type_resolver_init_; - -// std::string GetTypeUrl(const Message& message) { -// return std::string(kTypeUrlPrefix) + "/" + -// message.GetDescriptor()->full_name(); -// } - -// void DeleteGeneratedTypeResolver() { delete generated_type_resolver_; } - -// void InitGeneratedTypeResolver() { -// generated_type_resolver_ = util::NewTypeResolverForDescriptorPool( -// kTypeUrlPrefix, DescriptorPool::generated_pool()); -// ::google::protobuf::internal::OnShutdown(&DeleteGeneratedTypeResolver); -// } - -// util::TypeResolver* GetGeneratedTypeResolver() { -// ::google::protobuf::internal::call_once(generated_type_resolver_init_, -// InitGeneratedTypeResolver); -// return generated_type_resolver_; -// } -// } // namespace detail - -TEST(DemoTest, TestMonkey) { - - TopicTime tt; - - tt.set_topic("my-topic"); - tt.mutable_timestamp()->set_seconds(123); - - LOG( - "tt:" << std::endl << - PBToString(tt)); - - ::google::protobuf::DescriptorProto p; - tt.GetDescriptor()->CopyTo(&p); - // LOG( - // "tt descriptor:" << - // PBToString(p)); - - ::google::protobuf::FileDescriptorSet fds; - ::google::protobuf::FileDescriptorProto *fd = fds.add_file(); - 
tt.GetDescriptor()->file()->CopyTo(fd); - LOG("containing_type " << tt.GetDescriptor()->containing_type()); - - LOG("dependency_count " << tt.GetDescriptor()->file()->dependency_count()); - for (int d = 0; d < tt.GetDescriptor()->file()->dependency_count(); ++d) { - ::google::protobuf::FileDescriptorProto *fd = fds.add_file(); - const ::google::protobuf::FileDescriptor *dep = tt.GetDescriptor()->file()->dependency(d); - dep->CopyTo(fd); - LOG("copied " << dep->name()); - } - - // { - // google::protobuf::Any any; - // ::google::protobuf::FileDescriptorProto *fd = fds.add_file(); - // any.GetDescriptor()->file()->CopyTo(fd); - // } - // { - // google::protobuf::Timestamp any; - // ::google::protobuf::FileDescriptorProto *fd = fds.add_file(); - // any.GetDescriptor()->file()->CopyTo(fd); - // } - // { - // google::protobuf::DescriptorProto any; - // ::google::protobuf::FileDescriptorProto *fd = fds.add_file(); - // any.GetDescriptor()->file()->CopyTo(fd); - // } - - // LOG( - // "tt fds:" << - // PBToString(fds)); - - { - using namespace ::google::protobuf; - const std::string msg_str = PBToString(tt); - - SimpleDescriptorDatabase db; - DescriptorPool pool(&db); - for (int i = 0; i < fds.file_size(); ++i) { - db.Add(fds.file(i)); - } - - { - std::vector fnames; - bool success = db.FindAllFileNames(&fnames); - if (success) { - for (const auto &fname : fnames) { - LOG("db file: " << fname); - } - } - } - - - LOG("full name " << tt.GetDescriptor()->full_name()); - DynamicMessageFactory factory; - const Descriptor *mt = nullptr; - mt = pool.FindMessageTypeByName(tt.GetDescriptor()->full_name()); - LOG("mt " << mt); +// TEST(DemoTest, TestMonkey) { + +// TopicTime tt; + +// tt.set_topic("my-topic"); +// tt.mutable_timestamp()->set_seconds(123); + +// LOG( +// "tt:" << std::endl << +// PBToString(tt)); + +// ::google::protobuf::DescriptorProto p; +// tt.GetDescriptor()->CopyTo(&p); +// // LOG( +// // "tt descriptor:" << +// // PBToString(p)); + +// 
::google::protobuf::FileDescriptorSet fds; +// ::google::protobuf::FileDescriptorProto *fd = fds.add_file(); +// tt.GetDescriptor()->file()->CopyTo(fd); +// LOG("containing_type " << tt.GetDescriptor()->containing_type()); + +// LOG("dependency_count " << tt.GetDescriptor()->file()->dependency_count()); +// for (int d = 0; d < tt.GetDescriptor()->file()->dependency_count(); ++d) { +// ::google::protobuf::FileDescriptorProto *fd = fds.add_file(); +// const ::google::protobuf::FileDescriptor *dep = tt.GetDescriptor()->file()->dependency(d); +// dep->CopyTo(fd); +// LOG("copied " << dep->name()); +// } + +// // { +// // google::protobuf::Any any; +// // ::google::protobuf::FileDescriptorProto *fd = fds.add_file(); +// // any.GetDescriptor()->file()->CopyTo(fd); +// // } +// // { +// // google::protobuf::Timestamp any; +// // ::google::protobuf::FileDescriptorProto *fd = fds.add_file(); +// // any.GetDescriptor()->file()->CopyTo(fd); +// // } +// // { +// // google::protobuf::DescriptorProto any; +// // ::google::protobuf::FileDescriptorProto *fd = fds.add_file(); +// // any.GetDescriptor()->file()->CopyTo(fd); +// // } + +// // LOG( +// // "tt fds:" << +// // PBToString(fds)); + + +// { +// using namespace ::google::protobuf; +// const std::string msg_str = PBToString(tt); + +// SimpleDescriptorDatabase db; +// DescriptorPool pool(&db); +// for (int i = 0; i < fds.file_size(); ++i) { +// db.Add(fds.file(i)); +// } + +// { +// std::vector fnames; +// bool success = db.FindAllFileNames(&fnames); +// if (success) { +// for (const auto &fname : fnames) { +// LOG("db file: " << fname); +// } +// } +// } + + +// LOG("full name " << tt.GetDescriptor()->full_name()); +// DynamicMessageFactory factory; +// const Descriptor *mt = nullptr; +// mt = pool.FindMessageTypeByName(tt.GetDescriptor()->full_name()); +// LOG("mt " << mt); - if (mt) { - std::unique_ptr mp(factory.GetPrototype(mt)->New()); - LOG("value of message ptr " << mp.get()); - - if (mp) { - // NOTE! 
msg is owned by the factory!! might wanna do a Swap - auto &msg = *mp; - ::google::protobuf::TextFormat::ParseFromString(msg_str, &msg); - LOG("debug " << msg.DebugString()); - - - { - std::string out; - auto status = ::google::protobuf::util::MessageToJsonString(msg, &out); - if (!status.ok()) { - LOG("status out " << status.ToString()); - } - LOG("my jsons: " << out); - } - } - } - - - // using namespace google::protobuf; - // const DescriptorPool* pool = tt.GetDescriptor()->file()->pool(); - // util::TypeResolver* resolver = - // pool == DescriptorPool::generated_pool() - // ? detail::GetGeneratedTypeResolver() - // : util::NewTypeResolverForDescriptorPool(detail::kTypeUrlPrefix, pool); +// if (mt) { +// std::unique_ptr mp(factory.GetPrototype(mt)->New()); +// LOG("value of message ptr " << mp.get()); + +// if (mp) { +// // NOTE! msg is owned by the factory!! might wanna do a Swap +// auto &msg = *mp; +// ::google::protobuf::TextFormat::ParseFromString(msg_str, &msg); +// LOG("debug " << msg.DebugString()); + + +// { +// std::string out; +// auto status = ::google::protobuf::util::MessageToJsonString(msg, &out); +// if (!status.ok()) { +// LOG("status out " << status.ToString()); +// } +// LOG("my jsons: " << out); +// } +// } +// } + + +// // using namespace google::protobuf; +// // const DescriptorPool* pool = tt.GetDescriptor()->file()->pool(); +// // util::TypeResolver* resolver = +// // pool == DescriptorPool::generated_pool() +// // ? 
detail::GetGeneratedTypeResolver() +// // : util::NewTypeResolverForDescriptorPool(detail::kTypeUrlPrefix, pool); - } +// } -} +// } diff --git a/c++/protobag_test/protobag/EntryTest.cpp b/c++/protobag_test/protobag/EntryTest.cpp index 8826b52..382e9e0 100644 --- a/c++/protobag_test/protobag/EntryTest.cpp +++ b/c++/protobag_test/protobag/EntryTest.cpp @@ -1 +1,13 @@ -// todo \ No newline at end of file +#include "gtest/gtest.h" + +#include "protobag/Entry.hpp" +#include "protobag/Utils/StdMsgUtils.hpp" + +using namespace protobag; + +TEST(EntryTest, TestBasic) { + + auto entry = Entry::Create("/moof", ToStringMsg("moof")); + EXPECT_EQ(entry.entryname, "/moof"); + EXPECT_EQ(entry.msg.type_url(), GetTypeURL()); +} diff --git a/c++/protobag_test/protobag/ProtobagTest.cpp b/c++/protobag_test/protobag/ProtobagTest.cpp index 99b8a71..4a5b3e5 100644 --- a/c++/protobag_test/protobag/ProtobagTest.cpp +++ b/c++/protobag_test/protobag/ProtobagTest.cpp @@ -1,11 +1,9 @@ #include "gtest/gtest.h" -#include - #include "protobag/Protobag.hpp" TEST(ProtobagTest, TestBasic) { - // TODO + // See DemoTest } diff --git a/c++/protobag_test/protobag/ReadSessionTest.cpp b/c++/protobag_test/protobag/ReadSessionTest.cpp index 2b8e7e1..82130f4 100644 --- a/c++/protobag_test/protobag/ReadSessionTest.cpp +++ b/c++/protobag_test/protobag/ReadSessionTest.cpp @@ -3,7 +3,10 @@ #include #include #include +#include +#include "protobag/archive/Archive.hpp" +#include "protobag/BagIndexBuilder.hpp" #include "protobag/Entry.hpp" #include "protobag/Utils/PBUtils.hpp" #include "protobag/Utils/StdMsgUtils.hpp" @@ -13,14 +16,9 @@ using namespace protobag; +using namespace protobag::archive; using namespace protobag_test; -static const std::vector kExpectedEntries = { - Entry::CreateStamped("/topic1", 0, 0, ToStringMsg("foo")), - Entry::CreateStamped("/topic2", 0, 0, ToIntMsg(1337)), - Entry::CreateStamped("/topic1", 1, 0, ToStringMsg("bar")), -}; - inline ReadSession::Ptr OpenReaderAndCheck(const 
ReadSession::Spec &spec) { auto result = ReadSession::Create(spec); @@ -36,51 +34,196 @@ ReadSession::Ptr OpenReaderAndCheck(const ReadSession::Spec &spec) { return r; } -// TODO need to massage out indexing and stuff ... -// TEST(ReadSessionMemory, TestBasic) { -// } +inline +Entry CreateStampedWithEntryname( + const std::string &entryname, + Entry entry) { + + entry.entryname = entryname; + return entry; +} -// FIXME: add re-index support or make index fixture -// TEST(ReadSessionDirectory, TestBasic) { -// auto fixture_path = GetFixture("ReadSessionDirectory.TestBasic"); +// So that we can make the tests in this module independent of WriteSession, +// we manually create protobag fixtures using the utility below, which +// simulates what a WriteSession + DirectoryArchive would leave on disk. +template +void WriteEntriesAndIndex( + const std::string &path, + const EntryContainerT &entries, + const std::string &format="directory") { + + auto maybe_dar = Archive::Open({ + .mode="write", + .path=path, + .format=format, + }); + if (!maybe_dar.IsOk()) { + throw std::runtime_error(maybe_dar.error); + } + auto dar = *maybe_dar.value; + + // Write entries + BagIndexBuilder::UPtr builder(new BagIndexBuilder()); + for (const auto &entry : entries) { + auto maybe_m_bytes = PBFactory::ToBinaryString(entry.msg); + if (!maybe_m_bytes.IsOk()) { + throw std::runtime_error(maybe_m_bytes.error); + } + + auto status = dar->Write(entry.entryname, *maybe_m_bytes.value); + if (!status.IsOk()) { + throw std::runtime_error(status.error); + } + + builder->Observe(entry, entry.entryname); + } + + // Write index + BagIndex index = BagIndexBuilder::Complete(std::move(builder)); + { + auto index_entry = CreateStampedWithEntryname( + "/_protobag_index/bag_index/1337.1337.stampedmsg.protobin", + Entry::CreateStamped( + "/_protobag_index/bag_index", 1337, 1337, index)); -// auto rp = OpenReaderAndCheck( -// ReadSession::Spec::ReadAllFromPath(fixture_path)); + auto maybe_m_bytes = 
PBFactory::ToBinaryString(index_entry.msg); + if (!maybe_m_bytes.IsOk()) { + throw std::runtime_error(maybe_m_bytes.error); + } -// auto &reader = *rp; + auto status = dar->Write(index_entry.entryname, *maybe_m_bytes.value); + if (!status.IsOk()) { + throw std::runtime_error(status.error); + } + } + + dar->Close(); +} + +template +void ReadAllEntriesAndCheck( + const std::string &path, + const EntryContainerT &expected_entries) { + + auto rp = OpenReaderAndCheck(ReadSession::Spec::ReadAllFromPath(path)); + auto &reader = *rp; + + std::vector actual_entries; + bool still_reading = true; + bool has_index = false; + do { + MaybeEntry maybe_next = reader.GetNext(); + if (maybe_next.IsEndOfSequence()) { + still_reading = false; + break; + } + ASSERT_TRUE(maybe_next.IsOk()) << maybe_next.error; + const auto &entry = *maybe_next.value; + if (entry.entryname.find("/_protobag_index") != std::string::npos) { + has_index = true; + } else { + actual_entries.push_back(*maybe_next.value); + } + } while(still_reading); + + + std::vector expected_names; + std::unordered_map name_to_expected; + { + for (const auto &eentry : expected_entries) { + name_to_expected[eentry.entryname] = eentry; + expected_names.push_back(eentry.entryname); + } + } -// std::vector actual_entries; -// bool still_reading = true; -// do { -// MaybeEntry maybe_next = reader.GetNext(); -// if (maybe_next.IsEndOfSequence()) { -// still_reading = false; -// break; -// } - -// ASSERT_TRUE(maybe_next.IsOk()) << maybe_next.error; + std::vector actual_names; + std::unordered_map name_to_actual; + { + for (const auto &aentry : actual_entries) { + name_to_actual[aentry.entryname] = aentry; + actual_names.push_back(aentry.entryname); + } + } + + // Check just the entry name lists match + EXPECT_SORTED_SEQUENCES_EQUAL(expected_names, actual_names); + ASSERT_EQ(expected_names.size(), actual_names.size()); + + // Check contents + for (const auto &expected_me : name_to_expected) { + const auto &actual = 
name_to_actual[expected_me.first]; + auto expected = expected_me.second; -// actual_entries.push_back(*maybe_next.value); + if (expected.IsStampedMessage()) { + + auto e_tt = expected.GetTopicTime(); + ASSERT_TRUE(e_tt.has_value()); + auto a_tt = actual.GetTopicTime(); + ASSERT_TRUE(a_tt.has_value()); + EXPECT_EQ(e_tt->topic(), a_tt->topic()); + EXPECT_EQ(e_tt->timestamp().seconds(), a_tt->timestamp().seconds()); + EXPECT_EQ(e_tt->timestamp().nanos(), a_tt->timestamp().nanos()); + + + // Unpack expected, actual is already unpacked + auto maybe_ee = expected.UnpackFromStamped(); + ASSERT_TRUE(maybe_ee.IsOk()); + expected = *maybe_ee.value; + } else { + EXPECT_EQ( + PBToString(actual.msg), + PBToString(expected.msg)); + } + + EXPECT_TRUE(actual.EntryDataEqualTo(expected)) << + "Actual: " << actual.ToString() << + "\nExpected:\n" << expected.ToString(); + } +} -// } while(still_reading); +TEST(ReadSessionTest, DirectoryTestMessages) { + auto testdir = CreateTestTempdir("ReadSessionTest.DirectoryTestMessages"); -// auto expected_entries = kExpectedEntries; + static const std::vector kExpectedEntries = { + Entry::Create("/moof", ToStringMsg("moof")), + Entry::Create("/hi_1337", ToIntMsg(1337)), + }; -// ASSERT_EQ(actual_entries.size(), expected_entries.size()); -// for (size_t i = 0; i < actual_entries.size(); ++i) { -// auto expected = expected_entries[i]; -// auto actual = actual_entries[i]; + WriteEntriesAndIndex(testdir, kExpectedEntries); -// EXPECT_EQ( -// PBToString(actual.msg), -// PBToString(expected.msg)); - + ReadAllEntriesAndCheck(testdir, kExpectedEntries); +} + +TEST(ReadSessionTest, DirectoryTestStampedMessages) { + auto testdir = CreateTestTempdir("ReadSessionTest.DirectoryTestStampedMessages"); + + static const std::vector kExpectedEntries = { + CreateStampedWithEntryname( + "/topic1/0.0.stampedmsg.protobin", + Entry::CreateStamped("/topic1", 0, 0, ToStringMsg("foo"))), + CreateStampedWithEntryname( + "/topic2/0.0.stampedmsg.protobin", + 
Entry::CreateStamped("/topic2", 0, 0, ToIntMsg(1337))), + CreateStampedWithEntryname( + "/topic1/1.0.stampedmsg.protobin", + Entry::CreateStamped("/topic1", 1, 0, ToStringMsg("bar"))), + }; -// EXPECT_TRUE(actual.EntryDataEqualTo(expected)) << -// "Actual: " << actual.ToString() << -// "\nExpected:\n" << expected.ToString(); -// } -// } + WriteEntriesAndIndex(testdir, kExpectedEntries); -// TODO: test with tar and zip \ No newline at end of file + ReadAllEntriesAndCheck(testdir, kExpectedEntries); +} + +TEST(ReadSessionTest, DirectoryTestRawMessages) { + auto testdir = CreateTestTempdir("ReadSessionTest.DirectoryTestRawMessages"); + + static const std::vector kExpectedEntries = { + Entry::CreateRawFromBytes("/i_am_raw", "i am raw data"), + Entry::CreateRawFromBytes("/i_am_raw2", "i am also raw data"), + }; + + WriteEntriesAndIndex(testdir, kExpectedEntries); + + ReadAllEntriesAndCheck(testdir, kExpectedEntries); +} diff --git a/c++/protobag_test/protobag/Utils/TimeSyncTest.cpp b/c++/protobag_test/protobag/Utils/TimeSyncTest.cpp index 757b99f..fd63e5d 100644 --- a/c++/protobag_test/protobag/Utils/TimeSyncTest.cpp +++ b/c++/protobag_test/protobag/Utils/TimeSyncTest.cpp @@ -12,7 +12,6 @@ using namespace protobag; using namespace protobag_test; -// TODO promote for other tests? 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ namespace protobag { inline diff --git a/c++/protobag_test/protobag/WriteSessionTest.cpp b/c++/protobag_test/protobag/WriteSessionTest.cpp index 033a346..7f31a1b 100644 --- a/c++/protobag_test/protobag/WriteSessionTest.cpp +++ b/c++/protobag_test/protobag/WriteSessionTest.cpp @@ -15,9 +15,13 @@ using namespace protobag_test; std::vector CreateEntriesFixture() { return { + Entry::Create("/moof", ToStringMsg("moof")), + Entry::CreateStamped("/topic1", 0, 0, ToStringMsg("foo")), Entry::CreateStamped("/topic1", 1, 0, ToStringMsg("bar")), Entry::CreateStamped("/topic2", 0, 0, ToIntMsg(1337)), + + Entry::CreateRawFromBytes("/i_am_raw", "i am raw data"), }; } @@ -40,7 +44,7 @@ inline void ExpectWriteOk(WriteSession &w, const Entry &entry) { OkOrErr result = w.WriteEntry(entry); if (!result.IsOk()) { - throw new std::runtime_error(result.error); + throw std::runtime_error(result.error); } } @@ -86,6 +90,8 @@ TEST(WriteSessionDirectory, TestBasic) { "/topic1/0.0.stampedmsg.protobin", "/topic1/1.0.stampedmsg.protobin", "/topic2/0.0.stampedmsg.protobin", + "/moof", + "/i_am_raw", }; EXPECT_SORTED_SEQUENCES_EQUAL(expected, actual); } @@ -169,10 +175,33 @@ TEST(WriteSessionDirectory, TestBasic) { } } + { + auto res = dar->ReadAsStr("moof"); + ASSERT_TRUE(res.IsOk()) << res.error; + auto maybe_msg = PBFactory::LoadFromContainer<::google::protobuf::Any>(*res.value); + ASSERT_TRUE(maybe_msg.IsOk()) << maybe_msg.error; + const ::google::protobuf::Any &any_msg = *maybe_msg.value; + ASSERT_EQ(any_msg.type_url(), GetTypeURL()); + { + auto maybe_msg = PBFactory::UnpackFromAny(any_msg); + ASSERT_TRUE(maybe_msg.IsOk()) << maybe_msg.error; + + const StdMsg_String &m = *maybe_msg.value; + EXPECT_EQ(m.value(), "moof"); + } + } { - // TODO check bag meta ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + auto res = dar->ReadAsStr("i_am_raw"); + ASSERT_TRUE(res.IsOk()) << res.error; + + auto maybe_msg = 
PBFactory::LoadFromContainer<::google::protobuf::Any>(*res.value); + ASSERT_TRUE(maybe_msg.IsOk()) << maybe_msg.error; + const ::google::protobuf::Any &any_msg = *maybe_msg.value; + ASSERT_EQ(any_msg.type_url(), ""); + ASSERT_EQ(any_msg.value(), "i am raw data"); } + } } diff --git a/c++/protobag_test/protobag_test/Utils.hpp b/c++/protobag_test/protobag_test/Utils.hpp index 99c7ed1..8d8fe7e 100644 --- a/c++/protobag_test/protobag_test/Utils.hpp +++ b/c++/protobag_test/protobag_test/Utils.hpp @@ -275,4 +275,4 @@ ::testing::AssertionResult AssertSequencesEqual( a2, \ #expected, \ #actual)); \ - } while(0) \ No newline at end of file + } while(0) diff --git a/cocoa/ProtobagOSX/Podfile b/cocoa/ProtobagOSX/Podfile index 766e955..a5bfb14 100644 --- a/cocoa/ProtobagOSX/Podfile +++ b/cocoa/ProtobagOSX/Podfile @@ -1,14 +1,14 @@ # Uncomment the next line to define a global platform for your project # platform :ios, '9.0' -source 'https://github.com/CocoaPods/Specs.git' +source 'https://github.com/CocoaPods/Specs.git' source 'git@github.com:StandardCyborg/SCCocoaPods.git' target 'ProtobagOSX' do pod 'ProtobagCocoa', :podspec => '../../ProtobagCocoa.podspec.json' pod 'ProtobagCocoaTest', :podspec => 'ProtobagCocoaTest.podspec.json' pod 'GTestCpp', :podspec => 'GTestCpp.podspec.json' - pod 'LibArchiveCocoa', :podspec => '/Users/pwais/Documents/LibArchiveCocoa/LibArchiveCocoa.podspec.json' + pod 'LibArchiveCocoa', '~> 3.4.2' # pod 'PyBind11C++', :podspec => 'PyBind11C++.podspec' # pod 'ProtobagPyNative', :podspec => 'ProtobagPyNative.podspec' end @@ -19,5 +19,5 @@ target 'protobag_native' do pod 'GTestCpp', :podspec => 'GTestCpp.podspec.json' pod 'PyBind11C++', :podspec => 'PyBind11C++.podspec' pod 'ProtobagPyNative', :podspec => 'ProtobagPyNative.podspec' - pod 'LibArchiveCocoa', :podspec => '/Users/pwais/Documents/LibArchiveCocoa/LibArchiveCocoa.podspec.json' + pod 'LibArchiveCocoa', ~> '3.4.2' end diff --git a/cocoa/ProtobagOSX/ProtobagOSX/main.mm 
b/cocoa/ProtobagOSX/ProtobagOSX/main.mm index b3d1d3e..70bfa88 100644 --- a/cocoa/ProtobagOSX/ProtobagOSX/main.mm +++ b/cocoa/ProtobagOSX/ProtobagOSX/main.mm @@ -12,8 +12,6 @@ #include int main(int argc, char * argv[]) { -// std::cout << "hi" << std::endl; -// return 0; testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/cocoa/README.md b/cocoa/README.md index 3b68cf5..6332989 100644 --- a/cocoa/README.md +++ b/cocoa/README.md @@ -1 +1,3 @@ -todo describe xcode project and stuff +This directory contains a prototype of building Protobag using XCode. If you're using a Mac and you can CMake installed, +you should be able to build Protobag using the existing CMake build system. If you need to integrate Protobag into +an iOS or OSX application, please see the root `ProtobagCocoa.podspec.json` file. diff --git a/docker/Dockerfile b/docker/Dockerfile index 536fac3..4f65259 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -6,12 +6,12 @@ FROM ubuntu:${UBUNTU_VERSION} as base # up properly when deleting things and the cache gets stale. 
ENV PYTHONDONTWRITEBYTECODE 1 -# python3 +# python3 and dev tools RUN apt-get update && \ apt-get install -y \ python3-dev \ python3-pip -RUN pip3 install pytest +RUN pip3 install pytest jupyter # build-essential-ish with clang; forcing to libc++ 8 RUN apt-get update && \ diff --git a/examples/c++-writer/MyMessages.proto b/examples/c++-writer/MyMessages.proto index 8442b7f..2682f73 100644 --- a/examples/c++-writer/MyMessages.proto +++ b/examples/c++-writer/MyMessages.proto @@ -5,6 +5,8 @@ package my_messages; message DinoHunter { string first_name = 1; int32 id = 2; + + // Misc attributes of this hunter map attribs = 3; enum DinoType { @@ -19,6 +21,7 @@ message DinoHunter { DinoType type = 2; } + // Dinos that this hunter has captured repeated Dino dinos = 4; } diff --git a/examples/notebook-demo/.gitignore b/examples/notebook-demo/.gitignore new file mode 100644 index 0000000..1327cac --- /dev/null +++ b/examples/notebook-demo/.gitignore @@ -0,0 +1,2 @@ +*.zip +.ipynb_checkpoints diff --git a/examples/notebook-demo/MyMessages_pb2.py b/examples/notebook-demo/MyMessages_pb2.py new file mode 100644 index 0000000..c0901c4 --- /dev/null +++ b/examples/notebook-demo/MyMessages_pb2.py @@ -0,0 +1,263 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: MyMessages.proto + +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + + + +DESCRIPTOR = _descriptor.FileDescriptor( + name='MyMessages.proto', + package='my_messages', + syntax='proto3', + serialized_options=None, + serialized_pb=b'\n\x10MyMessages.proto\x12\x0bmy_messages\"\xd7\x02\n\nDinoHunter\x12\x12\n\nfirst_name\x18\x01 \x01(\t\x12\n\n\x02id\x18\x02 \x01(\x05\x12\x35\n\x07\x61ttribs\x18\x03 \x03(\x0b\x32$.my_messages.DinoHunter.AttribsEntry\x12+\n\x05\x64inos\x18\x04 \x03(\x0b\x32\x1c.my_messages.DinoHunter.Dino\x1a.\n\x0c\x41ttribsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x44\n\x04\x44ino\x12\x0c\n\x04name\x18\x01 \x01(\t\x12.\n\x04type\x18\x02 \x01(\x0e\x32 .my_messages.DinoHunter.DinoType\"O\n\x08\x44inoType\x12\x07\n\x03IDK\x10\x00\x12\x10\n\x0cVEGGIESAURUS\x10\x01\x12\x10\n\x0cMEATIESAURUS\x10\x02\x12\x16\n\x12PEOPLEEATINGSAURUS\x10\x03\" \n\x08Position\x12\t\n\x01x\x18\x01 \x01(\x02\x12\t\n\x01y\x18\x02 \x01(\x02\x62\x06proto3' +) + + + +_DINOHUNTER_DINOTYPE = _descriptor.EnumDescriptor( + name='DinoType', + full_name='my_messages.DinoHunter.DinoType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='IDK', index=0, number=0, + serialized_options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='VEGGIESAURUS', index=1, number=1, + serialized_options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='MEATIESAURUS', index=2, number=2, + serialized_options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PEOPLEEATINGSAURUS', index=3, number=3, + serialized_options=None, + type=None), + ], + containing_type=None, + serialized_options=None, + serialized_start=298, + 
serialized_end=377, +) +_sym_db.RegisterEnumDescriptor(_DINOHUNTER_DINOTYPE) + + +_DINOHUNTER_ATTRIBSENTRY = _descriptor.Descriptor( + name='AttribsEntry', + full_name='my_messages.DinoHunter.AttribsEntry', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='key', full_name='my_messages.DinoHunter.AttribsEntry.key', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='value', full_name='my_messages.DinoHunter.AttribsEntry.value', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=b'8\001', + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=180, + serialized_end=226, +) + +_DINOHUNTER_DINO = _descriptor.Descriptor( + name='Dino', + full_name='my_messages.DinoHunter.Dino', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='my_messages.DinoHunter.Dino.name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='type', full_name='my_messages.DinoHunter.Dino.type', index=1, + number=2, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + 
is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=228, + serialized_end=296, +) + +_DINOHUNTER = _descriptor.Descriptor( + name='DinoHunter', + full_name='my_messages.DinoHunter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='first_name', full_name='my_messages.DinoHunter.first_name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='id', full_name='my_messages.DinoHunter.id', index=1, + number=2, type=5, cpp_type=1, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='attribs', full_name='my_messages.DinoHunter.attribs', index=2, + number=3, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='dinos', full_name='my_messages.DinoHunter.dinos', index=3, + number=4, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[_DINOHUNTER_ATTRIBSENTRY, _DINOHUNTER_DINO, ], + enum_types=[ + _DINOHUNTER_DINOTYPE, + ], + serialized_options=None, + 
is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=34, + serialized_end=377, +) + + +_POSITION = _descriptor.Descriptor( + name='Position', + full_name='my_messages.Position', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='x', full_name='my_messages.Position.x', index=0, + number=1, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + _descriptor.FieldDescriptor( + name='y', full_name='my_messages.Position.y', index=1, + number=2, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=379, + serialized_end=411, +) + +_DINOHUNTER_ATTRIBSENTRY.containing_type = _DINOHUNTER +_DINOHUNTER_DINO.fields_by_name['type'].enum_type = _DINOHUNTER_DINOTYPE +_DINOHUNTER_DINO.containing_type = _DINOHUNTER +_DINOHUNTER.fields_by_name['attribs'].message_type = _DINOHUNTER_ATTRIBSENTRY +_DINOHUNTER.fields_by_name['dinos'].message_type = _DINOHUNTER_DINO +_DINOHUNTER_DINOTYPE.containing_type = _DINOHUNTER +DESCRIPTOR.message_types_by_name['DinoHunter'] = _DINOHUNTER +DESCRIPTOR.message_types_by_name['Position'] = _POSITION +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +DinoHunter = _reflection.GeneratedProtocolMessageType('DinoHunter', (_message.Message,), { + + 'AttribsEntry' : _reflection.GeneratedProtocolMessageType('AttribsEntry', (_message.Message,), { + 'DESCRIPTOR' : _DINOHUNTER_ATTRIBSENTRY, + '__module__' : 'MyMessages_pb2' + # 
We hope this notebook shows how `protobag` may prove more useful than working with Protobuf + Zip archives directly.\n", + "\n", + "## Environment Set-up\n", + "\n", + "To run this notebook locally, try using the protobag dockerized environment from a clone of the protobag repo:\n", + "```shell\n", + "my-machine $ ./pb-dev --build-env\n", + "my-machine $ ./pb-dev --shell\n", + "in-docker % jupyter notebook --allow-root examples/notebook-demo\n", + "```\n", + "\n", + "**Google Colab** You can also [run this notebook in Google Colab](https://colab.sandbox.google.com/github/StandardCyborg/protobag/blob/master/examples/notebook-demo/protobag-demo-full.ipynb). In the Colab environment, you'll need to install `protobag` and some other dependencies. Running the cell below will take care of that for you. You might need to restart the runtime (Use the menu option: Runtime > Restart runtime ...) in order for Colab to recognize the new modules." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We're running in the dockerized environment! 
We can simply add protobag to the PYTHONPATH\n", + "running pytest\n", + "running egg_info\n", + "writing protobag.egg-info/PKG-INFO\n", + "writing dependency_links to protobag.egg-info/dependency_links.txt\n", + "writing requirements to protobag.egg-info/requires.txt\n", + "writing top-level names to protobag.egg-info/top_level.txt\n", + "reading manifest file 'protobag.egg-info/SOURCES.txt'\n", + "writing manifest file 'protobag.egg-info/SOURCES.txt'\n", + "running build_ext\n", + "cmake version 3.10.2\n", + "\n", + "CMake suite maintained and supported by Kitware (kitware.com/cmake).\n", + "-- The C compiler identification is Clang 8.0.0\n", + "-- The CXX compiler identification is Clang 8.0.0\n", + "-- Check for working C compiler: /usr/bin/cc\n", + "-- Check for working C compiler: /usr/bin/cc -- works\n", + "-- Detecting C compiler ABI info\n", + "-- Detecting C compiler ABI info - done\n", + "-- Detecting C compile features\n", + "-- Detecting C compile features - done\n", + "-- Check for working CXX compiler: /usr/bin/c++\n", + "-- Check for working CXX compiler: /usr/bin/c++ -- works\n", + "-- Detecting CXX compiler ABI info\n", + "-- Detecting CXX compiler ABI info - done\n", + "-- Detecting CXX compile features\n", + "-- Detecting CXX compile features - done\n", + "-- Found GTest: /opt/gtest/lib/libgtest.a \n", + "-- Found Threads: TRUE \n", + "-- Found Protobuf: /usr/local/lib/libprotobuf.a;-lpthread (found version \"3.11.3\") \n", + "-- Found LibArchive: /usr/local/lib/libarchive.so (found version \"3.4.2\") \n", + "-- Found PythonInterp: /usr/bin/python3 (found version \"3.6.9\") \n", + "-- Found PythonLibs: /usr/lib/x86_64-linux-gnu/libpython3.6m.so\n", + "-- Performing Test HAS_FLTO\n", + "-- Performing Test HAS_FLTO - Success\n", + "-- LTO enabled\n", + "-- Configuring done\n", + "-- Generating done\n", + "-- Build files have been written to: /opt/protobag/python/build/temp.linux-x86_64-3.6\n", + "\u001b[35m\u001b[1mScanning dependencies of target 
protobag_native\u001b[0m\n", + "[ 6%] \u001b[32mBuilding CXX object CMakeFiles/protobag_native.dir/protobag/protobag/ArchiveUtil.cpp.o\u001b[0m\n", + "[ 18%] \u001b[32mBuilding CXX object CMakeFiles/protobag_native.dir/protobag/protobag/BagIndexBuilder.cpp.o\u001b[0m\n", + "[ 31%] \u001b[32mBuilding CXX object CMakeFiles/protobag_native.dir/protobag/protobag/Entry.cpp.o\u001b[0m\n", + "[ 43%] \u001b[32mBuilding CXX object CMakeFiles/protobag_native.dir/protobag/protobag/ReadSession.cpp.o\u001b[0m\n", + "[ 43%] \u001b[32mBuilding CXX object CMakeFiles/protobag_native.dir/protobag/protobag/Utils/Tempfile.cpp.o\u001b[0m\n", + "[ 43%] \u001b[32mBuilding CXX object CMakeFiles/protobag_native.dir/protobag/protobag/Protobag.cpp.o\u001b[0m\n", + "[ 43%] \u001b[32mBuilding CXX object CMakeFiles/protobag_native.dir/protobag/protobag/Utils/PBUtils.cpp.o\u001b[0m\n", + "[ 50%] \u001b[32mBuilding CXX object CMakeFiles/protobag_native.dir/protobag/protobag/WriteSession.cpp.o\u001b[0m\n", + "[ 62%] \u001b[32mBuilding CXX object CMakeFiles/protobag_native.dir/protobag/protobag/Utils/TimeSync.cpp.o\u001b[0m\n", + "[ 68%] \u001b[32mBuilding CXX object CMakeFiles/protobag_native.dir/protobag/protobag/archive/Archive.cpp.o\u001b[0m\n", + "[ 68%] \u001b[32mBuilding CXX object CMakeFiles/protobag_native.dir/protobag/protobag/archive/DirectoryArchive.cpp.o\u001b[0m\n", + "[ 75%] \u001b[32mBuilding CXX object CMakeFiles/protobag_native.dir/protobag/protobag/archive/LibArchiveArchive.cpp.o\u001b[0m\n", + "[ 87%] \u001b[32mBuilding CXX object CMakeFiles/protobag_native.dir/protobag/protobag/archive/MemoryArchive.cpp.o\u001b[0m\n", + "[ 87%] \u001b[32mBuilding CXX object CMakeFiles/protobag_native.dir/protobag/protobag_msg/ProtobagMsg.pb.cc.o\u001b[0m\n", + "[ 93%] \u001b[32mBuilding CXX object CMakeFiles/protobag_native.dir/protobag_native/protobag_native.cpp.o\u001b[0m\n", + "[100%] \u001b[32m\u001b[1mLinking CXX shared library 
/opt/protobag/python/protobag/protobag_native.cpython-36m-x86_64-linux-gnu.so\u001b[0m\n", + "[100%] Built target protobag_native\n", + "\u001b[1m============================= test session starts ==============================\u001b[0m\n", + "platform linux -- Python 3.6.9, pytest-6.0.1, py-1.9.0, pluggy-0.13.1 -- /usr/bin/python3\n", + "cachedir: .pytest_cache\n", + "rootdir: /opt/protobag/python, configfile: setup.cfg\n", + "collected 18 items \u001b[0m\n", + "\n", + "protobag_test/test_protobag.py::test_type_url \u001b[32mPASSED\u001b[0m\u001b[32m [ 5%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_to_pb_timestamp \u001b[32mPASSED\u001b[0m\u001b[32m [ 11%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_to_sec_nanos \u001b[32mPASSED\u001b[0m\u001b[32m [ 16%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_to_topic_time \u001b[32mPASSED\u001b[0m\u001b[32m [ 22%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_build_fds_for_msg \u001b[32mPASSED\u001b[0m\u001b[32m [ 27%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_msg_entry_print \u001b[32mPASSED\u001b[0m\u001b[32m [ 33%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_raw_entry_print \u001b[32mPASSED\u001b[0m\u001b[32m [ 38%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_stamped_entry_print \u001b[32mPASSED\u001b[0m\u001b[32m [ 44%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_typed_bytes \u001b[32mPASSED\u001b[0m\u001b[32m [ 50%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_serdes_msg_from_typed_bytes_empty \u001b[32mPASSED\u001b[0m\u001b[32m [ 55%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_serdes_msg_from_typed_bytes_default_serdes \u001b[32mPASSED\u001b[0m\u001b[32m [ 61%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_serdes_msg_from_typed_bytes_user_registered \u001b[32mPASSED\u001b[0m\u001b[32m [ 66%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_serdes_msg_from_typed_bytes_dynamic_decode \u001b[32mPASSED\u001b[0m\u001b[32m [ 
72%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_write_read_msg \u001b[32mPASSED\u001b[0m\u001b[32m [ 77%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_write_read_stamped_msg \u001b[32mPASSED\u001b[0m\u001b[32m [ 83%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_write_read_raw \u001b[32mPASSED\u001b[0m\u001b[32m [ 88%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_entry_to_dict_row \u001b[32mPASSED\u001b[0m\u001b[32m [ 94%]\u001b[0m\n", + "protobag_test/test_protobag.py::test_dict_row_to_entry \u001b[32mPASSED\u001b[0m\u001b[32m [100%]\u001b[0m\n", + "\n", + "============================== slowest durations ===============================\n", + "0.14s call protobag_test/test_protobag.py::test_write_read_stamped_msg\n", + "0.08s call protobag_test/test_protobag.py::test_write_read_msg\n", + "0.01s call protobag_test/test_protobag.py::test_serdes_msg_from_typed_bytes_dynamic_decode\n", + "0.01s call protobag_test/test_protobag.py::test_build_fds_for_msg\n", + "\n", + "(50 durations < 0.005s hidden. Use -vv to show these durations.)\n", + "\u001b[32m============================== \u001b[32m\u001b[1m18 passed\u001b[0m\u001b[32m in 0.54s\u001b[0m\u001b[32m ==============================\u001b[0m\n", + "Protobag added to PYTHONPATH\n", + "Using protobag version 0.0.3 at /opt/protobag/python/protobag/__init__.py\n" + ] + } + ], + "source": [ + "import os\n", + "import sys\n", + "if os.path.exists('/opt/protobag'):\n", + " print(\"We're running in the dockerized environment! We can simply add protobag to the PYTHONPATH\")\n", + " if not os.path.exists('/opt/protobag/python/protobag/protobag_native.cpython-36m-x86_64-linux-gnu.so'):\n", + " # We need to build the `protobag_native` module. 
The easiest way is to:\n",
+    "        !cd /opt/protobag/python && python3 setup.py test\n",
+    "    sys.path.append('/opt/protobag/python/')\n",
+    "    print(\"Protobag added to PYTHONPATH\")\n",
+    "elif 'google.colab' in sys.modules:\n",
+    "    try:\n",
+    "        import protobag\n",
+    "    except:\n",
+    "        print(\"Installing protobag\")\n",
+    "        !pip3 install protobag-0.0.3-cp36-cp36m-linux_x86_64.whl\n",
+    "        print(\"Installing libarchive; Colab linux does not yet have the version we need\")\n",
+    "        !ls /usr/local/lib/libarchive.so.17 || (cd /tmp && \\\n",
+    "            wget https://github.com/libarchive/libarchive/archive/v3.4.2.tar.gz && \\\n",
+    "            tar xfz v3.4.2.tar.gz && \\\n",
+    "            (rm -rf /opt/libarchive || true) && \\\n",
+    "            mv libarchive-3.4.2 /opt/libarchive && \\\n",
+    "            cd /opt/libarchive && \\\n",
+    "            mkdir -p build && cd build && \\\n",
+    "            cmake .. && \\\n",
+    "            make -j `nproc` && \\\n",
+    "            make install)\n",
+    "        print(\"Installing libc++\")\n",
+    "        !apt-get install -y libc++-8-dev libc++abi-8-dev || echo \"ignoring 'libmkldnn.so.0 is not a symbolic link' issue\"\n",
+    "\n",
+    "import protobag\n",
+    "print(\"Using protobag version %s at %s\" % (protobag.__version__, protobag.__file__))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Our Protobuf Messages\n",
+    "\n",
+    "*Note: If you're not super familiar with Protobuf, you might want to check out [their address book data tutorial](https://developers.google.com/protocol-buffers/docs/pythontutorial)*.\n",
+    "\n",
+    "Suppose we've developed a cool game called Dino Hunters where people run around on a deserted island and try to capture wild dinosaurs. We're using Protobuf to persist data about the `DinoHunter` characters in our game. Furthermore, we want to use Protobuf to log the 2D `Position`s of our hunters as they run around and hunt dinos. 
Our Protobuf message schema is as follows:\n",
+    "\n",
+    "```protobuf\n",
+    "syntax = \"proto3\";\n",
+    "\n",
+    "package my_messages;\n",
+    "\n",
+    "message DinoHunter {\n",
+    "  string first_name = 1;\n",
+    "  int32 id = 2;\n",
+    "  \n",
+    "  // Misc attributes of this hunter\n",
+    "  map<string, string> attribs = 3;\n",
+    "\n",
+    "  enum DinoType {\n",
+    "    IDK = 0;\n",
+    "    VEGGIESAURUS = 1;\n",
+    "    MEATIESAURUS = 2;\n",
+    "    PEOPLEEATINGSAURUS = 3;\n",
+    "  }\n",
+    "\n",
+    "  message Dino {\n",
+    "    string name = 1;\n",
+    "    DinoType type = 2;\n",
+    "  }\n",
+    "\n",
+    "  // Dinos that this hunter has captured\n",
+    "  repeated Dino dinos = 4;\n",
+    "}\n",
+    "\n",
+    "message Position {\n",
+    "  float x = 1;\n",
+    "  float y = 2;\n",
+    "}\n",
+    "```\n",
+    "\n",
+    "We now need the `protoc`-generated Python code in order to use these messages. For convenience, we'll just download a copy from the `protobag` repository:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if not os.path.exists('MyMessages_pb2.py'):\n",
+    "    print(\"Downloading MyMessages_pb2.py from Github\")\n",
+    "    !wget https://raw.githubusercontent.com/StandardCyborg/protobag/master/examples/notebook-demo/MyMessages_pb2.py\n",
+    "\n",
+    "from MyMessages_pb2 import DinoHunter\n",
+    "from MyMessages_pb2 import Position\n",
+    "\n",
+    "## Note: you can prove to yourself that the downloaded file matches the schema above using the code below:\n",
+    "# import MyMessages_pb2\n",
+    "# from google.protobuf.descriptor_pb2 import FileDescriptorProto\n",
+    "# fd = FileDescriptorProto()\n",
+    "# MyMessages_pb2.DESCRIPTOR.CopyToProto(fd)\n",
+    "# print(fd)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "OK! 
Let's create some dino hunters:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "first_name: \"py_max\"\n", + "id: 1\n", + "dinos {\n", + " name: \"py_nibbles\"\n", + " type: PEOPLEEATINGSAURUS\n", + "}\n", + "\n", + "first_name: \"py_lara\"\n", + "id: 2\n", + "dinos {\n", + " name: \"py_bites\"\n", + " type: PEOPLEEATINGSAURUS\n", + "}\n", + "dinos {\n", + " name: \"py_stinky\"\n", + " type: VEGGIESAURUS\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "max_hunter = DinoHunter(\n", + " first_name='py_max',\n", + " id=1,\n", + " dinos=[\n", + " {'name': 'py_nibbles', 'type': DinoHunter.PEOPLEEATINGSAURUS},\n", + " ])\n", + "print(max_hunter)\n", + "\n", + "lara_hunter = DinoHunter(\n", + " first_name='py_lara',\n", + " id=2,\n", + " dinos=[\n", + " {'name': 'py_bites', 'type': DinoHunter.PEOPLEEATINGSAURUS},\n", + " {'name': 'py_stinky', 'type': DinoHunter.VEGGIESAURUS},\n", + " ])\n", + "\n", + "print(lara_hunter)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Writing and Reading Protobuf messages to a `protobag` archive\n", + "\n", + "### Plain Messages (`protobag.MessageEntry`)\n", + "\n", + "`protobag` archives are just zip archives that contain serialized Protobuf messages as files. (Tar and other formats are also supported via [libarchive](https://github.com/libarchive/libarchive)). 
So `protobag` offers a simple API for **writing** messages to an archive:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "bag = protobag.Protobag(path='example.zip')\n", + "writer = bag.create_writer()\n", + "writer.write_msg(\"hunters/py_max\", max_hunter)\n", + "writer.write_msg(\"hunters/py_lara\", lara_hunter)\n", + "writer.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can verify that the above just wrote a zip archive for you:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive: example.zip\r\n", + " Length Date Time Name\r\n", + "--------- ---------- ----- ----\r\n", + " 72 1980-01-01 00:00 hunters/py_max\r\n", + " 86 1980-01-01 00:00 hunters/py_lara\r\n", + " 8691 1980-01-01 00:00 /_protobag_index/bag_index/1597696075.0.stampedmsg.protobin\r\n", + "--------- -------\r\n", + " 8849 3 files\r\n" + ] + } + ], + "source": [ + "!which unzip > /dev/null || (apt-get update && apt-get install unzip)\n", + "!unzip -l example.zip" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Hmm, what's that `/_protobag_index/bag_index/xxxxx.stampedmsg.protobin` file?\n", + "\n", + "By default, `protobag` not only saves those messages but also **indexes Protobuf message descriptors** so that your `protobag` readers don't need your proto schemas to decode your messages. (You can also disable this indexing if you wish. 
For further discussion, see [the root README.md](https://github.com/StandardCyborg/protobag#protobag-indexes-protobuf-message-descriptors) ).\n", + "\n", + "\n", + "To **read** specific messages from a `protobag` archive, you can use this simple API:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MessageEntry:\n", + " entryname: hunters/py_max\n", + " type_url: type.googleapis.com/my_messages.DinoHunter\n", + " has serdes: True\n", + " has descriptor_data: False\n", + " msg:\n", + "first_name: \"py_max\"\n", + "id: 1\n", + "dinos {\n", + " name: \"py_nibbles\"\n", + " type: PEOPLEEATINGSAURUS\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "bag = protobag.Protobag(\n", + " path='example.zip',\n", + " \n", + " # Tell protobag to use our protoc-generated python code:\n", + " msg_classes=(DinoHunter, Position))\n", + "entry = bag.get_entry(\"hunters/py_max\")\n", + "print(entry)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Time-Series Data (`protobag.StampedEntry`)\n", + "`protobag` features a **topic-timestamp-message** API for recording time-series data. This API is modeled after [`rosbag`](http://wiki.ros.org/rosbag), [LCM log files](https://lcm-proj.github.io/log_file_format.html) (where topics are called \"channels\"), and your favorite message bus systems like [Kafka](https://kafka.apache.org/) or [AWS SQS](https://aws.amazon.com/sqs/). Each **topic** contains Protobuf messages of a single type, and each message has a nanosecond-precision timestamp (using the `google.protobuf.Timestamp` object, which has built-in conversion to other timestamp datastructures like Python's `datetime`). 
\n", + "\n", + "Protobag has special handling for these timestamped entries:\n", + " * For writing:\n", + " * Topics organized into archive \"folders\" and filenames are chosen automatically.\n", + " * Protobag indexes Protobuf Message Descriptors as described above.\n", + " * Protobag indexes the message timestamps for efficient time-ordered playback.\n", + " * For reading:\n", + " * Protobag offers a simple [Selection](https://github.com/StandardCyborg/protobag/blob/master/c%2B%2B/protobag/protobag_msg/ProtobagMsg.proto) API for reading specific sets of topics, time ranges, or even just individual events.\n", + " * Protobag offers a [TimeSync](https://github.com/StandardCyborg/protobag/blob/master/c%2B%2B/protobag/protobag/Utils/TimeSync.hpp) for synchronizing topics that have messages recorded at different rates. \n", + " \n", + "\n", + "Using our Dino Hunters example, we'll log (**write**) the 2D positions of a dino and a hunter during a chase scene:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive: example.zip\r\n", + " Length Date Time Name\r\n", + "--------- ---------- ----- ----\r\n", + " 100 1980-01-01 00:00 positions/lara/0.0.stampedmsg.protobin\r\n", + " 109 1980-01-01 00:00 positions/trex/0.1000000.stampedmsg.protobin\r\n", + " 107 1980-01-01 00:00 positions/lara/1.0.stampedmsg.protobin\r\n", + " 111 1980-01-01 00:00 positions/trex/1.1000000.stampedmsg.protobin\r\n", + " 107 1980-01-01 00:00 positions/lara/2.0.stampedmsg.protobin\r\n", + " 111 1980-01-01 00:00 positions/trex/2.1000000.stampedmsg.protobin\r\n", + " 107 1980-01-01 00:00 positions/lara/3.0.stampedmsg.protobin\r\n", + " 111 1980-01-01 00:00 positions/trex/3.1000000.stampedmsg.protobin\r\n", + " 107 1980-01-01 00:00 positions/lara/4.0.stampedmsg.protobin\r\n", + " 111 1980-01-01 00:00 positions/trex/4.1000000.stampedmsg.protobin\r\n", + " 10707 1980-01-01 00:00 
/_protobag_index/bag_index/1596831107.0.stampedmsg.protobin\r\n", + "--------- -------\r\n", + " 11788 11 files\r\n" + ] + } + ], + "source": [ + "bag = protobag.Protobag(path='example.zip')\n", + "writer = bag.create_writer()\n", + "for t in range(5):\n", + " lara_pos = Position(x=t, y=t+1)\n", + " writer.write_stamped_msg(\"positions/lara\", lara_pos, t_sec=t, t_nanos=0)\n", + "\n", + " trex_pos = Position(x=t+2, y=t+3)\n", + " writer.write_stamped_msg(\"positions/trex\", trex_pos, t_sec=t, t_nanos=int(1e6))\n", + " # Note: trex positions are slightly out-of-sync with those of lara by 1ms\n", + "writer.close()\n", + "!unzip -l example.zip" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now **read** them using the `protobag.SelectionBuilder` helper tool:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Read just the positions of trex:\n", + "Time: 1970-01-01 00:00:00.001000 Position: 2.0 3.0\n", + "Time: 1970-01-01 00:00:01.001000 Position: 3.0 4.0\n", + "Time: 1970-01-01 00:00:02.001000 Position: 4.0 5.0\n", + "Time: 1970-01-01 00:00:03.001000 Position: 5.0 6.0\n", + "Time: 1970-01-01 00:00:04.001000 Position: 6.0 7.0\n", + "\n", + "\n", + "Read *all* timeseries data:\n", + "Topic: positions/lara Time: 1970-01-01 00:00:00 Position: 0.0 1.0\n", + "Topic: positions/trex Time: 1970-01-01 00:00:00.001000 Position: 2.0 3.0\n", + "Topic: positions/lara Time: 1970-01-01 00:00:01 Position: 1.0 2.0\n", + "Topic: positions/trex Time: 1970-01-01 00:00:01.001000 Position: 3.0 4.0\n", + "Topic: positions/lara Time: 1970-01-01 00:00:02 Position: 2.0 3.0\n", + "Topic: positions/trex Time: 1970-01-01 00:00:02.001000 Position: 4.0 5.0\n", + "Topic: positions/lara Time: 1970-01-01 00:00:03 Position: 3.0 4.0\n", + "Topic: positions/trex Time: 1970-01-01 00:00:03.001000 Position: 5.0 6.0\n", + "Topic: positions/lara Time: 1970-01-01 
00:00:04 Position: 4.0 5.0\n", + "Topic: positions/trex Time: 1970-01-01 00:00:04.001000 Position: 6.0 7.0\n" + ] + } + ], + "source": [ + "bag = protobag.Protobag(\n", + " path='example.zip',\n", + " \n", + " # Tell protobag to use our protoc-generated python code:\n", + " msg_classes=(DinoHunter, Position))\n", + "\n", + "print(\"Read just the positions of trex:\")\n", + "sel_trex = protobag.SelectionBuilder.select_window(topics=[\"positions/trex\"])\n", + "for entry in bag.iter_entries(selection=sel_trex):\n", + " print(\"Time: %s Position: %s %s\" % (entry.timestamp.ToDatetime(), entry.msg.x, entry.msg.y))\n", + "print()\n", + "print()\n", + " \n", + "print(\"Read *all* timeseries data:\")\n", + "sel_all_time_series = protobag.SelectionBuilder.select_window_all()\n", + "for entry in bag.iter_entries(selection=sel_all_time_series):\n", + " print(\"Topic: %s Time: %s Position: %s %s\" % (entry.topic, entry.timestamp.ToDatetime(), entry.msg.x, entry.msg.y))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Synchronizing Time-Series Data\n", + "Notice in the example that the positions of `lara` and `trex` are recorded at *almost* the same time, but not exactly. But what if we wanted to synchronize that data?\n", + "\n", + "When handling time series data, it's often desirable to synchronize two or more topics (or series) of data and to process 'bundles' of messages with nearly (or exactly) the same timestamps. There are a variety of algorithms and techniques for computing such bundlings. Protobag includes a [MaxSlopTimeSync](https://github.com/StandardCyborg/protobag/blob/293ae8ee82510f3d207d22317997d5f432827e14/c%2B%2B/protobag/protobag/Utils/TimeSync.hpp#L55) utility that offers some robustness for a variety of uses:\n", + " * If some topics have 'missing' messages (e.g. the message was dropped during recording), then `MaxSlopTimeSync` will ignore those. 
The user provides as parameters a 'max_slop' (a time duration) and a 'max_queue_size' (a number of messages to buffer) that controls how to handle recording gaps. See the [MaxSlopTimeSync](https://github.com/StandardCyborg/protobag/blob/293ae8ee82510f3d207d22317997d5f432827e14/c%2B%2B/protobag/protobag/Utils/TimeSync.hpp#L55) docs for more detail.\n", + " * If topics have messages recorded at the same rates and at the same times (e.g. two sensors are configured to record with the exact same timestamps), then `MaxSlopTimeSync` will provide exactly-synchronized bundles.\n", + " * If topics have messages recorded at somewhat different times (e.g. two sensors both record at 5Hz, but one records 50ms before the other), then `MaxSlopTimeSync` will find the nearest matches.\n", + "\n", + "FMI consult the unit tests [for MaxSlopTimeSync](https://github.com/StandardCyborg/protobag/blob/293ae8ee82510f3d207d22317997d5f432827e14/c%2B%2B/protobag_test/protobag/Utils/TimeSyncTest.cpp#L71) and [for IterProducts](https://github.com/StandardCyborg/protobag/blob/293ae8ee82510f3d207d22317997d5f432827e14/c%2B%2B/protobag_test/protobag/Utils/IterProductsTest.cpp#L31).\n", + "\n", + "In the example below, we'll synchronize the data for `lara` and `trex` using the `MaxSlopTimeSync` algorithm in order to compute their relative distances at each timestep:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bundle: 1970-01-01 00:00:00.001000 | Distance trex <-> lara: 2.8284271247461903 | Time b/t entries: 0.001 sec\n", + "Bundle: 1970-01-01 00:00:01.001000 | Distance trex <-> lara: 2.8284271247461903 | Time b/t entries: 0.001 sec\n", + "Bundle: 1970-01-01 00:00:02.001000 | Distance trex <-> lara: 2.8284271247461903 | Time b/t entries: 0.001 sec\n", + "Bundle: 1970-01-01 00:00:03.001000 | Distance trex <-> lara: 2.8284271247461903 | Time b/t entries: 0.001 sec\n", + "Bundle: 1970-01-01 
00:00:04.001000 | Distance trex <-> lara: 2.8284271247461903 | Time b/t entries: 0.001 sec\n" + ] + } + ], + "source": [ + "bag = protobag.Protobag(path='example.zip', msg_classes=(Position,))\n", + "\n", + "topics = [\"positions/trex\", \"positions/lara\"]\n", + "\n", + "# Select the topics we want\n", + "sel = protobag.SelectionBuilder.select_window(topics=topics)\n", + "\n", + "# Configure synchronization\n", + "from protobag.protobag_native import MaxSlopTimeSyncSpec\n", + "spec = MaxSlopTimeSyncSpec()\n", + "spec.topics = topics\n", + "spec.set_max_slop(seconds=2, nanos=0) # Both topics are about 1Hz; we'll be sloppy and provide a 2-second window\n", + "\n", + "for bundle in bag.iter_entries(selection=sel, sync_using_max_slop=spec):\n", + " topic_to_entry = dict((entry.topic, entry) for entry in bundle)\n", + " lara_entry = topic_to_entry[\"positions/lara\"]\n", + " trex_entry = topic_to_entry[\"positions/trex\"]\n", + " \n", + " delta_t = trex_entry.timestamp.ToDatetime() - lara_entry.timestamp.ToDatetime()\n", + " \n", + " import numpy as np\n", + " lara_pos = np.array([lara_entry.msg.x, lara_entry.msg.y])\n", + " trex_pos = np.array([trex_entry.msg.x, trex_entry.msg.y])\n", + " distance = np.linalg.norm(lara_pos - trex_pos)\n", + " print(\"Bundle: %s | Distance trex <-> lara: %s | Time b/t entries: %s sec\" % (\n", + " trex_entry.timestamp.ToDatetime(), distance, delta_t.total_seconds()))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Raw Data (`protobag.RawEntry`)\n", + "We can also just put raw data like text files or images into our protobag archive because it's just a zip file. You might need this API if you have consumers of your data who don't have Protobuf or `protobag` available. 
Note that the C++ `protobag` API even includes an [`ArchiveUtil.hpp` module](https://github.com/StandardCyborg/protobag/blob/master/c++/protobag/protobag/ArchiveUtil.hpp) that has helper functions for common archive operations like zipping a directory, unarchiving a tar file, etc. (In python, one would probably use the excellent `zipfile` or `tarfile` libraries, but those don't exist in C++). The raw write API makes `protobag` skip all indexing and type-tracking activity.\n", + "\n", + "To **write** data, just use the `write_raw` API:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive: example.zip\n", + " Length Date Time Name\n", + "--------- ---------- ----- ----\n", + " 19 1980-01-01 00:00 raw_data\n", + " 129 1980-01-01 00:00 /_protobag_index/bag_index/1596831108.0.stampedmsg.protobin\n", + "--------- -------\n", + " 148 2 files\n", + "Reading the raw data using unzip:\n", + "\u0012\u0011i am a raw string" + ] + } + ], + "source": [ + "bag = protobag.Protobag(path='example.zip')\n", + "writer = bag.create_writer()\n", + "\n", + "s = b\"i am a raw string\"\n", + "writer.write_raw(\"raw_data\", s)\n", + "\n", + "writer.close()\n", + "\n", + "!unzip -l example.zip\n", + "!echo \"Reading the raw data using unzip:\"\n", + "!unzip -p example.zip raw_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To **read** raw data, you can use the same API we used for regular messages (but `protobag` will do no deserialization):" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RawEntry:\n", + " entryname: raw_data\n", + " raw_bytes: i am a raw string ... 
(17 bytes)\n" + ] + } + ], + "source": [ + "bag = protobag.Protobag(path='example.zip')\n", + "entry = bag.get_entry(\"raw_data\")\n", + "print(entry)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Converting `protobag` archives to DataFrames\n", + "\n", + "For various data mining tasks, dealing with files and archives can be cumbersome. Suppose we wanted to:\n", + " * Load our `protobag` data into a `DataFrame` (a database table)\n", + " * Compute some quick statistics of our data using python code\n", + " * Query our `protobag` contents using SQL\n", + "\n", + "In each of these above cases, we might think of each entry of our `protobag` archive as a row in a table. (You might also skip some entries, or want a single entry to map to several rows). Protobag provides a `DictRowEntry` utility to help aid this usage.\n", + "\n", + "Protobuf 3.x introduced mature `json` support, and in python, `google.protobuf.json_format` also provides a means to convert Protobuf messages to and from *python dicts*. `DictRowEntry` leverages this feature to help you translate between `protobag` archive entries and table rows using whatever database or DataFrame tool you have at hand.\n", + "\n", + "\n", + "### From `protobag` to `pandas` and back\n", + "In this section, we'll show how to use `DictRowEntry` to create a `pandas.DataFrame` from our `protobag` archive. 
We can also convert from `DataFrame` to `protobag` if desired.\n", + "\n", + "First we need to have `pandas` available, as well as an example `protobag` archive:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pandas in /usr/local/lib/python3.6/dist-packages\r\n", + "Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas)\r\n", + "Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas)\r\n", + "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas)\r\n", + "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.6.1->pandas)\r\n" + ] + } + ], + "source": [ + "!pip3 install pandas\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive: pandas_example.zip\r\n", + " Length Date Time Name\r\n", + "--------- ---------- ----- ----\r\n", + " 72 1980-01-01 00:00 hunters/py_max\r\n", + " 86 1980-01-01 00:00 hunters/py_lara\r\n", + " 100 1980-01-01 00:00 positions/lara/0.0.stampedmsg.protobin\r\n", + " 105 1980-01-01 00:00 positions/trex/0.0.stampedmsg.protobin\r\n", + " 107 1980-01-01 00:00 positions/lara/1.0.stampedmsg.protobin\r\n", + " 107 1980-01-01 00:00 positions/trex/1.0.stampedmsg.protobin\r\n", + " 107 1980-01-01 00:00 positions/lara/2.0.stampedmsg.protobin\r\n", + " 107 1980-01-01 00:00 positions/trex/2.0.stampedmsg.protobin\r\n", + " 107 1980-01-01 00:00 positions/lara/3.0.stampedmsg.protobin\r\n", + " 107 1980-01-01 00:00 positions/trex/3.0.stampedmsg.protobin\r\n", + " 107 1980-01-01 00:00 positions/lara/4.0.stampedmsg.protobin\r\n", + " 107 1980-01-01 00:00 
positions/trex/4.0.stampedmsg.protobin\r\n", + " 19 1980-01-01 00:00 raw_data\r\n", + " 10675 1980-01-01 00:00 /_protobag_index/bag_index/1596831111.0.stampedmsg.protobin\r\n", + "--------- -------\r\n", + " 11913 14 files\r\n" + ] + } + ], + "source": [ + "bag = protobag.Protobag(path='pandas_example.zip')\n", + "writer = bag.create_writer()\n", + "\n", + "# Include some protobuf messages in the example\n", + "writer.write_msg(\"hunters/py_max\", max_hunter)\n", + "writer.write_msg(\"hunters/py_lara\", lara_hunter)\n", + "\n", + "# Include some time series data in the example\n", + "for t in range(5):\n", + " writer.write_stamped_msg(\"positions/lara\", Position(x=t, y=t+1), t_sec=t)\n", + " writer.write_stamped_msg(\"positions/trex\", Position(x=t+2, y=t+3), t_sec=t)\n", + "\n", + "# Include some raw data\n", + "s = b\"i am a raw string\"\n", + "writer.write_raw(\"raw_data\", s)\n", + "\n", + "writer.close()\n", + "\n", + "!unzip -l pandas_example.zip" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we'll read back that `protobag` data and load it into a `pandas.DataFrame`. 
First, let's take a close look at what `DictRowEntry` actually provides for a single entry:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "protobag.DictRowEntry:\n", + " entryname: hunters/py_lara\n", + " (not a time-series entry)\n", + " type_url: type.googleapis.com/my_messages.DinoHunter\n", + " has serdes: True\n", + " descriptor_data: (protobuf message) google.protobuf.FileDescriptorSet file {\n", + " name: \"MyMe (422 bytes)\n", + " msg_dict:\n", + " {'dinos': [{'name': 'py_bites', 'type': 'PEOPLEEATINGSAURUS'},\n", + " {'name': 'py_stinky', 'type': 'VEGGIESAURUS'}],\n", + " 'firstName': 'py_lara',\n", + " 'id': 2}\n" + ] + } + ], + "source": [ + "bag = protobag.Protobag(path='pandas_example.zip', msg_classes=(DinoHunter, Position))\n", + "\n", + "entry = bag.get_entry(\"hunters/py_lara\")\n", + "row = protobag.DictRowEntry.from_entry(entry)\n", + "print(row)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's create the `DataFrame`. For simplicity, we just create one `DataFrame` table row per archive file:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Read 13 rows\n", + "\n", + "RangeIndex: 13 entries, 0 to 12\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 entryname 13 non-null object \n", + " 1 type_url 13 non-null object \n", + " 2 msg_dict 13 non-null object \n", + " 3 topic 13 non-null object \n", + " 4 timestamp 10 non-null datetime64[ns]\n", + " 5 descriptor_data 12 non-null object \n", + "dtypes: datetime64[ns](1), object(5)\n", + "memory usage: 752.0+ bytes\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
entrynametype_urlmsg_dicttopictimestampdescriptor_data
0hunters/py_maxtype.googleapis.com/my_messages.DinoHunter{'firstName': 'py_max', 'id': 1, 'dinos': [{'n...NaTb'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m...
1hunters/py_laratype.googleapis.com/my_messages.DinoHunter{'firstName': 'py_lara', 'id': 2, 'dinos': [{'...NaTb'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m...
2positions/lara/0.0.stampedmsg.protobintype.googleapis.com/my_messages.Position{'y': 1.0}positions/lara1970-01-01 00:00:00b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m...
3positions/trex/0.0.stampedmsg.protobintype.googleapis.com/my_messages.Position{'x': 2.0, 'y': 3.0}positions/trex1970-01-01 00:00:00b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m...
4positions/lara/1.0.stampedmsg.protobintype.googleapis.com/my_messages.Position{'x': 1.0, 'y': 2.0}positions/lara1970-01-01 00:00:01b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m...
5positions/trex/1.0.stampedmsg.protobintype.googleapis.com/my_messages.Position{'x': 3.0, 'y': 4.0}positions/trex1970-01-01 00:00:01b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m...
6positions/lara/2.0.stampedmsg.protobintype.googleapis.com/my_messages.Position{'x': 2.0, 'y': 3.0}positions/lara1970-01-01 00:00:02b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m...
7positions/trex/2.0.stampedmsg.protobintype.googleapis.com/my_messages.Position{'x': 4.0, 'y': 5.0}positions/trex1970-01-01 00:00:02b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m...
8positions/lara/3.0.stampedmsg.protobintype.googleapis.com/my_messages.Position{'x': 3.0, 'y': 4.0}positions/lara1970-01-01 00:00:03b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m...
9positions/trex/3.0.stampedmsg.protobintype.googleapis.com/my_messages.Position{'x': 5.0, 'y': 6.0}positions/trex1970-01-01 00:00:03b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m...
10positions/lara/4.0.stampedmsg.protobintype.googleapis.com/my_messages.Position{'x': 4.0, 'y': 5.0}positions/lara1970-01-01 00:00:04b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m...
11positions/trex/4.0.stampedmsg.protobintype.googleapis.com/my_messages.Position{'x': 6.0, 'y': 7.0}positions/trex1970-01-01 00:00:04b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m...
12raw_data{'protobag_raw_entry_bytes': b'i am a raw stri...NaTNone
\n", + "
" + ], + "text/plain": [ + " entryname \\\n", + "0 hunters/py_max \n", + "1 hunters/py_lara \n", + "2 positions/lara/0.0.stampedmsg.protobin \n", + "3 positions/trex/0.0.stampedmsg.protobin \n", + "4 positions/lara/1.0.stampedmsg.protobin \n", + "5 positions/trex/1.0.stampedmsg.protobin \n", + "6 positions/lara/2.0.stampedmsg.protobin \n", + "7 positions/trex/2.0.stampedmsg.protobin \n", + "8 positions/lara/3.0.stampedmsg.protobin \n", + "9 positions/trex/3.0.stampedmsg.protobin \n", + "10 positions/lara/4.0.stampedmsg.protobin \n", + "11 positions/trex/4.0.stampedmsg.protobin \n", + "12 raw_data \n", + "\n", + " type_url \\\n", + "0 type.googleapis.com/my_messages.DinoHunter \n", + "1 type.googleapis.com/my_messages.DinoHunter \n", + "2 type.googleapis.com/my_messages.Position \n", + "3 type.googleapis.com/my_messages.Position \n", + "4 type.googleapis.com/my_messages.Position \n", + "5 type.googleapis.com/my_messages.Position \n", + "6 type.googleapis.com/my_messages.Position \n", + "7 type.googleapis.com/my_messages.Position \n", + "8 type.googleapis.com/my_messages.Position \n", + "9 type.googleapis.com/my_messages.Position \n", + "10 type.googleapis.com/my_messages.Position \n", + "11 type.googleapis.com/my_messages.Position \n", + "12 \n", + "\n", + " msg_dict topic \\\n", + "0 {'firstName': 'py_max', 'id': 1, 'dinos': [{'n... \n", + "1 {'firstName': 'py_lara', 'id': 2, 'dinos': [{'... \n", + "2 {'y': 1.0} positions/lara \n", + "3 {'x': 2.0, 'y': 3.0} positions/trex \n", + "4 {'x': 1.0, 'y': 2.0} positions/lara \n", + "5 {'x': 3.0, 'y': 4.0} positions/trex \n", + "6 {'x': 2.0, 'y': 3.0} positions/lara \n", + "7 {'x': 4.0, 'y': 5.0} positions/trex \n", + "8 {'x': 3.0, 'y': 4.0} positions/lara \n", + "9 {'x': 5.0, 'y': 6.0} positions/trex \n", + "10 {'x': 4.0, 'y': 5.0} positions/lara \n", + "11 {'x': 6.0, 'y': 7.0} positions/trex \n", + "12 {'protobag_raw_entry_bytes': b'i am a raw stri... 
\n", + "\n", + " timestamp descriptor_data \n", + "0 NaT b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m... \n", + "1 NaT b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m... \n", + "2 1970-01-01 00:00:00 b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m... \n", + "3 1970-01-01 00:00:00 b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m... \n", + "4 1970-01-01 00:00:01 b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m... \n", + "5 1970-01-01 00:00:01 b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m... \n", + "6 1970-01-01 00:00:02 b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m... \n", + "7 1970-01-01 00:00:02 b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m... \n", + "8 1970-01-01 00:00:03 b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m... \n", + "9 1970-01-01 00:00:03 b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m... \n", + "10 1970-01-01 00:00:04 b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m... \n", + "11 1970-01-01 00:00:04 b'\\n\\xa3\\x03\\n\\x10MyMessages.proto\\x12\\x0bmy_m... \n", + "12 NaT None " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Collect \"rows\" here, which will be just python dicts. 
Pandas is happy to convert dicts to rows.\n", + "rows = []\n", + "for entry in bag.iter_entries():\n", + " # ignore the index\n", + " if '_protobag_index' in entry.entryname:\n", + " continue\n", + "\n", + " rows.append(protobag.DictRowEntry.from_entry(entry))\n", + "\n", + "print(\"Read %s rows\" % len(rows))\n", + "\n", + "df = pd.DataFrame([\n", + "# Convert to pyarrow-friendly types\n", + " dict(\n", + " entryname=row.entryname,\n", + " type_url=row.type_url,\n", + " msg_dict=row.msg_dict,\n", + " topic=row.topic,\n", + " timestamp=\n", + " row.timestamp.ToDatetime() if row.timestamp else None,\n", + " descriptor_data=\n", + " row.descriptor_data.SerializeToString() if row.descriptor_data else None,\n", + " )\n", + " for row in rows\n", + "])\n", + "df.info()\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now suppose we want to convert back to `protobag` format. We need a few key things from our table:\n", + " * A `msg_dict` message dictionary that we want to convert back to a Protobuf message.\n", + " * Some `descriptor_data`, which Protobuf's underlying `json_format.ParseDict()` needs in order to convert `msg_dict` into a protobuf message.\n", + " * Some other context like `entryname` or a `topic`-`timestamp` that defines where to put the message in the `protobag` archive.\n", + " \n", + "Note that `msg_dict` and `descriptor_data` are only required if you want a lossless / bijective transform between your Protobuf messages and table row data. 
You may very well have a more complex mapping between your Protobuf and table data; `protobag` simply provides tooling to support as much of a basic bijective mapping as possible.\n", + " \n", + "Our table above has all the required data, so conversion is simple:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive: pandas_to_protobag_example.zip\r\n", + " Length Date Time Name\r\n", + "--------- ---------- ----- ----\r\n", + " 72 1980-01-01 00:00 hunters/py_max\r\n", + " 86 1980-01-01 00:00 hunters/py_lara\r\n", + " 49 1980-01-01 00:00 positions/lara/0.0.stampedmsg.protobin\r\n", + " 54 1980-01-01 00:00 positions/trex/0.0.stampedmsg.protobin\r\n", + " 54 1980-01-01 00:00 positions/lara/1.0.stampedmsg.protobin\r\n", + " 54 1980-01-01 00:00 positions/trex/1.0.stampedmsg.protobin\r\n", + " 54 1980-01-01 00:00 positions/lara/2.0.stampedmsg.protobin\r\n", + " 54 1980-01-01 00:00 positions/trex/2.0.stampedmsg.protobin\r\n", + " 54 1980-01-01 00:00 positions/lara/3.0.stampedmsg.protobin\r\n", + " 54 1980-01-01 00:00 positions/trex/3.0.stampedmsg.protobin\r\n", + " 54 1980-01-01 00:00 positions/lara/4.0.stampedmsg.protobin\r\n", + " 54 1980-01-01 00:00 positions/trex/4.0.stampedmsg.protobin\r\n", + " 19 1980-01-01 00:00 raw_data\r\n", + " 9141 1980-01-01 00:00 /_protobag_index/bag_index/1596831111.0.stampedmsg.protobin\r\n", + "--------- -------\r\n", + " 9853 14 files\r\n" + ] + } + ], + "source": [ + "# Get row dicts from pandas\n", + "rows = df.to_dict(orient='records')\n", + "\n", + "# We'll write to this archive:\n", + "bag = protobag.Protobag(path='pandas_to_protobag_example.zip')\n", + "writer = bag.create_writer()\n", + "\n", + "# Loop through rows, convering them back to `protobag` Entries using `DictRowEntry`\n", + "for row in rows:\n", + " dict_row_entry = protobag.DictRowEntry(**row)\n", + " entry = dict_row_entry.to_entry()\n", + " 
writer.write_entry(entry)\n", + "writer.close()\n", + "\n", + "!unzip -l pandas_to_protobag_example.zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/protobag-to-parquet/my_parquet_writer.py b/examples/protobag-to-parquet/my_parquet_writer.py new file mode 100644 index 0000000..2fd2aed --- /dev/null +++ b/examples/protobag-to-parquet/my_parquet_writer.py @@ -0,0 +1,108 @@ +import protobag + +from MyMessages_pb2 import DinoHunter +from MyMessages_pb2 import Position + +if __name__ == '__main__': + bag = protobag.Protobag(path='example_bag.zip') + writer = bag.create_writer() + + max_hunter = DinoHunter( + first_name='py_max', + id=1, + dinos=[ + {'name': 'py_nibbles', 'type': DinoHunter.PEOPLEEATINGSAURUS}, + ]) + writer.write_msg("hunters/py_max", max_hunter) + + lara_hunter = DinoHunter( + first_name='py_lara', + id=2, + dinos=[ + {'name': 'py_bites', 'type': DinoHunter.PEOPLEEATINGSAURUS}, + {'name': 'py_stinky', 'type': DinoHunter.VEGGIESAURUS}, + ]) + writer.write_msg("hunters/py_lara", lara_hunter) + + # A Chase! 
+ for t in range(10): + lara_pos = Position(x=t, y=t+1) + writer.write_stamped_msg("positions/lara", lara_pos, t_sec=t) + + toofz_pos = Position(x=t+2, y=t+3) + writer.write_stamped_msg("positions/toofz", toofz_pos, t_sec=t) + + + # Use Raw API + s = b"i am a raw string" + writer.write_raw("raw_data", s) + + writer.close() + print("Wrote to %s" % bag.path) + + + + + + path = 'example_bag.zip' + print("Using protobag library %s" % protobag.__file__) + print("Reading bag %s" % path) + + bag = protobag.Protobag( + path=path, + msg_classes=( + DinoHunter, + Position)) + rows = [] + for entry in bag.iter_entries(): + # ignore the index + if '_protobag_index' in entry.entryname: + continue + + print("Entry:") + print(entry) + print() + print() + + row = protobag.DictRowEntry.from_entry(entry) + rows.append(row) + + + + import pandas as pd + import attr + df = pd.DataFrame([ + # Convert to pyarrow-friendly types + dict( + entryname=row.entryname, + type_url=row.type_url, + msg_dict=row.msg_dict, + topic=row.topic, + timestamp= + row.timestamp.ToDatetime() if row.timestamp else None, + descriptor_data= + row.descriptor_data.SerializeToString() if row.descriptor_data else None, + ) + for row in rows + ]) + print(df) + print(df.info()) + print() + + import pyarrow as pa + import pyarrow.parquet as pq + table = pa.Table.from_pandas(df) + pq.write_table(table, 'example.parquet') + + + + + # Nope they don't have read support yet + # table2 = pq.read_table('example.parquet') + # df2 = table2.to_pandas() + # print(df2) + # print(df2.info()) + + parquet_file = pq.ParquetFile('example.parquet') + print(parquet_file.metadata) + print(parquet_file.schema) \ No newline at end of file diff --git a/pb-dev b/pb-dev index 32cd167..d15c7c0 100755 --- a/pb-dev +++ b/pb-dev @@ -3,12 +3,24 @@ """ pb-dev: A CLI for Protobag development -TODO examples + +## Quickstart + +First, you need to build the dockerized environment if you don't already have +the image locally: + $ ./pb-dev --build-env + 
+Then drop into a development shell: + $ ./pb-dev --shell + ## Development When creating a new version: - 1. Edit protobag_version.txt + 1. Edit `protobag_version.txt` + and `__version__` in `python/protobag/__init__.py` + 2. Build a new environment using: `./pb-dev --build-env` + """ import os @@ -34,7 +46,7 @@ https://github.com/NVIDIA/nvidia-docker#ubuntu-16041804-debian-jessiestretchbust """ INSTALL_DOCKER_MAC = """ -To install Docker on a Mac, we advocate you install docker-machine (which uses +To install Docker on a Mac, we advocate you install docker-machine (which uses VirtualBox-based virtual machines) versus Docker Desktop for Mac (which uses the xhyve hypervisor) because: * The VirtualBox-based environment is more similar to Docker on Linux @@ -54,7 +66,7 @@ To install docker-machine: ``` $ eval `docker-machine env` # Tells the docker client how to reach your # VirtualBox; run once per shell session. - + $ docker run hello-world ... Hello from Docker! @@ -143,7 +155,7 @@ def raise_if_no_docker(): $ docker-machine start default Error: %s """ % (e,)) - + ############################################################################### @@ -165,14 +177,14 @@ def create_arg_parser(): parser.add_argument( '--build-env', default=False, action='store_true', help='Build the Dockerized dev env') - + parser.add_argument( '--shell', default=False, action='store_true', help='Drop into a dockerized dev environment') parser.add_argument( '--shell-rm', default=False, action='store_true', help='Delete the dockerized dev environment') - + parser.add_argument( '--test', default=False, action='store_true', help='Build and run all tests') @@ -184,10 +196,6 @@ def create_arg_parser(): '--run-protoc', default=False, action='store_true', help='Re-generate protoc-generated files, updating the source tree ' 'in-place') - parser.add_argument( - '--post-release', default=False, action='store_true', - help='Run the post-release workflow; run this on master after merging ' - 'your 
feature branch') return parser @@ -196,7 +204,7 @@ def build_env(args): raise_if_no_docker() version = get_version(args.root) - + CMD = """ docker build -t {image}:{version} -f {dockerfile} {docker_root} """.format( @@ -204,18 +212,7 @@ def build_env(args): dockerfile=os.path.join(args.root, 'docker/Dockerfile'), docker_root=args.root, version=version) - - run_cmd(CMD) - -def post_release(args): - version = get_version(args.root) - - # Tag for cocoapods release - print('todo: podspec version needs to match version') - CMD = """ - git tag -f {version} && git push -f origin {version} - """.format(version=version) run_cmd(CMD) @@ -223,7 +220,7 @@ def start_container( src_root, container_name=CONTAINER_NAME, mnt_local_root=True): - + local_mount = '' if mnt_local_root: local_mount = '-v %s:/opt/protobag:z' % src_root @@ -256,8 +253,8 @@ def run_tests(src_root): log.info("Running C++ tests ...") CMD = """ cd {src_root} && - mkdir -p c++/build && - cd c++/build && + mkdir -p c++/test_build && + cd c++/test_build && cmake -DCMAKE_BUILD_TYPE=DEBUG .. && make -j {n_proc} && ./protobag_test @@ -268,6 +265,30 @@ def run_tests(src_root): log.info("... done with C++ tests.") + log.info("Running python tests ...") + CMD = """ + cd {src_root}/python && + python3 setup.py test + """.format( + src_root=src_root) + run_cmd(CMD) + log.info("... done with python tests.") + + + log.info("Running Jupyter Notebook smoke tests ...") + notebook_path = os.path.join( + src_root, 'examples/notebook-demo/protobag-demo-full.ipynb') + CMD = """ + jupyter-nbconvert \ + --ExecutePreprocessor.timeout=3600 \ + --to notebook --execute --output /tmp/out \ + %s + """ % notebook_path + # For helpful discussion, see http://www.blog.pythonlibrary.org/2018/10/16/testing-jupyter-notebooks/ + run_cmd(CMD) + log.info("... 
done with Jupyter Notebook tests.") + + def run_in_container(cmd, src_root, container_name='pbdev-temp'): start_container(src_root, container_name=container_name) EXEC_CMD = """ @@ -284,8 +305,6 @@ def main(args=None): if args.build_env: build_env(args) - elif args.post_release: - post_release(args) elif args.shell: start_container(args.root) enter_container() @@ -301,7 +320,8 @@ def main(args=None): cd /opt/protobag/examples/c++-writer && protoc MyMessages.proto --cpp_out=. && protoc MyMessages.proto --python_out=../../examples/python-reader && - protoc MyMessages.proto --python_out=../../examples/python-writer + protoc MyMessages.proto --python_out=../../examples/python-writer && + protoc MyMessages.proto --python_out=../../examples/notebook-demo """ run_in_container( CMD, diff --git a/protobag_version.txt b/protobag_version.txt index 4e379d2..bcab45a 100644 --- a/protobag_version.txt +++ b/protobag_version.txt @@ -1 +1 @@ -0.0.2 +0.0.3 diff --git a/python/protobag/__init__.py b/python/protobag/__init__.py index 679bc49..16cf74a 100644 --- a/python/protobag/__init__.py +++ b/python/protobag/__init__.py @@ -1 +1,3 @@ -from protobag.protobag import Protobag \ No newline at end of file +from protobag.protobag import * + +__version__ = '0.0.3' diff --git a/python/protobag/protobag.py b/python/protobag/protobag.py index 98c0124..cb3af73 100644 --- a/python/protobag/protobag.py +++ b/python/protobag/protobag.py @@ -1,6 +1,10 @@ import copy import datetime +import attr +import six + +from google.protobuf import json_format from google.protobuf.timestamp_pb2 import Timestamp from protobag.ProtobagMsg_pb2 import BagIndex @@ -10,18 +14,336 @@ from protobag.ProtobagMsg_pb2 import TopicTime + +## ============================================================================ +## == Utils =================================================================== +## ============================================================================ + +def get_type_url(pb_msg): + # See also 
`protobag::GetTypeURL()` + return 'type.googleapis.com/' + pb_msg.DESCRIPTOR.full_name + + +def to_pb_timestamp(v): + """Try to convert value `v` to a Protobuf `Timestamp` instance.""" + if isinstance(v, Timestamp): + return v + elif isinstance(v, (list, tuple)) and len(v) == 2: + return Timestamp(seconds=v[0], nanos=v[1]) + elif isinstance(v, datetime.datetime): + ts = Timestamp() + ts.FromDatetime(v) + return ts + elif isinstance(v, int): + ts = Timestamp() + ts.FromSeconds(v) + return ts + elif isinstance(v, float): + sec = int(v) + nsec = int((v - sec) * 1e9) + ts = Timestamp(seconds=sec, nanos=nsec) + return ts + else: + raise ValueError( + "Don't know what to do with timestamp %s" % (v,)) + + +def to_sec_nanos(v): + """Try to convert value `v` to a (seconds, nanoseconds) tuple""" + if isinstance(v, (tuple, list)) and len(v) == 2: + return v + elif isinstance(v, Timestamp): + return (v.seconds, v.nanos) + elif isinstance(v, datetime.datetime): + import calendar + return ( + calendar.timegm(v.utctimetuple()), # seconds + v.microsecond * 1000) # nanos + elif isinstance(v, int): + return (v, 0) + elif isinstance(v, float): + sec = int(v) + nsec = int((v - sec) * 1e9) + return (sec, nsec) + else: + raise ValueError( + "Don't know what to do with value %s" % (v,)) + + +def to_topic_time(v): + """Try to convert value `v` to a TopicTime instance.""" + if isinstance(v, TopicTime): + return v + elif isinstance(v, dict): + tt = TopicTime( + topic=v['topic'], + timestamp=to_pb_timestamp(v['timestamp'])) + if 'entryname' in v: + tt.entryname = v['entryname'] + return tt + elif isinstance(v, (tuple, list)): + entryname = None + if len(v) == 2: + topic, ts = v + elif len(v) == 3: + topic, ts, entryname = v + else: + raise ValueError("Can't unpack to TopicTime: %s" % (v,)) + tt = TopicTime( + topic=topic, + timestamp=to_pb_timestamp(ts)) + if entryname is not None: + tt.entryname = entryname + return tt + else: + raise ValueError( + "Don't know what to do with value %s" % 
(v,)) + + + +def build_fds_for_msg(msg): + """ + Given a Protobuf message `msg` (or message class), build a + `FileDescriptorSet` that can be used with `DynamicMessageFactory` below (or + `protobag::DynamicMsgFactory` in C++) to dynamically deserialize instances + of `msg` at runtime (when the Protobuf-generated code for `msg` is + unavailable). + + See also `protobag::DynamicMsgFactory` in C++. + + We run a BFS of `msg`'s descriptor and its dependencies to collect all + data necessary to decode a `msg` instance. (NB: the current search is today + over-complete and pulls in unrelated types, too). The algorithm below + mirrors that in `protobag::BagIndexBuilder::Observe()`. We must run this + collection in python (and not C++) because we assume we only have the + Protobuf python-generated code available for `msg` in this code path. + + Args: + msg (Protobuf message or class): Build a `FileDescriptorSet` based upon + the `DESCRIPTOR` of this message. + + Returns: + A `FileDescriptorSet` protobuf message instance. + """ + + from google.protobuf.descriptor_pb2 import FileDescriptorProto + from google.protobuf.descriptor_pb2 import FileDescriptorSet + + q = [msg.DESCRIPTOR.file] + visited = set() + files = [] + while q: + current = q.pop() + if current.name not in visited: + # Visit! + visited.add(current.name) + + fd = FileDescriptorProto() + current.CopyToProto(fd) + files.append(fd) + + q.extend(current.dependencies) + + return FileDescriptorSet(file=files) + + + ## ============================================================================ ## == Public API ============================================================== ## ============================================================================ + +### +### Entries that one might find in a Protobag +### + +@attr.s(slots=True, eq=True, weakref_slot=False) +class Entry(object): + """A Protobag Entry, similar to a c++ `protobag::Entry`. 
You should probably + use a subclass like `MessageEntry`, `StampedEntry`, or `RawEntry` below.""" + + ## Core Data + + entryname = attr.ib(default='', type='str') + """str: Location of the entry in the Protobag archive.""" + + type_url = attr.ib(default='', type='str') + """str: The Protobuf Type URL (if any) documenting the type of the message. + This field is empty for `RawEntry`s""" + + + ## Optional Context + + serdes = attr.ib(default=None) + """PBSerdes: Handle to SERDES instance, if available""" + + descriptor_data = attr.ib(default=None) + """object: Protobuf data needed to decode messages of this type when + protoc-generated code is not available.""" + + + @classmethod + def from_nentry(cls, nentry, serdes=None): + """Construct and return an `Entry` subclass from the given + `protobag_native.nentry` `nentry`.""" + serdes = serdes or DEFAULT_SERDES + if nentry.type_url == '': + return RawEntry.from_nentry(nentry, serdes=serdes) + elif nentry.is_stamped: + return StampedEntry.from_nentry(nentry, serdes=serdes) + else: + return MessageEntry.from_nentry(nentry, serdes=serdes) + + +@attr.s(slots=True, eq=True, weakref_slot=False) +class MessageEntry(Entry): + """A Protobuf message entry. 
+ + Entrynames look like: 'foo/bar' + """ + + msg = attr.ib(default=None) + """google.protobuf.message.Message: Raw message contents""" + + @classmethod + def from_msg(cls, entryname, msg, **kwargs): + return cls( + entryname=entryname, + msg=msg, + type_url=get_type_url(msg), + **kwargs) + + @classmethod + def from_nentry(cls, nentry, serdes=None): + msg = serdes.msg_from_typed_bytes( + TypedBytes( + type_url=nentry.type_url, + entryname=nentry.entryname, + msg_bytes=nentry.msg_bytes)) + + return cls( + entryname=nentry.entryname, + type_url=nentry.type_url, + serdes=serdes, + msg=msg) + + def __str__(self): + lines = [ + 'MessageEntry:', + ' entryname: %s' % self.entryname, + ' type_url: %s' % self.type_url, + ' has serdes: %s' % (self.serdes is not None), + ' has descriptor_data: %s' % (self.descriptor_data is not None), + ' msg:\n%s' % str(self.msg), # Uses protobuf text_format + ] + return "\n".join(lines) + + +@attr.s(slots=True, eq=True, weakref_slot=False) +class StampedEntry(MessageEntry): + """A Protobag StampedMessage entry""" + + topic = attr.ib(default='', type='str') + """str: The topic (or channel) of time-series data for this message. This is + *not* the entryname, which might not be known unless the message has been + written. 
+ + Example topic: '/sensor/data' + """ + + timestamp = attr.ib(default='', type=Timestamp, converter=to_pb_timestamp) + """google.protobuf.timestamp_pb2.Timestamp: The time associated with this + entry""" + + @classmethod + def from_msg(cls, topic, timestamp, msg, **kwargs): + return cls( + topic=topic, + timestamp=to_pb_timestamp(timestamp), + type_url=get_type_url(msg), + msg=msg, + **kwargs) + + @classmethod + def from_nentry(cls, nentry, serdes=None): + msg_entry = MessageEntry.from_nentry(nentry, serdes=serdes) + + assert nentry.is_stamped, "Not a stamped message" + return cls( + topic=nentry.topic, + timestamp=Timestamp( + seconds=nentry.sec, + nanos=nentry.nanos), + **attr.asdict(msg_entry)) + + def __str__(self): + lines = [ + 'StampedEntry:', + ' topic: %s' % self.topic, + ' timestamp: %s sec %s ns' % ( + self.timestamp.seconds, self.timestamp.nanos), + ' type_url: %s' % self.type_url, + ' entryname: %s' % self.entryname, + ' has serdes: %s' % (self.serdes is not None), + ' has descriptor_data: %s' % (self.descriptor_data is not None), + ' msg:\n%s' % str(self.msg), # Uses protobuf text_format + ] + return "\n".join(lines) + + +@attr.s(slots=True, eq=True, weakref_slot=False) +class RawEntry(Entry): + """A Raw entry with no known type url (might not even be + a Protobuf message!). + + Entrynames look like: 'foo/bar.png' + """ + + raw_bytes = attr.ib(default='', type='bytearray') + """bytearray: Raw message contents""" + + @classmethod + def from_bytes(cls, entryname, raw_bytes, **kwargs): + return cls( + entryname=entryname, + raw_bytes=raw_bytes, + type_url='', # Raw messages have no type + **kwargs) + + @classmethod + def from_nentry(cls, nentry, serdes=None): + return cls( + entryname=nentry.entryname, + type_url=nentry.type_url, + serdes=serdes, + + raw_bytes=nentry.msg_bytes) + + def __str__(self): + lines = [ + 'RawEntry:', + ' entryname: %s' % self.entryname, + ' raw_bytes: %s ... 
(%s bytes)' % ( + self.raw_bytes[:20].decode() if self.raw_bytes is not None else 'None', + len(self.raw_bytes) if self.raw_bytes is not None else 0), + ] + return "\n".join(lines) + + +### +### Interacting with Protobag files +### + class Protobag(object): - def __init__(self, path=None, decoder=None, msg_classes=None): - """TODO + def __init__(self, path=None, serdes=None, msg_classes=None): + """Handle to a Protobag archive on disk at the given `path`. Use this + object to help organize your reads and writes to an existing or new + Protobag archive. """ self._path = str(path or '') - self._decoder = decoder + self._serdes = serdes self._writer = None if msg_classes is not None: for msg_cls in msg_classes: @@ -31,10 +353,10 @@ def __init__(self, path=None, decoder=None, msg_classes=None): ## Utils @property - def decoder(self): - if self._decoder is None: - self._decoder = copy.deepcopy(_DefaultPBDecoder) - return self._decoder + def serdes(self): + if self._serdes is None: + self._serdes = copy.deepcopy(DEFAULT_SERDES) + return self._serdes @property def path(self): @@ -42,20 +364,29 @@ def path(self): def register_msg_type(self, msg_cls): """Shortcut to update the wrapped decoder""" - self.decoder.register_msg_type(msg_cls) + self.serdes.register_msg_type(msg_cls) ## Reading def get_bag_index(self): """Get the (latest) `BagIndex` instance from this protobag.""" - from protobag.protobag_native import Reader - bag_index_str = Reader.get_index(self._path) + from protobag.protobag_native import PyReader + bag_index_str = PyReader.get_index(self._path) msg = BagIndex() msg.ParseFromString(bag_index_str) return msg - def iter_entries(self, selection=None, dynamic_decode=True): + def get_topics(self): + """Get the list of topics for any time-series data in this protobag.""" + from protobag.protobag_native import PyReader + return PyReader.get_topics(self._path) + + def iter_entries( + self, + selection=None, + dynamic_decode=True, + sync_using_max_slop=None): 
"""Create a `ReadSession` and iterate through entries specified by the given `selection`; by default "SELECT ALL" (read all entries in the protobag). @@ -70,9 +401,13 @@ def iter_entries(self, selection=None, dynamic_decode=True): If you lack the generated protobuf message definition code for your messages, try this option; note that dynamic decoding is slower than normal protobuf deserialization. + sync_using_max_slop (optional protobag_native.MaxSlopTimeSyncSpec): + Synchronize StampedEntry instances in the `selection` using + a max slop algorithm. FMI see `protobag_native.PyMaxSlopTimeSync`. Returns: - Generates `PBEntry` instances + Generates `Entry` subclass instances (or a list of `Entry` instances + when synchronization is requested) """ if selection is None: @@ -83,17 +418,48 @@ def iter_entries(self, selection=None, dynamic_decode=True): selection_bytes = selection if dynamic_decode: - self.decoder.register_dynamic_types_from_index(self.get_bag_index()) - - from protobag.protobag_native import Reader - reader = Reader() + self.serdes.register_dynamic_types_from_index(self.get_bag_index()) + + def iter_results(pb_seq, unpack): + while True: + res = pb_seq.get_next() + # NB: We use this exception instead of pybind11::stop_iteration due + # to a bug in pybind related to libc++. FMI see: + # * https://gitter.im/pybind/Lobby?at=5f18cfc9361e295cf01fd21a + # * (This fix appears to still have a bug) + # https://github.com/pybind/pybind11/pull/949 + if res is not None: + yield unpack(res) + else: + return + + from protobag.protobag_native import PyReader + reader = PyReader() reader.start(self._path, selection_bytes) - for nentry in reader: - yield PBEntry(nentry=nentry, decoder=self.decoder) + + if sync_using_max_slop is not None: + # Synchronize! 
+      from protobag.protobag_native import PyMaxSlopTimeSync
+      sync = PyMaxSlopTimeSync()
+      sync.start(reader, sync_using_max_slop)
+
+      def unpack_bundle(bundle):
+        return [
+          Entry.from_nentry(nentry, serdes=self.serdes)
+          for nentry in bundle
+        ]
+      for entry in iter_results(sync, unpack_bundle):
+        yield entry
+
+    else:
+
+      unpack = lambda nentry: Entry.from_nentry(nentry, serdes=self.serdes)
+      for entry in iter_results(reader, unpack):
+        yield entry
 
   def get_entry(self, entryname):
     """Convenience for getting a single entry with `entryname`."""
-    sel = SelectionBuilder.get_entry(entryname)
+    sel = SelectionBuilder.select_entry(entryname)
     for entry in self.iter_entries(selection=sel):
       return entry
     raise KeyError("Protobag %s missing entry %s" % (self._path, entryname))
@@ -139,18 +505,34 @@ def create_writer(
       spec.save_descriptor_index = save_descriptor_index
     return _Writer(spec)
 
+
 class _Writer(object):
   def __init__(self, spec):
     self._spec = spec
     self._indexed_type_urls = set()
-    from protobag.protobag_native import Writer
-    self._writer = Writer()
+    from protobag.protobag_native import PyWriter
+    self._writer = PyWriter()
     self._writer.start(spec)
 
   def close(self):
     self._writer.close()
+
+  def write_entry(self, entry):
+    if isinstance(entry, RawEntry):
+      self.write_raw(entry.entryname, entry.raw_bytes)
+    elif isinstance(entry, StampedEntry):
+      self.write_stamped_msg(
+        entry.topic,
+        entry.msg,
+        timestamp=entry.timestamp)
+    elif isinstance(entry, MessageEntry):
+      self.write_msg(entry.entryname, entry.msg)
+    else:
+      raise ValueError("Don't know what to do with %s" % (entry,))
+
+
+  def write_raw(self, entryname, raw_bytes):
     self._writer.write_raw(entryname, raw_bytes)
@@ -183,138 +565,77 @@ def _maybe_get_serialized_fds(self, msg):
     return fds_bytes
 
 
-## ============================================================================
-## == Utils ===================================================================
-## 
============================================================================ - -def get_type_url(pb_msg): - # See also `protobag::GetTypeURL()` - return 'type.googleapis.com/' + pb_msg.DESCRIPTOR.full_name - - -def to_pb_timestamp(v): - """Try to convert value `v` to a Protobuf `Timestamp` instance.""" - if isinstance(v, Timestamp): - return v - elif isinstance(v, datetime.datetime): - ts = Timestamp() - ts.FromDatetime(v) - return ts - elif isinstance(v, int): - ts = Timestamp() - ts.FromSeconds(v) - return ts - elif isinstance(v, float): - sec = int(v) - nsec = int((v - sec) * 1e9) - ts = Timestamp(seconds=sec, nanos=nsec) - return ts - else: - raise ValueError( - "Don't know what to do with timestamp %s" % (v,)) - - -def to_sec_nanos(v): - """Try to convert value `v` to a (seconds, nanoseconds) tuple""" - if isinstance(v, (tuple, list)) and len(v) == 2: - return v - elif isinstance(v, Timestamp): - return (v.seconds, v.nanos) - elif isinstance(v, datetime.datetime): - import calendar - return ( - calendar.timegm(dt.utctimetuple()), # seconds - dt.microsecond * 1000) # nanos - elif isinstance(v, int): - return (v, 0) - elif isinstance(v, float): - sec = int(v) - nsec = int((v - sec) * 1e9) - return (sec, nsec) - else: - raise ValueError( - "Don't know what to do with value %s" % (v,)) - - -def to_topic_time(v): - """Try to convert value `v` to a TopicTime instance.""" - if isinstance(v, TopicTime): - return v - elif isinstance(v, dict): - tt = TopicTime( - topic=v['topic'], - timestamp=to_pb_timestamp(v['timestamp'])) - if 'entryname' in v: - tt.entryname = v['entryname'] - return tt - elif isinstance(v, (tuple, list)): - entryname = None - if len(v) == 2: - topic, ts = v - elif len(v) == 3: - topic, ts, entryname = v - else: - raise ValueError("Can't unpack to TopicTime: %s" % (v,)) - tt = TopicTime( - topic=topic, - timestamp=to_pb_timestamp(ts)) - if entryname is not None: - tt.entryname = entryname - return tt - else: - raise ValueError( - "Don't know what to 
do with value %s" % (v,)) - - +### +### Selecting Data from Protobags +### -def build_fds_for_msg(msg): - """ - Given a Protobuf message `msg` (or message class), build a - `FileDescriptorSet` that can be used with `DynamicMessageFactory` below (or - `protobag::DynamicMsgFactory` in C++) to dynamically deserialize instances - of `msg` at runtime (when the Protobuf-generated code for `msg` is - unavailable). - - See also `protobag::DynamicMsgFactory` in C++. - - We run a BFS of `msg`'s descriptor and its dependencies to collect all - data necessary to decode a `msg` instance. The algorithm below mirrors that - in `protobag::BagIndexBuilder::Observe()`. We must run this collection in - python because (we assume) we only have the Protobuf python-generated code - available for `msg` in this code path. +class SelectionBuilder(object): + """Helper for creating Protobag (read) Selections.""" + + @classmethod + def select_all(cls, all_entries_are_raw=False): + return Selection( + select_all={'all_entries_are_raw': all_entries_are_raw}) - Args: - msg (Protobuf message or class): Build a `FileDescriptorSet` based upon - the `DESCRIPTOR` of this message. + @classmethod + def select_entries( + cls, + entrynames, + ignore_missing_entries=False, + entries_are_raw=False): - Returns: - A `FileDescriptorSet` protobuf message instance. - """ - - from google.protobuf.descriptor_pb2 import FileDescriptorProto - from google.protobuf.descriptor_pb2 import FileDescriptorSet + return Selection( + entrynames={ + 'entrynames': entrynames, + 'ignore_missing_entries': ignore_missing_entries, + 'entries_are_raw': entries_are_raw, + }) + + @classmethod + def select_entry(cls, entryname, **kwargs): + return cls.select_entries([entryname], **kwargs) - q = [msg.DESCRIPTOR.file] - visited = set() - files = [] - while q: - current = q.pop() - if current.name not in visited: - # Visit! 
- visited.add(current.name) - - fd = FileDescriptorProto() - current.CopyToProto(fd) - files.append(fd) + @classmethod + def select_window( + cls, + topics=None, + start_time=None, + end_time=None, + exclude_topics=None): - q.extend(current.dependencies) + spec = {} + if topics is not None: + spec['topics'] = topics + if start_time is not None: + spec['start'] = to_pb_timestamp(start_time) + if end_time is not None: + spec['end'] = to_pb_timestamp(end_time) + if exclude_topics is not None: + spec['exclude_topics'] = exclude_topics + return Selection(window=spec) - return FileDescriptorSet(file=files) - + @classmethod + def select_window_all(cls): + return cls.select_window() + + @classmethod + def select_events( + cls, + topic_times=None, + require_all=False): + spec = {} + if topic_times is not None: + spec['events'] = [ + to_topic_time(tt) + for tt in topic_times + ] + spec['require_all'] = require_all + return Selection(events=spec) + ## ============================================================================ -## == Protobuf Message Decoding =============================================== +## == SERDES: Decoding / Encoding Protobuf Messages =========================== ## ============================================================================ DEFAULT_MSG_TYPES = ( @@ -332,11 +653,43 @@ def build_fds_for_msg(msg): TopicTime, ) -class PBDecoder(object): +@attr.s(slots=True, eq=True, weakref_slot=False) +class TypedBytes(object): + type_url = attr.ib(default='', type='str') + entryname = attr.ib(default='', type='str') + msg_bytes = attr.ib(default=None, type='bytearray') - def __init__(self): - self._type_url_to_cls = {} - self._dynamic_factory = None + def __str__(self): + lines = [ + 'TypedBytes:', + ' type_url: %s' % self.type_url, + ' entryname: %s' % self.entryname, + ' msg_bytes: %s ... 
(%s bytes)' % ( + self.msg_bytes[:20].decode() if self.msg_bytes is not None else 'None', + len(self.msg_bytes) if self.msg_bytes is not None else 0), + ] + return "\n".join(lines) + + +class PBSerdes(object): + """A SERDES utility for Protobuf messages. Not a primary public protobag + API; you probably want to use protobag.Protobag directly. + How `PBSerdes` helps: + Decoding: + * When `protoc`-generated Protobuf python code is available for your + messages, you can register that code with `PBSerdes` and then `PBSerdes` + will help you decode arbitrary messages given a `TypedBytes` + instance. + * When `protoc`-generated code is unavailable, you can register descriptor + data (e.g. the kind that `protobag` indexes at recording time) with + `PBSerdes`, and `PBSerdes` will use Protobuf's dynamic message support + for decoding `TypedBytes`. + Encoding: + * Facilitates lookups of cached descriptor data (primarily for DictRowEntry + API). + """ + + ## Setup @classmethod def create_with_types(cls, pb_msg_clss): @@ -355,10 +708,39 @@ def register_dynamic_types_from_index(self, bag_index): self._dynamic_factory = \ DynamicMessageFactory.create_from_descriptor_pool_data(dpd) # TODO support multiple indices + + self._type_url_to_descriptor_data = dict(dpd.type_url_to_descriptor) + + def register_descriptor_data(self, type_url, descriptor_data): + if type_url not in self._type_url_to_descriptor_data: + is_binary = ( + isinstance(descriptor_data, six.string_types) or + isinstance(descriptor_data, six.binary_type) or + isinstance(descriptor_data, bytearray)) + if is_binary: + from google.protobuf.descriptor_pb2 import FileDescriptorSet + fds = FileDescriptorSet() + fds.ParseFromString(descriptor_data) + descriptor_data = fds + + self._type_url_to_descriptor_data[type_url] = descriptor_data + + if self._dynamic_factory is None: + self._dynamic_factory = DynamicMessageFactory() + self._dynamic_factory.register_type(type_url, descriptor_data) - def decode(self, type_url, 
msg_bytes, entryname=None): - """Decode string-serialized Protobuf message `msg_bytes`, interpreting - the bytes as `type_url`, and return a decoded Protobuf message instance. + ## I/O + + @classmethod + def msg_to_typed_bytes(cls, msg): + """Serialize protobuf `msg` and return a `TypedBytes` wrapper""" + return TypedBytes( + type_url=get_type_url(msg), + msg_bytes=msg.SerializeToString()) # TODO support text format + + def msg_from_typed_bytes(self, typed_bytes): + """Decode string-serialized Protobuf message (wraped in `TypedBytes`) + `typed_bytes` and return a decoded Protobuf message instance. Picks a message deserializer based upon: * `type_url`, the identifer of a message class that the user registered using `register_msg_type()` @@ -366,27 +748,59 @@ def decode(self, type_url, msg_bytes, entryname=None): indexed to the Protobag at write time (and that data has been made available through `register_dynamic_types_from_index()`) """ - if type_url in self._type_url_to_cls: - - msg_cls = self._type_url_to_cls[type_url] + if typed_bytes.type_url in self._type_url_to_cls: + msg_cls = self._type_url_to_cls[typed_bytes.type_url] msg = msg_cls() - msg.ParseFromString(msg_bytes) # TODO support text format + msg.ParseFromString(typed_bytes.msg_bytes) # TODO support text format return msg - elif self._dynamic_factory is not None: - - return self._dynamic_factory.dynamic_decode( - msg_bytes, - type_url=type_url, - entryname=entryname) - + return self._dynamic_factory.dynamic_decode(typed_bytes) else: - - raise ValueError("Could not decode message for type %s " % type_url) + raise ValueError( + "Could not decode message from %s \n%s" % ( + typed_bytes, str(self))) + + def get_msg_cls_for_type(self, type_url): + if type_url in self._type_url_to_cls: + return self._type_url_to_cls[type_url] + elif type_url in self._type_url_to_descriptor_data: + assert self._dynamic_factory + return self._dynamic_factory.get_msg_cls_for_type_url(type_url) + + def 
get_descriptor_data_for_type( + self, + type_url, + msg=None, + lazyily_register=True): + """Fetch the descriptor data for `type_url`; lazily deduce such + descriptor data and register it only if `lazyily_register`.""" + if type_url in self._type_url_to_descriptor_data: + return self._type_url_to_descriptor_data[type_url] + else: + if type_url in self._type_url_to_cls: + msg_cls = self._type_url_to_cls[type_url] + descriptor_data = build_fds_for_msg(msg_cls) + elif msg is not None: + descriptor_data = build_fds_for_msg(msg) + else: + raise KeyError("Can't find or build descriptor data for %s" % type_url) + + if lazyily_register: + self.register_descriptor_data(type_url, descriptor_data) + return descriptor_data + + + ## Misc + + def __init__(self): + self._type_url_to_cls = {} + self._dynamic_factory = None + self._type_url_to_descriptor_data = {} + def __str__(self): return '\n'.join(( - 'PBDecoder', + 'PBSerdes', 'User-registered types:', '\n'.join(sorted(self._type_url_to_cls.keys())), '', @@ -394,7 +808,8 @@ def __str__(self): str(self._dynamic_factory), )) -_DefaultPBDecoder = PBDecoder.create_with_types(DEFAULT_MSG_TYPES) +# NB: each protobag.Protobag instance owns a *copy* of this default +DEFAULT_SERDES = PBSerdes.create_with_types(DEFAULT_MSG_TYPES) @@ -402,6 +817,7 @@ class DynamicMessageFactory(object): def __init__(self): from google.protobuf import symbol_database self._db = symbol_database.Default() + self._registered_type_urls = set() self._entryname_to_type_url = {} @classmethod @@ -410,8 +826,8 @@ def create_from_descriptor_pool_data(cls, dpd): `protobag.DescriptorPoolData` message.""" f = cls() f.register_entries(dpd.entryname_to_type_url) - for fds in dpd.type_url_to_descriptor.values(): - f.register_types(fds) + for type_url, fds in dpd.type_url_to_descriptor.items(): + f.register_type(type_url, fds) return f def register_entry(self, entryname, type_url): @@ -421,38 +837,50 @@ def register_entries(self, entryname_to_type_url): for entryname, 
type_url in entryname_to_type_url.items(): self.register_entry(entryname, type_url) - def register_types(self, fds): - for fd in fds.file: - self._db.pool.Add(fd) + def register_type(self, type_url, fds): + if type_url not in self._registered_type_urls: + for fd in fds.file: + self._db.pool.Add(fd) + self._registered_type_urls.add(type_url) - def dynamic_decode(self, msg_bytes, type_url=None, entryname=None): - """Decode the given `msg_bytes` into a Protobuf message instance of either - type `type_url` or whatever type the entry `entryname` was indexed to have. + def dynamic_decode(self, typed_bytes): + """Decode the given `typed_bytes` into a Protobuf message instance of + either type `typed_bytes.type_url` or whatever type the entry + `typed_bytes.entryname` was indexed to have. """ - assert type_url or entryname, "Need a type_url or entryname" + assert typed_bytes.type_url or typed_bytes.entryname, \ + "Need a type_url or entryname" # Prefer entryname, which Protobag pins to a specific FileDescriptorSet # at time of writing (in case the message type evolves between write # sessions). 
- if entryname is not None: - assert entryname in self._entryname_to_type_url, \ - "Unregistered protobag entry: %s" % entryname - type_url = self._entryname_to_type_url[entryname] + if typed_bytes.entryname: + assert typed_bytes.entryname in self._entryname_to_type_url, \ + "Unregistered protobag entry: %s" % typed_bytes.entryname + typed_bytes.type_url = self._entryname_to_type_url[typed_bytes.entryname] + + try: + msg_cls = self.get_msg_cls_for_type_url(typed_bytes.type_url) + except Exception as e: + raise KeyError("Cannot dynamic decode %s: %s" % (typed_bytes, e)) + msg = msg_cls() + msg.ParseFromString(typed_bytes.msg_bytes) + # TODO support text format + return msg + + def get_msg_cls_for_type_url(self, type_url): # Based upon https://github.com/protocolbuffers/protobuf/blob/86b3ccf28ca437330cc42a2b3a75a1314977fcfd/python/google/protobuf/json_format.py#L397 type_name = type_url.split('/')[-1] try: descriptor = self._db.pool.FindMessageTypeByName(type_name) except Exception as e: raise KeyError( - "Could not find descriptor for %s: %s" % ((type_url, entryname), e)) + "Could not find descriptor for %s: %s" % (type_url, e)) msg_cls = self._db.GetPrototype(descriptor) - msg = msg_cls() - msg.ParseFromString(msg_bytes) - # TODO support text format - return msg - + return msg_cls + def __str__(self): return '\n'.join(( 'protobag.DynamicMessageFactory', @@ -468,149 +896,147 @@ def __str__(self): ## ============================================================================ -## == Entries (Reading) ======================================================= +## == Protobag <-> Tables of Rows ============================================= ## ============================================================================ -class PBEntry(object): - """A single entry in a protobag; analogous to a C++ `protobag::Entry`""" - - __slots__ = ['_nentry', '_pb_msg', '_decoder', '_topic', '_timestamp'] - - def __init__(self, nentry=None, decoder=None): - if nentry is None: - from 
protobag.protobag_native import native_entry - nentry = native_entry() - self._nentry = nentry - self._decoder = decoder or _DefaultPBDecoder - self._pb_msg = None - self._topic = None - self._timestamp = None - - def __str__(self): - lines = [] - if self.is_stamped_message(): - lines += [ - "Topic: %s" % self.topic, - "Timestamp: %s sec %s ns" % ( - self.timestamp.seconds, self.timestamp.nanos), - ] - lines += [ - "Entryname: %s" % self.entryname, - "type_url: %s" % self.type_url, - "size: %s bytes" % len(self.raw_msg_bytes), - ] - - if self._pb_msg: - lines.append("msg: \n%s\n" % str(self._pb_msg)) - else: - lines.append("msg: (not yet deserialized)") - - return "\n".join(lines) +def to_pb_timestamp_safe(v): + try: + return to_pb_timestamp(v) + except Exception: + return None + +@attr.s(slots=True, eq=True, weakref_slot=False) +class DictRowEntry(object): + """Utility for converting Protobag entries to and from python dicts / + table "rows". + + # Examples: + ## Entry -> dict + >>> bag = protobag.Protobag(path='my_bag.zip') + >>> entry = bag.get_entry('foo') + >>> row = DictRowEntry.from_entry(entry) + # row is now a DictRowEntry; use the attributes directly or try + # `attrs.asdict()` + + ## dict -> Entry + >>> bag = protobag.Protobag(path='my_bag.zip', msg_classes=(MyPbMsgType,)) + >>> d = dict( + entryname='foo', + type_url='type.googleapis.com/MyPbMsgType', + # Can get this using `get_type_url(MyPbMsgType)` + msg_dict={'x': 5, 'y': 7}, + serdes=bag.serdes) + # Need to provide the `serdes` that knows how to encode `MyPbMsgType` + # instances from python dicts. 
If your dict has `descriptor_data`, + # you can omit the serdes + >>> row = DictRowEntry(**d) + >>> entry = row.to_entry() + # entry is now a MessageEntry instance + >>> writer = bag.create_writer() + >>> writer.write_entry(entry) - @property - def entryname(self): - return self._nentry.entryname + """ - @property - def raw_msg_bytes(self): - return self._nentry.msg_bytes + msg_dict = attr.ib(factory=dict, type=dict) - @property - def type_url(self): - return self._nentry.type_url - - def get_msg(self): - if not self._pb_msg: - if self.type_url: - # NB: If this is a Stamped Message, protobag_native will have already - # unwrapped the StampedMessage wrapper. - self._pb_msg = self._decoder.decode(self.type_url, self.raw_msg_bytes) - else: - # This message is raw - self._pb_msg = self.raw_msg_bytes - return self._pb_msg - - - # For Stamped Messages only - - def is_stamped_message(self): - return self._nentry.is_stamped - - @property - def topic(self): - assert self._nentry.is_stamped, "Not a stamped message" - return self._nentry.topic - - @property - def timestamp(self): - if not self._timestamp: - assert self._nentry.is_stamped, "Not a stamped message" - self._timestamp = Timestamp( - seconds=self._nentry.sec, - nanos=self._nentry.nanos) - return self._timestamp + entryname = attr.ib(default='', type='str') + type_url = attr.ib(default='', type='str') + topic = attr.ib(default='', type='str') + timestamp = attr.ib( + default=None, type=Timestamp, converter=to_pb_timestamp_safe) + descriptor_data = attr.ib(default=None) + serdes = attr.ib(default=DEFAULT_SERDES) + @classmethod + def from_entry(cls, entry): + if isinstance(entry, RawEntry): + msg_dict = {'protobag_raw_entry_bytes': entry.raw_bytes} + else: + msg_dict = json_format.MessageToDict(entry.msg) + return cls( + entryname=entry.entryname, + type_url=entry.type_url, + msg_dict=msg_dict, + + serdes=entry.serdes, + descriptor_data= + entry.serdes.get_descriptor_data_for_type(entry.type_url) + if not 
isinstance(entry, RawEntry) + else None, -## ============================================================================ -## == Selections ============================================================== -## ============================================================================ + topic=entry.topic if isinstance(entry, StampedEntry) else '', + timestamp=entry.timestamp if isinstance(entry, StampedEntry) else None) -class SelectionBuilder(object): - """Helper for creating Protobag (read) Selections.""" + def is_raw_entry(self): + return 'protobag_raw_entry_bytes' in self.msg_dict - @classmethod - def select_all(cls, all_entries_are_raw=False): - return Selection( - select_all={'all_entries_are_raw': all_entries_are_raw}) + def is_stamped_entry(self): + return self.topic and (self.timestamp is not None) + + def to_entry(self): + if self.is_raw_entry(): + return RawEntry( + entryname=self.entryname, + type_url='', + raw_bytes=self.msg_dict['protobag_raw_entry_bytes'], + serdes=self.serdes) + else: + if self.descriptor_data: + # Maybe use this data to facilitate messsage parsing below + self.serdes.register_descriptor_data( + self.type_url, + self.descriptor_data) + msg_cls = self.serdes.get_msg_cls_for_type(self.type_url) + msg = msg_cls() + json_format.ParseDict(self.msg_dict, msg) + + if self.is_stamped_entry(): + return StampedEntry( + entryname=self.entryname, + type_url=self.type_url, + msg=msg, + + topic=self.topic, + timestamp=self.timestamp, + + serdes=self.serdes, + descriptor_data=self.descriptor_data) + else: + return MessageEntry( + entryname=self.entryname, + type_url=self.type_url, + msg=msg, + + serdes=self.serdes, + descriptor_data=self.descriptor_data) - @classmethod - def select_entries( - cls, - entrynames, - ignore_missing_entries=False, - entries_are_raw=False): - - return Selection( - entrynames={ - 'entrynames': entrynames, - 'ignore_missing_entries': ignore_missing_entries, - 'entries_are_raw': entries_are_raw, - }) - - @classmethod - def 
select_entry(cls, entryname, **kwargs): - return cls.select_entries([entryname], **kwargs) + def __str__(self): + import pprint + + def get_descriptor_data_formatted(): + v = self.descriptor_data + if isinstance(v, six.string_types): + return "(binary) %s ... (%s bytes)" % (v[:20].decode(), len(v)) + elif hasattr(v, 'DESCRIPTOR'): + return "(protobuf message) %s %s (%s bytes)" % ( + v.DESCRIPTOR.full_name, + str(v)[:20], + len(v.SerializeToString())) + else: + return str(v) - @classmethod - def select_window( - cls, - topics=None, - start_time=None, - end_time=None, - exclude_topics=None): + return '\n'.join(( + 'protobag.DictRowEntry:', + ' entryname: %s' % self.entryname, + ' topic: %s timestamp: %s' % ( + self.topic, + '%s sec %s ns' % (self.timestamp.seconds, self.timestamp.nanos)) + if self.timestamp is not None + else ' (not a time-series entry)', + ' type_url: %s' % self.type_url, + ' has serdes: %s' % (self.serdes is not None), + ' descriptor_data: %s' % get_descriptor_data_formatted(), + ' msg_dict:\n %s' % pprint.pformat(self.msg_dict), + )) - spec = {} - if topics is not None: - spec['topics'] = topics - if start_time is not None: - spec['start'] = to_pb_timestamp(start_time) - if end_time is not None: - spec['end'] = to_pb_timestamp(end_time) - if exclude_topics is not None: - spec['exclude_topics'] = exclude_topics - return Selection(window=spec) - - @classmethod - def select_events( - cls, - topic_times=None, - require_all=False): - spec = {} - if topic_times is not None: - spec['events'] = [ - to_topic_time(tt) - for tt in topic_times - ] - spec['require_all'] = require_all - return Selection(events=spec) diff --git a/python/protobag_test/test_protobag.py b/python/protobag_test/test_protobag.py index b913b12..34121fc 100644 --- a/python/protobag_test/test_protobag.py +++ b/python/protobag_test/test_protobag.py @@ -1,18 +1,46 @@ +import copy +import itertools +import os -from protobag import Protobag +import pytest -def test(): - p = Protobag() 
+import protobag +### +### Test Utils +### +def mkdir(path): + import errno + try: + os.makedirs(path) + except OSError as exc: + if exc.errno == errno.EEXIST and os.path.isdir(path): + pass + else: + raise -from protobag import ProtoBag -from protobag import Entry -from protobag.ProtobagMsg_pb2 import StdMsg +def rm_rf(path): + import shutil + shutil.rmtree(path) + + +def get_test_tempdir(testname, clean=True): + import tempfile + path = os.path.join(tempfile.gettempdir(), testname) + + if clean: + mkdir(path) + rm_rf(path) + mkdir(path) + + return path + def to_std_msg(v): + from protobag.ProtobagMsg_pb2 import StdMsg if isinstance(v, bool): return StdMsg.Bool(value=v) elif isinstance(v, int): @@ -20,8 +48,8 @@ def to_std_msg(v): elif isinstance(v, float): return StdMsg.Float(value=v) elif isinstance(v, str): - # TODO python2 support ? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - # NB: deprecation warning fixed in pb 3.3 https://github.com/protocolbuffers/protobuf/pull/2922 + # TODO python2 support? 
NB: deprecation warning fixed in protobuf 3.3 + # https://github.com/protocolbuffers/protobuf/pull/2922 return StdMsg.String(value=v) elif isinstance(v, (bytes, bytearray)): return StdMsg.Bytes(value=v) @@ -29,46 +57,443 @@ def to_std_msg(v): raise TypeError(v) -def to_unixtime(dt): - import time - return int(time.mktime(dt.timetuple())) +## ============================================================================ +## == Test Protobag Utils ===================================================== +## ============================================================================ +def test_type_url(): + msg = to_std_msg("foo") + assert protobag.get_type_url(msg) == \ + 'type.googleapis.com/protobag.StdMsg.String' -def test_demo(): - import tempfile + msg = to_std_msg(1337) + assert protobag.get_type_url(msg) == \ + 'type.googleapis.com/protobag.StdMsg.Int' + + with pytest.raises(Exception): + protobag.get_type_url("junk") + + +def test_to_pb_timestamp(): + import datetime + from google.protobuf.timestamp_pb2 import Timestamp + + expected = Timestamp(seconds=1337, nanos=1337) + assert protobag.to_pb_timestamp(expected) == expected + + assert protobag.to_pb_timestamp(1337) == Timestamp(seconds=1337) + assert protobag.to_pb_timestamp(1337.1337) == \ + Timestamp(seconds=1337, nanos=133700000) + assert protobag.to_pb_timestamp((1337, 1337)) == expected + + assert protobag.to_pb_timestamp( + datetime.datetime( + year=1970, month=1, day=1, second=13, microsecond=37)) == \ + Timestamp(seconds=13, nanos=37000) + + with pytest.raises(ValueError): + protobag.to_pb_timestamp("123") + + +def test_to_sec_nanos(): + import datetime + from google.protobuf.timestamp_pb2 import Timestamp + + assert protobag.to_sec_nanos((1337, 1337)) == (1337, 1337) + + assert \ + protobag.to_sec_nanos(Timestamp(seconds=1337, nanos=1337)) == (1337, 1337) + + assert protobag.to_sec_nanos(1337) == (1337, 0) + assert protobag.to_sec_nanos(1337.1337) == (1337, 133700000) + + assert \ + 
protobag.to_sec_nanos( + datetime.datetime( + year=1970, month=1, day=1, second=13, microsecond=37)) \ + == (13, 37000) + + with pytest.raises(ValueError): + protobag.to_sec_nanos("123") + + with pytest.raises(ValueError): + protobag.to_sec_nanos((1, 2, 3)) + + +def test_to_topic_time(): + from google.protobuf.timestamp_pb2 import Timestamp + from protobag.ProtobagMsg_pb2 import TopicTime - f = tempfile.NamedTemporaryFile(suffix='.zip') - print('temp file', f.name) - b = ProtoBag.write_zip(f) - - ENTRIES = ( - Entry( - topic='/topic1', - timestamp=0, - msg=to_std_msg("foo")), - Entry( - topic='/topic1', - timestamp=1, - msg=to_std_msg("bar")), - Entry( - topic='/topic2', - timestamp=0, - msg=to_std_msg(1337)), + assert protobag.to_topic_time(TopicTime()) == TopicTime() + + assert \ + protobag.to_topic_time({'topic': 't', 'timestamp': 1.1}) == \ + TopicTime(topic='t', timestamp=Timestamp(seconds=1, nanos=100000000)) + + assert \ + protobag.to_topic_time(('t', 1.1)) == \ + TopicTime(topic='t', timestamp=Timestamp(seconds=1, nanos=100000000)) + + with pytest.raises(ValueError): + protobag.to_topic_time("foo") + + +def test_build_fds_for_msg(): + from protobag.ProtobagMsg_pb2 import TopicTime + fds = protobag.build_fds_for_msg(TopicTime) + actual_types = sorted(list(itertools.chain.from_iterable( + (f.package + '.' + t.name for t in f.message_type) + for f in fds.file))) + + expected_types = ( + # These types get pulled in by `build_fds_for_msg()` naively grabbing + # everything else in the package that contains the target type TopicTime + 'protobag.BagIndex', + 'protobag.Selection', + 'protobag.StampedMessage', + 'protobag.StdMsg', + 'protobag.TopicTime', + + # These protobuf types get pulled in by `build_fds_for_msg()` + # looking for transitive dependencies. 
So for example, if google's + # Timestamp message changes, then our snapshot of TopicTime will + # capture the version of Timestamp used when the TopicTime message was + # recorded + 'google.protobuf.Any', + 'google.protobuf.DescriptorProto', + 'google.protobuf.EnumDescriptorProto', + 'google.protobuf.EnumOptions', + 'google.protobuf.EnumValueDescriptorProto', + 'google.protobuf.EnumValueOptions', + 'google.protobuf.ExtensionRangeOptions', + 'google.protobuf.FieldDescriptorProto', + 'google.protobuf.FieldOptions', + 'google.protobuf.FileDescriptorProto', + 'google.protobuf.FileDescriptorSet', + 'google.protobuf.FileOptions', + 'google.protobuf.GeneratedCodeInfo', + 'google.protobuf.MessageOptions', + 'google.protobuf.MethodDescriptorProto', + 'google.protobuf.MethodOptions', + 'google.protobuf.OneofDescriptorProto', + 'google.protobuf.OneofOptions', + 'google.protobuf.ServiceDescriptorProto', + 'google.protobuf.ServiceOptions', + 'google.protobuf.SourceCodeInfo', + 'google.protobuf.Timestamp', + 'google.protobuf.UninterpretedOption', ) + assert actual_types == sorted(expected_types) + + + +## ============================================================================ +## == Tet Public API ========================================================== +## ============================================================================ + +def test_msg_entry_print(): + entry = protobag.MessageEntry.from_msg( + entryname='my_entry', + msg=to_std_msg('foo')) + assert str(entry) == ( + "MessageEntry:\n" + " entryname: my_entry\n" + " type_url: type.googleapis.com/protobag.StdMsg.String\n" + " has serdes: False\n" + " has descriptor_data: False\n" + " msg:\n" + "value: \"foo\"\n") + + +def test_raw_entry_print(): + entry = protobag.RawEntry.from_bytes( + entryname='my_entry', + raw_bytes=bytearray(b'abcabcabcabcabcabcabcabcabcabc')) + assert str(entry) == ( + "RawEntry:\n" + " entryname: my_entry\n" + " raw_bytes: abcabcabcabcabcabcab ... 
(30 bytes)") + + +def test_stamped_entry_print(): + entry = protobag.StampedEntry.from_msg( + topic='my_topic', + timestamp=(1337, 1337), + msg=to_std_msg('foo')) + assert str(entry) == ( + "StampedEntry:\n" + " topic: my_topic\n" + " timestamp: 1337 sec 1337 ns\n" + " type_url: type.googleapis.com/protobag.StdMsg.String\n" + " entryname: \n" + " has serdes: False\n" + " has descriptor_data: False\n" + " msg:\n" + "value: \"foo\"\n") + + + +## ============================================================================ +## == Test SERDES ============================================================= +## ============================================================================ + +def test_typed_bytes(): + t = protobag.TypedBytes(type_url='type_url', entryname='entryname') + assert str(t) == ( + "TypedBytes:\n" + " type_url: type_url\n" + " entryname: entryname\n" + " msg_bytes: None ... (0 bytes)") + + t = protobag.TypedBytes( + type_url='type_url', + entryname='entryname', + msg_bytes=bytearray(b'abcabcabcabcabcabcabcabcabcabc')) + assert str(t) == ( + "TypedBytes:\n" + " type_url: type_url\n" + " entryname: entryname\n" + " msg_bytes: abcabcabcabcabcabcab ... 
(30 bytes)") + + +_to_typed_bytes = protobag.PBSerdes.msg_to_typed_bytes + +def test_serdes_msg_from_typed_bytes_empty(): + tb = _to_typed_bytes(to_std_msg('moof')) + serdes = protobag.PBSerdes() + with pytest.raises(ValueError): + msg = serdes.msg_from_typed_bytes(tb) + - for entry in ENTRIES: - b.write_entry(entry) +def test_serdes_msg_from_typed_bytes_default_serdes(): + tb = _to_typed_bytes(to_std_msg('moof')) + serdes = copy.deepcopy(protobag.DEFAULT_SERDES) + # The DEFAULT_SERDES has built-in support for protobag standard messages + msg = serdes.msg_from_typed_bytes(tb) + assert msg.value == 'moof' + + +def test_serdes_msg_from_typed_bytes_user_registered(): + tb = _to_typed_bytes(to_std_msg('moof')) + + from protobag.ProtobagMsg_pb2 import StdMsg + serdes = protobag.PBSerdes.create_with_types([StdMsg.String]) + + msg = serdes.msg_from_typed_bytes(tb) + assert msg.value == 'moof' + + +def test_serdes_msg_from_typed_bytes_dynamic_decode(): + tb = _to_typed_bytes(to_std_msg('moof')) + + serdes = protobag.PBSerdes() + + from protobag.ProtobagMsg_pb2 import StdMsg + fds = protobag.build_fds_for_msg(StdMsg.String) + descriptor_data = fds.SerializeToString() + serdes.register_descriptor_data(tb.type_url, descriptor_data) + + msg = serdes.msg_from_typed_bytes(tb) + assert msg.value == 'moof' + + + +## ============================================================================ +## == Test Public API ========================================================= +## ============================================================================ + +def _check_zip_has_expected_files(path, expected_files): + import zipfile + f = zipfile.ZipFile(path, 'r') + actual = set(f.namelist()) + expected = set(expected_files) + missing = (expected - actual) + assert not missing, "Expected\n%s\nActual\n%s" % (expected, actual) + + +def test_write_read_msg(): + test_root = get_test_tempdir('test_write_read_msg') + path = os.path.join(test_root, 'bag.zip') + + bag = 
protobag.Protobag(path=path) + writer = bag.create_writer() + writer.write_msg('txt_foo', to_std_msg("foo")) + writer.write_msg('int_1337', to_std_msg(1337)) + writer.close() + + # Test zip archive contents using protobuf-blind zipfile + _check_zip_has_expected_files(path, ('int_1337', 'txt_foo')) + + # Read messages back + bag = protobag.Protobag(path=path) + with pytest.raises(KeyError): + entry = bag.get_entry('does_not_exist') + + entry = bag.get_entry('txt_foo') + assert entry.entryname == 'txt_foo' + assert entry.msg == to_std_msg("foo") + + entry = bag.get_entry('int_1337') + assert entry.entryname == 'int_1337' + assert entry.msg == to_std_msg(1337) + + +def test_write_read_stamped_msg(): + test_root = get_test_tempdir('test_write_read_stamped_msg') + path = os.path.join(test_root, 'bag.zip') + + bag = protobag.Protobag(path=path) + writer = bag.create_writer() + for t in range(3): + writer.write_stamped_msg("my_t1", to_std_msg(t), t_sec=t) + writer.write_stamped_msg("my_t2", to_std_msg(t+1), t_sec=t+1) + writer.close() + + # Test zip archive contents using protobuf-blind zipfile + _check_zip_has_expected_files( + path, + ( + 'my_t1/0.0.stampedmsg.protobin', + 'my_t1/1.0.stampedmsg.protobin', + 'my_t1/2.0.stampedmsg.protobin', + + 'my_t2/1.0.stampedmsg.protobin', + 'my_t2/2.0.stampedmsg.protobin', + 'my_t2/3.0.stampedmsg.protobin', + )) + + ## Read messages back + bag = protobag.Protobag(path=path) + with pytest.raises(KeyError): + entry = bag.get_entry('does_not_exist') + + # Test getting topic list + assert sorted(bag.get_topics()) == sorted(['my_t1', 'my_t2']) + + # Test getting time series data + def _check_expected_topic_t_value(sel, expected_topic_t_value): + actual_topic_t_value = [] + for entry in bag.iter_entries(selection=sel): + actual_topic_t_value.append( + (entry.topic, entry.timestamp.seconds, entry.msg.value)) + assert expected_topic_t_value == actual_topic_t_value + + _check_expected_topic_t_value( + 
protobag.SelectionBuilder.select_window_all(), + [ + ('my_t1', 0, 0), ('my_t1', 1, 1), ('my_t2', 1, 1), ('my_t1', 2, 2), + ('my_t2', 2, 2), ('my_t2', 3, 3) + ]) + _check_expected_topic_t_value( + protobag.SelectionBuilder.select_window(topics=['my_t1']), + [ + ('my_t1', 0, 0), ('my_t1', 1, 1), ('my_t1', 2, 2), + ]) + _check_expected_topic_t_value( + protobag.SelectionBuilder.select_window(topics=['my_t2']), + [ + ('my_t2', 1, 1), ('my_t2', 2, 2), ('my_t2', 3, 3) + ]) + _check_expected_topic_t_value( + protobag.SelectionBuilder.select_window(topics=['does_not_exist']), + []) + + # Test sync + from protobag.protobag_native import MaxSlopTimeSyncSpec + spec = MaxSlopTimeSyncSpec() + spec.topics = ['my_t1', 'my_t2'] + spec.set_max_slop(seconds=2, nanos=0) + + actual_bundles = [] + sel = protobag.SelectionBuilder.select_window_all() + for bundle in bag.iter_entries(selection=sel, sync_using_max_slop=spec): + actual_bundles.append(sorted( + (entry.topic, entry.timestamp.seconds, entry.msg.value) + for entry in bundle + )) + + expected_bundles = [ + [('my_t1', 1, 1), ('my_t2', 1, 1)], + [('my_t1', 2, 2), ('my_t2', 2, 2)], + ] + assert actual_bundles == expected_bundles + + +def test_write_read_raw(): + test_root = get_test_tempdir('test_write_read_raw') + path = os.path.join(test_root, 'bag.zip') + + bag = protobag.Protobag(path=path) + writer = bag.create_writer() + writer.write_raw('raw_data', b"i am a raw string") + writer.close() + + # Test zip archive contents using protobuf-blind zipfile + _check_zip_has_expected_files(path, ('raw_data',)) - # Flush / close the zip - del b - - b = ProtoBag.read_zip(f) - expected = sorted( - (entry.topic, entry.timestamp, entry.msg.value) - for entry in ENTRIES) - actual = sorted( - (entry.topic, to_unixtime(entry.timestamp), entry.msg.value) - for entry in b.iter_entries()) + # Read messages back + bag = protobag.Protobag(path=path) + with pytest.raises(KeyError): + entry = bag.get_entry('does_not_exist') - assert expected == actual 
+ entry = bag.get_entry("raw_data") + assert entry.entryname == 'raw_data' + assert entry.raw_bytes == b"i am a raw string" + + +## ============================================================================ +## == Test DictRowEntry ======================================================= +## ============================================================================ + +def test_dict_row_entry_round_trip(): + ## First create a fixture; need a backing protobag to create entries + test_root = get_test_tempdir('test_dict_row_entry_round_trip') + path = os.path.join(test_root, 'bag.zip') + + bag = protobag.Protobag(path=path) + writer = bag.create_writer() + writer.write_msg('txt_foo', to_std_msg("foo")) + for t in range(3): + writer.write_stamped_msg("my_t1", to_std_msg(str(t)), t_sec=t) + writer.write_raw('raw_data', b"i am a raw string") + writer.close() + + + ## Now read entries and test round trip entry -> dict -> entry + path_rewrite = os.path.join(test_root, 'bag_rewrite.zip') + + bag = protobag.Protobag(path=path) + bag_rewrite = protobag.Protobag(path=path_rewrite) + writer = bag_rewrite.create_writer() + + # Simple messages + entry = bag.get_entry("txt_foo") + row = protobag.DictRowEntry.from_entry(entry) + assert row.entryname == "txt_foo" + assert row.type_url == 'type.googleapis.com/protobag.StdMsg.String' + assert row.msg_dict == {'value': 'foo'} + writer.write_entry(row.to_entry()) + + # Time-series data + sel = protobag.SelectionBuilder.select_window(topics=['my_t1']) + entry = None + for e in bag.iter_entries(selection=sel): + entry = e + break + row = protobag.DictRowEntry.from_entry(entry) + assert row.type_url == 'type.googleapis.com/protobag.StdMsg.String' + assert row.msg_dict == {'value': '0'} + assert row.topic == 'my_t1' + assert row.timestamp == protobag.to_pb_timestamp(0) + writer.write_entry(row.to_entry()) + + # Raw data + entry = bag.get_entry("raw_data") + row = protobag.DictRowEntry.from_entry(entry) + assert row.entryname == "raw_data" 
+ assert row.type_url == '' + assert row.msg_dict == {'protobag_raw_entry_bytes': b"i am a raw string"} + writer.write_entry(row.to_entry()) + + writer.close() diff --git a/python/requirements.txt b/python/requirements.txt index bc172f6..35a8bdf 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1 +1,3 @@ -protobuf>=3.11.3 \ No newline at end of file +attrs +protobuf>=3.11.3 +six diff --git a/python/setup.cfg b/python/setup.cfg new file mode 100644 index 0000000..65f8181 --- /dev/null +++ b/python/setup.cfg @@ -0,0 +1,5 @@ +[aliases] +test="pytest" + +[tool:pytest] +addopts = -v --durations=0 diff --git a/python/setup.py b/python/setup.py index 01fd4c4..8ba224b 100644 --- a/python/setup.py +++ b/python/setup.py @@ -14,6 +14,12 @@ with open('protobag_version.txt', 'r') as f: PROTOBAG_VERSION = f.readlines()[0].strip() +with open('protobag/__init__.py') as f: + import re + v = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", f.read(), re.M).groups()[0] + assert v == PROTOBAG_VERSION, \ + ("Please make protobag/__init__.py __version__ match protobag_version.txt" + "%s != %s" % (v, PROTOBAG_VERSION)) ## Based upon https://github.com/pybind/cmake_example/blob/11a644072b12ad78352b6e6649db9dfe7f406676/setup.py#L1