Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
121 commits
Select commit Hold shift + click to select a range
17e567f
skeleton
scpwais Apr 23, 2020
dc9a101
linux works
scpwais Apr 23, 2020
c472cf4
save
scpwais Apr 25, 2020
ca28fcf
save fixme
scpwais Apr 25, 2020
1b835b3
save
scpwais Apr 26, 2020
4023f66
save
scpwais Apr 26, 2020
ff977f6
save
scpwais Apr 26, 2020
86da4d2
save
scpwais Apr 26, 2020
0461b90
remove submodules
scpwais Apr 26, 2020
6e9af67
remove submodules
scpwais Apr 26, 2020
53a6794
save
scpwais Apr 26, 2020
73f39af
idk
scpwais Apr 27, 2020
dc111eb
xcode build and run demo test
scpwais Apr 29, 2020
43d2ac0
save
scpwais Apr 30, 2020
838ffa5
pybind stuff basic
scpwais Apr 30, 2020
db77632
save
scpwais Apr 30, 2020
a0dfd5d
xcode pybind works
scpwais Apr 30, 2020
dbb9d80
use xcode pybind
scpwais May 1, 2020
51224c5
fixup
scpwais May 1, 2020
c6484b5
iface
scpwais May 1, 2020
05a7d66
pybind draft
scpwais May 1, 2020
a178765
save
scpwais May 1, 2020
8665daa
save
scpwais May 2, 2020
abf027e
save
scpwais May 2, 2020
6e35e1c
save
scpwais May 2, 2020
e6b4136
sugar
scpwais May 14, 2020
1b21b54
sugar
scpwais May 14, 2020
1a8738d
save
scpwais May 16, 2020
453fc7c
save
scpwais May 16, 2020
597e757
Merge branch 'v0.0.1' of github.com:StandardCyborg/protobag into v0.0.1
scpwais May 16, 2020
5054ad7
hell yes
scpwais May 16, 2020
bd4fd51
save
scpwais May 17, 2020
c6c7787
rename
scpwais May 17, 2020
fe12539
save
scpwais May 19, 2020
6ed4b89
merged
scpwais May 19, 2020
782bd66
save
scpwais May 20, 2020
b99d294
save
scpwais May 21, 2020
310f11b
save
scpwais May 23, 2020
a24b3e1
save
scpwais May 24, 2020
5272b2f
save
scpwais May 24, 2020
2bed7e6
save
scpwais May 24, 2020
a16c6a0
save
scpwais May 24, 2020
0e0d812
save
scpwais May 24, 2020
8586866
save
scpwais May 25, 2020
1d6b609
make reading easier
scpwais May 29, 2020
ebb3288
save
scpwais May 30, 2020
c15ecd7
save
scpwais May 30, 2020
422c28f
save
scpwais May 30, 2020
dd3f941
save
scpwais May 30, 2020
13db4d7
save
scpwais May 30, 2020
4355c1c
save
scpwais May 30, 2020
b2f7e16
save
scpwais May 30, 2020
3df5531
save
scpwais May 30, 2020
fe12f8a
save
scpwais May 30, 2020
f6133c6
oops use matching protobuf
scpwais May 30, 2020
4664394
Merge branch 'v0.0.2' of github.com:StandardCyborg/protobag into v0.0.2
scpwais May 30, 2020
76463d1
cleanup from self review
scpwais Jun 4, 2020
b36f4d9
Merge branch 'v0.0.2' of github.com:StandardCyborg/protobag into v0.0.2
scpwais Jun 4, 2020
5fc969a
save
scpwais Jun 5, 2020
9c05e1b
save
scpwais Jun 7, 2020
163becc
save
scpwais Jun 7, 2020
30e59bb
save
scpwais Jun 7, 2020
c0b3716
draft of time sync compiles
scpwais Jun 7, 2020
3e49c0f
memory archive for testing
scpwais Jun 8, 2020
e60c468
tests pass needs cleanup
scpwais Jun 8, 2020
60020c8
fix xcode warnings
scpwais Jun 13, 2020
fbc619b
cleanup
scpwais Jun 15, 2020
b2f7413
save
scpwais Jun 17, 2020
271c8ad
save
scpwais Jun 18, 2020
c8aca5a
save
scpwais Jun 18, 2020
68fcdb0
save
scpwais Jun 18, 2020
13cfa59
save
scpwais Jun 18, 2020
1c7cd5d
save
scpwais Jun 18, 2020
b4c397b
save
scpwais Jun 18, 2020
565d92b
save
scpwais Jun 18, 2020
e2cd034
save
scpwais Jun 18, 2020
ffd8da1
add versioning and test
scpwais Jul 1, 2020
ef3a2ad
enable circleci
scpwais Jul 1, 2020
5e83641
enable circleci
scpwais Jul 1, 2020
9eb4e83
enable circleci
scpwais Jul 1, 2020
8fd289c
skip tests that dont work on ci
scpwais Jul 1, 2020
3e7f1ab
badge
scpwais Jul 2, 2020
60c21de
badge
scpwais Jul 2, 2020
736daab
save
scpwais Jul 3, 2020
083f80c
save
scpwais Jul 4, 2020
b8115e5
rw works
scpwais Jul 4, 2020
e675c2c
python reading with dynamic message decode works
scpwais Jul 4, 2020
19d9741
fix more indexing stuff
scpwais Jul 4, 2020
5cf7051
fix build and version embed
scpwais Jul 4, 2020
282004e
python writing example and works
scpwais Jul 6, 2020
d54fe0f
save
scpwais Jul 17, 2020
203af58
save
scpwais Jul 17, 2020
174e29c
cleanup tests and pyarrow demo
scpwais Jul 21, 2020
25c9d21
save
scpwais Jul 21, 2020
3740c7a
save
scpwais Jul 21, 2020
9469421
save
scpwais Jul 21, 2020
280ccfe
save
scpwais Jul 21, 2020
aed9ea0
save
scpwais Jul 21, 2020
7ce8633
no more toofz
scpwais Jul 22, 2020
9e293fb
google colab debugging
scpwais Jul 22, 2020
a68d20f
colab fix.. just dont use exceptions at all
scpwais Jul 23, 2020
60a0e0a
save
scpwais Jul 24, 2020
ffe365c
draft python timesync
scpwais Jul 30, 2020
293ae8e
TimeSync python fixes and notebook demo
scpwais Jul 31, 2020
036796a
save
scpwais Aug 5, 2020
d641580
fix up libarchive spec
scpwais Aug 5, 2020
bf321a9
bump v 3
scpwais Aug 5, 2020
a3edb3a
bump v 3
scpwais Aug 5, 2020
6e3b141
save
scpwais Aug 7, 2020
c6e2f9e
Merge branch 'v0.0.3' of github.com:StandardCyborg/protobag into v0.0.3
scpwais Aug 7, 2020
6c3f2d2
update tests
scpwais Aug 7, 2020
6376a3b
update tests
scpwais Aug 7, 2020
b368dd6
move demo notebook
scpwais Aug 17, 2020
40facd9
clean up and test
scpwais Aug 18, 2020
fc99189
fix build
scpwais Aug 18, 2020
85d90bb
fix tests
scpwais Aug 19, 2020
9a93e60
dont index empty topics
scpwais Aug 19, 2020
eae5a7b
better fix
scpwais Aug 19, 2020
9f01a6f
docs
scpwais Aug 19, 2020
bf7276e
merged
scpwais Aug 20, 2020
489921d
cleanup
scpwais Aug 20, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,6 @@ dist

# Don't put Mac stuff in the docker build env
cocoa

c\+\+/build

1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ eggs
build
dist
*~
test_build
4 changes: 2 additions & 2 deletions ProtobagCocoa.podspec.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "ProtobagCocoa",
"version": "0.0.2",
"version": "0.0.3",
"summary": "Protobag: an archive of string-serialized Protobufs",
"homepage": "https://github.com/StandardCyborg/protobag",
"license": "Apache 2",
Expand All @@ -10,7 +10,7 @@
"cocoapods_version": ">= 1.0",
"source": {
"git": "[email protected]:StandardCyborg/protobag.git",
"tag": "v0.0.2"
"tag": "v0.0.3"
},
"public_header_files": [
"c++/protobag/**/*.{hpp,h}"
Expand Down
102 changes: 90 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
# Protobag: A bag o' String-serialized Protobuf Messages
# Protobag: A bag o' Serialized Protobuf Messages
_With built-in support for time-series data_

[![Build Status](https://circleci.com/gh/StandardCyborg/protobag.svg?style=svg&circle-token=ed56e2ec32789fa3e5f664bc8ea73c55e119de4b)](https://app.circleci.com/pipelines/github/StandardCyborg/protobag)

## Quickstart & Demo

See [this python noteboook](examples/notebook-demo/protobag-demo-full.ipynb)
for a demo of key features.

Or you can drop into a Protobag development shell using a clone of this repo
and Docker; FMI see:
`./pb-dev --help`

## Summary

[Protobuf](https://github.com/protocolbuffers/protobuf) is a popular data
Expand Down Expand Up @@ -60,31 +69,100 @@ wrappers over `libarchive`. See
[ArchiveUtil](c++/protobag/protobag/ArchiveUtil.hpp).


## Development


## Discussion of Key Features

### Protobag indexes Protobuf message Descriptors

By default, `protobag` not only saves those messages but also
**indexes Protobuf message descriptors** so that your `protobag` readers don't
need your proto schemas to decode your messages.

#### Wat?
In order to deserialize a Protobuf message, typically you need
`protoc`-generated code for that message type (and you need `protoc`-generated
code for your specific programming language). This `protoc`-generated code is
engineered for efficiency and provides a clean API for accessing message
attributes. But what if you don't have that `protoc`-generated code? Or you
don't even have the `.proto` message definitions to generate such code?

In Protobuf version 3.x, the authors added official support for
[the self-describing message paradigm](https://developers.google.com/protocol-buffers/docs/techniques).
Now a user can serialize not just a message but Protobuf Descriptor data that
describes the message schema and enables deserialzing the message
*without protoc-generated code*-- all you need is the `protobuf` library itself.
(This is a core feature of other serialization libraries
[like Avro](http://avro.apache.org/docs/1.6.1/)).

Note: dynamic message decoding is slower than using `protoc`-generated code.
Furthermore, the `protoc`-generated code makes defensive programming a bit
easier. You probably want to use the `protoc`-generated code for your
messages if you can.

#### Protobag enables all messages to be self-describing messages
While Protobuf includes tools for using self-describing messages, the feature
isn't simply a toggle in your `.proto` file, and the API is a bit complicated
(because Google claims they don't use it much internally).

TODO: quickstart and stuff
`protobag` automatically indexes the Protobuf Descriptor data for your messages
at write time. (And you can disable this indexing if so desired). At read
time, `protobag` automatically uses this indexed Descriptor data if the user
reading your `protobag` file lacks the needed `protoc`-generated code to
deserialize a message.

TODO: bag index doc
What if a message type evolves? `protobag` indexes each distinct message type
for each write session. If you change your schema for a message type between
write sessions, `protobag` will have indexed both schemas and will use the
proper one for dynamic deserialization.

TODO: "treat as a map" API
#### For More Detail

For Python, see:
* `protobag.build_fds_for_msg()` -- This method collects the descriptor data
needed for any Protobuf Message instance or class.
* `protobag.DynamicMessageFactory::dynamic_decode()` -- This method uses
standard Protobuf APIs to deserialize messages given only Protobuf
Descriptor data.

coming soon
For C++, see:
* `BagIndexBuilder::DescriptorIndexer::Observe()` -- This method collects the
descriptor data needed for any Protobuf Message instance or class.
* `DynamicMsgFactory` -- This utility uses uses standard Protobuf APIs to
deserialize messages given only Protobuf Descriptor data.


## Cocoa Pods

You can integrate Protobag into an iOS or OSX application using the CocoaPod `ProtobagCocoa.podspec.json`
podspec included in this repo. Protobag is explicitly designed to be cross-platform (and utilize only C++
features friendly to iOS) to facilitate such interoperability.

Note: before pushing, be sure to edit the "version" field of the `ProtobagCocoa.podspec.json` file
to match the version you're pushing.
```
indocker % cd /opt/protobag/cxx
indocker % mkdir -p build && cd build
indocker % cmake -DCMAKE_BUILD_TYPE=DEBUG ..
indocker % make -j `nproc` && ./protobag_test --gtest_filter=DemoTest*
pod repo push SCCocoaPods ProtobagCocoa.podspec.json --use-libraries --verbose --allow-warnings
```

## C++ Build

Use the existing CMake-based build system.

In c++ subdir:
```
pod repo push SCCocoaPods ProtobagCocoa.podspec.json --use-libraries --verbose --allow-warnings
mkdir build && cd build
cmake ..
make -j
make test
```

## Python Build

in python subdir:
The Python library includes a wheel that leverages the above C++ CMake build system.

In python subdir:
```
python3 setup.py bdist_wheel
```
for both linux and xcode

46 changes: 20 additions & 26 deletions c++/protobag/protobag/BagIndexBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
#include <tuple>
#include <unordered_set>

#include <iostream>

#include <google/protobuf/descriptor.h>
#include <google/protobuf/util/time_util.h>

Expand Down Expand Up @@ -156,40 +154,36 @@ BagIndex_TopicStats &BagIndexBuilder::GetMutableStats(const std::string &topic)
return topic_to_stats[topic];
}

// uint64_t BagIndexBuilder::GetNextFilenum(const std::string &topic) {
// const auto &stats = GetMutableStats(topic);
// return stats.n_messages() + 1;
// }


void BagIndexBuilder::Observe(
const Entry &entry, const std::string &final_entryname) {

const std::string entryname =
final_entryname.empty() ? entry.entryname : final_entryname;

if (_do_timeseries_indexing) {
const auto &maybe_tt = entry.GetTopicTime();
if (maybe_tt.has_value()) {
TopicTime tt = *maybe_tt;
tt.set_entryname(entryname);

{
auto &stats = GetMutableStats(tt.topic());
stats.set_n_messages(stats.n_messages() + 1);
}
if (entry.IsStampedMessage()) {
const auto &maybe_tt = entry.GetTopicTime();
if (maybe_tt.has_value()) {
TopicTime tt = *maybe_tt;
tt.set_entryname(entryname);

{
auto &stats = GetMutableStats(tt.topic());
stats.set_n_messages(stats.n_messages() + 1);
}

{
if (!_tto) {
_tto.reset(new TopicTimeOrderer());
{
if (!_tto) {
_tto.reset(new TopicTimeOrderer());
}
_tto->Observe(tt);
}
_tto->Observe(tt);
}

{
const auto &t = tt.timestamp();
*_index.mutable_start() = std::min(_index.start(), t);
*_index.mutable_end() = std::max(_index.end(), t);
{
const auto &t = tt.timestamp();
*_index.mutable_start() = std::min(_index.start(), t);
*_index.mutable_end() = std::max(_index.end(), t);
}
}
}
}
Expand Down
13 changes: 12 additions & 1 deletion c++/protobag/protobag/Entry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,18 @@ std::string Entry::ToString() const {
// }

bool MaybeEntry::IsNotFound() const {
return error == archive::Archive::ReadStatus::EntryNotFound().error;
static const std::string kIsNotFoundPrefix =
archive::Archive::ReadStatus::EntryNotFound().error + ": ";
return error.find(kIsNotFoundPrefix) == 0;
}

MaybeEntry MaybeEntry::NotFound(const std::string &entryname) {
MaybeEntry m;
m.error = fmt::format(
"{}: {}",
archive::Archive::ReadStatus::EntryNotFound().error,
entryname);
return m;
}

std::string GetTopicFromEntryname(const std::string &entryname) {
Expand Down
12 changes: 11 additions & 1 deletion c++/protobag/protobag/Entry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ struct Entry {

// == Raw Mode ================================

// Create a raw entry from a Protobuf message instance (force-skips indexing)
template <typename MT>
static Result<Entry> CreateRaw(
const std::string &entryname,
Expand All @@ -149,6 +150,7 @@ struct Entry {
};
}

// Create a raw entry from raw bytes
static Entry CreateRawFromBytes(
const std::string &entryname,
std::string &&raw_msg_contents) {
Expand Down Expand Up @@ -249,14 +251,15 @@ struct Entry {
return
IsA<StampedMessage>() || (
// An unpacked StampedDatum is OK too
GetTopicTime().has_value());
HasTopic());
}

bool IsRaw() const {
return msg.type_url().empty();
}

std::optional<TopicTime> GetTopicTime() const;
bool HasTopic() const;

template <typename MT>
Result<MT> GetAs(bool validate_type_url = true) const {
Expand Down Expand Up @@ -306,6 +309,8 @@ struct MaybeEntry : public Result<Entry> {
// See Archive::ReadStatus for definition; this can be an acceptible error
bool IsNotFound() const;

static MaybeEntry NotFound(const std::string &entryname);

static MaybeEntry Err(const std::string &s) {
MaybeEntry m; m.error = s; return m;
}
Expand Down Expand Up @@ -361,4 +366,9 @@ inline std::optional<TopicTime> Entry::GetTopicTime() const {
}
}

inline bool Entry::HasTopic() const {
auto maybe_tt = GetTopicTime();
return maybe_tt.has_value() && !maybe_tt->topic().empty();
}

} /* namespace protobag */
31 changes: 24 additions & 7 deletions c++/protobag/protobag/ReadSession.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ MaybeEntry ReadSession::ReadEntryFrom(

const auto maybe_bytes = archive->ReadAsStr(entryname);
if (maybe_bytes.IsEntryNotFound()) {
return MaybeEntry::Err(maybe_bytes.error);
return MaybeEntry::NotFound(entryname);
} else if (!maybe_bytes.IsOk()) {
return MaybeEntry::Err(
fmt::format("Read error for {}: {}", entryname, maybe_bytes.error));
Expand All @@ -56,7 +56,7 @@ MaybeEntry ReadSession::ReadEntryFrom(

auto maybe_any =
PBFactory::LoadFromContainer<google::protobuf::Any>(*maybe_bytes.value);
// do we need to handle text format separately ? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// TODO maybe handle text format separately ?
if (!maybe_any.IsOk()) {
return MaybeEntry::Err(fmt::format(
"Could not read protobuf from {}: {}", entryname, maybe_any.error));
Expand Down Expand Up @@ -103,7 +103,7 @@ MaybeEntry ReadSession::GetNext() {
_archive, entryname, _plan.raw_mode, _spec.unpack_stamped_messages);
if (maybe_entry.IsNotFound()) {
if (_plan.require_all) {
return MaybeEntry::Err(fmt::format("Entry not found: {}", entryname));
return MaybeEntry::NotFound(entryname);
} else {
return GetNext();
}
Expand All @@ -126,6 +126,21 @@ Result<BagIndex> ReadSession::GetIndex(const std::string &path) {
return ReadLatestIndex(rp->_archive);
}

Result<std::vector<std::string>> ReadSession::GetAllTopics(const std::string &path) {
auto maybe_index = GetIndex(path);
if (!maybe_index.IsOk()) {
return {.error = maybe_index.error};
}

const BagIndex &index = *maybe_index.value;
std::vector<std::string> topics;
topics.reserve(index.topic_to_stats_size());
for (const auto &entry : index.topic_to_stats()) {
topics.push_back(entry.first);
}
return {.value = topics};
}

Result<BagIndex> ReadSession::ReadLatestIndex(archive::Archive::Ptr archive) {
if (!archive) {
return {.error = "No archive to read"};
Expand Down Expand Up @@ -171,11 +186,12 @@ Result<ReadSession::ReadPlan> ReadSession::GetEntriesToRead(
return {.error = "No archive to read"};
}

auto maybe_index = ReadLatestIndex(archive); // TODO support multiple indices ~~~~~~~~~~~~~~~~
auto maybe_index = ReadLatestIndex(archive); // TODO support multiple indices
if (!maybe_index.IsOk()) {
// TODO: support reindexing
// // Then create one!
// maybe_index = GetReindexed(archive);
return {.error = "Unindexed protobag not supported right now"}; // ~~~~~~~~~~~~~~~~~~~~~~~~~
return {.error = "Unindexed protobag not currently supported"};
}

if (!maybe_index.IsOk()) {
Expand Down Expand Up @@ -291,12 +307,13 @@ Result<ReadSession::ReadPlan> ReadSession::GetEntriesToRead(
if (window.has_end() && (window.end() < tt.timestamp())) {
continue;
}
// std::cout << "entries_to_read: " << tt.entryname() << std::endl;

entries_to_read.push(tt.entryname());
}
return {.value = ReadPlan{
.entries_to_read = entries_to_read,
.require_all = false, // TODO should we report if index and archive don't match? ~~~~~~~~~~~~~~
.require_all = false,
// TODO should we report if index and archive don't match?
.raw_mode = false,
}};

Expand Down
Loading