Skip to content

Add example data for ProtoBuf format examples #289

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 130 additions & 0 deletions formats/ProtoBuf/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# Generating `protobuf_messages.bin`

We'll use Python to serialize some example data into `protobuf_messages.bin`.

Run the following command to generate a Python file named `schemafile_pb2.py` in the same directory as `schemafile.proto`.
This file contains the Python classes that represent your `MessageType` Protobuf message:

```
protoc --python_out=. schemafile.proto
```

Now, create a new Python file named `generate_protobuf_data.py`, in the same directory as `schemafile_pb2.py`. Paste the following code into it:

```
import schemafile_pb2 # Module generated by 'protoc'
from google.protobuf import text_format
from google.protobuf.internal.encoder import _VarintBytes # Import the internal varint encoder

def create_user_data_message(name, surname, birthDate, phoneNumbers):
    """Build a populated ``MessageType`` Protobuf message for one user.

    Args:
        name: Value for the ``name`` field.
        surname: Value for the ``surname`` field.
        birthDate: Value for the ``birthDate`` field (``uint32`` in
            ``schemafile.proto``).
        phoneNumbers: Iterable of strings appended to the repeated
            ``phoneNumbers`` field.

    Returns:
        The populated ``schemafile_pb2.MessageType`` instance.
    """
    message = schemafile_pb2.MessageType()
    message.name = name
    message.surname = surname
    message.birthDate = birthDate
    # Repeated fields are filled in place rather than assigned.
    message.phoneNumbers.extend(phoneNumbers)
    return message

# The data for our example users
data_to_serialize = [
    {"name": "Aisha", "surname": "Khan", "birthDate": 19920815, "phoneNumbers": ["(555) 247-8903", "(555) 612-3457"]},
    {"name": "Javier", "surname": "Rodriguez", "birthDate": 20001015, "phoneNumbers": ["(555) 891-2046", "(555) 738-5129"]},
    {"name": "Mei", "surname": "Ling", "birthDate": 19980616, "phoneNumbers": ["(555) 956-1834", "(555) 403-7682"]},
]

output_filename = "protobuf_messages.bin"

# Write every message as <varint length><serialized bytes>, i.e. the
# length-delimited layout, so that consecutive messages can be told apart
# when the file is read back.
with open(output_filename, "wb") as f:
    for item in data_to_serialize:
        # Create a Protobuf message instance for the current user
        message = create_user_data_message(
            item["name"],
            item["surname"],
            item["birthDate"],
            item["phoneNumbers"],
        )

        # Serialize the message to its binary wire representation
        serialized_data = message.SerializeToString()

        # Encode the payload length with the Protobuf library's internal
        # varint encoder
        length_prefix = _VarintBytes(len(serialized_data))

        # Length prefix first, then the message bytes
        f.write(length_prefix)
        f.write(serialized_data)

print(f"Protobuf messages (length-delimited) written to {output_filename}")

# --- Optional: Verification (reading back and printing) ---
# For reading back, we'll also use the internal Protobuf decoder for varints.
from google.protobuf.internal.decoder import _DecodeVarint32

print("\n--- Verifying by reading back ---")
with open(output_filename, "rb") as f:
    # Read the whole file into a buffer for easier varint decoding
    buf = f.read()

n = 0  # Current read offset into buf
while n < len(buf):
    # Decode the varint length prefix; the decoder also returns the offset
    # of the first byte after the prefix
    msg_len, n = _DecodeVarint32(buf, n)

    # Extract the message data
    message_data = buf[n:n + msg_len]
    if len(message_data) < msg_len:
        # Slicing silently shortens at end-of-buffer, so report a truncated
        # file explicitly instead of parsing a partial message.
        raise ValueError(
            f"Truncated message at offset {n}: expected {msg_len} bytes, "
            f"found {len(message_data)}"
        )
    n += msg_len

    # Parse the message and print it in text format
    decoded_message = schemafile_pb2.MessageType()
    decoded_message.ParseFromString(message_data)
    print(text_format.MessageToString(decoded_message, as_utf8=True))
```

Now run the script from the command line. We recommend running it inside a Python virtual environment, which you can create with `uv`, for example:

```
uv venv proto-venv
source proto-venv/bin/activate
```

You will need to install the `protobuf` Python library:

```
uv pip install --upgrade protobuf
```

```
python generate_protobuf_data.py
```

Create a ClickHouse table matching the schema:

```
-- Create the target database if it does not exist yet.
CREATE DATABASE IF NOT EXISTS test;
-- Column names and types mirror the fields of `MessageType` in
-- schemafile.proto (string -> String, uint32 -> UInt32,
-- repeated string -> Array(String)).
CREATE TABLE IF NOT EXISTS test.protobuf_messages (
name String,
surname String,
birthDate UInt32,
phoneNumbers Array(String)
)
ENGINE = MergeTree()
-- An empty tuple means the table has no sorting key.
ORDER BY tuple()
```

Insert the data into ClickHouse:

```
cat protobuf_messages.bin | clickhouse-client --query "INSERT INTO test.protobuf_messages SETTINGS format_schema='schemafile:MessageType' FORMAT Protobuf"
```

You can now read the data back:

```
SELECT * FROM test.protobuf_messages FORMAT Protobuf SETTINGS format_schema = 'schemafile:MessageType'
```
36 changes: 36 additions & 0 deletions formats/ProtoBuf/generate_protobuf_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from google.protobuf import text_format
from google.protobuf.internal.encoder import _VarintBytes  # Internal varint encoder for the length prefix

import schemafile_pb2 # Module generated by 'protoc'

def create_user_data_message(name, surname, birthDate, phoneNumbers):
    """Build a populated ``MessageType`` Protobuf message for one user.

    The field names match ``schemafile.proto``, which declares a single
    message, ``MessageType``, with ``name``, ``surname``, ``birthDate`` and
    the repeated ``phoneNumbers`` field.

    Args:
        name: Value for the ``name`` field.
        surname: Value for the ``surname`` field.
        birthDate: Value for the ``birthDate`` field (``uint32``).
        phoneNumbers: Iterable of strings appended to ``phoneNumbers``.

    Returns:
        The populated ``schemafile_pb2.MessageType`` instance.
    """
    message = schemafile_pb2.MessageType()
    message.name = name
    message.surname = surname
    message.birthDate = birthDate
    # Repeated fields are filled in place rather than assigned.
    message.phoneNumbers.extend(phoneNumbers)
    return message


# The data for our example users
data_to_serialize = [
    {"name": "Aisha", "surname": "Khan", "birthDate": 19920815, "phoneNumbers": ["(555) 247-8903", "(555) 612-3457"]},
    {"name": "Javier", "surname": "Rodriguez", "birthDate": 20001015, "phoneNumbers": ["(555) 891-2046", "(555) 738-5129"]},
    {"name": "Mei", "surname": "Ling", "birthDate": 19980616, "phoneNumbers": ["(555) 956-1834", "(555) 403-7682"]},
]

output_filename = "protobuf_messages.bin"

# Open the binary file in write-binary mode ('wb')
with open(output_filename, "wb") as f:
    for item in data_to_serialize:
        # Create a Protobuf message instance for the current user
        message = create_user_data_message(
            item["name"],
            item["surname"],
            item["birthDate"],
            item["phoneNumbers"],
        )

        # Python messages have no delimited-serialize method, so build the
        # length-delimited record by hand: varint length, then the payload.
        serialized_data = message.SerializeToString()
        f.write(_VarintBytes(len(serialized_data)))
        f.write(serialized_data)
4 changes: 4 additions & 0 deletions formats/ProtoBuf/protobuf_messages.bin
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
2
AishaKhan�� "(555) 247-8903"(555) 612-34578
Javier Rodriguez��� "(555) 891-2046"(555) 738-51290
MeiLing��� "(555) 956-1834"(555) 403-7682
8 changes: 8 additions & 0 deletions formats/ProtoBuf/schemafile.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
syntax = "proto3";

// One example user record. The field names and types correspond to the
// columns of the ClickHouse table `test.protobuf_messages` in the README.
message MessageType {
string name = 1;
string surname = 2;
// NOTE(review): the README example data stores dates as YYYYMMDD integers
// (e.g. 19920815) - confirm this is the intended encoding.
uint32 birthDate = 3;
repeated string phoneNumbers = 4;
};
36 changes: 36 additions & 0 deletions formats/ProtoBuf/schemafile_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.