datafold
diff --git a/‎.github/workflows/ci.yml
Lines changed: 5 additions & 2 deletions b/‎.github/workflows/ci.yml
Lines changed: 5 additions & 2 deletions
diff --git a/‎README.md
Lines changed: 3 additions & 0 deletions b/‎README.md
Lines changed: 3 additions & 0 deletions
diff --git a/‎data_diff/databases/__init__.py
Lines changed: 1 addition & 0 deletions b/‎data_diff/databases/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎data_diff/databases/connect.py
Lines changed: 5 additions & 0 deletions b/‎data_diff/databases/connect.py
Lines changed: 5 additions & 0 deletions
diff --git a/‎data_diff/databases/vertica.py
Lines changed: 125 additions & 0 deletions b/‎data_diff/databases/vertica.py
Lines changed: 125 additions & 0 deletions
diff --git a/‎dev/dev.env
Lines changed: 10 additions & 0 deletions b/‎dev/dev.env
Lines changed: 10 additions & 0 deletions
diff --git a/‎docker-compose.yml
Lines changed: 21 additions & 0 deletions b/‎docker-compose.yml
Lines changed: 21 additions & 0 deletions
@@ -34,7 +34,7 @@ jobs:
           python-version: ${{ matrix.python-version }}
 
       - name: Build the stack
-        run: docker-compose up -d mysql postgres presto trino clickhouse
+        run: docker-compose up -d mysql postgres presto trino clickhouse vertica
 
       - name: Install Poetry
         run: pip install poetry
@@ -48,4 +48,7 @@ jobs:
             DATADIFF_PRESTO_URI: '${{ secrets.DATADIFF_PRESTO_URI }}'
             DATADIFF_TRINO_URI: '${{ secrets.DATADIFF_TRINO_URI }}'
             DATADIFF_CLICKHOUSE_URI: 'clickhouse://clickhouse:Password1@localhost:9000/clickhouse'
-        run: poetry run unittest-parallel -j 16
+            DATADIFF_VERTICA_URI: 'vertica://vertica:Password1@localhost:5433/vertica'
+        run: |
+          chmod +x tests/waiting_for_stack_up.sh
+          ./tests/waiting_for_stack_up.sh && poetry run unittest-parallel -j 16
@@ -133,6 +133,7 @@ $ data-diff \
 | Databricks    | `databricks://<http_path>:<access_token>@<server_hostname>/<catalog>/<schema>`                                                      |  💛    |
 | Trino         | `trino://<username>:<password>@<hostname>:8080/<database>`                                                                          |  💛    |
 | Clickhouse    | `clickhouse://<username>:<password>@<hostname>:9000/<database>`                                                                     |  💛    |
+| Vertica       | `vertica://<username>:<password>@<hostname>:5433/<database>`                                                                        |  💛    |
 | ElasticSearch |                                                                                                                                     |  📝    |
 | Planetscale   |                                                                                                                                     |  📝    |
 | Pinot         |                                                                                                                                     |  📝    |
@@ -177,6 +178,8 @@ While you may install them manually, we offer an easy way to install them along
 
 - `pip install 'data-diff[clickhouse]'`
 
+- `pip install 'data-diff[vertica]'`
+
 - For BigQuery, see: https://pypi.org/project/google-cloud-bigquery/
 
 
 
@@ -10,5 +10,6 @@
 from .databricks import Databricks
 from .trino import Trino
 from .clickhouse import Clickhouse
+from .vertica import Vertica
 
 from .connect import connect_to_uri
@@ -15,6 +15,7 @@
 from .databricks import Databricks
 from .trino import Trino
 from .clickhouse import Clickhouse
+from .vertica import Vertica
 
 
 @dataclass
@@ -87,6 +88,7 @@ def match_path(self, dsn):
     ),
     "trino": MatchUriPath(Trino, ["catalog", "schema"], help_str="trino://<user>@<host>/<catalog>/<schema>"),
     "clickhouse": MatchUriPath(Clickhouse, ["database?"], help_str="clickhouse://<user>:<pass>@<host>/<database>"),
+    "vertica": MatchUriPath(Vertica, ["database?"], help_str="vertica://<user>:<pass>@<host>/<database>"),
 }
 
 
@@ -113,6 +115,7 @@ def connect_to_uri(db_uri: str, thread_count: Optional[int] = 1) -> Database:
     - databricks
     - trino
     - clickhouse
+    - vertica
     """
 
     dsn = dsnparse.parse(db_uri)
@@ -200,6 +203,8 @@ def connect(db_conf: Union[str, dict], thread_count: Optional[int] = 1) -> Datab
     - presto
     - databricks
     - trino
+    - clickhouse
+    - vertica
     """
     if isinstance(db_conf, str):
         return connect_to_uri(db_conf, thread_count)
 
@@ -0,0 +1,125 @@
+from typing import List
+
+from ..utils import match_regexps
+from .base import (
+    CHECKSUM_HEXDIGITS,
+    MD5_HEXDIGITS,
+    TIMESTAMP_PRECISION_POS,
+    ConnectError,
+    DbPath,
+    ColType,
+    ColType_UUID,
+    ThreadedDatabase,
+    import_helper,
+)
+from .database_types import Decimal, Float, FractionalType, Integer, TemporalType, Text, Timestamp, TimestampTZ
+
+
+@import_helper("vertica")
+def import_vertica():
+    """Import the ``vertica_python`` driver when called and return the module."""
+    import vertica_python as driver_module
+
+    return driver_module
+
+
+class Vertica(ThreadedDatabase):
+    """Vertica driver: connects through ``vertica_python`` and builds Vertica SQL snippets."""
+
+    default_schema = "public"
+
+    # Maps the bare type names handled by the base _parse_type() to column type classes.
+    TYPE_CLASSES = {
+        # Timestamps
+        "timestamp": Timestamp,
+        "timestamptz": TimestampTZ,
+        # Numbers
+        "numeric": Decimal,
+        "int": Integer,
+        "float": Float,
+        # Text
+        "char": Text,
+        "varchar": Text,
+    }
+
+    # Passed as ``rounds`` to the timestamp column types built in _parse_type().
+    ROUNDS_ON_PREC_LOSS = True
+
+    def __init__(self, *, thread_count, **kw):
+        """Keep the connection keyword arguments and initialise the ThreadedDatabase base.
+
+        Args:
+            thread_count: forwarded to ``ThreadedDatabase.__init__``.
+            **kw: keyword arguments later passed verbatim to ``vertica_python.connect``.
+        """
+        self._args = kw
+        # Always overrides any caller-supplied AUTOCOMMIT value.
+        self._args["AUTOCOMMIT"] = False
+
+        super().__init__(thread_count=thread_count)
+
+    def create_connection(self):
+        """Open a new connection using the stored keyword arguments.
+
+        Raises:
+            ConnectError: when the driver raises ``vertica_python.errors.ConnectionError``.
+        """
+        vertica = import_vertica()
+        try:
+            return vertica.connect(**self._args)
+        except vertica.errors.ConnectionError as e:
+            raise ConnectError(*e.args) from e
+
+    def _parse_type(
+        self,
+        table_path: DbPath,
+        col_name: str,
+        type_repr: str,
+        datetime_precision: int = None,
+        numeric_precision: int = None,
+        numeric_scale: int = None,
+    ) -> ColType:
+        """Translate a catalog type string into a column type object.
+
+        Parameterised timestamp, numeric and (var)char representations are
+        handled here; anything else is delegated to the base implementation.
+
+        Args:
+            table_path: path of the table that owns the column.
+            col_name: name of the column.
+            type_repr: type string as reported by the catalog query.
+            datetime_precision: datetime precision reported by the catalog.
+            numeric_precision: numeric precision reported by the catalog.
+            numeric_scale: numeric scale reported by the catalog.
+
+        Returns:
+            The column type matching ``type_repr``.
+        """
+        timestamp_regexps = {
+            r"timestamp\(?(\d?)\)?": Timestamp,
+            r"timestamptz\(?(\d?)\)?": TimestampTZ,
+        }
+        for m, t_cls in match_regexps(timestamp_regexps, type_repr):
+            # No explicit precision in the type string: fall back to 6 digits.
+            precision = int(m.group(1)) if m.group(1) else 6
+            return t_cls(precision=precision, rounds=self.ROUNDS_ON_PREC_LOSS)
+
+        number_regexps = {
+            r"numeric\((\d+),(\d+)\)": Decimal,
+        }
+        for m, n_cls in match_regexps(number_regexps, type_repr):
+            # Only the scale is used to build the type; the declared precision is ignored.
+            _prec, scale = map(int, m.groups())
+            return n_cls(scale)
+
+        string_regexps = {
+            r"varchar\((\d+)\)": Text,
+            r"char\((\d+)\)": Text,
+        }
+        for m, n_cls in match_regexps(string_regexps, type_repr):
+            # The declared length is not needed to build the text type.
+            return n_cls()
+
+        # Forward numeric_scale as well, so the fallback sees every piece of
+        # column metadata this method received (it was previously dropped).
+        return super()._parse_type(
+            table_path, col_name, type_repr, datetime_precision, numeric_precision, numeric_scale
+        )
+
+    def select_table_schema(self, path: DbPath) -> str:
+        """Return the SQL that lists name, type, precision and scale of each column of ``path``."""
+        schema, table = self._normalize_table_path(path)
+
+        return (
+            "SELECT column_name, data_type, datetime_precision, numeric_precision, numeric_scale "
+            "FROM V_CATALOG.COLUMNS "
+            f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
+        )
+
+    def quote(self, s: str):
+        """Wrap an identifier in double quotes."""
+        return f'"{s}"'
+
+    def concat(self, l: List[str]) -> str:
+        """Join SQL expressions with the ``||`` concatenation operator."""
+        return " || ".join(l)
+
+    def md5_to_int(self, s: str) -> str:
+        """Return SQL that turns the MD5 of ``s`` into a NUMERIC(38, 0) value.
+
+        The hex digest is cut with SUBSTRING starting at position
+        ``1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS`` before being converted
+        from hex to an integer.
+        """
+        return f"CAST(HEX_TO_INTEGER(SUBSTRING(MD5({s}), {1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS})) AS NUMERIC(38, 0))"
+
+    def to_string(self, s: str) -> str:
+        """Return SQL that casts ``s`` to VARCHAR."""
+        return f"CAST({s} AS VARCHAR)"
+
+    def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
+        """Return SQL that renders ``value`` as 'YYYY-MM-DD HH24:MI:SS.US' text.
+
+        When ``coltype.rounds`` is set, the value is cast to
+        TIMESTAMP(``coltype.precision``) before formatting. Otherwise it is
+        formatted at precision 6, cut with LEFT to the column precision and
+        right-padded with '0' back to the 6-digit width.
+        """
+        if coltype.rounds:
+            return f"TO_CHAR({value}::TIMESTAMP({coltype.precision}), 'YYYY-MM-DD HH24:MI:SS.US')"
+
+        timestamp6 = f"TO_CHAR({value}::TIMESTAMP(6), 'YYYY-MM-DD HH24:MI:SS.US')"
+        return (
+            f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
+        )
+
+    def normalize_number(self, value: str, coltype: FractionalType) -> str:
+        """Return SQL that casts ``value`` to NUMERIC(38, ``coltype.precision``) and then to text."""
+        return self.to_string(f"CAST({value} AS NUMERIC(38, {coltype.precision}))")
+
+    def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
+        """Return SQL that casts ``value`` to VARCHAR and trims it."""
+        # Trim doesn't work on CHAR type
+        return f"TRIM(CAST({value} AS VARCHAR))"
@@ -11,3 +11,13 @@ CLICKHOUSE_USER=clickhouse
 CLICKHOUSE_PASSWORD=Password1
 CLICKHOUSE_DB=clickhouse
 CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT=1
+
+# Vertica credentials
+APP_DB_USER=vertica
+APP_DB_PASSWORD=Password1
+VERTICA_DB_NAME=vertica
+
+# Leave VMART_DIR and VMART_ETL_SCRIPT empty to skip generating the VMart sample/demo data.
+# About VMart: https://www.vertica.com/docs/9.2.x/HTML/Content/Authoring/GettingStartedGuide/IntroducingVMart/IntroducingVMart.htm
+VMART_DIR=
+VMART_ETL_SCRIPT=
@@ -97,10 +97,31 @@ services:
         networks:
             - local
 
+    vertica:
+      container_name: vertica
+      image: vertica/vertica-ce:12.0.0-0
+      restart: always
+      volumes:
+          - vertica-data:/data:delegated
+      ports:
+        - '5433:5433'
+        - '5444:5444'
+      expose:
+        - '5433'
+        - '5444'
+      env_file:
+        - dev/dev.env
+      tty: true
+      networks:
+      - local
+
+
+
 volumes:
   postgresql-data:
   mysql-data:
   clickhouse-data:
+  vertica-data:
 
 networks:
   local: