Changes from all commits
51 commits
46973ea
Commit first draft
maldwg Jun 27, 2025
c685af1
Finish first draft for integrating zeek
maldwg Jun 27, 2025
3c9d5d4
made kafka work with external addresses also
maldwg Jun 27, 2025
3a047b1
complete and fix tests
maldwg Jun 30, 2025
f37b9ec
fix zeek handler for multiple kafka topics
maldwg Jul 1, 2025
a5b26e6
add configuration for dns payload size
maldwg Jul 1, 2025
0f83370
adapt logserver to work with multiple topics
maldwg Jul 1, 2025
ea0acef
Adapt logcollector and batch handler to work with new topic design
maldwg Jul 10, 2025
8070f02
Fix some remaining timestamp key issues
maldwg Jul 10, 2025
cff6d16
begin work on prefilter
maldwg Jul 15, 2025
3f3bf73
fixed collector again
maldwg Jul 21, 2025
7a7abc8
added prefilter for new layout with some technical debt
maldwg Jul 21, 2025
7282b65
add first draft until detector
maldwg Jul 21, 2025
250323d
finish all modules first drafts
maldwg Jul 25, 2025
9ac2678
Add kafka topic exporter
maldwg Jul 28, 2025
ee4164a
finish first draft for zeek!
maldwg Jul 29, 2025
92a514e
first draft for batch-tree
maldwg Aug 5, 2025
f652435
fix dashboard for latencies
maldwg Aug 5, 2025
f368e85
update dashboard and fix loglines count again
maldwg Aug 5, 2025
977fc2b
finish monitoring metrics for latencies
maldwg Aug 5, 2025
75e8d30
format everything
maldwg Aug 8, 2025
5385eda
fix kafka tests
maldwg Aug 11, 2025
79e4a27
begin reworking the logserver teste
maldwg Aug 11, 2025
13e1ba7
finish logserver tests
maldwg Aug 11, 2025
513be82
fix log collector instance tests
maldwg Aug 12, 2025
c6cb54a
adapt prefilter tests for now
maldwg Aug 13, 2025
40096c5
fix loglinehandler and add relevance handler tests
maldwg Aug 13, 2025
aa70188
add prefilter tests
maldwg Aug 13, 2025
a503d36
finish correcting tests for now
maldwg Aug 14, 2025
8c69182
correct last logcollector test
maldwg Aug 14, 2025
f92ea5d
Create abstract detector base class
maldwg Aug 15, 2025
ea68d32
fix inspector tests
maldwg Aug 18, 2025
c050ecb
Update documentation for loglinehandler and check for untested code
maldwg Aug 18, 2025
d84eda5
finish zeek tests and documentation
maldwg Aug 18, 2025
dbb21f4
finish logserver tests
maldwg Aug 18, 2025
3545087
Add comments for utils and finish tests
maldwg Aug 19, 2025
93c301a
Finish documentation and test implementation for the logcollector and…
maldwg Aug 19, 2025
e75c81f
Fix prefilter nd document code
maldwg Aug 19, 2025
4c784c9
Finish inspector tests and documentation
maldwg Aug 20, 2025
0b5188e
Add code documentation for the detector
maldwg Aug 20, 2025
48c9542
adapt the grafana dashboard
maldwg Aug 20, 2025
8195ebc
Finish tests for detector stage
maldwg Aug 20, 2025
3b8f6e4
first adjustments in documentation
maldwg Aug 27, 2025
2b308ea
finish first part of documentation
maldwg Aug 27, 2025
fb117dc
Integrate scaler changes: TODO: correct tests
maldwg Aug 27, 2025
1c38488
Finish documentation adpatations
maldwg Aug 29, 2025
d71a8eb
Finish tests for detector
maldwg Sep 11, 2025
9aef480
fix small zeek configs and detector cheksum
maldwg Sep 12, 2025
6555e81
Run code formatting
maldwg Sep 24, 2025
1d79c02
update gitignore to allow updates to requirements files
maldwg Sep 26, 2025
c21c198
Merge branch 'main' of github.com:Hamstring-NDR/hamstring into featur…
maldwg Oct 27, 2025
3 changes: 3 additions & 0 deletions .gitignore
@@ -325,3 +325,6 @@ cython_debug/
# Others
docs/api/
!/docs/api/index.rst

# requirements.txt
!*/requirements.*.txt
27 changes: 26 additions & 1 deletion README.md
@@ -56,7 +56,7 @@

## About the Project

![Pipeline overview](https://raw.githubusercontent.com/stefanDeveloper/heiDGAF/main/docs/media/heidgaf_overview_detailed.drawio.png?raw=true)
![Pipeline overview](./assets/heidgaf_architecture.svg)

## Getting Started

@@ -276,6 +276,31 @@ This will create a `rules.txt` file containing the innards of the model, explain
<p align="right">(<a href="#readme-top">back to top</a>)</p>


### Data

> [!IMPORTANT]
> We support custom schemes.

Depending on your data and use case, you can customize the data scheme to fit your needs.
The configuration below is part of the [main configuration file](./config.yaml), which is detailed in our [documentation](https://heidgaf.readthedocs.io/en/latest/usage.html#id2).

```yml
loglines:
fields:
- [ "timestamp", RegEx, '^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$' ]
- [ "status_code", ListItem, [ "NOERROR", "NXDOMAIN" ], [ "NXDOMAIN" ] ]
- [ "src_ip", IpAddress ]
- [ "dns_server_ip", IpAddress ]
- [ "domain_name", RegEx, '^(?=.{1,253}$)((?!-)[A-Za-z0-9-]{1,63}(?<!-)\.)+[A-Za-z]{2,63}$' ]
- [ "record_type", ListItem, [ "A", "AAAA" ] ]
- [ "response_ip", IpAddress ]
- [ "size", RegEx, '^\d+b$' ]
```
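
For illustration, a single log line satisfying this scheme could look as follows — assuming whitespace-separated fields in the order given above (all values are made up):

```
2025-07-01T12:00:00.123Z NOERROR 192.168.0.5 10.0.0.53 example.com A 93.184.216.34 54b
```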



<p align="right">(<a href="#readme-top">back to top</a>)</p>

<!-- CONTRIBUTING -->
## Contributing

4 changes: 4 additions & 0 deletions assets/heidgaf_architecture.svg
4 changes: 4 additions & 0 deletions assets/heidgaf_cicd.svg
213 changes: 192 additions & 21 deletions config.yaml
@@ -20,26 +20,72 @@ pipeline:
logserver:
input_file: "/opt/file.txt"



log_collection:
collector:
logline_format:
- [ "timestamp", Timestamp, "%Y-%m-%dT%H:%M:%S.%fZ" ]
- [ "status_code", ListItem, [ "NOERROR", "NXDOMAIN" ], [ "NXDOMAIN" ] ]
- [ "client_ip", IpAddress ]
- [ "dns_server_ip", IpAddress ]
- [ "domain_name", RegEx, '^(?=.{1,253}$)((?!-)[A-Za-z0-9-]{1,63}(?<!-)\.)+[A-Za-z]{2,63}$' ]
- [ "record_type", ListItem, [ "A", "AAAA" ] ]
- [ "response_ip", IpAddress ]
- [ "size", RegEx, '^\d+b$' ]
batch_handler:
batch_size: 10000
default_batch_handler_config:
batch_size: 2000
batch_timeout: 30.0
subnet_id:
ipv4_prefix_length: 24
ipv6_prefix_length: 64
collectors:
- name: "dga_collector"
protocol_base: dns
required_log_information:
- [ "ts", Timestamp, "%Y-%m-%dT%H:%M:%S" ]
- [ "status_code", ListItem, [ "NOERROR", "NXDOMAIN" ], [ "NXDOMAIN" ] ]
- [ "src_ip", IpAddress ]
- [ "dns_server_ip", IpAddress ]
- [ "domain_name", RegEx, '^(?=.{1,253}$)((?!-)[A-Za-z0-9-]{1,63}(?<!-)\.)+[A-Za-z]{2,63}$' ]
- [ "record_type", ListItem, [ "A", "AAAA" ] ]
# - [ "response_ip", IpAddress ]
- [ "size", RegEx, '^\d+$' ]
batch_handler_config_override:
batch_timeout: 30.1

- name: "dominator_collector"
protocol_base: dns
required_log_information:
- [ "ts", Timestamp, "%Y-%m-%dT%H:%M:%S" ]
- [ "domain_name", RegEx, '^(?=.{1,253}$)((?!-)[A-Za-z0-9-]{1,63}(?<!-)\.)+[A-Za-z]{2,63}$' ]
- [ "src_ip", IpAddress ]


- name: "my_http_collector"
protocol_base: http
required_log_information:
- [ "ts", Timestamp, "%Y-%m-%dT%H:%M:%S" ]
- [ "src_ip", IpAddress ]

batch_handler_config_override:
batch_size: 200000
batch_timeout: 50.0
subnet_id:
ipv4_prefix_length: 16
ipv6_prefix_length: 32

log_filtering:
- name: "dga_filter"
# method to apply for rule-based prefiltering, according to your needs
relevance_method: no_relevance_check # check_dga_relevance
collector_name: dga_collector
- name: "dominator_filter"
relevance_method: no_relevance_check
collector_name: dominator_collector
- name: "examplary_filter"
relevance_method: no_relevance_check
collector_name: dga_collector
- name: "xss_filter"
relevance_method: no_relevance_check
collector_name: my_http_collector


data_inspection:
inspector:
- name: dga_inspector
inspector_module_name: "stream_ad_inspector"
inspector_class_name: "StreamADInspector"
prefilter_name: dga_filter
mode: univariate # multivariate, ensemble
# Only used when mode is set to ensemble
ensemble:
@@ -51,32 +51,97 @@
module: streamad.model
model_args:
is_global: false
anomaly_threshold: 0.01
score_threshold: 0.5
anomaly_threshold: 0.0001
score_threshold: 0.005
time_type: ms
time_range: 20
- name: no_inspector
prefilter_name: dga_filter
inspector_module_name: "no_inspector"
inspector_class_name: "NoInspector"
anomaly_threshold: 0.0001
score_threshold: 0.005
time_type: ms
time_range: 20
mode: univariate # multivariate, ensemble
# Only used when mode is set to ensemble
ensemble:
model: WeightEnsemble
module: streamad.process
model_args:
models:
- model: ZScoreDetector
module: streamad.model
model_args:
is_global: false
- name: skip_the_inspector_stage_as_well_with_this
prefilter_name: dominator_filter
inspector_module_name: "no_inspector"
inspector_class_name: "NoInspector"
# - name: domniator_inspector
# inspector_module_name: "no_inspector"
# inspector_class_name: "NoInspector"
# prefilter_name: "dominator_filter"
# mode: univariate # multivariate, ensemble
# # Only used when mode is set to ensemble
# ensemble:
# model: WeightEnsemble
# module: streamad.process
# model_args:
# models:
# - model: ZScoreDetector
# module: streamad.model
# model_args:
# is_global: false
# anomaly_threshold: 0.01
# score_threshold: 0.5
# time_type: ms
# time_range: 20
# - name: exemplary_inspector
# inspector_module_name: "no_inspector"
# inspector_class_name: "NoInspector"
# prefilter_name: dga_filter
# mode: univariate # multivariate, ensemble
# # Only used when mode is set to ensemble
# ensemble:
# model: WeightEnsemble
# module: streamad.process
# model_args:
# models:
# - model: ZScoreDetector
# module: streamad.model
# model_args:
# is_global: false
# anomaly_threshold: 0.01
# score_threshold: 0.5
# time_type: ms
# time_range: 20
# - name: xss_inspector
# inspector_module_name: "no_inspector"
# inspector_class_name: "NoInspector"
# prefilter_name: xss_filter
# mode: univariate # multivariate, ensemble
# # Only used when mode is set to ensemble
# ensemble:
# model: WeightEnsemble
# module: streamad.process
# model_args:
# models:
# - model: ZScoreDetector
# module: streamad.model
# model_args:
# is_global: false
# anomaly_threshold: 0.01
# score_threshold: 0.5
# time_type: ms
# time_range: 20

data_analysis:
detector:
- name: "RF-dga_detector"
detector_module_name: "dga_detector"
detector_class_name: "DGADetector"
model: rf # XGBoost
checksum: 021af76b2385ddbc76f6e3ad10feb0bb081f9cf05cff2e52333e31040bbf36cc
base_url: https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/
threshold: 0.005
inspector_name: dga_inspector
- name: "RF-dga_detector-no-inspector-basis"
detector_module_name: "dga_detector"
detector_class_name: "DGADetector"
model: rf # XGBoost
checksum: 021af76b2385ddbc76f6e3ad10feb0bb081f9cf05cff2e52333e31040bbf36cc
base_url: https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/
threshold: 0.5
threshold: 0.005
inspector_name: no_inspector
# - name: "XGBoost-dga_detector"
# model: XGBoost
# checksum: ba1f718179191348fe2abd51644d76191d42a5d967c6844feb3371b6f798bf06
# base_url: https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/
# threshold: 0.005
# inspector_name: dga_inspector

monitoring:
clickhouse_connector:
batch_size: 50 # do not set higher
batch_timeout: 2.0


zeek:
sensors:
zeek1:
static_analysis: true
protocols:
# by convention, each protocol's topic name is the logserver's topic prefix + the protocol name
- http
- dns
interfaces:
- enx84ba5960ffe6
zeek2:
static_analysis: false # default
protocols:
- http
- dns
interfaces:
- eth0
- dummy
zeek3:
static_analysis: true
protocols:
- http
- dns

environment:
kafka_brokers:
- hostname: kafka1
port: 19092
port: 8097
node_ip: 192.168.175.69
- hostname: kafka2
port: 19093
port: 8098
node_ip: 192.168.175.69
- hostname: kafka3
port: 19094
kafka_topics:
port: 8099
node_ip: 192.168.175.69
kafka_topics_prefix:
pipeline:
logserver_in: "pipeline-logserver_in"
logserver_to_collector: "pipeline-logserver_to_collector"
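
For illustration only: reading the zeek `protocols` comment together with the `kafka_topics_prefix` section suggests that the per-protocol logserver topics combine the configured prefix with the protocol name. Hypothetically:

```yml
# hypothetical derived topic names, assuming a <prefix>-<protocol> convention
# (not confirmed by this diff)
# dns  -> pipeline-logserver_in-dns
# http -> pipeline-logserver_in-http
```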
Binary file added data/test_pcaps/cic-ids-2017-sample.pcap2
Binary file added data/test_pcaps/ctu-sample.pcap
Binary file added data/test_pcaps/unsw-sample.pcap2
1 change: 1 addition & 0 deletions docker/.env
@@ -1 +1,2 @@
MOUNT_PATH=../../default.txt
KAFKA_HOST_IP=192.168.175.69
4 changes: 2 additions & 2 deletions docker/create_tables/alerts.sql
@@ -1,10 +1,10 @@
CREATE TABLE IF NOT EXISTS alerts (
client_ip String NOT NULL,
src_ip String NOT NULL,
alert_timestamp DateTime64(6) NOT NULL,
suspicious_batch_id UUID NOT NULL,
overall_score Float32 NOT NULL,
domain_names String NOT NULL,
result String,
)
ENGINE = MergeTree
PRIMARY KEY(client_ip, alert_timestamp);
PRIMARY KEY(src_ip, alert_timestamp);
2 changes: 2 additions & 0 deletions docker/create_tables/batch_timestamps.sql
@@ -1,10 +1,12 @@
CREATE TABLE IF NOT EXISTS batch_timestamps (
batch_id UUID NOT NULL,
instance_name String NOT NULL,
stage String NOT NULL,
status String NOT NULL,
timestamp DateTime64(6) NOT NULL,
message_count UInt32,
is_active Bool NOT NULL
)
ENGINE = MergeTree
-- keep the PK as the UUID for indexing reasons, even though it is not unique
PRIMARY KEY (batch_id);
14 changes: 14 additions & 0 deletions docker/create_tables/batch_tree.sql
@@ -0,0 +1,14 @@
-- Table for reconstructing where a batch was processed,
-- used in Grafana to calculate the elapsed time between stages
CREATE TABLE IF NOT EXISTS batch_tree (
batch_row_id String NOT NULL,
batch_id UUID NOT NULL,
parent_batch_row_id Nullable(String), -- Default of Null indicates a root element
instance_name String NOT NULL,
stage String NOT NULL,
status String NOT NULL,
timestamp DateTime64(6) NOT NULL,
)
ENGINE = MergeTree
-- keep the PK for indexing reasons, even though it is not unique
PRIMARY KEY (batch_row_id);
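
A minimal sketch of how the elapsed time between stages could be derived from this table — a hypothetical query, not taken from the actual Grafana dashboards, assuming ClickHouse's `dateDiff` function:

```sql
-- Hypothetical: elapsed milliseconds between each batch row and its parent.
-- A NULL parent_batch_row_id marks a root element, so roots are excluded here.
SELECT
    child.batch_row_id,
    child.stage,
    dateDiff('millisecond', parent.timestamp, child.timestamp) AS elapsed_ms
FROM batch_tree AS child
INNER JOIN batch_tree AS parent
    ON child.parent_batch_row_id = parent.batch_row_id;
```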
@@ -1,4 +1,4 @@
CREATE TABLE IF NOT EXISTS failed_dns_loglines (
CREATE TABLE IF NOT EXISTS failed_loglines (
message_text String NOT NULL,
timestamp_in DateTime64(6) NOT NULL,
timestamp_failed DateTime64(6) NOT NULL,
@@ -1,10 +1,8 @@
CREATE TABLE IF NOT EXISTS dns_loglines (
CREATE TABLE IF NOT EXISTS loglines (
logline_id UUID NOT NULL,
subnet_id String NOT NULL,
timestamp DateTime64(6) NOT NULL,
status_code String NOT NULL,
client_ip String NOT NULL,
record_type String NOT NULL,
subnet_id String NOT NULL,
src_ip String NOT NULL,
additional_fields String
)
ENGINE = MergeTree
4 changes: 3 additions & 1 deletion docker/create_tables/suspicious_batch_timestamps.sql
@@ -1,11 +1,13 @@
CREATE TABLE IF NOT EXISTS suspicious_batch_timestamps (
suspicious_batch_id UUID NOT NULL,
client_ip String NOT NULL,
src_ip String NOT NULL,
instance_name String NOT NULL,
stage String NOT NULL,
status String NOT NULL,
timestamp DateTime64(6) NOT NULL,
message_count UInt32,
is_active Bool NOT NULL
)
ENGINE = MergeTree
-- keep the PK as the UUID for indexing reasons, even though it is not unique
PRIMARY KEY (suspicious_batch_id);