diff --git a/.gitignore b/.gitignore
index 9b888ef6..8a8d76bb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -325,3 +325,6 @@ cython_debug/
 # Others
 docs/api/
 !/docs/api/index.rst
+
+# requirements.txt
+!*/requirements.*.txt
\ No newline at end of file
diff --git a/README.md b/README.md
index dfed7640..9145d9b7 100644
--- a/README.md
+++ b/README.md
@@ -56,7 +56,7 @@
 
 ## About the Project
 
-![Pipeline overview](https://raw.githubusercontent.com/stefanDeveloper/heiDGAF/main/docs/media/heidgaf_overview_detailed.drawio.png?raw=true)
+![Pipeline overview](./assets/heidgaf_architecture.svg)
 
 ## Getting Started
 
@@ -276,6 +276,31 @@ This will create a `rules.txt` file containing the innards of the model, explain

(back to top)

+### Data
+
+> [!IMPORTANT]
+> We support custom schemes.
+
+Depending on your data and use case, you can customize the data scheme to fit your needs.
+The configuration below is part of the [main configuration file](./config.yaml), which is detailed in our [documentation](https://heidgaf.readthedocs.io/en/latest/usage.html#id2).
+
+```yml
+loglines:
+  fields:
+    - [ "timestamp", RegEx, '^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$' ]
+    - [ "status_code", ListItem, [ "NOERROR", "NXDOMAIN" ], [ "NXDOMAIN" ] ]
+    - [ "src_ip", IpAddress ]
+    - [ "dns_server_ip", IpAddress ]
+    - [ "domain_name", RegEx, '^(?=.{1,253}$)((?!-)[A-Za-z0-9-]{1,63}(?
(back to top)
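Because the scheme is driven entirely by this field list, a different log source can in principle be described by swapping out the fields. The sketch below is a hypothetical minimal scheme for a plain HTTP access log; the field names and value lists are illustrative assumptions, while the `loglines.fields` layout and the field types (`RegEx`, `ListItem`, `IpAddress`) are taken from the configuration shown above.

```yml
# Hypothetical example, not part of the shipped configuration:
# a custom scheme for a simple HTTP access log reusing the field types above.
loglines:
  fields:
    - [ "timestamp", RegEx, '^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{3}Z$' ]
    - [ "src_ip", IpAddress ]
    - [ "http_method", ListItem, [ "GET", "POST", "HEAD" ] ]
    - [ "status_code", ListItem, [ "200", "301", "404", "500" ], [ "404", "500" ] ]
```

The second list of a `ListItem` field appears to mark the values considered relevant for further analysis, mirroring the `[ "NXDOMAIN" ]` entry in the DNS scheme above.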

+
 ## Contributing
diff --git a/assets/heidgaf_architecture.svg b/assets/heidgaf_architecture.svg
new file mode 100644
index 00000000..bef8170d
--- /dev/null
+++ b/assets/heidgaf_architecture.svg
@@ -0,0 +1,4 @@
[added SVG: pipeline architecture diagram with components Zeek Sensor, Log Server, Log Collector, Batch Sender, Prefilter, Inspector, Detector, ZooKeeper, and Kafka Brokers; stage groups Log Generation, Log Aggregation, Collection, Filtering, Inspection, Detection; legend: Consume, Produce]
diff --git a/assets/heidgaf_cicd.svg b/assets/heidgaf_cicd.svg
new file mode 100644
index 00000000..6c89576c
--- /dev/null
+++ b/assets/heidgaf_cicd.svg
@@ -0,0 +1,4 @@
[added SVG: CI/CD diagram of a self-hosted CI/CD runner executing a workflow triggered on GitHub, with Job 1 through Job N, each starting Test 1 through Test N]
diff --git a/config.yaml b/config.yaml
index ba2dc06b..aceeee6b 100644
--- a/config.yaml
+++ b/config.yaml
@@ -20,26 +20,72 @@ pipeline:
   logserver:
     input_file: "/opt/file.txt"
+
+
   log_collection:
-    collector:
-      logline_format:
-      - [ "timestamp", Timestamp, "%Y-%m-%dT%H:%M:%S.%fZ" ]
-      - [ "status_code", ListItem, [ "NOERROR", "NXDOMAIN" ], [ "NXDOMAIN" ] ]
-      - [ "client_ip", IpAddress ]
-      - [ "dns_server_ip", IpAddress ]
-      - [ "domain_name", RegEx, '^(?=.{1,253}$)((?!-)[A-Za-z0-9-]{1,63}(?
= toStartOf${Granularity}(toDateTime64($__fromTime, 6)) AND alert_timestamp <= $__toTime\nGROUP BY time_bucket\nORDER BY time_bucket\nWITH FILL\nFROM toStartOf${Granularity}(toDateTime64($__fromTime, 6))\nTO toStartOf${Granularity}(toDateTime64($__toTime, 6))\nSTEP toInterval${Granularity}(1);",
+ "rawSql": "SELECT \n toStartOf${Granularity}(toDateTime64(alert_timestamp, 6)) AS time_bucket,\n count(src_ip) AS \"count\"\nFROM alerts\nWHERE alert_timestamp >= toStartOf${Granularity}(toDateTime64($__fromTime, 6)) AND alert_timestamp <= $__toTime\nGROUP BY time_bucket\nORDER BY time_bucket\nWITH FILL\nFROM toStartOf${Granularity}(toDateTime64($__fromTime, 6))\nTO toStartOf${Granularity}(toDateTime64($__toTime, 6))\nSTEP toInterval${Granularity}(1);",
 "refId": "B"
 }
 ],
@@ -197,7 +197,7 @@
 },
 "pluginVersion": "4.6.0",
 "queryType": "table",
- "rawSql": "SELECT count(*)\nFROM (\n SELECT DISTINCT client_ip\n FROM alerts\n WHERE alert_timestamp >= $__fromTime AND alert_timestamp <= $__toTime\n);",
+ "rawSql": "SELECT count(*)\nFROM (\n SELECT DISTINCT src_ip\n FROM alerts\n WHERE alert_timestamp >= $__fromTime AND alert_timestamp <= $__toTime\n);",
 "refId": "A"
 }
 ],
@@ -629,7 +629,7 @@
 },
 "pluginVersion": "4.6.0",
 "queryType": "table",
- "rawSql": "SELECT DISTINCT client_ip AS \"Client IP address\", arrayStringConcat(JSONExtract(domain_names, 'Array(String)'), ', ') AS \"Domains used\"\nFROM alerts\nORDER BY alert_timestamp DESC\nLIMIT 20",
+ "rawSql": "SELECT DISTINCT src_ip AS \"Client IP address\", arrayStringConcat(JSONExtract(domain_names, 'Array(String)'), ', ') AS \"Domains used\"\nFROM alerts\nORDER BY alert_timestamp DESC\nLIMIT 20",
 "refId": "A"
 }
 ],
@@ -714,7 +714,7 @@
 },
 "pluginVersion": "4.6.0",
 "queryType": "table",
- "rawSql": "SELECT concat(rowNumberInAllBlocks() + 1, '.') AS \"Rank\", client_ip AS \"Client IP address\", count(logline_id) AS \"# Total Requests\"\nFROM dns_loglines\nWHERE \"Client IP address\" IN (\n SELECT DISTINCT client_ip\n FROM alerts\n WHERE alert_timestamp >= $__fromTime AND alert_timestamp <= $__toTime\n)\nGROUP BY \"Client IP address\"\nORDER BY \"# Total Requests\" DESC\nLIMIT 5",
+ "rawSql": "SELECT concat(rowNumberInAllBlocks() + 1, '.') AS \"Rank\", src_ip AS \"Client IP address\", count(logline_id) AS \"# Total Requests\"\nFROM loglines\nWHERE \"Client IP address\" IN (\n SELECT DISTINCT src_ip\n FROM alerts\n WHERE alert_timestamp >= $__fromTime AND alert_timestamp <= $__toTime\n)\nGROUP BY \"Client IP address\"\nORDER BY \"# Total Requests\" DESC\nLIMIT 5",
 "refId": "A"
 }
 ],
diff --git a/docker/grafana-provisioning/dashboards/latencies.json b/docker/grafana-provisioning/dashboards/latencies.json
index 8b455b46..b929e5f2 100644
--- a/docker/grafana-provisioning/dashboards/latencies.json
+++ b/docker/grafana-provisioning/dashboards/latencies.json
@@ -18,6 +18,7 @@
 "editable": true,
 "fiscalYearStartMonth": 0,
 "graphTooltip": 0,
+ "id": 2,
 "links": [],
 "liveNow": false,
 "panels": [
@@ -160,7 +161,7 @@
 },
 {
 "datasource": {
- "default": true,
+ "default": false,
 "type":
"grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -252,13 +253,13 @@ "table": "" } }, - "pluginVersion": "4.7.0", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT *\nFROM (\n SELECT 'LogServer' AS name, median(value) as median\n FROM (\n SELECT dateDiff(microsecond, sl.timestamp_in, slt.event_timestamp) AS value\n FROM server_logs sl\n INNER JOIN server_logs_timestamps slt ON sl.message_id = slt.message_id\n WHERE slt.event = 'timestamp_out' AND\n sl.timestamp_in >= $__fromTime AND slt.event_timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Collector' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', lt1.timestamp, lt2.timestamp) as value\n FROM logline_timestamps lt1\n INNER JOIN logline_timestamps lt2 ON lt1.logline_id = lt2.logline_id\n WHERE lt1.status = 'in_process' AND lt2.status = 'finished' AND\n lt1.stage = 'log_collection.collector' AND lt2.stage = 'log_collection.collector' AND\n lt1.timestamp >= $__fromTime AND lt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'BatchHandler' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', lt1.timestamp, lt2.timestamp) AS value\n FROM logline_timestamps lt1\n INNER JOIN logline_timestamps lt2 ON lt1.logline_id = lt2.logline_id\n WHERE lt1.stage = 'log_collection.batch_handler' AND lt1.status = 'in_process' AND\n lt2.stage = 'log_collection.batch_handler' AND lt2.status = 'batched' AND\n lt1.timestamp >= $__fromTime AND lt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Prefilter' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', bt1.timestamp, bt2.timestamp) as value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\n WHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 'log_filtering.prefilter' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Inspector' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.is_active = False AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n\n UNION ALL\n\n SELECT dateDiff('microsecond', bt.timestamp, sbt.timestamp) AS value\n FROM batch_timestamps bt\n INNER JOIN suspicious_batches_to_batch sbtb ON bt.batch_id = sbtb.batch_id\n INNER JOIN suspicious_batch_timestamps sbt ON sbtb.suspicious_batch_id = sbt.suspicious_batch_id\n WHERE bt.stage = 'data_inspection.inspector' AND bt.status = 'in_process' AND\n sbt.stage = 'data_inspection.inspector' AND sbt.status = 'finished' AND\n bt.timestamp >= $__fromTime AND sbt.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Detector' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', sbt1.timestamp, sbt2.timestamp) AS value\n FROM suspicious_batch_timestamps sbt1\n INNER JOIN suspicious_batch_timestamps sbt2 ON sbt1.suspicious_batch_id = sbt2.suspicious_batch_id\n WHERE sbt1.stage = 'data_analysis.detector' AND sbt1.status = 'in_process' AND\n sbt2.stage = 'data_analysis.detector' AND sbt2.is_active = False AND\n sbt1.timestamp >= $__fromTime AND sbt2.timestamp <= $__toTime\n )\n)\nWHERE name IN (${include_modules:csv});\n", + "rawSql": "SELECT *\nFROM (\n SELECT 
'LogServer' AS name, median(value) as median\n FROM (\n SELECT dateDiff(microsecond, sl.timestamp_in, slt.event_timestamp) AS value\n FROM server_logs sl\n INNER JOIN server_logs_timestamps slt ON sl.message_id = slt.message_id\n WHERE slt.event = 'timestamp_out' AND\n sl.timestamp_in >= $__fromTime AND slt.event_timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Collector' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', lt1.timestamp, lt2.timestamp) as value\n FROM logline_timestamps lt1\n INNER JOIN logline_timestamps lt2 ON lt1.logline_id = lt2.logline_id\n WHERE lt1.status = 'in_process' AND lt2.status = 'finished' AND\n lt1.stage = 'log_collection.collector' AND lt2.stage = 'log_collection.collector' AND\n lt1.timestamp >= $__fromTime AND lt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'BatchHandler' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', lt1.timestamp, lt2.timestamp) AS value\n FROM logline_timestamps lt1\n INNER JOIN logline_timestamps lt2 ON lt1.logline_id = lt2.logline_id\n WHERE lt1.stage = 'log_collection.batch_handler' AND lt1.status = 'in_process' AND\n lt2.stage = 'log_collection.batch_handler' AND lt2.status = 'batched' AND\n lt1.timestamp >= $__fromTime AND lt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Prefilter' AS name, median(value) as median\n FROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 'log_filtering.prefilter' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n\n\n\n\n\n UNION ALL\n\n SELECT 'Inspector' AS name, median(value) as median\n FROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Detector' AS name, median(value) as median\n FROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_analysis.detector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_analysis.detector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n dateDiff('microsecond', bt1.timestamp, bt2.timestamp) > 0 AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n)\nWHERE name IN (${include_modules:csv});\n", "refId": "A" } ], - "title": "Module latency comparison", + "title": "Module latency comparison ", "transformations": [ { "id": "rowsToFields", @@ -402,7 +403,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -494,9 +495,9 @@ "table": "" } }, - "pluginVersion": "4.7.0", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT *\nFROM (\n SELECT 'Data analysis phase' AS name, sum(Median)\n FROM (\n SELECT 
'Inspector' AS name, median(value) AS \"Median\"\n FROM (\n SELECT dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.is_active = False AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n\n UNION ALL\n\n SELECT dateDiff('microsecond', bt.timestamp, sbt.timestamp) AS value\n FROM batch_timestamps bt\n INNER JOIN suspicious_batches_to_batch sbtb ON bt.batch_id = sbtb.batch_id\n INNER JOIN suspicious_batch_timestamps sbt ON sbtb.suspicious_batch_id = sbt.suspicious_batch_id\n WHERE bt.stage = 'data_inspection.inspector' AND bt.status = 'in_process' AND\n sbt.stage = 'data_inspection.inspector' AND sbt.status = 'finished' AND\n bt.timestamp >= $__fromTime AND sbt.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Detector' AS name, median(value) AS \"Median\"\n FROM (\n SELECT dateDiff('microsecond', sbt1.timestamp, sbt2.timestamp) AS value\n FROM suspicious_batch_timestamps sbt1\n INNER JOIN suspicious_batch_timestamps sbt2 ON sbt1.suspicious_batch_id = sbt2.suspicious_batch_id\n WHERE sbt1.stage = 'data_analysis.detector' AND sbt1.status = 'in_process' AND\n sbt2.stage = 'data_analysis.detector' AND sbt2.is_active = False AND\n sbt1.timestamp >= $__fromTime AND sbt2.timestamp <= $__toTime\n )\n )\n\n UNION ALL\n\n SELECT 'Data preparation phase' AS name, sum(Median) as median\n FROM (\n SELECT 'LogServer' AS name, median(value) AS \"Median\"\n FROM (\n SELECT slt.event_timestamp AS time, dateDiff(microsecond, sl.timestamp_in, slt.event_timestamp) AS value\n FROM server_logs sl\n INNER JOIN server_logs_timestamps slt ON sl.message_id = slt.message_id\n WHERE slt.event = 'timestamp_out' AND\n sl.timestamp_in >= $__fromTime AND slt.event_timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'LogCollector' AS name, median(value) AS \"Median\"\n FROM (\n SELECT lt2.timestamp as time, dateDiff('microsecond', lt1.timestamp, lt2.timestamp) as value\n FROM logline_timestamps lt1\n INNER JOIN logline_timestamps lt2 ON lt1.logline_id = lt2.logline_id\n WHERE lt1.status = 'in_process' AND lt2.status = 'finished' AND\n lt1.stage = 'log_collection.collector' AND lt2.stage = 'log_collection.collector' AND\n lt1.timestamp >= $__fromTime AND lt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'BatchHandler' AS name, median(value) AS \"Median\"\n FROM (\n SELECT lt2.timestamp AS time, dateDiff('microsecond', lt1.timestamp, lt2.timestamp) AS value\n FROM logline_timestamps lt1\n INNER JOIN logline_timestamps lt2 ON lt1.logline_id = lt2.logline_id\n WHERE lt1.stage = 'log_collection.batch_handler' AND lt1.status = 'in_process' AND\n lt2.stage = 'log_collection.batch_handler' AND lt2.status = 'batched' AND\n lt1.timestamp >= $__fromTime AND lt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Prefilter' AS name, median(value) AS \"Median\"\n FROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\n WHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 'log_filtering.prefilter' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n )\n);\n", + "rawSql": "SELECT *\nFROM (\n SELECT 'Data analysis phase' AS name, sum(Median)\n FROM (\n SELECT 
'Inspector' AS name, median(value) AS \"Median\"\n FROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Detector' AS name, median(value) AS \"Median\"\n FROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_analysis.detector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_analysis.detector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n dateDiff('microsecond', bt1.timestamp, bt2.timestamp) > 0 AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n )\n\n UNION ALL\n\n SELECT 'Data preparation phase' AS name, sum(Median) as median\n FROM (\n SELECT 'LogServer' AS name, median(value) AS \"Median\"\n FROM (\n SELECT slt.event_timestamp AS time, dateDiff(microsecond, sl.timestamp_in, slt.event_timestamp) AS value\n FROM server_logs sl\n INNER JOIN server_logs_timestamps slt ON sl.message_id = slt.message_id\n WHERE slt.event = 'timestamp_out' AND\n sl.timestamp_in >= $__fromTime AND slt.event_timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'LogCollector' AS name, median(value) AS \"Median\"\n FROM (\n SELECT lt2.timestamp as time, dateDiff('microsecond', lt1.timestamp, lt2.timestamp) as value\n FROM logline_timestamps lt1\n INNER JOIN logline_timestamps lt2 ON lt1.logline_id = lt2.logline_id\n WHERE lt1.status = 'in_process' AND lt2.status = 'finished' AND\n lt1.stage = 'log_collection.collector' AND lt2.stage = 'log_collection.collector' AND\n lt1.timestamp >= $__fromTime AND lt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'BatchHandler' AS name, median(value) AS \"Median\"\n FROM (\n SELECT lt2.timestamp AS time, dateDiff('microsecond', lt1.timestamp, lt2.timestamp) AS value\n FROM logline_timestamps lt1\n INNER JOIN logline_timestamps lt2 ON lt1.logline_id = lt2.logline_id\n WHERE lt1.stage = 'log_collection.batch_handler' AND lt1.status = 'in_process' AND\n lt2.stage = 'log_collection.batch_handler' AND lt2.status = 'batched' AND\n lt1.timestamp >= $__fromTime AND lt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Prefilter' AS name, median(value) AS \"Median\"\n FROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 'log_filtering.prefilter' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n )\n);\n", "refId": "phases" } ], @@ -523,7 +524,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -615,9 +616,9 @@ "table": "" } }, - "pluginVersion": "4.6.0", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT 'Including transport and wait time' AS name, sum(median)\nFROM (\n SELECT 
'LogServer' AS name, median(value) as median\n FROM (\n SELECT dateDiff(microsecond, sl.timestamp_in, slt.event_timestamp) AS value\n FROM server_logs sl\n INNER JOIN server_logs_timestamps slt ON sl.message_id = slt.message_id\n WHERE slt.event = 'timestamp_out' AND\n sl.timestamp_in >= $__fromTime AND slt.event_timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'LogCollection' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', lt1.timestamp, lt2.timestamp) as value\n FROM logline_timestamps lt1\n INNER JOIN logline_timestamps lt2 ON lt1.logline_id = lt2.logline_id\n WHERE lt1.status = 'in_process' AND lt2.status = 'finished' AND\n lt1.stage = 'log_collection.collector' AND lt2.stage = 'log_collection.collector' AND\n lt1.timestamp >= $__fromTime AND lt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'BatchHandler' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', lt1.timestamp, bt2.timestamp) AS value\n FROM logline_timestamps lt1\n INNER JOIN logline_to_batches ltb ON ltb.logline_id = lt1.logline_id\n INNER JOIN batch_timestamps bt2 ON bt2.batch_id = ltb.batch_id\n WHERE lt1.stage = 'log_collection.batch_handler' AND lt1.status = 'in_process' AND\n bt2.stage = 'log_collection.batch_handler' AND bt2.status = 'completed' AND\n lt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Between BatchHandler and Prefilter' AS name, median(value) AS median\n FROM (\n SELECT dateDiff('microseconds', bt1.timestamp, bt2.timestamp) as value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2\n ON bt1.batch_id = bt2.batch_id\n WHERE bt1.stage = 'log_collection.batch_handler' AND bt1.status = 'completed'\n AND bt2.stage = 'log_filtering.prefilter' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Prefilter' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', bt1.timestamp, bt2.timestamp) as value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\n WHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 'log_filtering.prefilter' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Between Prefilter and Inspector' AS name, median(value) AS median\n FROM (\n SELECT dateDiff('microseconds', bt1.timestamp, bt2.timestamp) as value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2\n ON bt1.batch_id = bt2.batch_id\n WHERE bt1.stage = 'log_filtering.prefilter' AND bt1.status = 'finished'\n AND bt2.stage = 'data_inspection.inspector' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Inspector' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.is_active = False AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n\n UNION ALL\n\n SELECT dateDiff('microsecond', bt.timestamp, sbt.timestamp) AS value\n FROM batch_timestamps bt\n INNER JOIN suspicious_batches_to_batch sbtb ON bt.batch_id = sbtb.batch_id\n INNER JOIN suspicious_batch_timestamps sbt ON sbtb.suspicious_batch_id = 
sbt.suspicious_batch_id\n WHERE bt.stage = 'data_inspection.inspector' AND bt.status = 'in_process' AND\n sbt.stage = 'data_inspection.inspector' AND sbt.status = 'finished' AND\n bt.timestamp >= $__fromTime AND sbt.timestamp <= $__toTime\n )\n\n UNION ALL\n \n SELECT 'Between Inspector and Detector' AS name, median(value) AS median\n FROM (\n SELECT dateDiff('microseconds', bt1.timestamp, bt2.timestamp) as value\n FROM suspicious_batch_timestamps bt1\n INNER JOIN suspicious_batch_timestamps bt2\n ON bt1.suspicious_batch_id = bt2.suspicious_batch_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'finished'\n AND bt2.stage = 'data_analysis.detector' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n \n UNION ALL\n\n SELECT 'Detector' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', sbt1.timestamp, sbt2.timestamp) AS value\n FROM suspicious_batch_timestamps sbt1\n INNER JOIN suspicious_batch_timestamps sbt2 ON sbt1.suspicious_batch_id = sbt2.suspicious_batch_id\n WHERE sbt1.stage = 'data_analysis.detector' AND sbt1.status = 'in_process' AND\n sbt2.stage = 'data_analysis.detector' AND sbt2.is_active = False AND\n sbt1.timestamp >= $__fromTime AND sbt2.timestamp <= $__toTime\n )\n)\n\nUNION ALL\n\nSELECT 'Excluding transport and wait time' AS name, sum(median)\nFROM (\n SELECT 'LogServer' AS name, median(value) as median\n FROM (\n SELECT dateDiff(microsecond, sl.timestamp_in, slt.event_timestamp) AS value\n FROM server_logs sl\n INNER JOIN server_logs_timestamps slt ON sl.message_id = slt.message_id\n WHERE slt.event = 'timestamp_out' AND\n sl.timestamp_in >= $__fromTime AND slt.event_timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'LogCollection' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', lt1.timestamp, lt2.timestamp) as value\n FROM logline_timestamps lt1\n INNER JOIN logline_timestamps lt2 ON lt1.logline_id = lt2.logline_id\n WHERE lt1.status = 'in_process' AND lt2.status = 'finished' AND\n lt1.stage = 'log_collection.collector' AND lt2.stage = 'log_collection.collector' AND\n lt1.timestamp >= $__fromTime AND lt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'BatchHandler' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', lt1.timestamp, lt2.timestamp) AS value\n FROM logline_timestamps lt1\n INNER JOIN logline_timestamps lt2 ON lt2.logline_id = lt1.logline_id\n WHERE lt1.stage = 'log_collection.batch_handler' AND lt1.status = 'in_process' AND\n lt2.stage = 'log_collection.batch_handler' AND lt2.status = 'batched' AND\n lt1.timestamp >= $__fromTime AND lt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Prefilter' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', bt1.timestamp, bt2.timestamp) as value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\n WHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 'log_filtering.prefilter' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Inspector' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.is_active = False AND\n 
bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n\n UNION ALL\n\n SELECT dateDiff('microsecond', bt.timestamp, sbt.timestamp) AS value\n FROM batch_timestamps bt\n INNER JOIN suspicious_batches_to_batch sbtb ON bt.batch_id = sbtb.batch_id\n INNER JOIN suspicious_batch_timestamps sbt ON sbtb.suspicious_batch_id = sbt.suspicious_batch_id\n WHERE bt.stage = 'data_inspection.inspector' AND bt.status = 'in_process' AND\n sbt.stage = 'data_inspection.inspector' AND sbt.status = 'finished' AND\n bt.timestamp >= $__fromTime AND sbt.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Detector' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', sbt1.timestamp, sbt2.timestamp) AS value\n FROM suspicious_batch_timestamps sbt1\n INNER JOIN suspicious_batch_timestamps sbt2 ON sbt1.suspicious_batch_id = sbt2.suspicious_batch_id\n WHERE sbt1.stage = 'data_analysis.detector' AND sbt1.status = 'in_process' AND\n sbt2.stage = 'data_analysis.detector' AND sbt2.is_active = False AND\n sbt1.timestamp >= $__fromTime AND sbt2.timestamp <= $__toTime\n )\n);\n", + "rawSql": "SELECT 'Including transport and wait time' AS name, sum(median)\nFROM (\n SELECT 'LogServer' AS name, median(value) as median\n FROM (\n SELECT dateDiff(microsecond, sl.timestamp_in, slt.event_timestamp) AS value\n FROM server_logs sl\n INNER JOIN server_logs_timestamps slt ON sl.message_id = slt.message_id\n WHERE slt.event = 'timestamp_out' AND\n sl.timestamp_in >= $__fromTime AND slt.event_timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'LogCollection' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', lt1.timestamp, lt2.timestamp) as value\n FROM logline_timestamps lt1\n INNER JOIN logline_timestamps lt2 ON lt1.logline_id = lt2.logline_id\n WHERE lt1.status = 'in_process' AND lt2.status = 'finished' AND\n lt1.stage = 'log_collection.collector' AND lt2.stage = 'log_collection.collector' AND\n lt1.timestamp >= $__fromTime AND lt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'BatchHandler' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', lt1.timestamp, bt2.timestamp) AS value\n FROM logline_timestamps lt1\n INNER JOIN logline_to_batches ltb ON ltb.logline_id = lt1.logline_id\n INNER JOIN batch_timestamps bt2 ON bt2.batch_id = ltb.batch_id\n WHERE lt1.stage = 'log_collection.batch_handler' AND lt1.status = 'in_process' AND\n bt2.stage = 'log_collection.batch_handler' AND bt2.status = 'completed' AND\n lt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Between BatchHandler and Prefilter' AS name, median(value) AS median\n FROM (\n SELECT dateDiff('microseconds', bt1.timestamp, bt2.timestamp) as value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2\n ON bt1.batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'log_collection.batch_handler' AND bt1.status = 'completed'\n AND bt2.stage = 'log_filtering.prefilter' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)\n\n UNION ALL\n\n SELECT 'Prefilter' AS name, median(value) as median\n FROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 'log_filtering.prefilter' AND\n bt1.timestamp >= $__fromTime 
AND bt2.timestamp <= $__toTime\n )\n\n\n\n\n\n UNION ALL\n\n SELECT 'Between Prefilter and Inspector' AS name, median(value) AS median\n FROM (\n SELECT dateDiff('microseconds', bt1.timestamp, bt2.timestamp) as value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2\n ON bt1.batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'log_filtering.prefilter' AND bt1.status = 'finished'\n AND bt2.stage = 'data_inspection.inspector' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Inspector' AS name, median(value) as median\n FROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n\n UNION ALL\n \n SELECT 'Between Inspector and Detector' AS name, median(value) AS median\n FROM (\n SELECT dateDiff('microseconds', bt1.timestamp, bt2.timestamp) as value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2\n ON bt1.batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'finished'\n AND bt2.stage = 'data_analysis.detector' AND bt2.status = 'in_process' AND\n dateDiff('microsecond', bt1.timestamp, bt2.timestamp) > 0 AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n\n\n \n UNION ALL\n\n SELECT 'Detector' AS name, median(value) as median\n FROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_analysis.detector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_analysis.detector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n dateDiff('microsecond', bt1.timestamp, bt2.timestamp) > 0 AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n)\n\nUNION ALL\n\nSELECT 'Excluding transport and wait time' AS name, sum(median)\nFROM (\n SELECT 'LogServer' AS name, median(value) as median\n FROM (\n SELECT dateDiff(microsecond, sl.timestamp_in, slt.event_timestamp) AS value\n FROM server_logs sl\n INNER JOIN server_logs_timestamps slt ON sl.message_id = slt.message_id\n WHERE slt.event = 'timestamp_out' AND\n sl.timestamp_in >= $__fromTime AND slt.event_timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'LogCollection' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', lt1.timestamp, lt2.timestamp) as value\n FROM logline_timestamps lt1\n INNER JOIN logline_timestamps lt2 ON lt1.logline_id = lt2.logline_id\n WHERE lt1.status = 'in_process' AND lt2.status = 'finished' AND\n lt1.stage = 'log_collection.collector' AND lt2.stage = 'log_collection.collector' AND\n lt1.timestamp >= $__fromTime AND lt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'BatchHandler' AS name, median(value) as median\n FROM (\n SELECT dateDiff('microsecond', lt1.timestamp, lt2.timestamp) AS value\n FROM logline_timestamps lt1\n INNER JOIN logline_timestamps lt2 ON lt2.logline_id = lt1.logline_id\n WHERE lt1.stage = 'log_collection.batch_handler' AND lt1.status = 'in_process' AND\n lt2.stage = 'log_collection.batch_handler' AND 
lt2.status = 'batched' AND\n lt1.timestamp >= $__fromTime AND lt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Prefilter' AS name, median(value) as median\n FROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 'log_filtering.prefilter' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n\n\n UNION ALL\n\n SELECT 'Inspector' AS name, median(value) as median\n FROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n\n UNION ALL\n\n SELECT 'Detector' AS name, median(value) as median\n FROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_analysis.detector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_analysis.detector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n dateDiff('microsecond', bt1.timestamp, bt2.timestamp) > 0 AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\n);\n", "refId": "A" } ], @@ -925,7 +926,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -1076,9 +1077,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "timeseries", - "rawSql": "SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\nFROM batch_timestamps bt1\nINNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\nWHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 'log_filtering.prefilter' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\nORDER BY time ASC;", + "rawSql": "SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\nFROM batch_tree bt1\nINNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\nWHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 'log_filtering.prefilter' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\nORDER BY time ASC;\n\n\n", "refId": "Latency" }, { @@ -1099,9 +1100,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT\n toStartOfMinute(time) AS time_bucket,\n median(latency) AS value\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS latency\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\n WHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 
'log_filtering.prefilter' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)\nGROUP BY time_bucket\nORDER BY time_bucket;", + "rawSql": "SELECT\n toStartOfMinute(time) AS time_bucket,\n median(latency) AS value\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS latency\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 'log_filtering.prefilter' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)\nGROUP BY time_bucket\nORDER BY time_bucket;", "refId": "Median" } ], @@ -1110,7 +1111,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -1180,9 +1181,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT median(value) AS \"Median\"\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\n WHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 'log_filtering.prefilter' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)", + "rawSql": "SELECT median(value) AS \"Median\"\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 'log_filtering.prefilter' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)\n\n\n\n", "refId": "A" } ], @@ -1271,7 +1272,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -1339,9 +1340,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT min(value) AS \"Minimum\", avg(value) AS \"Average\", max(value) AS \"Maximum\"\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\n WHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 'log_filtering.prefilter' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)", + "rawSql": "SELECT min(value) AS \"Minimum\", avg(value) AS \"Average\", max(value) AS \"Maximum\"\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.status = 'in_process' AND bt2.status = 'finished' AND\n bt1.stage = 'log_filtering.prefilter' AND bt2.stage = 'log_filtering.prefilter' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)\n", "refId": "A" } ], @@ -1618,7 +1619,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": 
"PDEE91DDB90597936" }, @@ -1769,9 +1770,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "timeseries", - "rawSql": "SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\nFROM batch_timestamps bt1\nINNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\nWHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.is_active = False AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\nORDER BY time ASC\n\nUNION ALL\n\nSELECT sbt.timestamp AS time, dateDiff('microsecond', bt.timestamp, sbt.timestamp) AS value\nFROM batch_timestamps bt\nINNER JOIN suspicious_batches_to_batch sbtb ON bt.batch_id = sbtb.batch_id\nINNER JOIN suspicious_batch_timestamps sbt ON sbtb.suspicious_batch_id = sbt.suspicious_batch_id\nWHERE bt.stage = 'data_inspection.inspector' AND bt.status = 'in_process' AND\n sbt.stage = 'data_inspection.inspector' AND sbt.status = 'finished' AND\n bt.timestamp >= $__fromTime AND sbt.timestamp <= $__toTime\nORDER BY time ASC;", + "rawSql": "SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\nFROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\nWHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\nORDER BY time ASC;", "refId": "Latency" }, { @@ -1792,9 +1793,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT\n toStartOfMinute(time) AS time_bucket,\n median(latency) as value\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS latency\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.is_active = False AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n ORDER BY time ASC\n\n UNION ALL\n\n SELECT sbt.timestamp AS time, dateDiff('microsecond', bt.timestamp, sbt.timestamp) AS latency\n FROM batch_timestamps bt\n INNER JOIN suspicious_batches_to_batch sbtb ON bt.batch_id = sbtb.batch_id\n INNER JOIN suspicious_batch_timestamps sbt ON sbtb.suspicious_batch_id = sbt.suspicious_batch_id\n WHERE bt.stage = 'data_inspection.inspector' AND bt.status = 'in_process' AND\n sbt.stage = 'data_inspection.inspector' AND sbt.status = 'finished' AND\n bt.timestamp >= $__fromTime AND sbt.timestamp <= $__toTime\n)\nGROUP BY time_bucket\nORDER BY time_bucket;", + "rawSql": "SELECT\n toStartOfMinute(time) AS time_bucket,\n median(latency) as value\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS latency\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n )\nGROUP BY time_bucket\nORDER BY time_bucket;", "refId": "Median" } ], @@ -1803,7 +1804,7 @@ }, { "datasource": { - "default": true, + 
"default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -1873,9 +1874,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT median(value) AS \"Median\"\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.is_active = False AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n\n UNION ALL\n\n SELECT sbt.timestamp AS time, dateDiff('microsecond', bt.timestamp, sbt.timestamp) AS value\n FROM batch_timestamps bt\n INNER JOIN suspicious_batches_to_batch sbtb ON bt.batch_id = sbtb.batch_id\n INNER JOIN suspicious_batch_timestamps sbt ON sbtb.suspicious_batch_id = sbt.suspicious_batch_id\n WHERE bt.stage = 'data_inspection.inspector' AND bt.status = 'in_process' AND\n sbt.stage = 'data_inspection.inspector' AND sbt.status = 'finished' AND\n bt.timestamp >= $__fromTime AND sbt.timestamp <= $__toTime\n)", + "rawSql": "SELECT median(value) AS \"Median\"\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime)", "refId": "A" } ], @@ -1964,7 +1965,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -2032,9 +2033,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT min(value) AS \"Minimum\", avg(value) AS \"Average\", max(value) AS \"Maximum\"\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2 ON bt1.batch_id = bt2.batch_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.is_active = False AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n\n UNION ALL\n\n SELECT sbt.timestamp AS time, dateDiff('microsecond', bt.timestamp, sbt.timestamp) AS value\n FROM batch_timestamps bt\n INNER JOIN suspicious_batches_to_batch sbtb ON bt.batch_id = sbtb.batch_id\n INNER JOIN suspicious_batch_timestamps sbt ON sbtb.suspicious_batch_id = sbt.suspicious_batch_id\n WHERE bt.stage = 'data_inspection.inspector' AND bt.status = 'in_process' AND\n sbt.stage = 'data_inspection.inspector' AND sbt.status = 'finished' AND\n bt.timestamp >= $__fromTime AND sbt.timestamp <= $__toTime\n)", + "rawSql": "SELECT min(value) AS \"Minimum\", avg(value) AS \"Average\", max(value) AS \"Maximum\"\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_inspection.inspector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name 
AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)", "refId": "A" } ], @@ -2310,7 +2311,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -2461,9 +2462,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "timeseries", - "rawSql": "SELECT sbt2.timestamp AS time, dateDiff('microsecond', sbt1.timestamp, sbt2.timestamp) AS value\nFROM suspicious_batch_timestamps sbt1\nINNER JOIN suspicious_batch_timestamps sbt2 ON sbt1.suspicious_batch_id = sbt2.suspicious_batch_id\nWHERE sbt1.stage = 'data_analysis.detector' AND sbt1.status = 'in_process' AND\n sbt2.stage = 'data_analysis.detector' AND sbt2.is_active = False AND\n sbt1.timestamp >= $__fromTime AND sbt2.timestamp <= $__toTime\nORDER BY time ASC;", + "rawSql": "SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\nFROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\nWHERE bt1.stage = 'data_analysis.detector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_analysis.detector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n dateDiff('microsecond', bt1.timestamp, bt2.timestamp) > 0 AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\nORDER BY time ASC;", "refId": "Latency" }, { @@ -2484,9 +2485,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT\n toStartOfMinute(time) AS time_bucket,\n median(latency) AS value\nFROM (\n SELECT sbt2.timestamp AS time, dateDiff('microsecond', sbt1.timestamp, sbt2.timestamp) AS latency\n FROM suspicious_batch_timestamps sbt1\n INNER JOIN suspicious_batch_timestamps sbt2 ON sbt1.suspicious_batch_id = sbt2.suspicious_batch_id\n WHERE sbt1.stage = 'data_analysis.detector' AND sbt1.status = 'in_process' AND\n sbt2.stage = 'data_analysis.detector' AND sbt2.is_active = False AND\n sbt1.timestamp >= $__fromTime AND sbt2.timestamp <= $__toTime\n)\nGROUP BY time_bucket\nORDER BY time_bucket;", + "rawSql": "SELECT\n toStartOfMinute(time) AS time_bucket,\n median(latency) AS value\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS latency\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_analysis.detector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_analysis.detector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)\nGROUP BY time_bucket\nORDER BY time_bucket;", "refId": "Median" } ], @@ -2495,7 +2496,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -2565,9 +2566,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT median(value) AS \"Median\"\nFROM (\n SELECT sbt2.timestamp AS time, dateDiff('microsecond', sbt1.timestamp, sbt2.timestamp) AS value\n FROM suspicious_batch_timestamps sbt1\n INNER JOIN suspicious_batch_timestamps sbt2 ON sbt1.suspicious_batch_id = sbt2.suspicious_batch_id\n WHERE sbt1.stage = 'data_analysis.detector' AND sbt1.status = 'in_process' AND\n sbt2.stage = 'data_analysis.detector' AND sbt2.is_active = False AND\n sbt1.timestamp >= $__fromTime AND sbt2.timestamp <= $__toTime\n)", 
+ "rawSql": "SELECT median(value) AS \"Median\"\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_analysis.detector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_analysis.detector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n dateDiff('microsecond', bt1.timestamp, bt2.timestamp) > 0 AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)", "refId": "A" } ], @@ -2656,7 +2657,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -2724,9 +2725,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT min(value) AS \"Minimum\", avg(value) AS \"Average\", max(value) AS \"Maximum\"\nFROM (\n SELECT sbt2.timestamp AS time, dateDiff('microsecond', sbt1.timestamp, sbt2.timestamp) AS value\n FROM suspicious_batch_timestamps sbt1\n INNER JOIN suspicious_batch_timestamps sbt2 ON sbt1.suspicious_batch_id = sbt2.suspicious_batch_id\n WHERE sbt1.stage = 'data_analysis.detector' AND sbt1.status = 'in_process' AND\n sbt2.stage = 'data_analysis.detector' AND sbt2.is_active = False AND\n sbt1.timestamp >= $__fromTime AND sbt2.timestamp <= $__toTime\n)", + "rawSql": "SELECT min(value) AS \"Minimum\", avg(value) AS \"Average\", max(value) AS \"Maximum\"\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2 ON bt1.parent_batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_analysis.detector' AND bt1.status = 'in_process' AND\n bt2.stage = 'data_analysis.detector' AND bt2.status = 'finished' AND\n bt1.instance_name = bt2.instance_name AND\n dateDiff('microsecond', bt1.timestamp, bt2.timestamp) > 0 AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)\n", "refId": "A" } ], @@ -2748,7 +2749,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -2901,9 +2902,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "timeseries", - "rawSql": "SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\nFROM batch_timestamps bt1\nINNER JOIN batch_timestamps bt2\n ON bt1.batch_id = bt2.batch_id\nWHERE bt1.stage = 'log_collection.batch_handler' AND bt1.status = 'completed'\n AND bt2.stage = 'log_filtering.prefilter' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\nORDER BY time ASC;", + "rawSql": "SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2\n ON bt1.batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'log_collection.batch_handler' AND bt1.status = 'completed'\n AND bt2.stage = 'log_filtering.prefilter' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\nORDER BY time ASC;\n\n\n\n\n", "refId": "Latency" }, { @@ -2924,9 +2925,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT\n toStartOfMinute(time) AS time_bucket,\n median(value) AS value\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', 
bt1.timestamp, bt2.timestamp) AS value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2\n ON bt1.batch_id = bt2.batch_id\n WHERE bt1.stage = 'log_collection.batch_handler' AND bt1.status = 'completed'\n AND bt2.stage = 'log_filtering.prefilter' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)\nGROUP BY time_bucket\nORDER BY time_bucket;", + "rawSql": "SELECT\n toStartOfMinute(time) AS time_bucket,\n median(value) AS value\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microseconds', bt1.timestamp, bt2.timestamp) as value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2\n ON bt1.batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'log_collection.batch_handler' AND bt1.status = 'completed'\n AND bt2.stage = 'log_filtering.prefilter' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)\nGROUP BY time_bucket\nORDER BY time_bucket;", "refId": "Median" } ], @@ -2935,7 +2936,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -3086,9 +3087,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "timeseries", - "rawSql": "SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\nFROM batch_timestamps bt1\nINNER JOIN batch_timestamps bt2\n ON bt1.batch_id = bt2.batch_id\nWHERE bt1.stage = 'log_filtering.prefilter' AND bt1.status = 'finished'\n AND bt2.stage = 'data_inspection.inspector' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\nORDER BY time ASC;", + "rawSql": "SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2\n ON bt1.batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'log_filtering.prefilter' AND bt1.status = 'finished'\n AND bt2.stage = 'data_inspection.inspector' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\nORDER BY time ASC;", "refId": "Latency" }, { @@ -3109,9 +3110,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT\n toStartOfMinute(time) AS time_bucket,\n median(value) AS value\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2\n ON bt1.batch_id = bt2.batch_id\n WHERE bt1.stage = 'log_filtering.prefilter' AND bt1.status = 'finished'\n AND bt2.stage = 'data_inspection.inspector' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)\nGROUP BY time_bucket\nORDER BY time_bucket;", + "rawSql": "SELECT\n toStartOfMinute(time) AS time_bucket,\n median(value) AS value\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microseconds', bt1.timestamp, bt2.timestamp) as value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2\n ON bt1.batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'log_filtering.prefilter' AND bt1.status = 'finished'\n AND bt2.stage = 'data_inspection.inspector' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)\nGROUP BY time_bucket\nORDER BY time_bucket;", "refId": "Median" } ], @@ -3120,7 +3121,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" 
}, @@ -3271,9 +3272,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "timeseries", - "rawSql": "SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\nFROM suspicious_batch_timestamps bt1\nINNER JOIN suspicious_batch_timestamps bt2\n ON bt1.suspicious_batch_id = bt2.suspicious_batch_id\nWHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'finished'\n AND bt2.stage = 'data_analysis.detector' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\nORDER BY time ASC;", + "rawSql": "SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2\n ON bt1.batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'finished'\n AND bt2.stage = 'data_analysis.detector' AND bt2.status = 'in_process' AND\n dateDiff('microsecond', bt1.timestamp, bt2.timestamp) > 0 AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\nORDER BY time ASC;\n\n\n", "refId": "Latency" }, { @@ -3294,9 +3295,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT\n toStartOfMinute(time) AS time_bucket,\n median(value) AS value\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microsecond', bt1.timestamp, bt2.timestamp) AS value\n FROM suspicious_batch_timestamps bt1\n INNER JOIN suspicious_batch_timestamps bt2\n ON bt1.suspicious_batch_id = bt2.suspicious_batch_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'finished'\n AND bt2.stage = 'data_analysis.detector' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)\nGROUP BY time_bucket\nORDER BY time_bucket;", + "rawSql": "SELECT\n toStartOfMinute(time) AS time_bucket,\n median(value) AS value\nFROM (\n SELECT bt2.timestamp AS time, dateDiff('microseconds', bt1.timestamp, bt2.timestamp) as value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2\n ON bt1.batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'finished'\n AND bt2.stage = 'data_analysis.detector' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)\nGROUP BY time_bucket\nORDER BY time_bucket;", "refId": "Median" } ], @@ -3305,7 +3306,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -3388,9 +3389,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT min(value) AS \"Minimum\", median(value) AS \"Median\", avg(value) AS \"Average\", max(value) AS \"Maximum\"\nFROM (\n SELECT dateDiff('microseconds', bt1.timestamp, bt2.timestamp) as value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2\n ON bt1.batch_id = bt2.batch_id\n WHERE bt1.stage = 'log_collection.batch_handler' AND bt1.status = 'completed'\n AND bt2.stage = 'log_filtering.prefilter' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)", + "rawSql": "SELECT min(value) AS \"Minimum\", median(value) AS \"Median\", avg(value) AS \"Average\", max(value) AS \"Maximum\"\nFROM (\n SELECT dateDiff('microseconds', bt1.timestamp, bt2.timestamp) as value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2\n ON bt1.batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 
'log_collection.batch_handler' AND bt1.status = 'completed'\n AND bt2.stage = 'log_filtering.prefilter' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)", "refId": "A" } ], @@ -3398,7 +3399,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -3481,9 +3482,9 @@ "table": "" } }, - "pluginVersion": "4.5.1", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT min(value) AS \"Minimum\", median(value) AS \"Median\", avg(value) AS \"Average\", max(value) AS \"Maximum\"\nFROM (\n SELECT dateDiff('microseconds', bt1.timestamp, bt2.timestamp) as value\n FROM batch_timestamps bt1\n INNER JOIN batch_timestamps bt2\n ON bt1.batch_id = bt2.batch_id\n WHERE bt1.stage = 'log_filtering.prefilter' AND bt1.status = 'finished'\n AND bt2.stage = 'data_inspection.inspector' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)", + "rawSql": "SELECT min(value) AS \"Minimum\", median(value) AS \"Median\", avg(value) AS \"Average\", max(value) AS \"Maximum\"\nFROM (\n SELECT dateDiff('microseconds', bt1.timestamp, bt2.timestamp) as value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2\n ON bt1.batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'log_filtering.prefilter' AND bt1.status = 'finished'\n AND bt2.stage = 'data_inspection.inspector' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)", "refId": "A" } ], @@ -3491,7 +3492,7 @@ }, { "datasource": { - "default": true, + "default": false, "type": "grafana-clickhouse-datasource", "uid": "PDEE91DDB90597936" }, @@ -3574,16 +3575,16 @@ "table": "" } }, - "pluginVersion": "4.6.0", + "pluginVersion": "4.10.1", "queryType": "table", - "rawSql": "SELECT min(value) AS \"Minimum\", median(value) AS \"Median\", avg(value) AS \"Average\", max(value) AS \"Maximum\"\nFROM (\n SELECT dateDiff('microseconds', bt1.timestamp, bt2.timestamp) as value\n FROM suspicious_batch_timestamps bt1\n INNER JOIN suspicious_batch_timestamps bt2\n ON bt1.suspicious_batch_id = bt2.suspicious_batch_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'finished'\n AND bt2.stage = 'data_analysis.detector' AND bt2.status = 'in_process' AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)", + "rawSql": "SELECT min(value) AS \"Minimum\", median(value) AS \"Median\", avg(value) AS \"Average\", max(value) AS \"Maximum\"\nFROM (\n SELECT dateDiff('microseconds', bt1.timestamp, bt2.timestamp) as value\n FROM batch_tree bt1\n INNER JOIN batch_tree bt2\n ON bt1.batch_row_id = bt2.parent_batch_row_id\n WHERE bt1.stage = 'data_inspection.inspector' AND bt1.status = 'finished'\n AND bt2.stage = 'data_analysis.detector' AND bt2.status = 'in_process' AND\n dateDiff('microsecond', bt1.timestamp, bt2.timestamp) > 0 AND\n bt1.timestamp >= $__fromTime AND bt2.timestamp <= $__toTime\n)\n\n", "refId": "A" } ], "type": "stat" } ], - "refresh": "auto", + "refresh": "5s", "schemaVersion": 39, "tags": [], "templating": { diff --git a/docker/grafana-provisioning/dashboards/log_volumes.json b/docker/grafana-provisioning/dashboards/log_volumes.json index 18f1a967..df50f55f 100644 --- a/docker/grafana-provisioning/dashboards/log_volumes.json +++ b/docker/grafana-provisioning/dashboards/log_volumes.json @@ -177,7 +177,7 @@ }, "pluginVersion": "4.7.0", "queryType": "table", - "rawSql": "SELECT timestamp, count(DISTINCT value) OVER (ORDER BY 
timestamp) AS cumulative_count\nFROM (\n SELECT timestamp_failed AS timestamp, message_text AS value\n FROM failed_dns_loglines\n WHERE timestamp >= $__fromTime AND timestamp <= $__toTime\n\n UNION ALL\n\n SELECT timestamp, toString(logline_id) AS value\n FROM logline_timestamps\n WHERE is_active = False\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n);", + "rawSql": "SELECT timestamp, count(DISTINCT value) OVER (ORDER BY timestamp) AS cumulative_count\nFROM (\n SELECT timestamp_failed AS timestamp, message_text AS value\n FROM failed_loglines\n WHERE timestamp >= $__fromTime AND timestamp <= $__toTime\n\n UNION ALL\n\n SELECT timestamp, toString(logline_id) AS value\n FROM logline_timestamps\n WHERE is_active = False\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n);", "refId": "B" } ], @@ -371,7 +371,7 @@ }, "pluginVersion": "4.8.2", "queryType": "table", - "rawSql": "SELECT time_bucket, -sum(count) AS \" (negative)\"\nFROM (\n SELECT toStartOf${Granularity}(timestamp) AS time_bucket, count(DISTINCT logline_id) AS count\n FROM (\n SELECT logline_id, timestamp\n FROM (\n SELECT logline_id, timestamp, ROW_NUMBER() OVER (PARTITION BY logline_id ORDER BY timestamp DESC) AS rn\n FROM logline_timestamps\n WHERE is_active = false\n )\n WHERE rn = 1\n AND timestamp >= toStartOf${Granularity}(toDateTime64($__fromTime, 6)) AND timestamp <= toStartOf${Granularity}(toDateTime64($__toTime, 6))\n )\n GROUP BY time_bucket\n\n UNION ALL\n\n SELECT toStartOf${Granularity}(timestamp_failed) AS time_bucket, count(DISTINCT message_text) AS count\n FROM failed_dns_loglines\n WHERE timestamp_failed >= toStartOf${Granularity}(toDateTime64($__fromTime, 6)) AND timestamp_failed <= toStartOf${Granularity}(toDateTime64($__toTime, 6))\n GROUP BY time_bucket\n)\nGROUP BY time_bucket\nORDER BY time_bucket\nWITH FILL\nFROM toStartOf${Granularity}(toDateTime64($__fromTime, 6))\nTO toStartOf${Granularity}(toDateTime64($__toTime, 6))\nSTEP toInterval${Granularity}(1);", + "rawSql": "SELECT time_bucket, -sum(count) AS \" (negative)\"\nFROM (\n SELECT toStartOf${Granularity}(timestamp) AS time_bucket, count(DISTINCT logline_id) AS count\n FROM (\n SELECT logline_id, timestamp\n FROM (\n SELECT logline_id, timestamp, ROW_NUMBER() OVER (PARTITION BY logline_id ORDER BY timestamp DESC) AS rn\n FROM logline_timestamps\n WHERE is_active = false\n )\n WHERE rn = 1\n AND timestamp >= toStartOf${Granularity}(toDateTime64($__fromTime, 6)) AND timestamp <= toStartOf${Granularity}(toDateTime64($__toTime, 6))\n )\n GROUP BY time_bucket\n\n UNION ALL\n\n SELECT toStartOf${Granularity}(timestamp_failed) AS time_bucket, count(DISTINCT message_text) AS count\n FROM failed_loglines\n WHERE timestamp_failed >= toStartOf${Granularity}(toDateTime64($__fromTime, 6)) AND timestamp_failed <= toStartOf${Granularity}(toDateTime64($__toTime, 6))\n GROUP BY time_bucket\n)\nGROUP BY time_bucket\nORDER BY time_bucket\nWITH FILL\nFROM toStartOf${Granularity}(toDateTime64($__fromTime, 6))\nTO toStartOf${Granularity}(toDateTime64($__toTime, 6))\nSTEP toInterval${Granularity}(1);", "refId": "Processed" } ], @@ -2086,7 +2086,7 @@ }, "pluginVersion": "4.6.0", "queryType": "table", - "rawSql": "SELECT 'LogCollector' AS id, 'LogCollector' AS title, (\n SELECT count(*)\n FROM logline_timestamps\n WHERE stage = 'log_collection.collector'\n AND status = 'finished'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n) AS successful_count, (\n SELECT count(*)\n FROM failed_dns_loglines\n WHERE timestamp_failed >= $__fromTime AND 
timestamp_failed <= $__toTime\n) AS filteredout_count,\nsuccessful_count / (successful_count + filteredout_count) AS arc__success,\nfilteredout_count / (successful_count + filteredout_count) AS arc__filteredout,\nif(isNaN(arc__success), '-', CONCAT(toString(round(arc__success * 100, 1)), '%')) AS mainstat\n\nUNION ALL \n\nSELECT 'BatchHandler' AS id, 'BatchHandler' AS title, \n0 AS successful_count, \n0 AS filteredout_count, \nif((\n SELECT count(*) = 0\n FROM batch_timestamps\n WHERE stage = 'log_collection.batch_handler'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n), 0, 1) AS arc__success, \n0 AS arc__filteredout,\nif((\n SELECT count(*) = 0\n FROM batch_timestamps\n WHERE stage = 'log_collection.batch_handler'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n), '-', CONCAT(toString(round(arc__success * 100, 1)), '%')) AS mainstat\n\nUNION ALL\n\nSELECT 'Prefilter' AS id, 'Prefilter' AS title, (\n SELECT sum(message_count)\n FROM batch_timestamps\n WHERE stage = 'log_filtering.prefilter'\n AND status = 'finished'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n) AS successful_count, (\n SELECT count(*)\n FROM logline_timestamps\n WHERE stage = 'log_filtering.prefilter'\n AND status = 'filtered_out'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n) AS filteredout_count,\nsuccessful_count / (successful_count + filteredout_count) AS arc__success,\nfilteredout_count / (successful_count + filteredout_count) AS arc__filteredout,\nif(isNaN(arc__success), '-', CONCAT(toString(round(arc__success * 100, 1)), '%')) AS mainstat\n\nUNION ALL\n\nSELECT 'Inspector' AS id, 'Inspector' AS title, (\n SELECT sum(message_count)\n FROM suspicious_batch_timestamps\n WHERE stage = 'data_inspection.inspector'\n AND status = 'finished'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n) AS successful_count, (\n SELECT sum(message_count)\n FROM batch_timestamps\n WHERE stage = 'data_inspection.inspector'\n AND status = 'filtered_out'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n) AS filteredout_count,\nsuccessful_count / (successful_count + filteredout_count) AS arc__success,\nfilteredout_count / (successful_count + filteredout_count) AS arc__filteredout,\nif(isNaN(arc__success), '-', CONCAT(toString(round(arc__success * 100, 1)), '%')) AS mainstat\n\nUNION ALL\n\nSELECT 'Detector' AS id, 'Detector' AS title, (\n SELECT sum(message_count)\n FROM suspicious_batch_timestamps\n WHERE stage = 'data_analysis.detector'\n AND status = 'finished'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n) AS successful_count, (\n SELECT sum(message_count)\n FROM batch_timestamps\n WHERE stage = 'data_analysis.detector'\n AND status = 'filtered_out'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n) AS filteredout_count,\nsuccessful_count / (successful_count + filteredout_count) AS arc__success,\nfilteredout_count / (successful_count + filteredout_count) AS arc__filteredout,\nif(isNaN(arc__success), '-', CONCAT(toString(round(arc__success * 100, 1)), '%')) AS mainstat", + "rawSql": "SELECT 'LogCollector' AS id, 'LogCollector' AS title, (\n SELECT count(*)\n FROM logline_timestamps\n WHERE stage = 'log_collection.collector'\n AND status = 'finished'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n) AS successful_count, (\n SELECT count(*)\n FROM failed_loglines\n WHERE timestamp_failed >= $__fromTime AND timestamp_failed <= $__toTime\n) AS filteredout_count,\nsuccessful_count / (successful_count + filteredout_count) AS 
arc__success,\nfilteredout_count / (successful_count + filteredout_count) AS arc__filteredout,\nif(isNaN(arc__success), '-', CONCAT(toString(round(arc__success * 100, 1)), '%')) AS mainstat\n\nUNION ALL \n\nSELECT 'BatchHandler' AS id, 'BatchHandler' AS title, \n0 AS successful_count, \n0 AS filteredout_count, \nif((\n SELECT count(*) = 0\n FROM batch_timestamps\n WHERE stage = 'log_collection.batch_handler'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n), 0, 1) AS arc__success, \n0 AS arc__filteredout,\nif((\n SELECT count(*) = 0\n FROM batch_timestamps\n WHERE stage = 'log_collection.batch_handler'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n), '-', CONCAT(toString(round(arc__success * 100, 1)), '%')) AS mainstat\n\nUNION ALL\n\nSELECT 'Prefilter' AS id, 'Prefilter' AS title, (\n SELECT sum(message_count)\n FROM batch_timestamps\n WHERE stage = 'log_filtering.prefilter'\n AND status = 'finished'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n) AS successful_count, (\n SELECT count(*)\n FROM logline_timestamps\n WHERE stage = 'log_filtering.prefilter'\n AND status = 'filtered_out'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n) AS filteredout_count,\nsuccessful_count / (successful_count + filteredout_count) AS arc__success,\nfilteredout_count / (successful_count + filteredout_count) AS arc__filteredout,\nif(isNaN(arc__success), '-', CONCAT(toString(round(arc__success * 100, 1)), '%')) AS mainstat\n\nUNION ALL\n\nSELECT 'Inspector' AS id, 'Inspector' AS title, (\n SELECT sum(message_count)\n FROM suspicious_batch_timestamps\n WHERE stage = 'data_inspection.inspector'\n AND status = 'finished'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n) AS successful_count, (\n SELECT sum(message_count)\n FROM batch_timestamps\n WHERE stage = 'data_inspection.inspector'\n AND status = 'filtered_out'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n) AS filteredout_count,\nsuccessful_count / (successful_count + filteredout_count) AS arc__success,\nfilteredout_count / (successful_count + filteredout_count) AS arc__filteredout,\nif(isNaN(arc__success), '-', CONCAT(toString(round(arc__success * 100, 1)), '%')) AS mainstat\n\nUNION ALL\n\nSELECT 'Detector' AS id, 'Detector' AS title, (\n SELECT sum(message_count)\n FROM suspicious_batch_timestamps\n WHERE stage = 'data_analysis.detector'\n AND status = 'finished'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n) AS successful_count, (\n SELECT sum(message_count)\n FROM batch_timestamps\n WHERE stage = 'data_analysis.detector'\n AND status = 'filtered_out'\n AND timestamp >= $__fromTime AND timestamp <= $__toTime\n) AS filteredout_count,\nsuccessful_count / (successful_count + filteredout_count) AS arc__success,\nfilteredout_count / (successful_count + filteredout_count) AS arc__filteredout,\nif(isNaN(arc__success), '-', CONCAT(toString(round(arc__success * 100, 1)), '%')) AS mainstat", "refId": "Nodes" }, { diff --git a/docker/grafana-provisioning/dashboards/overview.json b/docker/grafana-provisioning/dashboards/overview.json index a935300d..eedd5611 100644 --- a/docker/grafana-provisioning/dashboards/overview.json +++ b/docker/grafana-provisioning/dashboards/overview.json @@ -193,7 +193,7 @@ }, "pluginVersion": "4.6.0", "queryType": "table", - "rawSql": "SELECT concat(rowNumberInAllBlocks() + 1, '.') AS \"Rank\", client_ip AS \"Client IP address\", count(logline_id) AS \"# Total Requests\"\nFROM dns_loglines\nWHERE \"Client IP address\" IN (\n SELECT DISTINCT client_ip\n 
FROM alerts\n    WHERE alert_timestamp >= $__fromTime AND alert_timestamp <= $__toTime\n)\nGROUP BY \"Client IP address\"\nORDER BY \"# Total Requests\" DESC\nLIMIT 5", +        "rawSql": "SELECT concat(rowNumberInAllBlocks() + 1, '.') AS \"Rank\", src_ip AS \"Client IP address\", count(logline_id) AS \"# Total Requests\"\nFROM loglines\nWHERE \"Client IP address\" IN (\n    SELECT DISTINCT src_ip\n    FROM alerts\n    WHERE alert_timestamp >= $__fromTime AND alert_timestamp <= $__toTime\n)\nGROUP BY \"Client IP address\"\nORDER BY \"# Total Requests\" DESC\nLIMIT 5", "refId": "A" } ], diff --git a/docker/grafana-provisioning/datasources.yaml b/docker/grafana-provisioning/datasources.yaml index b0708fc4..57c157fa 100644 --- a/docker/grafana-provisioning/datasources.yaml +++ b/docker/grafana-provisioning/datasources.yaml @@ -11,3 +11,11 @@ datasources: username: default tlsSkipVerify: false + + - name: prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + jsonData: + httpMethod: POST diff --git a/docker/prometheus/prometheus.yml b/docker/prometheus/prometheus.yml new file mode 100644 index 00000000..65b33d5e --- /dev/null +++ b/docker/prometheus/prometheus.yml @@ -0,0 +1,7 @@ +global: + scrape_interval: 5s + +scrape_configs: + - job_name: 'kafka-exporter' + static_configs: + - targets: ['kafka-exporter:9308'] diff --git a/docs/configuration.rst b/docs/configuration.rst index 56d453f9..8beb49d6 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1,17 +1,12 @@ Logline format configuration ............................ -Configure the format and validation rules for DNS server loglines through flexible field definitions that -support timestamps, IP addresses, regular expressions, and list-based validation. +If a user wants to add a new inspector or detector, it might be necessary to adapt the logline formats if the preexisting ones do not contain the needed information. +To do so, one can adapt or define log collector formats in the main configuration file (``config.yaml``) under ``pipeline.log_collection.collectors.[collector_name].required_log_information``. +Adding a new log collector enables prefilters (and later on inspectors and detectors) to consume from a new Kafka topic. -Configuration Overview -^^^^^^^^^^^^^^^^^^^^^^ - -Users can define the format and fields of their DNS server loglines through the -``pipeline.log_collection.collector.logline_format`` parameter. This configuration allows complete customization -of field types, validation rules, and filtering criteria for incoming log data. - -For example, a logline might look like this: +Currently, we support timestamps, IP addresses, regular expressions, and list-based validation for data fields in a logline. +For example, a logline for the DNS protocol might look like this: .. code-block:: console @@ -38,11 +33,9 @@ for proper pipeline operation, while others are forbidden as they are reserved f :header-rows: 1 :widths: 15 50 - * - Category - - Field Names - * - **Required** - - ``timestamp``, ``status_code``, ``client_ip``, ``record_type``, ``domain_name`` - * - **Forbidden** + * - Required - ``ts``, ``src_ip`` + * - Forbidden - ``logline_id``, ``batch_id`` **Required fields** must be present in the configuration as they are essential for pipeline processing. @@ -142,15 +135,23 @@ functionality of the modules. ``pipeline.log_collection`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -..
list-table:: ``collector`` Parameters +.. list-table:: ``collectors`` Parameters :header-rows: 1 :widths: 30 70 * - Parameter - Description - * - logline_format - - Defines the expected format for incoming log lines. See the :ref:`logline-format-configuration` - section for more details. + * - name + - A unique name amongst the ``collectors`` configurations to identify the collector instance. + * - protocol_base + - The lowercase protocol name to ingest data from. Currently supported: ``dns`` and ``http``. + * - required_log_information + - Defines the expected format for incoming log lines. See the :ref:`Logline format configuration` section for more + details. + +Each log collector has a BatchHandler instance. Default configurations for all batch handlers are defined in ``pipeline.log_collection.default_batch_handler_config``. +You can override these values for each log collector instance by adjusting the values inside ``pipeline.log_collection.collectors.[collector_instance].batch_handler_config_override``. +The following list shows the available configuration options. A hedged sketch of a complete collector entry is shown further below, after the architecture figure. .. list-table:: ``batch_handler`` Parameters :header-rows: 1 @@ -173,52 +174,50 @@ functionality of the modules. - ``64`` - The number of bits to trim from the client's IPv6 address for use as `Subnet ID`. + + + +``pipeline.log_filtering`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. list-table:: ``prefilter`` Parameters + :header-rows: 1 + :widths: 30 70 + + * - Parameter + - Description + * - name + - A unique name amongst the prefilter configurations to identify the prefilter instance. + * - relevance_method + - The name of the method used to check if a given logline is relevant for further inspection. + This check can be skipped by choosing ``"no_relevance_check"``. + Available configurations are: ``"no_relevance_check"``, ``"check_dga_relevance"`` + * - collector_name + - The name of the collector configuration the prefilter consumes data from. The same collector name can be referenced in multiple prefilter configurations. + ``pipeline.data_inspection`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. list-table:: ``inspector`` Parameters :header-rows: 1 - :widths: 30 20 50 - + :widths: 30 70 * - Parameter - - Default Value - Description - * - mode - - ``univariate`` (options: ``multivariate``, ``ensemble``) - - Mode of operation for the data inspector. - * - ensemble.model - - ``WeightEnsemble`` - - Model to use when inspector mode is ``ensemble``. - * - ensemble.module - - ``streamad.process`` - - Python module for the ensemble model. - * - ensemble.model_args - - - - Additional Arguments for the ensemble model. - * - models.model - - ``ZScoreDetector`` - - Model to use for data inspection - * - models.module - - ``streamad.model`` - - Base python module for inspection models - * - models.model_args - - - - Additional arguments for the model - * - models.model_args.is_global - - ``false`` - - - * - anomaly_threshold - - ``0.01`` - - Threshold for classifying an observation as an anomaly. - * - score_threshold - - ``0.5`` - - Threshold for the anomaly score. - * - time_type - - ``ms`` - - Unit of time used in time range calculations. - * - time_range - - ``20`` - - Time window for data inspection + * - name + - A unique name amongst the inspector configurations to identify the inspector instance. + * - prefilter_name + - The name of the prefilter configuration the inspector consumes data from. The same prefilter name can be referenced in multiple inspector configurations.
+ * - inspector_module_name + - Name of the Python file in ``"src/inspector/plugins/"`` the inspector should use. + * - inspector_class_name + - Name of the class inside the ``inspector_module`` to use. + + + +Inspectors can be added easily by implementing the base class for an inspector. More information is available at :ref:`inspection_stage`. +Each inspector might need additional configurations. These are also documented at :ref:`inspection_stage`. + +To entirely skip the anomaly detection phase, you can set ``inspector_module_name: "no_inspector"`` and ``inspector_class_name: "NoInspector"``. ``pipeline.data_analysis`` ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -230,6 +229,18 @@ functionality of the modules. * - Parameter - Default Value - Description + * - name + - + - A unique name amongst the detector configurations to identify the detector instance. + * - inspector_name + - + - The name of the inspector configuration the detector consumes data from. The same inspector name can be referenced in multiple detector configurations. + * - detector_module_name + - + - Name of the Python file in ``"src/detector/plugins/"`` the detector should use. + * - detector_class_name + - + - Name of the class inside the ``detector_module`` to use. * - model - ``rf`` option: ``XGBoost`` - Model to use for the detector @@ -243,6 +254,30 @@ functionality of the modules. - ``0.5`` - Threshold for the detector's classification. + +``pipeline.zeek`` +^^^^^^^^^^^^^^^^^ + +To configure the Zeek sensors to ingest data, an entry in the ``pipeline.zeek.sensors`` dictionary must be adapted or added. +Each of the configured sensors is meant to run on a different machine or network interface to collect data. +Each configured instance needs to be set up using the ``docker-compose.yaml``. The dictionary name needs to exactly correspond with the +name of the instance configured there. +Each sensor has the following configuration parameters: + +.. list-table:: ``zeek`` Parameters + :header-rows: 1 + :widths: 30 70 + + * - Parameter + - Description + * - static_analysis + - A bool to indicate whether or not a static analysis should be executed. If ``true``, the PCAPs from ``data/test_pcaps``, which are mounted to + each Zeek instance, are analyzed. If set to ``false``, a network analysis is executed on the configured network interfaces. + * - protocols + - List of lowercase names of protocols the Zeek sensor should be monitoring and sending to the Kafka queues. Currently supported: ``dns`` and ``http``. + * - interfaces + - List of network interface names for a network analysis to monitor. As the Zeek containers run in ``host`` mode, all network interfaces of the node are automatically mounted and ready to be scraped. + Environment Configuration ......................... @@ -256,11 +291,12 @@ The following parameters control the infrastructure of the software. - Default Value - Description * - kafka_brokers - - ``hostname: kafka1, port: 8097``, ``hostname: kafka2, port: 8098``, ``hostname: kafka3, port: 8099`` - - Hostnames and ports of the Kafka brokers, given as list. - * - kafka_topics + - ``hostname: kafka1, port: 8097, node_ip: 0.0.0.0``, ``hostname: kafka2, port: 8098, node_ip: 0.0.0.0``, ``hostname: kafka3, port: 8099, node_ip: 0.0.0.0`` + - Hostnames and ports of the Kafka brokers, given as a list. The node IP is crucial and needs to be set to the actual IP of the system the Kafka broker will be running on. + * - kafka_topics_prefix - Not given here - - Kafka topic names given as strings.
These topics are used for the data transfer between the modules. + - Kafka topic name prefixes given as strings. These prefix names are used to construct the actual topic names based on the instance name (e.g. a collector instance name) that produces for the given stage. + (E.g., a prefilter instance name is added as a suffix to the prefilter_to_inspector prefix so the inspector knows where to consume from.) * - monitoring.clickhouse_server.hostname - ``clickhouse-server`` - Hostname of the ClickHouse server. Used by Grafana. diff --git a/docs/media/heidgaf_architecture.svg b/docs/media/heidgaf_architecture.svg new file mode 100644 index 00000000..bef8170d --- /dev/null +++ b/docs/media/heidgaf_architecture.svg @@ -0,0 +1,4 @@ + + + +
Log Server
Log Collector
Batch Sender
Prefilter
Inspector
Detector
ZooKeeper
Kafka Broker
Zeek Sensor
Kafka Broker
Kafka Broker
Log Generation
Log Aggregation
Collection
Filtering
Inspection
Detection
Legend
Consume
Produce
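To make the collector layout described in the configuration documentation above more tangible, the following is a minimal, hypothetical excerpt of ``pipeline.log_collection``. The instance name, field entries, and override values are placeholders, and whether ``collectors`` is a list or a mapping should be verified against ``config.yaml``; only the key names documented above are assumed.

.. code-block:: yaml

   pipeline:
     log_collection:
       collectors:
         - name: "dns_collector"            # unique collector instance name (placeholder)
           protocol_base: "dns"             # lowercase protocol to ingest: dns or http
           required_log_information:
             # [ field name as produced by Zeek, field type class, optional allowed list, optional relevant list ]
             - [ "ts", Timestamp, "%Y-%m-%dT%H:%M:%S.%fZ" ]     # required field
             - [ "src_ip", IpAddress ]                          # required field
             - [ "status_code", ListItem, [ "NOERROR", "NXDOMAIN" ], [ "NXDOMAIN" ] ]
           batch_handler_config_override: {}  # per-collector overrides of the default batch handler settings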
diff --git a/docs/pipeline.rst b/docs/pipeline.rst index 388c2938..a51aef8e 100644 --- a/docs/pipeline.rst +++ b/docs/pipeline.rst @@ -7,22 +7,64 @@ Overview The core component of the software's architecture is its data pipeline. It consists of six stages/modules, and data traverses through it using Apache Kafka. -.. image:: media/pipeline_overview.png +.. image:: media/heidgaf_architecture.png -.. _stage-1-log-storage: +Stage 1: Log Aggregation +======================== -Stage 1: Log Storage +The Log Aggregation stage harnesses multiple Zeek sensors to ingest data from static (i.e. PCAP files) and dynamic sources (i.e. traffic from network interfaces). +The traffic is split per protocol into Kafka topics and sent to the LogServer for the Log Storage phase. + +Overview +-------- + +The :class:`ZeekConfigurationHandler` takes care of the setup of a containerized Zeek sensor. It reads in the main configuration file and +adjusts the protocols to listen on, the log formats of incoming traffic, and the Kafka queues to send to. + +The :class:`ZeekAnalysisHandler` starts the actual Zeek instance. Based on the configuration, it either starts Zeek in a cluster for the specified network interfaces +or in a single-node instance for static analyses. + +Main Classes +------------ + +.. py:currentmodule:: src.zeek.zeek_config_handler +.. autoclass:: ZeekConfigurationHandler + +.. py:currentmodule:: src.zeek.zeek_analysis_handler +.. autoclass:: ZeekAnalysisHandler + +Usage and configuration +----------------------- + +An analysis can be performed by tapping network interfaces or by injecting PCAP files. +To adjust this, set the ``pipeline.zeek.sensors.[sensor_name].static_analysis`` value to ``true`` or ``false``. + +- **``pipeline.zeek.sensors.[sensor_name].static_analysis``** set to ``true``: + + - A static analysis is executed. The PCAP files are taken from the repository root directory under ``data/test_pcaps`` and mounted into the Zeek container. All files ending in ``.pcap`` are then read and analyzed by Zeek. + Please note that we do not recommend using several Zeek instances for a static analysis, as the data will be read in multiple times, which impacts the benchmarks accordingly. + +- **``pipeline.zeek.sensors.[sensor_name].static_analysis``** set to ``false``: + + - A network analysis is performed for the interfaces listed in ``pipeline.zeek.sensors.[sensor_name].interfaces``. + + +You can start multiple instances of Zeek by adding more entries to the dictionary ``pipeline.zeek.sensors``. +Necessary attributes are: +- ``pipeline.zeek.sensors.[sensor_name].static_analysis`` : **bool** +- if no static analysis is performed: ``pipeline.zeek.sensors.[sensor_name].interfaces`` : **list** +- ``pipeline.zeek.sensors.[sensor_name].protocols`` : **list** + +Stage 2: Log Storage ==================== -This stage serves as the central contact point for all data. Data is read and entered into the pipeline. +This stage serves as the central ingestion point for all data. Overview -------- -The :class:`LogServer` class is the core component of this stage. It reads data from one or multiple data sources and -enters it into the pipeline by sending it to Kafka, where it can be obtained by the following module. For monitoring, -it logs all ingoing log lines including their timestamps of entering and leaving the module. +The :class:`LogServer` class is the core component of this stage. It reads the Zeek sensors' output and directs it via Kafka to the following stages.
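To make the hand-off from the Zeek sensors to the :class:`LogServer` concrete, a minimal, hypothetical configuration excerpt is sketched below. The sensor name, interface names, and prefix string are placeholders; only the key paths named in the surrounding documentation are assumed to exist.

.. code-block:: yaml

   pipeline:
     zeek:
       sensors:
         sensor_1:                        # placeholder; must match the service name in docker-compose.yaml
           static_analysis: false         # false: live capture on the listed interfaces
           protocols: [ "dns", "http" ]   # lowercase protocol names to monitor
           interfaces: [ "eth0" ]         # only needed when static_analysis is false

   environment:
     kafka_topics_prefix:
       pipeline:
         logserver_in: "pipeline-logserver_in"  # placeholder prefix; the lowercase protocol name
                                                # (e.g. dns) is appended as the topic suffix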
Main Class ---------- @@ -33,26 +75,16 @@ Main Class Usage and configuration ----------------------- +Currently, the :class:`LogServer` reads from the Kafka queues specified by Zeek. These have a common prefix, specified in ``environment.kafka_topics_prefix.pipeline.logserver_in``. The suffix is the lowercase protocol name of the traffic. +The LogServer itself has no further configuration. + The :class:`LogServer` simultaneously listens on a Kafka topic and reads from an input file. The configuration allows changing the Kafka topic to listen on, as well as the file name to read from. -The Kafka topic to listen on can be changed through setting the ``environment.kafka_topics.pipeline.logserver_in`` -field in `config.yaml`. Changing the file name to read from differs depending on your environment: - -- **Without Docker**: +The Kafka topics to listen on receive their input from Zeek. The traffic is split per protocol, thus there are several topics to listen to. +These have a common prefix, specified in ``environment.kafka_topics_prefix.pipeline.logserver_in``. The suffix is the lowercase protocol name of the traffic. - - To change the input file path, change ``pipeline.log_storage.logserver.input_file`` in the `config.yaml`. The - default setting is ``"/opt/file.txt"``. - -- **With Docker**: - - - Docker mounts the file specified in ``MOUNT_PATH`` in the file `docker/.env`. By default, this is set to - ``../../default.txt``, which refers to the file `docker/default.txt`. - - By changing this variable, the file to be mounted can be set. Please note that in this case, the variable specified - in the `config.yaml` must be set to the default value. - - -Stage 2: Log Collection +Stage 3: Log Collection ======================= The Log Collection stage validates and processes incoming loglines from the Log Storage stage, organizes them into @@ -118,7 +150,9 @@ LogCollector ............ The :class:`LogCollector` connects to the :class:`LogServer` to retrieve one logline, which it then processes and -validates. The logline is parsed into its respective fields, each checked for correct type and format: +validates. The logline is parsed into its respective fields, each checked for correct type and format. +For each log collector configuration in ``pipeline.log_collection.collectors``, a process is spun up in the resulting Docker container, +allowing for multiprocessing and threading. - **Field Validation**: @@ -144,18 +178,22 @@ validates. The logline is parsed into its respective fields, each checked for co - **Log Line Format**: - - By default, log lines have the following format: + As the log information differs for each protocol, there is a default format per protocol. + This can either be adapted, or a completely new format can be added. For more information, + please refer to the section :ref:`Logline format configuration`. .. code-block:: - TIMESTAMP STATUS CLIENT_IP DNS_IP HOST_DOMAIN_NAME RECORD_TYPE RESPONSE_IP SIZE + DNS default logline format + + TS STATUS SRC_IP DNS_IP HOST_DOMAIN_NAME RECORD_TYPE RESPONSE_IP SIZE +----------------------+------------------------------------------------+ | **Field** | **Description** | +======================+================================================+ - | ``TIMESTAMP`` | The date and time when the log entry was | - | | recorded. The format is configurable through | - | | the ``logline_format`` in ``config.yaml``. | + | ``TS`` | The date and time when the log entry was | + | | recorded. Formatted as | + | | ``YYYY-MM-DDTHH:MM:SS.sssZ``.
| | | | | | - **Default Format**: ``%Y-%m-%dT%H:%M:%S.%fZ``| | | (ISO 8601 with microseconds and UTC). | @@ -168,7 +206,7 @@ validates. The logline is parsed into its respective fields, each checked for co | ``STATUS`` | The status of the DNS query, e.g., ``NOERROR``,| | | ``NXDOMAIN``. | +----------------------+------------------------------------------------+ - | ``CLIENT_IP`` | The IP address of the client that made the | + | ``SRC_IP`` | The IP address of the client that made the | | | request. | +----------------------+------------------------------------------------+ | ``DNS_IP`` | The IP address of the DNS server processing | @@ -187,7 +225,50 @@ validates. The logline is parsed into its respective fields, each checked for co | | bytes. | +----------------------+------------------------------------------------+ - - Users can change the format and field types, as described in the :ref:`logline-format-configuration` section. + + .. code-block:: + + HTTP default logline format + + TS SRC_IP SRC_PORT DST_IP DST_PORT METHOD URI STATUS_CODE REQUEST_BODY RESPONSE_BODY + + +----------------------+------------------------------------------------+ + | **Field** | **Description** | + +======================+================================================+ + | ``TS`` | The date and time when the log entry was | + | | recorded. Formatted as | + | | ``YYYY-MM-DDTHH:MM:SS.sssZ``. | + | | | + | | - **Format**: ``%Y-%m-%dT%H:%M:%S.%f`` (with | + | | microseconds truncated to milliseconds). | + | | - **Time Zone**: ``Z`` | + | | indicates Zulu time (UTC). | + | | - **Example**: ``2024-07-28T14:45:30.123Z`` | + | | | + | | This format closely resembles ISO 8601, with | + | | milliseconds precision. | + +----------------------+------------------------------------------------+ + | ``SRC_IP`` | The IP address of the client that made the | + | | request. | + +----------------------+------------------------------------------------+ + | ``SRC_PORT`` | The source port of the cliend making the | + | | request | + +----------------------+------------------------------------------------+ + | ``DST_IP`` | The IP address of the target server for the | + | | request. | + +----------------------+------------------------------------------------+ + | ``DST_PORT`` | The port of the target server | + +----------------------+------------------------------------------------+ + | ``METHOD`` | The HTTP method used (e.g. ``GET, POST``) | + +----------------------+------------------------------------------------+ + | ``URI`` | Path accessed in the request (e.g. ``/admin``) | + +----------------------+------------------------------------------------+ + | ``STATUS_CODE`` | The HTTP status code returned (e.g. ``500``) | + +----------------------+------------------------------------------------+ + | ``REQUEST_BODY`` | The HTTP request payload (might be encrypted) | + +----------------------+------------------------------------------------+ + | ``RESPONSE_BODY`` | The HTTP response body (might be encrypted) | + +----------------------+------------------------------------------------+ BufferedBatch ............. @@ -199,7 +280,7 @@ The :class:`BufferedBatch` manages the buffering of validated loglines as well a - Collects log entries into a ``batch`` dictionary, with the ``subnet_id`` as key. - Uses a ``buffer`` per key to concatenate and send both the current and previous batches together. 
- This approach helps detect errors or attacks that may occur at the boundary between two batches when analyzed in - :ref:`stage-4-inspection` and :ref:`stage-5-detection`. + :ref:`Data-Inspection` and :ref:`Data-Analysis`. - All batches get sorted by their timestamps at completion to ensure correct chronological order. - A `begin_timestamp` and `end_timestamp` per key are extracted and sent as metadata (needed for analysis). These are taken from the chronologically first and last message in a batch. @@ -236,17 +317,18 @@ The :class:`BufferedBatchSender` manages the sending of validated loglines store Configuration ------------- -The :class:`LogCollector` checks the validity of incoming loglines. For this, it uses the ``logline_format`` -configured in the ``config.yaml``. Section :ref:`logline-format-configuration` provides detailed information -on how to customize the logline format and field definitions. The LogCollector uses the following -configuration options from the configuration: +The instances of the class :class:`LogCollector` check the validity of incoming loglines. For this, they use the ``required_log_information`` configured +in the ``config.yaml``. + +Configurations can be arbitrarily added, adjusted, and removed. This is especially useful if certain detectors need specialized log fields. +The following convention needs to be adhered to: -- **LogCollector Analyzation Criteria**: +- Each entry in the ``required_log_information`` needs to be a list +- The first item is the name of the data field as configured in Zeek +- The second item is the class name the value should be mapped to for validation +- Depending on the class, the third item is a list of valid inputs +- Depending on the class, the fourth item is a list of relevant inputs - - Valid status codes: The accepted status codes for logline validation. This is defined in the field with name - ``"status_code"`` in the ``logline_format`` list. - - Valid record types: The accepted DNS record types for logline validation. This is defined in the field with name - ``"record_type"`` in the ``logline_format`` list. .. _buffer-functionality: @@ -328,7 +410,7 @@ Example Workflow This class design effectively manages the batching and buffering of messages, allowing for precise timestamp tracking and efficient data processing across different message streams. -Stage 3: Log Filtering +Stage 4: Log Filtering ====================== The Log Filtering stage processes batches from the Log Collection stage and filters out irrelevant entries based on configurable relevance criteria, ensuring only meaningful data proceeds to anomaly detection. @@ -363,6 +445,10 @@ Main Class Usage ----- +One :class:`Prefilter` per prefilter configuration in ``pipeline.log_filtering`` is started. Each instance consumes from a Kafka topic whose name depends on the log collector the prefilter builds upon. +The prefix for each topic is defined in ``environment.kafka_topics_prefix.batch_sender_to_prefilter`` and the suffix is the configured log collector name. +The prefilters extract the log entries and apply a filter function (or relevance function) to retain only those entries that match the specified requirements. A hedged configuration sketch is shown below. + Data Flow and Processing ........................
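To illustrate the prefilter wiring described in the Usage section above, the following is a minimal, hypothetical ``pipeline.log_filtering`` excerpt. The instance and collector names are placeholders, and the exact nesting should be verified against ``config.yaml``; only the parameter names from the prefilter table (``name``, ``relevance_method``, ``collector_name``) are assumed.

.. code-block:: yaml

   pipeline:
     log_filtering:
       - name: "dns_prefilter"                   # unique prefilter instance name (placeholder)
         relevance_method: "check_dga_relevance" # or "no_relevance_check" to skip filtering
         collector_name: "dns_collector"         # collector instance to consume from (placeholder)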
@@ -397,13 +483,21 @@ The implementation includes robust error handling: Configuration ------------- -Filtering behavior is controlled through the ``logline_format`` configuration in ``config.yaml``: - -- **Relevance Criteria**: - - - For fields of type ``ListItem``, the fourth entry (relevant_list) defines which values are considered relevant - - If no relevant_list is specified, all allowed values are deemed relevant - - Multiple fields can have relevance criteria, and all must pass for a logline to be retained +To customize the filtering behavior, the relevance functions can be extended and adjusted in ``src/base/logline_handler`` and referenced in the ``config.yaml`` by their function name. +Checks can be skipped by referencing the ``no_relevance_check`` function. +We currently support the following relevance methods: + +---------------------------+-------------------------------------------------------------+ + | **Name** | **Description** | + +===========================+=============================================================+ + | ``no_relevance_check`` | Skip the relevance check of the prefilters entirely. | + +---------------------------+-------------------------------------------------------------+ + | ``check_dga_relevance`` | Function to filter requests based on ListItems in the | + | | log collector configuration. Using the fourth item in the | + | | list as a list of relevant status codes, only the requests | + | | and responses that include an **NXDOMAIN** status code | + | | are forwarded. | + +---------------------------+-------------------------------------------------------------+ - **Example Configuration**: @@ -427,46 +521,77 @@ The :class:`Prefilter` provides comprehensive monitoring: .. _stage-4-inspection: -Stage 4: Inspection -=================== +Stage 5: Inspection +======================== +.. _inspection_stage: Overview -------- -The **Inspection** stage performs time-series-based anomaly detection on prefiltered DNS request batches. -Its primary purpose is to reduce the load on the `Detection` stage by filtering out non-suspicious traffic early. +The `Inspector` stage is responsible for running time-series-based anomaly detection on prefiltered batches. This stage is essential to reduce +the load on the `Detection` stage; otherwise, resource complexity increases disproportionately. -This stage uses StreamAD models—supporting univariate, multivariate, and ensemble techniques—to detect unusual patterns -in request volume and packet sizes. +Main Classes +------------ -Main Class ----------- +.. py:currentmodule:: src.inspector.inspector +.. autoclass:: InspectorAbstractBase .. py:currentmodule:: src.inspector.inspector -.. autoclass:: Inspector +.. autoclass:: InspectorBase -The :class:`Inspector` class is responsible for: +The :class:`InspectorBase` is the primary class for inspectors. It holds common functionality and is responsible for data ingestion, sending, etc. Any inspector builds on top of this +class and needs to implement the methods specified by :class:`InspectorAbstractBase`. The class implementations need to go into ``"src/inspector/plugins"``. -- Loading batches from Kafka -- Extracting time-series features (e.g., frequency and packet size) -- Applying anomaly detection models -- Forwarding suspicious batches to the detector stage -Usage ------ -Data Flow and Processing ........................ +Usage and Configuration +----------------------- + +We currently support the following inspectors: + +..
list-table:: + :header-rows: 1 + :widths: 15 30 55 + + * - **Name** + - **Description** + - **Configuration** + * - ``no_inspector`` + - Skip the anomaly inspection of data entirely. + - No additional configuration + * - ``stream_ad_inspector`` + - Uses StreamAD models for anomaly detection. All StreamAD models are supported (univariate, multivariate, ensembles). + - - ``mode``: univariate (options: multivariate, ensemble) + - ``ensemble.model``: WeightEnsemble (options: VoteEnsemble) + - ``ensemble.module``: streamad.process + - ``ensemble.model_args``: Additional arguments for the ensemble model + - ``models.model``: ZScoreDetector + - ``models.module``: streamad.model + - ``models.model_args``: Additional arguments for the model + - ``anomaly_threshold``: 0.01 + - ``score_threshold``: 0.5 + - ``time_type``: ms + - ``time_range``: 20 -The :class:`Inspector` consumes batches from the Kafka topic ``prefilter_to_inspector`` and processes them through -the following workflow: -1. **Batch Reception**: Receives batches with metadata (batch_id, begin_timestamp, end_timestamp) from the Prefilter -2. **Time Series Construction**: Creates time series features based on configurable time windows -3. **Anomaly Detection**: Applies StreamAD models to detect suspicious patterns -4. **Threshold Evaluation**: Evaluates anomaly scores against configured thresholds -5. **Suspicious Batch Forwarding**: Groups and forwards anomalous data by client IP to the Detector + + + +Further inspectors can be added and referenced in the config by adjusting the ``pipeline.data_inspection.[inspector].inspector_module_name`` and ``pipeline.data_inspection.[inspector].inspector_class_name``. +Each inspector might need special configurations. For the possible configuration values, please reference the table above. + +StreamAD Inspector +................... + +The inspector consumes batches on the topic ``inspect``, usually produced by the ``Prefilter``. +For a new batch, it derives the timestamps ``begin_timestamp`` and ``end_timestamp``. +Based on the time type (e.g. ``s``, ``ms``) and time range (e.g. ``5``), the sliding non-overlapping window is created. +For univariate time-series, it counts the number of occurrences, whereas for multivariate, it considers the number of occurrences and the packet size. :cite:`schuppen_fanci_2018` +An anomaly is noted when its score is greater than the ``score_threshold``. +In addition, we support a relative anomaly threshold. +So, if the anomaly threshold is ``0.01``, anomalies are sent for further detection if the number of anomalies divided by the total number of requests in the batch is greater than ``0.01``. Time Series Feature Extraction .............................. @@ -570,8 +695,9 @@ Current configuration uses 20-millisecond windows for high-resolution anomaly de .. _stage-5-detection: -Stage 5: Detection +Stage 6: Detection ================== +.. _detection_stage: Overview -------- @@ -584,24 +710,26 @@ The pre-trained models used here are licensed under **EUPL‑1.2** and built fro - `DGTA-BENCH - Domain Generation and Tunneling Algorithms for Benchmark `_ - `DGArchive `_ -Main Class ---------- +Main Classes +------------ + +.. py:currentmodule:: src.detector.detector +.. autoclass:: DetectorAbstractBase .. py:currentmodule:: src.detector.detector -.. autoclass:: Detector +.. autoclass:: DetectorBase + +.. py:currentmodule:: src.detector.plugins.dga_detector +.. autoclass:: DGADetector -The :class:`Detector` class: -- Consumes a batch flagged as suspicious.
-- Downloads and validates the ML model (if necessary). -- Extracts features from domain names (e.g. character distributions, entropy, label statistics). -- Computes a probability per request and an overall risk score per batch. -- Emits alerts to ClickHouse and logs in ``/tmp/warnings.json`` where applicable. +The :class:`DetectorBase` is the primary class for detectors. It holds common functionality and is responsible for data ingestion, triggering alerts, logging, etc. Any detector is built on top of this +class and needs to implement the methods specified by :class:`DetectorAbstractBase`. The class implementations need to go into ``"src/detector/plugins"``. Usage ----- -1. The `Detector` listens on the Kafka topic from the Inspector (``inspector_to_detector``). +1. A detector listens on the Kafka topic of the inspector it is configured to consume from. 2. For each suspicious batch: - Extracts features for every domain request. - Applies the loaded ML model (after scaling) to compute class probabilities. @@ -616,9 +744,31 @@ Configuration You may use the provided, pre-trained models or supply your own. To use a custom model, specify: +- `name`: unique name for the detector instance - `base_url`: URL from which to fetch model artifacts - `model`: model name - `checksum`: SHA256 digest for integrity validation - `threshold`: probability threshold for classifying a request as malicious +- `inspector_name`: name of the inspector configuration to consume input from +- `detector_module_name`: name of the Python module in which the implementation details reside +- `detector_class_name`: name of the class in the Python module that provides the detector implementation These parameters are loaded at startup and used to download, verify, and load the model/scaler if not already cached locally (in temp directory). + + +Supported Detectors Overview +---------------------------- + +In case you want to load self-trained models, the configuration can be adapted to load the model from a different location. Since the download link is assembled the following way: +``/files/?p=%2F//.pickle&dl=1"`` you can adapt the base URL. If you need to adhere to another URL composition, create +a new detector class by either implementing the necessary base functions from :class:`DetectorBase` or by deriving the new class from :class:`DGADetector` and overwriting the ``"get_model_download_url"`` method. + + +The following detectors are already implemented: + +DGA Detector +................... +The :class:`DGADetector` consumes anomalous batches of requests, preprocessed by the StreamAD library. +It calculates a probability score for each request to determine whether a DGA-generated domain was queried.
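To tie the inspection and detection configuration together, a minimal, hypothetical excerpt of ``pipeline.data_inspection`` and ``pipeline.data_analysis`` might look as follows. Instance names, the class name of the StreamAD inspector, the base URL, and the checksum are placeholders; only the parameter names documented above are assumed, and the exact nesting should be checked against ``config.yaml``.

.. code-block:: yaml

   pipeline:
     data_inspection:
       - name: "dns_inspector"                     # unique inspector instance name (placeholder)
         prefilter_name: "dns_prefilter"           # prefilter instance to consume from
         inspector_module_name: "stream_ad_inspector"
         inspector_class_name: "StreamADInspector" # placeholder class name
         # use "no_inspector" / "NoInspector" to skip anomaly detection entirely

     data_analysis:
       - name: "dns_detector"                      # unique detector instance name (placeholder)
         inspector_name: "dns_inspector"           # inspector instance to consume from
         detector_module_name: "dga_detector"
         detector_class_name: "DGADetector"
         model: "rf"                               # or XGBoost
         base_url: "https://example.org/models"    # placeholder model host
         checksum: "<sha256-of-the-model-artifact>" # placeholder
         threshold: 0.5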
+ + diff --git a/requirements/requirements.inspector.txt b/requirements/requirements.inspector.txt index 371b8d74..3be91d42 100644 --- a/requirements/requirements.inspector.txt +++ b/requirements/requirements.inspector.txt @@ -6,3 +6,4 @@ streamad~=0.3.1 numpy~=1.26.4 marshmallow_dataclass~=8.7.1 clickhouse_connect~=0.8.3 +scipy==1.12.0 diff --git a/requirements/requirements.zeek.txt b/requirements/requirements.zeek.txt new file mode 100644 index 00000000..ccad01f4 --- /dev/null +++ b/requirements/requirements.zeek.txt @@ -0,0 +1,3 @@ +click +PyYAML +colorlog \ No newline at end of file diff --git a/scripts/run_test.py b/scripts/run_test.py index 85a43588..a3816379 100644 --- a/scripts/run_test.py +++ b/scripts/run_test.py @@ -64,7 +64,7 @@ def generate_random_logline( # choose client IP address number_of_subnets = 50 - client_ip = ( + src_ip = ( f"192.168.{random.randint(0, number_of_subnets)}.{random.randint(1, 255)}" ) @@ -96,7 +96,7 @@ def _get_random_ipv6(): # choose random size size = f"{random.randint(50, 255)}b" - return f"{timestamp} {status} {client_ip} {server_ip} {domain} {record_type} {response_ip_address} {size}" + return f"{timestamp} {status} {src_ip} {server_ip} {domain} {record_type} {response_ip_address} {size}" def get_random_domain(self) -> str: random_domain = self.domains.sample(n=1) diff --git a/src/base/data_classes/batch.py b/src/base/data_classes/batch.py index 2999d429..0ea2e57c 100644 --- a/src/base/data_classes/batch.py +++ b/src/base/data_classes/batch.py @@ -12,6 +12,9 @@ class Batch: Class definition of a batch, used to divide the log input into smaller amounts """ + batch_tree_row_id: str = field( + metadata={"marshmallow_field": marshmallow.fields.String()} + ) batch_id: uuid.UUID = field( metadata={"marshmallow_field": marshmallow.fields.UUID()} ) diff --git a/src/base/data_classes/clickhouse_connectors.py b/src/base/data_classes/clickhouse_connectors.py index f7d98e9e..d1fa831a 100644 --- a/src/base/data_classes/clickhouse_connectors.py +++ b/src/base/data_classes/clickhouse_connectors.py @@ -35,7 +35,7 @@ class ServerLogsTimestamps: @dataclass -class FailedDNSLoglines: +class FailedLoglines: message_text: str = field( metadata={"marshmallow_field": marshmallow.fields.String()} ) @@ -65,7 +65,7 @@ class LoglineToBatches: @dataclass -class DNSLoglines: +class Loglines: logline_id: uuid.UUID = field( metadata={"marshmallow_field": marshmallow.fields.UUID()} ) @@ -75,13 +75,7 @@ class DNSLoglines: "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") } ) - status_code: str = field( - metadata={"marshmallow_field": marshmallow.fields.String()} - ) - client_ip: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) - record_type: str = field( - metadata={"marshmallow_field": marshmallow.fields.String()} - ) + src_ip: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) additional_fields: Optional[str] = field( metadata={"marshmallow_field": marshmallow.fields.String(allow_none=True)} ) @@ -109,6 +103,9 @@ class BatchTimestamps: batch_id: uuid.UUID = field( metadata={"marshmallow_field": marshmallow.fields.UUID()} ) + instance_name: str = field( + metadata={"marshmallow_field": marshmallow.fields.String()} + ) stage: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) status: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) timestamp: datetime.datetime = field( @@ -134,12 +131,38 @@ class SuspiciousBatchesToBatch: ) +@dataclass +class BatchTree: + 
batch_row_id: str = field( + metadata={"marshmallow_field": marshmallow.fields.String()} + ) + batch_id: uuid.UUID = field( + metadata={"marshmallow_field": marshmallow.fields.UUID()} + ) + parent_batch_row_id: Optional[str] = field( + metadata={"marshmallow_field": marshmallow.fields.String(allow_none=True)} + ) + instance_name: str = field( + metadata={"marshmallow_field": marshmallow.fields.String()} + ) + stage: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + status: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + timestamp: datetime.datetime = field( + metadata={ + "marshmallow_field": marshmallow.fields.DateTime("%Y-%m-%d %H:%M:%S.%f") + } + ) + + @dataclass class SuspiciousBatchTimestamps: suspicious_batch_id: uuid.UUID = field( metadata={"marshmallow_field": marshmallow.fields.UUID()} ) - client_ip: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + src_ip: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + instance_name: str = field( + metadata={"marshmallow_field": marshmallow.fields.String()} + ) stage: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) status: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) timestamp: datetime.datetime = field( @@ -157,7 +180,7 @@ class SuspiciousBatchTimestamps: @dataclass class Alerts: - client_ip: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) + src_ip: str = field(metadata={"marshmallow_field": marshmallow.fields.String()}) suspicious_batch_id: uuid.UUID = field( metadata={"marshmallow_field": marshmallow.fields.UUID()} ) @@ -192,13 +215,14 @@ class FillLevels: TABLE_NAME_TO_TYPE = { "server_logs": ServerLogs, "server_logs_timestamps": ServerLogsTimestamps, - "failed_dns_loglines": FailedDNSLoglines, + "failed_loglines": FailedLoglines, "logline_to_batches": LoglineToBatches, - "dns_loglines": DNSLoglines, + "loglines": Loglines, "logline_timestamps": LoglineTimestamps, "batch_timestamps": BatchTimestamps, "suspicious_batches_to_batch": SuspiciousBatchesToBatch, "suspicious_batch_timestamps": SuspiciousBatchTimestamps, "alerts": Alerts, "fill_levels": FillLevels, + "batch_tree": BatchTree, } diff --git a/src/base/kafka_handler.py b/src/base/kafka_handler.py index 9386a705..50060fe0 100644 --- a/src/base/kafka_handler.py +++ b/src/base/kafka_handler.py @@ -5,9 +5,11 @@ """ import ast +import json import os import sys import time +import uuid from abc import abstractmethod from typing import Optional @@ -24,6 +26,7 @@ from src.base.data_classes.batch import Batch from src.base.log_config import get_logger from src.base.utils import kafka_delivery_report, setup_config +import uuid logger = get_logger() @@ -137,6 +140,7 @@ def __init__(self): "bootstrap.servers": self.brokers, "enable.idempotence": False, "acks": "1", + "message.max.bytes": 1000000000, } super().__init__(conf) @@ -160,7 +164,6 @@ def produce(self, topic: str, data: str, key: None | str = None) -> None: """ if not data: return - self.producer.flush() self.producer.produce( topic=topic, @@ -200,8 +203,9 @@ def __init__(self): conf = { "bootstrap.servers": self.brokers, - "transactional.id": HOSTNAME, + "transactional.id": f"{HOSTNAME}-{uuid.uuid4()}", "enable.idempotence": True, + "message.max.bytes": 1000000000, } super().__init__(conf) @@ -239,9 +243,11 @@ def produce(self, topic: str, data: str, key: None | str = None) -> None: ) self.commit_transaction_with_retry() - except Exception: + except Exception 
as e: + logger.info(f"aborted for topic {topic}") self.producer.abort_transaction() logger.error("Transaction aborted.") + logger.error(e) raise def commit_transaction_with_retry( @@ -322,7 +328,7 @@ def __init__(self, topics: str | list[str]) -> None: # create consumer conf = { "bootstrap.servers": self.brokers, - "group.id": CONSUMER_GROUP_ID, + "group.id": f"{CONSUMER_GROUP_ID}", "enable.auto.commit": False, "auto.offset.reset": "earliest", "enable.partition.eof": True, @@ -386,7 +392,7 @@ def consume_as_json(self) -> tuple[Optional[str], dict]: return None, {} try: - eval_data = ast.literal_eval(value) + eval_data = json.loads(value) if isinstance(eval_data, dict): return key, eval_data @@ -437,6 +443,37 @@ def __del__(self) -> None: if self.consumer: self.consumer.close() + @staticmethod + def _is_dicts(obj): + return isinstance(obj, list) and all(isinstance(item, dict) for item in obj) + + def consume_as_object(self) -> tuple[None | str, Batch]: + """ + Consumes available messages on the specified topic. Decodes the data and converts it to a Batch + object. Returns the Batch object. + + Returns: + Consumed data as Batch object + + Raises: + ValueError: Invalid data format + """ + key, value, topic = self.consume() + if not key and not value: + # TODO: Change return value to fit the type, maybe switch to raise + return None, {} + eval_data: dict = json.loads(value) + if self._is_dicts(eval_data.get("data")): + eval_data["data"] = eval_data.get("data") + else: + eval_data["data"] = [json.loads(item) for item in eval_data.get("data")] + batch_schema = marshmallow_dataclass.class_schema(Batch)() + eval_data: Batch = batch_schema.load(eval_data) + if isinstance(eval_data, Batch): + return key, eval_data + else: + raise ValueError("Unknown data format.") + class SimpleKafkaConsumeHandler(KafkaConsumeHandler): """Simple Kafka Consumer wrapper without Write-Exactly-Once semantics @@ -486,7 +523,6 @@ def consume(self) -> tuple[Optional[str], Optional[str], Optional[str]]: empty_data_retrieved = True continue - if msg.error(): if msg.error().code() == KafkaError._PARTITION_EOF: continue @@ -498,7 +534,6 @@ def consume(self) -> tuple[Optional[str], Optional[str], Optional[str]]: key = msg.key().decode("utf-8") if msg.key() else None value = msg.value().decode("utf-8") if msg.value() else None topic = msg.topic() if msg.topic() else None - return key, value, topic except KeyboardInterrupt: logger.info("Stopping KafkaConsumeHandler...") diff --git a/src/base/logline_handler.py b/src/base/logline_handler.py index 16f0ffb4..c21e1c78 100644 --- a/src/base/logline_handler.py +++ b/src/base/logline_handler.py @@ -1,20 +1,14 @@ import datetime import re - +import json from src.base.log_config import get_logger from src.base.utils import setup_config, validate_host logger = get_logger() CONFIG = setup_config() -LOGLINE_FIELDS = CONFIG["pipeline"]["log_collection"]["collector"]["logline_format"] -REQUIRED_FIELDS = [ - "timestamp", - "status_code", - "client_ip", - "record_type", - "domain_name", -] + +REQUIRED_FIELDS = ["ts", "src_ip"] FORBIDDEN_FIELD_NAMES = [ "logline_id", "batch_id", @@ -69,7 +63,7 @@ def validate(self, value) -> bool: Returns: True if the value matches the pattern, False otherwise. 
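A quick aside on the `consume_as_object` helper and the move from `ast.literal_eval` to `json.loads` above: the conversion relies on a `marshmallow_dataclass` schema round-trip. A minimal sketch with a stand-in dataclass (not the project's real `Batch`, whose exact fields are defined elsewhere):

```python
# Sketch only: MiniBatch is a stand-in for src.base.data_classes.batch.Batch.
import datetime
import json
import uuid
from dataclasses import dataclass, field
from typing import Dict, List

import marshmallow_dataclass


@dataclass
class MiniBatch:
    batch_id: uuid.UUID
    begin_timestamp: datetime.datetime
    end_timestamp: datetime.datetime
    data: List[Dict[str, str]] = field(default_factory=list)


schema = marshmallow_dataclass.class_schema(MiniBatch)()

# Producer side: dumps() yields the JSON string handed to Kafka.
payload = schema.dumps(
    MiniBatch(
        batch_id=uuid.uuid4(),
        begin_timestamp=datetime.datetime.now(),
        end_timestamp=datetime.datetime.now(),
        data=[{"src_ip": "192.0.2.1", "domain_name": "example.com"}],
    )
)

# Consumer side: json.loads() gives a dict, schema.load() turns it back into the dataclass.
batch = schema.load(json.loads(payload))
assert isinstance(batch, MiniBatch) and batch.data[0]["src_ip"] == "192.0.2.1"
```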
""" - return True if re.match(self.pattern, value) else False + return True if re.match(self.pattern, str(value)) else False class Timestamp(FieldType): @@ -167,18 +161,67 @@ def validate(self, value) -> bool: """ return True if value in self.allowed_list else False - def check_relevance(self, value) -> bool: - """Checks if the given value is considered relevant for filtering. + +class RelevanceHandler: + """ + Handler class to check the relevance of a given logline. Loads the appropriate child method by the name, configured + in the config.yaml at the ``log_filtering`` section from the ``relevance_method`` attribute. + """ + + def __init__(self, log_configuration_instances): + self.log_configuration_instances = log_configuration_instances + + def check_relevance(self, function_name: str, logline_dict: dict) -> bool: + """ + wrapper function to get the appropriate relevance function by name. + + Args: + function_name (str): The name of the relevance_method to import + logline_dict (dict): The dictionary version of a logline + + Returns: + True, if the logline is relevant according to the relevance function, else False + """ + is_relevant = False + try: + is_relevant = getattr(self, function_name)(logline_dict) + except AttributeError as e: + logger.error(f"Function {function_name} is not implemented!") + raise Exception(f"Function {function_name} is not implemented!") + return is_relevant + + def check_dga_relevance(self, logline_dict: dict) -> bool: + """ + Method to check if a given logline is relevant for a dga analysis. Args: - value: Value to be checked for relevance. + logline_dict (dict): The dictionary version of a logline Returns: - True if the value is relevant (in relevant_list or if no relevant_list is defined), False otherwise. + True, if the logline is relevant according to the relevance function, else False """ - if self.relevant_list: - return True if value in self.relevant_list else False + relevant = True + for _, instance_configuartion in self.log_configuration_instances.items(): + if isinstance(instance_configuartion, ListItem): + if instance_configuartion.relevant_list: + relevant = ( + logline_dict[instance_configuartion.name] + in instance_configuartion.relevant_list + ) + if not relevant: + return relevant + return relevant + + def no_relevance_check(self, logline_dict: dict) -> bool: + """ + Skip the relevance check by always returning True + + Args: + logline_dict (dict): The dictionary version of a logline + Returns: + Always returns True (all lines are relevant) + """ return True @@ -190,40 +233,42 @@ class LoglineHandler: and relevance checking functionality for the log processing pipeline. """ - def __init__(self): - self.instances_by_name = {} - self.instances_by_position = {} - self.number_of_fields = 0 - - for field in LOGLINE_FIELDS: - instance = self._create_instance_from_list_entry(field) + def __init__(self, validation_config: list): + """ + Check all existing log configurations for validity. + Args: + validation_config (list): A list containing the configured attributes a given logline needs to hold. Otherwise it gets discarded + """ + self.logformats = validation_config + log_configuration_instances = {} + if not validation_config: + raise ValueError("No fields configured") + for log_config_item in validation_config: + instance = self._create_instance_from_list_entry(log_config_item) if instance.name in FORBIDDEN_FIELD_NAMES: raise ValueError( f"Forbidden field name included. 
These fields are used internally " f"and cannot be used as names: {FORBIDDEN_FIELD_NAMES}" ) - - if self.instances_by_name.get(instance.name): + if log_configuration_instances.get(instance.name): raise ValueError("Multiple fields with same name") - - self.instances_by_position[self.number_of_fields] = instance - self.instances_by_name[instance.name] = instance - self.number_of_fields += 1 - + else: + log_configuration_instances[instance.name] = instance for required_field in REQUIRED_FIELDS: - if required_field not in self.instances_by_name: + + if required_field not in log_configuration_instances.keys(): raise ValueError("Not all needed fields are set in the configuration") - if self.number_of_fields == 0: - raise ValueError("No fields configured") + self.relvance_handler = RelevanceHandler( + log_configuration_instances=log_configuration_instances + ) def validate_logline(self, logline: str) -> bool: - """Validates a complete logline according to the configured format. - - Checks if the number of fields is correct and validates each field using - the appropriate field type validator. Provides detailed error logging - with visual indicators for incorrect fields. + """ + Validates the given logline by checking whether its fields correspond to the configured log format of a protocol. + Calls the :meth:`validate` method for each field. If a configured field is missing or fails validation, + an error naming that field is logged. Args: logline (str): Logline string to be validated. @@ -231,36 +276,26 @@ Returns: True if the logline contains correct fields in the configured format, False otherwise. """ - parts = logline.split() - number_of_entries = len(parts) - - # check number of entries - if number_of_entries != self.number_of_fields: - logger.warning( - f"Logline contains {number_of_entries} value(s), not {self.number_of_fields}." - ) - return False - + logline = json.loads(logline) valid_values = [] - for i in range(self.number_of_fields): - valid_values.append(self.instances_by_position.get(i).validate(parts[i])) - - if not all(valid_values): - # handle logging - error_line = len("[yyyy-mm-dd hh:mm:ss, WARNING] ") * " " - error_line += len("Incorrect logline: ") * " " - - for i in range(self.number_of_fields): - if valid_values[i]: - error_line += len(parts[i]) * " " # keep all valid fields unchanged - else: - error_line += len(parts[i]) * "^" # underline all wrong fields - error_line += " " - - logger.warning(f"Incorrect logline: {logline}\n{error_line}") - return False - - return True + invalid_value_names = [] + for log_config_item in self.logformats: + # by convention the first item is always the key present in a logline + log_line_property_key = log_config_item[0] + instance = self._create_instance_from_list_entry(log_config_item) + try: + is_value_valid = instance.validate(logline.get(log_line_property_key)) + valid_values.append(is_value_valid) + if not is_value_valid: + invalid_value_names.append(log_line_property_key) + except Exception: + valid_values.append(False)  # a missing or unparsable field makes the logline invalid + logger.error( + f"Logline {logline} is missing or failed validation for the configured field {log_line_property_key}" + ) + if all(valid_values): + return True + return False def __get_fields_as_json(self, logline: str) -> dict: """Extracts fields from a logline and returns them as a dictionary. @@ -275,18 +309,7 @@ Returns: Dictionary with field names as keys and field values as values. 
""" - parts = logline.split() - return_dict = {} - - for i in range(self.number_of_fields): - if not isinstance(self.instances_by_position[i], Timestamp): - return_dict[self.instances_by_position[i].name] = parts[i] - else: - return_dict[self.instances_by_position[i].name] = ( - self.instances_by_position[i].get_timestamp_as_str(parts[i]) - ) - - return return_dict.copy() + return json.loads(logline) def validate_logline_and_get_fields_as_json(self, logline: str) -> dict: """Validates a logline and returns the fields as a dictionary. @@ -295,7 +318,7 @@ def validate_logline_and_get_fields_as_json(self, logline: str) -> dict: First validates the logline format, then extracts and returns the fields. Args: - logline (str): Logline string to be validated and parsed. + logline (dict): Logline parsed from zeek Returns: Dictionary with field names as keys and field values as values. @@ -305,34 +328,22 @@ def validate_logline_and_get_fields_as_json(self, logline: str) -> dict: """ if not self.validate_logline(logline): raise ValueError("Incorrect logline, validation unsuccessful") - return self.__get_fields_as_json(logline) - def check_relevance(self, logline_dict: dict) -> bool: - """Checks if a logline is relevant based on configured relevance criteria. - - Iterates through all ListItem fields and checks their relevance using - the check_relevance method. A logline is considered relevant only if - all ListItem fields pass their relevance checks. + def check_relevance(self, logline_dict: dict, function_name: str) -> bool: + """ + Checks if the given logline is relevant. Args: - logline_dict (dict): Logline fields as dictionary to be checked for relevance. + logline_dict (dict): Logline parts to be checked for relevance as dictionary + function_name (str): A string that points to the relevance function to use Returns: - True if the logline is relevant according to all configured criteria, False otherwise. + Propagates the bool from the subordinate relevance method """ - relevant = True - - for i in self.instances_by_position: - current_instance = self.instances_by_position[i] - if isinstance(current_instance, ListItem): - if not current_instance.check_relevance( - logline_dict[current_instance.name] - ): - relevant = False - break - - return relevant + return self.relvance_handler.check_relevance( + function_name=function_name, logline_dict=logline_dict + ) @staticmethod def _create_instance_from_list_entry(field_list: list): diff --git a/src/base/utils.py b/src/base/utils.py index 6aa20b57..1b60be83 100644 --- a/src/base/utils.py +++ b/src/base/utils.py @@ -1,8 +1,8 @@ import ipaddress import os import sys +import uuid from typing import Optional - import yaml from confluent_kafka import KafkaError, Message @@ -14,6 +14,54 @@ CONFIG_FILEPATH = os.path.join(os.path.dirname(__file__), "../../config.yaml") +def get_zeek_sensor_topic_base_names(config: dict) -> set: + """ + Method to retrieve the protocols monitored by the zeek sensors based on the ``config.yaml`` + + Args: + config (dict): The configuration dictionary from config.yaml + + Returns: + Set of protocol names the zeek sensors are monitoring, e.g. (dns, http, sftp, ... ) + """ + return set( + [ + protocol + for sensor in config["pipeline"]["zeek"]["sensors"].values() + for protocol in sensor.get("protocols", []) + ] + ) + + +# TODO: test this method! +def get_batch_configuration(collector_name: str) -> dict: + """ + Method to combine custom batch_handler configuartions per logcollector with the default ones. 
+ Returns a dict where custom configurations override default ones. If no custom value is specified, + default values are returned. + + Args: + collector_name (str): Name of the collector to retrieve the configuration for + Returns: + Dictionary with the complete batch_handler configuration (e.g. ipv4_prefix_length, batch_size, etc.) + """ + config = setup_config() + default_configuration = config["pipeline"]["log_collection"][ + "default_batch_handler_config" + ] + collector_configs = config["pipeline"]["log_collection"]["collectors"] + + for collector in collector_configs: + if collector["name"] == collector_name: + override = collector.get("batch_handler_config_override") + if override: + # Merge override into a copy of the default configuration + merged = {**default_configuration, **override} + return merged + + return default_configuration + + def setup_config(): """Load and return the application configuration from the YAML configuration file. @@ -176,3 +224,7 @@ def normalize_ipv6_address( net = ipaddress.IPv6Network((address, prefix_length), strict=False) return net.network_address, prefix_length + + +def generate_collisions_resistant_uuid(): + return f"{uuid.uuid4()}-{uuid.uuid4()}" diff --git a/src/detector/detector.py b/src/detector/detector.py index 062b1f14..0abc07d5 100644 --- a/src/detector/detector.py +++ b/src/detector/detector.py @@ -5,15 +5,16 @@ import pickle import sys import tempfile - -import math +import asyncio import numpy as np import requests from numpy import median +from abc import ABC, abstractmethod +import importlib sys.path.append(os.getcwd()) from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender -from src.base.utils import setup_config +from src.base.utils import setup_config, generate_collisions_resistant_uuid from src.base.kafka_handler import ( ExactlyOnceKafkaConsumeHandler, KafkaMessageFetchException, @@ -26,14 +27,16 @@ BUF_SIZE = 65536 # let's read stuff in 64kb chunks! config = setup_config() -MODEL = config["pipeline"]["data_analysis"]["detector"]["model"] -CHECKSUM = config["pipeline"]["data_analysis"]["detector"]["checksum"] -MODEL_BASE_URL = config["pipeline"]["data_analysis"]["detector"]["base_url"] -THRESHOLD = config["pipeline"]["data_analysis"]["detector"]["threshold"] -CONSUME_TOPIC = config["environment"]["kafka_topics"]["pipeline"][ +INSPECTORS = config["pipeline"]["data_inspection"] +DETECTORS = config["pipeline"]["data_analysis"] + + +CONSUME_TOPIC_PREFIX = config["environment"]["kafka_topics_prefix"]["pipeline"][ "inspector_to_detector" ] +PLUGIN_PATH = "src.detector.plugins" + class WrongChecksum(Exception): # pragma: no cover """Raises when model checksum validation fails.""" @@ -41,16 +44,66 @@ class WrongChecksum(Exception): # pragma: no cover pass -class Detector: - """Main component of the Data Analysis stage to perform anomaly detection +class DetectorAbstractBase(ABC): # pragma: no cover + """ + Abstract base class for all detector implementations. + + This class defines the interface that all concrete detector implementations must follow. + It provides the essential methods that need to be implemented for a detector to function + within the pipeline. + + Subclasses must implement all abstract methods to ensure proper integration with the + detection system. 
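For illustration, the override semantics of `get_batch_configuration` above reduce to a shallow dict merge in which per-collector values win; a tiny sketch with hypothetical values (only `batch_size` and `ipv4_prefix_length` come from the docstring, the rest is invented):

```python
# Hypothetical values; only the merge behaviour mirrors get_batch_configuration().
default_configuration = {"batch_size": 1000, "ipv4_prefix_length": 24, "batch_timeout": 30.0}
override = {"batch_size": 250}  # a collector's batch_handler_config_override

merged = {**default_configuration, **override}  # override wins on shared keys
assert merged == {"batch_size": 250, "ipv4_prefix_length": 24, "batch_timeout": 30.0}
```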
+ """ + + @abstractmethod + def __init__(self, detector_config, consume_topic) -> None: + pass + + @abstractmethod + def get_model_download_url(self): + pass + + @abstractmethod + def get_scaler_download_url(self): + pass + + @abstractmethod + def predict(self, message) -> np.ndarray: + pass + - Processes suspicious batches from the Inspector using configurable ML models to classify - DNS requests as benign or malicious. Downloads and validates models from a remote server, - extracts features from domain names, calculates probability scores, and generates alerts - when malicious requests are detected above the configured threshold. +class DetectorBase(DetectorAbstractBase): """ + Base implementation for detectors in the pipeline. - def __init__(self) -> None: + This class provides a concrete implementation of the detector interface with + common functionality shared across all detector types. It handles model + management, data processing, Kafka communication, and result reporting. + + The class is designed to be extended by specific detector implementations + that provide model-specific prediction logic. + """ + + def __init__(self, detector_config, consume_topic) -> None: + """ + Initialize the detector with configuration and Kafka topic settings. + + Sets up all necessary components including model loading, Kafka handlers, + and database connections. + + Args: + detector_config (dict): Configuration dictionary containing detector-specific + parameters such as name, model, checksum, and threshold. + consume_topic (str): Kafka topic from which the detector will consume messages. + """ + + self.name = detector_config["name"] + self.model = detector_config["model"] + self.checksum = detector_config["checksum"] + self.threshold = detector_config["threshold"] + + self.consume_topic = consume_topic self.suspicious_batch_id = None self.key = None self.messages = [] @@ -58,17 +111,18 @@ def __init__(self) -> None: self.begin_timestamp = None self.end_timestamp = None self.model_path = os.path.join( - tempfile.gettempdir(), f"{MODEL}_{CHECKSUM}_model.pickle" + tempfile.gettempdir(), f"{self.model}_{self.checksum}_model.pickle" ) self.scaler_path = os.path.join( - tempfile.gettempdir(), f"{MODEL}_{CHECKSUM}_scaler.pickle" + tempfile.gettempdir(), f"{self.model}_{self.checksum}_scaler.pickle" ) - self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(CONSUME_TOPIC) + self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(self.consume_topic) self.model, self.scaler = self._get_model() # databases + self.batch_tree = ClickHouseKafkaSender("batch_tree") self.suspicious_batch_timestamps = ClickHouseKafkaSender( "suspicious_batch_timestamps" ) @@ -86,11 +140,15 @@ def __init__(self) -> None: ) def get_and_fill_data(self) -> None: - """Consumes suspicious batches from Kafka and stores them for analysis. + """ + Consume data from Kafka and store it for processing. + + This method retrieves messages from the Kafka topic, processes them, and + prepares the data for detection. It handles batch management, timestamp + tracking, and database updates for monitoring purposes. - Fetches suspicious batch data from the Inspector via Kafka and stores it in internal - data structures. If the Detector is already busy processing data, consumption is - skipped with a warning. Updates database entries for monitoring and logging purposes. + The method also manages the flow of data through the pipeline by updating + relevant database tables with processing status and metrics. 
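A hypothetical configuration entry, written as a Python dict, showing the keys that `DetectorBase.__init__` (and the plugin loader further down) actually reads; names and values are illustrative, not taken from the real config.yaml:

```python
# Keys mirror what DetectorBase.__init__ and main() access; values are made up.
detector_config = {
    "name": "dga_detector_1",
    "detector_module_name": "dga_detector",   # module under src/detector/plugins/
    "detector_class_name": "DGADetector",
    "model": "rf",                             # hypothetical model identifier
    "checksum": "0123abcd...",                 # SHA256 the downloaded pickle must match
    "threshold": 0.5,
    "base_url": "https://models.example.org",  # used by the DGA plugin to build download URLs
}

# Each instance consumes from "<inspector_to_detector prefix>-<name>"; the prefix string is a placeholder here.
consume_topic = f"inspector-to-detector-{detector_config['name']}"
```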
""" if self.messages: logger.warning( @@ -98,27 +156,39 @@ def get_and_fill_data(self) -> None: "current workload." ) return - key, data = self.kafka_consume_handler.consume_as_object() - if data.data: + self.parent_row_id = data.batch_tree_row_id self.suspicious_batch_id = data.batch_id self.begin_timestamp = data.begin_timestamp self.end_timestamp = data.end_timestamp self.messages = data.data self.key = key - self.suspicious_batch_timestamps.insert( dict( suspicious_batch_id=self.suspicious_batch_id, - client_ip=key, + src_ip=key, stage=module_name, + instance_name=self.name, status="in_process", timestamp=datetime.datetime.now(), is_active=True, message_count=len(self.messages), ) ) + row_id = generate_collisions_resistant_uuid() + + self.batch_tree.insert( + dict( + batch_row_id=row_id, + stage=module_name, + instance_name=self.name, + status="in_process", + timestamp=datetime.datetime.now(), + parent_batch_row_id=self.parent_row_id, + batch_id=self.suspicious_batch_id, + ) + ) self.fill_levels.insert( dict( @@ -141,13 +211,17 @@ def get_and_fill_data(self) -> None: ) def _sha256sum(self, file_path: str) -> str: - """Calculates SHA256 checksum for model file validation. + """ + Calculate the SHA256 checksum of a file. + + This utility method reads a file in chunks and computes its SHA256 hash, + which is used for model integrity verification. Args: - file_path (str): Path to the model file to validate. + file_path (str): Path to the file for which the checksum should be calculated. Returns: - str: SHA256 hexadecimal digest of the file. + str: Hexadecimal string representation of the SHA256 checksum. """ h = hashlib.sha256() @@ -162,52 +236,51 @@ def _sha256sum(self, file_path: str) -> str: return h.hexdigest() def _get_model(self): - """Downloads and loads ML model and scaler from remote server. + """ + Download and validate the detection model. + + This method handles the model management process: + 1. Checks if the model already exists locally + 2. Downloads the model if not present + 3. Verifies the model's integrity using SHA256 checksum + 4. Loads the model for use in detection - Retrieves the configured model and scaler files from the remote server if not - already present locally. Validates model integrity using SHA256 checksum and - loads the pickled model and scaler objects for inference. + The method ensures that only verified models are used for detection to + maintain system reliability. Returns: - tuple: Trained ML model and data scaler objects. + object: The loaded model object ready for prediction. Raises: - WrongChecksum: If model checksum validation fails. + WrongChecksum: If the downloaded model's checksum doesn't match the expected value. + requests.HTTPError: If there's an error downloading the model. """ - logger.info(f"Get model: {MODEL} with checksum {CHECKSUM}") + logger.info(f"Get model: {self.model} with checksum {self.checksum}") + # TODO test the if! 
if not os.path.isfile(self.model_path): - response = requests.get( - f"{MODEL_BASE_URL}/files/?p=%2F{MODEL}/{CHECKSUM}/{MODEL}.pickle&dl=1" - ) + model_download_url = self.get_model_download_url() logger.info( - f"{MODEL_BASE_URL}/files/?p=%2F{MODEL}/{CHECKSUM}/{MODEL}.pickle&dl=1" + f"downloading model {self.model} from {model_download_url} with checksum {self.checksum}" ) + response = requests.get(model_download_url) response.raise_for_status() - with open(self.model_path, "wb") as f: f.write(response.content) - - if not os.path.isfile(self.scaler_path): - response = requests.get( - f"{MODEL_BASE_URL}/files/?p=%2F{MODEL}/{CHECKSUM}/scaler.pickle&dl=1" - ) - logger.info( - f"{MODEL_BASE_URL}/files/?p=%2F{MODEL}/{CHECKSUM}/scaler.pickle&dl=1" - ) - response.raise_for_status() - + scaler_download_url = self.get_scaler_download_url() + scaler_response = requests.get(scaler_download_url) + scaler_response.raise_for_status() with open(self.scaler_path, "wb") as f: - f.write(response.content) + f.write(scaler_response.content) # Check file sha256 local_checksum = self._sha256sum(self.model_path) - if local_checksum != CHECKSUM: + if local_checksum != self.checksum: logger.warning( - f"Checksum {CHECKSUM} SHA256 is not equal with new checksum {local_checksum}!" + f"Checksum {self.checksum} SHA256 is not equal with new checksum {local_checksum}!" ) raise WrongChecksum( - f"Checksum {CHECKSUM} SHA256 is not equal with new checksum {local_checksum}!" + f"Checksum {self.checksum} SHA256 is not equal with new checksum {local_checksum}!" ) with open(self.model_path, "rb") as input_file: @@ -218,143 +291,59 @@ def _get_model(self): return clf, scaler - def clear_data(self) -> None: - """Clears all data from internal data structures. - - Resets messages, timestamps, and warnings to prepare the Detector - for processing the next suspicious batch. + def detect(self) -> None: """ - self.messages = [] - self.begin_timestamp = None - self.end_timestamp = None - self.warnings = [] + Process messages to detect malicious requests. - def _get_features(self, query: str) -> np.ndarray: - """Extracts feature vector from domain name for ML model inference. + This method applies the detection model to each message in the current batch, + identifies potential threats based on the model's predictions, and collects + warnings for further processing. - Computes various statistical and linguistic features from the domain name - including label lengths, character frequencies, entropy measures, and - counts of different character types across domain name levels. + The detection uses a threshold to determine if a prediction indicates + malicious activity, and only warnings exceeding this threshold are retained. - Args: - query (str): Domain name string to extract features from. - - Returns: - numpy.ndarray: Feature vector ready for ML model prediction. 
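Reduced to its essentials, the download-and-verify flow in `_get_model` above looks like the following sketch; URL, path, and checksum are placeholders, since the real code derives them from the detector configuration and `get_model_download_url()`:

```python
import hashlib
import os
import tempfile

import requests

BUF_SIZE = 65536  # read in 64 KB chunks, as in _sha256sum()


def sha256sum(file_path: str) -> str:
    """Stream the file so large model pickles are never fully loaded into memory."""
    h = hashlib.sha256()
    with open(file_path, "rb") as f:
        while chunk := f.read(BUF_SIZE):
            h.update(chunk)
    return h.hexdigest()


# Placeholders for illustration only.
model_url = "https://models.example.org/files/example_model.pickle"
expected_checksum = "0123abcd..."
model_path = os.path.join(tempfile.gettempdir(), "example_model.pickle")

if not os.path.isfile(model_path):
    response = requests.get(model_url)
    response.raise_for_status()
    with open(model_path, "wb") as f:
        f.write(response.content)

if sha256sum(model_path) != expected_checksum:
    raise ValueError("Checksum mismatch -- refusing to unpickle the downloaded model.")
```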
- """ - - # Splitting by dots to calculate label length and max length - query = query.strip(".") - label_parts = query.split(".") - - levels = { - "fqdn": query, - "secondleveldomain": label_parts[-2] if len(label_parts) >= 2 else "", - "thirdleveldomain": ( - ".".join(label_parts[:-2]) if len(label_parts) > 2 else "" - ), - } - - label_length = len(label_parts) - parts = query.split(".") - label_max = len(max(parts, key=str)) if parts else 0 - label_average = len(query) - - basic_features = np.array( - [label_length, label_max, label_average], dtype=np.float64 - ) - - alc = "abcdefghijklmnopqrstuvwxyz" - query_len = len(query) - freq = np.array( - [query.lower().count(c) / query_len if query_len > 0 else 0.0 for c in alc], - dtype=np.float64, - ) - - logger.debug("Get full, alpha, special, and numeric count.") - - def calculate_counts(level: str) -> np.ndarray: - if not level: - return np.array([0.0, 0.0, 0.0, 0.0], dtype=np.float64) - - full_count = len(level) / len(level) - alpha_ratio = sum(c.isalpha() for c in level) / len(level) - numeric_ratio = sum(c.isdigit() for c in level) / len(level) - special_ratio = sum( - not c.isalnum() and not c.isspace() for c in level - ) / len(level) - - return np.array( - [full_count, alpha_ratio, numeric_ratio, special_ratio], - dtype=np.float64, - ) - - fqdn_counts = calculate_counts(levels["fqdn"]) - third_counts = calculate_counts(levels["thirdleveldomain"]) - second_counts = calculate_counts(levels["secondleveldomain"]) - - level_features = np.hstack([third_counts, second_counts, fqdn_counts]) - - def calculate_entropy(s: str) -> float: - if len(s) == 0: - return 0.0 - probs = [s.count(c) / len(s) for c in dict.fromkeys(s)] - return -sum(p * math.log(p, 2) for p in probs) - - logger.debug("Start entropy calculation") - entropy_features = np.array( - [ - calculate_entropy(levels["fqdn"]), - calculate_entropy(levels["thirdleveldomain"]), - calculate_entropy(levels["secondleveldomain"]), - ], - dtype=np.float64, - ) - - logger.debug("Entropy features calculated") - - all_features = np.concatenate( - [basic_features, freq, level_features, entropy_features] - ) - - logger.debug("Finished data transformation") - - return all_features.reshape(1, -1) - - def detect(self) -> None: # pragma: no cover - """Analyzes DNS requests and identifies malicious domains. - - Processes each DNS request in the current batch by extracting features, - running ML model prediction, and collecting warnings for requests that - exceed the configured maliciousness threshold. + Note: + This method relies on the implementation of ``predict``of the rspective subclass """ logger.info("Start detecting malicious requests.") for message in self.messages: # TODO predict all messages - # TODO use scalar: self.scaler.transform(self._get_features(message["domain_name"])) - y_pred = self.model.predict_proba( - self._get_features(message["domain_name"]) - ) + y_pred = self.predict(message) logger.info(f"Prediction: {y_pred}") - if np.argmax(y_pred, axis=1) == 1 and y_pred[0][1] > THRESHOLD: + if np.argmax(y_pred, axis=1) == 1 and y_pred[0][1] > self.threshold: logger.info("Append malicious request to warning.") warning = { "request": message, "probability": float(y_pred[0][1]), - "model": MODEL, - "sha256": CHECKSUM, + # TODO: what is the use of this? not even json serializabel ? 
+ # "model": self.model, + "name": self.name, + "sha256": self.checksum, } self.warnings.append(warning) + def clear_data(self): + """Clears the data in the internal data structures.""" + self.messages = [] + self.begin_timestamp = None + self.end_timestamp = None + self.warnings = [] + def send_warning(self) -> None: - """Generates and stores alerts for detected malicious requests. + """ + Dispatch detected warnings to the appropriate systems. + + This method handles the reporting of detected threats by: + 1. Calculating an overall threat score + 2. Storing detailed warning information + 3. Updating database records with detection results + 4. Marking processed loglines with appropriate status - Creates comprehensive alert records from accumulated warnings including - overall risk scores, individual predictions, and metadata. Stores alerts - in the database and updates batch processing status. If no warnings are - present, marks the batch as filtered out. + The method updates multiple database tables to maintain the pipeline's + state tracking and provides detailed information about detected threats. """ logger.info("Store alert.") + row_id = generate_collisions_resistant_uuid() if len(self.warnings) > 0: overall_score = median( [warning["probability"] for warning in self.warnings] @@ -368,7 +357,7 @@ def send_warning(self) -> None: self.alerts.insert( dict( - client_ip=self.key, + src_ip=self.key, alert_timestamp=datetime.datetime.now(), suspicious_batch_id=self.suspicious_batch_id, overall_score=overall_score, @@ -382,8 +371,9 @@ def send_warning(self) -> None: self.suspicious_batch_timestamps.insert( dict( suspicious_batch_id=self.suspicious_batch_id, - client_ip=self.key, + src_ip=self.key, stage=module_name, + instance_name=self.name, status="finished", timestamp=datetime.datetime.now(), is_active=False, @@ -411,8 +401,9 @@ def send_warning(self) -> None: self.suspicious_batch_timestamps.insert( dict( suspicious_batch_id=self.suspicious_batch_id, - client_ip=self.key, + src_ip=self.key, stage=module_name, + instance_name=self.name, status="filtered_out", timestamp=datetime.datetime.now(), is_active=False, @@ -435,6 +426,18 @@ def send_warning(self) -> None: ) ) + self.batch_tree.insert( + dict( + batch_row_id=row_id, + stage=module_name, + instance_name=self.name, + status="finished", + timestamp=datetime.datetime.now(), + parent_batch_row_id=self.parent_row_id, + batch_id=self.suspicious_batch_id, + ) + ) + self.fill_levels.insert( dict( timestamp=datetime.datetime.now(), @@ -444,52 +447,78 @@ def send_warning(self) -> None: ) ) + # TODO: test bootstrap! + def bootstrap_detector_instance(self): + """ + Main processing loop for the detector instance. + + This method implements the core processing loop that continuously: + 1. Fetches data from Kafka + 2. Performs detection on the data + 3. Sends warnings for detected threats + 4. Handles exceptions and cleanup + + The loop continues until interrupted by a keyboard interrupt (Ctrl+C), + at which point it performs a graceful shutdown. -def main(one_iteration: bool = False) -> None: # pragma: no cover - """Creates and runs the Detector instance in a continuous processing loop. + Note: + This method is designed to run in a dedicated thread or process. 
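The overall alert score assembled in `send_warning` above is simply the median of the per-request probabilities; a small worked example with made-up warnings:

```python
from numpy import median

# Made-up warnings shaped like the ones collected in detect().
warnings = [
    {"request": {"domain_name": "a1b2c3d4.example"}, "probability": 0.91},
    {"request": {"domain_name": "k9x7q2ww.example"}, "probability": 0.72},
    {"request": {"domain_name": "zzqy81mm.example"}, "probability": 0.88},
]

overall_score = float(median([w["probability"] for w in warnings]))
print(overall_score)  # 0.88 -- the middle value, robust against a single outlier
```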
+ """ + while True: + try: + logger.debug("Before getting and filling data") + self.get_and_fill_data() + logger.debug("Inspect Data") + self.detect() + logger.debug("Send warnings") + self.send_warning() + except KafkaMessageFetchException as e: # pragma: no cover + logger.debug(e) + except IOError as e: + logger.error(e) + raise e + except ValueError as e: + logger.debug(e) + except KeyboardInterrupt: + logger.info("Closing down Detector...") + break + finally: + self.clear_data() + + async def start(self): # pragma: no cover + """ + Start the detector instance asynchronously. + + This method sets up the detector to run in an asynchronous execution context, + allowing it to operate concurrently with other components in the system. + """ + loop = asyncio.get_event_loop() + await loop.run_in_executor(None, self.bootstrap_detector_instance) - Initializes the Detector and starts the main processing loop that continuously - fetches suspicious batches from Kafka, performs malicious domain detection, - and generates alerts. Handles various exceptions gracefully and ensures - proper cleanup of data structures. - Args: - one_iteration (bool): For testing purposes - stops loop after one iteration. +async def main(): # pragma: no cover + """ + Initialize and start all detector instances defined in the configuration. - Raises: - KeyboardInterrupt: Execution interrupted by user. + This function: + 1. Reads detector configurations + 2. Dynamically loads detector classes + 3. Creates detector instances + 4. Starts all detectors concurrently """ - logger.info("Starting Detector...") - detector = Detector() - logger.info(f"Detector is running.") - - iterations = 0 - - while True: - if one_iteration and iterations > 0: - break - iterations += 1 - - try: - logger.debug("Before getting and filling data") - detector.get_and_fill_data() - logger.debug("Inspect Data") - detector.detect() - logger.debug("Send warnings") - detector.send_warning() - except KafkaMessageFetchException as e: # pragma: no cover - logger.debug(e) - except IOError as e: - logger.error(e) - raise e - except ValueError as e: - logger.debug(e) - except KeyboardInterrupt: - logger.info("Closing down Detector...") - break - finally: - detector.clear_data() + tasks = [] + for detector_config in DETECTORS: + consume_topic = f"{CONSUME_TOPIC_PREFIX}-{detector_config['name']}" + class_name = detector_config["detector_class_name"] + module_name = f"{PLUGIN_PATH}.{detector_config['detector_module_name']}" + module = importlib.import_module(module_name) + DetectorClass = getattr(module, class_name) + detector = DetectorClass( + detector_config=detector_config, consume_topic=consume_topic + ) + tasks.append(asyncio.create_task(detector.start())) + await asyncio.gather(*tasks) if __name__ == "__main__": # pragma: no cover - main() + asyncio.run(main()) diff --git a/src/detector/plugins/__init__.py b/src/detector/plugins/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/detector/plugins/dga_detector.py b/src/detector/plugins/dga_detector.py new file mode 100644 index 00000000..98d61ed1 --- /dev/null +++ b/src/detector/plugins/dga_detector.py @@ -0,0 +1,181 @@ +from src.detector.detector import DetectorBase +import math +import numpy as np +from src.base.log_config import get_logger + +module_name = "data_analysis.detector" +logger = get_logger(module_name) + + +class DGADetector(DetectorBase): + """ + Detector implementation for identifying Domain Generation Algorithm (DGA) domains. 
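To make the plugin contract above concrete: a hypothetical minimal detector that could live under `src/detector/plugins/` and be referenced via `detector_module_name`/`detector_class_name`. It only illustrates the three methods a plugin must provide; it is not usable as-is, because `DetectorBase.__init__` will still download and checksum a real model from the returned URLs:

```python
import numpy as np

from src.detector.detector import DetectorBase


class LengthHeuristicDetector(DetectorBase):
    """Hypothetical plugin sketch: flags unusually long domain names."""

    def get_model_download_url(self):
        # A real plugin returns a URL whose pickle matches the configured checksum.
        return "https://models.example.org/files/length_model.pickle"

    def get_scaler_download_url(self):
        return "https://models.example.org/files/length_scaler.pickle"

    def predict(self, message) -> np.ndarray:
        score = 1.0 if len(message.get("domain_name", "")) > 30 else 0.0
        # Shape (1, 2): [P(benign), P(malicious)], which is what detect() thresholds.
        return np.array([[1.0 - score, score]])
```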
+ + This class extends the DetectorBase to provide specific functionality for detecting + malicious domains generated by domain generation algorithms. It uses a machine learning + model to analyze domain name characteristics and identify potential DGA activity. + + The detector extracts various statistical and structural features from domain names + to make predictions about whether a domain is likely generated by a DGA. + """ + + def __init__(self, detector_config, consume_topic): + """ + Initialize the DGA detector with configuration parameters. + + Sets up the detector with the model base URL and passes configuration to the + base class for standard detector initialization. + + Args: + detector_config (dict): Configuration dictionary containing detector-specific + parameters including base_url, model, checksum, and threshold. + consume_topic (str): Kafka topic from which the detector will consume messages. + """ + self.model_base_url = detector_config["base_url"] + super().__init__(detector_config, consume_topic) + + def get_model_download_url(self): + """ + Generate the complete URL for downloading the DGA detection model. + + Constructs the URL using the base URL from configuration and appends the + specific model filename with checksum for verification. + + Returns: + str: Fully qualified URL where the model can be downloaded. + """ + self.model_base_url = ( + self.model_base_url[:-1] + if self.model_base_url[-1] == "/" + else self.model_base_url + ) + return f"{self.model_base_url}/files/?p=%2F{self.model}/{self.checksum}/{self.model}.pickle&dl=1" + + def get_scaler_download_url(self): + """ + Generate the complete URL for downloading the DGA detection models scaler. + + Constructs the URL using the base URL from configuration and appends the + specific model filename with checksum for verification. + + Returns: + str: Fully qualified URL where the model can be downloaded. + """ + self.model_base_url = ( + self.model_base_url[:-1] + if self.model_base_url[-1] == "/" + else self.model_base_url + ) + return f"{self.model_base_url}/files/?p=%2F{self.model}/{self.checksum}/scaler.pickle&dl=1" + + def predict(self, message): + """ + Process a message and predict if the domain is likely generated by a DGA. + + Extracts features from the domain name in the message and uses the loaded + machine learning model to generate prediction probabilities. + + Args: + message (dict): A dictionary containing message data, expected to have + a "domain_name" key with the domain to analyze. + + Returns: + np.ndarray: Prediction probabilities for each class. Typically a 2D array + where the shape is (1, 2) for binary classification (benign/malicious). + """ + y_pred = self.model.predict_proba(self._get_features(message["domain_name"])) + return y_pred + + def _get_features(self, query: str): + """Transform a dataset with new features using numpy. + + Args: + query (str): A string to process. + + Returns: + dict: Preprocessed data with computed features. 
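One of the more discriminative features computed in the `_get_features` body below is character-level Shannon entropy; a quick illustration of why it helps separate DGA-style names from human-chosen ones (domains are made up):

```python
import math


def shannon_entropy(s: str) -> float:
    """Character-level Shannon entropy, as computed per domain level below."""
    if not s:
        return 0.0
    probabilities = [s.count(c) / len(s) for c in set(s)]
    return -sum(p * math.log(p, 2) for p in probabilities)


print(round(shannon_entropy("mail.example.com"), 2))      # ~3.2, repeated characters
print(round(shannon_entropy("xj4k9qz2v7b1.example"), 2))  # ~4.1, near-random label
```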
+ """ + # Splitting by dots to calculate label length and max length + label_parts = query.split(".") + label_length = len(label_parts) + label_max = max(len(part) for part in label_parts) + label_average = len(query.strip(".")) + + logger.debug("Get letter frequency") + alc = "abcdefghijklmnopqrstuvwxyz" + freq = np.array( + [query.lower().count(i) / len(query) if len(query) > 0 else 0 for i in alc] + ) + + logger.debug("Get full, alpha, special, and numeric count.") + + def calculate_counts(level: str) -> np.ndarray: + if len(level) == 0: + return np.array([0, 0, 0, 0]) + + full_count = len(level) + alpha_count = sum(c.isalpha() for c in level) / full_count + numeric_count = sum(c.isdigit() for c in level) / full_count + special_count = ( + sum(not c.isalnum() and not c.isspace() for c in level) / full_count + ) + + return np.array([full_count, alpha_count, numeric_count, special_count]) + + levels = { + "fqdn": query, + "thirdleveldomain": label_parts[0] if len(label_parts) > 2 else "", + "secondleveldomain": label_parts[1] if len(label_parts) > 1 else "", + } + counts = { + level: calculate_counts(level_value) + for level, level_value in levels.items() + } + + logger.debug( + "Get standard deviation, median, variance, and mean for full, alpha, special, and numeric count." + ) + stats = {} + for level, count_array in counts.items(): + stats[f"{level}_std"] = np.std(count_array) + stats[f"{level}_var"] = np.var(count_array) + stats[f"{level}_median"] = np.median(count_array) + stats[f"{level}_mean"] = np.mean(count_array) + + logger.debug("Start entropy calculation") + + def calculate_entropy(s: str) -> float: + if len(s) == 0: + return 0 + probabilities = [float(s.count(c)) / len(s) for c in dict.fromkeys(list(s))] + entropy = -sum(p * math.log(p, 2) for p in probabilities) + return entropy + + entropy = {level: calculate_entropy(value) for level, value in levels.items()} + + logger.debug("Finished entropy calculation") + + # Final feature aggregation as a NumPy array + basic_features = np.array([label_length, label_max, label_average]) + + # Flatten counts and stats for each level into arrays + level_features = np.hstack([counts[level] for level in levels.keys()]) + + # Entropy features + entropy_features = np.array([entropy[level] for level in levels.keys()]) + + # Concatenate all features into a single numpy array + all_features = np.concatenate( + [ + basic_features, + freq, + # freq_features, + level_features, + # stats_features, + entropy_features, + ] + ) + + logger.debug("Finished data transformation") + + return all_features.reshape(1, -1) diff --git a/src/inspector/inspector.py b/src/inspector/inspector.py index 3aa5de0b..8be8d958 100644 --- a/src/inspector/inspector.py +++ b/src/inspector/inspector.py @@ -4,7 +4,8 @@ import uuid from datetime import datetime from enum import Enum, unique - +import asyncio +from abc import ABC, abstractmethod import marshmallow_dataclass import numpy as np from streamad.util import StreamGenerator, CustomDS @@ -12,7 +13,11 @@ sys.path.append(os.getcwd()) from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender from src.base.data_classes.batch import Batch -from src.base.utils import setup_config +from src.base.utils import ( + setup_config, + get_zeek_sensor_topic_base_names, + generate_collisions_resistant_uuid, +) from src.base.kafka_handler import ( ExactlyOnceKafkaConsumeHandler, ExactlyOnceKafkaProduceHandler, @@ -24,21 +29,18 @@ logger = get_logger(module_name) config = setup_config() -MODE = 
config["pipeline"]["data_inspection"]["inspector"]["mode"] -ENSEMBLE = config["pipeline"]["data_inspection"]["inspector"]["ensemble"] -MODELS = config["pipeline"]["data_inspection"]["inspector"]["models"] -ANOMALY_THRESHOLD = config["pipeline"]["data_inspection"]["inspector"][ - "anomaly_threshold" +PRODUCE_TOPIC_PREFIX = config["environment"]["kafka_topics_prefix"]["pipeline"][ + "inspector_to_detector" ] -SCORE_THRESHOLD = config["pipeline"]["data_inspection"]["inspector"]["score_threshold"] -TIME_TYPE = config["pipeline"]["data_inspection"]["inspector"]["time_type"] -TIME_RANGE = config["pipeline"]["data_inspection"]["inspector"]["time_range"] -CONSUME_TOPIC = config["environment"]["kafka_topics"]["pipeline"][ +CONSUME_TOPIC_PREFIX = config["environment"]["kafka_topics_prefix"]["pipeline"][ "prefilter_to_inspector" ] -PRODUCE_TOPIC = config["environment"]["kafka_topics"]["pipeline"][ - "inspector_to_detector" -] +SENSOR_PROTOCOLS = get_zeek_sensor_topic_base_names(config) +PREFILTERS = config["pipeline"]["log_filtering"] +INSPECTORS = config["pipeline"]["data_inspection"] +COLLECTORS = config["pipeline"]["log_collection"]["collectors"] +DETECTORS = config["pipeline"]["data_analysis"] +PLUGIN_PATH = "src.inspector.plugins" KAFKA_BROKERS = ",".join( [ f"{broker['hostname']}:{broker['port']}" @@ -46,48 +48,58 @@ ] ) -VALID_UNIVARIATE_MODELS = [ - "KNNDetector", - "SpotDetector", - "SRDetector", - "ZScoreDetector", - "OCSVMDetector", - "MadDetector", - "SArimaDetector", -] -VALID_MULTIVARIATE_MODELS = [ - "xStreamDetector", - "RShashDetector", - "HSTreeDetector", - "LodaDetector", - "OCSVMDetector", - "RrcfDetector", -] -VALID_ENSEMBLE_MODELS = ["WeightEnsemble", "VoteEnsemble"] +class InspectorAbstractBase(ABC): # pragma: no cover + @abstractmethod + def __init__(self, consume_topic, produce_topics, config) -> None: + pass -STATIC_ZEROS_UNIVARIATE = np.zeros((100, 1)) -STATIC_ZEROS_MULTIVARIATE = np.zeros((100, 2)) + @abstractmethod + def inspect_anomalies(self) -> None: + pass + @abstractmethod + def _get_models(self, models) -> list: + pass -@unique -class EnsembleModels(str, Enum): - """Available ensemble models for combining multiple anomaly detectors""" + @abstractmethod + def subnet_is_suspicious(self) -> bool: + pass - WEIGHT = "WeightEnsemble" - VOTE = "VoteEnsemble" +class InspectorBase(InspectorAbstractBase): + """Finds anomalies in a batch of requests and produces it to the ``Detector``.""" -class Inspector: - """Main component of the Data Inspection stage to detect anomalies in request batches + def __init__(self, consume_topic, produce_topics, config) -> None: + """ + Initializes the InspectorBase with necessary configurations and connections. - Analyzes batches of DNS requests using configurable streaming anomaly detection models. - Supports univariate, multivariate, and ensemble detection modes. Processes time series - features from DNS request patterns to identify suspicious network behavior and forwards - anomalous batches to the Detector for further analysis. - """ + Sets up Kafka handlers, database connections, and configuration parameters based on + the provided configuration. For non-NoInspector implementations, initializes model + related parameters including mode, model configurations, thresholds, and time parameters. 
+ + Args: + consume_topic (str): Kafka topic to consume messages from + produce_topics (list): List of Kafka topics to produce messages to + config (dict): Configuration dictionary containing inspector settings + + Note: + The "NoInspector" implementation skips model configuration initialization + as it doesn't perform actual anomaly detection. + """ - def __init__(self) -> None: + if not config["inspector_class_name"] == "NoInspector": + self.mode = config["mode"] + self.model_configurations = ( + config["models"] if "models" in config.keys() else None + ) + self.anomaly_threshold = config["anomaly_threshold"] + self.score_threshold = config["score_threshold"] + self.time_type = config["time_type"] + self.time_range = config["time_range"] + self.name = config["name"] + self.consume_topic = consume_topic + self.produce_topics = produce_topics self.batch_id = None self.X = None self.key = None @@ -97,10 +109,11 @@ def __init__(self) -> None: self.messages = [] self.anomalies = [] - self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(CONSUME_TOPIC) + self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(self.consume_topic) self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler() # databases + self.batch_tree = ClickHouseKafkaSender("batch_tree") self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") self.suspicious_batch_timestamps = ClickHouseKafkaSender( "suspicious_batch_timestamps" @@ -135,25 +148,39 @@ def get_and_fill_data(self) -> None: return key, data = self.kafka_consume_handler.consume_as_object() - if data: + self.parent_row_id = data.batch_tree_row_id self.batch_id = data.batch_id self.begin_timestamp = data.begin_timestamp self.end_timestamp = data.end_timestamp self.messages = data.data self.key = key - self.batch_timestamps.insert( dict( batch_id=self.batch_id, stage=module_name, status="in_process", + instance_name=self.name, timestamp=datetime.now(), is_active=True, message_count=len(self.messages), ) ) + row_id = generate_collisions_resistant_uuid() + + self.batch_tree.insert( + dict( + batch_row_id=row_id, + stage=module_name, + instance_name=self.name, + status="in_process", + timestamp=datetime.now(), + parent_batch_row_id=self.parent_row_id, + batch_id=self.batch_id, + ) + ) + self.fill_levels.insert( dict( timestamp=datetime.now(), @@ -162,7 +189,6 @@ def get_and_fill_data(self) -> None: entry_count=len(self.messages), ) ) - if not self.messages: logger.info( "Received message:\n" @@ -187,367 +213,19 @@ def clear_data(self) -> None: self.end_timestamp = None logger.debug("Cleared messages and timestamps. Inspector is now available.") - def _mean_packet_size( - self, messages: list, begin_timestamp, end_timestamp - ) -> np.ndarray: - """Calculates mean packet size per time step for time series analysis. - - Computes the average packet size for each time step in a given time window. - Time steps are configurable via "time_type" and "time_range" in config.yaml. - Default time step is 1 ms. - - Args: - messages (list): Messages from KafkaConsumeHandler containing size information. - begin_timestamp (datetime): Start timestamp of the batch time window. - end_timestamp (datetime): End timestamp of the batch time window. - - Returns: - numpy.ndarray: 2-D numpy array with mean packet sizes for each time step. 
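The `batch_tree` rows written here (and by the other stages) link each processing step to its parent via `parent_batch_row_id`, so the path of a batch through the pipeline can be reconstructed later. A sketch of that walk over in-memory rows; the stage names are assumptions, and in practice the rows would come from the ClickHouse `batch_tree` table:

```python
# Rows shaped like the batch_tree.insert(...) calls; ids and stage names are made up.
rows = [
    {"batch_row_id": "a-1", "parent_batch_row_id": None,  "stage": "log_filtering.prefilter"},
    {"batch_row_id": "b-2", "parent_batch_row_id": "a-1", "stage": "data_inspection.inspector"},
    {"batch_row_id": "c-3", "parent_batch_row_id": "b-2", "stage": "data_analysis.detector"},
]
by_id = {row["batch_row_id"]: row for row in rows}


def lineage(batch_row_id: str) -> list:
    """Follow parent_batch_row_id links back to the first stage that saw the batch."""
    chain = []
    current = by_id.get(batch_row_id)
    while current is not None:
        chain.append(current["stage"])
        current = by_id.get(current["parent_batch_row_id"])
    return list(reversed(chain))


print(lineage("c-3"))
# ['log_filtering.prefilter', 'data_inspection.inspector', 'data_analysis.detector']
```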
- """ - logger.debug("Convert timestamps to numpy datetime64") - timestamps = np.array( - [ - np.datetime64(datetime.fromisoformat(item["timestamp"])) - for item in messages - ] - ) - - # Extract and convert the size values from "111b" to integers - sizes = np.array([int(str(item["size"]).replace("b", "")) for item in messages]) - - logger.debug("Sort timestamps and count occurrences") - sorted_indices = np.argsort(timestamps) - timestamps = timestamps[sorted_indices] - sizes = sizes[sorted_indices] - - logger.debug("Set min_date and max_date") - min_date = np.datetime64(begin_timestamp) - max_date = np.datetime64(end_timestamp) - - logger.debug( - "Generate the time range from min_date to max_date with 1ms interval" - ) - time_range = np.arange( - min_date, - max_date + np.timedelta64(TIME_RANGE, TIME_TYPE), - np.timedelta64(TIME_RANGE, TIME_TYPE), - ) - - logger.debug( - "Initialize an array to hold counts for each timestamp in the range" - ) - counts = np.zeros(time_range.shape, dtype=np.float64) - size_sums = np.zeros(time_range.shape, dtype=np.float64) - mean_sizes = np.zeros(time_range.shape, dtype=np.float64) - - # Handle empty messages. - if len(messages) > 0: - logger.debug( - "Count occurrences of timestamps and fill the corresponding index in the counts array" - ) - _, unique_indices, unique_counts = np.unique( - timestamps, return_index=True, return_counts=True - ) - - # Sum the sizes at each unique timestamp - for idx, count in zip(unique_indices, unique_counts): - time_index = ( - ((timestamps[idx] - min_date) // TIME_RANGE) - .astype(f"timedelta64[{TIME_TYPE}]") - .astype(int) - ) - size_sums[time_index] = np.sum(sizes[idx : idx + count]) - counts[time_index] = count - - # Calculate the mean packet size for each millisecond (ignore division by zero warnings) - with np.errstate(divide="ignore", invalid="ignore"): - mean_sizes = np.divide( - size_sums, counts, out=np.zeros_like(size_sums), where=counts != 0 - ) - else: - logger.warning("Empty messages to inspect.") - - logger.debug("Reshape into the required shape (n, 1)") - return mean_sizes.reshape(-1, 1) - - def _count_errors( - self, messages: list, begin_timestamp, end_timestamp - ) -> np.ndarray: - """Counts message occurrences per time step for time series analysis. - - Counts the number of messages occurring in each time step within a given time window. - Time steps are configurable via "time_type" and "time_range" in config.yaml. - Default time step is 1 ms. - - Args: - messages (list): Messages from KafkaConsumeHandler containing timestamp information. - begin_timestamp (datetime): Start timestamp of the batch time window. - end_timestamp (datetime): End timestamp of the batch time window. - - Returns: - numpy.ndarray: 2-D numpy array with message counts for each time step. 
- """ - logger.debug("Convert timestamps to numpy datetime64") - timestamps = np.array( - [ - np.datetime64(datetime.fromisoformat(item["timestamp"])) - for item in messages - ] - ) - - logger.debug("Sort timestamps and count occurrences") - sorted_indices = np.argsort(timestamps) - timestamps = timestamps[sorted_indices] - - logger.debug("Set min_date and max_date") - min_date = np.datetime64(begin_timestamp) - max_date = np.datetime64(end_timestamp) - - logger.debug( - "Generate the time range from min_date to max_date with 1ms interval" - ) - # Adding np.timedelta adds end time to time_range - time_range = np.arange( - min_date, - max_date + np.timedelta64(TIME_RANGE, TIME_TYPE), - np.timedelta64(TIME_RANGE, TIME_TYPE), - ) - - logger.debug( - "Initialize an array to hold counts for each timestamp in the range" - ) - counts = np.zeros(time_range.shape, dtype=np.float64) - - # Handle empty messages. - if len(messages) > 0: - logger.debug( - "Count occurrences of timestamps and fill the corresponding index in the counts array" - ) - unique_times, _, unique_counts = np.unique( - timestamps, return_index=True, return_counts=True - ) - time_indices = ( - ((unique_times - min_date) // TIME_RANGE) - .astype(f"timedelta64[{TIME_TYPE}]") - .astype(int) - ) - counts[time_indices] = unique_counts - else: - logger.warning("Empty messages to inspect.") - - logger.debug("Reshape into the required shape (n, 1)") - return counts.reshape(-1, 1) - - def inspect(self) -> None: - """Runs anomaly detection using configured StreamAD models. - - Executes anomaly detection based on the configured mode (univariate, multivariate, or ensemble). - Validates model configuration and delegates to the appropriate inspection method. - - Raises: - NotImplementedError: If no models are configured or mode is unsupported. - """ - if MODELS == None or len(MODELS) == 0: - logger.warning("No model ist set!") - raise NotImplementedError(f"No model is set!") - if len(MODELS) > 1: - logger.warning( - f"Model List longer than 1. Only the first one is taken: {MODELS[0]['model']}!" - ) - self._get_models(MODELS) - match MODE: - case "univariate": - self._inspect_univariate() - case "multivariate": - self._inspect_multivariate() - case "ensemble": - self._get_ensemble() - self._inspect_ensemble() - case _: - logger.warning(f"Mode {MODE} is not supported!") - raise NotImplementedError(f"Mode {MODE} is not supported!") - - def _inspect_multivariate(self) -> None: - """Performs multivariate anomaly detection using StreamAD model. - - Combines mean packet size and message count time series to create a multivariate - feature matrix for anomaly detection. Computes anomaly scores for each time step - using the configured multivariate StreamAD model. - """ - - logger.debug("Inspecting data...") - - X_1 = self._mean_packet_size( - self.messages, self.begin_timestamp, self.end_timestamp - ) - X_2 = self._count_errors( - self.messages, self.begin_timestamp, self.end_timestamp - ) - - self.X = np.concatenate((X_1, X_2), axis=1) - - # TODO Append zeros to avoid issues when model is reused. - # self.X = np.vstack((STATIC_ZEROS_MULTIVARIATE, X)) - - ds = CustomDS(self.X, self.X) - stream = StreamGenerator(ds.data) - - for x in stream.iter_item(): - score = self.models[0].fit_score(x) - # noqa - if score != None: - self.anomalies.append(score) - else: - self.anomalies.append(0) - - def _inspect_ensemble(self) -> None: - """Performs ensemble anomaly detection using multiple StreamAD models. 
- - Uses message count time series and combines scores from multiple StreamAD models - through ensemble methods (Weight or Vote). Computes final ensemble scores - for each time step in the data. - """ - self.X = self._count_errors( - self.messages, self.begin_timestamp, self.end_timestamp - ) - - # TODO Append zeros to avoid issues when model is reused. - # self.X = np.vstack((STATIC_ZEROS_UNIVARIATE, X)) - - ds = CustomDS(self.X, self.X) - stream = StreamGenerator(ds.data) - - for x in stream.iter_item(): - scores = [] - # Fit all models in ensemble - for model in self.models: - scores.append(model.fit_score(x)) - # TODO Calibrators are missing - score = self.ensemble.ensemble(scores) - # noqa - if score != None: - self.anomalies.append(score) - else: - self.anomalies.append(0) - - def _inspect_univariate(self) -> None: - """Performs univariate anomaly detection using StreamAD model. - - Uses message count time series as a single feature for anomaly detection. - Computes anomaly scores for each time step using the configured - univariate StreamAD model. - """ - - logger.debug("Inspecting data...") - - self.X = self._count_errors( - self.messages, self.begin_timestamp, self.end_timestamp - ) - - # TODO Append zeros to avoid issues when model is reused. - # self.X = np.vstack((STATIC_ZEROS_UNIVARIATE, X)) - - ds = CustomDS(self.X, self.X) - stream = StreamGenerator(ds.data) - - for x in stream.iter_item(): - score = self.models[0].fit_score(x) - # noqa - if score is not None: - self.anomalies.append(score) - else: - self.anomalies.append(0) - - def _get_models(self, models: list) -> None: - """Loads and initializes StreamAD detection models. - - Dynamically imports and instantiates the configured StreamAD models based on the - detection mode (univariate, multivariate, or ensemble). Validates model compatibility - with the selected mode and initializes models with their configuration parameters. - - Args: - models (list): List of model configurations containing module and model information. - - Raises: - NotImplementedError: If a model is not compatible with the selected mode. - """ - if hasattr(self, "models") and self.models != None and self.models != []: - logger.info("All models have been successfully loaded!") - return - - self.models = [] - for model in models: - if MODE == "univariate" or MODE == "ensemble": - logger.debug(f"Load Model: {model['model']} from {model['module']}.") - if not model["model"] in VALID_UNIVARIATE_MODELS: - logger.error( - f"Model {models} is not a valid univariate or ensemble model." - ) - raise NotImplementedError( - f"Model {models} is not a valid univariate or ensemble model." - ) - if MODE == "multivariate": - logger.debug(f"Load Model: {model['model']} from {model['module']}.") - if not model["model"] in VALID_MULTIVARIATE_MODELS: - logger.error(f"Model {model} is not a valid multivariate model.") - raise NotImplementedError( - f"Model {model} is not a valid multivariate model." - ) - - module = importlib.import_module(model["module"]) - module_model = getattr(module, model["model"]) - self.models.append(module_model(**model["model_args"])) - - def _get_ensemble(self) -> None: - """Loads and initializes ensemble model for combining multiple detectors. - - Dynamically imports and instantiates the configured ensemble model (Weight or Vote) - that combines scores from multiple StreamAD models. Validates that the ensemble - model is supported and initializes it with configuration parameters. 
- - Raises: - NotImplementedError: If the ensemble model is not supported. - """ - logger.debug(f"Load Model: {ENSEMBLE['model']} from {ENSEMBLE['module']}.") - if not ENSEMBLE["model"] in VALID_ENSEMBLE_MODELS: - logger.error(f"Model {ENSEMBLE} is not a valid ensemble model.") - raise NotImplementedError( - f"Model {ENSEMBLE} is not a valid ensemble model." - ) - - if hasattr(self, "ensemble") and self.ensemble != None: - logger.info("Ensemble have been successfully loaded!") - return - - module = importlib.import_module(ENSEMBLE["module"]) - module_model = getattr(module, ENSEMBLE["model"]) - self.ensemble = module_model(**ENSEMBLE["model_args"]) - - def send_data(self) -> None: - """Forwards anomalous data to the Detector for further analysis. - - Evaluates anomaly scores against the configured thresholds. If the proportion of - anomalous time steps exceeds the threshold, groups messages by client IP and - forwards each group as a suspicious batch to the Detector via Kafka. Otherwise, - logs the batch as filtered out and updates monitoring databases. - """ - total_anomalies = np.count_nonzero( - np.greater_equal(np.array(self.anomalies), SCORE_THRESHOLD) - ) - if total_anomalies / len(self.X) > ANOMALY_THRESHOLD: # subnet is suspicious - logger.info("Sending anomalies to detector for further analysis.") + def send_data(self): + """Pass the anomalous data for the detector unit for further processing""" + row_id = generate_collisions_resistant_uuid() + if self.subnet_is_suspicious(): buckets = {} - for message in self.messages: - if message["client_ip"] in buckets.keys(): - buckets[message["client_ip"]].append(message) + if message["src_ip"] in buckets.keys(): + buckets[message["src_ip"]].append(message) else: - buckets[message["client_ip"]] = [] - buckets.get(message["client_ip"]).append(message) + buckets[message["src_ip"]] = [] + buckets.get(message["src_ip"]).append(message) for key, value in buckets.items(): - logger.info(f"Sending anomalies to detector for {key}.") - logger.info(f"Sending anomalies to detector for {value}.") suspicious_batch_id = uuid.uuid4() # generate new suspicious_batch_id @@ -559,6 +237,7 @@ def send_data(self) -> None: ) data_to_send = { + "batch_tree_row_id": row_id, "batch_id": suspicious_batch_id, "begin_timestamp": self.begin_timestamp, "end_timestamp": self.end_timestamp, @@ -567,11 +246,13 @@ def send_data(self) -> None: batch_schema = marshmallow_dataclass.class_schema(Batch)() + # important to finish before sending, otherwise detector can process before finished here! 
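The rewritten `send_data` groups the batch's loglines by `src_ip` before forwarding each client as its own suspicious batch. A behaviourally equivalent sketch of just that grouping step, written with `collections.defaultdict` instead of the inline if/else (standalone helper for illustration only, not part of the module's API):

```python
from collections import defaultdict


def group_by_src_ip(messages: list) -> dict:
    """Collect all loglines of one client IP so each client can be forwarded
    to the Detector as its own suspicious batch."""
    buckets = defaultdict(list)
    for message in messages:
        buckets[message["src_ip"]].append(message)
    return dict(buckets)


# group_by_src_ip([{"src_ip": "10.0.0.1"}, {"src_ip": "10.0.0.2"}, {"src_ip": "10.0.0.1"}])
# -> {"10.0.0.1": [<2 loglines>], "10.0.0.2": [<1 logline>]}
```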
self.suspicious_batch_timestamps.insert( dict( suspicious_batch_id=suspicious_batch_id, - client_ip=key, + src_ip=key, stage=module_name, + instance_name=self.name, status="finished", timestamp=datetime.now(), is_active=True, @@ -579,16 +260,31 @@ def send_data(self) -> None: ) ) - self.kafka_produce_handler.produce( - topic=PRODUCE_TOPIC, - data=batch_schema.dumps(data_to_send), - key=key, + self.batch_tree.insert( + dict( + batch_row_id=row_id, + stage=module_name, + instance_name=self.name, + status="finished", + timestamp=datetime.now(), + parent_batch_row_id=self.parent_row_id, + batch_id=suspicious_batch_id, + ) ) + for topic in self.produce_topics: + self.kafka_produce_handler.produce( + topic=topic, + data=batch_schema.dumps(data_to_send), + key=key, + ) + else: # subnet is not suspicious + self.batch_timestamps.insert( dict( batch_id=self.batch_id, stage=module_name, + instance_name=self.name, status="filtered_out", timestamp=datetime.now(), is_active=False, @@ -611,6 +307,17 @@ def send_data(self) -> None: ) ) + self.batch_tree.insert( + dict( + batch_row_id=row_id, + stage=module_name, + instance_name=self.name, + status="finished", + timestamp=datetime.now(), + parent_batch_row_id=self.parent_row_id, + batch_id=self.batch_id, + ) + ) self.fill_levels.insert( dict( timestamp=datetime.now(), @@ -620,53 +327,103 @@ def send_data(self) -> None: ) ) + def inspect(self): + """ + Executes the anomaly detection process with validation and fallback handling. -def main(one_iteration: bool = False) -> None: - """Creates and runs the Inspector instance in a continuous processing loop. + This method: + 1. Validates that model configurations exist + 2. Logs a warning if multiple models are configured (only first is used) + 3. Retrieves the models through _get_models() + 4. Calls inspect_anomalies() to perform the actual detection - Initializes the Inspector and starts the main processing loop that continuously - fetches batches from Kafka, performs anomaly detection, and forwards suspicious - batches to the Detector. Handles various exceptions gracefully and ensures - proper cleanup of data structures. + Raises: + NotImplementedError: If no model configurations are provided + """ + if self.model_configurations == None or len(self.model_configurations) == 0: + logger.warning("No model ist set!") + raise NotImplementedError(f"No model is set!") + if len(self.model_configurations) > 1: + logger.warning( + f"Model List longer than 1. Only the first one is taken: {self.model_configurations[0]['model']}!" + ) + self.models = self._get_models(self.model_configurations) + self.inspect_anomalies() - Args: - one_iteration (bool): For testing purposes - stops loop after one iteration. + # TODO: test this! + def bootstrap_inspection_process(self): + """ + Implements the main inspection process loop that continuously: + 1. Fetches new data from Kafka + 2. Inspects the data for anomalies + 3. Sends suspicious data to detectors + """ + logger.info(f"Starting {self.name}") + while True: + try: + self.get_and_fill_data() + self.inspect() + self.send_data() + except KafkaMessageFetchException as e: # pragma: no cover + logger.debug(e) + except IOError as e: + logger.error(e) + raise e + except ValueError as e: + logger.debug(e) + except KeyboardInterrupt: + logger.info(f" {self.consume_topic} Closing down Inspector...") + break + finally: + self.clear_data() + + async def start(self): # pragma: no cover + """ + Starts the inspector in an asynchronous context. 
+ + This method runs the synchronous bootstrap_inspection_process() in a separate + thread using run_in_executor, allowing the inspector to operate concurrently + with other async components in the pipeline. + """ + loop = asyncio.get_running_loop() + await loop.run_in_executor(None, self.bootstrap_inspection_process) - Raises: - KeyboardInterrupt: Execution interrupted by user. + +async def main(): """ - logger.info("Starting Inspector...") - inspector = Inspector() - logger.info(f"Inspector is running.") - - iterations = 0 - - while True: - if one_iteration and iterations > 0: - break - iterations += 1 - - try: - logger.debug("Before getting and filling data") - inspector.get_and_fill_data() - logger.debug("After getting and filling data") - logger.debug("Start anomaly detection") - inspector.inspect() - logger.debug("Send data to detector") - inspector.send_data() - except KafkaMessageFetchException as e: # pragma: no cover - logger.debug(e) - except IOError as e: - logger.error(e) - raise e - except ValueError as e: - logger.debug(e) - except KeyboardInterrupt: - logger.info("Closing down Inspector...") - break - finally: - inspector.clear_data() + Entry point for the Inspector module. + + This function: + 1. Iterates through all configured inspectors + 2. Creates the appropriate inspector instance based on configuration + 3. Starts each inspector as an asynchronous task + 4. Gathers all tasks to run them concurrently + + The function dynamically loads inspector classes from the plugin system + based on configuration values, allowing for flexible extension of the + inspection capabilities. + + """ + tasks = [] + for inspector in INSPECTORS: + logger.info(inspector["name"]) + consume_topic = f"{CONSUME_TOPIC_PREFIX}-{inspector['name']}" + produce_topics = [ + f"{PRODUCE_TOPIC_PREFIX}-{detector['name']}" + for detector in DETECTORS + if detector["inspector_name"] == inspector["name"] + ] + class_name = inspector["inspector_class_name"] + module_name = f"{PLUGIN_PATH}.{inspector['inspector_module_name']}" + module = importlib.import_module(module_name) + InspectorClass = getattr(module, class_name) + logger.info(f"using {class_name} and {module_name}") + inspector_instance = InspectorClass( + consume_topic=consume_topic, produce_topics=produce_topics, config=inspector + ) + tasks.append(asyncio.create_task(inspector_instance.start())) + await asyncio.gather(*tasks) if __name__ == "__main__": # pragma: no cover - main() + asyncio.run(main()) diff --git a/src/inspector/plugins/__init__.py b/src/inspector/plugins/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/inspector/plugins/no_inspector.py b/src/inspector/plugins/no_inspector.py new file mode 100644 index 00000000..fface6ae --- /dev/null +++ b/src/inspector/plugins/no_inspector.py @@ -0,0 +1,27 @@ +from src.inspector.inspector import InspectorBase +import numpy as np +from src.base.log_config import get_logger + +module_name = "data_inspection.inspector" +logger = get_logger(module_name) + + +# TODO: test this! +class NoInspector(InspectorBase): + def __init__(self, consume_topic, produce_topics, config) -> None: + super().__init__(consume_topic, produce_topics, config) + + def inspect_anomalies(self) -> None: + # declare everything to be suspicious + self.anomalies = np.array([1 for message in self.messages]) + + # TODO: send data needs to be either revised or values be set for it ! 
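The `NoInspector` plugin here and the `StreamADInspector` plugin further down suggest the intended extension pattern: subclass `InspectorBase`, fill `self.anomalies`, and decide per batch whether the subnet is suspicious. A hypothetical third plugin might look like the sketch below; only the hook names (`inspect`, `inspect_anomalies`, `_get_models`, `subnet_is_suspicious`) and the constructor signature are taken from the shipped plugins, while the class name, the `max_messages` option, and reading it via `config.get` are invented for illustration.

```python
from src.inspector.inspector import InspectorBase
import numpy as np


class RateThresholdInspector(InspectorBase):
    """Hypothetical plugin: flag a batch when it simply contains too many loglines."""

    def __init__(self, consume_topic, produce_topics, config) -> None:
        super().__init__(consume_topic, produce_topics, config)
        # assumed plugin-specific option taken from this inspector's config entry
        self.max_messages = config.get("max_messages", 1000)

    def inspect(self) -> None:
        # no model validation needed, mirror NoInspector and go straight to scoring
        self.inspect_anomalies()

    def inspect_anomalies(self) -> None:
        # one "score" per message: 1 if the batch is oversized, else 0
        flag = 1 if len(self.messages) > self.max_messages else 0
        self.anomalies = np.full(len(self.messages), flag)

    def _get_models(self, models):
        return None  # this heuristic uses no StreamAD models

    def subnet_is_suspicious(self) -> bool:
        return bool(np.any(self.anomalies))
```

Such a class would presumably be wired in through the `inspector_class_name` / `inspector_module_name` values that the `main()` loader above resolves.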
+ def inspect(self) -> None: + self.inspect_anomalies() + + def _get_models(self, models): + pass + + def subnet_is_suspicious(self) -> bool: + logger.info(f"{self.name}: {len(self.anomalies)} anomalies found") + return True diff --git a/src/inspector/plugins/stream_ad_inspector.py b/src/inspector/plugins/stream_ad_inspector.py new file mode 100644 index 00000000..78fa80ad --- /dev/null +++ b/src/inspector/plugins/stream_ad_inspector.py @@ -0,0 +1,342 @@ +from src.inspector.inspector import InspectorBase +import importlib +import os +import sys +from datetime import datetime +import numpy as np +from streamad.util import StreamGenerator, CustomDS + +# TODO: test all of this! +sys.path.append(os.getcwd()) +from src.base.utils import setup_config +from src.base.log_config import get_logger + +module_name = "data_inspection.inspector" +logger = get_logger(module_name) + +config = setup_config() +VALID_UNIVARIATE_MODELS = [ + "KNNDetector", + "SpotDetector", + "SRDetector", + "ZScoreDetector", + "OCSVMDetector", + "MadDetector", + "SArimaDetector", +] + +VALID_MULTIVARIATE_MODELS = [ + "xStreamDetector", + "RShashDetector", + "HSTreeDetector", + "LodaDetector", + "OCSVMDetector", + "RrcfDetector", +] + +VALID_ENSEMBLE_MODELS = ["WeightEnsemble", "VoteEnsemble"] + +STATIC_ZEROS_UNIVARIATE = np.zeros((100, 1)) +STATIC_ZEROS_MULTIVARIATE = np.zeros((100, 2)) + + +class StreamADInspector(InspectorBase): + + def __init__(self, consume_topic, produce_topics, config): + super().__init__(consume_topic, produce_topics, config) + self.ensemble_config = config["ensemble"] + + def subnet_is_suspicious(self) -> bool: + total_anomalies = np.count_nonzero( + np.greater_equal(np.array(self.anomalies), self.score_threshold) + ) + logger.info(f"{self.name}: {total_anomalies} anomalies found") + return True if total_anomalies / len(self.X) > self.anomaly_threshold else False + + def _mean_packet_size(self, messages: list, begin_timestamp, end_timestamp): + """Returns mean of packet size of messages between two timestamps given a time step. + By default, 1 ms time step is applied. Time steps are adjustable by "time_type" and "time_range" + in config.yaml. + + Args: + messages (list): Messages from KafkaConsumeHandler. + begin_timestamp (datetime): Begin timestamp of batch. + end_timestamp (datetime): End timestamp of batch. + + Returns: + numpy.ndarray: 2-D numpy.ndarray including all steps. 
+ """ + logger.debug("Convert timestamps to numpy datetime64") + timestamps = np.array( + [np.datetime64(datetime.fromisoformat(item["ts"])) for item in messages] + ) + + # Extract and convert the size values from "111b" to integers + sizes = np.array([int(str(item["size"]).replace("b", "")) for item in messages]) + + logger.debug("Sort timestamps and count occurrences") + sorted_indices = np.argsort(timestamps) + timestamps = timestamps[sorted_indices] + sizes = sizes[sorted_indices] + + logger.debug("Set min_date and max_date") + min_date = np.datetime64(begin_timestamp) + max_date = np.datetime64(end_timestamp) + + logger.debug( + "Generate the time range from min_date to max_date with 1ms interval" + ) + time_range = np.arange( + min_date, + max_date + np.timedelta64(self.time_range, self.time_type), + np.timedelta64(self.time_range, self.time_type), + ) + + logger.debug( + "Initialize an array to hold counts for each timestamp in the range" + ) + counts = np.zeros(time_range.shape, dtype=np.float64) + size_sums = np.zeros(time_range.shape, dtype=np.float64) + mean_sizes = np.zeros(time_range.shape, dtype=np.float64) + + # Handle empty messages. + if len(messages) > 0: + logger.debug( + "Count occurrences of timestamps and fill the corresponding index in the counts array" + ) + _, unique_indices, unique_counts = np.unique( + timestamps, return_index=True, return_counts=True + ) + + # Sum the sizes at each unique timestamp + for idx, count in zip(unique_indices, unique_counts): + time_index = ( + ((timestamps[idx] - min_date) // self.time_range) + .astype(f"timedelta64[{self.time_type}]") + .astype(int) + ) + size_sums[time_index] = np.sum(sizes[idx : idx + count]) + counts[time_index] = count + + # Calculate the mean packet size for each millisecond (ignore division by zero warnings) + with np.errstate(divide="ignore", invalid="ignore"): + mean_sizes = np.divide( + size_sums, counts, out=np.zeros_like(size_sums), where=counts != 0 + ) + else: + logger.warning("Empty messages to inspect.") + + logger.debug("Reshape into the required shape (n, 1)") + return mean_sizes.reshape(-1, 1) + + def _count_errors(self, messages: list, begin_timestamp, end_timestamp): + """Counts occurances of messages between two timestamps given a time step. + By default, 1 ms time step is applied. Time steps are adjustable by "time_type" and "time_range" + in config.yaml. + + Args: + messages (list): Messages from KafkaConsumeHandler. + begin_timestamp (datetime): Begin timestamp of batch. + end_timestamp (datetime): End timestamp of batch. + + Returns: + numpy.ndarray: 2-D numpy.ndarray including all steps. + """ + logger.debug("Convert timestamps to numpy datetime64") + timestamps = np.array( + [np.datetime64(datetime.fromisoformat(item["ts"])) for item in messages] + ) + + logger.debug("Sort timestamps and count occurrences") + sorted_indices = np.argsort(timestamps) + timestamps = timestamps[sorted_indices] + + logger.debug("Set min_date and max_date") + min_date = np.datetime64(begin_timestamp) + max_date = np.datetime64(end_timestamp) + + logger.debug( + "Generate the time range from min_date to max_date with 1ms interval" + ) + # Adding np.timedelta adds end time to time_range + time_range = np.arange( + min_date, + max_date + np.timedelta64(self.time_range, self.time_type), + np.timedelta64(self.time_range, self.time_type), + ) + + logger.debug( + "Initialize an array to hold counts for each timestamp in the range" + ) + counts = np.zeros(time_range.shape, dtype=np.float64) + + # Handle empty messages. 
+ if len(messages) > 0: + logger.debug( + "Count occurrences of timestamps and fill the corresponding index in the counts array" + ) + unique_times, _, unique_counts = np.unique( + timestamps, return_index=True, return_counts=True + ) + time_indices = ( + ((unique_times - min_date) // self.time_range) + .astype(f"timedelta64[{self.time_type}]") + .astype(int) + ) + counts[time_indices] = unique_counts + else: + logger.warning("Empty messages to inspect.") + + logger.debug("Reshape into the required shape (n, 1)") + return counts.reshape(-1, 1) + + def _get_models(self, models): + if hasattr(self, "models") and self.models != None and self.models != []: + logger.info("All models have been successfully loaded!") + return + + model_list = [] + for model in models: + if self.mode == "univariate" or self.mode == "ensemble": + logger.debug(f"Load Model: {model['model']} from {model['module']}.") + if not model["model"] in VALID_UNIVARIATE_MODELS: + logger.error( + f"Model {models} is not a valid univariate or ensemble model." + ) + raise NotImplementedError( + f"Model {models} is not a valid univariate or ensemble model." + ) + if self.mode == "multivariate": + logger.debug(f"Load Model: {model['model']} from {model['module']}.") + if not model["model"] in VALID_MULTIVARIATE_MODELS: + logger.error(f"Model {model} is not a valid multivariate model.") + raise NotImplementedError( + f"Model {model} is not a valid multivariate model." + ) + + module = importlib.import_module(model["module"]) + module_model = getattr(module, model["model"]) + model_list.append(module_model(**model["model_args"])) + return model_list + + def inspect_anomalies(self): + match self.mode: + case "univariate": + self._inspect_univariate() + case "multivariate": + self._inspect_multivariate() + case "ensemble": + self._get_ensemble() + self._inspect_ensemble() + case _: + logger.warning(f"Mode {self.mode} is not supported!") + raise NotImplementedError(f"Mode {self.mode} is not supported!") + + def _inspect_multivariate(self): + """ + Method to inspect multivariate data for anomalies using a StreamAD Model + Errors are count in the time window and fit model to retrieve scores. + + Args: + model (str): Model name (should be capable of handling multivariate data) + + """ + + logger.debug("Inspecting data...") + + X_1 = self._mean_packet_size( + self.messages, self.begin_timestamp, self.end_timestamp + ) + X_2 = self._count_errors( + self.messages, self.begin_timestamp, self.end_timestamp + ) + + self.X = np.concatenate((X_1, X_2), axis=1) + + # TODO Append zeros to avoid issues when model is reused. + # self.X = np.vstack((STATIC_ZEROS_MULTIVARIATE, X)) + + ds = CustomDS(self.X, self.X) + stream = StreamGenerator(ds.data) + + for x in stream.iter_item(): + score = self.models[0].fit_score(x) + # noqa + if score != None: + self.anomalies.append(score) + else: + self.anomalies.append(0) + + def _inspect_ensemble(self): + """ + Method to inspect data for anomalies using ensembles of two StreamAD models + Errors are count in the time window and fit model to retrieve scores. + """ + self.X = self._count_errors( + self.messages, self.begin_timestamp, self.end_timestamp + ) + + # TODO Append zeros to avoid issues when model is reused. 
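To make the bucketing in `_count_errors` (and, analogously, `_mean_packet_size`) concrete: each batch is turned into one value per time step between the batch's begin and end timestamps. A self-contained toy version with a fixed 1 ms step and invented timestamps, without the config plumbing of the real method:

```python
import numpy as np


def count_per_millisecond(timestamps_iso, begin_iso, end_iso):
    """Toy version of the bucketing above: one bucket per millisecond between
    begin and end; each bucket holds the number of loglines that fall into it."""
    min_date = np.datetime64(begin_iso, "ms")
    max_date = np.datetime64(end_iso, "ms")
    step = np.timedelta64(1, "ms")
    time_range = np.arange(min_date, max_date + step, step)
    counts = np.zeros(time_range.shape, dtype=np.float64)
    if timestamps_iso:
        stamps = np.array(timestamps_iso, dtype="datetime64[ms]")
        unique_times, unique_counts = np.unique(stamps, return_counts=True)
        indices = ((unique_times - min_date) // step).astype(int)
        counts[indices] = unique_counts
    return counts.reshape(-1, 1)


# count_per_millisecond(
#     ["2024-05-01T00:00:00.001", "2024-05-01T00:00:00.001", "2024-05-01T00:00:00.003"],
#     "2024-05-01T00:00:00.000",
#     "2024-05-01T00:00:00.004",
# )
# -> a (5, 1) column vector: [0, 2, 0, 1, 0]
```

The resulting column vector (or the two-column matrix in the multivariate case) is what the StreamAD models consume item by item via `fit_score`.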
+ # self.X = np.vstack((STATIC_ZEROS_UNIVARIATE, X)) + + ds = CustomDS(self.X, self.X) + stream = StreamGenerator(ds.data) + + for x in stream.iter_item(): + scores = [] + # Fit all models in ensemble + for model in self.models: + scores.append(model.fit_score(x)) + # TODO Calibrators are missing + score = self.ensemble.ensemble(scores) + # noqa + if score != None: + self.anomalies.append(score) + else: + self.anomalies.append(0) + + def _inspect_univariate(self): + """Runs anomaly detection on given StreamAD Model on univariate data. + Errors are count in the time window and fit model to retrieve scores. + + Args: + model (str): StreamAD model name. + """ + + logger.debug("Inspecting data...") + + self.X = self._count_errors( + self.messages, self.begin_timestamp, self.end_timestamp + ) + + # TODO Append zeros to avoid issues when model is reused. + # self.X = np.vstack((STATIC_ZEROS_UNIVARIATE, X)) + + ds = CustomDS(self.X, self.X) + stream = StreamGenerator(ds.data) + + for x in stream.iter_item(): + score = self.models[0].fit_score(x) + # noqa + if score is not None: + self.anomalies.append(score) + else: + self.anomalies.append(0) + + def _get_ensemble(self): + logger.debug( + f"Load Model: {self.ensemble_config['model']} from {self.ensemble_config['module']}." + ) + if not self.ensemble_config["model"] in VALID_ENSEMBLE_MODELS: + logger.error(f"Model {self.ensemble_config} is not a valid ensemble model.") + raise NotImplementedError( + f"Model {self.ensemble_config} is not a valid ensemble model." + ) + + if hasattr(self, "ensemble") and self.ensemble != None: + logger.info("Ensemble have been successfully loaded!") + return + + module = importlib.import_module(self.ensemble_config["module"]) + module_model = getattr(module, self.ensemble_config["model"]) + self.ensemble = module_model(**self.ensemble_config["model_args"]) diff --git a/src/logcollector/batch_handler.py b/src/logcollector/batch_handler.py index 542fef20..72fb74b0 100644 --- a/src/logcollector/batch_handler.py +++ b/src/logcollector/batch_handler.py @@ -11,18 +11,18 @@ from src.base.data_classes.batch import Batch from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender from src.base.kafka_handler import ExactlyOnceKafkaProduceHandler -from src.base.utils import setup_config +from src.base.utils import ( + setup_config, + get_batch_configuration, + generate_collisions_resistant_uuid, +) from src.base.log_config import get_logger module_name = "log_collection.batch_handler" logger = get_logger(module_name) config = setup_config() -BATCH_SIZE = config["pipeline"]["log_collection"]["batch_handler"]["batch_size"] -BATCH_TIMEOUT = config["pipeline"]["log_collection"]["batch_handler"]["batch_timeout"] -PRODUCE_TOPIC = config["environment"]["kafka_topics"]["pipeline"][ - "batch_sender_to_prefilter" -] + KAFKA_BROKERS = ",".join( [ f"{broker['hostname']}:{broker['port']}" @@ -40,7 +40,8 @@ class BufferedBatch: chronological processing. Tracks batch metadata including IDs, timestamps, and fill levels for monitoring. 
""" - def __init__(self): + def __init__(self, collector_name): + self.name = f"buffered-batch-for-{collector_name}" self.batch = {} # Batch for the latest messages coming in self.buffer = {} # Former batch with previous messages self.batch_id = {} # Batch ID per key @@ -49,7 +50,7 @@ def __init__(self): self.logline_to_batches = ClickHouseKafkaSender("logline_to_batches") self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") self.fill_levels = ClickHouseKafkaSender("fill_levels") - + self.batch_tree = ClickHouseKafkaSender("batch_tree") self.fill_levels.insert( dict( timestamp=datetime.datetime.now(), @@ -95,6 +96,7 @@ def add_message(self, key: str, logline_id: uuid.UUID, message: str) -> None: dict( batch_id=batch_id, stage=module_name, + instance_name=self.name, status="waiting", timestamp=datetime.datetime.now(), is_active=True, @@ -119,6 +121,7 @@ def add_message(self, key: str, logline_id: uuid.UUID, message: str) -> None: dict( batch_id=new_batch_id, stage=module_name, + instance_name=self.name, status="waiting", timestamp=datetime.datetime.now(), is_active=True, @@ -214,7 +217,8 @@ def complete_batch(self, key: str) -> dict: def _get_first_timestamp_of_batch() -> str | None: entries = self.batch.get(key) return ( - json.loads(entries[0])["timestamp"] + # TODO remove hardcoded ts value + json.loads(entries[0])["ts"] if entries and entries[0] else None ) @@ -230,9 +234,7 @@ def _get_first_timestamp_of_batch() -> str | None: def _get_first_timestamp_of_buffer() -> str | None: entries = self.buffer.get(key) return ( - json.loads(entries[0])["timestamp"] - if entries and entries[0] - else None + json.loads(entries[0])["ts"] if entries and entries[0] else None ) begin_timestamp = _get_first_timestamp_of_buffer() @@ -243,12 +245,13 @@ def _get_last_timestamp_of_batch() -> str | None: entries = self.batch.get(key) return ( - json.loads(entries[-1])["timestamp"] - if entries and entries[-1] - else None + json.loads(entries[-1])["ts"] if entries and entries[-1] else None ) + row_id = generate_collisions_resistant_uuid() + data = { + "batch_tree_row_id": row_id, "batch_id": batch_id, "begin_timestamp": datetime.datetime.fromisoformat(begin_timestamp), "end_timestamp": datetime.datetime.fromisoformat( @@ -257,16 +260,29 @@ def _get_last_timestamp_of_batch() -> str | None: "data": buffer_data + self.batch[key], } + timestamp = datetime.datetime.now() self.batch_timestamps.insert( dict( batch_id=batch_id, stage=module_name, + instance_name=self.name, status="completed", - timestamp=datetime.datetime.now(), + timestamp=timestamp, is_active=True, message_count=self.get_message_count_for_batch_key(key), ) ) + self.batch_tree.insert( + dict( + batch_row_id=row_id, + parent_batch_row_id=None, + stage=module_name, + instance_name=self.name, + timestamp=timestamp, + status="completed", + batch_id=batch_id, + ) + ) # Move data from batch to buffer self.buffer[key] = self.batch[key] @@ -355,8 +371,8 @@ def _extract_tuples_from_json_formatted_strings( for item in data: record = json.loads(item) - - timestamp = record.get("timestamp", "") + # TODO remove hardcoded ts value + timestamp = record.get("ts", "") tuples.append((str(timestamp), item)) return tuples @@ -420,9 +436,10 @@ class BufferedBatchSender: via Kafka and tracks message timestamps for monitoring and debugging purposes. 
""" - def __init__(self): - self.topic = PRODUCE_TOPIC - self.batch = BufferedBatch() + def __init__(self, produce_topics, collector_name): + self.topics = produce_topics + self.batch_configuration = get_batch_configuration(collector_name) + self.batch = BufferedBatch(collector_name) self.timer = None self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler() @@ -473,7 +490,7 @@ def add_message(self, key: str, message: str) -> None: logger.debug(f"Batch: {self.batch.batch}") number_of_messages_for_key = self.batch.get_message_count_for_batch_key(key) - if number_of_messages_for_key >= BATCH_SIZE: + if number_of_messages_for_key >= self.batch_configuration["batch_size"]: self._send_batch_for_key(key) logger.info( f"Full batch: Successfully sent batch messages for subnet_id {key}.\n" @@ -553,11 +570,13 @@ def _send_data_packet(self, key: str, data: dict) -> None: """ batch_schema = marshmallow_dataclass.class_schema(Batch)() - self.kafka_produce_handler.produce( - topic=self.topic, - data=batch_schema.dumps(data), - key=key, - ) + for topic in self.topics: + self.kafka_produce_handler.produce( + topic=topic, + data=batch_schema.dumps(data), + key=key, + ) + logger.info(f"send data to {topic}") def _reset_timer(self) -> None: """Restarts the internal timer for batch timeout handling. @@ -568,5 +587,7 @@ def _reset_timer(self) -> None: if self.timer: self.timer.cancel() - self.timer = Timer(BATCH_TIMEOUT, self._send_all_batches) + self.timer = Timer( + self.batch_configuration["batch_timeout"], self._send_all_batches + ) self.timer.start() diff --git a/src/logcollector/collector.py b/src/logcollector/collector.py index d6cdf1af..e0f59c6f 100644 --- a/src/logcollector/collector.py +++ b/src/logcollector/collector.py @@ -13,28 +13,31 @@ from src.base import utils from src.logcollector.batch_handler import BufferedBatchSender from src.base.log_config import get_logger +from collections import defaultdict module_name = "log_collection.collector" logger = get_logger(module_name) config = utils.setup_config() -IPV4_PREFIX_LENGTH = config["pipeline"]["log_collection"]["batch_handler"]["subnet_id"][ - "ipv4_prefix_length" -] -IPV6_PREFIX_LENGTH = config["pipeline"]["log_collection"]["batch_handler"]["subnet_id"][ - "ipv6_prefix_length" -] + REQUIRED_FIELDS = [ - "timestamp", - "status_code", - "client_ip", - "record_type", + "ts", + "src_ip", ] -BATCH_SIZE = config["pipeline"]["log_collection"]["batch_handler"]["batch_size"] -CONSUME_TOPIC = config["environment"]["kafka_topics"]["pipeline"][ +PRODUCE_TOPIC_PREFIX = config["environment"]["kafka_topics_prefix"]["pipeline"][ + "batch_sender_to_prefilter" +] +CONSUME_TOPIC_PREFIX = config["environment"]["kafka_topics_prefix"]["pipeline"][ "logserver_to_collector" ] +SENSOR_PROTOCOLS = utils.get_zeek_sensor_topic_base_names(config) +PREFILTERS = config["pipeline"]["log_filtering"] + +COLLECTORS = [ + collector for collector in config["pipeline"]["log_collection"]["collectors"] +] + class LogCollector: """Main component of the Log Collection stage to pre-process and format data @@ -43,72 +46,86 @@ class LogCollector: value, invalid loglines are discarded. All valid loglines are sent to the BatchSender. """ - def __init__(self) -> None: + def __init__( + self, collector_name, protocol, consume_topic, produce_topics, validation_config + ) -> None: + """Initializes a new LogCollector instance with the specified configuration. 
+ + Args: + collector_name (str): Name of the collector instance + protocol (str): Protocol type of the log lines (e.g., 'dns', 'http') + consume_topic (str): Kafka topic to consume log lines from + produce_topics (list[str]): List of Kafka topics to produce validated log lines to + validation_config (list): Configuration for validating log line fields + """ + self.protocol = protocol + self.consume_topic = consume_topic + self.batch_configuration = utils.get_batch_configuration(collector_name) self.loglines = asyncio.Queue() - self.batch_handler = BufferedBatchSender() - self.logline_handler = LoglineHandler() - self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(CONSUME_TOPIC) + self.batch_handler = BufferedBatchSender( + produce_topics=produce_topics, collector_name=collector_name + ) + self.logline_handler = LoglineHandler(validation_config) + self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(consume_topic) # databases - self.failed_dns_loglines = ClickHouseKafkaSender("failed_dns_loglines") - self.dns_loglines = ClickHouseKafkaSender("dns_loglines") + self.failed_protocol_loglines = ClickHouseKafkaSender("failed_loglines") + self.protocol_loglines = ClickHouseKafkaSender("loglines") self.logline_timestamps = ClickHouseKafkaSender("logline_timestamps") async def start(self) -> None: - """Starts the task to fetch data from Kafka.""" + """Starts the LogCollector processing loop. + + This method initializes the Kafka message fetching process and runs it in an executor + to avoid blocking the asyncio event loop. It logs the startup information and + continues processing until interrupted. + + """ logger.info( "LogCollector started:\n" - f" ⤷ receiving on Kafka topic '{CONSUME_TOPIC}'" + f" ⤷ receiving on Kafka topic '{self.consume_topic}'" ) + loop = asyncio.get_event_loop() + await loop.run_in_executor(None, self.fetch) - task_fetch = asyncio.Task(self.fetch()) - - try: - await asyncio.gather( - task_fetch, - ) - except KeyboardInterrupt: - task_fetch.cancel() + logger.info("LogCollector stopped.") - logger.info("LogCollector stopped.") + def fetch(self) -> None: + """Continuously listens for messages on the configured Kafka topic. - async def fetch(self) -> None: - """Fetches data from the configured Kafka topic in a loop. + This method runs in an infinite loop, consuming messages from Kafka and + processing them through the send method. It blocks until messages are + available on the Kafka topic. - Starts an asynchronous loop to continuously listen on the configured Kafka topic and fetch new messages. - If a message is consumed, it is decoded and sent. + Note: + This method is intended to be run in a separate thread via run_in_executor + since it contains a blocking loop. """ - loop = asyncio.get_running_loop() while True: - key, value, topic = await loop.run_in_executor( - None, self.kafka_consume_handler.consume - ) + key, value, topic = self.kafka_consume_handler.consume() logger.debug(f"From Kafka: '{value}'") - self.send(datetime.datetime.now(), value) def send(self, timestamp_in: datetime.datetime, message: str) -> None: - """Adds a message to the BatchSender to be stored temporarily. + """Processes and sends a log line to the batch handler after validation. - The message is added in JSON format to the BatchSender, where it is stored in - a temporary batch before being sent to the Prefilter. The subnet ID is added to the message. 
- In the case that a message does not have a valid logline format, it is logged as a failed logline - including timestamps of entering and being detected as invalid. In the case of a valid message, the logline's - fields as well as an "in_process" event are logged using the timestamp of it entering the module. After - processing, a "finished" event is logged for it. + This method: + 1. Validates the log line format and required fields + 2. Stores valid log lines in the database + 3. Calculates the subnet ID for batch processing + 4. Adds the log line to the batch handler Args: - timestamp_in (datetime.datetime): Timestamp of entering the pipeline. - message (str): Message to be stored. + timestamp_in (datetime.datetime): Timestamp when the log line entered the pipeline + message (str): Raw log line message in JSON format """ - try: fields = self.logline_handler.validate_logline_and_get_fields_as_json( message ) except ValueError: - self.failed_dns_loglines.insert( + self.failed_protocol_loglines.insert( dict( message_text=message, timestamp_in=timestamp_in, @@ -117,26 +134,20 @@ def send(self, timestamp_in: datetime.datetime, message: str) -> None: ) ) return - additional_fields = fields.copy() for field in REQUIRED_FIELDS: additional_fields.pop(field) - - subnet_id = self._get_subnet_id(ipaddress.ip_address(fields.get("client_ip"))) + subnet_id = self._get_subnet_id(ipaddress.ip_address(fields.get("src_ip"))) logline_id = uuid.uuid4() - - self.dns_loglines.insert( + self.protocol_loglines.insert( dict( logline_id=logline_id, subnet_id=subnet_id, - timestamp=datetime.datetime.fromisoformat(fields.get("timestamp")), - status_code=fields.get("status_code"), - client_ip=fields.get("client_ip"), - record_type=fields.get("record_type"), + timestamp=datetime.datetime.fromisoformat(fields.get("ts")), + src_ip=fields.get("src_ip"), additional_fields=json.dumps(additional_fields), ) ) - self.logline_timestamps.insert( dict( logline_id=logline_id, @@ -146,7 +157,6 @@ def send(self, timestamp_in: datetime.datetime, message: str) -> None: is_active=True, ) ) - message_fields = fields.copy() message_fields["logline_id"] = str(logline_id) @@ -159,34 +169,35 @@ def send(self, timestamp_in: datetime.datetime, message: str) -> None: is_active=True, ) ) - self.batch_handler.add_message(subnet_id, json.dumps(message_fields)) - logger.debug(f"Sent: '{message}'") - - @staticmethod - def _get_subnet_id(address: ipaddress.IPv4Address | ipaddress.IPv6Address) -> str: - """Returns the subnet ID of an IP address. + logger.debug(f"Sent: {message}") - The subnet ID is formatted as `[NORMALIZED_IP_ADDRESS]_[PREFIX_LENGTH]`. - Depending on the IP address, the configuration value - ``pipeline.log_collection.batch_handler.subnet_id.[ipv4_prefix_length | ipv6_prefix_length]`` - is used as `PREFIX_LENGTH`. + def _get_subnet_id( + self, address: ipaddress.IPv4Address | ipaddress.IPv6Address + ) -> str: + """Calculates the subnet ID for an IP address based on batch configuration. - For example, the IPv4 address `192.168.1.1` with prefix length `24` is formatted to ``192.168.1.0_24``. + This method normalizes the IP address to the configured subnet prefix length + and returns a string representation of the subnet. Args: - address (ipaddress.IPv4Address | ipaddress.IPv6Address): IP address to get the subnet ID for. + address (ipaddress.IPv4Address | ipaddress.IPv6Address): IP address to process Returns: - Subnet ID for the given IP address as string. 
+ str: Subnet ID in the format "network_address/prefix_length" + Example: "192.168.1.0_24" or "2001:db8::/64" + + Raises: + ValueError: If the address is neither IPv4 nor IPv6 address type + """ if isinstance(address, ipaddress.IPv4Address): normalized_ip_address, prefix_length = utils.normalize_ipv4_address( - address, IPV4_PREFIX_LENGTH + address, self.batch_configuration["subnet_id"]["ipv4_prefix_length"] ) elif isinstance(address, ipaddress.IPv6Address): normalized_ip_address, prefix_length = utils.normalize_ipv6_address( - address, IPV6_PREFIX_LENGTH + address, self.batch_configuration["subnet_id"]["ipv6_prefix_length"] ) else: raise ValueError("Unsupported IP address type") @@ -194,11 +205,37 @@ def _get_subnet_id(address: ipaddress.IPv4Address | ipaddress.IPv6Address) -> st return f"{normalized_ip_address}_{prefix_length}" -def main() -> None: - """Creates the :class:`LogCollector` instance and starts it.""" - collector_instance = LogCollector() - asyncio.run(collector_instance.start()) +async def main() -> None: + """Creates and starts all configured LogCollector instances. + + This function: + 1. Iterates through all collectors defined in the configuration + 2. Creates a LogCollector instance for each collector + 3. Starts each collector in its own asyncio task + 4. Waits for all collectors to complete (which is effectively forever) + + """ + tasks = [] + + for collector in COLLECTORS: + protocol = collector["protocol_base"] + consume_topic = f"{CONSUME_TOPIC_PREFIX}-{collector['name']}" + produce_topics = [ + f"{PRODUCE_TOPIC_PREFIX}-{prefilter['name']}" + for prefilter in PREFILTERS + if collector["name"] == prefilter["collector_name"] + ] + validation_config = collector["required_log_information"] + collector_instance = LogCollector( + collector_name=collector["name"], + protocol=protocol, + consume_topic=consume_topic, + produce_topics=produce_topics, + validation_config=validation_config, + ) + tasks.append(asyncio.create_task(collector_instance.start())) + await asyncio.gather(*tasks) if __name__ == "__main__": # pragma: no cover - main() + asyncio.run(main()) diff --git a/src/logserver/server.py b/src/logserver/server.py index ec43da88..49c07ca2 100644 --- a/src/logserver/server.py +++ b/src/logserver/server.py @@ -8,21 +8,26 @@ sys.path.append(os.getcwd()) from src.base.kafka_handler import ( - SimpleKafkaConsumeHandler, + ExactlyOnceKafkaConsumeHandler, ExactlyOnceKafkaProduceHandler, ) from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender -from src.base.utils import setup_config +from src.base.utils import setup_config, get_zeek_sensor_topic_base_names from src.base.log_config import get_logger module_name = "log_storage.logserver" logger = get_logger(module_name) config = setup_config() -CONSUME_TOPIC = config["environment"]["kafka_topics"]["pipeline"]["logserver_in"] -PRODUCE_TOPIC = config["environment"]["kafka_topics"]["pipeline"][ +CONSUME_TOPIC_PREFIX = config["environment"]["kafka_topics_prefix"]["pipeline"][ + "logserver_in" +] +PRODUCE_TOPIC_PREFIX = config["environment"]["kafka_topics_prefix"]["pipeline"][ "logserver_to_collector" ] + +SENSOR_PROTOCOLS = get_zeek_sensor_topic_base_names(config) + READ_FROM_FILE = config["pipeline"]["log_storage"]["logserver"]["input_file"] KAFKA_BROKERS = ",".join( [ @@ -30,6 +35,9 @@ for broker in config["environment"]["kafka_brokers"] ] ) +COLLECTORS = [ + collector for collector in config["pipeline"]["log_collection"]["collectors"] +] class LogServer: @@ -40,8 +48,12 @@ class LogServer: the next stage. 
""" - def __init__(self) -> None: - self.kafka_consume_handler = SimpleKafkaConsumeHandler(CONSUME_TOPIC) + def __init__(self, consume_topic, produce_topics) -> None: + + self.consume_topic = consume_topic + self.produce_topics = produce_topics + + self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(consume_topic) self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler() # databases @@ -52,25 +64,14 @@ async def start(self) -> None: """Starts the tasks to both fetch messages from Kafka and read them from the input file.""" logger.info( "LogServer started:\n" - f" ⤷ receiving on Kafka topic '{CONSUME_TOPIC}'\n" - f" ⤷ receiving from input file '{READ_FROM_FILE}'\n" - f" ⤷ sending on Kafka topic '{PRODUCE_TOPIC}'" + f" ⤷ receiving on Kafka topic '{self.consume_topic}'\n" + f" ⤷ sending on Kafka topics '{self.produce_topics}'" ) - task_fetch_kafka = asyncio.Task(self.fetch_from_kafka()) - task_fetch_file = asyncio.Task(self.fetch_from_file()) - - try: - task = asyncio.gather( - task_fetch_kafka, - task_fetch_file, - ) - await task - except KeyboardInterrupt: - task_fetch_kafka.cancel() - task_fetch_file.cancel() - - logger.info("LogServer stopped.") + loop = asyncio.get_running_loop() + await loop.run_in_executor(None, self.fetch_from_kafka) + # if awaited completely then the while True has come to an end + logger.info("LogServer stopped.") def send(self, message_id: uuid.UUID, message: str) -> None: """Sends a message using Kafka. @@ -81,8 +82,9 @@ def send(self, message_id: uuid.UUID, message: str) -> None: message_id (uuid.UUID): UUID of the message to be sent. message (str): Message to be sent. """ - self.kafka_produce_handler.produce(topic=PRODUCE_TOPIC, data=message) - logger.debug(f"Sent: '{message}'") + for topic in self.produce_topics: + self.kafka_produce_handler.produce(topic=topic, data=message) + logger.debug(f"Sent: '{message}' to topic {topic}") self.server_logs_timestamps.insert( dict( @@ -99,12 +101,8 @@ async def fetch_from_kafka(self) -> None: When a message is consumed, the unprocessed log line string including its timestamp ("timestamp_in") is logged. """ - loop = asyncio.get_running_loop() - while True: - key, value, topic = await loop.run_in_executor( - None, self.kafka_consume_handler.consume - ) + key, value, topic = self.kafka_consume_handler.consume() logger.debug(f"From Kafka: '{value}'") message_id = uuid.uuid4() @@ -118,52 +116,26 @@ async def fetch_from_kafka(self) -> None: self.send(message_id, value) - async def fetch_from_file(self, file: str = READ_FROM_FILE) -> None: - """Starts a loop to continuously check for new lines at the end of the input file and sends them. - - Checks are done every 0.1 seconds. If one or multiple new lines are found, any empty lines are removed - and the remaining lines are sent individually. For each fetched log line, the unprocessed log line string - including its timestamp ("timestamp_in") is logged. - - Args: - file (str): Filename of the file to be read. 
- Default: File configured in `config.yaml` (``pipeline.log_storage.logserver.input_file``) - """ - async with aiofiles.open(file, mode="r") as file: - await file.seek(0, 2) # jump to end of file - - while True: - lines = await file.readlines() - - if not lines: - await asyncio.sleep(0.1) - continue - - for line in lines: - cleaned_line = line.strip() # remove empty lines - - if not cleaned_line: - continue - - logger.debug(f"From file: '{cleaned_line}'") - - message_id = uuid.uuid4() - self.server_logs.insert( - dict( - message_id=message_id, - timestamp_in=datetime.datetime.now(), - message_text=cleaned_line, - ) - ) - - self.send(message_id, cleaned_line) +async def main() -> None: + """ + Creates the :class:`LogServer` instance and starts it for every topic used by any of the Zeek-sensors. + """ + tasks = [] + for protocol in SENSOR_PROTOCOLS: + consume_topic = f"{CONSUME_TOPIC_PREFIX}-{protocol}" + produce_topics = [ + f'{PRODUCE_TOPIC_PREFIX}-{collector["name"]}' + for collector in COLLECTORS + if collector["protocol_base"] == protocol + ] + server_instance = LogServer( + consume_topic=consume_topic, produce_topics=produce_topics + ) + tasks.append(asyncio.create_task(server_instance.start())) -def main() -> None: - """Creates the :class:`LogServer` instance and starts it.""" - server_instance = LogServer() - asyncio.run(server_instance.start()) + await asyncio.gather(*tasks) if __name__ == "__main__": # pragma: no cover - main() + asyncio.run(main()) diff --git a/src/mock/log_generator.py b/src/mock/log_generator.py index 1b7d8211..1b2cbef4 100644 --- a/src/mock/log_generator.py +++ b/src/mock/log_generator.py @@ -24,13 +24,13 @@ def generate_dns_log_line(domain: str): datetime.datetime.now() + datetime.timedelta(0, 0, random.randint(0, 900)) ).strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + "Z" status = random.choice(STATUSES) - client_ip = f"192.168.{random.randint(0, 3)}.{random.randint(1, 255)}" + src_ip = f"192.168.{random.randint(0, 3)}.{random.randint(1, 255)}" server_ip = f"10.10.0.{random.randint(1, 10)}" record_type = random.choice(RECORD_TYPES) response = IP[random.randint(0, 1)]() size = f"{random.randint(50, 255)}b" - return f"{timestamp} {status} {client_ip} {server_ip} {domain} {record_type} {response} {size}" + return f"{timestamp} {status} {src_ip} {server_ip} {domain} {record_type} {response} {size}" if __name__ == "__main__": diff --git a/src/monitoring/clickhouse_batch_sender.py b/src/monitoring/clickhouse_batch_sender.py index fa29a3f2..81d36adc 100644 --- a/src/monitoring/clickhouse_batch_sender.py +++ b/src/monitoring/clickhouse_batch_sender.py @@ -98,8 +98,8 @@ def __init__(self): "event_timestamp": datetime.datetime, }, ), - "failed_dns_loglines": Table( - "failed_dns_loglines", + "failed_loglines": Table( + "failed_loglines", { "message_text": str, "timestamp_in": datetime.datetime, @@ -114,15 +114,13 @@ def __init__(self): "batch_id": uuid.UUID, }, ), - "dns_loglines": Table( - "dns_loglines", + "loglines": Table( + "loglines", { "logline_id": uuid.UUID, "subnet_id": str, "timestamp": datetime.datetime, - "status_code": str, - "client_ip": str, - "record_type": str, + "src_ip": str, "additional_fields": Optional[str], }, ), @@ -140,6 +138,7 @@ def __init__(self): "batch_timestamps", { "batch_id": uuid.UUID, + "instance_name": str, "stage": str, "status": str, "timestamp": datetime.datetime, @@ -158,7 +157,8 @@ def __init__(self): "suspicious_batch_timestamps", { "suspicious_batch_id": uuid.UUID, - "client_ip": str, + "src_ip": str, + "instance_name": str, "stage": 
str, "status": str, "timestamp": datetime.datetime, @@ -169,7 +169,7 @@ def __init__(self): "alerts": Table( "alerts", { - "client_ip": str, + "src_ip": str, "suspicious_batch_id": uuid.UUID, "alert_timestamp": datetime.datetime, "overall_score": float, @@ -186,6 +186,18 @@ def __init__(self): "entry_count": int, }, ), + "batch_tree": Table( + "batch_tree", + { + "batch_row_id": str, + "batch_id": uuid.UUID, + "parent_batch_row_id": Optional[str], + "instance_name": str, + "stage": str, + "status": str, + "timestamp": datetime.datetime, + }, + ), } self.max_batch_size = BATCH_SIZE diff --git a/src/monitoring/monitoring_agent.py b/src/monitoring/monitoring_agent.py index 61c147ca..81154a17 100644 --- a/src/monitoring/monitoring_agent.py +++ b/src/monitoring/monitoring_agent.py @@ -64,15 +64,16 @@ def __init__(self): self.table_names = [ "server_logs", "server_logs_timestamps", - "failed_dns_loglines", + "failed_loglines", "logline_to_batches", - "dns_loglines", + "loglines", "logline_timestamps", "batch_timestamps", "suspicious_batches_to_batch", "suspicious_batch_timestamps", "alerts", "fill_levels", + "batch_tree", ] self.topics = [f"clickhouse_{table_name}" for table_name in self.table_names] diff --git a/src/prefilter/prefilter.py b/src/prefilter/prefilter.py index b05f63c7..8c6bb4b4 100644 --- a/src/prefilter/prefilter.py +++ b/src/prefilter/prefilter.py @@ -2,31 +2,44 @@ import os import sys import uuid - +import asyncio import marshmallow_dataclass +from collections import defaultdict sys.path.append(os.getcwd()) from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender from src.base.data_classes.batch import Batch from src.base.logline_handler import LoglineHandler from src.base.kafka_handler import ( - ExactlyOnceKafkaConsumeHandler, ExactlyOnceKafkaProduceHandler, + ExactlyOnceKafkaConsumeHandler, KafkaMessageFetchException, ) from src.base.log_config import get_logger -from src.base.utils import setup_config +from src.base.utils import ( + setup_config, + get_zeek_sensor_topic_base_names, + generate_collisions_resistant_uuid, +) module_name = "log_filtering.prefilter" logger = get_logger(module_name) config = setup_config() -CONSUME_TOPIC = config["environment"]["kafka_topics"]["pipeline"][ + +CONSUME_TOPIC_PREFIX = config["environment"]["kafka_topics_prefix"]["pipeline"][ "batch_sender_to_prefilter" ] -PRODUCE_TOPIC = config["environment"]["kafka_topics"]["pipeline"][ +PRODUCE_TOPIC_PREFIX = config["environment"]["kafka_topics_prefix"]["pipeline"][ "prefilter_to_inspector" ] + +SENSOR_PROTOCOLS = get_zeek_sensor_topic_base_names(config) +PREFILTERS = config["pipeline"]["log_filtering"] +INSPECTORS = config["pipeline"]["data_inspection"] +COLLECTORS = [ + collector for collector in config["pipeline"]["log_collection"]["collectors"] +] KAFKA_BROKERS = ",".join( [ f"{broker['hostname']}:{broker['port']}" @@ -43,20 +56,37 @@ class Prefilter: data to the next pipeline stage for anomaly detection. """ - def __init__(self): + def __init__( + self, validation_config, consume_topic, produce_topics, relevance_function_name + ): + """Initializes a new ``Prefilter`` instance with the specified configuration. 
+ + Args: + validation_config (list): Configuration for validating log line fields + consume_topic (str): Kafka topic to consume data from + produce_topics (list[str]): Kafka topics to produce filtered data to + relevance_function_name (str): Name of the relevance function to apply + + """ + self.name = None + self.consume_topic = consume_topic + self.produce_topics = produce_topics self.batch_id = None self.begin_timestamp = None self.end_timestamp = None self.subnet_id = None + self.parent_row_id = None + self.relevance_function_name = relevance_function_name self.unfiltered_data = [] self.filtered_data = [] - self.logline_handler = LoglineHandler() - self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(CONSUME_TOPIC) + self.logline_handler = LoglineHandler(validation_config) + self.kafka_consume_handler = ExactlyOnceKafkaConsumeHandler(self.consume_topic) self.kafka_produce_handler = ExactlyOnceKafkaProduceHandler() # databases + self.batch_tree = ClickHouseKafkaSender("batch_tree") self.batch_timestamps = ClickHouseKafkaSender("batch_timestamps") self.logline_timestamps = ClickHouseKafkaSender("logline_timestamps") self.fill_levels = ClickHouseKafkaSender("fill_levels") @@ -71,27 +101,34 @@ def __init__(self): ) def get_and_fill_data(self) -> None: - """Retrieves and processes a new batch from the configured Kafka topic. - - Clears any previously stored data and consumes a new batch message. Unpacks the batch - data including metadata (batch_id, timestamps, subnet_id) and stores it internally. - Logs batch reception information and updates monitoring metrics for tracking purposes. + """Retrieves and processes new data from Kafka. + + This method: + 1. Clears any existing data + 2. Consumes a new batch of data from Kafka + 3. Extracts batch metadata (ID, timestamps, subnet ID) + 4. Stores the unfiltered data internally + 5. Records processing timestamps and metrics + + Note: + This method blocks until data is available on the Kafka topic. + Empty batches are handled gracefully but logged for monitoring. """ - self.clear_data() # clear in case we already have data stored + self.clear_data() # clear in case we already have data stored key, data = self.kafka_consume_handler.consume_as_object() - self.subnet_id = key if data.data: + self.parent_row_id = data.batch_tree_row_id self.batch_id = data.batch_id self.begin_timestamp = data.begin_timestamp self.end_timestamp = data.end_timestamp self.unfiltered_data = data.data - self.batch_timestamps.insert( dict( batch_id=self.batch_id, stage=module_name, + instance_name=self.name, status="in_process", timestamp=datetime.datetime.now(), is_active=True, @@ -99,6 +136,20 @@ def get_and_fill_data(self) -> None: ) ) + row_id = generate_collisions_resistant_uuid() + + self.batch_tree.insert( + dict( + batch_row_id=row_id, + stage=module_name, + instance_name=self.name, + status="in_process", + timestamp=datetime.datetime.now(), + parent_batch_row_id=self.parent_row_id, + batch_id=self.batch_id, + ) + ) + self.fill_levels.insert( dict( timestamp=datetime.datetime.now(), @@ -115,24 +166,33 @@ def get_and_fill_data(self) -> None: ) else: logger.info( - f"Received message:\n" + f"{self.consume_topic} Received message:\n" f" ⤷ Contains data field of {len(self.unfiltered_data)} message(s) with " f"subnet_id: '{self.subnet_id}'." ) - def filter_by_error(self) -> None: - """Applies relevance-based filtering to the unfiltered batch data. + def check_data_relevance_using_rules(self) -> None: + """Applies relevance filtering to the unfiltered data. 
+ + This method: + 1. Iterates through each log line in unfiltered_data + 2. Applies the configured relevance function to determine if the log line is relevant + 3. Adds relevant log lines to filtered_data + 4. Records non-relevant log lines as filtered out in the database + + Note: + The specific relevance function used is determined by the relevance_function_name + parameter provided during initialization. - Iterates through all loglines in the unfiltered data and applies the relevance check - using the :class:`LoglineHandler`. Relevant loglines are added to the filtered data, while - irrelevant ones are discarded and marked as "filtered_out" in the monitoring system. - Updates fill level metrics to track filtering progress. """ - for e in self.unfiltered_data: - if self.logline_handler.check_relevance(e): - self.filtered_data.append(e) + + for logline in self.unfiltered_data: + if self.logline_handler.check_relevance( + logline_dict=logline, function_name=self.relevance_function_name + ): + self.filtered_data.append(logline) else: # not relevant, filtered out - logline_id = uuid.UUID(e.get("logline_id")) + logline_id = uuid.UUID(logline.get("logline_id")) self.logline_timestamps.insert( dict( @@ -153,30 +213,37 @@ def filter_by_error(self) -> None: ) ) - def send_filtered_data(self) -> None: - """Sends the filtered batch data to the next pipeline stage via Kafka. + def send_filtered_data(self): + """Sends the filtered data to the configured Kafka topics. - Creates a properly formatted batch message with metadata and sends it to the - configured output topic. Updates batch processing status and resets fill level - metrics. Logs detailed statistics about the filtering results. + This method: + 1. Verifies there is filtered data to send + 2. Prepares the data in the required batch format + 3. Records completion timestamps in the database + 4. Sends the data to all configured produce topics Raises: - ValueError: If no filtered data is available to send. + ValueError: If there is no filtered data to send """ + + row_id = generate_collisions_resistant_uuid() + if not self.filtered_data: raise ValueError("Failed to send data: No filtered data.") - data_to_send = { + "batch_tree_row_id": row_id, "batch_id": self.batch_id, "begin_timestamp": self.begin_timestamp, "end_timestamp": self.end_timestamp, "data": self.filtered_data, } + # important to finish before sending, otherwise inspector can process before finished here! self.batch_timestamps.insert( dict( batch_id=self.batch_id, stage=module_name, + instance_name=self.name, status="finished", timestamp=datetime.datetime.now(), is_active=True, @@ -184,6 +251,18 @@ def send_filtered_data(self) -> None: ) ) + self.batch_tree.insert( + dict( + batch_row_id=row_id, + stage=module_name, + instance_name=self.name, + status="finished", + timestamp=datetime.datetime.now(), + parent_batch_row_id=self.parent_row_id, + batch_id=self.batch_id, + ) + ) + self.fill_levels.insert( dict( timestamp=datetime.datetime.now(), @@ -194,11 +273,14 @@ def send_filtered_data(self) -> None: ) batch_schema = marshmallow_dataclass.class_schema(Batch)() - self.kafka_produce_handler.produce( - topic=PRODUCE_TOPIC, - data=batch_schema.dumps(data_to_send), - key=self.subnet_id, - ) + for topic in self.produce_topics: + + self.kafka_produce_handler.produce( + topic=topic, + data=batch_schema.dumps(data_to_send), + key=self.subnet_id, + ) + logger.info( f"Filtered data was successfully sent:\n" f" ⤷ Contains data field of {len(self.filtered_data)} message(s). 
Originally: " @@ -214,44 +296,83 @@ def clear_data(self) -> None: self.unfiltered_data = [] self.filtered_data = [] + def bootstrap_prefiltering_process(self): + """Runs the main prefiltering process loop. + + This method implements an infinite loop that: + 1. Fetches new data from Kafka + 2. Filters the data for relevance + 3. Sends the filtered data to inspectors + + """ + logger.info(f"I am {self.consume_topic}") + counter = 0 + while True: + self.get_and_fill_data() + self.check_data_relevance_using_rules() + self.send_filtered_data() + counter += 1 -def main(one_iteration: bool = False) -> None: - """Creates the :class:`Prefilter` instance and runs the main processing loop. + async def start(self): # pragma: no cover + """Starts the ``Prefilter`` processing loop. - Continuously processes batches by retrieving data, applying filters, and sending - filtered results. The loop handles various exceptions gracefully and supports - clean shutdown via KeyboardInterrupt. + This method: + 1. Logs startup information + 2. Runs the main processing loop in a separate thread + 3. Handles graceful shutdown on interruption + + Args: + one_iteration (bool): If True, processes only one batch (for testing). Default: False + + """ + loop = asyncio.get_running_loop() + logger.info( + "Prefilter started:\n" + f" ⤷ receiving on Kafka topic '{self.consume_topic}'" + ) + await loop.run_in_executor(None, self.bootstrap_prefiltering_process) + logger.info("Closing down Prefilter...") + self.clear_data() + + +async def main() -> None: + """Creates and starts all configured Prefilter instances. + + This function: + 1. Iterates through all prefilter configurations defined in config.yaml + 2. For each prefilter: + - Determines the relevance function to use + - Sets up the validation configuration based on the config.yaml + - Determines the topics to consume from and produce to + - Creates an according ``Prefilter`` instance + - Runs the ``start`` method - Args: - one_iteration (bool): If True, only processes one batch and exits. - Used primarily for testing purposes. 
Default: False """ - prefilter = Prefilter() - - iterations = 0 - while True: - if one_iteration and iterations > 0: - break - iterations += 1 - - try: - prefilter.get_and_fill_data() - prefilter.filter_by_error() - prefilter.send_filtered_data() - except IOError as e: - logger.error(e) - raise - except ValueError as e: - logger.debug(e) - except KafkaMessageFetchException as e: - logger.debug(e) - continue - except KeyboardInterrupt: - logger.info("Closing down Prefilter...") - break - finally: - prefilter.clear_data() + tasks = [] + for prefilter in PREFILTERS: + relevance_function_name = prefilter["relevance_method"] + validation_config = [ + item + for collector in COLLECTORS + if collector["name"] == prefilter["collector_name"] + for item in collector["required_log_information"] + ] + consume_topic = f"{CONSUME_TOPIC_PREFIX}-{prefilter['name']}" + produce_topics = [ + f"{PRODUCE_TOPIC_PREFIX}-{inspector['name']}" + for inspector in INSPECTORS + if prefilter["name"] == inspector["prefilter_name"] + ] + prefilter_instance = Prefilter( + validation_config=validation_config, + consume_topic=consume_topic, + produce_topics=produce_topics, + relevance_function_name=relevance_function_name, + ) + prefilter_instance.name = prefilter["name"] + tasks.append(asyncio.create_task(prefilter_instance.start())) + await asyncio.gather(*tasks) if __name__ == "__main__": # pragma: no cover - main() + asyncio.run(main()) diff --git a/src/train/dataset.py b/src/train/dataset.py index 6325f51b..247b05c0 100644 --- a/src/train/dataset.py +++ b/src/train/dataset.py @@ -274,7 +274,7 @@ def cast_heicloud(data_path: str, max_rows: int) -> pl.DataFrame: { "column_1": "timestamp", "column_2": "return_code", - "column_3": "client_ip", + "column_3": "src_ip", "column_4": "dns_server", "column_5": "query", "column_6": "type", diff --git a/src/zeek/__init__.py b/src/zeek/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/zeek/additional_configs/dns_config.zeek b/src/zeek/additional_configs/dns_config.zeek new file mode 100644 index 00000000..a66253ab --- /dev/null +++ b/src/zeek/additional_configs/dns_config.zeek @@ -0,0 +1,93 @@ +@load base/protocols/dns +module CustomDNS; +export { + redef enum Log::ID += { LOG }; + type Info: record { + ts: string &log; + uid: string &log; + src_ip: addr &log; + src_port: port &log; + dns_server_ip: addr &log; + dns_server_port: port &log; + domain_name: string &log &optional; + record_type: string &log &optional; + response_ip: vector of string &log &optional; + ttls: vector of interval &log &optional; + rejected: bool &log &default=F; + status_code_id: count &log &optional; + status_code: string &log &optional; + + }; + global log_dns: event(rec: Info); + global dns_payload_sizes: table[string] of count + &default=0 + &write_expire = 5min; +} + + + +event zeek_init() &priority=5 +{ + Log::create_stream(CustomDNS::LOG, [$columns=Info, $path="custom_dns"]); +} + + +redef record CustomDNS::Info += { + size: count &log &optional; +}; + +event dns_message(c: connection, is_query: bool, msg: dns_msg, len: count) +{ + dns_payload_sizes[c$uid] = len; +} + +event DNS::log_dns(rec: DNS::Info) +{ + local dnsLog: Info = [ + $ts = strftime("%Y-%m-%dT%H:%M:%S", rec$ts), + $uid = rec$uid, + $src_ip = rec$id$orig_h, + $src_port = rec$id$orig_p, + $dns_server_ip = rec$id$resp_h, + $dns_server_port = rec$id$resp_p, + $rejected = rec$rejected + ]; + + ##### add custom log messages if a given field that needs to be present is not present in the logline #### + # use this only 
for fields that are absolutely necessary + + # Keep this deactivated for now, as we want to use zeek at first to not prefilter anything + +# if ( ! rec?$query ) +# print fmt("Info: missing domain in DNS log %s, skipping the log...", rec); +# if ( ! rec?$conn ) +# print fmt("Info:could not determine request length for line %s, skipping the log...", rec); +# + ########################################################################################################### + + if ( rec?$query ) + dnsLog$domain_name = rec$query; + + if ( rec?$qtype_name ) + dnsLog$record_type = rec$qtype_name; + + if ( rec?$answers ) + dnsLog$response_ip = rec$answers; + + if ( rec?$TTLs ) + dnsLog$ttls = rec$TTLs; + + if ( rec?$rcode ) + dnsLog$status_code_id = rec$rcode; + + if ( rec?$rcode_name ) + dnsLog$status_code = rec$rcode_name; + + if ( rec$uid in dns_payload_sizes ) + { + dnsLog$size = dns_payload_sizes[rec$uid]; + delete dns_payload_sizes[rec$uid]; + } + + Log::write(CustomDNS::LOG, dnsLog); +} diff --git a/src/zeek/additional_configs/http_config.zeek b/src/zeek/additional_configs/http_config.zeek new file mode 100644 index 00000000..51c6a04b --- /dev/null +++ b/src/zeek/additional_configs/http_config.zeek @@ -0,0 +1,70 @@ +@load base/protocols/http +module CustomHTTP; +export { + redef enum Log::ID += { LOG }; + type Info: record { + ts: string &log; + uid: string &log; + src_ip: addr &log; + src_port: port &log; + dst_ip: addr &log; + dst_port: port &log; + method: string &log; + host: string &log &optional; + uri: string &log; + referrer: string &log &optional; + version: string &log &optional; + user_agent: string &log &optional; + request_body_len: count &log; + response_body_len: count &log; + status_code: count &log &optional; + status_msg: string &log &optional; + }; + global log_http: event(rec: Info); +} +event zeek_init() &priority=5 +{ + Log::create_stream(CustomHTTP::LOG, [$columns=Info, $path="custom_http"]); +} + + + +event HTTP::log_http(rec: HTTP::Info) +{ + local mylog: Info = [ + $ts = strftime("%Y-%m-%dT%H:%M:%S", rec$ts), + $uid = rec$uid, + $src_ip = rec$id$orig_h, + $src_port = rec$id$orig_p, + $dst_ip = rec$id$resp_h, + $dst_port = rec$id$resp_p, + $method = rec$method, + $uri = rec$uri, + $request_body_len = rec$request_body_len, + $response_body_len = rec$response_body_len + ]; + + + ##### add custom log messages if a given field that needs to be present is not present in the logline #### + + if ( ! rec?$host ) + print fmt("Info: missing host name in HTTP log %s, skipping the log...", rec); + if ( ! 
rec?$status_code ) + print fmt("Info: missing status code in HTTP log %s, skipping the log...", rec); + ########################################################################################################### + + if ( rec?$host ) + mylog$host = rec$host; + if ( rec?$version ) + mylog$version = rec$version; + if ( rec?$referrer ) + mylog$referrer = rec$referrer; + if ( rec?$user_agent ) + mylog$user_agent = rec$user_agent; + if ( rec?$status_code ) + mylog$status_code = rec$status_code; + if ( rec?$status_msg ) + mylog$status_msg = rec$status_msg; + + Log::write(CustomHTTP::LOG, mylog); +} diff --git a/src/zeek/base_node.cfg b/src/zeek/base_node.cfg new file mode 100644 index 00000000..f1f50d25 --- /dev/null +++ b/src/zeek/base_node.cfg @@ -0,0 +1,11 @@ +[logger] +type=logger +host=localhost + +[manager] +type=manager +host=localhost + +[proxy] +type=proxy +host=localhost diff --git a/src/zeek/zeek_analysis_handler.py b/src/zeek/zeek_analysis_handler.py new file mode 100644 index 00000000..2e1199b7 --- /dev/null +++ b/src/zeek/zeek_analysis_handler.py @@ -0,0 +1,125 @@ +import sys +import os +import threading +import subprocess +import glob + +sys.path.append(os.getcwd()) +from src.base.log_config import get_logger + +logger = get_logger("zeek.sensor") + + +class ZeekAnalysisHandler: + """ + Handles the execution of Zeek analysis in either static or network analysis mode. + + This class manages the Zeek processing workflow, supporting both static analysis of + PCAP files and live network traffic analysis. It provides the necessary infrastructure + for launching Zeek processes, managing their execution, and handling their output. + + """ + + def __init__(self, zeek_config_location: str, zeek_log_location: str): + """ + Initialize the Zeek analysis handler with configuration and log locations. + + Args: + zeek_config_location: Path to the Zeek configuration file that defines + the analysis scripts and plugins to be loaded + zeek_log_location: Path where Zeek will write its processing logs + + Note: + The configuration file location typically points to local.zeek or + another site-specific configuration file that incorporates the necessary + analysis scripts and Kafka plugin configuration. + """ + self.zeek_log_location = zeek_log_location + self.zeek_config_location = zeek_config_location + + def start_analysis(self, static_analysis: bool): + """ + Start Zeek analysis in the specified mode. + + This method serves as the main entry point for initiating Zeek processing, + delegating to the appropriate analysis method based on the mode parameter. + + Args: + static_analysis: If True, process stored PCAP files; if False, analyze + live network traffic + """ + if static_analysis: + logger.info("static analysis mode selected") + self.start_static_analysis() + else: + logger.info("network analysis mode selected") + self.start_network_analysis() + + def start_static_analysis(self): + """ + Start an analysis by reading in PCAP files + + This method: + 1. Locates all PCAP files in the directory specified by STATIC_FILES_DIR + 2. Creates a separate Zeek process for each PCAP file + 3. Runs these processes in parallel using threads + 4. Waits for all processes to complete before returning + + The Zeek processes use the configured analysis scripts to process the PCAP + files and output the results to the configured destinations (typically Kafka + via the Zeek Kafka plugin). 
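+
+        Example (illustrative only; the actual PCAP paths come from the
+        ``STATIC_FILES_DIR`` environment variable and the configured Zeek
+        config location passed at initialization):
+
+            zeek -C -r /data/pcaps/capture1.pcap /usr/local/zeek/share/zeek/site/local.zeek
+
+        One such command is launched per PCAP file, each in its own thread.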
+ """ + self.static_files_dir = os.getenv("STATIC_FILES_DIR") + files = glob.glob(f"{self.static_files_dir}/*.pcap") + threads = [] + for file in files: + logger.info(f"Starting Analysis for file {file}...") + command = ["zeek", "-C", "-r", file, self.zeek_config_location] + thread = threading.Thread(target=subprocess.run, args=(command,)) + thread.start() + threads.append(thread) + + for thread in threads: + thread.join() + logger.info("Finished static analyses") + + def start_network_analysis(self): + """ + Start Zeek in live network analysis mode. + + This method: + 1. Deploys the Zeek configuration using zeekctl + 2. Starts monitoring Zeek's log output in real-time + 3. Streams the processed data to the configured output destinations + + The method creates a dedicated thread to monitor Zeek's log output to prevent + buffer overflow issues that would occur if the output was processed in the + main thread. This ensures continuous processing of network traffic without + data loss. + """ + start_zeek = ["zeekctl", "deploy"] + thread = threading.Thread(target=subprocess.run, args=(start_zeek,)) + thread.start() + thread.join() + + process = subprocess.Popen( + ["tail", "-f", "/dev/null"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + + def read_output(): # pragma: no cover + for line in iter(process.stdout.readline, ""): + if line: + print(f"[ZEEK LOG] {line}", end="") + process.stdout.close() + + logger.info("network analysis started") + # Start background thread to read stdout line by line + # necesseray because otherwise subprocess stdout will run into buffer errors eventually + reader_thread = threading.Thread(target=read_output, daemon=True) + reader_thread.start() + logger.info("network analysis ongoing") + reader_thread.join() + logger.info("network analysis stopped") diff --git a/src/zeek/zeek_config_handler.py b/src/zeek/zeek_config_handler.py new file mode 100644 index 00000000..9cbc4f4c --- /dev/null +++ b/src/zeek/zeek_config_handler.py @@ -0,0 +1,271 @@ +import sys +import os +import shutil + +sys.path.append(os.getcwd()) +from src.base.log_config import get_logger +import glob + +logger = get_logger("zeek.sensor") + + +class ZeekConfigurationHandler: + """ + Handles the configuration of Zeek sensors based on the pipeline configuration. + + This class is responsible for setting up Zeek to process network traffic according + to the specified configuration. It configures the Zeek Kafka plugin, sets up worker + nodes for network interfaces, and integrates additional custom configurations. + + The handler supports both static analysis (processing PCAP files) and network + analysis (live traffic monitoring) modes, with configuration adapted to the + specific sensor requirements defined in the pipeline configuration. + + Example: + >>> config = { + ... "environment": { + ... "kafka_brokers": [{"hostname": "kafka1", "port": 9092, "node_ip": "192.168.1.100"}], + ... "kafka_topics_prefix": {"pipeline": {"logserver_in": "pipeline-logserver_in"}} + ... }, + ... "pipeline": { + ... "zeek": { + ... "sensors": { + ... "zeek1": { + ... "static_analysis": True, + ... "protocols": ["http", "dns"], + ... "interfaces": ["eth0"] + ... } + ... } + ... } + ... } + ... 
} + >>> os.environ["CONTAINER_NAME"] = "zeek1" + >>> handler = ZeekConfigurationHandler(config) + >>> handler.configure() + """ + + def __init__( + self, + configuration_dict: dict, + zeek_config_location: str = "/usr/local/zeek/share/zeek/site/local.zeek", + zeek_node_config_template: str = "/opt/src/zeek/base_node.cfg", + zeek_log_location: str = "/usr/local/zeek/log/zeek.log", + additional_configurations: str = "/opt/src/zeek/additional_configs/", + ): + """ + Initialize the Zeek configuration handler with the pipeline configuration. + + Args: + configuration_dict: The complete pipeline configuration dictionary + loaded from config.yaml, containing sensor, Kafka, and environment settings + zeek_config_location: Path to the main Zeek configuration file where + plugin configurations will be appended (default: standard Zeek location) + zeek_node_config_template: Path to the template for node.cfg configuration + (default: internal template in the project) + zeek_log_location: Path where Zeek will write its log files + (default: standard Zeek log location) + additional_configurations: Directory containing additional Zeek configuration + files that should be appended to the main configuration + """ + logger.info(f"Setting up Zeek configuration...") + self.base_config_location = zeek_config_location + self.additional_configurations = additional_configurations + self.zeek_node_config_template = zeek_node_config_template + self.zeek_node_config_path: str = "/usr/local/zeek/etc/node.cfg" + self.zeek_log_location = zeek_log_location + + self.container_name = os.getenv("CONTAINER_NAME", None) + if self.container_name is None: + logger.error( + "CONTAINER_NAME ENV variable could not be found. Aborting configuration..." + ) + raise Exception("CONTAINER_NAME env. variable not found.") + + configured_kafka_brokers = configuration_dict["environment"]["kafka_brokers"] + # configured_kafka_topic = configuration_dict["environment"]["kafka_topics"]["pipeline"]["zeek_to_logserver"] + zeek_sensor_configuration = configuration_dict["pipeline"]["zeek"]["sensors"][ + self.container_name + ] + + if ( + "static_analysis" in zeek_sensor_configuration.keys() + and zeek_sensor_configuration["static_analysis"] + ): + self.is_analysis_static = True + else: + self.is_analysis_static = False + try: + self.network_interfaces = zeek_sensor_configuration["interfaces"] + except Exception as e: + logger.error(e) + logger.error( + "Could not parse configuration for zeek sensor, as the 'interfaces' parameter is not specified" + ) + + self.kafka_topic_prefix = configuration_dict["environment"][ + "kafka_topics_prefix" + ]["pipeline"]["logserver_in"] + + self.configured_protocols = [ + protocol for protocol in zeek_sensor_configuration["protocols"] + ] + self.kafka_brokers = [ + f"{broker['node_ip']}:{broker['port']}" + for broker in configured_kafka_brokers + ] + logger.info(f"Succesfully parse config.yaml") + + def configure(self): + """ + Execute the complete Zeek configuration process. + + This method orchestrates the entire configuration workflow: + 1. For network analysis mode: Sets up node configuration for network interfaces + 2. Appends any additional custom configurations + 3. Creates and writes the Kafka plugin configuration + + The method adapts the configuration based on whether the sensor is in + static analysis mode (processing PCAP files) or network analysis mode + (monitoring live traffic). + + Note: + This is the main entry point for configuring Zeek. 
After calling this + method, Zeek should be fully configured and ready to process traffic + according to the pipeline specifications. + """ + logger.info(f"configuring Zeek...") + if not self.is_analysis_static: + self.template_and_copy_node_config() + self.append_additional_configurations() + self.create_plugin_configuration() + + def append_additional_configurations(self): + """ + Append custom configuration files to the main Zeek configuration. + + This method: + 1. Finds all *.zeek files in the additional configurations directory + 2. Appends their contents to the main Zeek configuration file + + Custom configuration files can be used to extend Zeek's functionality + with custom scripts, event handlers, or protocol analyzers without + modifying the core configuration. + + Example: + If additional_configurations="/opt/src/zeek/additional_configs/" + contains a file custom_http.zeek with content: + @load base/protocols/http/main.zeek + redef HTTP::default_accept_gzip = T; + + This content will be appended to the main Zeek configuration file. + + Note: + The method adds a newline before appending each file to ensure + proper separation between configuration sections. + """ + config_files = find_files_in_dir(self.additional_configurations) + with open(self.base_config_location, "a") as base_config: + base_config.write("\n") + for file in config_files: + with open(file) as additional_config: + base_config.writelines(additional_config) + + def create_plugin_configuration(self): + """ + Generate and write the Kafka plugin configuration for Zeek. + + This method: + 1. Creates the core Kafka plugin configuration + 2. Sets up topic mappings for each configured protocol + 3. Writes the complete configuration to the main Zeek configuration file + + The configuration directs Zeek to send processed log data to Kafka topics + following the naming convention: {kafka_topic_prefix}-{protocol} + + """ + config_lines = [ + "@load packages/zeek-kafka\n", + 'redef Kafka::topic_name = "";\n', + f"redef Kafka::kafka_conf = table(\n" + f' ["metadata.broker.list"] = "{",".join(self.kafka_brokers)}");\n', + "redef Kafka::tag_json = F;\n", + "event zeek_init() &priority=-10\n", + "{\n", + ] + for protocol in self.configured_protocols: + topic_name = f"{self.kafka_topic_prefix}-{protocol.lower()}" + zeek_protocol_log_format = f"Custom{protocol.upper()}" + kafka_writer_name = f"{protocol.lower()}_filter" + filter_block = f""" + local {kafka_writer_name}: Log::Filter = [ + $name = "kafka-{kafka_writer_name}", + $writer = Log::WRITER_KAFKAWRITER, + $path = "{topic_name}" + ]; + Log::add_filter({zeek_protocol_log_format}::LOG, {kafka_writer_name});\n + """ + config_lines.append(filter_block) + config_lines.append("\n}") + + with open(self.base_config_location, "a") as f: + f.writelines(config_lines) + logger.info("Wrote kafka zeek plugin configuration to file") + + def create_worker_configurations_for_interfaces(self): + """ + Generate configuration lines for Zeek worker nodes. + + This method creates the configuration blocks needed for Zeek's cluster mode, + where each network interface gets its own worker node. + + Returns: + List[str]: Configuration lines that should be appended to node.cfg + + Example: + For network_interfaces=["eth0", "dummy"], returns: + [ + "[zeek-eth0]\n", + "type=worker\n", + "host=localhost\n", + "[zeek-dummy]\n", + "type=worker\n", + "host=localhost\n" + ] + + Note: + This method is only called when in network analysis mode (not static analysis). 
+ Each worker is configured to run on the local host and process traffic + from a specific network interface. + """ + worker_configuration_lines = [] + for network_interface in self.network_interfaces: + worker_configuration_lines.extend( + [f"[zeek-{network_interface}]\n", "type=worker\n", "host=localhost\n"] + ) + return worker_configuration_lines + + def template_and_copy_node_config(self): + """ + Set up the node configuration for Zeek cluster mode. + + This method: + 1. Copies the node configuration template to Zeek's expected location + 2. Appends worker configurations for each network interface + + The node configuration (node.cfg) defines how Zeek should distribute + processing across multiple worker processes, which is necessary for + monitoring multiple network interfaces simultaneously. + + Note: + This method is only called when in network analysis mode. Static + analysis mode does not require worker configuration as it processes + PCAP files sequentially. + """ + shutil.copy2(self.zeek_node_config_template, self.zeek_node_config_path) + configuration_lines = self.create_worker_configurations_for_interfaces() + with open(self.zeek_node_config_path, "a") as f: + f.writelines(configuration_lines) + + +def find_files_in_dir(path): # pragma: no cover + return glob.glob(os.path.join(path, "*.zeek")) diff --git a/src/zeek/zeek_handler.py b/src/zeek/zeek_handler.py new file mode 100644 index 00000000..a11c7ce0 --- /dev/null +++ b/src/zeek/zeek_handler.py @@ -0,0 +1,97 @@ +import click +import sys +import yaml +import shutil +import os +from src.zeek.zeek_analysis_handler import ZeekAnalysisHandler +from src.zeek.zeek_config_handler import ZeekConfigurationHandler + +sys.path.append(os.getcwd()) +from src.base.log_config import get_logger + +logger = get_logger("zeek.sensor") + + +@click.command() +@click.option( + "-c", + "--config", + "configuration_file_path", + required=True, + type=click.File(mode="r"), + help="Path to the configuration file location", +) +@click.option( + "--zeek-config-location", + "zeek_config_location", + help=( + "Overrides the default configuration location of Zeek under /usr/local/zeek/share/zeek/site/local.zeek" + ), +) +def setup_zeek(configuration_file_path, zeek_config_location): + """ + Configure and start Zeek analysis based on pipeline configuration. + + This is the main entry point for the Zeek configuration and analysis process. + It handles the complete workflow from configuration setup to analysis execution. + + The function: + 1. Manages Zeek configuration backups to ensure clean setup between runs + 2. Parses the pipeline configuration file + 3. Configures Zeek using the specified or default configuration location + 4. Starts analysis in the appropriate mode (static or network) + + Args: + configuration_file_path: File object pointing to the pipeline configuration + YAML file that defines sensor settings, Kafka brokers, and other parameters + zeek_config_location: Optional path to override the default Zeek configuration + location. If not provided, uses /usr/local/zeek/share/zeek/site/local.zeek + + Workflow: + 1. On first run: Backs up the default Zeek configuration + 2. On subsequent runs: Restores the backed-up configuration to ensure a clean state + 3. Parses the YAML configuration file + 4. Configures Zeek using ZeekConfigurationHandler + 5. 
Starts analysis using ZeekAnalysisHandler in the mode specified by the config + + Raises: + yaml.YAMLError: If the configuration file is not valid YAML + Exception: If required environment variables (like CONTAINER_NAME) are missing + """ + default_zeek_config_location = "/usr/local/zeek/share/zeek/site/local.zeek" + default_zeek_config_backup_location = "/opt/local.zeek_backup" + initial_zeek_setup: bool = ( + False if os.path.isfile(default_zeek_config_backup_location) else True + ) + logger.info(f"initial setup: {initial_zeek_setup}") + if initial_zeek_setup: + logger.info("Backup default config") + shutil.copy2(default_zeek_config_location, default_zeek_config_backup_location) + else: + logger.info("Restore default config") + shutil.copy2(default_zeek_config_backup_location, default_zeek_config_location) + + configuration_file_content = configuration_file_path.read() + try: + data = yaml.safe_load(configuration_file_content) + except yaml.YAMLError as e: + logger.error("Error parsing the config file. Is this proper yaml?") + raise (e) + + if zeek_config_location is None: + zeek_config_location = default_zeek_config_location + zeekConfigHandler = ZeekConfigurationHandler(data, default_zeek_config_location) + else: + zeekConfigHandler = ZeekConfigurationHandler(data, zeek_config_location) + + zeekConfigHandler.configure() + logger.info("configured zeek") + zeekAnalysisHandler = ZeekAnalysisHandler( + zeek_config_location, zeekConfigHandler.zeek_log_location + ) + logger.info("starting analysis...") + zeekAnalysisHandler.start_analysis(zeekConfigHandler.is_analysis_static) + + +if __name__ == "__main__": # pragma: no cover + setup_zeek() diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/detector/test_detector.py b/tests/detector/test_detector.py index c240ccb7..5ce6aad8 100644 --- a/tests/detector/test_detector.py +++ b/tests/detector/test_detector.py @@ -8,10 +8,42 @@ from requests import HTTPError from src.base.data_classes.batch import Batch -from src.detector.detector import Detector, WrongChecksum +from src.detector.detector import DetectorBase, WrongChecksum +from src.base.kafka_handler import KafkaMessageFetchException + +MINIMAL_DETECTOR_CONFIG = { + "name": "test-detector", + "detector_module_name": "test_detector", + "detector_class_name": "TestDetector", + "model": "rf", + "checksum": "ba1f718179191348fe2abd51644d76191d42a5d967c6844feb3371b6f798bf06", + "base_url": "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/", + "threshold": "0.005", + "inspector_name": "no_inspector", +} + + +class TestDetector(DetectorBase): + """ + Testclass that does not take any action to not dialute the tests + """ + + def __init__(self, detector_config, consume_topic) -> None: + self.model_base_url = detector_config["base_url"] + super().__init__(consume_topic=consume_topic, detector_config=detector_config) + + def get_model_download_url(self): + return f"{self.model_base_url}/files/?p=%2F{self.model}_{self.checksum}.pickle&dl=1" + + def get_scaler_download_url(self): + return f"{self.model_base_url}/files/?p=%2F{self.model}_{self.checksum}_scaler.pickle&dl=1" + + def predict(self, message): + pass + DEFAULT_DATA = { - "client_ip": "192.168.0.167", + "src_ip": "192.168.0.167", "dns_ip": "10.10.0.10", "response_ip": "252.79.173.222", "timestamp": "", @@ -25,139 +57,84 @@ class TestSha256Sum(unittest.TestCase): @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") @patch("src.detector.detector.ClickHouseKafkaSender") - 
def test_sha256_empty_file(self, mock_clickhouse, mock_kafka_consume_handler): + @patch("src.detector.detector.DetectorBase._get_model") + def test_sha256_empty_file( + self, mock_get_model, mock_clickhouse, mock_kafka_consume_handler + ): + mock_get_model.return_value = (MagicMock(), MagicMock()) mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - sut = Detector() - + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) with self.assertRaises(FileNotFoundError): sut._sha256sum("") @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") @patch("src.detector.detector.ClickHouseKafkaSender") + @patch("src.detector.detector.DetectorBase._get_model") def test_sha256_not_existing_file( - self, mock_clickhouse, mock_kafka_consume_handler + self, mock_get_model, mock_clickhouse, mock_kafka_consume_handler ): + mock_get_model.return_value = (MagicMock(), MagicMock()) mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - sut = Detector() - + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) with self.assertRaises(FileNotFoundError): sut._sha256sum("not_existing") -class TestFeatures(unittest.TestCase): +class TestGetModel(unittest.TestCase): def setUp(self): patcher = patch("src.detector.detector.logger") self.mock_logger = patcher.start() self.addCleanup(patcher.stop) - @patch( - "src.detector.detector.CHECKSUM", - "021af76b2385ddbc76f6e3ad10feb0bb081f9cf05cff2e52333e31040bbf36cc", - ) - @patch("src.detector.detector.MODEL", "rf") - @patch( - "src.detector.detector.MODEL_BASE_URL", - "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/", - ) @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") @patch("src.detector.detector.ClickHouseKafkaSender") def test_get_model(self, mock_clickhouse, mock_kafka_consume_handler): mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - sut = Detector() - sut._get_features("google.de") - - -class TestGetModel(unittest.TestCase): - def setUp(self): - patcher = patch("src.detector.detector.logger") - self.mock_logger = patcher.start() - self.addCleanup(patcher.stop) + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) - @patch( - "src.detector.detector.CHECKSUM", - "021af76b2385ddbc76f6e3ad10feb0bb081f9cf05cff2e52333e31040bbf36cc", - ) - @patch("src.detector.detector.MODEL", "rf") - @patch( - "src.detector.detector.MODEL_BASE_URL", - "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/", - ) @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") @patch("src.detector.detector.ClickHouseKafkaSender") - def test_get_model(self, mock_clickhouse, mock_kafka_consume_handler): - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - - sut = Detector() - - @patch( - "src.detector.detector.CHECKSUM", - "WRONG", - ) - @patch("src.detector.detector.MODEL", "rf") - @patch( - "src.detector.detector.MODEL_BASE_URL", - "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/", - ) - @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") - def test_get_model_not_existing(self, mock_kafka_consume_handler): + def test_get_model_wrong_checksum( + self, mock_clickhouse, mock_kafka_consume_handler + ): 
mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - + detector_config = MINIMAL_DETECTOR_CONFIG.copy() + detector_config["checksum"] = "INVALID" with self.assertRaises(WrongChecksum): - sut = Detector() - - @patch( - "src.detector.detector.CHECKSUM", - "04970cd6fe0be5369248d24541c7b8faf69718706019f80280a0a687884f35fb", - ) - @patch("src.detector.detector.MODEL", "WRONG") - @patch( - "src.detector.detector.MODEL_BASE_URL", - "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/", - ) - @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") - def test_get_model_not_existing(self, mock_kafka_consume_handler): - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - - with self.assertRaises(WrongChecksum): - sut = Detector() - - @patch( - "src.detector.detector.CHECKSUM", - "Test", - ) - @patch("src.detector.detector.MODEL", "xg") - @patch( - "src.detector.detector.MODEL_BASE_URL", - "https://heibox.uni-heidelberg.de/d/WRONG/", - ) - @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") - def test_get_model_not_existing(self, mock_kafka_consume_handler): - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - - with self.assertRaises(HTTPError): - sut = Detector() + sut = TestDetector( + consume_topic="test_topic", detector_config=detector_config + ) class TestInit(unittest.TestCase): - @patch("src.detector.detector.CONSUME_TOPIC", "test_topic") + # @patch("src.detector.detector.CONSUME_TOPIC", "test_topic") @patch("src.detector.detector.logger") @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") @patch("src.detector.detector.ClickHouseKafkaSender") - def test_init(self, mock_clickhouse, mock_kafka_consume_handler, mock_logger): + @patch("src.detector.detector.DetectorBase._get_model") + def test_init( + self, mock_get_model, mock_clickhouse, mock_kafka_consume_handler, mock_logger + ): + mock_get_model.return_value = (MagicMock(), MagicMock()) mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - sut = Detector() + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) self.assertEqual([], sut.messages) self.assertEqual(mock_kafka_consume_handler_instance, sut.kafka_consume_handler) @@ -168,10 +145,13 @@ class TestGetData(unittest.TestCase): @patch("src.detector.detector.logger") @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") @patch("src.detector.detector.ClickHouseKafkaSender") + @patch("src.detector.detector.DetectorBase._get_model") def test_get_data_without_return_data( - self, mock_clickhouse, mock_kafka_consume_handler, mock_logger + self, mock_get_model, mock_clickhouse, mock_kafka_consume_handler, mock_logger ): + mock_get_model.return_value = (MagicMock(), MagicMock()) test_batch = Batch( + batch_tree_row_id=f"{uuid.uuid4()}-{uuid.uuid4()}", batch_id=uuid.uuid4(), begin_timestamp=datetime.now(), end_timestamp=datetime.now() + timedelta(0, 3), @@ -185,7 +165,10 @@ def test_get_data_without_return_data( test_batch, ) - sut = Detector() + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) + sut.parent_row_id = f"{uuid.uuid4()}-{uuid.uuid4()}" sut.get_and_fill_data() self.assertEqual([], sut.messages) @@ -193,12 +176,15 @@ def 
test_get_data_without_return_data( @patch("src.detector.detector.logger") @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") @patch("src.detector.detector.ClickHouseKafkaSender") + @patch("src.detector.detector.DetectorBase._get_model") def test_get_data_with_return_data( - self, mock_clickhouse, mock_kafka_consume_handler, mock_logger + self, mock_get_model, mock_clickhouse, mock_kafka_consume_handler, mock_logger ): + mock_get_model.return_value = (MagicMock(), MagicMock()) begin = datetime.now() end = begin + timedelta(0, 3) test_batch = Batch( + batch_tree_row_id=f"{uuid.uuid4()}-{uuid.uuid4()}", batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, @@ -212,7 +198,9 @@ def test_get_data_with_return_data( test_batch, ) - sut = Detector() + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) sut.messages = [] sut.get_and_fill_data() @@ -223,12 +211,15 @@ def test_get_data_with_return_data( @patch("src.detector.detector.logger") @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") @patch("src.detector.detector.ClickHouseKafkaSender") + @patch("src.detector.detector.DetectorBase._get_model") def test_get_data_while_busy( - self, mock_clickhouse, mock_kafka_consume_handler, mock_logger + self, mock_get_model, mock_clickhouse, mock_kafka_consume_handler, mock_logger ): + mock_get_model.return_value = (MagicMock(), MagicMock()) begin = datetime.now() end = begin + timedelta(0, 3) test_batch = Batch( + batch_tree_row_id=f"{uuid.uuid4()}-{uuid.uuid4()}", batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, @@ -242,7 +233,9 @@ def test_get_data_while_busy( test_batch, ) - sut = Detector() + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) sut.messages = [{"test": "test_message_2"}] sut.get_and_fill_data() @@ -255,22 +248,19 @@ def setUp(self): self.mock_logger = patcher.start() self.addCleanup(patcher.stop) - @patch( - "src.detector.detector.CHECKSUM", - "021af76b2385ddbc76f6e3ad10feb0bb081f9cf05cff2e52333e31040bbf36cc", - ) - @patch("src.detector.detector.MODEL", "rf") - @patch( - "src.detector.detector.MODEL_BASE_URL", - "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/", - ) @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") @patch("src.detector.detector.ClickHouseKafkaSender") - def test_save_warning(self, mock_clickhouse, mock_kafka_consume_handler): + @patch("src.detector.detector.DetectorBase._get_model") + def test_save_warning( + self, mock_get_model, mock_clickhouse, mock_kafka_consume_handler + ): + mock_get_model.return_value = (MagicMock(), MagicMock()) mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - sut = Detector() + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) sut.warnings = [ { "request": "google.de", @@ -285,6 +275,7 @@ def test_save_warning(self, mock_clickhouse, mock_kafka_consume_handler): "sha256": "021af76b2385ddbc76f6e3ad10feb0bb081f9cf05cff2e52333e31040bbf36cc", }, ] + sut.parent_row_id = f"{uuid.uuid4()}-{uuid.uuid4()}" sut.messages = [{"logline_id": "test_id"}] open_mock = mock_open() with patch("src.detector.detector.open", open_mock, create=True): @@ -294,42 +285,20 @@ def test_save_warning(self, mock_clickhouse, mock_kafka_consume_handler): os.path.join(tempfile.gettempdir(), "warnings.json"), "a+" ) - @patch( - "src.detector.detector.CHECKSUM", - 
"021af76b2385ddbc76f6e3ad10feb0bb081f9cf05cff2e52333e31040bbf36cc", - ) - @patch("src.detector.detector.MODEL", "rf") - @patch( - "src.detector.detector.MODEL_BASE_URL", - "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/", - ) - @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") - @patch("src.detector.detector.ClickHouseKafkaSender") - def test_prediction(self, mock_clickhouse, mock_kafka_consume_handler): - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - - sut = Detector() - sut.messages = [DEFAULT_DATA] - sut.detect() - self.assertNotEqual([], sut.warnings) - - @patch( - "src.detector.detector.CHECKSUM", - "021af76b2385ddbc76f6e3ad10feb0bb081f9cf05cff2e52333e31040bbf36cc", - ) - @patch("src.detector.detector.MODEL", "rf") - @patch( - "src.detector.detector.MODEL_BASE_URL", - "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/", - ) @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") @patch("src.detector.detector.ClickHouseKafkaSender") - def test_save_empty_warning(self, mock_clickhouse, mock_kafka_consume_handler): + @patch("src.detector.detector.DetectorBase._get_model") + def test_save_empty_warning( + self, mock_get_model, mock_clickhouse, mock_kafka_consume_handler + ): + mock_get_model.return_value = (MagicMock(), MagicMock()) mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - sut = Detector() + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) + sut.parent_row_id = f"{uuid.uuid4()}-{uuid.uuid4()}" sut.warnings = [] sut.messages = [{"logline_id": "test_id"}] open_mock = mock_open() @@ -338,22 +307,28 @@ def test_save_empty_warning(self, mock_clickhouse, mock_kafka_consume_handler): open_mock.assert_not_called() - @patch( - "src.detector.detector.CHECKSUM", - "021af76b2385ddbc76f6e3ad10feb0bb081f9cf05cff2e52333e31040bbf36cc", - ) - @patch("src.detector.detector.MODEL", "rf") - @patch( - "src.detector.detector.MODEL_BASE_URL", - "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/", - ) + # @patch( + # "src.detector.detector.CHECKSUM", + # "ba1f718179191348fe2abd51644d76191d42a5d967c6844feb3371b6f798bf06", + # ) + # @patch("src.detector.detector.MODEL", "rf") + # @patch( + # "src.detector.detector.MODEL_BASE_URL", + # "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/", + # ) @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") @patch("src.detector.detector.ClickHouseKafkaSender") - def test_save_warning_error(self, mock_clickhouse, mock_kafka_consume_handler): + @patch("src.detector.detector.DetectorBase._get_model") + def test_save_warning_error( + self, mock_get_model, mock_clickhouse, mock_kafka_consume_handler + ): + mock_get_model.return_value = (MagicMock(), MagicMock()) mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - sut = Detector() + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) sut.warnings = [ { "request": "request.de", @@ -375,13 +350,19 @@ def setUp(self): @patch("src.detector.detector.logger") @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") @patch("src.detector.detector.ClickHouseKafkaSender") + @patch("src.detector.detector.DetectorBase._get_model") def test_clear_data_without_existing_data( - self, mock_clickhouse, mock_kafka_consume_handler, mock_logger + self, 
mock_get_model, mock_clickhouse, mock_kafka_consume_handler, mock_logger ): + mock_get_model.return_value = (MagicMock(), MagicMock()) begin = datetime.now() end = begin + timedelta(0, 3) test_batch = Batch( - batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, data=[] + batch_tree_row_id=f"{uuid.uuid4()}-{uuid.uuid4()}", + batch_id=uuid.uuid4(), + begin_timestamp=begin, + end_timestamp=end, + data=[], ) mock_kafka_consume_handler_instance = MagicMock() @@ -391,7 +372,10 @@ def test_clear_data_without_existing_data( test_batch, ) - sut = Detector() + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) + sut.messages = [] sut.clear_data() @@ -400,13 +384,19 @@ def test_clear_data_without_existing_data( @patch("src.detector.detector.logger") @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") @patch("src.detector.detector.ClickHouseKafkaSender") + @patch("src.detector.detector.DetectorBase._get_model") def test_clear_data_with_existing_data( - self, mock_clickhouse, mock_kafka_consume_handler, mock_logger + self, mock_get_model, mock_clickhouse, mock_kafka_consume_handler, mock_logger ): + mock_get_model.return_value = (MagicMock(), MagicMock()) begin = datetime.now() end = begin + timedelta(0, 3) test_batch = Batch( - batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, data=[] + batch_tree_row_id=f"{uuid.uuid4()}-{uuid.uuid4()}", + batch_id=uuid.uuid4(), + begin_timestamp=begin, + end_timestamp=end, + data=[], ) mock_kafka_consume_handler_instance = MagicMock() @@ -416,7 +406,9 @@ def test_clear_data_with_existing_data( test_batch, ) - sut = Detector() + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) sut.messages = ["test_data"] sut.begin_timestamp = datetime.now() sut.end_timestamp = sut.begin_timestamp + timedelta(0, 3) @@ -427,5 +419,228 @@ def test_clear_data_with_existing_data( self.assertIsNone(sut.end_timestamp) -if __name__ == "__main__": - unittest.main() +class TestGetModelMethod(unittest.TestCase): + def setUp(self): + patcher = patch("src.detector.detector.logger") + self.mock_logger = patcher.start() + self.addCleanup(patcher.stop) + + @patch("src.detector.detector.requests.get") + @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") + @patch("src.detector.detector.ClickHouseKafkaSender") + def test_get_model_downloads_and_validates( + self, mock_clickhouse, mock_kafka_consume_handler, mock_requests_get + ): + """Test that model is downloaded when not present and checksum is validated.""" + # Setup mock response with valid model content + mock_response = MagicMock() + mock_response.content = b"mock model content" + mock_requests_get.return_value = mock_response + + # Create test detector instance + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) + + # Mock file operations + with patch("src.detector.detector.os.path.isfile", return_value=False), patch( + "src.detector.detector.open", mock_open() + ), patch( + "src.detector.detector.pickle.load", return_value="mock_model_or_scaler" + ), patch.object( + sut, "_sha256sum", return_value=MINIMAL_DETECTOR_CONFIG["checksum"] + ): + + model = sut._get_model() + + # Verify download was attempted + mock_requests_get.assert_called() + # Verify model was loaded + self.assertEqual(model, ("mock_model_or_scaler", 
"mock_model_or_scaler")) + # Verify logger messages + self.mock_logger.info.assert_any_call( + f"Get model: {sut.model} with checksum {sut.checksum}" + ) + self.mock_logger.info.assert_any_call( + f"downloading model {sut.model} from {sut.get_model_download_url()} with checksum {sut.checksum}" + ) + + @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") + @patch("src.detector.detector.ClickHouseKafkaSender") + def test_get_model_uses_existing_file( + self, mock_clickhouse, mock_kafka_consume_handler + ): + """Test that existing model file is used without re-downloading.""" + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) + + # Mock file operations + with patch("src.detector.detector.os.path.isfile", return_value=True), patch( + "src.detector.detector.open", mock_open() + ), patch( + "src.detector.detector.pickle.load", return_value="mock_model_or_scaler" + ), patch.object( + sut, "_sha256sum", return_value=MINIMAL_DETECTOR_CONFIG["checksum"] + ), patch( + "src.detector.detector.requests.get" + ) as mock_requests_get: + + model_and_scaler = sut._get_model() + + # Verify no download was attempted + mock_requests_get.assert_not_called() + # Verify model was loaded + self.assertEqual( + model_and_scaler, ("mock_model_or_scaler", "mock_model_or_scaler") + ) + # Verify logger messages + self.mock_logger.info.assert_any_call( + f"Get model: {sut.model} with checksum {sut.checksum}" + ) + + @patch("src.detector.detector.requests.get") + @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") + @patch("src.detector.detector.ClickHouseKafkaSender") + def test_get_model_raises_wrong_checksum( + self, mock_clickhouse, mock_kafka_consume_handler, mock_requests_get + ): + """Test that WrongChecksum exception is raised when checksums don't match.""" + # Setup mock response + mock_response = MagicMock() + mock_response.content = b"mock model content" + mock_requests_get.return_value = mock_response + + # Create test detector instance + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) + + # Mock file operations with wrong checksum + with patch("src.detector.detector.os.path.isfile", return_value=False), patch( + "src.detector.detector.open", mock_open() + ), patch.object(sut, "_sha256sum", return_value="wrong_checksum_value"): + + with self.assertRaises(WrongChecksum) as context: + sut._get_model() + + # Verify exception message + self.assertIn("Checksum", str(context.exception)) + self.assertIn("is not equal with new checksum", str(context.exception)) + # Verify logger warning + self.mock_logger.warning.assert_called_once() + + @patch("src.detector.detector.requests.get") + @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") + @patch("src.detector.detector.ClickHouseKafkaSender") + def test_get_model_handles_http_error( + self, mock_clickhouse, mock_kafka_consume_handler, mock_requests_get + ): + """Test that HTTP errors during download are properly propagated.""" + # Setup mock to raise HTTP error + mock_requests_get.side_effect = HTTPError("Download failed") + + # Create test detector instance + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance 
+ + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) + + # Mock file operations + with patch("src.detector.detector.os.path.isfile", return_value=False), patch( + "src.detector.detector.open", mock_open() + ): + + with self.assertRaises(HTTPError): + sut._get_model() + + # Verify logger info was called + self.mock_logger.info.assert_any_call( + f"Get model: {sut.model} with checksum {sut.checksum}" + ) + + +class TestBootstrapDetectorInstance(unittest.TestCase): + def setUp(self): + patcher = patch("src.detector.detector.logger") + self.mock_logger = patcher.start() + self.addCleanup(patcher.stop) + + @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") + @patch("src.detector.detector.ClickHouseKafkaSender") + @patch("src.detector.detector.DetectorBase._get_model") + def test_bootstrap_normal_execution( + self, mock_get_model, mock_clickhouse, mock_kafka_consume_handler + ): + """Test normal execution flow of bootstrap_detector_instance.""" + mock_get_model.return_value = (MagicMock(), MagicMock()) + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) + sut.kafka_consume_handler = mock_kafka_consume_handler_instance + + # Mock the methods called in the loop + with patch.object(sut, "get_and_fill_data") as mock_get_data, patch.object( + sut, "detect" + ) as mock_detect, patch.object( + sut, "send_warning", side_effect=KeyboardInterrupt + ) as mock_send_warning, patch.object( + sut, "clear_data" + ) as mock_clear_data: + + try: + sut.bootstrap_detector_instance() + except KeyboardInterrupt: + pass + + # Verify method call sequence + mock_get_data.assert_called_once() + mock_detect.assert_called_once() + mock_send_warning.assert_called_once() + mock_clear_data.assert_called_once() + + @patch("src.detector.detector.ExactlyOnceKafkaConsumeHandler") + @patch("src.detector.detector.ClickHouseKafkaSender") + @patch("src.detector.detector.DetectorBase._get_model") + def test_bootstrap_graceful_shutdown( + self, mock_get_model, mock_clickhouse, mock_kafka_consume_handler + ): + """Test graceful shutdown on KeyboardInterrupt in bootstrap_detector_instance.""" + mock_get_model.return_value = (MagicMock(), MagicMock()) + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + + sut = TestDetector( + consume_topic="test_topic", detector_config=MINIMAL_DETECTOR_CONFIG + ) + + # Mock methods and raise KeyboardInterrupt + with patch.object(sut, "get_and_fill_data"), patch.object( + sut, "detect" + ), patch.object( + sut, "send_warning", side_effect=KeyboardInterrupt + ), patch.object( + sut, "clear_data" + ) as mock_clear_data: + + # Should not raise exception as it's handled internally + sut.bootstrap_detector_instance() + + # Verify shutdown message + self.mock_logger.info.assert_called_with("Closing down Detector...") + # Verify clear_data was called + mock_clear_data.assert_called_once() diff --git a/tests/detector/test_dga_detector.py b/tests/detector/test_dga_detector.py new file mode 100644 index 00000000..85a4ecb3 --- /dev/null +++ b/tests/detector/test_dga_detector.py @@ -0,0 +1,220 @@ +import math +import numpy as np +import unittest +from unittest.mock import MagicMock, patch, call + +from src.detector.plugins.dga_detector import DGADetector +from src.base.data_classes.batch import Batch + + 
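+
+# Feature-vector layout assumed by the tests below (inferred from the assertions
+# they make, not from separate documentation): index 0 = number of labels,
+# index 1 = longest label length, index 2 = "label_average" (equal to the total
+# domain length in these tests, e.g. 10 for "google.com"), indices 3-28 =
+# per-letter frequencies for a-z, entropy at index -3, overall shape (1, 44).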
+DEFAULT_DATA = { + "src_ip": "192.168.0.167", + "dns_ip": "10.10.0.10", + "response_ip": "252.79.173.222", + "timestamp": "", + "status": "NXDOMAIN", + "domain_name": "IF356gEnJHPdRxnkDId4RDUSgtqxx9I+pZ5n1V53MdghOGQncZWAQgAPRx3kswi.750jnH6iSqmiAAeyDUMX0W6SHGpVsVsKSX8ZkKYDs0GFh/9qU5N9cwl00XSD8ID.NNhBdHZIb7nc0hDQXFPlABDLbRwkJS38LZ8RMX4yUmR2Mb6YqTTJBn+nUcB9P+v.jBQdwdS53XV9W2p1BHjh.16.f.1.6037.tunnel.example.org", + "record_type": "A", + "size": "100b", +} + + +class TestDGADetector(unittest.TestCase): + def setUp(self): + patcher = patch("src.detector.plugins.dga_detector.logger") + self.mock_logger = patcher.start() + self.addCleanup(patcher.stop) + + def _create_detector(self, mock_kafka_handler=None, mock_clickhouse=None): + """Helper method to create a DGADetector instance with proper mocks.""" + if mock_kafka_handler is None: + mock_kafka_handler = MagicMock() + if mock_clickhouse is None: + mock_clickhouse = MagicMock() + + detector_config = { + "name": "dga_detector", + "detector_module_name": "dga_detector", + "detector_class_name": "DGADetector", + "model": "rf", + "checksum": "ba1f718179191348fe2abd51644d76191d42a5d967c6844feb3371b6f798bf06", + "base_url": "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021", + "threshold": 0.005, + } + + with patch( + "src.detector.detector.ExactlyOnceKafkaConsumeHandler", + return_value=mock_kafka_handler, + ), patch( + "src.detector.detector.ClickHouseKafkaSender", return_value=mock_clickhouse + ), patch.object( + DGADetector, "_get_model", return_value=(MagicMock(), MagicMock()) + ): + + detector = DGADetector(detector_config, "test_topic") + detector.model = MagicMock() + detector.scaler = MagicMock() + return detector + + def test_get_model_download_url(self): + """Test that the model download URL is correctly formatted.""" + mock_kafka = MagicMock() + mock_ch = MagicMock() + detector = self._create_detector(mock_kafka, mock_ch) + # overwrite model here again to not interefere with other tests when using it globally + detector.model = "rf" + self.maxDiff = None + expected_url = "https://heibox.uni-heidelberg.de/d/0d5cbcbe16cd46a58021/files/?p=%2Frf/ba1f718179191348fe2abd51644d76191d42a5d967c6844feb3371b6f798bf06/rf.pickle&dl=1" + self.assertEqual(detector.get_model_download_url(), expected_url) + + def test_detect(self): + mock_kafka = MagicMock() + mock_ch = MagicMock() + sut = self._create_detector(mock_kafka, mock_ch) + sut.messages = [DEFAULT_DATA] + with patch( + "src.detector.plugins.dga_detector.DGADetector.predict", + return_value=[[0.01, 0.99]], + ): + sut.detect() + self.assertNotEqual([], sut.warnings) + + def test_predict_calls_model(self): + """Test that predict method correctly uses the model with features.""" + mock_kafka = MagicMock() + mock_ch = MagicMock() + detector = self._create_detector(mock_kafka, mock_ch) + + # Mock model prediction + mock_prediction = np.array([[0.2, 0.8]]) + detector.model.predict_proba.return_value = mock_prediction + + # Test prediction + message = {"domain_name": "google.com"} + result = detector.predict(message) + + # Verify model was called once + detector.model.predict_proba.assert_called_once() + + # Verify the argument was correct + called_features = detector.model.predict_proba.call_args[0][0] + expected_features = detector._get_features("google.com") + np.testing.assert_array_equal(called_features, expected_features) + + # Verify prediction result + np.testing.assert_array_equal(result, mock_prediction) + + def test_get_features_basic_attributes(self): + """Test basic label features 
calculation.""" + mock_kafka = MagicMock() + mock_ch = MagicMock() + detector = self._create_detector(mock_kafka, mock_ch) + + # Test with 'google.com' + features = detector._get_features("google.com") + + # Basic features: label_length, label_max, label_average + label_length = features[0][0] # 2 (google, com) + label_max = features[0][1] # 6 (google) + label_average = features[0][2] # 10 (google.com) + + self.assertEqual(label_length, 2) + self.assertEqual(label_max, 6) + self.assertEqual(label_average, 10) # 10 characters including dot + + def test_get_features_empty_domain(self): + """Test handling of empty domain string.""" + mock_kafka = MagicMock() + mock_ch = MagicMock() + detector = self._create_detector(mock_kafka, mock_ch) + + features = detector._get_features("") + + # Basic features + # Note: "".split(".") returns [""] so length is 1 + self.assertEqual(features[0][0], 1) # label_length + self.assertEqual(features[0][1], 0) # label_max (empty string has length 0) + self.assertEqual(features[0][2], 0) # label_average (empty string) + + # Letter frequencies should all be 0 + for i in range(3, 29): # letter frequency indices + self.assertEqual(features[0][i], 0) + + def test_get_features_single_character(self): + """Test handling of single character domain.""" + mock_kafka = MagicMock() + mock_ch = MagicMock() + detector = self._create_detector(mock_kafka, mock_ch) + + features = detector._get_features("a") + + # Basic features + self.assertEqual(features[0][0], 1) # label_length + self.assertEqual(features[0][1], 1) # label_max + self.assertEqual(features[0][2], 1) # label_average + + # Letter frequency for 'a' should be 1.0 + self.assertEqual(features[0][3], 1.0) # 'a' is at index 3 + + def test_get_features_feature_vector_shape(self): + """Test that the feature vector has the expected shape.""" + mock_kafka = MagicMock() + mock_ch = MagicMock() + detector = self._create_detector(mock_kafka, mock_ch) + + features = detector._get_features("test.domain.com") + + expected_entropy = 44 + + self.assertEqual(features.shape, (1, expected_entropy)) + + def test_get_features_case_insensitivity(self): + """Test that letter frequency calculation is case-insensitive.""" + mock_kafka = MagicMock() + mock_ch = MagicMock() + detector = self._create_detector(mock_kafka, mock_ch) + + features_upper = detector._get_features("GOOGLE.COM") + features_lower = detector._get_features("google.com") + + # Letter frequencies should be identical regardless of case + np.testing.assert_array_almost_equal( + features_upper[0][3:29], # letter frequency part + features_lower[0][3:29], + decimal=5, + ) + + def test_get_features_subdomain_handling(self): + """Test proper handling of different subdomains.""" + mock_kafka = MagicMock() + mock_ch = MagicMock() + detector = self._create_detector(mock_kafka, mock_ch) + + # Test with multiple subdomains + features = detector._get_features("sub1.sub2.example.com") + + # Should have 4 labels (sub1, sub2, example, com) + self.assertEqual(features[0][0], 4) # label_length + self.assertEqual(features[0][1], 7) # label_max (example) + self.assertEqual( + features[0][2], 21 + ) # label_average (sub1.sub2.example.com has 21 characters) + + def test_entropy_calculation(self): + """Test the entropy calculation function directly.""" + mock_kafka = MagicMock() + mock_ch = MagicMock() + detector = self._create_detector(mock_kafka, mock_ch) + + # Test with uniform distribution (max entropy) + uniform = "abcd" + uniform_entropy = -4 * (0.25 * math.log(0.25, 2)) + 
self.assertAlmostEqual( + detector._get_features(uniform)[0][-3], uniform_entropy, delta=0.01 + ) + + # Test with repetitive pattern (low entropy) + repetitive = "aaaa" + self.assertAlmostEqual( + detector._get_features(repetitive)[0][-3], 0.0, delta=0.01 + ) diff --git a/tests/inspector/test_inspector.py b/tests/inspector/test_inspector.py index 1b2df9a2..b0c9230e 100644 --- a/tests/inspector/test_inspector.py +++ b/tests/inspector/test_inspector.py @@ -1,17 +1,23 @@ import unittest import uuid from datetime import datetime, timedelta -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, patch, AsyncMock import marshmallow_dataclass import numpy as np from streamad.model import ZScoreDetector, RShashDetector +from src.base.kafka_handler import ( + KafkaMessageFetchException, +) from src.base.data_classes.batch import Batch -from src.inspector.inspector import Inspector, main +from src.inspector.inspector import InspectorBase, main + +# use no_inspector for testing, as it has almost 0 domain logic +from src.inspector.plugins.no_inspector import NoInspector DEFAULT_DATA = { - "client_ip": "192.168.0.167", + "src_ip": "192.168.0.167", "dns_ip": "10.10.0.10", "response_ip": "252.79.173.222", "timestamp": "", @@ -21,6 +27,11 @@ "size": "100b", } +MINIMAL_NO_INSPECTOR_CONFIG = { + "name": "test_inspector", + "inspector_class_name": "NoInspector", +} + TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" @@ -28,6 +39,7 @@ def get_batch(data): begin = datetime.now() end = begin + timedelta(0, 3) test_batch = Batch( + batch_tree_row_id=f"{uuid.uuid4()}-{uuid.uuid4()}", batch_id=uuid.uuid4(), begin_timestamp=begin, end_timestamp=end, @@ -37,7 +49,6 @@ def get_batch(data): class TestInit(unittest.TestCase): - @patch("src.inspector.inspector.CONSUME_TOPIC", "test_topic") @patch("src.inspector.inspector.ClickHouseKafkaSender") @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") @@ -49,7 +60,11 @@ def test_init( mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance - sut = Inspector() + sut = NoInspector( + consume_topic="test_topic", + produce_topics=["produce_topic_1"], + config=MINIMAL_NO_INSPECTOR_CONFIG, + ) self.assertEqual([], sut.messages) self.assertEqual(mock_kafka_consume_handler_instance, sut.kafka_consume_handler) @@ -77,7 +92,12 @@ def test_get_data_without_return_data( mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance - sut = Inspector() + sut = NoInspector( + consume_topic="test_topic", + produce_topics=["produce_topic_1"], + config=MINIMAL_NO_INSPECTOR_CONFIG, + ) + sut.get_and_fill_data() self.assertEqual([], sut.messages) @@ -103,7 +123,12 @@ def test_get_data_with_return_data( mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance - sut = Inspector() + sut = NoInspector( + consume_topic="test_topic", + produce_topics=["produce_topic_1"], + config=MINIMAL_NO_INSPECTOR_CONFIG, + ) + sut.messages = [] sut.get_and_fill_data() @@ -135,7 +160,13 @@ def test_get_data_with_no_return_data( mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance - sut = Inspector() + sut = NoInspector( + consume_topic="test_topic", + produce_topics=["produce_topic_1"], + config=MINIMAL_NO_INSPECTOR_CONFIG, + ) + + sut.parent_row_id = f"{uuid.uuid4()}-{uuid.uuid4()}" sut.messages = [] 
sut.get_and_fill_data() @@ -162,7 +193,12 @@ def test_get_data_while_busy( mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance - sut = Inspector() + sut = NoInspector( + consume_topic="test_topic", + produce_topics=["produce_topic_1"], + config=MINIMAL_NO_INSPECTOR_CONFIG, + ) + sut.messages = ["test_data"] sut.get_and_fill_data() @@ -186,7 +222,12 @@ def test_clear_data_without_existing_data( mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance - sut = Inspector() + sut = NoInspector( + consume_topic="test_topic", + produce_topics=["produce_topic_1"], + config=MINIMAL_NO_INSPECTOR_CONFIG, + ) + sut.messages = [] sut.clear_data() @@ -208,7 +249,12 @@ def test_clear_data_with_existing_data( mock_produce_handler_instance = MagicMock() mock_kafka_produce_handler.return_value = mock_produce_handler_instance - sut = Inspector() + sut = NoInspector( + consume_topic="test_topic", + produce_topics=["produce_topic_1"], + config=MINIMAL_NO_INSPECTOR_CONFIG, + ) + sut.messages = ["test_data"] sut.begin_timestamp = "2024-05-21T08:31:27.000Z" sut.end_timestamp = "2024-05-21T08:31:29.000Z" @@ -219,133 +265,158 @@ def test_clear_data_with_existing_data( self.assertIsNone(sut.end_timestamp) -class TestDataFunction(unittest.TestCase): - - @patch("src.inspector.inspector.ClickHouseKafkaSender") +class TestSend(unittest.TestCase): + @patch("src.inspector.inspector.logger") @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch("src.inspector.inspector.TIME_TYPE", "ms") - @patch("src.inspector.inspector.TIME_RANGE", 1) - def test_count_errors( - self, mock_kafka_consume_handler, mock_produce_handler, mock_clickhouse + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_send( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_kafka_produce_handler, + mock_logger, ): mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance + mock_kafka_produce_handler.return_value = mock_produce_handler_instance + batch_schema = marshmallow_dataclass.class_schema(Batch)() - sut = Inspector() - begin_timestamp = datetime.now() - end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - ) - messages = [data] - np.testing.assert_array_equal( - np.asarray([[1.0], [0.0]]), - sut._count_errors(messages, begin_timestamp, end_timestamp), + sut = NoInspector( + consume_topic="test_topic", + produce_topics=["pipeline-inspector_to_detector"], + config=MINIMAL_NO_INSPECTOR_CONFIG, ) - @patch("src.inspector.inspector.ClickHouseKafkaSender") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch("src.inspector.inspector.TIME_TYPE", "ms") - @patch("src.inspector.inspector.TIME_RANGE", 1) - def test_mean_packet_size( - self, mock_kafka_consume_handler, mock_produce_handler, mock_clickhouse - ): - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - - 
sut = Inspector() - begin_timestamp = datetime.now() - end_timestamp = datetime.now() + timedelta(0, 0, 2) + sut.anomalies = [0.9, 0.9] + sut.X = np.array([[0.0], [0.0]]) + sut.parent_row_id = f"{uuid.uuid4()}-{uuid.uuid4()}" + sut.begin_timestamp = datetime.now() + sut.end_timestamp = datetime.now() + timedelta(0, 0, 2) data = DEFAULT_DATA data["timestamp"] = datetime.strftime( - begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT + sut.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT ) - messages = [data] - np.testing.assert_array_equal( - np.asarray([[100], [0.0]]), - sut._mean_packet_size(messages, begin_timestamp, end_timestamp), + sut.messages = [data] + mock_batch_tree_row_id = f"{uuid.UUID('754a64f3-a461-4e7b-b4cb-ab29df9c4dce')}-{uuid.UUID('f9b3cbb7-b26c-41be-8e7f-a69a9c133668')}" + mock_batch_id = uuid.UUID("5ae0872e-5bb9-472c-8c37-8c173213a51f") + with patch("src.inspector.inspector.uuid") as mock_uuid: + with patch( + "src.inspector.inspector.generate_collisions_resistant_uuid" + ) as mock_row_id: + mock_row_id.return_value = mock_batch_tree_row_id + mock_uuid.uuid4.return_value = mock_batch_id + sut.send_data() + + mock_produce_handler_instance.produce.assert_called_once_with( + topic="pipeline-inspector_to_detector", + data=batch_schema.dumps( + { + "batch_tree_row_id": mock_batch_tree_row_id, + "batch_id": mock_batch_id, + "begin_timestamp": sut.begin_timestamp, + "end_timestamp": sut.end_timestamp, + "data": [data], + } + ), + key="192.168.0.167", ) - @patch("src.inspector.inspector.ClickHouseKafkaSender") + @patch("src.inspector.inspector.logger") @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - def test_count_errors_empty_messages( - self, mock_kafka_consume_handler, mock_produce_handler, mock_clickhouse + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_send_not_suspicious( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, ): mock_kafka_consume_handler_instance = MagicMock() mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance + batch_schema = marshmallow_dataclass.class_schema(Batch)() - sut = Inspector() - begin_timestamp = datetime.now() - end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - ) - np.testing.assert_array_equal( - np.asarray([[0.0], [0.0]]), - sut._count_errors([], begin_timestamp, end_timestamp), + sut = NoInspector( + consume_topic="test_topic", + produce_topics=["produce_topic_1"], + config=MINIMAL_NO_INSPECTOR_CONFIG, ) - @patch("src.inspector.inspector.ClickHouseKafkaSender") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - def test_mean_packet_size_empty_messages( - self, mock_kafka_consume_handler, mock_produce_handler, mock_clickhouse - ): - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - - sut = Inspector() - begin_timestamp = datetime.now() - end_timestamp = begin_timestamp + timedelta(0, 0, 2) + mock_is_subnet_suspicious = MagicMock(return_value=False) + 
sut.subnet_is_suspicious = mock_is_subnet_suspicious + sut.anomalies = [0.0, 0.0] + sut.X = np.array([[0.0], [0.0]]) + sut.parent_row_id = f"{uuid.uuid4()}-{uuid.uuid4()}" + sut.begin_timestamp = datetime.now() + sut.end_timestamp = datetime.now() + timedelta(0, 0, 2) data = DEFAULT_DATA data["timestamp"] = datetime.strftime( - begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - ) - np.testing.assert_array_equal( - np.asarray([[0.0], [0.0]]), - sut._mean_packet_size([], begin_timestamp, end_timestamp), + sut.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT ) + data["logline_id"] = str(uuid.UUID("99a427a6-ba3f-4aa2-b848-210d994d9108")) + sut.messages = [data] + mock_batch_id = uuid.UUID("5ae0872e-5bb9-472c-8c37-8c173213a51f") + with patch("src.inspector.inspector.uuid") as mock_uuid: + mock_uuid.uuid4.return_value = mock_batch_id + sut.send_data() + + mock_produce_handler_instance.produce.assert_not_called() -class TestInspectFunction(unittest.TestCase): - @patch("src.inspector.inspector.logger") +class TestInspector(InspectorBase): + def __init__(self, consume_topic, produce_topics, config) -> None: + super().__init__(consume_topic, produce_topics, config) + self.inspected = False + self.anomalies_detected = False + + def _get_models(self, models) -> list: + return ["mock_model"] + + def inspect_anomalies(self) -> None: + self.inspected = True + if self.messages: + self.anomalies_detected = True + + def subnet_is_suspicious(self) -> bool: + return self.anomalies_detected + + +class TestInspectMethod(unittest.TestCase): @patch("src.inspector.inspector.ClickHouseKafkaSender") @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - None, - ) - def test_inspect_none_models( + def test_inspect_no_models_configured( self, - mock_kafka_consume_handler, + mock_consume_handler, mock_produce_handler, mock_clickhouse, - mock_logger, ): - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_kafka_consume_handler_instance.consume_as_object.return_value = ( - "test", - get_batch(None), + """Tests that inspect() raises NotImplementedError when no models are configured.""" + # Arrange + + config = MINIMAL_NO_INSPECTOR_CONFIG.copy() + config.update( + { + "inspector_class_name": "TestInspector", + "mode": "test_mode", + "anomaly_threshold": 0.5, + "score_threshold": 0.7, + "time_type": "test_time", + "time_range": "test_range", + } ) - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance + sut = TestInspector( + consume_topic="test_topic", + produce_topics=["produce_topic_1"], + config=config, + ) + sut.messages = [{"test": "data"}] - sut = Inspector() + # Assert with self.assertRaises(NotImplementedError): sut.inspect() @@ -353,790 +424,243 @@ def test_inspect_none_models( @patch("src.inspector.inspector.ClickHouseKafkaSender") @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - "", - ) - def test_inspect_empty_models( + def test_inspect_multiple_models_configured( self, - mock_kafka_consume_handler, + mock_consume_handler, mock_produce_handler, mock_clickhouse, mock_logger, ): - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - 
mock_kafka_consume_handler_instance.consume_as_object.return_value = ( - "test", - get_batch(None), + """Tests that inspect() uses only the first model when multiple models are configured.""" + # Setup + config = MINIMAL_NO_INSPECTOR_CONFIG.copy() + config.update( + { + "inspector_class_name": "TestInspector", + "mode": "test_mode", + "models": [{"model": "Model1"}, {"model": "Model2"}], + "anomaly_threshold": 0.5, + "score_threshold": 0.7, + "time_type": "test_time", + "time_range": "test_range", + } ) - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - sut = Inspector() - with self.assertRaises(NotImplementedError): - sut.inspect() - - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - [{"model": "ZScoreDetector", "module": "streamad.model", "model_args": {}}], - ) - @patch("src.inspector.inspector.TIME_TYPE", "ms") - @patch("src.inspector.inspector.TIME_RANGE", 1) - @patch("src.inspector.inspector.MODE", "univariate") - @patch("src.inspector.inspector.ClickHouseKafkaSender") - def test_inspect_univariate( - self, - mock_clickhouse, - mock_kafka_consume_handler, - mock_produce_handler, - mock_logger, - ): - test_batch = get_batch(None) - test_batch.begin_timestamp = datetime.now() - test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - test_batch.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - ) - test_batch.data = [data] - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_kafka_consume_handler_instance.consume_as_object.return_value = ( - "test", - test_batch, + sut = TestInspector( + consume_topic="test_topic", + produce_topics=["produce_topic_1"], + config=config, ) - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - sut = Inspector() - sut.get_and_fill_data() + # Mock data + sut.messages = [{"test": "data"}] + + # Execute sut.inspect() - self.assertEqual([0, 0], sut.anomalies) - @patch("src.inspector.inspector.logger") + # Verify + self.assertTrue(sut.inspected) + mock_logger.warning.assert_called_with( + "Model List longer than 1. Only the first one is taken: Model1!" 
+ ) + + @patch("src.inspector.inspector.ClickHouseKafkaSender") @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - [{"model": "ZScoreDetector", "module": "streamad.model", "model_args": {}}], - ) - @patch("src.inspector.inspector.TIME_TYPE", "ms") - @patch("src.inspector.inspector.TIME_RANGE", 1) - @patch("src.inspector.inspector.MODE", "univariate") - @patch("src.inspector.inspector.ClickHouseKafkaSender") - def test_inspect_univariate_model_init( + def test_inspect_single_model_configured( self, - mock_clickhouse, - mock_kafka_consume_handler, + mock_consume_handler, mock_produce_handler, - mock_logger, + mock_clickhouse, ): - test_batch = get_batch(None) - test_batch.begin_timestamp = datetime.now() - test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - test_batch.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - ) - test_batch.data = [data] - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_kafka_consume_handler_instance.consume_as_object.return_value = ( - "test", - test_batch, + """Tests that inspect() works correctly with a single model configured.""" + # Setup + config = MINIMAL_NO_INSPECTOR_CONFIG.copy() + config.update( + { + "inspector_class_name": "TestInspector", + "mode": "test_mode", + "models": [{"model": "Model1"}], + "anomaly_threshold": 0.5, + "score_threshold": 0.7, + "time_type": "test_time", + "time_range": "test_range", + } ) - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - sut = Inspector() - sut.get_and_fill_data() - sut._get_models( - [{"model": "ZScoreDetector", "module": "streamad.model", "model_args": {}}] - ) - models = sut.models - sut.models = None - sut._get_models( - [{"model": "ZScoreDetector", "module": "streamad.model", "model_args": {}}] + sut = TestInspector( + consume_topic="test_topic", + produce_topics=["produce_topic_1"], + config=config, ) - self.assertEqual(type(models), type(sut.models)) - models = sut.models - sut._get_models( - [{"model": "ZScoreDetector", "module": "streamad.model", "model_args": {}}] - ) - self.assertEqual(models, sut.models) - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - [ + # Mock data + sut.messages = [{"test": "data"}] + + # Execute + sut.inspect() + + # Verify + self.assertTrue(sut.inspected) + + +class TestBootStrapFunction(unittest.TestCase): + + def setUp(self): + self.inspectors = [ { - "model": "ZScoreDetector", - "module": "streamad.model", - "model_args": {"window_len": 10}, + "name": "test_inspector", + "inspector_class_name": "NoInspector", + "prefilter_name": "dominator_filter", + "inspector_module_name": "no_inspector", } - ], - ) - @patch("src.inspector.inspector.TIME_TYPE", "ms") - @patch("src.inspector.inspector.TIME_RANGE", 1) - @patch("src.inspector.inspector.MODE", "univariate") + ] + @patch("src.inspector.inspector.ClickHouseKafkaSender") - def test_inspect_univariate_2( - self, - mock_clickhouse, - mock_kafka_consume_handler, - mock_produce_handler, - mock_logger, + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + 
@patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.logger") + def test_bootstrap_normal_execution( + self, mock_logger, mock_consume_handler, mock_produce_handler, mock_clickhouse ): - test_batch = get_batch(None) - test_batch.begin_timestamp = datetime.now() - test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - test_batch.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT + """Tests that the bootstrap process executes all steps in the correct order.""" + # Setup + config = MINIMAL_NO_INSPECTOR_CONFIG.copy() + config.update( + { + "inspector_class_name": "TestInspector", + "mode": "test_mode", + "models": [{"model": "ZScoreDetector"}], + "anomaly_threshold": 0.5, + "score_threshold": 0.7, + "time_type": "test_time", + "time_range": "test_range", + } ) - test_batch.data = [data] - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_kafka_consume_handler_instance.consume_as_object.return_value = ( - "test", - test_batch, + + sut = TestInspector( + consume_topic="test_topic", + produce_topics=["produce_topic_1"], + config=config, ) - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - sut = Inspector() - sut.get_and_fill_data() - sut.inspect() - self.assertNotEqual([None, None], sut.anomalies) + # Mock data so send_data works + sut.messages = [{"src_ip": "192.168.0.1", "logline_id": "test_id"}] + sut.parent_row_id = f"{uuid.uuid4()}-{uuid.uuid4()}" + sut.begin_timestamp = datetime.now() + sut.end_timestamp = datetime.now() + timedelta(seconds=1) + + # Patch methods to control the loop + original_send_data = sut.send_data + + def mock_send_data(): + original_send_data() + # After first iteration, raise exception to break the loop + raise StopIteration("Test exception to break loop") + + sut.send_data = mock_send_data + # Track method calls + sut.inspect = MagicMock(wraps=sut.inspect) + sut.send_data = MagicMock(wraps=sut.send_data) + sut.clear_data = MagicMock(wraps=sut.clear_data) + + # Execute and verify + with self.assertRaises(StopIteration): + sut.bootstrap_inspection_process() + + # Verify method call order and count + sut.inspect.assert_called_once() + sut.send_data.assert_called_once() + sut.clear_data.assert_called_once() + + # Verify logger messages + mock_logger.info.assert_any_call("Starting test_inspector") - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - [ - {"model": "ZScoreDetector", "module": "streamad.model", "model_args": {}}, - {"model": "KNNDetector", "module": "streamad.model", "model_args": {}}, - ], - ) - @patch("src.inspector.inspector.TIME_TYPE", "ms") - @patch("src.inspector.inspector.TIME_RANGE", 1) - @patch("src.inspector.inspector.MODE", "univariate") @patch("src.inspector.inspector.ClickHouseKafkaSender") - def test_inspect_univariate_two_models( - self, - mock_clickhouse, - mock_kafka_consume_handler, - mock_produce_handler, - mock_logger, + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.logger") + def test_bootstrap_kafka_exception_handling( + self, mock_logger, mock_consume_handler, 
mock_produce_handler, mock_clickhouse ): - test_batch = get_batch(None) - test_batch.begin_timestamp = datetime.now() - test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - test_batch.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT + """Tests that IOError is handled correctly (re-raised).""" + # Setup + config = MINIMAL_NO_INSPECTOR_CONFIG.copy() + config.update( + { + "inspector_class_name": "TestInspector", + "mode": "test_mode", + "models": [{"model": "ZScoreDetector"}], + "anomaly_threshold": 0.5, + "score_threshold": 0.7, + "time_type": "test_time", + "time_range": "test_range", + } ) - test_batch.data = [data] - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_kafka_consume_handler_instance.consume_as_object.return_value = ( - "test", - test_batch, + + sut = TestInspector( + consume_topic="test_topic", + produce_topics=["produce_topic_1"], + config=config, ) - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - sut = Inspector() - sut.get_and_fill_data() - sut.inspect() - self.assertEqual([0, 0], sut.anomalies) - self.assertTrue(isinstance(sut.models[0], ZScoreDetector)) + # Mock data so send_data works + sut.messages = [{"src_ip": "192.168.0.1", "logline_id": "test_id"}] + sut.parent_row_id = f"{uuid.uuid4()}-{uuid.uuid4()}" + sut.begin_timestamp = datetime.now() + sut.end_timestamp = datetime.now() + timedelta(seconds=1) + + # Patch inspect to raise IOError + def mock_inspect(): + raise IOError("Test IO error") + + sut.inspect = mock_inspect + sut.get_and_fill_data = MagicMock() + sut.send_data = MagicMock() + sut.clear_data = MagicMock(wraps=sut.clear_data) + + # Execute and verify + with self.assertRaises(IOError) as context: + sut.bootstrap_inspection_process() + self.assertEqual(str(context.exception), "Test IO error") + + # Verify method calls + sut.get_and_fill_data.assert_called_once() + sut.send_data.assert_not_called() + sut.clear_data.assert_called_once() + + +class TestMainFunction(unittest.IsolatedAsyncioTestCase): + + def setUp(self): + self.inspectors = [ + { + "name": "test_inspector", + "inspector_class_name": "NoInspector", + "prefilter_name": "dominator_filter", + "inspector_module_name": "no_inspector", + } + ] @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - [{"model": "RShashDetector", "module": "streamad.model", "model_args": {}}], - ) - @patch("src.inspector.inspector.MODE", "multivariate") - @patch("src.inspector.inspector.ClickHouseKafkaSender") - def test_inspect_multivariate( - self, - mock_clickhouse, - mock_kafka_consume_handler, - mock_produce_handler, - mock_logger, + @patch("src.inspector.plugins.no_inspector.NoInspector") + @patch("asyncio.create_task") + @patch("asyncio.run") + async def test_main_succesful_start( + self, mock_asyncio_run, mock_asyncio_create_task, mock_inspector, mock_logger ): - test_batch = get_batch(None) - test_batch.begin_timestamp = datetime.now() - test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - test_batch.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - ) - test_batch.data = [data] - mock_kafka_consume_handler_instance = MagicMock() - 
mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_kafka_consume_handler_instance.consume_as_object.return_value = ( - "test", - test_batch, - ) - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - - sut = Inspector() - sut.get_and_fill_data() - sut.inspect() - self.assertEqual([0, 0], sut.anomalies) - - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - [ - { - "model": "RShashDetector", - "module": "streamad.model", - "model_args": {"window_len": 10}, - } - ], - ) - @patch("src.inspector.inspector.MODE", "multivariate") - @patch("src.inspector.inspector.ClickHouseKafkaSender") - def test_inspect_multivariate_window_len( - self, - mock_clickhouse, - mock_kafka_consume_handler, - mock_produce_handler, - mock_logger, - ): - test_batch = get_batch(None) - test_batch.begin_timestamp = datetime.now() - test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - test_batch.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - ) - test_batch.data = [data] - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_kafka_consume_handler_instance.consume_as_object.return_value = ( - "test", - test_batch, - ) - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - - sut = Inspector() - sut.get_and_fill_data() - sut.inspect() - self.assertNotEqual([None, None], sut.anomalies) - - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - [ - {"model": "RShashDetector", "module": "streamad.model", "model_args": {}}, - {"model": "xStreamDetector", "module": "streamad.model", "model_args": {}}, - ], - ) - @patch("src.inspector.inspector.MODE", "multivariate") - @patch("src.inspector.inspector.ClickHouseKafkaSender") - def test_inspect_multivariate_two_models( - self, - mock_clickhouse, - mock_kafka_consume_handler, - mock_produce_handler, - mock_logger, - ): - test_batch = get_batch(None) - test_batch.begin_timestamp = datetime.now() - test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - test_batch.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - ) - test_batch.data = [data] - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_kafka_consume_handler_instance.consume_as_object.return_value = ( - "test", - test_batch, - ) - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - - sut = Inspector() - sut.get_and_fill_data() - sut.inspect() - self.assertEqual([0, 0], sut.anomalies) - self.assertTrue(isinstance(sut.models[0], RShashDetector)) - - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - [ - {"model": "KNNDetector", "module": "streamad.model", "model_args": {}}, - {"model": 
"SpotDetector", "module": "streamad.model", "model_args": {}}, - ], - ) - @patch( - "src.inspector.inspector.ENSEMBLE", - { - "model": "WeightEnsemble", - "module": "streamad.process", - "model_args": {"ensemble_weights": [0.6, 0.4]}, - }, - ) - @patch("src.inspector.inspector.MODE", "ensemble") - @patch("src.inspector.inspector.ClickHouseKafkaSender") - def test_inspect_ensemble( - self, - mock_clickhouse, - mock_kafka_consume_handler, - mock_produce_handler, - mock_logger, - ): - test_batch = get_batch(None) - test_batch.begin_timestamp = datetime.now() - test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - test_batch.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - ) - test_batch.data = [data] - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_kafka_consume_handler_instance.consume_as_object.return_value = ( - "test", - test_batch, - ) - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - - sut = Inspector() - sut.get_and_fill_data() - sut.inspect() - self.assertEqual([0, 0], sut.anomalies) - - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - [ - {"model": "KNNDetector", "module": "streamad.model", "model_args": {}}, - {"model": "SpotDetector", "module": "streamad.model", "model_args": {}}, - ], - ) - @patch( - "src.inspector.inspector.ENSEMBLE", - { - "model": "WeightEnsemble", - "module": "streamad.process", - "model_args": {"ensemble_weights": [0.6, 0.4]}, - }, - ) - @patch("src.inspector.inspector.MODE", "ensemble") - @patch("src.inspector.inspector.ClickHouseKafkaSender") - def test_inspect_ensemble_model_init( - self, - mock_clickhouse, - mock_kafka_consume_handler, - mock_produce_handler, - mock_logger, - ): - test_batch = get_batch(None) - test_batch.begin_timestamp = datetime.now() - test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - test_batch.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - ) - test_batch.data = [data] - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_kafka_consume_handler_instance.consume_as_object.return_value = ( - "test", - test_batch, - ) - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - - sut = Inspector() - sut.get_and_fill_data() - sut._get_ensemble() - ensemble = sut.ensemble - sut.ensemble = None - sut._get_ensemble() - self.assertEqual(type(ensemble), type(sut.ensemble)) - ensemble = sut.ensemble - sut._get_ensemble() - self.assertEqual(ensemble, sut.ensemble) - - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - [ - { - "model": "KNNDetector", - "module": "streamad.model", - "model_args": {"window_len": 10}, - }, - { - "model": "SpotDetector", - "module": "streamad.model", - "model_args": {"window_len": 10}, - }, - ], - ) - @patch( - "src.inspector.inspector.ENSEMBLE", - { - "model": "WeightEnsemble", - "module": "streamad.process", - 
"model_args": {"ensemble_weights": [0.6, 0.4]}, - }, - ) - @patch("src.inspector.inspector.MODE", "ensemble") - @patch("src.inspector.inspector.ClickHouseKafkaSender") - def test_inspect_ensemble_window_len( - self, - mock_clickhouse, - mock_kafka_consume_handler, - mock_produce_handler, - mock_logger, - ): - test_batch = get_batch(None) - test_batch.begin_timestamp = datetime.now() - test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - test_batch.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - ) - test_batch.data = [data] - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_kafka_consume_handler_instance.consume_as_object.return_value = ( - "test", - test_batch, - ) - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - - sut = Inspector() - sut.get_and_fill_data() - sut.inspect() - self.assertNotEqual([None, None], sut.anomalies) - - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - [ - {"model": "RShashDetector", "module": "streamad.model", "model_args": {}}, - {"model": "SpotDetector", "module": "streamad.model", "model_args": {}}, - ], - ) - @patch( - "src.inspector.inspector.ENSEMBLE", - { - "model": "WeightEnsemble", - "module": "streamad.process", - "model_args": {"ensemble_weights": [0.6, 0.4]}, - }, - ) - @patch("src.inspector.inspector.MODE", "ensemble") - @patch("src.inspector.inspector.ClickHouseKafkaSender") - def test_inspect_ensemble_invalid( - self, - mock_clickhouse, - mock_kafka_consume_handler, - mock_produce_handler, - mock_logger, - ): - test_batch = get_batch(None) - test_batch.begin_timestamp = datetime.now() - test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - test_batch.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - ) - test_batch.data = [data] - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_kafka_consume_handler_instance.consume_as_object.return_value = ( - "test", - test_batch, - ) - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - - sut = Inspector() - sut.get_and_fill_data() - with self.assertRaises(NotImplementedError): - sut.inspect() - - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.ClickHouseKafkaSender") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - [{"model": "INVALID", "module": "streamad.model"}], - ) - def test_invalid_model_univariate( - self, - mock_kafka_consume_handler, - mock_produce_handler, - mock_clickhouse, - mock_logger, - ): - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - - sut = Inspector() - with self.assertRaises(NotImplementedError): - sut.inspect() - - @patch("src.inspector.inspector.logger") - 
@patch("src.inspector.inspector.ClickHouseKafkaSender") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.MODELS", - [{"model": "INVALID", "module": "streamad.model"}], - ) - @patch("src.inspector.inspector.MODE", "multivariate") - def test_invalid_model_multivariate( - self, - mock_kafka_consume_handler, - mock_produce_handler, - mock_clickhouse, - mock_logger, - ): - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - - sut = Inspector() - with self.assertRaises(NotImplementedError): - sut.inspect() - - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.ClickHouseKafkaSender") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch( - "src.inspector.inspector.ENSEMBLE", - {"model": "INVALID", "module": "streamad.process"}, - ) - @patch("src.inspector.inspector.MODE", "ensemble") - def test_invalid_model_ensemble( - self, - mock_kafka_consume_handler, - mock_produce_handler, - mock_clickhouse, - mock_logger, - ): - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - - sut = Inspector() - with self.assertRaises(NotImplementedError): - sut.inspect() - - @patch("src.inspector.inspector.ClickHouseKafkaSender") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch("src.inspector.inspector.MODE", "INVALID") - def test_invalid_mode( - self, mock_kafka_consume_handler, mock_produce_handler, mock_clickhouse - ): - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - - sut = Inspector() - with self.assertRaises(NotImplementedError): - sut.inspect() - - -class TestSend(unittest.TestCase): - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch("src.inspector.inspector.SCORE_THRESHOLD", 0.1) - @patch("src.inspector.inspector.ANOMALY_THRESHOLD", 0.01) - @patch("src.inspector.inspector.ClickHouseKafkaSender") - def test_send( - self, - mock_clickhouse, - mock_kafka_consume_handler, - mock_kafka_produce_handler, - mock_logger, - ): - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_produce_handler_instance = MagicMock() - mock_kafka_produce_handler.return_value = mock_produce_handler_instance - batch_schema = marshmallow_dataclass.class_schema(Batch)() - - sut = Inspector() - sut.anomalies = [0.9, 0.9] - sut.X = np.array([[0.0], [0.0]]) - sut.begin_timestamp = datetime.now() - sut.end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - sut.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - ) - sut.messages = [data] - mock_batch_id = 
uuid.UUID("5ae0872e-5bb9-472c-8c37-8c173213a51f") - with patch("src.inspector.inspector.uuid") as mock_uuid: - mock_uuid.uuid4.return_value = mock_batch_id - sut.send_data() - - mock_produce_handler_instance.produce.assert_called_once_with( - topic="pipeline-inspector_to_detector", - data=batch_schema.dumps( - { - "batch_id": mock_batch_id, - "begin_timestamp": sut.begin_timestamp, - "end_timestamp": sut.end_timestamp, - "data": [data], - } - ), - key="192.168.0.167", - ) - - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") - @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") - @patch("src.inspector.inspector.SCORE_THRESHOLD", 0.1) - @patch("src.inspector.inspector.ANOMALY_THRESHOLD", 0.01) - @patch("src.inspector.inspector.ClickHouseKafkaSender") - def test_send_not_suspicious( - self, - mock_clickhouse, - mock_kafka_consume_handler, - mock_produce_handler, - mock_logger, - ): - mock_kafka_consume_handler_instance = MagicMock() - mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance - mock_produce_handler_instance = MagicMock() - mock_produce_handler.return_value = mock_produce_handler_instance - batch_schema = marshmallow_dataclass.class_schema(Batch)() - - sut = Inspector() - sut.anomalies = [0.0, 0.0] - sut.X = np.array([[0.0], [0.0]]) - sut.begin_timestamp = datetime.now() - sut.end_timestamp = datetime.now() + timedelta(0, 0, 2) - data = DEFAULT_DATA - data["timestamp"] = datetime.strftime( - sut.begin_timestamp + timedelta(0, 0, 1), TIMESTAMP_FORMAT - ) - data["logline_id"] = uuid.UUID("99a427a6-ba3f-4aa2-b848-210d994d9108") - sut.messages = [data] - mock_batch_id = uuid.UUID("5ae0872e-5bb9-472c-8c37-8c173213a51f") - with patch("src.inspector.inspector.uuid") as mock_uuid: - mock_uuid.uuid4.return_value = mock_batch_id - sut.send_data() - - mock_produce_handler_instance.produce.assert_not_called() - - -class TestMainFunction(unittest.TestCase): - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.Inspector") - def test_main_loop_execution(self, mock_inspector, mock_logger): - # Arrange - mock_inspector_instance = mock_inspector.return_value - - mock_inspector_instance.get_and_fill_data = MagicMock() - mock_inspector_instance.clear_data = MagicMock() - - # Act - main(one_iteration=True) - - # Assert - self.assertTrue(mock_inspector_instance.get_and_fill_data.called) - self.assertTrue(mock_inspector_instance.clear_data.called) - - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.Inspector") - def test_main_io_error_handling(self, mock_inspector, mock_logger): - # Arrange - mock_inspector_instance = mock_inspector.return_value - - # Act - with patch.object( - mock_inspector_instance, - "get_and_fill_data", - side_effect=IOError("Simulated IOError"), - ): - with self.assertRaises(IOError): - main(one_iteration=True) - - # Assert - self.assertTrue(mock_inspector_instance.clear_data.called) - self.assertTrue(mock_inspector_instance.loop_exited) - - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.Inspector") - def test_main_value_error_handling(self, mock_inspector, mock_logger): - # Arrange - mock_inspector_instance = mock_inspector.return_value - - # Act - with patch.object( - mock_inspector_instance, - "get_and_fill_data", - side_effect=ValueError("Simulated ValueError"), - ): - main(one_iteration=True) - - # Assert - self.assertTrue(mock_inspector_instance.clear_data.called) - 
self.assertTrue(mock_inspector_instance.loop_exited) - - @patch("src.inspector.inspector.logger") - @patch("src.inspector.inspector.Inspector") - def test_main_keyboard_interrupt(self, mock_inspector, mock_logger): # Arrange - mock_inspector_instance = mock_inspector.return_value - mock_inspector_instance.get_and_fill_data.side_effect = KeyboardInterrupt + mock_inspector_instance = MagicMock() + mock_inspector_instance.start = AsyncMock() + mock_inspector.return_value = mock_inspector_instance + mock_asyncio_create_task.side_effect = lambda coro: coro # Act - main() + with patch("src.inspector.inspector.INSPECTORS", self.inspectors): + await main() # Assert - self.assertTrue(mock_inspector_instance.clear_data.called) - self.assertTrue(mock_inspector_instance.loop_exited) + mock_inspector_instance.start.assert_called_once() if __name__ == "__main__": diff --git a/tests/inspector/test_no_inspector.py b/tests/inspector/test_no_inspector.py new file mode 100644 index 00000000..01a7bc9f --- /dev/null +++ b/tests/inspector/test_no_inspector.py @@ -0,0 +1,119 @@ +import unittest +from unittest.mock import patch, MagicMock +import numpy as np +from src.inspector.plugins.no_inspector import NoInspector + + +class TestNoInspector(unittest.TestCase): + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def setUp( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): + self.consume_topic = "test_topic" + self.produce_topics = ["output_topic"] + self.config = { + "name": "test-no-inspector", + "prefilter_name": "test-prefilter", + "inspector_module_name": "no_inspector", + "inspector_class_name": "NoInspector", + } + + # Create a mock InspectorBase + with patch("src.inspector.plugins.no_inspector.InspectorBase") as mock_base: + # Set up the mock to have a messages attribute + mock_base_instance = mock_base.return_value + mock_base_instance.messages = [] + mock_base_instance.name = "NoInspector" + + # Create the NoInspector instance + self.inspector = NoInspector( + self.consume_topic, self.produce_topics, self.config + ) + self.inspector.kafka_consume_handler = mock_kafka_consume_handler + # Manually set up messages for testing + self.inspector.messages = [ + {"domain_name": "example.com", "timestamp": "2025-01-01T00:00:00Z"}, + { + "domain_name": "malicious-domain.xyz", + "timestamp": "2025-01-01T00:00:01Z", + }, + ] + + def test_init(self): + # Verify constructor parameters were passed correctly + self.assertEqual(self.inspector.consume_topic, self.consume_topic) + self.assertEqual(self.inspector.produce_topics, self.produce_topics) + self.assertEqual(self.inspector.name, self.config["name"]) + + def test_inspect_anomalies(self): + # Act + self.inspector.inspect_anomalies() + + # Assert + # Should set anomalies to an array of 1s with the same length as messages + self.assertEqual(len(self.inspector.anomalies), len(self.inspector.messages)) + self.assertTrue(np.array_equal(self.inspector.anomalies, np.array([1, 1]))) + self.assertTrue(all(anomaly == 1 for anomaly in self.inspector.anomalies)) + + def test_inspect(self): + # Mock inspect_anomalies to verify it's called + with patch.object( + self.inspector, "inspect_anomalies" + ) as mock_inspect_anomalies: + # Act + self.inspector.inspect() + + # Assert + mock_inspect_anomalies.assert_called_once() + + def 
test_get_models(self): + # Act & Assert + # Should not raise any exceptions + try: + self.inspector._get_models(["model1", "model2"]) + except Exception as e: + self.fail(f"_get_models() raised {type(e).__name__} unexpectedly!") + + @patch("src.inspector.plugins.no_inspector.logger") + def test_subnet_is_suspicious(self, mock_logger): + # Arrange + self.inspector.anomalies = np.array([1, 1, 1]) + + # Act + result = self.inspector.subnet_is_suspicious() + + # Assert + mock_logger.info.assert_called_once_with( + f"{self.inspector.name}: 3 anomalies found" + ) + self.assertTrue(result) + + def test_subnet_is_suspicious_with_empty_messages(self): + # Arrange + self.inspector.messages = [] + self.inspector.anomalies = np.array([]) + + # Act + result = self.inspector.subnet_is_suspicious() + + # Assert + self.assertTrue(result) + + def test_inspect_flow(self): + # Act + self.inspector.inspect() + + # Assert + # After inspect, anomalies should be set + self.assertEqual(len(self.inspector.anomalies), len(self.inspector.messages)) + self.assertTrue(np.array_equal(self.inspector.anomalies, np.array([1, 1]))) + + # And subnet_is_suspicious should work + self.assertTrue(self.inspector.subnet_is_suspicious()) diff --git a/tests/inspector/test_stream_ad_inspector.py b/tests/inspector/test_stream_ad_inspector.py new file mode 100644 index 00000000..fe52dc1a --- /dev/null +++ b/tests/inspector/test_stream_ad_inspector.py @@ -0,0 +1,706 @@ +import unittest +from unittest.mock import patch, MagicMock, mock_open, call +from src.inspector.plugins.stream_ad_inspector import StreamADInspector +import importlib +import os +import sys +import uuid +from datetime import datetime, timedelta +from enum import Enum, unique +import asyncio +from abc import ABC, abstractmethod +import marshmallow_dataclass +import numpy as np +from streamad.util import StreamGenerator, CustomDS + +sys.path.append(os.getcwd()) +from src.base.clickhouse_kafka_sender import ClickHouseKafkaSender +from src.base.data_classes.batch import Batch +from src.base.utils import ( + setup_config, + get_zeek_sensor_topic_base_names, + generate_collisions_resistant_uuid, +) +from streamad.model import ZScoreDetector, RShashDetector + +DEFAULT_DATA = { + "src_ip": "192.168.0.167", + "dns_ip": "10.10.0.10", + "response_ip": "252.79.173.222", + "timestamp": "", + "status": "NXDOMAIN", + "host_domain_name": "24sata.info", + "record_type": "A", + "size": "100b", +} + + +def create_test_batch(): + test_batch = Batch( + batch_tree_row_id=f"{uuid.uuid4()}-{uuid.uuid4()}", + batch_id=uuid.uuid4(), + begin_timestamp=datetime.now(), + end_timestamp=datetime.now(), + data=[], + ) + test_batch.begin_timestamp = datetime.now() + test_batch.end_timestamp = datetime.now() + timedelta(0, 0, 2) + data = DEFAULT_DATA + data["ts"] = datetime.isoformat(test_batch.begin_timestamp + timedelta(0, 0, 1)) + return test_batch + + +class TestStreamAdInspectorSetup(unittest.TestCase): + @patch("src.inspector.inspector.PLUGIN_PATH", "src.inspector.plugins") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.logger") + def setUp( + self, mock_logger, mock_consume_handler, mock_produce_handler, mock_clickhouse + ): + """Initialize inspector instance before each test.""" + self.inspector = StreamADInspector( + consume_topic="consume_topic", + produce_topics=["produce_topic"], + config={ + 
"inspector_class_name": "StreamADInspector", + "inspector_module_name": "stream_ad_inspector", + "mode": "univariate", + "anomaly_threshold": 0.05, + "score_threshold": 0.05, + "time_type": "ms", + "time_range": 20, + "name": "test-inspector", + "ensemble": { + "model": "WeightEnsemble", + "module": "streamad.process", + "model_args": "", + }, + }, + ) + + def test_init_attributes(self): + """Verify initial state of core attribute.""" + self.assertTrue(self.inspector.ensemble_config["model"] == "WeightEnsemble") + self.assertTrue(self.inspector.name == "test-inspector") + self.assertTrue(self.inspector.mode == "univariate") + + +class TestStreamAdInspectorInvalidSetup(unittest.TestCase): + @patch("src.inspector.inspector.PLUGIN_PATH", "src.inspector.plugins") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.logger") + def setUp( + self, mock_logger, mock_consume_handler, mock_produce_handler, mock_clickhouse + ): + self.consume_topic = ("consume_topic",) + self.produce_topics = (["produce_topic"],) + self.config = { + "inspector_class_name": "StreamADInspector", + "inspector_module_name": "stream_ad_inspector", + "mode": "univariate", + "anomaly_threshold": 0.05, + "score_threshold": 0.05, + "time_type": "ms", + "time_range": 20, + "name": "test-inspector", + "ensemble": { + "model": "WeightEnsemble", + "module": "streamad.process", + "model_args": "", + }, + } + + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_inspect_ensemble_invalid( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): + self.config["models"] = [ + {"model": "RShashDetector", "module": "streamad.model", "model_args": {}}, + {"model": "SpotDetector", "module": "streamad.model", "model_args": {}}, + ] + self.config["mode"] = "ensemble" + self.config["ensemble"] = ( + { + "model": "WeightEnsemble", + "module": "streamad.process", + "model_args": {"ensemble_weights": [0.6, 0.4]}, + }, + ) + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + + test_batch = create_test_batch() + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler_instance.consume_as_object.return_value = ( + "test", + test_batch, + ) + sut.kafka_consume_handler = mock_kafka_consume_handler_instance + + mock_produce_handler_instance = MagicMock() + mock_produce_handler.return_value = mock_produce_handler_instance + + sut.get_and_fill_data() + with self.assertRaises(NotImplementedError): + sut.inspect() + + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + def test_invalid_model_univariate( + self, + mock_kafka_consume_handler, + mock_produce_handler, + mock_clickhouse, + mock_logger, + ): + self.config["models"] = [{"model": "INVALID", "module": "streamad.model"}] + # mock_kafka_consume_handler_instance = MagicMock() + # mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + # mock_produce_handler_instance = MagicMock() + # mock_produce_handler.return_value = 
mock_produce_handler_instance + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + with self.assertRaises(NotImplementedError): + sut.inspect() + + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + def test_invalid_model_multivariate( + self, + mock_kafka_consume_handler, + mock_produce_handler, + mock_clickhouse, + mock_logger, + ): + self.config["models"] = [{"model": "INVALID", "module": "streamad.model"}] + self.config["mode"] = "multivariate" + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + + with self.assertRaises(NotImplementedError): + sut.inspect() + + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + def test_invalid_model_ensemble( + self, + mock_kafka_consume_handler, + mock_produce_handler, + mock_clickhouse, + mock_logger, + ): + # mock_kafka_consume_handler_instance = MagicMock() + # mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + # mock_produce_handler_instance = MagicMock() + # mock_produce_handler.return_value = mock_produce_handler_instance + self.config["ensemble"] = [{"model": "INVALID", "module": "streamad.process"}] + self.config["mode"] = "ensemble" + + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + + with self.assertRaises(NotImplementedError): + sut.inspect() + + +class TestStreamAdInspectorMeanPacketSize(unittest.TestCase): + @patch("src.inspector.inspector.PLUGIN_PATH", "src.inspector.plugins") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.logger") + def setUp( + self, mock_logger, mock_consume_handler, mock_produce_handler, mock_clickhouse + ): + """Initialize inspector instance before each test.""" + self.sut = StreamADInspector( + consume_topic="consume_topic", + produce_topics=["produce_topic"], + config={ + "inspector_class_name": "StreamADInspector", + "inspector_module_name": "stream_ad_inspector", + "mode": "univariate", + "anomaly_threshold": 0.05, + "score_threshold": 0.05, + "time_type": "ms", + "time_range": 20, + "name": "test-inspector", + "ensemble": { + "model": "WeightEnsemble", + "module": "streamad.process", + "model_args": "", + }, + }, + ) + + def test_count_errors(self): + begin_timestamp = datetime.now() + end_timestamp = datetime.now() + timedelta(0, 0, 2) + data = DEFAULT_DATA + data["ts"] = datetime.isoformat(begin_timestamp + timedelta(0, 0, 1)) + messages = [data] + np.testing.assert_array_equal( + np.asarray([[1.0], [0.0]]), + self.sut._count_errors(messages, begin_timestamp, end_timestamp), + ) + + def test_mean_packet_size(self): + begin_timestamp = datetime.now() + end_timestamp = datetime.now() + timedelta(0, 0, 2) + data = DEFAULT_DATA + data["ts"] = datetime.isoformat(begin_timestamp + timedelta(0, 0, 1)) + messages = [data] + np.testing.assert_array_equal( + np.asarray([[100], [0.0]]), + self.sut._mean_packet_size(messages, begin_timestamp, end_timestamp), + ) + + def test_count_errors_empty_messages(self): + + begin_timestamp = datetime.now() + end_timestamp = 
datetime.now() + timedelta(0, 0, 2) + data = DEFAULT_DATA + data["ts"] = datetime.isoformat(begin_timestamp + timedelta(0, 0, 1)) + np.testing.assert_array_equal( + np.asarray([[0.0], [0.0]]), + self.sut._count_errors([], begin_timestamp, end_timestamp), + ) + + def test_mean_packet_size_empty_messages(self): + begin_timestamp = datetime.now() + end_timestamp = begin_timestamp + timedelta(0, 0, 2) + data = DEFAULT_DATA + data["ts"] = datetime.isoformat(begin_timestamp + timedelta(0, 0, 1)) + np.testing.assert_array_equal( + np.asarray([[0.0], [0.0]]), + self.sut._mean_packet_size([], begin_timestamp, end_timestamp), + ) + + +class TestInspectFunction(unittest.TestCase): + def setUp(self): + self.consume_topic = ("consume_topic",) + self.produce_topics = (["produce_topic"],) + self.config = { + "inspector_class_name": "StreamADInspector", + "inspector_module_name": "stream_ad_inspector", + "mode": "univariate", + "anomaly_threshold": 0.05, + "score_threshold": 0.05, + "time_type": "ms", + "time_range": 20, + "name": "test-inspector", + "ensemble": { + "model": "WeightEnsemble", + "module": "streamad.process", + "model_args": "", + }, + } + + @patch("src.inspector.inspector.ClickHouseKafkaSender") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.logger") + def test_inspect_none_models( + self, mock_logger, mock_consume_handler, mock_produce_handler, mock_clickhouse + ): + self.config["models"] = None + + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + test_batch = create_test_batch() + + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler_instance.consume_as_object.return_value = ( + "test", + test_batch, + ) + sut.kafka_consume_handler = mock_kafka_consume_handler_instance + + sut.get_and_fill_data() + with self.assertRaises(NotImplementedError): + sut.inspect() + + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + def test_inspect_empty_models( + self, + mock_kafka_consume_handler, + mock_produce_handler, + mock_clickhouse, + mock_logger, + ): + self.config["models"] = [] + + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + test_batch = create_test_batch() + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler_instance.consume_as_object.return_value = ( + "test", + test_batch, + ) + sut.kafka_consume_handler = mock_kafka_consume_handler_instance + + sut.get_and_fill_data() + + with self.assertRaises(NotImplementedError): + sut.inspect() + + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_inspect_univariate( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): + self.config["models"] = [ + {"model": "ZScoreDetector", "module": "streamad.model", "model_args": {}} + ] + self.config["mode"] = "univariate" + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + test_batch = create_test_batch() + + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = 
mock_kafka_consume_handler_instance + mock_kafka_consume_handler_instance.consume_as_object.return_value = ( + "test", + test_batch, + ) + sut.kafka_consume_handler = mock_kafka_consume_handler_instance + + sut.get_and_fill_data() + sut.inspect() + self.assertEqual([0, 0], sut.anomalies) + + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_inspect_univariate_model_init( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): + self.config["models"] = [ + {"model": "ZScoreDetector", "module": "streamad.model", "model_args": {}} + ] + self.config["mode"] = "univariate" + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + test_batch = create_test_batch() + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + mock_kafka_consume_handler_instance.consume_as_object.return_value = ( + "test", + test_batch, + ) + sut.kafka_consume_handler = mock_kafka_consume_handler_instance + + sut.get_and_fill_data() + sut.models = sut._get_models( + [{"model": "ZScoreDetector", "module": "streamad.model", "model_args": {}}] + ) + self.assertEqual(ZScoreDetector, type(sut.models[0])) + + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_inspect_univariate_2( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): + self.config["models"] = [ + { + "model": "ZScoreDetector", + "module": "streamad.model", + "model_args": {"window_len": 10}, + } + ] + self.config["mode"] = "univariate" + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + test_batch = create_test_batch() + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + mock_kafka_consume_handler_instance.consume_as_object.return_value = ( + "test", + test_batch, + ) + sut.kafka_consume_handler = mock_kafka_consume_handler_instance + + sut.get_and_fill_data() + sut.inspect() + self.assertNotEqual([None, None], sut.anomalies) + + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_inspect_univariate_two_models( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): + self.config["models"] = [ + {"model": "ZScoreDetector", "module": "streamad.model", "model_args": {}}, + {"model": "KNNDetector", "module": "streamad.model", "model_args": {}}, + ] + self.config["mode"] = "univariate" + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + test_batch = create_test_batch() + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + mock_kafka_consume_handler_instance.consume_as_object.return_value = ( + "test", + test_batch, + ) + sut.kafka_consume_handler = mock_kafka_consume_handler_instance + + sut.get_and_fill_data() + sut.inspect() + 
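# A minimal sketch of how the {"model", "module", "model_args"} entries exercised in
# these tests could be resolved into StreamAD detector instances. The helper name
# `build_detectors` is only illustrative; StreamADInspector._get_models may be
# implemented differently.
import importlib


def build_detectors(model_configs):
    """Instantiate each configured detector class from its module path."""
    detectors = []
    for entry in model_configs:
        module = importlib.import_module(entry["module"])   # e.g. "streamad.model"
        detector_cls = getattr(module, entry["model"])       # e.g. "ZScoreDetector"
        detectors.append(detector_cls(**(entry.get("model_args") or {})))
    return detectors


# Example (assumed usage, mirroring the configs above):
# build_detectors([{"model": "ZScoreDetector", "module": "streamad.model", "model_args": {}}])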
self.assertEqual([0, 0], sut.anomalies) + self.assertTrue(isinstance(sut.models[0], ZScoreDetector)) + + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_inspect_multivariate( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): + self.config["models"] = [ + {"model": "RShashDetector", "module": "streamad.model", "model_args": {}} + ] + self.config["mode"] = "multivariate" + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + test_batch = create_test_batch() + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + mock_kafka_consume_handler_instance.consume_as_object.return_value = ( + "test", + test_batch, + ) + sut.kafka_consume_handler = mock_kafka_consume_handler_instance + + sut.get_and_fill_data() + sut.inspect() + self.assertEqual([0, 0], sut.anomalies) + + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_inspect_multivariate_window_len( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): + self.config["models"] = [ + { + "model": "RShashDetector", + "module": "streamad.model", + "model_args": {"window_len": 10}, + } + ] + self.config["mode"] = "multivariate" + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + test_batch = create_test_batch() + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + mock_kafka_consume_handler_instance.consume_as_object.return_value = ( + "test", + test_batch, + ) + sut.kafka_consume_handler = mock_kafka_consume_handler_instance + + sut.get_and_fill_data() + sut.inspect() + self.assertNotEqual([None, None], sut.anomalies) + + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_inspect_multivariate_two_models( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): + self.config["models"] = [ + {"model": "RShashDetector", "module": "streamad.model", "model_args": {}}, + {"model": "xStreamDetector", "module": "streamad.model", "model_args": {}}, + ] + self.config["mode"] = "multivariate" + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + test_batch = create_test_batch() + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + mock_kafka_consume_handler_instance.consume_as_object.return_value = ( + "test", + test_batch, + ) + sut.kafka_consume_handler = mock_kafka_consume_handler_instance + + sut.get_and_fill_data() + sut.inspect() + self.assertEqual([0, 0], sut.anomalies) + self.assertTrue(isinstance(sut.models[0], RShashDetector)) + + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + 
@patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_inspect_ensemble( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): + self.config["models"] = [ + {"model": "KNNDetector", "module": "streamad.model", "model_args": {}}, + {"model": "SpotDetector", "module": "streamad.model", "model_args": {}}, + ] + self.config["ensemble"] = { + "model": "WeightEnsemble", + "module": "streamad.process", + "model_args": {"ensemble_weights": [0.6, 0.4]}, + } + self.config["mode"] = "ensemble" + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + test_batch = create_test_batch() + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + mock_kafka_consume_handler_instance.consume_as_object.return_value = ( + "test", + test_batch, + ) + sut.kafka_consume_handler = mock_kafka_consume_handler_instance + + sut.get_and_fill_data() + sut.inspect() + self.assertEqual([0, 0], sut.anomalies) + + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_inspect_ensemble_model_init( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): + self.config["models"] = [ + { + "model": "KNNDetector", + "module": "streamad.model", + "model_args": {"window_len": 10}, + }, + { + "model": "SpotDetector", + "module": "streamad.model", + "model_args": {"window_len": 10}, + }, + ] + self.config["ensemble"] = { + "model": "WeightEnsemble", + "module": "streamad.process", + "model_args": {"ensemble_weights": [0.6, 0.4]}, + } + self.config["mode"] = "ensemble" + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + test_batch = create_test_batch() + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = mock_kafka_consume_handler_instance + mock_kafka_consume_handler_instance.consume_as_object.return_value = ( + "test", + test_batch, + ) + sut.kafka_consume_handler = mock_kafka_consume_handler_instance + + sut.get_and_fill_data() + sut._get_ensemble() + ensemble = sut.ensemble + sut.ensemble = None + sut._get_ensemble() + self.assertEqual(type(ensemble), type(sut.ensemble)) + ensemble = sut.ensemble + sut._get_ensemble() + self.assertEqual(ensemble, sut.ensemble) + + @patch("src.inspector.inspector.logger") + @patch("src.inspector.inspector.ExactlyOnceKafkaProduceHandler") + @patch("src.inspector.inspector.ExactlyOnceKafkaConsumeHandler") + @patch("src.inspector.inspector.ClickHouseKafkaSender") + def test_inspect_ensemble_window_len( + self, + mock_clickhouse, + mock_kafka_consume_handler, + mock_produce_handler, + mock_logger, + ): + self.config["models"] = [ + { + "model": "KNNDetector", + "module": "streamad.model", + "model_args": {"window_len": 10}, + }, + { + "model": "SpotDetector", + "module": "streamad.model", + "model_args": {"window_len": 10}, + }, + ] + self.config["ensemble"] = { + "model": "WeightEnsemble", + "module": "streamad.process", + "model_args": {"ensemble_weights": [0.6, 0.4]}, + } + self.config["mode"] = "ensemble" + sut = StreamADInspector(self.consume_topic, self.produce_topics, self.config) + test_batch = create_test_batch() + mock_kafka_consume_handler_instance = MagicMock() + mock_kafka_consume_handler.return_value = 
mock_kafka_consume_handler_instance + mock_kafka_consume_handler_instance.consume_as_object.return_value = ( + "test", + test_batch, + ) + sut.kafka_consume_handler = mock_kafka_consume_handler_instance + + sut.get_and_fill_data() + sut.inspect() + self.assertNotEqual([None, None], sut.anomalies) diff --git a/tests/kafka/test_exactly_once_kafka_consume_handler.py b/tests/kafka/test_exactly_once_kafka_consume_handler.py index a93d2fc8..b95203d6 100644 --- a/tests/kafka/test_exactly_once_kafka_consume_handler.py +++ b/tests/kafka/test_exactly_once_kafka_consume_handler.py @@ -362,6 +362,7 @@ def test_consume_as_object_valid_data(self): value = batch_schema.dumps( { "batch_id": uuid.uuid4(), + "batch_tree_row_id": uuid.uuid4(), "begin_timestamp": datetime.datetime.now(), "end_timestamp": datetime.datetime.now(), "data": [{"field1": "value1", "field2": "value2"}], @@ -385,6 +386,7 @@ def test_consume_as_object_valid_data_with_inner_strings(self): value = batch_schema.dumps( { "batch_id": uuid.uuid4(), + "batch_tree_row_id": uuid.uuid4(), "begin_timestamp": datetime.datetime.now(), "end_timestamp": datetime.datetime.now(), "data": [ diff --git a/tests/kafka/test_exactly_once_kafka_produce_handler.py b/tests/kafka/test_exactly_once_kafka_produce_handler.py index 557774ea..ff5c119b 100644 --- a/tests/kafka/test_exactly_once_kafka_produce_handler.py +++ b/tests/kafka/test_exactly_once_kafka_produce_handler.py @@ -25,15 +25,17 @@ class TestInit(unittest.TestCase): }, ], ) + @patch("src.base.kafka_handler.uuid") @patch("src.base.kafka_handler.Producer") - def test_init(self, mock_producer): + def test_init(self, mock_producer, mock_uuid): mock_producer_instance = MagicMock() mock_producer.return_value = mock_producer_instance - + mock_uuid.uuid4.return_value = "fixed‑uuid‑1234‑abcd‑5678‑90ef" expected_conf = { "bootstrap.servers": "127.0.0.1:9999,127.0.0.2:9998,127.0.0.3:9997", - "transactional.id": "test_transactional_id", + "transactional.id": f"test_transactional_id-{mock_uuid.uuid4.return_value}", "enable.idempotence": True, + "message.max.bytes": 1000000000, } sut = ExactlyOnceKafkaProduceHandler() @@ -44,6 +46,7 @@ def test_init(self, mock_producer): mock_producer.assert_called_once_with(expected_conf) mock_producer_instance.init_transactions.assert_called_once() + @patch("src.base.kafka_handler.uuid") @patch("src.base.kafka_handler.logger") @patch( "src.base.kafka_handler.KAFKA_BROKERS", @@ -63,14 +66,16 @@ def test_init(self, mock_producer): ], ) @patch("src.base.kafka_handler.Producer") - def test_init_fail(self, mock_producer, mock_logger): + def test_init_fail(self, mock_producer, mock_logger, mock_uuid): mock_producer_instance = MagicMock() mock_producer.return_value = mock_producer_instance + mock_uuid.uuid4.return_value = "fixed‑uuid‑1234‑abcd‑5678‑90ef" expected_conf = { "bootstrap.servers": "127.0.0.1:9999,127.0.0.2:9998,127.0.0.3:9997", - "transactional.id": "default_tid", + "transactional.id": f"default_tid-{mock_uuid.uuid4.return_value}", "enable.idempotence": True, + "message.max.bytes": 1000000000, } with patch.object( diff --git a/tests/kafka/test_simple_kafka_produce_handler.py b/tests/kafka/test_simple_kafka_produce_handler.py index 9436d07d..1b7b4067 100644 --- a/tests/kafka/test_simple_kafka_produce_handler.py +++ b/tests/kafka/test_simple_kafka_produce_handler.py @@ -29,6 +29,7 @@ def test_successful(self): "bootstrap.servers": "127.0.0.1:9999,127.0.0.2:9998,127.0.0.3:9997", "enable.idempotence": False, "acks": "1", + "message.max.bytes": 1000000000, } # Act diff --git 
a/tests/logcollector/test_batch_handler.py b/tests/logcollector/test_batch_handler.py index d6626b3c..85661cd7 100644 --- a/tests/logcollector/test_batch_handler.py +++ b/tests/logcollector/test_batch_handler.py @@ -8,7 +8,6 @@ class TestInit(unittest.TestCase): - @patch("src.logcollector.batch_handler.PRODUCE_TOPIC", "test_topic") @patch("src.logcollector.batch_handler.BufferedBatch") @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") @@ -22,10 +21,12 @@ def test_init_with_buffer( mock_buffered_batch.return_value = mock_batch_instance # Act - sut = BufferedBatchSender() + sut = BufferedBatchSender( + collector_name="test-collector", produce_topics=["test_topic"] + ) # Assert - self.assertEqual("test_topic", sut.topic) + self.assertEqual(["test_topic"], sut.topics) self.assertEqual(mock_batch_instance, sut.batch) self.assertIsNone(sut.timer) self.assertEqual(mock_handler_instance, sut.kafka_produce_handler) @@ -41,7 +42,6 @@ class TestDel(unittest.TestCase): class TestAddMessage(unittest.TestCase): @patch("src.logcollector.batch_handler.logger") - @patch("src.logcollector.batch_handler.BATCH_SIZE", 1000) @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") @patch("src.logcollector.batch_handler.BufferedBatchSender._reset_timer") @patch("src.logcollector.batch_handler.BufferedBatchSender._send_batch_for_key") @@ -66,7 +66,10 @@ def test_add_message_normal( ) ) - sut = BufferedBatchSender() + sut = BufferedBatchSender( + collector_name="test-collector", produce_topics=["test_topic"] + ) + sut.timer = MagicMock() # Act @@ -76,21 +79,36 @@ def test_add_message_normal( mock_send_batch.assert_not_called() mock_reset_timer.assert_not_called() + @patch("src.logcollector.batch_handler.get_batch_configuration") @patch("src.logcollector.batch_handler.logger") - @patch("src.logcollector.batch_handler.BATCH_SIZE", 100) @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") @patch("src.logcollector.batch_handler.BufferedBatchSender._send_batch_for_key") @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_add_message_full_messages( - self, mock_clickhouse, mock_send_batch, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_send_batch, + mock_produce_handler, + mock_logger, + mock_get_batch_config, ): + # Arrange + mock_get_batch_config.return_value = { + "batch_size": 100, + "batch_timeout": 5.9, + "subnet_id": {"ipv4_prefix_length": "16", "ipv6_prefix_length": "32"}, + } + # Arrange mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance key = "test_key" - sut = BufferedBatchSender() + sut = BufferedBatchSender( + collector_name="test-collector", produce_topics=["test_topic"] + ) + sut.timer = MagicMock() # Act @@ -116,14 +134,25 @@ def test_add_message_full_messages( ) mock_send_batch.assert_called_once() + @patch("src.logcollector.batch_handler.get_batch_configuration") @patch("src.logcollector.batch_handler.logger") - @patch("src.logcollector.batch_handler.BATCH_SIZE", 100) @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") @patch("src.logcollector.batch_handler.BufferedBatchSender._send_batch_for_key") @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_add_message_full_messages_with_different_keys( - self, mock_clickhouse, mock_send_batch, mock_produce_handler, mock_logger + self, + mock_clickhouse, + mock_send_batch, + mock_produce_handler, + 
mock_logger, + mock_get_batch_config, ): + mock_get_batch_config.return_value = { + "batch_size": 100, + "batch_timeout": 5.9, + "subnet_id": {"ipv4_prefix_length": "16", "ipv6_prefix_length": "32"}, + } + # Arrange mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance @@ -131,7 +160,10 @@ def test_add_message_full_messages_with_different_keys( key = "test_key" other_key = "other_key" - sut = BufferedBatchSender() + sut = BufferedBatchSender( + collector_name="test-collector", produce_topics=["test_topic"] + ) + sut.timer = MagicMock() # Act @@ -180,7 +212,6 @@ def test_add_message_full_messages_with_different_keys( mock_send_batch.assert_called_once() @patch("src.logcollector.batch_handler.logger") - @patch("src.logcollector.batch_handler.BATCH_SIZE", 100) @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") @patch("src.logcollector.batch_handler.BufferedBatchSender._reset_timer") @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") @@ -191,7 +222,10 @@ def test_add_message_no_timer( mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance - sut = BufferedBatchSender() + sut = BufferedBatchSender( + collector_name="test-collector", produce_topics=["test_topic"] + ) + sut.timer = None # Act @@ -228,7 +262,9 @@ def test_send_all_batches_with_existing_keys( mock_send_batch_instance = MagicMock() mock_send_batch.return_value = mock_send_batch_instance - sut = BufferedBatchSender() + sut = BufferedBatchSender( + collector_name="test-collector", produce_topics=["test_topic"] + ) # Act sut._send_all_batches(reset_timer=False) @@ -251,7 +287,9 @@ def test_send_all_batches_with_one_key( mock_send_batch_instance = MagicMock() mock_send_batch.return_value = mock_send_batch_instance - sut = BufferedBatchSender() + sut = BufferedBatchSender( + collector_name="test-collector", produce_topics=["test_topic"] + ) # Act sut._send_all_batches(reset_timer=False) @@ -279,7 +317,9 @@ def test_send_all_batches_with_existing_keys_and_reset_timer( mock_send_batch_instance = MagicMock() mock_send_batch.return_value = mock_send_batch_instance - sut = BufferedBatchSender() + sut = BufferedBatchSender( + collector_name="test-collector", produce_topics=["test_topic"] + ) # Act sut._send_all_batches(reset_timer=True) @@ -303,7 +343,9 @@ def test_send_all_batches_with_no_keys( mock_send_batch_instance = MagicMock() mock_send_batch.return_value = mock_send_batch_instance - sut = BufferedBatchSender() + sut = BufferedBatchSender( + collector_name="test-collector", produce_topics=["test_topic"] + ) # Act sut._send_all_batches(reset_timer=False) @@ -324,7 +366,10 @@ def test_send_batch_for_key_success( mock_batch.return_value = mock_batch_instance mock_batch_instance.complete_batch.return_value = "mock_data_packet" - sut = BufferedBatchSender() + sut = BufferedBatchSender( + collector_name="test-collector", produce_topics=["test_topic"] + ) + key = "test_key" # Act @@ -345,7 +390,10 @@ def test_send_batch_for_key_value_error( mock_batch.return_value = mock_batch_instance mock_batch_instance.complete_batch.side_effect = ValueError("Mock exception") - sut = BufferedBatchSender() + sut = BufferedBatchSender( + collector_name="test-collector", produce_topics=["test_topic"] + ) + key = "test_key" # Act @@ -357,7 +405,6 @@ def test_send_batch_for_key_value_error( class TestSendDataPacket(unittest.TestCase): - @patch("src.logcollector.batch_handler.PRODUCE_TOPIC", "test_topic") 
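# The BufferedBatchSender tests around here repeatedly stub get_batch_configuration
# with the same dictionary shape. A small helper like the hypothetical one below could
# build that stub value in one place; the keys simply mirror the mocked return values.
def fake_batch_configuration(batch_size=100, batch_timeout=5.9):
    """Return a batch configuration dict shaped like the mocks used in these tests."""
    return {
        "batch_size": batch_size,
        "batch_timeout": batch_timeout,
        "subnet_id": {"ipv4_prefix_length": "16", "ipv6_prefix_length": "32"},
    }


# Example: mock_get_batch_config.return_value = fake_batch_configuration()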
@patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_send_data_packet(self, mock_clickhouse, mock_produce_handler): @@ -374,7 +421,9 @@ def test_send_data_packet(self, mock_clickhouse, mock_produce_handler): "data": ["test_data"], } - sut = BufferedBatchSender() + sut = BufferedBatchSender( + collector_name="test-collector", produce_topics=["test_topic"] + ) # Act sut._send_data_packet(key, data) @@ -390,20 +439,28 @@ def test_send_data_packet(self, mock_clickhouse, mock_produce_handler): class TestResetTimer(unittest.TestCase): - @patch("src.logcollector.batch_handler.BATCH_TIMEOUT", 5.9) + @patch("src.logcollector.batch_handler.get_batch_configuration") @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") @patch("src.logcollector.batch_handler.Timer") @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_reset_timer_with_existing_timer( - self, mock_clickhouse, mock_timer, mock_produce_handler + self, mock_clickhouse, mock_timer, mock_produce_handler, mock_get_batch_config ): # Arrange + mock_get_batch_config.return_value = { + "batch_size": "200000", + "batch_timeout": 5.9, + "subnet_id": {"ipv4_prefix_length": "16", "ipv6_prefix_length": "32"}, + } mock_timer_instance = MagicMock() mock_timer.return_value = mock_timer_instance mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance - sut = BufferedBatchSender() + sut = BufferedBatchSender( + collector_name="test-collector", produce_topics=["test_topic"] + ) + sut.timer = mock_timer_instance sut._send_all_batches = MagicMock() @@ -417,18 +474,27 @@ def test_reset_timer_with_existing_timer( mock_timer.assert_called_once_with(5.9, sut._send_all_batches) sut.timer.start.assert_called_once() - @patch("src.logcollector.batch_handler.BATCH_TIMEOUT", 4.6) + @patch("src.logcollector.batch_handler.get_batch_configuration") @patch("src.logcollector.batch_handler.ExactlyOnceKafkaProduceHandler") @patch("src.logcollector.batch_handler.Timer") @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_reset_timer_without_existing_timer( - self, mock_clickhouse, mock_timer, mock_produce_handler + self, mock_clickhouse, mock_timer, mock_produce_handler, mock_get_batch_config ): + + mock_get_batch_config.return_value = { + "batch_size": "200000", + "batch_timeout": 4.6, + "subnet_id": {"ipv4_prefix_length": "16", "ipv6_prefix_length": "32"}, + } # Arrange mock_produce_handler_instance = MagicMock() mock_produce_handler.return_value = mock_produce_handler_instance - sut = BufferedBatchSender() + sut = BufferedBatchSender( + collector_name="test-collector", produce_topics=["test_topic"] + ) + sut._send_all_batches = MagicMock() # Act diff --git a/tests/logcollector/test_buffered_batch.py b/tests/logcollector/test_buffered_batch.py index 8b82eb08..96e136ac 100644 --- a/tests/logcollector/test_buffered_batch.py +++ b/tests/logcollector/test_buffered_batch.py @@ -10,7 +10,7 @@ class TestInit(unittest.TestCase): @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_init(self, mock_clickhouse): # Act - sut = BufferedBatch() + sut = BufferedBatch(collector_name="my-collector") # Assert self.assertEqual({}, sut.batch) @@ -24,7 +24,7 @@ def test_add_message_empty_batch_and_empty_buffer(self, mock_clickhouse): key = "test_key" message = "test_message" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") # Act 
sut.add_message(key, uuid.uuid4(), message) @@ -42,7 +42,7 @@ def test_add_message_empty_batch_and_used_buffer(self, mock_clickhouse): message = "test_message" old_message = "old_message" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.buffer = {key: [old_message]} # Act @@ -65,7 +65,7 @@ def test_add_message_used_batch_and_empty_buffer(self, mock_clickhouse): message = "test_message" old_message = "old_message" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.batch = {key: [old_message]} # Act @@ -87,7 +87,7 @@ def test_add_message_used_batch_and_used_buffer(self, mock_clickhouse): old_message_1 = "old_message_1" old_message_2 = "old_message_2" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.batch = {key: [old_message_2]} sut.buffer = {key: [old_message_1]} @@ -115,7 +115,7 @@ def test_add_message_with_existing_other_key(self, mock_clickhouse): old_message_1 = "old_message_1" old_message_2 = "old_message_2" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.batch = {old_key: [old_message_2]} sut.buffer = {old_key: [old_message_1]} @@ -141,7 +141,7 @@ def test_get_number_of_messages_with_empty_batch(self, mock_clickhouse): # Arrange key = "test_key" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") # Act and Assert self.assertEqual(0, sut.get_message_count_for_batch_key(key)) @@ -152,7 +152,7 @@ def test_get_number_of_messages_with_used_batch_for_key(self, mock_clickhouse): key = "test_key" message = "test_message" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.batch = {key: [message]} # Act and Assert @@ -167,7 +167,7 @@ def test_get_number_of_messages_with_used_batch_for_other_key( other_key = "other_key" message = "test_message" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.batch = {other_key: [message]} # Act and Assert @@ -183,7 +183,7 @@ def test_get_number_of_messages_with_empty_batch_and_used_buffer( other_key = "other_key" message = "test_message" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.buffer = {other_key: [message]} # Act and Assert @@ -205,7 +205,7 @@ def test_get_number_of_messages_with_multiple_keys_and_messages( message_4 = "message_4" message_5 = "message_5" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.batch = { key_3: [message_1, message_5], key_1: [message_2], @@ -226,7 +226,7 @@ def test_get_number_of_buffered_messages_with_empty_buffer(self, mock_clickhouse # Arrange key = "test_key" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") # Act and Assert self.assertEqual(0, sut.get_message_count_for_buffer_key(key)) @@ -239,7 +239,7 @@ def test_get_number_of_buffered_messages_with_used_buffer_for_key( key = "test_key" message = "test_message" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.buffer = {key: [message]} # Act and Assert @@ -254,7 +254,7 @@ def test_get_number_of_buffered_messages_with_used_buffer_for_other_key( other_key = "other_key" message = "test_message" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.buffer = {other_key: [message]} # Act and Assert @@ -270,7 +270,7 @@ def test_get_number_of_buffered_messages_with_empty_buffer_and_used_batch( other_key = "other_key" message = "test_message" - sut = BufferedBatch() + sut = 
BufferedBatch(collector_name="test_collector") sut.batch = {other_key: [message]} # Act and Assert @@ -292,7 +292,7 @@ def test_get_number_of_buffered_messages_with_multiple_keys_and_messages( message_4 = "message_4" message_5 = "message_5" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.buffer = { key_3: [message_1, message_5], key_1: [message_2], @@ -312,7 +312,7 @@ class TestSortMessages(unittest.TestCase): def test_sort_with_empty_list(self, mock_clickhouse): # Arrange list_of_timestamps_and_loglines = [] - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") # Act result = sut._sort_by_timestamp(list_of_timestamps_and_loglines) @@ -326,30 +326,30 @@ def test_sort_with_sorted_list(self, mock_clickhouse): list_of_timestamps_and_loglines = [ ( "2024-05-21T08:31:28.119Z", - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "src_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}', ), ( "2024-05-21T08:31:28.249Z", - '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}', ), ] expected_list = [ - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "src_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}', - '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}', ] - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") # Act result = sut._sort_by_timestamp(list_of_timestamps_and_loglines) @@ -363,30 +363,30 @@ def test_sort_with_unsorted_list(self, mock_clickhouse): list_of_timestamps_and_loglines = [ ( "2024-05-21T08:31:28.249Z", - '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}', ), ( "2024-05-21T08:31:28.119Z", - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "src_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}', ), ] expected_list = [ - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "src_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", 
"record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}', - '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}', ] - sut = BufferedBatch() + sut = BufferedBatch(collector_name="my-collector") # Act result = sut._sort_by_timestamp(list_of_timestamps_and_loglines) @@ -399,7 +399,7 @@ class TestExtractTuplesFromJson(unittest.TestCase): @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_empty_data(self, mock_clickhouse): # Arrange - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") data = [] # Act @@ -411,13 +411,13 @@ def test_empty_data(self, mock_clickhouse): @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_with_data(self, mock_clickhouse): # Arrange - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") data = [ - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "src_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}', - '{"timestamp": "2024-05-21T08:31:28.299Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.299Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.106", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}', @@ -425,14 +425,14 @@ def test_with_data(self, mock_clickhouse): expected_result = [ ( "2024-05-21T08:31:28.119Z", - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "src_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}', ), ( "2024-05-21T08:31:28.299Z", - '{"timestamp": "2024-05-21T08:31:28.299Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.299Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.106", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}', @@ -451,7 +451,7 @@ class TestSortBuffer(unittest.TestCase): def test_sort_empty_buffer(self, mock_clickhouse): # Arrange key = "test_key" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.buffer = [] # Act @@ -464,17 +464,17 @@ def test_sort_empty_buffer(self, mock_clickhouse): def test_sort_sorted_buffer(self, mock_clickhouse): # Arrange key = "test_key" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.buffer[key] = [ - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "src_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}', - '{"timestamp": "2024-05-21T08:31:28.249Z", "status": 
"NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}', - '{"timestamp": "2024-05-21T08:31:28.378Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.378Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.221", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "120b"}', @@ -491,31 +491,31 @@ def test_sort_sorted_buffer(self, mock_clickhouse): def test_sort_unsorted_buffer(self, mock_clickhouse): # Arrange key = "test_key" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.buffer[key] = [ - '{"timestamp": "2024-05-21T08:31:28.378Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.378Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.221", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "120b"}', - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "src_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}', - '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}', ] expected_buffer = [ - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "src_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}', - '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}', - '{"timestamp": "2024-05-21T08:31:28.378Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.378Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.221", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "120b"}', @@ -533,7 +533,7 @@ class TestSortBatch(unittest.TestCase): def test_sort_empty_batch(self, mock_clickhouse): # Arrange key = "test_key" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.batch = [] # Act @@ -546,17 +546,17 @@ def test_sort_empty_batch(self, mock_clickhouse): def test_sort_sorted_batch(self, mock_clickhouse): # Arrange key = "test_key" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.batch[key] = [ - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"ts": 
"2024-05-21T08:31:28.119Z", "status": "NOERROR", "src_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}', - '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}', - '{"timestamp": "2024-05-21T08:31:28.378Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.378Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.221", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "120b"}', @@ -573,31 +573,31 @@ def test_sort_sorted_batch(self, mock_clickhouse): def test_sort_unsorted_buffer(self, mock_clickhouse): # Arrange key = "test_key" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.batch[key] = [ - '{"timestamp": "2024-05-21T08:31:28.378Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.378Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.221", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "120b"}', - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "src_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}', - '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}', ] expected_batch = [ - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "src_ip": ' '"192.168.0.105", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "A", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}', - '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.230", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}', - '{"timestamp": "2024-05-21T08:31:28.378Z", "status": "NXDOMAIN", "client_ip": ' + '{"ts": "2024-05-21T08:31:28.378Z", "status": "NXDOMAIN", "src_ip": ' '"192.168.0.221", "dns_ip": "8.8.8.8", "host_domain_name": ' '"www.heidelberg-botanik.de", "record_type": "AAAA", "response_ip": ' '"b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "120b"}', @@ -616,19 +616,19 @@ def test_complete_batch_variant_1(self, mock_clickhouse): # Arrange key = "test_key" message_1 = ( - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": "192.168.0.105", ' + '{"ts": "2024-05-21T08:31:28.119Z", "status": "NOERROR", 
"src_ip": "192.168.0.105", ' '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", ' '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}' ) message_2 = ( - '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": "192.168.0.230", ' + '{"ts": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "src_ip": "192.168.0.230", ' '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "AAAA", ' '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}' ) expected_messages = [message_1, message_2] - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") # Act sut.add_message(key, uuid.uuid4(), message_2) @@ -655,27 +655,27 @@ def test_complete_batch_variant_2(self, mock_clickhouse): # Arrange key = "test_key" message_1 = ( - '{"timestamp": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "client_ip": "192.168.0.105", ' + '{"ts": "2024-05-21T08:31:28.119Z", "status": "NOERROR", "src_ip": "192.168.0.105", ' '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", ' '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}' ) message_2 = ( - '{"timestamp": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "client_ip": "192.168.0.230", ' + '{"ts": "2024-05-21T08:31:28.249Z", "status": "NXDOMAIN", "src_ip": "192.168.0.230", ' '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "AAAA", ' '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}' ) message_3 = ( - '{"timestamp": "2024-05-21T08:31:28.319Z", "status": "NOERROR", "client_ip": "192.168.0.105", ' + '{"ts": "2024-05-21T08:31:28.319Z", "status": "NOERROR", "src_ip": "192.168.0.105", ' '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "A", ' '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "150b"}' ) message_4 = ( - '{"timestamp": "2024-05-21T08:31:28.749Z", "status": "NXDOMAIN", "client_ip": "192.168.0.230", ' + '{"ts": "2024-05-21T08:31:28.749Z", "status": "NXDOMAIN", "src_ip": "192.168.0.230", ' '"dns_ip": "8.8.8.8", "host_domain_name": "www.heidelberg-botanik.de", "record_type": "AAAA", ' '"response_ip": "b937:2f2e:2c1c:82a:33ad:9e59:ceb9:8e1", "size": "100b"}' ) - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") # Act sut.add_message(key, uuid.uuid4(), message_1) @@ -719,7 +719,7 @@ def test_complete_batch_variant_3(self, mock_clickhouse): # Arrange key = "test_key" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.buffer = {key: ["message"]} # Act and Assert @@ -734,7 +734,7 @@ def test_complete_batch_variant_4(self, mock_clickhouse): # Arrange key = "test_key" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") # Act and Assert with self.assertRaises(ValueError): @@ -748,7 +748,7 @@ class TestGetStoredKeys(unittest.TestCase): @patch("src.logcollector.batch_handler.ClickHouseKafkaSender") def test_get_stored_keys_without_any_keys_stored(self, mock_clickhouse): # Arrange - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") # Act and Assert self.assertEqual(set(), sut.get_stored_keys()) @@ -760,7 +760,7 @@ def test_get_stored_keys_with_keys_stored_only_in_batch(self, mock_clickhouse): key_2 = "key_2" key_3 = "key_3" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.batch = {key_1: "message_1", key_2: 
"message_2", key_3: "message_3"} # Act and Assert @@ -773,7 +773,7 @@ def test_get_stored_keys_with_keys_stored_only_in_buffer(self, mock_clickhouse): key_2 = "key_2" key_3 = "key_3" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.buffer = {key_1: "message_1", key_2: "message_2", key_3: "message_3"} # Act and Assert @@ -788,7 +788,7 @@ def test_get_stored_keys_with_keys_stored_in_both_batch_and_buffer( key_2 = "key_2" key_3 = "key_3" - sut = BufferedBatch() + sut = BufferedBatch(collector_name="test_collector") sut.batch = {key_2: "message_2", key_3: "message_3"} sut.buffer = {key_1: "message_1"} diff --git a/tests/logcollector/test_collector.py b/tests/logcollector/test_collector.py index 7c96eb42..86ca53d1 100644 --- a/tests/logcollector/test_collector.py +++ b/tests/logcollector/test_collector.py @@ -6,10 +6,10 @@ from unittest.mock import MagicMock, patch, AsyncMock, Mock from src.logcollector.collector import LogCollector, main +from src.base.utils import setup_config class TestInit(unittest.TestCase): - @patch("src.logcollector.collector.CONSUME_TOPIC", "test_topic") @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") @@ -28,7 +28,13 @@ def test_valid_init( mock_logline_handler.return_value = mock_logline_handler_instance mock_kafka_handler.return_value = mock_kafka_handler_instance - sut = LogCollector() + sut = LogCollector( + collector_name="test-collector", + consume_topic="test_topic", + produce_topics=["produce-topic"], + protocol="dns", + validation_config={}, + ) self.assertEqual(mock_batch_handler_instance, sut.batch_handler) self.assertEqual(mock_logline_handler_instance, sut.logline_handler) @@ -53,82 +59,66 @@ def setUp( mock_kafka_consume_handler, mock_logger, ): - self.sut = LogCollector() + self.sut = LogCollector( + collector_name="my-collector", + consume_topic="consume-topic", + produce_topics=["produce-topic"], + protocol="dns", + validation_config={}, + ) async def test_start_successful_execution(self): # Arrange - self.sut.fetch = AsyncMock() - - async def mock_gather(*args, **kwargs): - return None - - with patch( - "src.logcollector.collector.asyncio.gather", side_effect=mock_gather - ) as mock: - # Act - await self.sut.start() - - # Assert - mock.assert_called_once() - self.sut.fetch.assert_called_once() - - async def test_start_handles_keyboard_interrupt(self): - # Arrange - self.sut.fetch = AsyncMock() + self.sut.fetch = MagicMock() + await self.sut.start() + self.sut.fetch.assert_called_once() - async def mock_gather(*args, **kwargs): - raise KeyboardInterrupt - with patch( - "src.logcollector.collector.asyncio.gather", side_effect=mock_gather - ) as mock: - # Act - await self.sut.start() +class _StopFetching(RuntimeError): + """Raised inside the test to break the infinite fetch loop.""" - # Assert - mock.assert_called_once() - self.sut.fetch.assert_called_once() - -class TestFetch(unittest.IsolatedAsyncioTestCase): +class TestFetch(unittest.TestCase): @patch("src.logcollector.collector.LoglineHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") + @patch("src.logcollector.collector.LogCollector.send") + @patch("src.logcollector.collector.logger") @patch("src.logcollector.collector.ClickHouseKafkaSender") - async def asyncSetUp( + def test_handle_kafka_inputs( self, mock_clickhouse, - mock_kafka_handler, + mock_logger, + 
mock_send, + mock_kafka_consume, mock_batch_sender, mock_logline_handler, ): - self.sut = LogCollector() - self.sut.kafka_consume_handler = AsyncMock() - - @patch("src.logcollector.collector.LogCollector.send") - @patch("src.logcollector.collector.logger") - @patch("asyncio.get_running_loop") - @patch("src.logcollector.collector.ClickHouseKafkaSender") - async def test_handle_kafka_inputs( - self, mock_clickhouse, mock_get_running_loop, mock_logger, mock_send - ): - mock_send_instance = AsyncMock() + mock_consume_handler = MagicMock() + mock_consume_handler.consume.side_effect = [ + ("key1", "value1", "topic1"), + _StopFetching(), + ] + mock_kafka_consume.return_value = mock_consume_handler + mock_send_instance = MagicMock() mock_send.return_value = mock_send_instance - mock_loop = AsyncMock() - mock_get_running_loop.return_value = mock_loop - self.sut.kafka_consume_handler.consume.return_value = ( - "key1", - "value1", - "topic1", + self.sut = LogCollector( + collector_name="my-collector", + consume_topic="consume-topic", + produce_topics=["produce-topic"], + protocol="dns", + validation_config={}, ) + original_fetch = self.sut.fetch - mock_loop.run_in_executor.side_effect = [ - ("key1", "value1", "topic1"), - asyncio.CancelledError(), - ] + def fetch_wrapper(*args, **kwargs): + try: + original_fetch(*args, **kwargs) + except _StopFetching: + return - with self.assertRaises(asyncio.CancelledError): - await self.sut.fetch() + with patch.object(self.sut, "fetch", new=fetch_wrapper): + self.sut.fetch() mock_send.assert_called_once() @@ -142,7 +132,13 @@ def setUp(self): patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler"), patch("src.logcollector.collector.ClickHouseKafkaSender"), ): - self.sut = LogCollector() + self.sut = LogCollector( + collector_name="my-collector", + consume_topic="consume-topic", + produce_topics=["produce-topic"], + protocol="dns", + validation_config={}, + ) def test_valid_logline(self): timestamp = datetime.datetime(2026, 2, 14, 16, 38, 6, 184006) @@ -169,15 +165,14 @@ def test_invalid_logline(self): mock_logline_handler = Mock() self.sut.logline_handler = mock_logline_handler.return_value self.sut.logline_handler.validate_logline_and_get_fields_as_json.return_value = { - "timestamp": str(timestamp), + "ts": str(timestamp), "status_code": "test_status", - "client_ip": "192.168.3.141", + "src_ip": "192.168.3.141", "record_type": "test_record_type", } # Act with ( - patch("src.logcollector.collector.IPV4_PREFIX_LENGTH", 24), patch( "src.logcollector.collector.uuid.uuid4", return_value=uuid.UUID("da3aec7f-b355-4a2c-a2f4-2066d49431a5"), @@ -188,12 +183,11 @@ def test_invalid_logline(self): # Assert self.sut.batch_handler.add_message.assert_called_once_with( "192.168.3.0_24", - '{"timestamp": "2026-02-14 16:38:06.184006", "status_code": "test_status", "client_ip": "192.168.3.141", "record_type": "test_record_type", "logline_id": "da3aec7f-b355-4a2c-a2f4-2066d49431a5"}', + '{"ts": "2026-02-14 16:38:06.184006", "status_code": "test_status", "src_ip": "192.168.3.141", "record_type": "test_record_type", "logline_id": "da3aec7f-b355-4a2c-a2f4-2066d49431a5"}', ) class TestGetSubnetId(unittest.TestCase): - @patch("src.logcollector.collector.IPV4_PREFIX_LENGTH", 24) @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") @@ -205,18 +199,26 @@ def test_get_subnet_id_ipv4( mock_batch_handler, mock_kafka_handler, ): - # Arrange test_address = 
ipaddress.IPv4Address("192.168.1.1") expected_result = f"192.168.1.0_24" - sut = LogCollector() - + sut = LogCollector( + collector_name="my-collector", + consume_topic="consume-topic", + produce_topics=["produce-topic"], + protocol="dns", + validation_config={}, + ) + sut.batch_configuration = { + "batch_size": 100, + "batch_timeout": 5.9, + "subnet_id": {"ipv4_prefix_length": 24, "ipv6_prefix_length": 64}, + } # Act result = sut._get_subnet_id(test_address) # Assert self.assertEqual(expected_result, result) - @patch("src.logcollector.collector.IPV4_PREFIX_LENGTH", 24) @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") @@ -228,18 +230,26 @@ def test_get_subnet_id_ipv4_zero( mock_batch_handler, mock_kafka_handler, ): - # Arrange test_address = ipaddress.IPv4Address("0.0.0.0") expected_result = f"0.0.0.0_24" - sut = LogCollector() - + sut = LogCollector( + collector_name="my-collector", + consume_topic="consume-topic", + produce_topics=["produce-topic"], + protocol="dns", + validation_config={}, + ) + sut.batch_configuration = { + "batch_size": 100, + "batch_timeout": 5.9, + "subnet_id": {"ipv4_prefix_length": 24, "ipv6_prefix_length": 64}, + } # Act result = sut._get_subnet_id(test_address) # Assert self.assertEqual(expected_result, result) - @patch("src.logcollector.collector.IPV4_PREFIX_LENGTH", 23) @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") @@ -251,18 +261,26 @@ def test_get_subnet_id_ipv4_max( mock_batch_handler, mock_kafka_handler, ): - # Arrange test_address = ipaddress.IPv4Address("255.255.255.255") expected_result = f"255.255.254.0_23" - sut = LogCollector() - + sut = LogCollector( + collector_name="my-collector", + consume_topic="consume-topic", + produce_topics=["produce-topic"], + protocol="dns", + validation_config={}, + ) + sut.batch_configuration = { + "batch_size": 100, + "batch_timeout": 5.9, + "subnet_id": {"ipv4_prefix_length": 23, "ipv6_prefix_length": 64}, + } # Act result = sut._get_subnet_id(test_address) # Assert self.assertEqual(expected_result, result) - @patch("src.logcollector.collector.IPV6_PREFIX_LENGTH", 64) @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") @@ -274,18 +292,26 @@ def test_get_subnet_id_ipv6( mock_batch_handler, mock_kafka_handler, ): - # Arrange test_address = ipaddress.IPv6Address("2001:db8:85a3:1234:5678:8a2e:0370:7334") expected_result = f"2001:db8:85a3:1234::_64" - sut = LogCollector() - + sut = LogCollector( + collector_name="my-collector", + consume_topic="consume-topic", + produce_topics=["produce-topic"], + protocol="dns", + validation_config={}, + ) # Act + sut.batch_configuration = { + "batch_size": 100, + "batch_timeout": 5.9, + "subnet_id": {"ipv4_prefix_length": 24, "ipv6_prefix_length": 64}, + } result = sut._get_subnet_id(test_address) # Assert self.assertEqual(expected_result, result) - @patch("src.logcollector.collector.IPV6_PREFIX_LENGTH", 64) @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") @@ -300,15 +326,24 @@ def test_get_subnet_id_ipv6_zero( # Arrange test_address = ipaddress.IPv6Address("::") expected_result = 
f"::_64" - sut = LogCollector() - + sut = LogCollector( + collector_name="my-collector", + consume_topic="consume-topic", + produce_topics=["produce-topic"], + protocol="dns", + validation_config={}, + ) + sut.batch_configuration = { + "batch_size": 100, + "batch_timeout": 5.9, + "subnet_id": {"ipv4_prefix_length": 24, "ipv6_prefix_length": 64}, + } # Act result = sut._get_subnet_id(test_address) # Assert self.assertEqual(expected_result, result) - @patch("src.logcollector.collector.IPV6_PREFIX_LENGTH", 48) @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") @@ -323,16 +358,24 @@ def test_get_subnet_id_ipv6_max( # Arrange test_address = ipaddress.IPv6Address("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff") expected_result = f"ffff:ffff:ffff::_48" - sut = LogCollector() - + sut = LogCollector( + collector_name="my-collector", + consume_topic="consume-topic", + produce_topics=["produce-topic"], + protocol="dns", + validation_config={}, + ) + sut.batch_configuration = { + "batch_size": 100, + "batch_timeout": 5.9, + "subnet_id": {"ipv4_prefix_length": 24, "ipv6_prefix_length": 48}, + } # Act result = sut._get_subnet_id(test_address) # Assert self.assertEqual(expected_result, result) - @patch("src.logcollector.collector.IPV4_PREFIX_LENGTH", 24) - @patch("src.logcollector.collector.IPV6_PREFIX_LENGTH", 48) @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") @@ -346,15 +389,23 @@ def test_get_subnet_id_unsupported_type( ): # Arrange test_address = "192.168.1.1" # String instead of IPv4Address or IPv6Address - sut = LogCollector() - + sut = LogCollector( + collector_name="my-collector", + consume_topic="consume-topic", + produce_topics=["produce-topic"], + protocol="dns", + validation_config={}, + ) + sut.batch_configuration = { + "batch_size": 100, + "batch_timeout": 5.9, + "subnet_id": {"ipv4_prefix_length": 24, "ipv6_prefix_length": 48}, + } # Act & Assert with self.assertRaises(ValueError): # noinspection PyTypeChecker sut._get_subnet_id(test_address) - @patch("src.logcollector.collector.IPV4_PREFIX_LENGTH", 24) - @patch("src.logcollector.collector.IPV6_PREFIX_LENGTH", 48) @patch("src.logcollector.collector.ExactlyOnceKafkaConsumeHandler") @patch("src.logcollector.collector.BufferedBatchSender") @patch("src.logcollector.collector.LoglineHandler") @@ -368,7 +419,18 @@ def test_get_subnet_id_none( ): # Arrange test_address = None - sut = LogCollector() + sut = LogCollector( + collector_name="my-collector", + consume_topic="consume-topic", + produce_topics=["produce-topic"], + protocol="dns", + validation_config={}, + ) + sut.batch_configuration = { + "batch_size": 100, + "batch_timeout": 5.9, + "subnet_id": {"ipv4_prefix_length": 24, "ipv6_prefix_length": 48}, + } # Act & Assert with self.assertRaises(ValueError): @@ -376,21 +438,45 @@ def test_get_subnet_id_none( sut._get_subnet_id(test_address) -class TestMain(unittest.TestCase): +class TestMain(unittest.IsolatedAsyncioTestCase): + def setUp(self): + self.cs = [ + { + "name": "test_collector", + "protocol_base": "dns", + "required_log_information": [ + ["ts", "Timestamp", "%Y-%m-%dT%H:%M:%S"], + [ + "domain_name", + "RegEx", + "^(?=.{1,253}$)((?!-)[A-Za-z0-9-]{1,63}(?