From 6e630e90c48c80ecbed059afabd5109103f44e81 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Mon, 8 Apr 2024 12:04:42 -0700 Subject: [PATCH 1/9] Use new TPU segment option --- .../objectdetection_coral_multitpu.py | 4 +- src/modules/ObjectDetectionCoral/options.py | 247 +++++++++++++----- .../ObjectDetectionCoral/segment_and_test.py | 64 +++-- .../ObjectDetectionCoral/tpu_runner.py | 23 +- 4 files changed, 231 insertions(+), 107 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py index af1a737b..dde479ac 100644 --- a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py +++ b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py @@ -271,11 +271,11 @@ def main(): if half_wall_start is not None: half_wall_time = time.perf_counter() - half_wall_start - print('completed one run every %.2fms for %d runs; %.2fms wall time for a single run' % + logging.info('completed one run every %.2fms for %d runs; %.2fms wall time for a single run' % (wall_time * 1000 / args.count, args.count, (time.perf_counter() - start_one) * 1000)) - print('%.2fms avg time blocked across %d threads; %.2fms ea for final %d inferences' % + logging.info('%.2fms avg time blocked across %d threads; %.2fms ea for final %d inferences' % (tot_infr_time / args.count, thread_cnt, half_wall_time * 1000 / half_infr_count, half_infr_count)) diff --git a/src/modules/ObjectDetectionCoral/options.py b/src/modules/ObjectDetectionCoral/options.py index 676dd4e2..14a036cf 100644 --- a/src/modules/ObjectDetectionCoral/options.py +++ b/src/modules/ObjectDetectionCoral/options.py @@ -12,13 +12,152 @@ def getEnvVariable(a, b): class Settings: def __init__(self, model_name: str, model_name_pattern: str, std_model_name: str, - tpu_model_name: str, labels_name: str, tpu_segments_lists): + tpu_model_name: str, labels_name: str): self.model_name = model_name self.model_name_pattern = model_name_pattern self.cpu_model_name = std_model_name self.tpu_model_name = tpu_model_name self.labels_name = labels_name - self.tpu_segments_lists = tpu_segments_lists + + self.MODEL_SEGMENTS = { + 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq': { + # 104.2 ms per inference + 2: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], + # 67.5 ms per inference + 3: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'], + # 49.1 ms per inference + 4: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], + # 43.5 ms per inference + 5: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'], + # 37.0 ms per inference + 6: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 
'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'], + # 31.1 ms per inference + 7: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], + # 27.1 ms per inference + 8: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'], + }, + 'efficientdet_lite2_448_ptq': { + # 32.1 ms per inference + 2: ['all_segments_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], + # 19.5 ms per inference + 3: ['166x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '166x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], + # 16.5 ms per inference + 4: ['15x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_3_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_3_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_2_of_3_edgetpu.tflite'], + # 13.6 ms per inference + 5: ['15x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], + # 11.5 ms per inference + 7: ['166x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '166x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], + # 11.3 ms per inference + 8: ['15x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], + }, + 'efficientdet_lite3_512_ptq': { + # 20.9 ms per inference + 4: ['15x_last_seg_efficientdet_lite3_512_ptq_segment_0_of_2_edgetpu.tflite', '15x_last_seg_efficientdet_lite3_512_ptq_segment_1_of_2_edgetpu.tflite'], + }, + 'efficientdet_lite3x_640_ptq': { + # 95.0 ms per inference + 2: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], + # 70.6 ms per inference + 3: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_2_of_3_edgetpu.tflite'], + # 47.9 ms per inference + 4: ['2x_first_seg_efficientdet_lite3x_640_ptq_segment_0_of_3_edgetpu.tflite', '2x_first_seg_efficientdet_lite3x_640_ptq_segment_1_of_3_edgetpu.tflite', '2x_first_seg_efficientdet_lite3x_640_ptq_segment_2_of_3_edgetpu.tflite'], + # 38.7 ms per inference + 5: ['15x_first_seg_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], + # 35.1 ms per inference + 6: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], + # 30.6 ms per inference + 7: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], + # 27.3 ms per inference + 8: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], + }, + 'yolov5m-int8': { + # 56.3 ms per inference + 2: 
['all_segments_yolov5m-int8_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], + # 32.2 ms per inference + 3: ['15x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], + # 25.9 ms per inference + 4: ['2x_last_seg_yolov5m-int8_segment_0_of_4_edgetpu.tflite', '2x_last_seg_yolov5m-int8_segment_1_of_4_edgetpu.tflite', '2x_last_seg_yolov5m-int8_segment_2_of_4_edgetpu.tflite', '2x_last_seg_yolov5m-int8_segment_3_of_4_edgetpu.tflite'], + # 21.2 ms per inference + 5: ['all_segments_yolov5m-int8_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], + # 18.8 ms per inference + 6: ['15x_last_seg_yolov5m-int8_segment_0_of_3_edgetpu.tflite', '15x_last_seg_yolov5m-int8_segment_1_of_3_edgetpu.tflite', '15x_last_seg_yolov5m-int8_segment_2_of_3_edgetpu.tflite'], + # 14.7 ms per inference + 7: ['all_segments_yolov5m-int8_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_3_of_4_edgetpu.tflite'], + # 14.6 ms per inference + 8: ['all_segments_yolov5m-int8_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_2_of_3_edgetpu.tflite'], + }, + 'yolov5l-int8': { + # 61.1 ms per inference + 3: ['all_segments_yolov5l-int8_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_3_edgetpu.tflite'], + # 48.0 ms per inference + 4: ['all_segments_yolov5l-int8_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_3_of_4_edgetpu.tflite'], + # 39.0 ms per inference + 5: ['all_segments_yolov5l-int8_segment_0_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_3_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_4_of_5_edgetpu.tflite'], + # 31.5 ms per inference + 6: ['all_segments_yolov5l-int8_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_3_edgetpu.tflite'], + # 26.7 ms per inference + 7: ['dumb_yolov5l-int8_segment_0_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_1_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_2_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_3_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_4_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_5_of_6_edgetpu.tflite'], + # 24.4 ms per inference + 8: ['all_segments_yolov5l-int8_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_3_of_4_edgetpu.tflite'], + }, + 'yolov8s_416_640px': { + # 25.6 ms per inference + 3: ['166x_first_seg_yolov8s_416_640px_segment_0_of_2_edgetpu.tflite', '166x_first_seg_yolov8s_416_640px_segment_1_of_2_edgetpu.tflite'], + }, + 'yolov8m_416_640px': { + # 114.4 ms per inference + 2: ['all_segments_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], + # 71.9 ms per inference + 3: ['all_segments_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', 
'all_segments_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite'], + # 53.0 ms per inference + 4: ['2x_first_seg_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite'], + # 43.5 ms per inference + 5: ['166x_first_seg_yolov8m_416_640px_segment_0_of_4_edgetpu.tflite', '166x_first_seg_yolov8m_416_640px_segment_1_of_4_edgetpu.tflite', '166x_first_seg_yolov8m_416_640px_segment_2_of_4_edgetpu.tflite', '166x_first_seg_yolov8m_416_640px_segment_3_of_4_edgetpu.tflite'], + # 31.8 ms per inference + 6: ['2x_first_seg_yolov8m_416_640px_segment_0_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_1_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_2_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_3_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_4_of_5_edgetpu.tflite'], + # 29.5 ms per inference + 7: ['all_segments_yolov8m_416_640px_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_3_of_4_edgetpu.tflite'], + # 26.0 ms per inference + 8: ['all_segments_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite'], + }, + 'yolov8l_416_640px': { + # 169.6 ms per inference + 2: ['all_segments_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], + # 115.8 ms per inference + 3: ['15x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], + # 89.7 ms per inference + 4: ['all_segments_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], + # 77.7 ms per inference + 5: ['4x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], + # 64.2 ms per inference + 6: ['15x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], + # 57.3 ms per inference + 7: ['3x_first_seg_yolov8l_416_640px_segment_0_of_3_edgetpu.tflite', '3x_first_seg_yolov8l_416_640px_segment_1_of_3_edgetpu.tflite', '3x_first_seg_yolov8l_416_640px_segment_2_of_3_edgetpu.tflite'], + # 52.2 ms per inference + 8: ['166x_first_seg_yolov8l_416_640px_segment_0_of_3_edgetpu.tflite', '166x_first_seg_yolov8l_416_640px_segment_1_of_3_edgetpu.tflite', '166x_first_seg_yolov8l_416_640px_segment_2_of_3_edgetpu.tflite'], + }, + 'ipcam-general-v8': { + # 53.4 ms per inference + 2: ['2x_last_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '2x_last_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], + # 24.3 ms per inference + 3: ['all_segments_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', 'all_segments_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], + # 19.9 ms per inference + 4: ['15x_first_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '15x_first_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '15x_first_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'], + # 15.6 ms per inference + 5: ['15x_last_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', 
'15x_last_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'], + # 15.2 ms per inference + 6: ['15x_last_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'], + # 12.3 ms per inference + 7: ['15x_first_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '15x_first_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '15x_first_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'], + # 10.9 ms per inference + 8: ['2x_last_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '2x_last_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '2x_last_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'], + }, + } + + self.tpu_segments_lists = {} + if model_name_pattern in self.MODEL_SEGMENTS: + self.tpu_segments_lists = self.MODEL_SEGMENTS[model_name_pattern] + class Options: @@ -36,123 +175,92 @@ def __init__(self): self.MODEL_SETTINGS = { "yolov8": { # 59.88 ms throughput / 855.40 ms inference - "large": Settings('YOLOv8', 'yolov8l', \ + "large": Settings('YOLOv8', 'yolov8l_416_640px', 'yolov8l_416_640px.tflite', # 46Mb CPU 'yolov8l_416_640px_edgetpu.tflite', # 48Mb TPU - 'coco_labels.txt', - # 54.18 ms throughput / 754.56 ms inference - [['yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', - 'yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], - # 55.79 ms throughput / 824.09 ms inference - ['yolov8l_448px_segment_0_of_3_edgetpu.tflite', - 'yolov8l_448px_segment_1_of_3_edgetpu.tflite', - 'yolov8l_448px_segment_2_of_3_edgetpu.tflite']]), + 'coco_labels.txt'), # 53.72 ms throughput / 762.86 ms inference - "medium": Settings('YOLOv8', 'yolov8m', \ - 'yolov8m-416_640px.tflite', # 21Mb CPU - 'yolov8m-416_640px_edgetpu.tflite', # 22Mb TPU - 'coco_labels.txt', - [['yolov8m__segment_0_of_2_edgetpu.tflite', - 'yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], - # 39.59 ms throughput / 574.83 ms inference - ['yolov8m_416_640px_segment_0_of_4_edgetpu.tflite', - 'yolov8m_416_640px_segment_1_of_4_edgetpu.tflite', - 'yolov8m_416_640px_segment_2_of_4_edgetpu.tflite', - 'yolov8m_416_640px_segment_3_of_4_edgetpu.tflite']]), + "medium": Settings('YOLOv8', 'yolov8m_416_640px', \ + 'yolov8m_416_640px.tflite', # 21Mb CPU + 'yolov8m_416_640px_edgetpu.tflite', # 22Mb TPU + 'coco_labels.txt'), # 21.52 ms throughput / 291.35 ms inference - "small": Settings('YOLOv8', 'yolov8s', \ + "small": Settings('YOLOv8', 'yolov8s_416_640px', 'yolov8s_416_640px.tflite', # 11Mb CPU 'yolov8s_416_640px_edgetpu.tflite', # 12Mb TPU - 'coco_labels.txt', []), + 'coco_labels.txt'), # 10.35 ms throughput / 123.35 ms inference - "tiny": Settings('YOLOv8', 'yolov8n', + "tiny": Settings('YOLOv8', 'yolov8n_416_640px', 'yolov8n_416_640px.tflite', # 4Mb CPU 'yolov8n_416_640px_edgetpu.tflite', # 3Mb TPU - 'coco_labels.txt', []) + 'coco_labels.txt') }, - "yolov5": { - "large": Settings('YOLOv5', 'yolov5l', \ + "large": Settings('YOLOv5', 'yolov5l-int8', 'yolov5l-int8.tflite', # 46Mb CPU 'yolov5l-int8_edgetpu.tflite', # 48Mb TPU - 'coco_labels.txt', - [['yolov5l-int8_edgetpu_segment_0_of_7_edgetpu.tflite', - 'yolov5l-int8_edgetpu_segment_1_of_7_edgetpu.tflite', - 'yolov5l-int8_edgetpu_segment_2_of_7_edgetpu.tflite', - 'yolov5l-int8_edgetpu_segment_3_of_7_edgetpu.tflite', - 'yolov5l-int8_edgetpu_segment_4_of_7_edgetpu.tflite', - 'yolov5l-int8_edgetpu_segment_5_of_7_edgetpu.tflite', - 'yolov5l-int8_edgetpu_segment_6_of_7_edgetpu.tflite']]), - "medium": Settings('YOLOv5', 'yolov5m', \ + 'coco_labels.txt'), + "medium": 
Settings('YOLOv5', 'yolov5m-int8', 'yolov5m-int8.tflite', # 21Mb CPU 'yolov5m-int8_edgetpu.tflite', # 22Mb TPU - 'coco_labels.txt', - [['yolov5m-int8_edgetpu_segment_0_of_4_edgetpu.tflite', - 'yolov5m-int8_edgetpu_segment_1_of_4_edgetpu.tflite', - 'yolov5m-int8_edgetpu_segment_2_of_4_edgetpu.tflite', - 'yolov5m-int8_edgetpu_segment_3_of_4_edgetpu.tflite']]), - "small": Settings('YOLOv5', 'yolov5s', \ + 'coco_labels.txt'), + "small": Settings('YOLOv5', 'yolov5s-int8', 'yolov5s-int8.tflite', # 7Mb CPU 'yolov5s-int8_edgetpu.tflite', # 8Mb TPU - 'coco_labels.txt', []), - "tiny": Settings('YOLOv5', 'yolov5n', \ + 'coco_labels.txt'), + "tiny": Settings('YOLOv5', 'yolov5n-int8', 'yolov5n-int8.tflite', # 2Mb CPU 'yolov5n-int8_edgetpu.tflite', # 2Mb TPU - 'coco_labels.txt', []) + 'coco_labels.txt') }, - "efficientdet-lite": { # Large: EfficientDet-Lite3x 90 objects COCO 640x640x3 2 197.0 ms 43.9% mAP - "large": Settings('EfficientDet-Lite', 'efficientdet_lite3x_640', \ + "large": Settings('EfficientDet-Lite', 'efficientdet_lite3x_640_ptq', \ 'efficientdet_lite3x_640_ptq.tflite', # 14Mb CPU 'efficientdet_lite3x_640_ptq_edgetpu.tflite', # 20Mb TPU - 'coco_labels.txt', - [['efficientdet_lite3x_640_ptq_segment_0_of_3_edgetpu.tflite', - 'efficientdet_lite3x_640_ptq_segment_1_of_3_edgetpu.tflite' - 'efficientdet_lite3x_640_ptq_segment_2_of_3_edgetpu.tflite']]), + 'coco_labels.txt'), # Medium: EfficientDet-Lite3 90 objects 512x512x3 2 107.6 ms 39.4% mAP - "medium": Settings('EfficientDet-Lite', 'efficientdet_lite3_512', \ + "medium": Settings('EfficientDet-Lite', 'efficientdet_lite3_512_ptq', \ 'efficientdet_lite3_512_ptq.tflite', # CPU 'efficientdet_lite3_512_ptq_edgetpu.tflite', # TPU - 'coco_labels.txt', []), + 'coco_labels.txt'), # Small: EfficientDet-Lite2 90 objects COCO 448x448x3 2 104.6 ms 36.0% mAP - "small": Settings('EfficientDet-Lite', 'efficientdet_lite2_448', \ + "small": Settings('EfficientDet-Lite', 'efficientdet_lite2_448_ptq', \ 'efficientdet_lite2_448_ptq.tflite', # 10Mb CPU 'efficientdet_lite2_448_ptq_edgetpu.tflite', # TPU - 'coco_labels.txt', - [['efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', - 'efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite']]), + 'coco_labels.txt'), # Tiny: EfficientDet-Lite1 90 objects COCO 384x384x3 2 56.3 ms 34.3% mAP - "tiny": Settings('EfficientDet-Lite', 'efficientdet_lite1_384', \ + "tiny": Settings('EfficientDet-Lite', 'efficientdet_lite1_384_ptq', \ 'efficientdet_lite1_384_ptq.tflite', # 7Mb CPU 'efficientdet_lite1_384_ptq_edgetpu.tflite', # TPU - 'coco_labels.txt', []) + 'coco_labels.txt') }, "mobilenet ssd": { # Large: SSD/FPN MobileNet V1 90 objects, COCO 640x640x3 TF-lite v2 229.4 ms 31.1% mAP - "large": Settings('MobileNet SSD', 'tf2_ssd_mobilenet_v1_fpn_640', \ + "large": Settings('MobileNet SSD', 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq', \ 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq.tflite', # CPU 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_edgetpu.tflite', # TPU - 'coco_labels.txt', []), + 'coco_labels.txt'), # Medium: SSDLite MobileDet 90 objects, COCO 320x320x3 TF-lite v1 9.1 ms 32.9% mAP "medium": Settings('MobileNet SSD', 'ssdlite_mobiledet_coco_', \ 'ssdlite_mobiledet_coco_qat_postprocess.tflite', # 5Mb CPU 'ssdlite_mobiledet_coco_qat_postprocess_edgetpu.tflite', # TPU - 'coco_labels.txt', []), + 'coco_labels.txt'), # Small: SSD MobileNet V2 90 objects, COCO 300x300x3 TF-lite v2 7.6 ms 22.4% mAP "small": Settings('MobileNet SSD', 'tf2_ssd_mobilenet_v2', \ 'tf2_ssd_mobilenet_v2_coco17_ptq.tflite', # 6.7Mb CPU 
'tf2_ssd_mobilenet_v2_coco17_ptq_edgetpu.tflite', # TPU - 'coco_labels.txt', []), + 'coco_labels.txt'), # Tiny: MobileNet V2 90 objects, COCO 300x300x3 TF-lite v2 Quant "tiny": Settings('MobileNet SSD', 'ssd_mobilenet_v2_coco_', \ 'ssd_mobilenet_v2_coco_quant_postprocess.tflite', # 6.6Mb CPU 'ssd_mobilenet_v2_coco_quant_postprocess_edgetpu.tflite', # TPU - 'coco_labels.txt', []) + 'coco_labels.txt') } } + self.ENABLE_MULTI_TPU = True self.MIN_CONFIDENCE = 0.5 @@ -221,7 +329,7 @@ def set_model(self, model_name): # Normalise input self.model_name = model_name.lower() - if self.model_name not in [ "mobilenet ssd", "efficientdet-lite", "yolov5", "yolov8" ]: # 'yolov5' - no sense including v5 anymore + if self.model_name not in [ "mobilenet ssd", "efficientdet-lite", "yolov5", "yolov8"]: # 'yolov5' - no sense including v5 anymore self.model_name = "mobilenet ssd" self.model_size = self.model_size.lower() @@ -254,5 +362,8 @@ def set_model(self, model_name): self.model_cpu_file = os.path.normpath(os.path.join(self.models_dir, self.cpu_model_name)) self.model_tpu_file = os.path.normpath(os.path.join(self.models_dir, self.tpu_model_name)) self.label_file = os.path.normpath(os.path.join(self.models_dir, self.labels_name)) - self.tpu_segments_lists = [ [os.path.normpath(os.path.join(self.models_dir, name)) for name in name_list] \ - for name_list in settings.tpu_segments_lists ] + + self.tpu_segments_lists = {} + for tpu_cnt, name_list in settings.tpu_segments_lists.items(): + self.tpu_segments_lists[tpu_cnt] = \ + [os.path.normpath(os.path.join(self.models_dir, name)) for name in name_list] diff --git a/src/modules/ObjectDetectionCoral/segment_and_test.py b/src/modules/ObjectDetectionCoral/segment_and_test.py index ed7706b1..72db2542 100644 --- a/src/modules/ObjectDetectionCoral/segment_and_test.py +++ b/src/modules/ObjectDetectionCoral/segment_and_test.py @@ -10,20 +10,24 @@ #'ssd_mobilenet_v2_coco_quant_postprocess', #'ssdlite_mobiledet_coco_qat_postprocess', #'ssd_mobilenet_v1_coco_quant_postprocess', - 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq', + #'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq', #'efficientdet_lite0_320_ptq', #'efficientdet_lite1_384_ptq', - 'efficientdet_lite2_448_ptq', - 'efficientdet_lite3_512_ptq', - 'efficientdet_lite3x_640_ptq', + #'efficientdet_lite2_448_ptq', + #'efficientdet_lite3_512_ptq', + #'efficientdet_lite3x_640_ptq', #'yolov5n-int8', #'yolov5s-int8', - 'yolov5m-int8', - 'yolov5l-int8', + #'yolov5m-int8', + #'yolov5l-int8', #'yolov8n_416_640px', # lg 1st seg - 'yolov8s_416_640px', # lg 1st seg - 'yolov8m_416_640px', # lg 1st seg - 'yolov8l_416_640px', # lg 1st seg + #'yolov8s_416_640px', # lg 1st seg + #'yolov8m_416_640px', # lg 1st seg + #'yolov8l_416_640px', # lg 1st seg + #'yolov8n_640px', + #'yolov8s_640px', + #'yolov8m_640px', # lg 1st seg + #'yolov8l_640px', # lg 1st seg 'ipcam-general-v8'] custom_args = { @@ -178,7 +182,7 @@ 8: ["--partition_search_step","5"]}}#''' seg_dir = "/media/seth/FAT_THUMB/all_segments/" -seg_types = ['', '2x_first_seg/', '15x_first_seg/', '166x_first_seg/', '3x_first_seg/', '4x_first_seg/', '15x_last_seg/', '2x_last_seg/', 'dumb/'] +seg_types = ['', '2x_first_seg/', '15x_first_seg/', '166x_first_seg/', '3x_first_seg/', '4x_first_seg/', 'inc_seg/', 'dumb/'] def seg_exists(filename, segment_type, segment_count): @@ -191,7 +195,7 @@ def seg_exists(filename, segment_type, segment_count): seg_list = [seg_dir+segment_type+filename+'_segment_{}_of_{}_edgetpu.tflite'.format(i, segment_count) for i in range(segment_count)] return 
(seg_list, any([True for s in seg_list if not os.path.exists(s)])) -MAX_TPU_COUNT = 8 +MAX_TPU_COUNT = 4 ''' # Generate segment files @@ -251,6 +255,17 @@ def seg_exists(filename, segment_type, segment_count): # for (auto latency : latencies) { # # sudo make DOCKER_IMAGE="ubuntu:20.04" DOCKER_CPUS="k8" DOCKER_TARGETS="tools" docker-build + + #// Encourage each segment slower than the previous to spread out the bottlenecks + #double latency_adjust = 1.0; + #for (int i = 1; i < num_segments_; ++i) + #{ + # if (latencies[i-1] < latencies[i]) + # latency_adjust *= 0.97; + # latencies[i-1] *= latency_adjust; + #} + #latencies[num_segments_-1] *= latency_adjust; + partition_with_profiling_dir = "libcoral/tools.2" elif '15x_first_seg' in seg_type: partition_with_profiling_dir = "libcoral/tools.15" @@ -266,6 +281,8 @@ def seg_exists(filename, segment_type, segment_count): partition_with_profiling_dir = "libcoral/tools.last15" elif '2x_last_seg' in seg_type: partition_with_profiling_dir = "libcoral/tools.last2" + elif 'inc_seg' == seg_type: + partition_with_profiling_dir = "libcoral/tools.inc_seg" else: partition_with_profiling_dir = "libcoral/tools.orig" @@ -281,7 +298,7 @@ def seg_exists(filename, segment_type, segment_count): subprocess.run(cmd)#''' -seg_types += ['133x_first_seg/'] +seg_types += ['133x_first_seg/', '15x_last_seg/', '2x_last_seg/'] # Test timings fin_timings = {} @@ -293,15 +310,12 @@ def seg_exists(filename, segment_type, segment_count): for num_tpus in range(2,MAX_TPU_COUNT+1): - for seg_type in seg_types+['orig_code']: + for seg_type in seg_types: max_seg = 0 for sn in range(1,num_tpus+1): # Test against orig code - if seg_type == 'orig_code': - exe_file = "/home/seth/CodeProject.AI-Server/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py" - else: - exe_file = "/home/seth/Downloads/coral_module/objectdetection_coral_multitpu.py" + exe_file = "/home/seth/CodeProject.AI-Server/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py" # Get file types seg_list, file_missing = seg_exists(fn, seg_type, sn) @@ -312,19 +326,19 @@ def seg_exists(filename, segment_type, segment_count): cmd = ["python3",exe_file,"--model"] + \ seg_list + ["--labels","coral/pycoral/test_data/coco_labels.txt","--input","/home/seth/coral/pycoral/test_data/grace_hopper.bmp", - "--count","1000","--num-tpus",str(num_tpus)] + "--count","2000","--num-tpus",str(num_tpus)] print(cmd) - c = subprocess.run(cmd, capture_output=True) + c = subprocess.run(cmd, check=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) print(c.stdout) print(c.stderr) - ms_time = float(re.compile(r'threads; ([\d\.]+)ms ea').findall(c.stdout)[0]) + ms_time = float(re.compile(r'threads; ([\d\.]+)ms ea').findall(c.stderr)[0]) timings.append((ms_time, num_tpus, fn, seg_type, sn)) timings = sorted(timings, key=lambda t: t[0]) # Print the top three print(f"TIMINGS FOR {num_tpus} TPUs AND {fn} MODEL:") - for t in range(min(5,len(timings))): + for t in range(min(10,len(timings))): print(timings[t]) # Get best segments, but @@ -345,6 +359,12 @@ def seg_exists(filename, segment_type, segment_count): shutil.copyfile(s, out_fname) fin_fnames[fn][num_tpus].append(out_fname) + # Create archive for this model / TPU count + if any(fin_fnames[fn][num_tpus]): + cmd = ['zip', '-9', f'objectdetection-{fn}-{num_tpus}-edgetpu.zip'] + fin_fnames[fn][num_tpus] + print(cmd) + subprocess.run(cmd) + print(fin_timings) print(fin_fnames) @@ -352,6 +372,6 @@ def seg_exists(filename, segment_type, segment_count): 
for fn, v in fin_fnames.items(): print(" '%s': {" % fn) for tpu_count, out_fnames in v.items(): - print(f" # {fin_timings[fn][tpu_count][0]:6.1f} ms per inference") # assumes 1k test runs + print(f" # {fin_timings[fn][tpu_count][0]:6.1f} ms per inference") print(f" {tpu_count}: "+str(out_fnames)+",") print(" },") diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index 14fc9dbb..57213f53 100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -510,15 +510,10 @@ def _get_model_filenames(self, options: Options, tpu_list: list) -> list: more than one list of segment files then use the list of files that best matches the number of TPUs we have, otherwise use the single list we have. If all else fails return the single TPU filename as a list. - NOTE: This method also updates self.device_count and self.segment_count - based on the choice of whether it uses a single model or a set of - segment file names """ # if TPU no-show then default is CPU self.device_type = 'CPU' - device_count = 1 # CPU. At this point we don't know if we have TPU - segment_count = 1 # Single CPU model file if not any(tpu_list): return [] @@ -526,22 +521,20 @@ def _get_model_filenames(self, options: Options, tpu_list: list) -> list: # If TPU found then default is single TPU model file (no segments) device_count = len(tpu_list) # TPUs. We've at least found one - segment_count = 1 # Single TPU model name at this point - if not any(options.tpu_segments_lists): + if not any(options.tpu_segments_lists) or device_count == 1: return [options.model_tpu_file] # We have a list of segment files - if isinstance(options.tpu_segments_lists[0], list): + if isinstance(options.tpu_segments_lists, dict): # Look for a good match between available TPUs and segment counts - # Prioritize first match - for fname_list in options.tpu_segments_lists: - segment_count = len(fname_list) - if segment_count <= device_count: - return fname_list + # Prioritize first match. Note we have only tested up to 8 TPUs, + # so best performance above that can probably be had by extrapolation. 
+ device_count = min(device_count, 8) + if device_count in options.tpu_segments_lists: + return options.tpu_segments_lists[device_count] else: # Only one list of segments; use it regardless of even match to TPU count - segment_count = len(options.tpu_segments_lists) - if segment_count <= device_count: + if len(options.tpu_segments_lists) <= device_count: return options.tpu_segments_lists # Couldn't find a good fit, use single segment From 43ff9deb9df38f7097998c645c29f69f490f89d1 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Mon, 8 Apr 2024 12:32:36 -0700 Subject: [PATCH 2/9] minor tweaks --- .../objectdetection_coral_multitpu.py | 12 ++++++------ src/modules/ObjectDetectionCoral/tpu_runner.py | 4 +++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py index dde479ac..ea76b6c0 100644 --- a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py +++ b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py @@ -279,17 +279,17 @@ def main(): (tot_infr_time / args.count, thread_cnt, half_wall_time * 1000 / half_infr_count, half_infr_count)) - print('-------RESULTS--------') + logging.info('-------RESULTS--------') if not objs: - print('No objects detected') + logging.info('No objects detected') return if any(objs): for obj in objs: - print(_tpu_runner.labels.get(obj.id, obj.id)) - print(' id: ', obj.id) - print(' score: ', obj.score) - print(' bbox: ', obj.bbox) + logging.info(_tpu_runner.labels.get(obj.id, obj.id)) + logging.info(f' id: {obj.id}') + logging.info(f' score: {obj.score}') + logging.info(f' bbox: {obj.bbox}') if args.output: image = image.convert('RGB') diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index 57213f53..bbf587fc 100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -517,10 +517,12 @@ def _get_model_filenames(self, options: Options, tpu_list: list) -> list: if not any(tpu_list): return [] + device_count = len(tpu_list) # TPUs. We've at least found one self.device_type = 'Multi-TPU' + if device_count == 1: + self.device_type = 'TPU' # If TPU found then default is single TPU model file (no segments) - device_count = len(tpu_list) # TPUs. 
We've at least found one if not any(options.tpu_segments_lists) or device_count == 1: return [options.model_tpu_file] From 9d167d30882d4b06f1896725e191eddb9471b0b9 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Tue, 9 Apr 2024 09:17:59 -0700 Subject: [PATCH 3/9] Better queue balancer --- .../objectdetection_coral_multitpu.py | 2 +- .../ObjectDetectionCoral/tpu_runner.py | 94 +++++++++++++------ 2 files changed, 66 insertions(+), 30 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py index ea76b6c0..14b2073e 100644 --- a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py +++ b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py @@ -275,7 +275,7 @@ def main(): (wall_time * 1000 / args.count, args.count, (time.perf_counter() - start_one) * 1000)) - logging.info('%.2fms avg time blocked across %d threads; %.2fms ea for final %d inferences' % + logging.info('%.2fms avg time blocked across %d threads; %.3fms ea for final %d inferences' % (tot_infr_time / args.count, thread_cnt, half_wall_time * 1000 / half_infr_count, half_infr_count)) diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index bbf587fc..825d1609 100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -269,8 +269,7 @@ def enqueue(self, in_tensor, out_q: queue.Queue): def balance_queues(self): # Don't bother if someone else is working on balancing - if len(self.queues) <= 1 or len(self.tpu_list) <= 2 or \ - len(self.queues) == len(self.tpu_list) or \ + if len(self.queues) <= 1 or len(self.tpu_list) < 2 or \ not self.balance_lock.acquire(blocking=False): return @@ -278,22 +277,26 @@ def eval_timings(interpreter_counts): # How much time are we allocating for each segment time_alloc = [] - for idx in range(len(self.interpreters)): + for seg_i in range(len(self.interpreters)): # Find average runtime for this segment avg_times = [] for interpreters in self.interpreters: - avg_times += [i.timings[idx] / i.exec_count[idx] for i in interpreters if i.exec_count[idx] != 0] + avg_times += [i.timings[seg_i] / i.exec_count[seg_i] for i in interpreters if i.exec_count[seg_i] != 0] if avg_times: avg_time = sum(avg_times) / len(avg_times) else: - return 0, 0, 0.0 + return 0, 0, 0.0, None # Adjust for number of TPUs allocated to it - time_alloc.append(avg_time / interpreter_counts[idx]) + if interpreter_counts[seg_i] > 0: + time_alloc.append(avg_time / interpreter_counts[seg_i]) + else: + # No interpreters result inf time + time_alloc.append(float('inf')) - min_t = 100000000 - min_i = -1 + min_gt1_t = float('inf') + min_gt1_i = -1 max_t = 0 max_i = -1 @@ -306,28 +309,67 @@ def eval_timings(interpreter_counts): # Min time needs to be lengthened so rem an interpreter, # but only if it has more than one interpreter - if t < min_t and len(self.interpreters[i]) > 1: - min_t = t - min_i = i + if t < min_gt1_t and len(self.interpreters[i]) > 1: + min_gt1_t = t + min_gt1_i = i + + # See if we can do better than the current max timing + untried_candidates = [] + for interp_i, interpreters in enumerate(self.interpreters): + # Doesn't make sense to pull a TPU from a queue just to re-add it. 
+ if interp_i == max_i: + continue + # If it hasn't yet been tried for this segment + # (or if it has already found to be faster on this segment) + if any([True for i in interpreters if i.exec_count[max_i] == 0 or max_t-1.0 > i.timings[max_i] / i.exec_count[max_i]]): + untried_candidates.append(interp_i) - return min_i, max_i, max(time_alloc) + return min_gt1_i, max_i, max(time_alloc), untried_candidates[0] if len(untried_candidates) > 0 else None interpreter_counts = [len(i) for i in self.interpreters] - min_i, max_i, current_max = eval_timings(interpreter_counts) + min_i, max_i, current_max, min_untried_i = eval_timings(interpreter_counts) interpreter_counts[min_i] -= 1 interpreter_counts[max_i] += 1 - _, _, new_max = eval_timings(interpreter_counts) + _, _, new_max, _ = eval_timings(interpreter_counts) - # Return if we don't want to swap (+/- 1 ms) + # Return if we don't want to swap if new_max+1.0 >= current_max: - self.balance_lock.release() - return + if min_untried_i is None: + self.balance_lock.release() + return + + # Swap slow segments with faster ones to see if we can run them faster. + # It might be a good way to optimize for heterogenous hardware. + logging.info(f"Re-balancing between queues {min_untried_i} and {max_i}") + + # Stop them + new_max_i = self._rem_interpreter_from(min_untried_i) + new_min_untried_i = self._rem_interpreter_from(max_i) + + # Swap them + new_max_i.start(max_i, self.fbytes_list[max_i]) + self.interpreters[max_i].append(new_max_i) + + new_min_untried_i.start(min_untried_i, self.fbytes_list[min_untried_i]) + self.interpreters[min_untried_i].append(new_min_untried_i) + + else: + logging.info(f"Re-balancing from queue {min_i} to {max_i} (max from {current_max:.2f} to {new_max:.2f})") - logging.info(f"Re-balancing from queue {min_i} to {max_i} (max from {current_max:.2f} to {new_max:.2f})") + realloc_interp = self._rem_interpreter_from(min_i) + # Add to large (too-slow) queue + realloc_interp.start(max_i, self.fbytes_list[max_i]) + self.interpreters[max_i].append(realloc_interp) + + self.balance_lock.release() + self.print_queue_len() + + + def _rem_interpreter_from(self, interp_i): # Sending False kills the processing loop self.rebalancing_lock.acquire() - self.queues[min_i].put(False) + self.queues[interp_i].put(False) # This is ugly, but I can't think of something better # Threads are blocked by queues. 
Queues may not have a stream @@ -338,21 +380,15 @@ def eval_timings(interpreter_counts): # Block & wait realloc_interp = None with self.rebalancing_lock: - for idx, interpreter in enumerate(self.interpreters[min_i]): + for idx, interpreter in enumerate(self.interpreters[interp_i]): if not interpreter.interpreter: - realloc_interp = self.interpreters[min_i].pop(idx) + realloc_interp = self.interpreters[interp_i].pop(idx) break + if not realloc_interp: logging.warning("Unable to find killed interpreter") self.balance_lock.release() - return - - # Add to large (too-slow) queue - realloc_interp.start(max_i, self.fbytes_list[max_i]) - self.interpreters[max_i].append(realloc_interp) - - self.balance_lock.release() - self.print_queue_len() + return realloc_interp def print_queue_len(self): From 8f9580a12261a635ae5923f5868f6f3e500c9d9b Mon Sep 17 00:00:00 2001 From: Seth Price Date: Wed, 10 Apr 2024 20:04:00 -0700 Subject: [PATCH 4/9] Bug fixes --- .../ObjectDetectionCoral/tpu_runner.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index 825d1609..f3ee24d8 100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -239,20 +239,20 @@ def __init__(self, tpu_list: list, fname_list: list): with open(fname, "rb") as fd: self.fbytes_list.append(fd.read()) - self._init_interpreters() + with self.balance_lock: + self._init_interpreters() def _init_interpreters(self): start_boot_time = time.perf_counter_ns() # Fill TPUs with interpreters - with self.balance_lock: - for i, tpu_name in enumerate(self.tpu_list): - seg_idx = i % len(self.fname_list) + for i, tpu_name in enumerate(self.tpu_list): + seg_idx = i % len(self.fname_list) - i = DynamicInterpreter(self.fname_list, tpu_name, self.queues, self.rebalancing_lock) - i.start(seg_idx, self.fbytes_list[seg_idx]) - self.interpreters[seg_idx].append(i) + i = DynamicInterpreter(self.fname_list, tpu_name, self.queues, self.rebalancing_lock) + i.start(seg_idx, self.fbytes_list[seg_idx]) + self.interpreters[seg_idx].append(i) self.first_name = self.interpreters[0][0].input_details[0]['name'] @@ -261,8 +261,9 @@ def _init_interpreters(self): def enqueue(self, in_tensor, out_q: queue.Queue): - if not self.first_name: - self._init_interpreters() + with self.balance_lock: + if not self.first_name: + self._init_interpreters() self.queues[0].put(({self.first_name: in_tensor}, out_q)) @@ -500,11 +501,11 @@ def __init__(self, tpu_limit: int = -1): def _watchdog(self): self.watchdog_time = time.time() while not self.watchdog_shutdown: - if self.pipe and \ + if self.pipe and self.pipe.first_name is None and \ time.time() - self.watchdog_time > self.max_idle_secs_before_recycle: logging.warning("No work in {} seconds, watchdog shutting down TPUs.".format(self.max_idle_secs_before_recycle)) self.runner_lock.acquire(timeout=MAX_WAIT_TIME) - if self.pipe.first_name: + if self.pipe: self.pipe.delete() self.runner_lock.release() # Pipeline will reinitialize itself as needed From 77d86a1f100385701f4feecfb0aad3b2b4e84067 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Thu, 11 Apr 2024 09:11:40 -0700 Subject: [PATCH 5/9] TTL for rebalancing --- src/modules/ObjectDetectionCoral/tpu_runner.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index f3ee24d8..6aef596d 
100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -212,18 +212,18 @@ def __init__(self, tpu_list: list, fname_list: list): self.max_pipeline_queue_length = MAX_PIPELINE_QUEUE_LEN - self.fname_list = fname_list - self.tpu_list = tpu_list - self.interpreters = [[] for i in range(seg_count)] + self.fname_list = fname_list + self.tpu_list = tpu_list + self.interpreters = [[] for i in range(seg_count)] # Input queues for each segment; if we go over maxsize, something went wrong self.queues = [queue.Queue(maxsize=self.max_pipeline_queue_length) for i in range(seg_count)] # Lock for internal reorganization - self.balance_lock = threading.Lock() + self.balance_lock = threading.Lock() # Lock for interpreter use - self.rebalancing_lock = threading.Lock() + self.rebalancing_lock = threading.Lock() # Read file data self.fbytes_list = [] @@ -244,6 +244,7 @@ def __init__(self, tpu_list: list, fname_list: list): def _init_interpreters(self): + self.balance_ttl = len(self.tpu_list) * 2 start_boot_time = time.perf_counter_ns() # Fill TPUs with interpreters @@ -270,7 +271,7 @@ def enqueue(self, in_tensor, out_q: queue.Queue): def balance_queues(self): # Don't bother if someone else is working on balancing - if len(self.queues) <= 1 or len(self.tpu_list) < 2 or \ + if len(self.queues) <= 1 or len(self.tpu_list) < 2 or self.balance_ttl <= 0 or \ not self.balance_lock.acquire(blocking=False): return @@ -363,6 +364,7 @@ def eval_timings(interpreter_counts): realloc_interp.start(max_i, self.fbytes_list[max_i]) self.interpreters[max_i].append(realloc_interp) + self.balance_ttl -= 1 self.balance_lock.release() self.print_queue_len() From 753310fc41552c431bdcc714d67a622b1c0462fc Mon Sep 17 00:00:00 2001 From: Seth Price Date: Thu, 11 Apr 2024 20:00:50 -0700 Subject: [PATCH 6/9] Code flow --- .../ObjectDetectionCoral/tpu_runner.py | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index 6aef596d..8f44bae5 100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -323,7 +323,7 @@ def eval_timings(interpreter_counts): continue # If it hasn't yet been tried for this segment # (or if it has already found to be faster on this segment) - if any([True for i in interpreters if i.exec_count[max_i] == 0 or max_t-1.0 > i.timings[max_i] / i.exec_count[max_i]]): + if any([True for i in interpreters if i.exec_count[max_i] < 10 or max_t-0.1 > i.timings[max_i] / i.exec_count[max_i]]): untried_candidates.append(interp_i) return min_gt1_i, max_i, max(time_alloc), untried_candidates[0] if len(untried_candidates) > 0 else None @@ -334,12 +334,17 @@ def eval_timings(interpreter_counts): interpreter_counts[max_i] += 1 _, _, new_max, _ = eval_timings(interpreter_counts) - # Return if we don't want to swap - if new_max+1.0 >= current_max: - if min_untried_i is None: - self.balance_lock.release() - return + if new_max+1.0 < current_max: + # Allocate more TPUs to slow segments + logging.info(f"Re-balancing from queue {min_i} to {max_i} (max from {current_max:.2f} to {new_max:.2f})") + realloc_interp = self._rem_interpreter_from(min_i) + + # Add to large (too-slow) queue + realloc_interp.start(max_i, self.fbytes_list[max_i]) + self.interpreters[max_i].append(realloc_interp) + + elif min_untried_i is not None: # Swap slow segments with faster ones to see if we can run them faster. 
# It might be a good way to optimize for heterogenous hardware. logging.info(f"Re-balancing between queues {min_untried_i} and {max_i}") @@ -355,14 +360,12 @@ def eval_timings(interpreter_counts): new_min_untried_i.start(min_untried_i, self.fbytes_list[min_untried_i]) self.interpreters[min_untried_i].append(new_min_untried_i) + # FIXME: After we have TPUs evaluated and otherwise balanced, we could + # further optimize by ensuring the slowest segment doesn't contain any slow TPUs. else: - logging.info(f"Re-balancing from queue {min_i} to {max_i} (max from {current_max:.2f} to {new_max:.2f})") - - realloc_interp = self._rem_interpreter_from(min_i) - - # Add to large (too-slow) queue - realloc_interp.start(max_i, self.fbytes_list[max_i]) - self.interpreters[max_i].append(realloc_interp) + # Return if we don't want to swap + self.balance_lock.release() + return self.balance_ttl -= 1 self.balance_lock.release() From fc935117976ab3f7a3705b3ca83e1302ab2de026 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Thu, 11 Apr 2024 23:49:52 -0700 Subject: [PATCH 7/9] Tweaks to TPU balancer --- .../ObjectDetectionCoral/tpu_runner.py | 152 ++++++++++-------- 1 file changed, 82 insertions(+), 70 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index 8f44bae5..c1c9b728 100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -243,7 +243,7 @@ def __init__(self, tpu_list: list, fname_list: list): self._init_interpreters() def _init_interpreters(self): - + # Set a Time To Live for balancing so we don't swap for inf in corner cases self.balance_ttl = len(self.tpu_list) * 2 start_boot_time = time.perf_counter_ns() @@ -269,73 +269,87 @@ def enqueue(self, in_tensor, out_q: queue.Queue): self.queues[0].put(({self.first_name: in_tensor}, out_q)) - def balance_queues(self): - # Don't bother if someone else is working on balancing - if len(self.queues) <= 1 or len(self.tpu_list) < 2 or self.balance_ttl <= 0 or \ - not self.balance_lock.acquire(blocking=False): - return + def _eval_timings(interpreter_counts): + # How much time are we allocating for each segment + time_alloc = [] - def eval_timings(interpreter_counts): - # How much time are we allocating for each segment - time_alloc = [] + for seg_i in range(len(self.interpreters)): + # Find average runtime for this segment + avg_times = [] + for interpreters in self.interpreters: + avg_times += [i.timings[seg_i] / i.exec_count[seg_i] for i in interpreters if i.exec_count[seg_i] != 0] - for seg_i in range(len(self.interpreters)): - # Find average runtime for this segment - avg_times = [] - for interpreters in self.interpreters: - avg_times += [i.timings[seg_i] / i.exec_count[seg_i] for i in interpreters if i.exec_count[seg_i] != 0] + if avg_times: + avg_time = sum(avg_times) / len(avg_times) + else: + return 0, 0, 0.0, None - if avg_times: - avg_time = sum(avg_times) / len(avg_times) - else: - return 0, 0, 0.0, None + # Adjust for number of TPUs allocated to it + if interpreter_counts[seg_i] > 0: + time_alloc.append(avg_time / interpreter_counts[seg_i]) + else: + # No interpreters result inf time + time_alloc.append(float('inf')) + + min_gt1_t = float('inf') + min_gt1_i = -1 + max_t = 0 + max_i = -1 + + # Find segments that maybe should swap + for i, t in enumerate(time_alloc): + # Max time needs to be shortened so add an interpreter. 
+ if t > max_t: + max_t = t + max_i = i + + # Min time needs to be lengthened so rem an interpreter, + # but only if it has more than one interpreter + if t < min_gt1_t and len(self.interpreters[i]) > 1: + min_gt1_t = t + min_gt1_i = i + + # See if we can do better than the current max timing with swapping + swap_i = None + for interp_i, interpreters in enumerate(self.interpreters): + # Doesn't make sense to pull a TPU from a queue just to re-add it. + if interp_i == max_i: + continue - # Adjust for number of TPUs allocated to it - if interpreter_counts[seg_i] > 0: - time_alloc.append(avg_time / interpreter_counts[seg_i]) - else: - # No interpreters result inf time - time_alloc.append(float('inf')) - - min_gt1_t = float('inf') - min_gt1_i = -1 - max_t = 0 - max_i = -1 - - # Find segments that maybe should swap - for i, t in enumerate(time_alloc): - # Max time needs to be shortened so add an interpreter. - if t > max_t: - max_t = t - max_i = i - - # Min time needs to be lengthened so rem an interpreter, - # but only if it has more than one interpreter - if t < min_gt1_t and len(self.interpreters[i]) > 1: - min_gt1_t = t - min_gt1_i = i - - # See if we can do better than the current max timing - untried_candidates = [] - for interp_i, interpreters in enumerate(self.interpreters): - # Doesn't make sense to pull a TPU from a queue just to re-add it. - if interp_i == max_i: - continue - # If it hasn't yet been tried for this segment - # (or if it has already found to be faster on this segment) - if any([True for i in interpreters if i.exec_count[max_i] < 10 or max_t-0.1 > i.timings[max_i] / i.exec_count[max_i]]): - untried_candidates.append(interp_i) + # Test all TPUs in this segment + for i in interpreters: + # Only calc valid time after a few runs + new_max_t = 0 + if i.exec_count[max_i] > 10: + new_max_t = i.timings[max_i] / i.exec_count[max_i] + new_swap_t = 0 + if i.exec_count[interp_i] > 10: + new_swap_t = i.timings[interp_i] / i.exec_count[interp_i] + + # If it hasn't yet been tried for this segment or + # If it has already found to be faster on this segment + # and we aren't making the other segment the new worst. 
+                if i.exec_count[max_i] < 10 or (max_t > new_max_t and max_t > new_swap_t):
+                    swap_i = interp_i
+                    break
+
+        return min_gt1_i, max_i, max(time_alloc), swap_i
+
+
+    def balance_queues(self):
+        # Don't bother if someone else is working on balancing
+        if len(self.queues) <= 1 or len(self.tpu_list) < 2 or self.balance_ttl <= 0 or \
+           not self.balance_lock.acquire(blocking=False):
+            return
 
         interpreter_counts = [len(i) for i in self.interpreters]
-        min_i, max_i, current_max, min_untried_i = eval_timings(interpreter_counts)
+        min_i, max_i, current_max, swap_i = self._eval_timings(interpreter_counts)
         interpreter_counts[min_i] -= 1
         interpreter_counts[max_i] += 1
-        _, _, new_max, _ = eval_timings(interpreter_counts)
+        _, _, new_max, _ = self._eval_timings(interpreter_counts)
 
         if new_max+1.0 < current_max:
-            # Allocate more TPUs to slow segments
+            # 1st Priority: Allocate more TPUs to slow segments
             logging.info(f"Re-balancing from queue {min_i} to {max_i} (max from {current_max:.2f} to {new_max:.2f})")
 
             realloc_interp = self._rem_interpreter_from(min_i)
@@ -344,24 +358,22 @@ def eval_timings(interpreter_counts):
             realloc_interp.start(max_i, self.fbytes_list[max_i])
             self.interpreters[max_i].append(realloc_interp)
 
-        elif min_untried_i is not None:
-            # Swap slow segments with faster ones to see if we can run them faster.
-            # It might be a good way to optimize for heterogenous hardware.
-            logging.info(f"Re-balancing between queues {min_untried_i} and {max_i}")
+        elif swap_i is not None:
+            # 2nd Priority: Swap slow segments with faster ones to see if we can
+            # run them faster. Hopefully still a good way to optimize for
+            # heterogeneous hardware.
+            logging.info(f"Re-balancing between queues {swap_i} and {max_i}")
 
             # Stop them
-            new_max_i = self._rem_interpreter_from(min_untried_i)
-            new_min_untried_i = self._rem_interpreter_from(max_i)
+            new_max = self._rem_interpreter_from(swap_i)
+            new_swap = self._rem_interpreter_from(max_i)
 
             # Swap them
-            new_max_i.start(max_i, self.fbytes_list[max_i])
-            self.interpreters[max_i].append(new_max_i)
-
-            new_min_untried_i.start(min_untried_i, self.fbytes_list[min_untried_i])
-            self.interpreters[min_untried_i].append(new_min_untried_i)
+            new_max.start(max_i, self.fbytes_list[max_i])
+            self.interpreters[max_i].append(new_max)
 
-            # FIXME: After we have TPUs evaluated and otherwise balanced, we could
-            # further optimize by ensuring the slowest segment doesn't contain any slow TPUs.
+            new_swap.start(swap_i, self.fbytes_list[swap_i])
+            self.interpreters[swap_i].append(new_swap)
         else:
             # Return if we don't want to swap
             self.balance_lock.release()
             return

From 1b16adab1df5bcada9a59566c0cde24e940b3fe3 Mon Sep 17 00:00:00 2001
From: Seth Price
Date: Fri, 12 Apr 2024 00:05:20 -0700
Subject: [PATCH 8/9] Adjust TPU swap logic

---
 .../ObjectDetectionCoral/tpu_runner.py        | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py
index c1c9b728..0c2bca16 100644
--- a/src/modules/ObjectDetectionCoral/tpu_runner.py
+++ b/src/modules/ObjectDetectionCoral/tpu_runner.py
@@ -272,12 +272,13 @@ def enqueue(self, in_tensor, out_q: queue.Queue):
     def _eval_timings(interpreter_counts):
         # How much time are we allocating for each segment
         time_alloc = []
+        VALID_CNT_THRESH = 50
 
         for seg_i in range(len(self.interpreters)):
             # Find average runtime for this segment
             avg_times = []
             for interpreters in self.interpreters:
-                avg_times += [i.timings[seg_i] / i.exec_count[seg_i] for i in interpreters if i.exec_count[seg_i] != 0]
+                avg_times += [i.timings[seg_i] / i.exec_count[seg_i] for i in interpreters if i.exec_count[seg_i] > VALID_CNT_THRESH]
 
             if avg_times:
                 avg_time = sum(avg_times) / len(avg_times)
@@ -293,7 +294,7 @@ def _eval_timings(interpreter_counts):
 
         min_gt1_t = float('inf')
         min_gt1_i = -1
-        max_t = 0
+        max_t = 0.0
         max_i = -1
 
         # Find segments that maybe should swap
@@ -309,6 +310,10 @@ def _eval_timings(interpreter_counts):
                 min_gt1_t = t
                 min_gt1_i = i
 
+        # Only eval swapping max segment if we have many samples
+        if VALID_CNT_THRESH > sum([i.exec_count[max_i] for i in self.interpreters[max_i]]):
+            return min_gt1_i, max_i, max(time_alloc), None
+
         # See if we can do better than the current max timing with swapping
         swap_i = None
         for interp_i, interpreters in enumerate(self.interpreters):
@@ -319,17 +324,17 @@ def _eval_timings(interpreter_counts):
             # Test all TPUs in this segment
             for i in interpreters:
                 # Only calc valid time after a few runs
-                new_max_t = 0
-                if i.exec_count[max_i] > 10:
+                new_max_t = 0.0
+                if i.exec_count[max_i] > VALID_CNT_THRESH:
                     new_max_t = i.timings[max_i] / i.exec_count[max_i]
-                new_swap_t = 0
-                if i.exec_count[interp_i] > 10:
+                new_swap_t = 0.0
+                if i.exec_count[interp_i] > VALID_CNT_THRESH:
                     new_swap_t = i.timings[interp_i] / i.exec_count[interp_i]
 
                 # If it hasn't yet been tried for this segment or
                 # If it has already found to be faster on this segment
                 # and we aren't making the other segment the new worst.
- if i.exec_count[max_i] < 10 or (max_t > new_max_t and max_t > new_swap_t): + if i.exec_count[max_i] < VALID_CNT_THRESH or (max_t > new_max_t and max_t > new_swap_t): swap_i = interp_i break From 2a6f862d230e57001f1cab7f0eabe40997903c95 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Fri, 12 Apr 2024 16:10:43 -0700 Subject: [PATCH 9/9] Better tpu balancing --- .../objectdetection_coral_multitpu.py | 2 +- .../ObjectDetectionCoral/tpu_runner.py | 35 ++++++++++++------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py index 14b2073e..d6b9cdf0 100644 --- a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py +++ b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py @@ -244,7 +244,7 @@ def main(): tot_infr_time += infr_time # Start a timer for the last ~half of the run for more accurate benchmark - if chunk_i > (args.count-1) / 3.0: + if chunk_i > (args.count-1) / 2.0: half_infr_count += 1 if half_wall_start is None: half_wall_start = time.perf_counter() diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index 0c2bca16..19647516 100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -243,7 +243,7 @@ def __init__(self, tpu_list: list, fname_list: list): self._init_interpreters() def _init_interpreters(self): - # Set a Time To Live for balancing so we don't swap for inf in corner cases + # Set a Time To Live for balancing so we don't thrash self.balance_ttl = len(self.tpu_list) * 2 start_boot_time = time.perf_counter_ns() @@ -269,7 +269,7 @@ def enqueue(self, in_tensor, out_q: queue.Queue): self.queues[0].put(({self.first_name: in_tensor}, out_q)) - def _eval_timings(interpreter_counts): + def _eval_timings(self, interpreter_counts): # How much time are we allocating for each segment time_alloc = [] VALID_CNT_THRESH = 50 @@ -310,12 +310,17 @@ def _eval_timings(interpreter_counts): min_gt1_t = t min_gt1_i = i - # Only eval swapping max segment if we have many samples - if VALID_CNT_THRESH > sum([i.exec_count[max_i] for i in self.interpreters[max_i]]): - return min_gt1_i, max_i, max(time_alloc), None + # Only eval swapping max time segment if we have many samples in the current setup + for i in self.interpreters[max_i]: + if i.exec_count[max_i] < VALID_CNT_THRESH: + return min_gt1_i, max_i, max(time_alloc), None - # See if we can do better than the current max timing with swapping + # Undo avg interp count adjustment for TPU-to-TPU comparisons + max_t = max([i.timings[max_i] / i.exec_count[max_i] for i in self.interpreters[max_i]]) + + # See if we can do better than the current max time by swapping segments between TPUs swap_i = None + swap_t = float('inf') for interp_i, interpreters in enumerate(self.interpreters): # Doesn't make sense to pull a TPU from a queue just to re-add it. if interp_i == max_i: @@ -323,6 +328,10 @@ def _eval_timings(interpreter_counts): # Test all TPUs in this segment for i in interpreters: + # If TPU hasn't yet been tried for this segment or ... 
+                if i.exec_count[max_i] < VALID_CNT_THRESH:
+                    return min_gt1_i, max_i, max(time_alloc), interp_i
+
                 # Only calc valid time after a few runs
                 new_max_t = 0.0
                 if i.exec_count[max_i] > VALID_CNT_THRESH:
@@ -330,13 +339,13 @@ def _eval_timings(interpreter_counts):
                 new_swap_t = 0.0
                 if i.exec_count[interp_i] > VALID_CNT_THRESH:
                     new_swap_t = i.timings[interp_i] / i.exec_count[interp_i]
-
-                # If it hasn't yet been tried for this segment or
-                # If it has already found to be faster on this segment
-                # and we aren't making the other segment the new worst.
-                if i.exec_count[max_i] < VALID_CNT_THRESH or (max_t > new_max_t and max_t > new_swap_t):
+
+                # If this TPU has already been found to be faster on this segment
+                # and we aren't making the other segment the new worst
+                # and we are choosing the best available candidate.
+                if max_t-0.5 > new_max_t and max_t > new_swap_t and swap_t > new_max_t:
                     swap_i = interp_i
-                    break
+                    swap_t = new_max_t
 
         return min_gt1_i, max_i, max(time_alloc), swap_i
 
@@ -367,7 +376,7 @@ def balance_queues(self):
             # 2nd Priority: Swap slow segments with faster ones to see if we can
             # run them faster. Hopefully still a good way to optimize for
             # heterogeneous hardware.
-            logging.info(f"Re-balancing between queues {swap_i} and {max_i}")
+            logging.info(f"Auto-tuning between queues {swap_i} and {max_i}")
 
             # Stop them
             new_max = self._rem_interpreter_from(swap_i)
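
A note on how the selection side of this series fits together: PATCH 1 keys the segment-file lists in options.py by TPU count, and _get_model_filenames() in tpu_runner.py picks the list matching the number of TPUs found (clamped to the 8-TPU ceiling the benchmarks cover), falling back to the single unsegmented model otherwise. A minimal sketch of that lookup follows; pick_model_files() and the two-entry table are illustrative stand-ins, not part of the module (the real data lives in Settings.MODEL_SEGMENTS):

    import os

    def pick_model_files(models_dir, segments_by_count, device_count, single_model):
        # Benchmarks in this series only go up to 8 TPUs, so clamp the count;
        # larger counts reuse the 8-TPU segmentation, per the extrapolation
        # comment in _get_model_filenames().
        device_count = min(device_count, 8)
        if device_count > 1 and device_count in segments_by_count:
            names = segments_by_count[device_count]
        else:
            # No tuned segment set for this TPU count: use the unsegmented model.
            names = [single_model]
        return [os.path.normpath(os.path.join(models_dir, n)) for n in names]

    # Hypothetical two-entry table standing in for Settings.MODEL_SEGMENTS.
    segments_by_count = {
        2: ['model_segment_0_of_2_edgetpu.tflite',
            'model_segment_1_of_2_edgetpu.tflite'],
        3: ['model_segment_0_of_3_edgetpu.tflite',
            'model_segment_1_of_3_edgetpu.tflite',
            'model_segment_2_of_3_edgetpu.tflite'],
    }

    # Three TPUs get the three-segment pipeline; one TPU, or a count with no
    # tuned entry, degrades to the single edgetpu model, matching the fallback.
    print(pick_model_files('assets', segments_by_count, 3, 'model_edgetpu.tflite'))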