From 6e630e90c48c80ecbed059afabd5109103f44e81 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Mon, 8 Apr 2024 12:04:42 -0700 Subject: [PATCH 1/9] Use new TPU segment option --- .../objectdetection_coral_multitpu.py | 4 +- src/modules/ObjectDetectionCoral/options.py | 247 +++++++++++++----- .../ObjectDetectionCoral/segment_and_test.py | 64 +++-- .../ObjectDetectionCoral/tpu_runner.py | 23 +- 4 files changed, 231 insertions(+), 107 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py index af1a737b..dde479ac 100644 --- a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py +++ b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py @@ -271,11 +271,11 @@ def main(): if half_wall_start is not None: half_wall_time = time.perf_counter() - half_wall_start - print('completed one run every %.2fms for %d runs; %.2fms wall time for a single run' % + logging.info('completed one run every %.2fms for %d runs; %.2fms wall time for a single run' % (wall_time * 1000 / args.count, args.count, (time.perf_counter() - start_one) * 1000)) - print('%.2fms avg time blocked across %d threads; %.2fms ea for final %d inferences' % + logging.info('%.2fms avg time blocked across %d threads; %.2fms ea for final %d inferences' % (tot_infr_time / args.count, thread_cnt, half_wall_time * 1000 / half_infr_count, half_infr_count)) diff --git a/src/modules/ObjectDetectionCoral/options.py b/src/modules/ObjectDetectionCoral/options.py index 676dd4e2..14a036cf 100644 --- a/src/modules/ObjectDetectionCoral/options.py +++ b/src/modules/ObjectDetectionCoral/options.py @@ -12,13 +12,152 @@ def getEnvVariable(a, b): class Settings: def __init__(self, model_name: str, model_name_pattern: str, std_model_name: str, - tpu_model_name: str, labels_name: str, tpu_segments_lists): + tpu_model_name: str, labels_name: str): self.model_name = model_name self.model_name_pattern = model_name_pattern self.cpu_model_name = std_model_name self.tpu_model_name = tpu_model_name self.labels_name = labels_name - self.tpu_segments_lists = tpu_segments_lists + + self.MODEL_SEGMENTS = { + 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq': { + # 104.2 ms per inference + 2: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], + # 67.5 ms per inference + 3: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'], + # 49.1 ms per inference + 4: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], + # 43.5 ms per inference + 5: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'], + # 37.0 ms per inference + 6: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 
'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'], + # 31.1 ms per inference + 7: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'], + # 27.1 ms per inference + 8: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'], + }, + 'efficientdet_lite2_448_ptq': { + # 32.1 ms per inference + 2: ['all_segments_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], + # 19.5 ms per inference + 3: ['166x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '166x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], + # 16.5 ms per inference + 4: ['15x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_3_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_3_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_2_of_3_edgetpu.tflite'], + # 13.6 ms per inference + 5: ['15x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], + # 11.5 ms per inference + 7: ['166x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '166x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], + # 11.3 ms per inference + 8: ['15x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'], + }, + 'efficientdet_lite3_512_ptq': { + # 20.9 ms per inference + 4: ['15x_last_seg_efficientdet_lite3_512_ptq_segment_0_of_2_edgetpu.tflite', '15x_last_seg_efficientdet_lite3_512_ptq_segment_1_of_2_edgetpu.tflite'], + }, + 'efficientdet_lite3x_640_ptq': { + # 95.0 ms per inference + 2: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], + # 70.6 ms per inference + 3: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_2_of_3_edgetpu.tflite'], + # 47.9 ms per inference + 4: ['2x_first_seg_efficientdet_lite3x_640_ptq_segment_0_of_3_edgetpu.tflite', '2x_first_seg_efficientdet_lite3x_640_ptq_segment_1_of_3_edgetpu.tflite', '2x_first_seg_efficientdet_lite3x_640_ptq_segment_2_of_3_edgetpu.tflite'], + # 38.7 ms per inference + 5: ['15x_first_seg_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], + # 35.1 ms per inference + 6: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], + # 30.6 ms per inference + 7: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], + # 27.3 ms per inference + 8: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'], + }, + 'yolov5m-int8': { + # 56.3 ms per inference + 2: 
['all_segments_yolov5m-int8_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], + # 32.2 ms per inference + 3: ['15x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], + # 25.9 ms per inference + 4: ['2x_last_seg_yolov5m-int8_segment_0_of_4_edgetpu.tflite', '2x_last_seg_yolov5m-int8_segment_1_of_4_edgetpu.tflite', '2x_last_seg_yolov5m-int8_segment_2_of_4_edgetpu.tflite', '2x_last_seg_yolov5m-int8_segment_3_of_4_edgetpu.tflite'], + # 21.2 ms per inference + 5: ['all_segments_yolov5m-int8_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_2_edgetpu.tflite'], + # 18.8 ms per inference + 6: ['15x_last_seg_yolov5m-int8_segment_0_of_3_edgetpu.tflite', '15x_last_seg_yolov5m-int8_segment_1_of_3_edgetpu.tflite', '15x_last_seg_yolov5m-int8_segment_2_of_3_edgetpu.tflite'], + # 14.7 ms per inference + 7: ['all_segments_yolov5m-int8_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_3_of_4_edgetpu.tflite'], + # 14.6 ms per inference + 8: ['all_segments_yolov5m-int8_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_2_of_3_edgetpu.tflite'], + }, + 'yolov5l-int8': { + # 61.1 ms per inference + 3: ['all_segments_yolov5l-int8_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_3_edgetpu.tflite'], + # 48.0 ms per inference + 4: ['all_segments_yolov5l-int8_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_3_of_4_edgetpu.tflite'], + # 39.0 ms per inference + 5: ['all_segments_yolov5l-int8_segment_0_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_3_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_4_of_5_edgetpu.tflite'], + # 31.5 ms per inference + 6: ['all_segments_yolov5l-int8_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_3_edgetpu.tflite'], + # 26.7 ms per inference + 7: ['dumb_yolov5l-int8_segment_0_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_1_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_2_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_3_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_4_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_5_of_6_edgetpu.tflite'], + # 24.4 ms per inference + 8: ['all_segments_yolov5l-int8_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_3_of_4_edgetpu.tflite'], + }, + 'yolov8s_416_640px': { + # 25.6 ms per inference + 3: ['166x_first_seg_yolov8s_416_640px_segment_0_of_2_edgetpu.tflite', '166x_first_seg_yolov8s_416_640px_segment_1_of_2_edgetpu.tflite'], + }, + 'yolov8m_416_640px': { + # 114.4 ms per inference + 2: ['all_segments_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], + # 71.9 ms per inference + 3: ['all_segments_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', 
'all_segments_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite'], + # 53.0 ms per inference + 4: ['2x_first_seg_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite'], + # 43.5 ms per inference + 5: ['166x_first_seg_yolov8m_416_640px_segment_0_of_4_edgetpu.tflite', '166x_first_seg_yolov8m_416_640px_segment_1_of_4_edgetpu.tflite', '166x_first_seg_yolov8m_416_640px_segment_2_of_4_edgetpu.tflite', '166x_first_seg_yolov8m_416_640px_segment_3_of_4_edgetpu.tflite'], + # 31.8 ms per inference + 6: ['2x_first_seg_yolov8m_416_640px_segment_0_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_1_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_2_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_3_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_4_of_5_edgetpu.tflite'], + # 29.5 ms per inference + 7: ['all_segments_yolov8m_416_640px_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_3_of_4_edgetpu.tflite'], + # 26.0 ms per inference + 8: ['all_segments_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite'], + }, + 'yolov8l_416_640px': { + # 169.6 ms per inference + 2: ['all_segments_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], + # 115.8 ms per inference + 3: ['15x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], + # 89.7 ms per inference + 4: ['all_segments_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], + # 77.7 ms per inference + 5: ['4x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], + # 64.2 ms per inference + 6: ['15x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], + # 57.3 ms per inference + 7: ['3x_first_seg_yolov8l_416_640px_segment_0_of_3_edgetpu.tflite', '3x_first_seg_yolov8l_416_640px_segment_1_of_3_edgetpu.tflite', '3x_first_seg_yolov8l_416_640px_segment_2_of_3_edgetpu.tflite'], + # 52.2 ms per inference + 8: ['166x_first_seg_yolov8l_416_640px_segment_0_of_3_edgetpu.tflite', '166x_first_seg_yolov8l_416_640px_segment_1_of_3_edgetpu.tflite', '166x_first_seg_yolov8l_416_640px_segment_2_of_3_edgetpu.tflite'], + }, + 'ipcam-general-v8': { + # 53.4 ms per inference + 2: ['2x_last_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '2x_last_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], + # 24.3 ms per inference + 3: ['all_segments_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', 'all_segments_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'], + # 19.9 ms per inference + 4: ['15x_first_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '15x_first_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '15x_first_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'], + # 15.6 ms per inference + 5: ['15x_last_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', 
'15x_last_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'], + # 15.2 ms per inference + 6: ['15x_last_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'], + # 12.3 ms per inference + 7: ['15x_first_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '15x_first_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '15x_first_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'], + # 10.9 ms per inference + 8: ['2x_last_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '2x_last_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '2x_last_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'], + }, + } + + self.tpu_segments_lists = {} + if model_name_pattern in self.MODEL_SEGMENTS: + self.tpu_segments_lists = self.MODEL_SEGMENTS[model_name_pattern] + class Options: @@ -36,123 +175,92 @@ def __init__(self): self.MODEL_SETTINGS = { "yolov8": { # 59.88 ms throughput / 855.40 ms inference - "large": Settings('YOLOv8', 'yolov8l', \ + "large": Settings('YOLOv8', 'yolov8l_416_640px', 'yolov8l_416_640px.tflite', # 46Mb CPU 'yolov8l_416_640px_edgetpu.tflite', # 48Mb TPU - 'coco_labels.txt', - # 54.18 ms throughput / 754.56 ms inference - [['yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', - 'yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'], - # 55.79 ms throughput / 824.09 ms inference - ['yolov8l_448px_segment_0_of_3_edgetpu.tflite', - 'yolov8l_448px_segment_1_of_3_edgetpu.tflite', - 'yolov8l_448px_segment_2_of_3_edgetpu.tflite']]), + 'coco_labels.txt'), # 53.72 ms throughput / 762.86 ms inference - "medium": Settings('YOLOv8', 'yolov8m', \ - 'yolov8m-416_640px.tflite', # 21Mb CPU - 'yolov8m-416_640px_edgetpu.tflite', # 22Mb TPU - 'coco_labels.txt', - [['yolov8m__segment_0_of_2_edgetpu.tflite', - 'yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'], - # 39.59 ms throughput / 574.83 ms inference - ['yolov8m_416_640px_segment_0_of_4_edgetpu.tflite', - 'yolov8m_416_640px_segment_1_of_4_edgetpu.tflite', - 'yolov8m_416_640px_segment_2_of_4_edgetpu.tflite', - 'yolov8m_416_640px_segment_3_of_4_edgetpu.tflite']]), + "medium": Settings('YOLOv8', 'yolov8m_416_640px', \ + 'yolov8m_416_640px.tflite', # 21Mb CPU + 'yolov8m_416_640px_edgetpu.tflite', # 22Mb TPU + 'coco_labels.txt'), # 21.52 ms throughput / 291.35 ms inference - "small": Settings('YOLOv8', 'yolov8s', \ + "small": Settings('YOLOv8', 'yolov8s_416_640px', 'yolov8s_416_640px.tflite', # 11Mb CPU 'yolov8s_416_640px_edgetpu.tflite', # 12Mb TPU - 'coco_labels.txt', []), + 'coco_labels.txt'), # 10.35 ms throughput / 123.35 ms inference - "tiny": Settings('YOLOv8', 'yolov8n', + "tiny": Settings('YOLOv8', 'yolov8n_416_640px', 'yolov8n_416_640px.tflite', # 4Mb CPU 'yolov8n_416_640px_edgetpu.tflite', # 3Mb TPU - 'coco_labels.txt', []) + 'coco_labels.txt') }, - "yolov5": { - "large": Settings('YOLOv5', 'yolov5l', \ + "large": Settings('YOLOv5', 'yolov5l-int8', 'yolov5l-int8.tflite', # 46Mb CPU 'yolov5l-int8_edgetpu.tflite', # 48Mb TPU - 'coco_labels.txt', - [['yolov5l-int8_edgetpu_segment_0_of_7_edgetpu.tflite', - 'yolov5l-int8_edgetpu_segment_1_of_7_edgetpu.tflite', - 'yolov5l-int8_edgetpu_segment_2_of_7_edgetpu.tflite', - 'yolov5l-int8_edgetpu_segment_3_of_7_edgetpu.tflite', - 'yolov5l-int8_edgetpu_segment_4_of_7_edgetpu.tflite', - 'yolov5l-int8_edgetpu_segment_5_of_7_edgetpu.tflite', - 'yolov5l-int8_edgetpu_segment_6_of_7_edgetpu.tflite']]), - "medium": Settings('YOLOv5', 'yolov5m', \ + 'coco_labels.txt'), + "medium": 
Settings('YOLOv5', 'yolov5m-int8', 'yolov5m-int8.tflite', # 21Mb CPU 'yolov5m-int8_edgetpu.tflite', # 22Mb TPU - 'coco_labels.txt', - [['yolov5m-int8_edgetpu_segment_0_of_4_edgetpu.tflite', - 'yolov5m-int8_edgetpu_segment_1_of_4_edgetpu.tflite', - 'yolov5m-int8_edgetpu_segment_2_of_4_edgetpu.tflite', - 'yolov5m-int8_edgetpu_segment_3_of_4_edgetpu.tflite']]), - "small": Settings('YOLOv5', 'yolov5s', \ + 'coco_labels.txt'), + "small": Settings('YOLOv5', 'yolov5s-int8', 'yolov5s-int8.tflite', # 7Mb CPU 'yolov5s-int8_edgetpu.tflite', # 8Mb TPU - 'coco_labels.txt', []), - "tiny": Settings('YOLOv5', 'yolov5n', \ + 'coco_labels.txt'), + "tiny": Settings('YOLOv5', 'yolov5n-int8', 'yolov5n-int8.tflite', # 2Mb CPU 'yolov5n-int8_edgetpu.tflite', # 2Mb TPU - 'coco_labels.txt', []) + 'coco_labels.txt') }, - "efficientdet-lite": { # Large: EfficientDet-Lite3x 90 objects COCO 640x640x3 2 197.0 ms 43.9% mAP - "large": Settings('EfficientDet-Lite', 'efficientdet_lite3x_640', \ + "large": Settings('EfficientDet-Lite', 'efficientdet_lite3x_640_ptq', \ 'efficientdet_lite3x_640_ptq.tflite', # 14Mb CPU 'efficientdet_lite3x_640_ptq_edgetpu.tflite', # 20Mb TPU - 'coco_labels.txt', - [['efficientdet_lite3x_640_ptq_segment_0_of_3_edgetpu.tflite', - 'efficientdet_lite3x_640_ptq_segment_1_of_3_edgetpu.tflite' - 'efficientdet_lite3x_640_ptq_segment_2_of_3_edgetpu.tflite']]), + 'coco_labels.txt'), # Medium: EfficientDet-Lite3 90 objects 512x512x3 2 107.6 ms 39.4% mAP - "medium": Settings('EfficientDet-Lite', 'efficientdet_lite3_512', \ + "medium": Settings('EfficientDet-Lite', 'efficientdet_lite3_512_ptq', \ 'efficientdet_lite3_512_ptq.tflite', # CPU 'efficientdet_lite3_512_ptq_edgetpu.tflite', # TPU - 'coco_labels.txt', []), + 'coco_labels.txt'), # Small: EfficientDet-Lite2 90 objects COCO 448x448x3 2 104.6 ms 36.0% mAP - "small": Settings('EfficientDet-Lite', 'efficientdet_lite2_448', \ + "small": Settings('EfficientDet-Lite', 'efficientdet_lite2_448_ptq', \ 'efficientdet_lite2_448_ptq.tflite', # 10Mb CPU 'efficientdet_lite2_448_ptq_edgetpu.tflite', # TPU - 'coco_labels.txt', - [['efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', - 'efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite']]), + 'coco_labels.txt'), # Tiny: EfficientDet-Lite1 90 objects COCO 384x384x3 2 56.3 ms 34.3% mAP - "tiny": Settings('EfficientDet-Lite', 'efficientdet_lite1_384', \ + "tiny": Settings('EfficientDet-Lite', 'efficientdet_lite1_384_ptq', \ 'efficientdet_lite1_384_ptq.tflite', # 7Mb CPU 'efficientdet_lite1_384_ptq_edgetpu.tflite', # TPU - 'coco_labels.txt', []) + 'coco_labels.txt') }, "mobilenet ssd": { # Large: SSD/FPN MobileNet V1 90 objects, COCO 640x640x3 TF-lite v2 229.4 ms 31.1% mAP - "large": Settings('MobileNet SSD', 'tf2_ssd_mobilenet_v1_fpn_640', \ + "large": Settings('MobileNet SSD', 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq', \ 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq.tflite', # CPU 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_edgetpu.tflite', # TPU - 'coco_labels.txt', []), + 'coco_labels.txt'), # Medium: SSDLite MobileDet 90 objects, COCO 320x320x3 TF-lite v1 9.1 ms 32.9% mAP "medium": Settings('MobileNet SSD', 'ssdlite_mobiledet_coco_', \ 'ssdlite_mobiledet_coco_qat_postprocess.tflite', # 5Mb CPU 'ssdlite_mobiledet_coco_qat_postprocess_edgetpu.tflite', # TPU - 'coco_labels.txt', []), + 'coco_labels.txt'), # Small: SSD MobileNet V2 90 objects, COCO 300x300x3 TF-lite v2 7.6 ms 22.4% mAP "small": Settings('MobileNet SSD', 'tf2_ssd_mobilenet_v2', \ 'tf2_ssd_mobilenet_v2_coco17_ptq.tflite', # 6.7Mb CPU 
'tf2_ssd_mobilenet_v2_coco17_ptq_edgetpu.tflite', # TPU - 'coco_labels.txt', []), + 'coco_labels.txt'), # Tiny: MobileNet V2 90 objects, COCO 300x300x3 TF-lite v2 Quant "tiny": Settings('MobileNet SSD', 'ssd_mobilenet_v2_coco_', \ 'ssd_mobilenet_v2_coco_quant_postprocess.tflite', # 6.6Mb CPU 'ssd_mobilenet_v2_coco_quant_postprocess_edgetpu.tflite', # TPU - 'coco_labels.txt', []) + 'coco_labels.txt') } } + self.ENABLE_MULTI_TPU = True self.MIN_CONFIDENCE = 0.5 @@ -221,7 +329,7 @@ def set_model(self, model_name): # Normalise input self.model_name = model_name.lower() - if self.model_name not in [ "mobilenet ssd", "efficientdet-lite", "yolov5", "yolov8" ]: # 'yolov5' - no sense including v5 anymore + if self.model_name not in [ "mobilenet ssd", "efficientdet-lite", "yolov5", "yolov8"]: # 'yolov5' - no sense including v5 anymore self.model_name = "mobilenet ssd" self.model_size = self.model_size.lower() @@ -254,5 +362,8 @@ def set_model(self, model_name): self.model_cpu_file = os.path.normpath(os.path.join(self.models_dir, self.cpu_model_name)) self.model_tpu_file = os.path.normpath(os.path.join(self.models_dir, self.tpu_model_name)) self.label_file = os.path.normpath(os.path.join(self.models_dir, self.labels_name)) - self.tpu_segments_lists = [ [os.path.normpath(os.path.join(self.models_dir, name)) for name in name_list] \ - for name_list in settings.tpu_segments_lists ] + + self.tpu_segments_lists = {} + for tpu_cnt, name_list in settings.tpu_segments_lists.items(): + self.tpu_segments_lists[tpu_cnt] = \ + [os.path.normpath(os.path.join(self.models_dir, name)) for name in name_list] diff --git a/src/modules/ObjectDetectionCoral/segment_and_test.py b/src/modules/ObjectDetectionCoral/segment_and_test.py index ed7706b1..72db2542 100644 --- a/src/modules/ObjectDetectionCoral/segment_and_test.py +++ b/src/modules/ObjectDetectionCoral/segment_and_test.py @@ -10,20 +10,24 @@ #'ssd_mobilenet_v2_coco_quant_postprocess', #'ssdlite_mobiledet_coco_qat_postprocess', #'ssd_mobilenet_v1_coco_quant_postprocess', - 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq', + #'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq', #'efficientdet_lite0_320_ptq', #'efficientdet_lite1_384_ptq', - 'efficientdet_lite2_448_ptq', - 'efficientdet_lite3_512_ptq', - 'efficientdet_lite3x_640_ptq', + #'efficientdet_lite2_448_ptq', + #'efficientdet_lite3_512_ptq', + #'efficientdet_lite3x_640_ptq', #'yolov5n-int8', #'yolov5s-int8', - 'yolov5m-int8', - 'yolov5l-int8', + #'yolov5m-int8', + #'yolov5l-int8', #'yolov8n_416_640px', # lg 1st seg - 'yolov8s_416_640px', # lg 1st seg - 'yolov8m_416_640px', # lg 1st seg - 'yolov8l_416_640px', # lg 1st seg + #'yolov8s_416_640px', # lg 1st seg + #'yolov8m_416_640px', # lg 1st seg + #'yolov8l_416_640px', # lg 1st seg + #'yolov8n_640px', + #'yolov8s_640px', + #'yolov8m_640px', # lg 1st seg + #'yolov8l_640px', # lg 1st seg 'ipcam-general-v8'] custom_args = { @@ -178,7 +182,7 @@ 8: ["--partition_search_step","5"]}}#''' seg_dir = "/media/seth/FAT_THUMB/all_segments/" -seg_types = ['', '2x_first_seg/', '15x_first_seg/', '166x_first_seg/', '3x_first_seg/', '4x_first_seg/', '15x_last_seg/', '2x_last_seg/', 'dumb/'] +seg_types = ['', '2x_first_seg/', '15x_first_seg/', '166x_first_seg/', '3x_first_seg/', '4x_first_seg/', 'inc_seg/', 'dumb/'] def seg_exists(filename, segment_type, segment_count): @@ -191,7 +195,7 @@ def seg_exists(filename, segment_type, segment_count): seg_list = [seg_dir+segment_type+filename+'_segment_{}_of_{}_edgetpu.tflite'.format(i, segment_count) for i in range(segment_count)] return 
(seg_list, any([True for s in seg_list if not os.path.exists(s)])) -MAX_TPU_COUNT = 8 +MAX_TPU_COUNT = 4 ''' # Generate segment files @@ -251,6 +255,17 @@ def seg_exists(filename, segment_type, segment_count): # for (auto latency : latencies) { # # sudo make DOCKER_IMAGE="ubuntu:20.04" DOCKER_CPUS="k8" DOCKER_TARGETS="tools" docker-build + + #// Encourage each segment slower than the previous to spread out the bottlenecks + #double latency_adjust = 1.0; + #for (int i = 1; i < num_segments_; ++i) + #{ + # if (latencies[i-1] < latencies[i]) + # latency_adjust *= 0.97; + # latencies[i-1] *= latency_adjust; + #} + #latencies[num_segments_-1] *= latency_adjust; + partition_with_profiling_dir = "libcoral/tools.2" elif '15x_first_seg' in seg_type: partition_with_profiling_dir = "libcoral/tools.15" @@ -266,6 +281,8 @@ def seg_exists(filename, segment_type, segment_count): partition_with_profiling_dir = "libcoral/tools.last15" elif '2x_last_seg' in seg_type: partition_with_profiling_dir = "libcoral/tools.last2" + elif 'inc_seg' == seg_type: + partition_with_profiling_dir = "libcoral/tools.inc_seg" else: partition_with_profiling_dir = "libcoral/tools.orig" @@ -281,7 +298,7 @@ def seg_exists(filename, segment_type, segment_count): subprocess.run(cmd)#''' -seg_types += ['133x_first_seg/'] +seg_types += ['133x_first_seg/', '15x_last_seg/', '2x_last_seg/'] # Test timings fin_timings = {} @@ -293,15 +310,12 @@ def seg_exists(filename, segment_type, segment_count): for num_tpus in range(2,MAX_TPU_COUNT+1): - for seg_type in seg_types+['orig_code']: + for seg_type in seg_types: max_seg = 0 for sn in range(1,num_tpus+1): # Test against orig code - if seg_type == 'orig_code': - exe_file = "/home/seth/CodeProject.AI-Server/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py" - else: - exe_file = "/home/seth/Downloads/coral_module/objectdetection_coral_multitpu.py" + exe_file = "/home/seth/CodeProject.AI-Server/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py" # Get file types seg_list, file_missing = seg_exists(fn, seg_type, sn) @@ -312,19 +326,19 @@ def seg_exists(filename, segment_type, segment_count): cmd = ["python3",exe_file,"--model"] + \ seg_list + ["--labels","coral/pycoral/test_data/coco_labels.txt","--input","/home/seth/coral/pycoral/test_data/grace_hopper.bmp", - "--count","1000","--num-tpus",str(num_tpus)] + "--count","2000","--num-tpus",str(num_tpus)] print(cmd) - c = subprocess.run(cmd, capture_output=True) + c = subprocess.run(cmd, check=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) print(c.stdout) print(c.stderr) - ms_time = float(re.compile(r'threads; ([\d\.]+)ms ea').findall(c.stdout)[0]) + ms_time = float(re.compile(r'threads; ([\d\.]+)ms ea').findall(c.stderr)[0]) timings.append((ms_time, num_tpus, fn, seg_type, sn)) timings = sorted(timings, key=lambda t: t[0]) # Print the top three print(f"TIMINGS FOR {num_tpus} TPUs AND {fn} MODEL:") - for t in range(min(5,len(timings))): + for t in range(min(10,len(timings))): print(timings[t]) # Get best segments, but @@ -345,6 +359,12 @@ def seg_exists(filename, segment_type, segment_count): shutil.copyfile(s, out_fname) fin_fnames[fn][num_tpus].append(out_fname) + # Create archive for this model / TPU count + if any(fin_fnames[fn][num_tpus]): + cmd = ['zip', '-9', f'objectdetection-{fn}-{num_tpus}-edgetpu.zip'] + fin_fnames[fn][num_tpus] + print(cmd) + subprocess.run(cmd) + print(fin_timings) print(fin_fnames) @@ -352,6 +372,6 @@ def seg_exists(filename, segment_type, segment_count): 
for fn, v in fin_fnames.items(): print(" '%s': {" % fn) for tpu_count, out_fnames in v.items(): - print(f" # {fin_timings[fn][tpu_count][0]:6.1f} ms per inference") # assumes 1k test runs + print(f" # {fin_timings[fn][tpu_count][0]:6.1f} ms per inference") print(f" {tpu_count}: "+str(out_fnames)+",") print(" },") diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index 14fc9dbb..57213f53 100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -510,15 +510,10 @@ def _get_model_filenames(self, options: Options, tpu_list: list) -> list: more than one list of segment files then use the list of files that best matches the number of TPUs we have, otherwise use the single list we have. If all else fails return the single TPU filename as a list. - NOTE: This method also updates self.device_count and self.segment_count - based on the choice of whether it uses a single model or a set of - segment file names """ # if TPU no-show then default is CPU self.device_type = 'CPU' - device_count = 1 # CPU. At this point we don't know if we have TPU - segment_count = 1 # Single CPU model file if not any(tpu_list): return [] @@ -526,22 +521,20 @@ def _get_model_filenames(self, options: Options, tpu_list: list) -> list: # If TPU found then default is single TPU model file (no segments) device_count = len(tpu_list) # TPUs. We've at least found one - segment_count = 1 # Single TPU model name at this point - if not any(options.tpu_segments_lists): + if not any(options.tpu_segments_lists) or device_count == 1: return [options.model_tpu_file] # We have a list of segment files - if isinstance(options.tpu_segments_lists[0], list): + if isinstance(options.tpu_segments_lists, dict): # Look for a good match between available TPUs and segment counts - # Prioritize first match - for fname_list in options.tpu_segments_lists: - segment_count = len(fname_list) - if segment_count <= device_count: - return fname_list + # Prioritize first match. Note we have only tested up to 8 TPUs, + # so best performance above that can probably be had by extrapolation. 
+ device_count = min(device_count, 8) + if device_count in options.tpu_segments_lists: + return options.tpu_segments_lists[device_count] else: # Only one list of segments; use it regardless of even match to TPU count - segment_count = len(options.tpu_segments_lists) - if segment_count <= device_count: + if len(options.tpu_segments_lists) <= device_count: return options.tpu_segments_lists # Couldn't find a good fit, use single segment From 43ff9deb9df38f7097998c645c29f69f490f89d1 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Mon, 8 Apr 2024 12:32:36 -0700 Subject: [PATCH 2/9] minor tweaks --- .../objectdetection_coral_multitpu.py | 12 ++++++------ src/modules/ObjectDetectionCoral/tpu_runner.py | 4 +++- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py index dde479ac..ea76b6c0 100644 --- a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py +++ b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py @@ -279,17 +279,17 @@ def main(): (tot_infr_time / args.count, thread_cnt, half_wall_time * 1000 / half_infr_count, half_infr_count)) - print('-------RESULTS--------') + logging.info('-------RESULTS--------') if not objs: - print('No objects detected') + logging.info('No objects detected') return if any(objs): for obj in objs: - print(_tpu_runner.labels.get(obj.id, obj.id)) - print(' id: ', obj.id) - print(' score: ', obj.score) - print(' bbox: ', obj.bbox) + logging.info(_tpu_runner.labels.get(obj.id, obj.id)) + logging.info(f' id: {obj.id}') + logging.info(f' score: {obj.score}') + logging.info(f' bbox: {obj.bbox}') if args.output: image = image.convert('RGB') diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index 57213f53..bbf587fc 100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -517,10 +517,12 @@ def _get_model_filenames(self, options: Options, tpu_list: list) -> list: if not any(tpu_list): return [] + device_count = len(tpu_list) # TPUs. We've at least found one self.device_type = 'Multi-TPU' + if device_count == 1: + self.device_type = 'TPU' # If TPU found then default is single TPU model file (no segments) - device_count = len(tpu_list) # TPUs. 
We've at least found one if not any(options.tpu_segments_lists) or device_count == 1: return [options.model_tpu_file] From 9d167d30882d4b06f1896725e191eddb9471b0b9 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Tue, 9 Apr 2024 09:17:59 -0700 Subject: [PATCH 3/9] Better queue balancer --- .../objectdetection_coral_multitpu.py | 2 +- .../ObjectDetectionCoral/tpu_runner.py | 94 +++++++++++++------ 2 files changed, 66 insertions(+), 30 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py index ea76b6c0..14b2073e 100644 --- a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py +++ b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py @@ -275,7 +275,7 @@ def main(): (wall_time * 1000 / args.count, args.count, (time.perf_counter() - start_one) * 1000)) - logging.info('%.2fms avg time blocked across %d threads; %.2fms ea for final %d inferences' % + logging.info('%.2fms avg time blocked across %d threads; %.3fms ea for final %d inferences' % (tot_infr_time / args.count, thread_cnt, half_wall_time * 1000 / half_infr_count, half_infr_count)) diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index bbf587fc..825d1609 100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -269,8 +269,7 @@ def enqueue(self, in_tensor, out_q: queue.Queue): def balance_queues(self): # Don't bother if someone else is working on balancing - if len(self.queues) <= 1 or len(self.tpu_list) <= 2 or \ - len(self.queues) == len(self.tpu_list) or \ + if len(self.queues) <= 1 or len(self.tpu_list) < 2 or \ not self.balance_lock.acquire(blocking=False): return @@ -278,22 +277,26 @@ def eval_timings(interpreter_counts): # How much time are we allocating for each segment time_alloc = [] - for idx in range(len(self.interpreters)): + for seg_i in range(len(self.interpreters)): # Find average runtime for this segment avg_times = [] for interpreters in self.interpreters: - avg_times += [i.timings[idx] / i.exec_count[idx] for i in interpreters if i.exec_count[idx] != 0] + avg_times += [i.timings[seg_i] / i.exec_count[seg_i] for i in interpreters if i.exec_count[seg_i] != 0] if avg_times: avg_time = sum(avg_times) / len(avg_times) else: - return 0, 0, 0.0 + return 0, 0, 0.0, None # Adjust for number of TPUs allocated to it - time_alloc.append(avg_time / interpreter_counts[idx]) + if interpreter_counts[seg_i] > 0: + time_alloc.append(avg_time / interpreter_counts[seg_i]) + else: + # No interpreters result inf time + time_alloc.append(float('inf')) - min_t = 100000000 - min_i = -1 + min_gt1_t = float('inf') + min_gt1_i = -1 max_t = 0 max_i = -1 @@ -306,28 +309,67 @@ def eval_timings(interpreter_counts): # Min time needs to be lengthened so rem an interpreter, # but only if it has more than one interpreter - if t < min_t and len(self.interpreters[i]) > 1: - min_t = t - min_i = i + if t < min_gt1_t and len(self.interpreters[i]) > 1: + min_gt1_t = t + min_gt1_i = i + + # See if we can do better than the current max timing + untried_candidates = [] + for interp_i, interpreters in enumerate(self.interpreters): + # Doesn't make sense to pull a TPU from a queue just to re-add it. 
+ if interp_i == max_i: + continue + # If it hasn't yet been tried for this segment + # (or if it has already found to be faster on this segment) + if any([True for i in interpreters if i.exec_count[max_i] == 0 or max_t-1.0 > i.timings[max_i] / i.exec_count[max_i]]): + untried_candidates.append(interp_i) - return min_i, max_i, max(time_alloc) + return min_gt1_i, max_i, max(time_alloc), untried_candidates[0] if len(untried_candidates) > 0 else None interpreter_counts = [len(i) for i in self.interpreters] - min_i, max_i, current_max = eval_timings(interpreter_counts) + min_i, max_i, current_max, min_untried_i = eval_timings(interpreter_counts) interpreter_counts[min_i] -= 1 interpreter_counts[max_i] += 1 - _, _, new_max = eval_timings(interpreter_counts) + _, _, new_max, _ = eval_timings(interpreter_counts) - # Return if we don't want to swap (+/- 1 ms) + # Return if we don't want to swap if new_max+1.0 >= current_max: - self.balance_lock.release() - return + if min_untried_i is None: + self.balance_lock.release() + return + + # Swap slow segments with faster ones to see if we can run them faster. + # It might be a good way to optimize for heterogenous hardware. + logging.info(f"Re-balancing between queues {min_untried_i} and {max_i}") + + # Stop them + new_max_i = self._rem_interpreter_from(min_untried_i) + new_min_untried_i = self._rem_interpreter_from(max_i) + + # Swap them + new_max_i.start(max_i, self.fbytes_list[max_i]) + self.interpreters[max_i].append(new_max_i) + + new_min_untried_i.start(min_untried_i, self.fbytes_list[min_untried_i]) + self.interpreters[min_untried_i].append(new_min_untried_i) + + else: + logging.info(f"Re-balancing from queue {min_i} to {max_i} (max from {current_max:.2f} to {new_max:.2f})") - logging.info(f"Re-balancing from queue {min_i} to {max_i} (max from {current_max:.2f} to {new_max:.2f})") + realloc_interp = self._rem_interpreter_from(min_i) + # Add to large (too-slow) queue + realloc_interp.start(max_i, self.fbytes_list[max_i]) + self.interpreters[max_i].append(realloc_interp) + + self.balance_lock.release() + self.print_queue_len() + + + def _rem_interpreter_from(self, interp_i): # Sending False kills the processing loop self.rebalancing_lock.acquire() - self.queues[min_i].put(False) + self.queues[interp_i].put(False) # This is ugly, but I can't think of something better # Threads are blocked by queues. 
Queues may not have a stream @@ -338,21 +380,15 @@ def eval_timings(interpreter_counts): # Block & wait realloc_interp = None with self.rebalancing_lock: - for idx, interpreter in enumerate(self.interpreters[min_i]): + for idx, interpreter in enumerate(self.interpreters[interp_i]): if not interpreter.interpreter: - realloc_interp = self.interpreters[min_i].pop(idx) + realloc_interp = self.interpreters[interp_i].pop(idx) break + if not realloc_interp: logging.warning("Unable to find killed interpreter") self.balance_lock.release() - return - - # Add to large (too-slow) queue - realloc_interp.start(max_i, self.fbytes_list[max_i]) - self.interpreters[max_i].append(realloc_interp) - - self.balance_lock.release() - self.print_queue_len() + return realloc_interp def print_queue_len(self): From 8f9580a12261a635ae5923f5868f6f3e500c9d9b Mon Sep 17 00:00:00 2001 From: Seth Price Date: Wed, 10 Apr 2024 20:04:00 -0700 Subject: [PATCH 4/9] Bug fixes --- .../ObjectDetectionCoral/tpu_runner.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index 825d1609..f3ee24d8 100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -239,20 +239,20 @@ def __init__(self, tpu_list: list, fname_list: list): with open(fname, "rb") as fd: self.fbytes_list.append(fd.read()) - self._init_interpreters() + with self.balance_lock: + self._init_interpreters() def _init_interpreters(self): start_boot_time = time.perf_counter_ns() # Fill TPUs with interpreters - with self.balance_lock: - for i, tpu_name in enumerate(self.tpu_list): - seg_idx = i % len(self.fname_list) + for i, tpu_name in enumerate(self.tpu_list): + seg_idx = i % len(self.fname_list) - i = DynamicInterpreter(self.fname_list, tpu_name, self.queues, self.rebalancing_lock) - i.start(seg_idx, self.fbytes_list[seg_idx]) - self.interpreters[seg_idx].append(i) + i = DynamicInterpreter(self.fname_list, tpu_name, self.queues, self.rebalancing_lock) + i.start(seg_idx, self.fbytes_list[seg_idx]) + self.interpreters[seg_idx].append(i) self.first_name = self.interpreters[0][0].input_details[0]['name'] @@ -261,8 +261,9 @@ def _init_interpreters(self): def enqueue(self, in_tensor, out_q: queue.Queue): - if not self.first_name: - self._init_interpreters() + with self.balance_lock: + if not self.first_name: + self._init_interpreters() self.queues[0].put(({self.first_name: in_tensor}, out_q)) @@ -500,11 +501,11 @@ def __init__(self, tpu_limit: int = -1): def _watchdog(self): self.watchdog_time = time.time() while not self.watchdog_shutdown: - if self.pipe and \ + if self.pipe and self.pipe.first_name is None and \ time.time() - self.watchdog_time > self.max_idle_secs_before_recycle: logging.warning("No work in {} seconds, watchdog shutting down TPUs.".format(self.max_idle_secs_before_recycle)) self.runner_lock.acquire(timeout=MAX_WAIT_TIME) - if self.pipe.first_name: + if self.pipe: self.pipe.delete() self.runner_lock.release() # Pipeline will reinitialize itself as needed From 77d86a1f100385701f4feecfb0aad3b2b4e84067 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Thu, 11 Apr 2024 09:11:40 -0700 Subject: [PATCH 5/9] TTL for rebalancing --- src/modules/ObjectDetectionCoral/tpu_runner.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index f3ee24d8..6aef596d 
100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -212,18 +212,18 @@ def __init__(self, tpu_list: list, fname_list: list): self.max_pipeline_queue_length = MAX_PIPELINE_QUEUE_LEN - self.fname_list = fname_list - self.tpu_list = tpu_list - self.interpreters = [[] for i in range(seg_count)] + self.fname_list = fname_list + self.tpu_list = tpu_list + self.interpreters = [[] for i in range(seg_count)] # Input queues for each segment; if we go over maxsize, something went wrong self.queues = [queue.Queue(maxsize=self.max_pipeline_queue_length) for i in range(seg_count)] # Lock for internal reorganization - self.balance_lock = threading.Lock() + self.balance_lock = threading.Lock() # Lock for interpreter use - self.rebalancing_lock = threading.Lock() + self.rebalancing_lock = threading.Lock() # Read file data self.fbytes_list = [] @@ -244,6 +244,7 @@ def __init__(self, tpu_list: list, fname_list: list): def _init_interpreters(self): + self.balance_ttl = len(self.tpu_list) * 2 start_boot_time = time.perf_counter_ns() # Fill TPUs with interpreters @@ -270,7 +271,7 @@ def enqueue(self, in_tensor, out_q: queue.Queue): def balance_queues(self): # Don't bother if someone else is working on balancing - if len(self.queues) <= 1 or len(self.tpu_list) < 2 or \ + if len(self.queues) <= 1 or len(self.tpu_list) < 2 or self.balance_ttl <= 0 or \ not self.balance_lock.acquire(blocking=False): return @@ -363,6 +364,7 @@ def eval_timings(interpreter_counts): realloc_interp.start(max_i, self.fbytes_list[max_i]) self.interpreters[max_i].append(realloc_interp) + self.balance_ttl -= 1 self.balance_lock.release() self.print_queue_len() From 753310fc41552c431bdcc714d67a622b1c0462fc Mon Sep 17 00:00:00 2001 From: Seth Price Date: Thu, 11 Apr 2024 20:00:50 -0700 Subject: [PATCH 6/9] Code flow --- .../ObjectDetectionCoral/tpu_runner.py | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index 6aef596d..8f44bae5 100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -323,7 +323,7 @@ def eval_timings(interpreter_counts): continue # If it hasn't yet been tried for this segment # (or if it has already found to be faster on this segment) - if any([True for i in interpreters if i.exec_count[max_i] == 0 or max_t-1.0 > i.timings[max_i] / i.exec_count[max_i]]): + if any([True for i in interpreters if i.exec_count[max_i] < 10 or max_t-0.1 > i.timings[max_i] / i.exec_count[max_i]]): untried_candidates.append(interp_i) return min_gt1_i, max_i, max(time_alloc), untried_candidates[0] if len(untried_candidates) > 0 else None @@ -334,12 +334,17 @@ def eval_timings(interpreter_counts): interpreter_counts[max_i] += 1 _, _, new_max, _ = eval_timings(interpreter_counts) - # Return if we don't want to swap - if new_max+1.0 >= current_max: - if min_untried_i is None: - self.balance_lock.release() - return + if new_max+1.0 < current_max: + # Allocate more TPUs to slow segments + logging.info(f"Re-balancing from queue {min_i} to {max_i} (max from {current_max:.2f} to {new_max:.2f})") + realloc_interp = self._rem_interpreter_from(min_i) + + # Add to large (too-slow) queue + realloc_interp.start(max_i, self.fbytes_list[max_i]) + self.interpreters[max_i].append(realloc_interp) + + elif min_untried_i is not None: # Swap slow segments with faster ones to see if we can run them faster. 
# It might be a good way to optimize for heterogenous hardware. logging.info(f"Re-balancing between queues {min_untried_i} and {max_i}") @@ -355,14 +360,12 @@ def eval_timings(interpreter_counts): new_min_untried_i.start(min_untried_i, self.fbytes_list[min_untried_i]) self.interpreters[min_untried_i].append(new_min_untried_i) + # FIXME: After we have TPUs evaluated and otherwise balanced, we could + # further optimize by ensuring the slowest segment doesn't contain any slow TPUs. else: - logging.info(f"Re-balancing from queue {min_i} to {max_i} (max from {current_max:.2f} to {new_max:.2f})") - - realloc_interp = self._rem_interpreter_from(min_i) - - # Add to large (too-slow) queue - realloc_interp.start(max_i, self.fbytes_list[max_i]) - self.interpreters[max_i].append(realloc_interp) + # Return if we don't want to swap + self.balance_lock.release() + return self.balance_ttl -= 1 self.balance_lock.release() From fc935117976ab3f7a3705b3ca83e1302ab2de026 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Thu, 11 Apr 2024 23:49:52 -0700 Subject: [PATCH 7/9] Tweaks to TPU balancer --- .../ObjectDetectionCoral/tpu_runner.py | 152 ++++++++++-------- 1 file changed, 82 insertions(+), 70 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index 8f44bae5..c1c9b728 100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -243,7 +243,7 @@ def __init__(self, tpu_list: list, fname_list: list): self._init_interpreters() def _init_interpreters(self): - + # Set a Time To Live for balancing so we don't swap for inf in corner cases self.balance_ttl = len(self.tpu_list) * 2 start_boot_time = time.perf_counter_ns() @@ -269,73 +269,87 @@ def enqueue(self, in_tensor, out_q: queue.Queue): self.queues[0].put(({self.first_name: in_tensor}, out_q)) - def balance_queues(self): - # Don't bother if someone else is working on balancing - if len(self.queues) <= 1 or len(self.tpu_list) < 2 or self.balance_ttl <= 0 or \ - not self.balance_lock.acquire(blocking=False): - return + def _eval_timings(interpreter_counts): + # How much time are we allocating for each segment + time_alloc = [] - def eval_timings(interpreter_counts): - # How much time are we allocating for each segment - time_alloc = [] + for seg_i in range(len(self.interpreters)): + # Find average runtime for this segment + avg_times = [] + for interpreters in self.interpreters: + avg_times += [i.timings[seg_i] / i.exec_count[seg_i] for i in interpreters if i.exec_count[seg_i] != 0] - for seg_i in range(len(self.interpreters)): - # Find average runtime for this segment - avg_times = [] - for interpreters in self.interpreters: - avg_times += [i.timings[seg_i] / i.exec_count[seg_i] for i in interpreters if i.exec_count[seg_i] != 0] + if avg_times: + avg_time = sum(avg_times) / len(avg_times) + else: + return 0, 0, 0.0, None - if avg_times: - avg_time = sum(avg_times) / len(avg_times) - else: - return 0, 0, 0.0, None + # Adjust for number of TPUs allocated to it + if interpreter_counts[seg_i] > 0: + time_alloc.append(avg_time / interpreter_counts[seg_i]) + else: + # No interpreters result inf time + time_alloc.append(float('inf')) + + min_gt1_t = float('inf') + min_gt1_i = -1 + max_t = 0 + max_i = -1 + + # Find segments that maybe should swap + for i, t in enumerate(time_alloc): + # Max time needs to be shortened so add an interpreter. 
+ if t > max_t: + max_t = t + max_i = i + + # Min time needs to be lengthened so rem an interpreter, + # but only if it has more than one interpreter + if t < min_gt1_t and len(self.interpreters[i]) > 1: + min_gt1_t = t + min_gt1_i = i + + # See if we can do better than the current max timing with swapping + swap_i = None + for interp_i, interpreters in enumerate(self.interpreters): + # Doesn't make sense to pull a TPU from a queue just to re-add it. + if interp_i == max_i: + continue - # Adjust for number of TPUs allocated to it - if interpreter_counts[seg_i] > 0: - time_alloc.append(avg_time / interpreter_counts[seg_i]) - else: - # No interpreters result inf time - time_alloc.append(float('inf')) - - min_gt1_t = float('inf') - min_gt1_i = -1 - max_t = 0 - max_i = -1 - - # Find segments that maybe should swap - for i, t in enumerate(time_alloc): - # Max time needs to be shortened so add an interpreter. - if t > max_t: - max_t = t - max_i = i - - # Min time needs to be lengthened so rem an interpreter, - # but only if it has more than one interpreter - if t < min_gt1_t and len(self.interpreters[i]) > 1: - min_gt1_t = t - min_gt1_i = i - - # See if we can do better than the current max timing - untried_candidates = [] - for interp_i, interpreters in enumerate(self.interpreters): - # Doesn't make sense to pull a TPU from a queue just to re-add it. - if interp_i == max_i: - continue - # If it hasn't yet been tried for this segment - # (or if it has already found to be faster on this segment) - if any([True for i in interpreters if i.exec_count[max_i] < 10 or max_t-0.1 > i.timings[max_i] / i.exec_count[max_i]]): - untried_candidates.append(interp_i) + # Test all TPUs in this segment + for i in interpreters: + # Only calc valid time after a few runs + new_max_t = 0 + if i.exec_count[max_i] > 10: + new_max_t = i.timings[max_i] / i.exec_count[max_i] + new_swap_t = 0 + if i.exec_count[interp_i] > 10: + new_swap_t = i.timings[interp_i] / i.exec_count[interp_i] + + # If it hasn't yet been tried for this segment or + # If it has already found to be faster on this segment + # and we aren't making the other segment the new worst. 
+                if i.exec_count[max_i] < 10 or (max_t > new_max_t and max_t > new_swap_t):
+                    swap_i = interp_i
+                    break
+
+        return min_gt1_i, max_i, max(time_alloc), swap_i
+
+
+    def balance_queues(self):
+        # Don't bother if someone else is working on balancing
+        if len(self.queues) <= 1 or len(self.tpu_list) < 2 or self.balance_ttl <= 0 or \
+           not self.balance_lock.acquire(blocking=False):
+            return
 
         interpreter_counts = [len(i) for i in self.interpreters]
-        min_i, max_i, current_max, min_untried_i = eval_timings(interpreter_counts)
+        min_i, max_i, current_max, swap_i = self._eval_timings(interpreter_counts)
         interpreter_counts[min_i] -= 1
         interpreter_counts[max_i] += 1
-        _, _, new_max, _ = eval_timings(interpreter_counts)
+        _, _, new_max, _ = self._eval_timings(interpreter_counts)
 
         if new_max+1.0 < current_max:
-            # Allocate more TPUs to slow segments
+            # 1st Priority: Allocate more TPUs to slow segments
             logging.info(f"Re-balancing from queue {min_i} to {max_i} (max from {current_max:.2f} to {new_max:.2f})")
 
             realloc_interp = self._rem_interpreter_from(min_i)
@@ -344,24 +358,22 @@ def eval_timings(interpreter_counts):
             realloc_interp.start(max_i, self.fbytes_list[max_i])
             self.interpreters[max_i].append(realloc_interp)
 
-        elif min_untried_i is not None:
-            # Swap slow segments with faster ones to see if we can run them faster.
-            # It might be a good way to optimize for heterogenous hardware.
-            logging.info(f"Re-balancing between queues {min_untried_i} and {max_i}")
+        elif swap_i is not None:
+            # 2nd Priority: Swap slow segments with faster ones to see if we can
+            # run them faster. Hopefully still a good way to optimize for
+            # heterogeneous hardware.
+            logging.info(f"Re-balancing between queues {swap_i} and {max_i}")
 
             # Stop them
-            new_max_i = self._rem_interpreter_from(min_untried_i)
-            new_min_untried_i = self._rem_interpreter_from(max_i)
+            new_max = self._rem_interpreter_from(swap_i)
+            new_swap = self._rem_interpreter_from(max_i)
 
             # Swap them
-            new_max_i.start(max_i, self.fbytes_list[max_i])
-            self.interpreters[max_i].append(new_max_i)
-
-            new_min_untried_i.start(min_untried_i, self.fbytes_list[min_untried_i])
-            self.interpreters[min_untried_i].append(new_min_untried_i)
+            new_max.start(max_i, self.fbytes_list[max_i])
+            self.interpreters[max_i].append(new_max)
 
-            # FIXME: After we have TPUs evaluated and otherwise balanced, we could
-            # further optimize by ensuring the slowest segment doesn't contain any slow TPUs.
+            new_swap.start(swap_i, self.fbytes_list[swap_i])
+            self.interpreters[swap_i].append(new_swap)
         else:
             # Return if we don't want to swap
             self.balance_lock.release()
             return

From 1b16adab1df5bcada9a59566c0cde24e940b3fe3 Mon Sep 17 00:00:00 2001
From: Seth Price
Date: Fri, 12 Apr 2024 00:05:20 -0700
Subject: [PATCH 8/9] Adjust TPU swap logic

---
 .../ObjectDetectionCoral/tpu_runner.py        | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py
index c1c9b728..0c2bca16 100644
--- a/src/modules/ObjectDetectionCoral/tpu_runner.py
+++ b/src/modules/ObjectDetectionCoral/tpu_runner.py
@@ -272,12 +272,13 @@ def enqueue(self, in_tensor, out_q: queue.Queue):
     def _eval_timings(interpreter_counts):
         # How much time are we allocating for each segment
         time_alloc = []
+        VALID_CNT_THRESH = 50
 
         for seg_i in range(len(self.interpreters)):
             # Find average runtime for this segment
             avg_times = []
             for interpreters in self.interpreters:
-                avg_times += [i.timings[seg_i] / i.exec_count[seg_i] for i in interpreters if i.exec_count[seg_i] != 0]
+                avg_times += [i.timings[seg_i] / i.exec_count[seg_i] for i in interpreters if i.exec_count[seg_i] > VALID_CNT_THRESH]
 
             if avg_times:
                 avg_time = sum(avg_times) / len(avg_times)
@@ -293,7 +294,7 @@ def _eval_timings(interpreter_counts):
 
         min_gt1_t = float('inf')
         min_gt1_i = -1
-        max_t = 0
+        max_t = 0.0
         max_i = -1
 
         # Find segments that maybe should swap
@@ -309,6 +310,10 @@ def _eval_timings(interpreter_counts):
                 min_gt1_t = t
                 min_gt1_i = i
 
+        # Only eval swapping max segment if we have many samples
+        if VALID_CNT_THRESH > sum([i.exec_count[max_i] for i in self.interpreters[max_i]]):
+            return min_gt1_i, max_i, max(time_alloc), None
+
         # See if we can do better than the current max timing with swapping
         swap_i = None
         for interp_i, interpreters in enumerate(self.interpreters):
@@ -319,17 +324,17 @@ def _eval_timings(interpreter_counts):
             # Test all TPUs in this segment
             for i in interpreters:
                 # Only calc valid time after a few runs
-                new_max_t = 0
-                if i.exec_count[max_i] > 10:
+                new_max_t = 0.0
+                if i.exec_count[max_i] > VALID_CNT_THRESH:
                     new_max_t = i.timings[max_i] / i.exec_count[max_i]
-                new_swap_t = 0
-                if i.exec_count[interp_i] > 10:
+                new_swap_t = 0.0
+                if i.exec_count[interp_i] > VALID_CNT_THRESH:
                     new_swap_t = i.timings[interp_i] / i.exec_count[interp_i]
 
                 # If it hasn't yet been tried for this segment or
                 # If it has already found to be faster on this segment
                 # and we aren't making the other segment the new worst.
- if i.exec_count[max_i] < 10 or (max_t > new_max_t and max_t > new_swap_t): + if i.exec_count[max_i] < VALID_CNT_THRESH or (max_t > new_max_t and max_t > new_swap_t): swap_i = interp_i break From 2a6f862d230e57001f1cab7f0eabe40997903c95 Mon Sep 17 00:00:00 2001 From: Seth Price Date: Fri, 12 Apr 2024 16:10:43 -0700 Subject: [PATCH 9/9] Better tpu balancing --- .../objectdetection_coral_multitpu.py | 2 +- .../ObjectDetectionCoral/tpu_runner.py | 35 ++++++++++++------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py index 14b2073e..d6b9cdf0 100644 --- a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py +++ b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py @@ -244,7 +244,7 @@ def main(): tot_infr_time += infr_time # Start a timer for the last ~half of the run for more accurate benchmark - if chunk_i > (args.count-1) / 3.0: + if chunk_i > (args.count-1) / 2.0: half_infr_count += 1 if half_wall_start is None: half_wall_start = time.perf_counter() diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py index 0c2bca16..19647516 100644 --- a/src/modules/ObjectDetectionCoral/tpu_runner.py +++ b/src/modules/ObjectDetectionCoral/tpu_runner.py @@ -243,7 +243,7 @@ def __init__(self, tpu_list: list, fname_list: list): self._init_interpreters() def _init_interpreters(self): - # Set a Time To Live for balancing so we don't swap for inf in corner cases + # Set a Time To Live for balancing so we don't thrash self.balance_ttl = len(self.tpu_list) * 2 start_boot_time = time.perf_counter_ns() @@ -269,7 +269,7 @@ def enqueue(self, in_tensor, out_q: queue.Queue): self.queues[0].put(({self.first_name: in_tensor}, out_q)) - def _eval_timings(interpreter_counts): + def _eval_timings(self, interpreter_counts): # How much time are we allocating for each segment time_alloc = [] VALID_CNT_THRESH = 50 @@ -310,12 +310,17 @@ def _eval_timings(interpreter_counts): min_gt1_t = t min_gt1_i = i - # Only eval swapping max segment if we have many samples - if VALID_CNT_THRESH > sum([i.exec_count[max_i] for i in self.interpreters[max_i]]): - return min_gt1_i, max_i, max(time_alloc), None + # Only eval swapping max time segment if we have many samples in the current setup + for i in self.interpreters[max_i]: + if i.exec_count[max_i] < VALID_CNT_THRESH: + return min_gt1_i, max_i, max(time_alloc), None - # See if we can do better than the current max timing with swapping + # Undo avg interp count adjustment for TPU-to-TPU comparisons + max_t = max([i.timings[max_i] / i.exec_count[max_i] for i in self.interpreters[max_i]]) + + # See if we can do better than the current max time by swapping segments between TPUs swap_i = None + swap_t = float('inf') for interp_i, interpreters in enumerate(self.interpreters): # Doesn't make sense to pull a TPU from a queue just to re-add it. if interp_i == max_i: @@ -323,6 +328,10 @@ def _eval_timings(interpreter_counts): # Test all TPUs in this segment for i in interpreters: + # If TPU hasn't yet been tried for this segment or ... 
+                if i.exec_count[max_i] < VALID_CNT_THRESH:
+                    return min_gt1_i, max_i, max(time_alloc), interp_i
+
                 # Only calc valid time after a few runs
                 new_max_t = 0.0
                 if i.exec_count[max_i] > VALID_CNT_THRESH:
@@ -330,13 +339,13 @@ def _eval_timings(interpreter_counts):
                 new_swap_t = 0.0
                 if i.exec_count[interp_i] > VALID_CNT_THRESH:
                     new_swap_t = i.timings[interp_i] / i.exec_count[interp_i]
-
-                # If it hasn't yet been tried for this segment or
-                # If it has already found to be faster on this segment
-                # and we aren't making the other segment the new worst.
-                if i.exec_count[max_i] < VALID_CNT_THRESH or (max_t > new_max_t and max_t > new_swap_t):
+
+                # If this TPU has already been found to be faster on this segment
+                # and we aren't making the other segment the new worst
+                # and we are choosing the best available candidate.
+                if max_t-0.5 > new_max_t and max_t > new_swap_t and swap_t > new_max_t:
                     swap_i = interp_i
-                    break
+                    swap_t = new_max_t
 
         return min_gt1_i, max_i, max(time_alloc), swap_i
 
@@ -367,7 +376,7 @@ def balance_queues(self):
             # 2nd Priority: Swap slow segments with faster ones to see if we can
             # run them faster. Hopefully still a good way to optimize for
             # heterogeneous hardware.
-            logging.info(f"Re-balancing between queues {swap_i} and {max_i}")
+            logging.info(f"Auto-tuning between queues {swap_i} and {max_i}")
 
             # Stop them
             new_max = self._rem_interpreter_from(swap_i)
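
A note on how the selection side of this series fits together: PATCH 1 keys the segment-file lists in options.py by TPU count, and _get_model_filenames() in tpu_runner.py picks the list matching the number of TPUs found (clamped to the 8-TPU ceiling the benchmarks cover), falling back to the single unsegmented model otherwise. A minimal sketch of that lookup follows; pick_model_files() and the two-entry table are illustrative stand-ins, not part of the module (the real data lives in Settings.MODEL_SEGMENTS):

    import os

    def pick_model_files(models_dir, segments_by_count, device_count, single_model):
        # Benchmarks in this series only go up to 8 TPUs, so clamp the count;
        # larger counts reuse the 8-TPU segmentation, per the extrapolation
        # comment in _get_model_filenames().
        device_count = min(device_count, 8)
        if device_count > 1 and device_count in segments_by_count:
            names = segments_by_count[device_count]
        else:
            # No tuned segment set for this TPU count: use the unsegmented model.
            names = [single_model]
        return [os.path.normpath(os.path.join(models_dir, n)) for n in names]

    # Hypothetical two-entry table standing in for Settings.MODEL_SEGMENTS.
    segments_by_count = {
        2: ['model_segment_0_of_2_edgetpu.tflite',
            'model_segment_1_of_2_edgetpu.tflite'],
        3: ['model_segment_0_of_3_edgetpu.tflite',
            'model_segment_1_of_3_edgetpu.tflite',
            'model_segment_2_of_3_edgetpu.tflite'],
    }

    # Three TPUs get the three-segment pipeline; one TPU, or a count with no
    # tuned entry, degrades to the single edgetpu model, matching the fallback.
    print(pick_model_files('assets', segments_by_count, 3, 'model_edgetpu.tflite'))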