diff --git a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py
index af1a737b..d6b9cdf0 100644
--- a/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py
+++ b/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py
@@ -244,7 +244,7 @@ def main():
         tot_infr_time += infr_time

         # Start a timer for the last ~half of the run for more accurate benchmark
-        if chunk_i > (args.count-1) / 3.0:
+        if chunk_i > (args.count-1) / 2.0:
             half_infr_count += 1
             if half_wall_start is None:
                 half_wall_start = time.perf_counter()
@@ -271,25 +271,25 @@ def main():
     if half_wall_start is not None:
         half_wall_time = time.perf_counter() - half_wall_start

-    print('completed one run every %.2fms for %d runs; %.2fms wall time for a single run' %
+    logging.info('completed one run every %.2fms for %d runs; %.2fms wall time for a single run' %
          (wall_time * 1000 / args.count, args.count, (time.perf_counter() - start_one) * 1000))
-    print('%.2fms avg time blocked across %d threads; %.2fms ea for final %d inferences' %
+    logging.info('%.2fms avg time blocked across %d threads; %.3fms ea for final %d inferences' %
          (tot_infr_time / args.count, thread_cnt, half_wall_time * 1000 / half_infr_count, half_infr_count))

-    print('-------RESULTS--------')
+    logging.info('-------RESULTS--------')
     if not objs:
-        print('No objects detected')
+        logging.info('No objects detected')
         return

    if any(objs):
        for obj in objs:
-           print(_tpu_runner.labels.get(obj.id, obj.id))
-           print(' id: ', obj.id)
-           print(' score: ', obj.score)
-           print(' bbox: ', obj.bbox)
+           logging.info(_tpu_runner.labels.get(obj.id, obj.id))
+           logging.info(f' id: {obj.id}')
+           logging.info(f' score: {obj.score}')
+           logging.info(f' bbox: {obj.bbox}')

    if args.output:
        image = image.convert('RGB')
diff --git a/src/modules/ObjectDetectionCoral/options.py b/src/modules/ObjectDetectionCoral/options.py
index 676dd4e2..14a036cf 100644
--- a/src/modules/ObjectDetectionCoral/options.py
+++ b/src/modules/ObjectDetectionCoral/options.py
@@ -12,13 +12,152 @@ def getEnvVariable(a, b):
 class Settings:
     def __init__(self, model_name: str, model_name_pattern: str, std_model_name: str,
-                 tpu_model_name: str, labels_name: str, tpu_segments_lists):
+                 tpu_model_name: str, labels_name: str):
         self.model_name = model_name
         self.model_name_pattern = model_name_pattern
         self.cpu_model_name = std_model_name
         self.tpu_model_name = tpu_model_name
         self.labels_name = labels_name
-        self.tpu_segments_lists = tpu_segments_lists
+
+        self.MODEL_SEGMENTS = {
+            'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq': {
+                # 104.2 ms per inference
+                2: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'],
+                # 67.5 ms per inference
+                3: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'],
+                # 49.1 ms per inference
+                4: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'],
+                # 43.5 ms per inference
+                5: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'],
+                # 37.0 ms per inference
+                6: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'],
+                # 31.1 ms per inference
+                7: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_2_edgetpu.tflite'],
+                # 27.1 ms per inference
+                8: ['all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_segment_2_of_3_edgetpu.tflite'],
+            },
+            'efficientdet_lite2_448_ptq': {
+                # 32.1 ms per inference
+                2: ['all_segments_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'],
+                # 19.5 ms per inference
+                3: ['166x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '166x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'],
+                # 16.5 ms per inference
+                4: ['15x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_3_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_3_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_2_of_3_edgetpu.tflite'],
+                # 13.6 ms per inference
+                5: ['15x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'],
+                # 11.5 ms per inference
+                7: ['166x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '166x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'],
+                # 11.3 ms per inference
+                8: ['15x_first_seg_efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite'],
+            },
+            'efficientdet_lite3_512_ptq': {
+                # 20.9 ms per inference
+                4: ['15x_last_seg_efficientdet_lite3_512_ptq_segment_0_of_2_edgetpu.tflite', '15x_last_seg_efficientdet_lite3_512_ptq_segment_1_of_2_edgetpu.tflite'],
+            },
+            'efficientdet_lite3x_640_ptq': {
+                # 95.0 ms per inference
+                2: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'],
+                # 70.6 ms per inference
+                3: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_3_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_3_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_2_of_3_edgetpu.tflite'],
+                # 47.9 ms per inference
+                4: ['2x_first_seg_efficientdet_lite3x_640_ptq_segment_0_of_3_edgetpu.tflite', '2x_first_seg_efficientdet_lite3x_640_ptq_segment_1_of_3_edgetpu.tflite', '2x_first_seg_efficientdet_lite3x_640_ptq_segment_2_of_3_edgetpu.tflite'],
+                # 38.7 ms per inference
+                5: ['15x_first_seg_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', '15x_first_seg_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'],
+                # 35.1 ms per inference
+                6: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'],
+                # 30.6 ms per inference
+                7: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'],
+                # 27.3 ms per inference
+                8: ['all_segments_efficientdet_lite3x_640_ptq_segment_0_of_2_edgetpu.tflite', 'all_segments_efficientdet_lite3x_640_ptq_segment_1_of_2_edgetpu.tflite'],
+            },
+            'yolov5m-int8': {
+                # 56.3 ms per inference
+                2: ['all_segments_yolov5m-int8_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_2_edgetpu.tflite'],
+                # 32.2 ms per inference
+                3: ['15x_first_seg_yolov5m-int8_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov5m-int8_segment_1_of_2_edgetpu.tflite'],
+                # 25.9 ms per inference
+                4: ['2x_last_seg_yolov5m-int8_segment_0_of_4_edgetpu.tflite', '2x_last_seg_yolov5m-int8_segment_1_of_4_edgetpu.tflite', '2x_last_seg_yolov5m-int8_segment_2_of_4_edgetpu.tflite', '2x_last_seg_yolov5m-int8_segment_3_of_4_edgetpu.tflite'],
+                # 21.2 ms per inference
+                5: ['all_segments_yolov5m-int8_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_2_edgetpu.tflite'],
+                # 18.8 ms per inference
+                6: ['15x_last_seg_yolov5m-int8_segment_0_of_3_edgetpu.tflite', '15x_last_seg_yolov5m-int8_segment_1_of_3_edgetpu.tflite', '15x_last_seg_yolov5m-int8_segment_2_of_3_edgetpu.tflite'],
+                # 14.7 ms per inference
+                7: ['all_segments_yolov5m-int8_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_3_of_4_edgetpu.tflite'],
+                # 14.6 ms per inference
+                8: ['all_segments_yolov5m-int8_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov5m-int8_segment_2_of_3_edgetpu.tflite'],
+            },
+            'yolov5l-int8': {
+                # 61.1 ms per inference
+                3: ['all_segments_yolov5l-int8_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_3_edgetpu.tflite'],
+                # 48.0 ms per inference
+                4: ['all_segments_yolov5l-int8_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_3_of_4_edgetpu.tflite'],
+                # 39.0 ms per inference
+                5: ['all_segments_yolov5l-int8_segment_0_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_3_of_5_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_4_of_5_edgetpu.tflite'],
+                # 31.5 ms per inference
+                6: ['all_segments_yolov5l-int8_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_3_edgetpu.tflite'],
+                # 26.7 ms per inference
+                7: ['dumb_yolov5l-int8_segment_0_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_1_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_2_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_3_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_4_of_6_edgetpu.tflite', 'dumb_yolov5l-int8_segment_5_of_6_edgetpu.tflite'],
+                # 24.4 ms per inference
+                8: ['all_segments_yolov5l-int8_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov5l-int8_segment_3_of_4_edgetpu.tflite'],
+            },
+            'yolov8s_416_640px': {
+                # 25.6 ms per inference
+                3: ['166x_first_seg_yolov8s_416_640px_segment_0_of_2_edgetpu.tflite', '166x_first_seg_yolov8s_416_640px_segment_1_of_2_edgetpu.tflite'],
+            },
+            'yolov8m_416_640px': {
+                # 114.4 ms per inference
+                2: ['all_segments_yolov8m_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'],
+                # 71.9 ms per inference
+                3: ['all_segments_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite'],
+                # 53.0 ms per inference
+                4: ['2x_first_seg_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite'],
+                # 43.5 ms per inference
+                5: ['166x_first_seg_yolov8m_416_640px_segment_0_of_4_edgetpu.tflite', '166x_first_seg_yolov8m_416_640px_segment_1_of_4_edgetpu.tflite', '166x_first_seg_yolov8m_416_640px_segment_2_of_4_edgetpu.tflite', '166x_first_seg_yolov8m_416_640px_segment_3_of_4_edgetpu.tflite'],
+                # 31.8 ms per inference
+                6: ['2x_first_seg_yolov8m_416_640px_segment_0_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_1_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_2_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_3_of_5_edgetpu.tflite', '2x_first_seg_yolov8m_416_640px_segment_4_of_5_edgetpu.tflite'],
+                # 29.5 ms per inference
+                7: ['all_segments_yolov8m_416_640px_segment_0_of_4_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_4_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_2_of_4_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_3_of_4_edgetpu.tflite'],
+                # 26.0 ms per inference
+                8: ['all_segments_yolov8m_416_640px_segment_0_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_1_of_3_edgetpu.tflite', 'all_segments_yolov8m_416_640px_segment_2_of_3_edgetpu.tflite'],
+            },
+            'yolov8l_416_640px': {
+                # 169.6 ms per inference
+                2: ['all_segments_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'],
+                # 115.8 ms per inference
+                3: ['15x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'],
+                # 89.7 ms per inference
+                4: ['all_segments_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', 'all_segments_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'],
+                # 77.7 ms per inference
+                5: ['4x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', '4x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'],
+                # 64.2 ms per inference
+                6: ['15x_first_seg_yolov8l_416_640px_segment_0_of_2_edgetpu.tflite', '15x_first_seg_yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'],
+                # 57.3 ms per inference
+                7: ['3x_first_seg_yolov8l_416_640px_segment_0_of_3_edgetpu.tflite', '3x_first_seg_yolov8l_416_640px_segment_1_of_3_edgetpu.tflite', '3x_first_seg_yolov8l_416_640px_segment_2_of_3_edgetpu.tflite'],
+                # 52.2 ms per inference
+                8: ['166x_first_seg_yolov8l_416_640px_segment_0_of_3_edgetpu.tflite', '166x_first_seg_yolov8l_416_640px_segment_1_of_3_edgetpu.tflite', '166x_first_seg_yolov8l_416_640px_segment_2_of_3_edgetpu.tflite'],
+            },
+            'ipcam-general-v8': {
+                # 53.4 ms per inference
+                2: ['2x_last_seg_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', '2x_last_seg_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'],
+                # 24.3 ms per inference
+                3: ['all_segments_ipcam-general-v8_segment_0_of_2_edgetpu.tflite', 'all_segments_ipcam-general-v8_segment_1_of_2_edgetpu.tflite'],
+                # 19.9 ms per inference
+                4: ['15x_first_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '15x_first_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '15x_first_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'],
+                # 15.6 ms per inference
+                5: ['15x_last_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'],
+                # 15.2 ms per inference
+                6: ['15x_last_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '15x_last_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'],
+                # 12.3 ms per inference
+                7: ['15x_first_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '15x_first_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '15x_first_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'],
+                # 10.9 ms per inference
+                8: ['2x_last_seg_ipcam-general-v8_segment_0_of_3_edgetpu.tflite', '2x_last_seg_ipcam-general-v8_segment_1_of_3_edgetpu.tflite', '2x_last_seg_ipcam-general-v8_segment_2_of_3_edgetpu.tflite'],
+            },
+        }
+
+        self.tpu_segments_lists = {}
+        if model_name_pattern in self.MODEL_SEGMENTS:
+            self.tpu_segments_lists = self.MODEL_SEGMENTS[model_name_pattern]
+

 class Options:
@@ -36,123 +175,92 @@ def __init__(self):
         self.MODEL_SETTINGS = {
             "yolov8": {
                 # 59.88 ms throughput / 855.40 ms inference
-                "large": Settings('YOLOv8', 'yolov8l', \
+                "large": Settings('YOLOv8', 'yolov8l_416_640px',
                            'yolov8l_416_640px.tflite', # 46Mb CPU
                            'yolov8l_416_640px_edgetpu.tflite', # 48Mb TPU
-                           'coco_labels.txt',
-                           # 54.18 ms throughput / 754.56 ms inference
-                           [['yolov8l_416_640px_segment_0_of_2_edgetpu.tflite',
-                             'yolov8l_416_640px_segment_1_of_2_edgetpu.tflite'],
-                            # 55.79 ms throughput / 824.09 ms inference
-                            ['yolov8l_448px_segment_0_of_3_edgetpu.tflite',
-                             'yolov8l_448px_segment_1_of_3_edgetpu.tflite',
-                             'yolov8l_448px_segment_2_of_3_edgetpu.tflite']]),
+                           'coco_labels.txt'),
                 # 53.72 ms throughput / 762.86 ms inference
-                "medium": Settings('YOLOv8', 'yolov8m', \
-                           'yolov8m-416_640px.tflite', # 21Mb CPU
-                           'yolov8m-416_640px_edgetpu.tflite', # 22Mb TPU
-                           'coco_labels.txt',
-                           [['yolov8m__segment_0_of_2_edgetpu.tflite',
-                             'yolov8m_416_640px_segment_1_of_2_edgetpu.tflite'],
-                            # 39.59 ms throughput / 574.83 ms inference
-                            ['yolov8m_416_640px_segment_0_of_4_edgetpu.tflite',
-                             'yolov8m_416_640px_segment_1_of_4_edgetpu.tflite',
-                             'yolov8m_416_640px_segment_2_of_4_edgetpu.tflite',
-                             'yolov8m_416_640px_segment_3_of_4_edgetpu.tflite']]),
+                "medium": Settings('YOLOv8', 'yolov8m_416_640px', \
+                           'yolov8m_416_640px.tflite', # 21Mb CPU
+                           'yolov8m_416_640px_edgetpu.tflite', # 22Mb TPU
+                           'coco_labels.txt'),
                 # 21.52 ms throughput / 291.35 ms inference
-                "small": Settings('YOLOv8', 'yolov8s', \
+                "small": Settings('YOLOv8', 'yolov8s_416_640px',
                            'yolov8s_416_640px.tflite', # 11Mb CPU
                            'yolov8s_416_640px_edgetpu.tflite', # 12Mb TPU
-                           'coco_labels.txt', []),
+                           'coco_labels.txt'),
                 # 10.35 ms throughput / 123.35 ms inference
-                "tiny": Settings('YOLOv8', 'yolov8n',
+                "tiny": Settings('YOLOv8', 'yolov8n_416_640px',
                            'yolov8n_416_640px.tflite', # 4Mb CPU
                            'yolov8n_416_640px_edgetpu.tflite', # 3Mb TPU
-                           'coco_labels.txt', [])
+                           'coco_labels.txt')
             },
-
             "yolov5": {
-                "large": Settings('YOLOv5', 'yolov5l', \
+                "large": Settings('YOLOv5', 'yolov5l-int8',
                            'yolov5l-int8.tflite', # 46Mb CPU
                            'yolov5l-int8_edgetpu.tflite', # 48Mb TPU
-                           'coco_labels.txt',
-                           [['yolov5l-int8_edgetpu_segment_0_of_7_edgetpu.tflite',
-                             'yolov5l-int8_edgetpu_segment_1_of_7_edgetpu.tflite',
-                             'yolov5l-int8_edgetpu_segment_2_of_7_edgetpu.tflite',
-                             'yolov5l-int8_edgetpu_segment_3_of_7_edgetpu.tflite',
-                             'yolov5l-int8_edgetpu_segment_4_of_7_edgetpu.tflite',
-                             'yolov5l-int8_edgetpu_segment_5_of_7_edgetpu.tflite',
-                             'yolov5l-int8_edgetpu_segment_6_of_7_edgetpu.tflite']]),
-                "medium": Settings('YOLOv5', 'yolov5m', \
+                           'coco_labels.txt'),
+                "medium": Settings('YOLOv5', 'yolov5m-int8',
                            'yolov5m-int8.tflite', # 21Mb CPU
                            'yolov5m-int8_edgetpu.tflite', # 22Mb TPU
-                           'coco_labels.txt',
-                           [['yolov5m-int8_edgetpu_segment_0_of_4_edgetpu.tflite',
-                             'yolov5m-int8_edgetpu_segment_1_of_4_edgetpu.tflite',
-                             'yolov5m-int8_edgetpu_segment_2_of_4_edgetpu.tflite',
-                             'yolov5m-int8_edgetpu_segment_3_of_4_edgetpu.tflite']]),
-                "small": Settings('YOLOv5', 'yolov5s', \
+                           'coco_labels.txt'),
+                "small": Settings('YOLOv5', 'yolov5s-int8',
                            'yolov5s-int8.tflite', # 7Mb CPU
                            'yolov5s-int8_edgetpu.tflite', # 8Mb TPU
-                           'coco_labels.txt', []),
-                "tiny": Settings('YOLOv5', 'yolov5n', \
+                           'coco_labels.txt'),
+                "tiny": Settings('YOLOv5', 'yolov5n-int8',
                            'yolov5n-int8.tflite', # 2Mb CPU
                            'yolov5n-int8_edgetpu.tflite', # 2Mb TPU
-                           'coco_labels.txt', [])
+                           'coco_labels.txt')
             },
-
             "efficientdet-lite": {
                 # Large: EfficientDet-Lite3x 90 objects COCO 640x640x3 2 197.0 ms 43.9% mAP
-                "large": Settings('EfficientDet-Lite', 'efficientdet_lite3x_640', \
+                "large": Settings('EfficientDet-Lite', 'efficientdet_lite3x_640_ptq', \
                            'efficientdet_lite3x_640_ptq.tflite', # 14Mb CPU
                            'efficientdet_lite3x_640_ptq_edgetpu.tflite', # 20Mb TPU
-                           'coco_labels.txt',
-                           [['efficientdet_lite3x_640_ptq_segment_0_of_3_edgetpu.tflite',
-                             'efficientdet_lite3x_640_ptq_segment_1_of_3_edgetpu.tflite'
-                             'efficientdet_lite3x_640_ptq_segment_2_of_3_edgetpu.tflite']]),
+                           'coco_labels.txt'),
                 # Medium: EfficientDet-Lite3 90 objects 512x512x3 2 107.6 ms 39.4% mAP
-                "medium": Settings('EfficientDet-Lite', 'efficientdet_lite3_512', \
+                "medium": Settings('EfficientDet-Lite', 'efficientdet_lite3_512_ptq', \
                            'efficientdet_lite3_512_ptq.tflite', # CPU
                            'efficientdet_lite3_512_ptq_edgetpu.tflite', # TPU
-                           'coco_labels.txt', []),
+                           'coco_labels.txt'),
                 # Small: EfficientDet-Lite2 90 objects COCO 448x448x3 2 104.6 ms 36.0% mAP
-                "small": Settings('EfficientDet-Lite', 'efficientdet_lite2_448', \
+                "small": Settings('EfficientDet-Lite', 'efficientdet_lite2_448_ptq', \
                            'efficientdet_lite2_448_ptq.tflite', # 10Mb CPU
                            'efficientdet_lite2_448_ptq_edgetpu.tflite', # TPU
-                           'coco_labels.txt',
-                           [['efficientdet_lite2_448_ptq_segment_0_of_2_edgetpu.tflite',
-                             'efficientdet_lite2_448_ptq_segment_1_of_2_edgetpu.tflite']]),
+                           'coco_labels.txt'),
                 # Tiny: EfficientDet-Lite1 90 objects COCO 384x384x3 2 56.3 ms 34.3% mAP
-                "tiny": Settings('EfficientDet-Lite', 'efficientdet_lite1_384', \
+                "tiny": Settings('EfficientDet-Lite', 'efficientdet_lite1_384_ptq', \
                            'efficientdet_lite1_384_ptq.tflite', # 7Mb CPU
                            'efficientdet_lite1_384_ptq_edgetpu.tflite', # TPU
-                           'coco_labels.txt', [])
+                           'coco_labels.txt')
             },
             "mobilenet ssd": {
                 # Large: SSD/FPN MobileNet V1 90 objects, COCO 640x640x3 TF-lite v2 229.4 ms 31.1% mAP
-                "large": Settings('MobileNet SSD', 'tf2_ssd_mobilenet_v1_fpn_640', \
+                "large": Settings('MobileNet SSD', 'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq', \
                            'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq.tflite', # CPU
                            'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq_edgetpu.tflite', # TPU
-                           'coco_labels.txt', []),
+                           'coco_labels.txt'),
                 # Medium: SSDLite MobileDet 90 objects, COCO 320x320x3 TF-lite v1 9.1 ms 32.9% mAP
                 "medium": Settings('MobileNet SSD', 'ssdlite_mobiledet_coco_', \
                            'ssdlite_mobiledet_coco_qat_postprocess.tflite', # 5Mb CPU
                            'ssdlite_mobiledet_coco_qat_postprocess_edgetpu.tflite', # TPU
-                           'coco_labels.txt', []),
+                           'coco_labels.txt'),
                 # Small: SSD MobileNet V2 90 objects, COCO 300x300x3 TF-lite v2 7.6 ms 22.4% mAP
                 "small": Settings('MobileNet SSD', 'tf2_ssd_mobilenet_v2', \
                            'tf2_ssd_mobilenet_v2_coco17_ptq.tflite', # 6.7Mb CPU
                            'tf2_ssd_mobilenet_v2_coco17_ptq_edgetpu.tflite', # TPU
-                           'coco_labels.txt', []),
+                           'coco_labels.txt'),
                 # Tiny: MobileNet V2 90 objects, COCO 300x300x3 TF-lite v2 Quant
                 "tiny": Settings('MobileNet SSD', 'ssd_mobilenet_v2_coco_', \
                            'ssd_mobilenet_v2_coco_quant_postprocess.tflite', # 6.6Mb CPU
                            'ssd_mobilenet_v2_coco_quant_postprocess_edgetpu.tflite', # TPU
-                           'coco_labels.txt', [])
+                           'coco_labels.txt')
             }
         }

+        self.ENABLE_MULTI_TPU = True
         self.MIN_CONFIDENCE = 0.5
@@ -221,7 +329,7 @@ def set_model(self, model_name):
         # Normalise input
         self.model_name = model_name.lower()
-        if self.model_name not in [ "mobilenet ssd", "efficientdet-lite", "yolov5", "yolov8" ]: # 'yolov5' - no sense including v5 anymore
+        if self.model_name not in [ "mobilenet ssd", "efficientdet-lite", "yolov5", "yolov8"]: # 'yolov5' - no sense including v5 anymore
             self.model_name = "mobilenet ssd"

         self.model_size = self.model_size.lower()
@@ -254,5 +362,8 @@ def set_model(self, model_name):
         self.model_cpu_file = os.path.normpath(os.path.join(self.models_dir, self.cpu_model_name))
         self.model_tpu_file = os.path.normpath(os.path.join(self.models_dir, self.tpu_model_name))
         self.label_file = os.path.normpath(os.path.join(self.models_dir, self.labels_name))
-        self.tpu_segments_lists = [ [os.path.normpath(os.path.join(self.models_dir, name)) for name in name_list] \
-                                    for name_list in settings.tpu_segments_lists ]
+
+        self.tpu_segments_lists = {}
+        for tpu_cnt, name_list in settings.tpu_segments_lists.items():
+            self.tpu_segments_lists[tpu_cnt] = \
+                [os.path.normpath(os.path.join(self.models_dir, name)) for name in name_list]
diff --git a/src/modules/ObjectDetectionCoral/segment_and_test.py b/src/modules/ObjectDetectionCoral/segment_and_test.py
index ed7706b1..72db2542 100644
--- a/src/modules/ObjectDetectionCoral/segment_and_test.py
+++ b/src/modules/ObjectDetectionCoral/segment_and_test.py
@@ -10,20 +10,24 @@
          #'ssd_mobilenet_v2_coco_quant_postprocess',
          #'ssdlite_mobiledet_coco_qat_postprocess',
          #'ssd_mobilenet_v1_coco_quant_postprocess',
-         'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq',
+         #'tf2_ssd_mobilenet_v1_fpn_640x640_coco17_ptq',
          #'efficientdet_lite0_320_ptq',
          #'efficientdet_lite1_384_ptq',
-         'efficientdet_lite2_448_ptq',
-         'efficientdet_lite3_512_ptq',
-         'efficientdet_lite3x_640_ptq',
+         #'efficientdet_lite2_448_ptq',
+         #'efficientdet_lite3_512_ptq',
+         #'efficientdet_lite3x_640_ptq',
          #'yolov5n-int8',
          #'yolov5s-int8',
-         'yolov5m-int8',
-         'yolov5l-int8',
+         #'yolov5m-int8',
+         #'yolov5l-int8',
          #'yolov8n_416_640px', # lg 1st seg
-         'yolov8s_416_640px', # lg 1st seg
-         'yolov8m_416_640px', # lg 1st seg
-         'yolov8l_416_640px', # lg 1st seg
+         #'yolov8s_416_640px', # lg 1st seg
+         #'yolov8m_416_640px', # lg 1st seg
+         #'yolov8l_416_640px', # lg 1st seg
+         #'yolov8n_640px',
+         #'yolov8s_640px',
+         #'yolov8m_640px', # lg 1st seg
+         #'yolov8l_640px', # lg 1st seg
          'ipcam-general-v8']

 custom_args = {
@@ -178,7 +182,7 @@
          8: ["--partition_search_step","5"]}}#'''

 seg_dir = "/media/seth/FAT_THUMB/all_segments/"
-seg_types = ['', '2x_first_seg/', '15x_first_seg/', '166x_first_seg/', '3x_first_seg/', '4x_first_seg/', '15x_last_seg/', '2x_last_seg/', 'dumb/']
+seg_types = ['', '2x_first_seg/', '15x_first_seg/', '166x_first_seg/', '3x_first_seg/', '4x_first_seg/', 'inc_seg/', 'dumb/']


 def seg_exists(filename, segment_type, segment_count):
@@ -191,7 +195,7 @@ def seg_exists(filename, segment_type, segment_count):
     seg_list = [seg_dir+segment_type+filename+'_segment_{}_of_{}_edgetpu.tflite'.format(i, segment_count) for i in range(segment_count)]
     return (seg_list, any([True for s in seg_list if not os.path.exists(s)]))

-MAX_TPU_COUNT = 8
+MAX_TPU_COUNT = 4

 '''
 # Generate segment files
@@ -251,6 +255,17 @@ def seg_exists(filename, segment_type, segment_count):
 #      for (auto latency : latencies) {
 #
 # sudo make DOCKER_IMAGE="ubuntu:20.04" DOCKER_CPUS="k8" DOCKER_TARGETS="tools" docker-build
+
+        #// Encourage each segment slower than the previous to spread out the bottlenecks
+        #double latency_adjust = 1.0;
+        #for (int i = 1; i < num_segments_; ++i)
+        #{
+        #  if (latencies[i-1] < latencies[i])
+        #    latency_adjust *= 0.97;
+        #  latencies[i-1] *= latency_adjust;
+        #}
+        #latencies[num_segments_-1] *= latency_adjust;
+
         partition_with_profiling_dir = "libcoral/tools.2"
     elif '15x_first_seg' in seg_type:
         partition_with_profiling_dir = "libcoral/tools.15"
@@ -266,6 +281,8 @@ def seg_exists(filename, segment_type, segment_count):
         partition_with_profiling_dir = "libcoral/tools.last15"
     elif '2x_last_seg' in seg_type:
         partition_with_profiling_dir = "libcoral/tools.last2"
+    elif 'inc_seg' == seg_type:
+        partition_with_profiling_dir = "libcoral/tools.inc_seg"
     else:
         partition_with_profiling_dir = "libcoral/tools.orig"
@@ -281,7 +298,7 @@ def seg_exists(filename, segment_type, segment_count):
     subprocess.run(cmd)#'''

-seg_types += ['133x_first_seg/']
+seg_types += ['133x_first_seg/', '15x_last_seg/', '2x_last_seg/']

 # Test timings
 fin_timings = {}
@@ -293,15 +310,12 @@ def seg_exists(filename, segment_type, segment_count):
 for num_tpus in range(2,MAX_TPU_COUNT+1):
-    for seg_type in seg_types+['orig_code']:
+    for seg_type in seg_types:
         max_seg = 0
         for sn in range(1,num_tpus+1):
             # Test against orig code
-            if seg_type == 'orig_code':
-                exe_file = "/home/seth/CodeProject.AI-Server/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py"
-            else:
-                exe_file = "/home/seth/Downloads/coral_module/objectdetection_coral_multitpu.py"
+            exe_file = "/home/seth/CodeProject.AI-Server/src/modules/ObjectDetectionCoral/objectdetection_coral_multitpu.py"

             # Get file types
             seg_list, file_missing = seg_exists(fn, seg_type, sn)
@@ -312,19 +326,19 @@ def seg_exists(filename, segment_type, segment_count):
             cmd = ["python3",exe_file,"--model"] + \
                   seg_list + ["--labels","coral/pycoral/test_data/coco_labels.txt","--input","/home/seth/coral/pycoral/test_data/grace_hopper.bmp",
-                  "--count","1000","--num-tpus",str(num_tpus)]
+                  "--count","2000","--num-tpus",str(num_tpus)]
             print(cmd)
-            c = subprocess.run(cmd, capture_output=True)
+            c = subprocess.run(cmd, check=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             print(c.stdout)
             print(c.stderr)
-            ms_time = float(re.compile(r'threads; ([\d\.]+)ms ea').findall(c.stdout)[0])
+            ms_time = float(re.compile(r'threads; ([\d\.]+)ms ea').findall(c.stderr)[0])
             timings.append((ms_time, num_tpus, fn, seg_type, sn))

     timings = sorted(timings, key=lambda t: t[0])

     # Print the top three
     print(f"TIMINGS FOR {num_tpus} TPUs AND {fn} MODEL:")
-    for t in range(min(5,len(timings))):
+    for t in range(min(10,len(timings))):
         print(timings[t])

     # Get best segments, but
@@ -345,6 +359,12 @@ def seg_exists(filename, segment_type, segment_count):
             shutil.copyfile(s, out_fname)
             fin_fnames[fn][num_tpus].append(out_fname)
+
+    # Create archive for this model / TPU count
+    if any(fin_fnames[fn][num_tpus]):
+        cmd = ['zip', '-9', f'objectdetection-{fn}-{num_tpus}-edgetpu.zip'] + fin_fnames[fn][num_tpus]
+        print(cmd)
+        subprocess.run(cmd)
+
 print(fin_timings)
 print(fin_fnames)
@@ -352,6 +372,6 @@ def seg_exists(filename, segment_type, segment_count):
 for fn, v in fin_fnames.items():
     print(" '%s': {" % fn)
     for tpu_count, out_fnames in v.items():
-        print(f" # {fin_timings[fn][tpu_count][0]:6.1f} ms per inference") # assumes 1k test runs
+        print(f" # {fin_timings[fn][tpu_count][0]:6.1f} ms per inference")
         print(f" {tpu_count}: "+str(out_fnames)+",")
     print(" },")
diff --git a/src/modules/ObjectDetectionCoral/tpu_runner.py b/src/modules/ObjectDetectionCoral/tpu_runner.py
index 14fc9dbb..19647516 100644
--- a/src/modules/ObjectDetectionCoral/tpu_runner.py
+++ b/src/modules/ObjectDetectionCoral/tpu_runner.py
@@ -212,18 +212,18 @@ def __init__(self, tpu_list: list, fname_list: list):
         self.max_pipeline_queue_length = MAX_PIPELINE_QUEUE_LEN

-        self.fname_list = fname_list
-        self.tpu_list = tpu_list
-        self.interpreters = [[] for i in range(seg_count)]
+        self.fname_list   = fname_list
+        self.tpu_list     = tpu_list
+        self.interpreters = [[] for i in range(seg_count)]

         # Input queues for each segment; if we go over maxsize, something went wrong
         self.queues = [queue.Queue(maxsize=self.max_pipeline_queue_length) for i in range(seg_count)]

         # Lock for internal reorganization
-        self.balance_lock = threading.Lock()
+        self.balance_lock     = threading.Lock()

         # Lock for interpreter use
-        self.rebalancing_lock = threading.Lock()
+        self.rebalancing_lock = threading.Lock()

         # Read file data
         self.fbytes_list = []
@@ -239,20 +239,21 @@ def __init__(self, tpu_list: list, fname_list: list):
             with open(fname, "rb") as fd:
                 self.fbytes_list.append(fd.read())

-        self._init_interpreters()
+        with self.balance_lock:
+            self._init_interpreters()

     def _init_interpreters(self):
-
+        # Set a Time To Live for balancing so we don't thrash
+        self.balance_ttl = len(self.tpu_list) * 2
         start_boot_time = time.perf_counter_ns()

         # Fill TPUs with interpreters
-        with self.balance_lock:
-            for i, tpu_name in enumerate(self.tpu_list):
-                seg_idx = i % len(self.fname_list)
+        for i, tpu_name in enumerate(self.tpu_list):
+            seg_idx = i % len(self.fname_list)

-                i = DynamicInterpreter(self.fname_list, tpu_name, self.queues, self.rebalancing_lock)
-                i.start(seg_idx, self.fbytes_list[seg_idx])
-                self.interpreters[seg_idx].append(i)
+            i = DynamicInterpreter(self.fname_list, tpu_name, self.queues, self.rebalancing_lock)
+            i.start(seg_idx, self.fbytes_list[seg_idx])
+            self.interpreters[seg_idx].append(i)

         self.first_name = self.interpreters[0][0].input_details[0]['name']
@@ -261,73 +262,146 @@ def _init_interpreters(self):

     def enqueue(self, in_tensor, out_q: queue.Queue):
-        if not self.first_name:
-            self._init_interpreters()
+        with self.balance_lock:
+            if not self.first_name:
+                self._init_interpreters()

         self.queues[0].put(({self.first_name: in_tensor}, out_q))

+    def _eval_timings(self, interpreter_counts):
+        # How much time are we allocating for each segment
+        time_alloc = []
+        VALID_CNT_THRESH = 50
+
+        for seg_i in range(len(self.interpreters)):
+            # Find average runtime for this segment
+            avg_times = []
+            for interpreters in self.interpreters:
+                avg_times += [i.timings[seg_i] / i.exec_count[seg_i] for i in interpreters if i.exec_count[seg_i] > VALID_CNT_THRESH]
+
+            if avg_times:
+                avg_time = sum(avg_times) / len(avg_times)
+            else:
+                return 0, 0, 0.0, None
+
+            # Adjust for number of TPUs allocated to it
+            if interpreter_counts[seg_i] > 0:
+                time_alloc.append(avg_time / interpreter_counts[seg_i])
+            else:
+                # No interpreters means infinite time for this segment
+                time_alloc.append(float('inf'))
+
+        min_gt1_t = float('inf')
+        min_gt1_i = -1
+        max_t = 0.0
+        max_i = -1
+
+        # Find segments that maybe should swap
+        for i, t in enumerate(time_alloc):
+            # Max time needs to be shortened so add an interpreter.
+            if t > max_t:
+                max_t = t
+                max_i = i
+
+            # Min time needs to be lengthened so remove an interpreter,
+            # but only if it has more than one interpreter
+            if t < min_gt1_t and len(self.interpreters[i]) > 1:
+                min_gt1_t = t
+                min_gt1_i = i
+
+        # Only eval swapping max time segment if we have many samples in the current setup
+        for i in self.interpreters[max_i]:
+            if i.exec_count[max_i] < VALID_CNT_THRESH:
+                return min_gt1_i, max_i, max(time_alloc), None
+
+        # Undo avg interp count adjustment for TPU-to-TPU comparisons
+        max_t = max([i.timings[max_i] / i.exec_count[max_i] for i in self.interpreters[max_i]])
+
+        # See if we can do better than the current max time by swapping segments between TPUs
+        swap_i = None
+        swap_t = float('inf')
+        for interp_i, interpreters in enumerate(self.interpreters):
+            # Doesn't make sense to pull a TPU from a queue just to re-add it.
+            if interp_i == max_i:
+                continue
+
+            # Test all TPUs in this segment
+            for i in interpreters:
+                # If TPU hasn't yet been tried for this segment or ...
+                if i.exec_count[max_i] < VALID_CNT_THRESH:
+                    return min_gt1_i, max_i, max(time_alloc), interp_i
+
+                # Only calc valid time after a few runs
+                new_max_t = 0.0
+                if i.exec_count[max_i] > VALID_CNT_THRESH:
+                    new_max_t = i.timings[max_i] / i.exec_count[max_i]
+                new_swap_t = 0.0
+                if i.exec_count[interp_i] > VALID_CNT_THRESH:
+                    new_swap_t = i.timings[interp_i] / i.exec_count[interp_i]
+
+                # If this TPU has already been found to be faster on this segment
+                # and we aren't making the other segment the new worst
+                # and we are choosing the best available candidate.
+                if max_t-0.5 > new_max_t and max_t > new_swap_t and swap_t > new_max_t:
+                    swap_i = interp_i
+                    swap_t = new_max_t
+
+        return min_gt1_i, max_i, max(time_alloc), swap_i
+
+
     def balance_queues(self):
         # Don't bother if someone else is working on balancing
-        if len(self.queues) <= 1 or len(self.tpu_list) <= 2 or \
-           len(self.queues) == len(self.tpu_list) or \
+        if len(self.queues) <= 1 or len(self.tpu_list) < 2 or self.balance_ttl <= 0 or \
            not self.balance_lock.acquire(blocking=False):
             return

-        def eval_timings(interpreter_counts):
-            # How much time are we allocating for each segment
-            time_alloc = []
-
-            for idx in range(len(self.interpreters)):
-                # Find average runtime for this segment
-                avg_times = []
-                for interpreters in self.interpreters:
-                    avg_times += [i.timings[idx] / i.exec_count[idx] for i in interpreters if i.exec_count[idx] != 0]
-
-                if avg_times:
-                    avg_time = sum(avg_times) / len(avg_times)
-                else:
-                    return 0, 0, 0.0
+        interpreter_counts = [len(i) for i in self.interpreters]
+        min_i, max_i, current_max, swap_i = self._eval_timings(interpreter_counts)
+        interpreter_counts[min_i] -= 1
+        interpreter_counts[max_i] += 1
+        _, _, new_max, _ = self._eval_timings(interpreter_counts)

-                # Adjust for number of TPUs allocated to it
-                time_alloc.append(avg_time / interpreter_counts[idx])
+        if new_max+1.0 < current_max:
+            # 1st Priority: Allocate more TPUs to slow segments
+            logging.info(f"Re-balancing from queue {min_i} to {max_i} (max from {current_max:.2f} to {new_max:.2f})")

-            min_t = 100000000
-            min_i = -1
-            max_t = 0
-            max_i = -1
+            realloc_interp = self._rem_interpreter_from(min_i)

-            # Find segments that maybe should swap
-            for i, t in enumerate(time_alloc):
-                # Max time needs to be shortened so add an interpreter.
-                if t > max_t:
-                    max_t = t
-                    max_i = i
+            # Add to large (too-slow) queue
+            realloc_interp.start(max_i, self.fbytes_list[max_i])
+            self.interpreters[max_i].append(realloc_interp)

-                # Min time needs to be lengthened so rem an interpreter,
-                # but only if it has more than one interpreter
-                if t < min_t and len(self.interpreters[i]) > 1:
-                    min_t = t
-                    min_i = i
+        elif swap_i is not None:
+            # 2nd Priority: Swap slow segments with faster ones to see if we can
+            # run them faster. Hopefully still a good way to optimize for
+            # heterogeneous hardware.
+            logging.info(f"Auto-tuning between queues {swap_i} and {max_i}")
-            return min_i, max_i, max(time_alloc)
+            # Stop them
+            new_max  = self._rem_interpreter_from(swap_i)
+            new_swap = self._rem_interpreter_from(max_i)

-        interpreter_counts = [len(i) for i in self.interpreters]
-        min_i, max_i, current_max = eval_timings(interpreter_counts)
-        interpreter_counts[min_i] -= 1
-        interpreter_counts[max_i] += 1
-        _, _, new_max = eval_timings(interpreter_counts)
+            # Swap them
+            new_max.start(max_i, self.fbytes_list[max_i])
+            self.interpreters[max_i].append(new_max)

-        # Return if we don't want to swap (+/- 1 ms)
-        if new_max+1.0 >= current_max:
+            new_swap.start(swap_i, self.fbytes_list[swap_i])
+            self.interpreters[swap_i].append(new_swap)
+        else:
+            # Return if we don't want to swap
             self.balance_lock.release()
             return

-        logging.info(f"Re-balancing from queue {min_i} to {max_i} (max from {current_max:.2f} to {new_max:.2f})")
+        self.balance_ttl -= 1
+        self.balance_lock.release()
+        self.print_queue_len()
+
+    def _rem_interpreter_from(self, interp_i):
         # Sending False kills the processing loop
         self.rebalancing_lock.acquire()
-        self.queues[min_i].put(False)
+        self.queues[interp_i].put(False)

         # This is ugly, but I can't think of something better
         # Threads are blocked by queues. Queues may not have a stream
@@ -338,21 +412,15 @@ def eval_timings(interpreter_counts):
         # Block & wait
         realloc_interp = None
         with self.rebalancing_lock:
-            for idx, interpreter in enumerate(self.interpreters[min_i]):
+            for idx, interpreter in enumerate(self.interpreters[interp_i]):
                 if not interpreter.interpreter:
-                    realloc_interp = self.interpreters[min_i].pop(idx)
+                    realloc_interp = self.interpreters[interp_i].pop(idx)
                     break
+
         if not realloc_interp:
             logging.warning("Unable to find killed interpreter")
             self.balance_lock.release()
-            return
-
-        # Add to large (too-slow) queue
-        realloc_interp.start(max_i, self.fbytes_list[max_i])
-        self.interpreters[max_i].append(realloc_interp)
-
-        self.balance_lock.release()
-        self.print_queue_len()
+        return realloc_interp

     def print_queue_len(self):
@@ -464,11 +532,11 @@ def __init__(self, tpu_limit: int = -1):
     def _watchdog(self):
         self.watchdog_time = time.time()
         while not self.watchdog_shutdown:
-            if self.pipe and \
+            if self.pipe and self.pipe.first_name is None and \
                time.time() - self.watchdog_time > self.max_idle_secs_before_recycle:
                 logging.warning("No work in {} seconds, watchdog shutting down TPUs.".format(self.max_idle_secs_before_recycle))
                 self.runner_lock.acquire(timeout=MAX_WAIT_TIME)
-                if self.pipe.first_name:
+                if self.pipe:
                     self.pipe.delete()
                 self.runner_lock.release()
                 # Pipeline will reinitialize itself as needed
@@ -510,38 +578,33 @@ def _get_model_filenames(self, options: Options, tpu_list: list) -> list:
        more than one list of segment files then use the list of files that best
        matches the number of TPUs we have, otherwise use the single list we have.
        If all else fails return the single TPU filename as a list.
-       NOTE: This method also updates self.device_count and self.segment_count
-       based on the choice of whether it uses a single model or a set of
-       segment file names
        """

        # if TPU no-show then default is CPU
        self.device_type = 'CPU'
-       device_count  = 1   # CPU. At this point we don't know if we have TPU
-       segment_count = 1   # Single CPU model file
        if not any(tpu_list):
            return []
+       device_count = len(tpu_list)  # TPUs. We've at least found one
        self.device_type = 'Multi-TPU'
+       if device_count == 1:
+           self.device_type = 'TPU'

        # If TPU found then default is single TPU model file (no segments)
-       device_count  = len(tpu_list)  # TPUs. We've at least found one
-       segment_count = 1              # Single TPU model name at this point
-       if not any(options.tpu_segments_lists):
+       if not any(options.tpu_segments_lists) or device_count == 1:
            return [options.model_tpu_file]

        # We have a list of segment files
-       if isinstance(options.tpu_segments_lists[0], list):
+       if isinstance(options.tpu_segments_lists, dict):
            # Look for a good match between available TPUs and segment counts
-           # Prioritize first match
-           for fname_list in options.tpu_segments_lists:
-               segment_count = len(fname_list)
-               if segment_count <= device_count:
-                   return fname_list
+           # Prioritize first match. Note we have only tested up to 8 TPUs,
+           # so best performance above that can probably be had by extrapolation.
+           device_count = min(device_count, 8)
+           if device_count in options.tpu_segments_lists:
+               return options.tpu_segments_lists[device_count]
        else:
            # Only one list of segments; use it regardless of even match to TPU count
-           segment_count = len(options.tpu_segments_lists)
-           if segment_count <= device_count:
+           if len(options.tpu_segments_lists) <= device_count:
                return options.tpu_segments_lists

        # Couldn't find a good fit, use single segment
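
Taken together, the options.py and tpu_runner.py changes replace the old list-of-lists with a dictionary keyed by TPU count: Settings builds MODEL_SEGMENTS from the benchmark results, Options joins the filenames onto models_dir, and _get_model_filenames() looks up the detected device count (clamped to the 8 TPUs that were benchmarked), falling back to the single-TPU model. The following is a minimal, self-contained sketch of that selection behaviour; segment_files() and the 'assets' directory are hypothetical names used only for illustration, not the module's actual API.

import os

MODEL_SEGMENTS = {
    'yolov8s_416_640px': {
        3: ['166x_first_seg_yolov8s_416_640px_segment_0_of_2_edgetpu.tflite',
            '166x_first_seg_yolov8s_416_640px_segment_1_of_2_edgetpu.tflite'],
    },
}

def segment_files(model_pattern, models_dir, device_count, single_tpu_file):
    """Pick the segment files to load for the detected number of TPUs."""
    per_count = MODEL_SEGMENTS.get(model_pattern, {})
    device_count = min(device_count, 8)   # benchmarks only cover 2..8 TPUs
    if device_count <= 1 or device_count not in per_count:
        # No match: fall back to the unsegmented single-TPU model
        return [os.path.join(models_dir, single_tpu_file)]
    return [os.path.join(models_dir, f) for f in per_count[device_count]]

print(segment_files('yolov8s_416_640px', 'assets', 3, 'yolov8s_416_640px_edgetpu.tflite'))
print(segment_files('yolov8s_416_640px', 'assets', 4, 'yolov8s_416_640px_edgetpu.tflite'))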
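The reworked balance_queues() makes its decision in two stages: _eval_timings() estimates each segment's per-TPU time, and a TPU is moved from the fastest segment that has more than one interpreter to the slowest segment only if that cuts the predicted bottleneck by more than 1 ms; otherwise a TPU swap is considered, and balance_ttl caps how often any of this happens. The sketch below models only the "move" decision with made-up numbers; propose_move() is hypothetical, and the real code additionally requires roughly 50 samples per interpreter before trusting a timing and handles the swap path and locking.

def predicted_bottleneck(avg_seg_ms, tpus_per_seg):
    # A segment with no TPU assigned can never finish, so treat it as infinite.
    return max(t / n if n else float('inf') for t, n in zip(avg_seg_ms, tpus_per_seg))

def propose_move(avg_seg_ms, tpus_per_seg):
    """Return (donor_segment, receiver_segment) or None if not worth rebalancing."""
    per_tpu = [t / n if n else float('inf') for t, n in zip(avg_seg_ms, tpus_per_seg)]
    max_i = per_tpu.index(max(per_tpu))                      # slowest segment gets a TPU
    donors = [i for i, n in enumerate(tpus_per_seg) if n > 1 and i != max_i]
    if not donors:
        return None
    min_i = min(donors, key=lambda i: per_tpu[i])            # fastest multi-TPU segment gives one up
    trial = list(tpus_per_seg)
    trial[min_i] -= 1
    trial[max_i] += 1
    before = predicted_bottleneck(avg_seg_ms, tpus_per_seg)
    after  = predicted_bottleneck(avg_seg_ms, trial)
    return (min_i, max_i) if after + 1.0 < before else None  # only move for a >1 ms win

print(propose_move([30.0, 10.0], [1, 3]))   # -> (1, 0): shift a TPU to the slow segment
print(propose_move([20.0, 19.0], [2, 2]))   # -> None: not worth the churn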
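One easy-to-miss consequence of moving the benchmark output from print() to logging: with a default logging configuration those messages go to stderr rather than stdout, which is why segment_and_test.py now captures both streams and parses c.stderr for the timing line. A hedged example of that capture, with made-up paths and a model filename used purely for illustration:

import re
import subprocess

cmd = ["python3", "objectdetection_coral_multitpu.py",
       "--model", "model_edgetpu.tflite",
       "--labels", "coco_labels.txt",
       "--input", "grace_hopper.bmp",
       "--count", "2000"]
proc = subprocess.run(cmd, check=True, universal_newlines=True,
                      stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# The '...across N threads; X.XXms ea for final M inferences' line is on stderr now
match = re.search(r'threads; ([\d\.]+)ms ea', proc.stderr)
ms_per_inference = float(match.group(1)) if match else None
print(ms_per_inference)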