Fix global threads count var update (#923)

alonre24 · web-flow · commit d16b14c4623b · 2022-06-02T11:36:34.000+03:00
* Update the global variable that holds the number of background threads from the main thread, to ensure synchronization when the onnx global structure is created
diff --git a/src/execution/background_workers.c b/src/execution/background_workers.c
@@ -21,15 +21,16 @@ int pthread_setname_np(const char *name);
 #endif
 #endif
 
-uintptr_t BGWorkersCounter; // Total number of BG threads running currently.
-pthread_key_t ThreadIdKey;  // Key to hold thread id in its local storage.
+uintptr_t LastThreadId;      // Last id assigned to a BG thread (first id is 1).
+pthread_key_t ThreadIdKey;   // Key to hold thread id in its local storage.
+unsigned int BGWorkersCount; // Total number of BG threads spawned.
 
 /**
  * @brief Save the id for some working thread in thread local storage.
  */
 static void _BGWorker_SaveThreadId() {
     // Let the current thread have the next available id, and increase the counter.
-    long id_value = __atomic_add_fetch(&BGWorkersCounter, 1, __ATOMIC_RELAXED);
+    long id_value = __atomic_add_fetch(&LastThreadId, 1, __ATOMIC_RELAXED);
     // Convert the id value to a pointer and store it the thread local storage.
     // First id is 1, so we won't confuse with NULL (which is the error return value)
     pthread_setspecific(ThreadIdKey, (const void *)id_value);
@@ -291,7 +292,7 @@ long BGWorker_GetThreadId() {
     return (long)(thread_id)-1;
 }
 
-uintptr_t BGWorker_GetThreadsCount() { return BGWorkersCounter; }
+uintptr_t BGWorker_GetThreadsCount() { return BGWorkersCount; }
 
 void *BGWorker_ThreadMain(void *arg) {
     _BGWorker_SaveThreadId();
diff --git a/src/execution/run_queue_info.c b/src/execution/run_queue_info.c
@@ -3,6 +3,8 @@
 #include "backends/backends.h"
 #include "background_workers.h"
 
+extern unsigned int BGWorkersCount;
+
 RunQueueInfo *RunQueue_Create(const char *device_str) {
 
     size_t device_str_len = strlen(device_str);
@@ -22,7 +24,7 @@ RunQueueInfo *RunQueue_Create(const char *device_str) {
         return NULL;
     }
 
-    // Create worker threads.
+    // Create worker threads, update the global counter.
     for (int i = 0; i < Config_GetNumThreadsPerQueue(); i++) {
         pthread_t thread;
         if (pthread_create(&thread, NULL, BGWorker_ThreadMain, run_queue_info) != 0) {
@@ -32,6 +34,7 @@ RunQueueInfo *RunQueue_Create(const char *device_str) {
         }
         run_queue_info->threads = array_append(run_queue_info->threads, thread);
     }
+    BGWorkersCount += Config_GetNumThreadsPerQueue();
 
     // Add the new device worker threads to onnx run sessions tracking.
     if (RAI_backends.onnx.add_new_device_cb) {
diff --git a/tests/flow/tests_onnx.py b/tests/flow/tests_onnx.py
@@ -2,6 +2,7 @@
 import sys
 import os
 import subprocess
+import psutil
 import redis
 from includes import *
 from RLTest import Env
@@ -554,6 +555,45 @@ def test_multiple_devices(self):
         self.env.assertEqual(backends_info['ai_onnxruntime_maximum_run_sessions_number'],
                              str(len(devices)*self.threads_per_queue))
 
+    # Stress test to validate that there is no race condition between the creation of the onnx global array (by
+    # the main thread), which contains an entry for every worker thread, and the background thread that runs the
+    # session and accesses this global array.
+    def test_synchronization(self):
+        if self.env.isCluster() or self.env.useSlaves or VALGRIND == 1:
+            self.env.debugPrint("skipping {} on cluster/slaves/valgrind modes".format(sys._getframe().f_code.co_name), force=True)
+            return
+
+        model_pb = load_file_content('mul_1.onnx')
+
+        def launch_redis_and_run_onnx(con, proc_id, pipes):
+            my_pipe = pipes[proc_id]
+            port = 6380 + proc_id  # Let every subprocess run on a fresh port.
+            redis_server = subprocess.Popen(['redis-server', '--port', str(port),
+                                             '--loadmodule', f'{ROOT}/install-{DEVICE.lower()}/redisai.so',
+                                             '--logfile', f'{self.env.logDir}/test_onnx_kill_switch_synchronization-{port}.log',
+                                             '--dir', f'{self.env.logDir}',
+                                             '--dbfilename', f'test_onnx_kill_switch_synchronization-{port}.rdb'])
+            # Wait until redis-server is up and ready to accept connections.
+            while len([c for c in psutil.net_connections("tcp")
+                       if c.pid == redis_server.pid and c.laddr.port == port]) == 0:
+                time.sleep(1)
+            # Create a connection to Redis that immediately loads and executes an onnx model. This tests that
+            # synchronization was done properly - otherwise, execution might cause a server crash.
+            r = redis.Redis(host='localhost', port=port)
+            r.flushall()
+            r.execute_command('AI.MODELSTORE', 'mul{1}', 'ONNX', 'CPU', 'BLOB', model_pb)
+            r.execute_command('AI.TENSORSET', 'a{1}', 'FLOAT', 3, 2, 'VALUES', 1.0, 2.0, 3.0, 4.0, 5.0, 6.0)
+            r.execute_command('AI.MODELEXECUTE', 'mul{1}', 'INPUTS', 1, 'a{1}', 'OUTPUTS', 1, 'b{1}')
+            my_pipe.send(1)  # To indicate that the flow was executed with success.
+            redis_server.kill()
+
+        num_parallel_clients = 50
+        parent_end_pipes, children_end_pipes = get_parent_children_pipes(num_parallel_clients)
+        run_test_multiproc(self.env, '{1}', num_parallel_clients, launch_redis_and_run_onnx,
+                           args=(children_end_pipes, ))
+        # Assert that all sub-processes have finished successfully.
+        self.env.assertEqual(sum([p.recv() for p in parent_end_pipes]), num_parallel_clients)
+
 
 def test_forbidden_external_initializers(env):
     if not TEST_ONNX: