Commit a65e26d

WIP - AI.DAGRUN, commands code refactoring and improvement (#322)

* [wip] refactored TENSORGET, TENSORSET, and MODELRUN to be re-used by AI.DAGRUN
* [add] first version of dagrun with modelrun and persist working
* [wip] refactored non-command methods within redisai.c into dag, run_info, background_workers, and model_script_run_session
* [fix] fixed wrong includes
* [add] adding init methods to RAI_DagOp and RedisAI_RunInfo
* [wip] ai.tensorset, PERSIST and LOAD working as expected
* [add] dagrun's tensorset and tensorget working as expected
* [add] extended test for tensorset and tensorget
* [wip] wip on modelrun within dagrun
* [wip] first version of tensorget |> modelrun |> tensorget working
* [add] refactored RunInfo so that, instead of a single mctx and sctx pointer, it holds an array of (mctx | sctx) pointers (within RAI_DagOp); to be gradually adopted on modelrun and scriptrun (for now only on dagrun)
* [add] added redisai-py as a requirement for tests (it helps testing more complex patterns)
* [add] added test for the semantics of values replied from the local context (ensuring that a write after a read does not alter the tensorget read value)
* [wip] discover the DAGRUN device queue from the arguments of MODELRUN; if there is no MODELRUN, default to CPU
* [fix] fixed wrong reference passing on RedisAI_Parse_ModelRun_RedisCommand
* [fix] fixed wrong reference passing on RedisAI_Parse_ModelRun_RedisCommand from RedisAI_DagRunSession
* [wip] wip on minor optimizations
* [add] extended dag.h to have proper documentation
* [add] extended the model_script_run_session header file with documentation to better describe the context in which the RedisAI blocking commands MODELRUN and SCRIPTRUN operate
* [add] moved configuration properties and parsing out of redisai.c to config.h/c
* [add] backends_intra_op_parallelism and backends_inter_op_parallelism working as expected for TF
* [add] intra_op and inter_op parallelism working as expected for the TF backend
* [add] exclude perf profile reports from git
* [add] wip on mem sanitizer
* [add] working on RAI_FreeRunInfo and RAI_FreeDagOp
* [add] using RAI_InitRunInfo on RedisAI_ScriptRun_RedisCommand
* [add] using the array data type on RedisAI_RunInfo rinfo->outkeys
* [add] small leak fixes for dag
* [add] partial refactor of RedisAI_ScriptRun_RedisCommand to make use of RedisAI_RunInfo helper methods (consistent constructors and destructors among modelrun, scriptrun and dagrun)
* [add] kickoff negative testing of AI.DAGRUN
* [add] extended negative testing on dag and removed complexity of the tensor datatype (removed the possibility to retain RString)
* [add] extended AI.DAGRUN negative testing; fixed negative-testing leaks
* [add] more extensive tests and several touches on the same keys in the AI.DAGRUN CI
* Fixes for macOS and in general (#327)
* Prevent a DAG run info from being considered batchable
* Ensure sync on failing ONNX test

Co-authored-by: Luca Antiga <[email protected]>
1 parent f349a42 commit a65e26d

35 files changed: +3508 / -1095 lines
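
Based on the commit message, AI.DAGRUN chains AI.TENSORSET, AI.MODELRUN, and AI.TENSORGET operations into a single call, with LOAD and PERSIST controlling which tensor keys are read from and written back to the Redis keyspace. A minimal usage sketch of that flow, with illustrative key and model names (the exact argument layout may differ from what this WIP version accepts):

AI.DAGRUN LOAD 1 tensor_a PERSIST 1 tensor_out |>
    AI.TENSORSET tensor_b FLOAT 2 VALUES 2 3 |>
    AI.MODELRUN mymodel INPUTS tensor_a tensor_b OUTPUTS tensor_out |>
    AI.TENSORGET tensor_out VALUES

Per the commit message, the device queue for the whole DAG is discovered from the MODELRUN arguments, defaulting to CPU when no MODELRUN is present.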

.circleci/config.yml

Lines changed: 1 addition & 0 deletions
@@ -37,6 +37,7 @@ commands:
         command: |
           mkdir -p ~/workspace/tests
           make -C opt test SHOW=1
+        no_output_timeout: 20m
     - run:
         name: Package
         command: make -C opt pack BRANCH="${CIRCLE_BRANCH//[^A-Za-z0-9._-]/_}" INTO=~/workspace/packages SHOW=1

.gitignore

Lines changed: 12 additions & 0 deletions
@@ -3,6 +3,7 @@
 /build/
 /install*
 /test/venv/
+logs/
 /test/logs/
 .venv/
 venv*/
@@ -13,6 +14,9 @@ venv*/
 *.tar.gz
 /VARIANT
 
+### Cmake auto tools
+cmake-build-debug
+
 # Misc
 .DS_Store
 *.swp
@@ -73,6 +77,10 @@ __pycache__
 *.idb
 *.pdb
 
+# Debug/Profile files
+# ignore perf html reports
+*.html
+
 # Kernel Module Compile Results
 *.mod*
 *.cmd
@@ -86,6 +94,10 @@ dkms.conf
 # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
 # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
 
+.idea
+.idea/
+.idea/*
+
 # User-specific stuff:
 .idea/workspace.xml
 .idea/tasks.xml

src/CMakeLists.txt

Lines changed: 11 additions & 1 deletion
@@ -2,7 +2,13 @@ ADD_LIBRARY(redisai_obj OBJECT
     util/dict.c
     util/queue.c
     redisai.c
+    run_info.c
+    background_workers.c
+    model_script_run_session.c
+    config.c
+    dag.c
     backends.c
+    backends/util.c
     model.c
     err.c
     script.c
@@ -13,13 +19,14 @@ ADD_LIBRARY(redisai_obj OBJECT
     rmutil/args.c
     rmutil/heap.c
     rmutil/priority_queue.c
-    rmutil/vector.c)
+    rmutil/vector.c run_info.c)
 
 IF(BUILD_TF)
     ADD_LIBRARY(redisai_tensorflow_obj OBJECT
         backends/tensorflow.c
         backends/util.c
         err.c
+        util/dict.c
         tensor.c)
 ENDIF()
 
@@ -28,6 +35,7 @@ IF(BUILD_TFLITE)
         backends/tflite.c
         backends/util.c
         err.c
+        util/dict.c
         tensor.c)
 ENDIF()
 
@@ -36,6 +44,7 @@ IF(BUILD_TORCH)
        backends/torch.c
        backends/util.c
        err.c
+        util/dict.c
        tensor.c)
 ENDIF()
 
@@ -44,6 +53,7 @@ IF(BUILD_ORT)
        backends/onnxruntime.c
        backends/util.c
        err.c
+        util/dict.c
        tensor.c)
 ENDIF()

src/backends/tensorflow.c

Lines changed: 39 additions & 21 deletions
@@ -2,6 +2,7 @@
 #include "backends/util.h"
 #include "tensor.h"
 #include "util/arr_rm_alloc.h"
+#include "model.h"
 
 #include "tensorflow/c/c_api.h"
 
@@ -292,29 +293,57 @@ RAI_Model *RAI_ModelCreateTF(RAI_Backend backend, const char* devicestr, RAI_Mod
 
   if (device == RAI_DEVICE_CPU) {
     // Set number of GPU to 0 with
-    // config.device_count = {'GPU': 0}
-    uint8_t config[9] = {0x0a, 0x07, 0x0a, 0x03, 0x47, 0x50, 0x55, 0x10, 0x00};
-    TF_SetConfig(sessionOptions, (void *)config, 9, status);
-  }
-  else if (device == RAI_DEVICE_GPU) {
+    // config.device_count = {'GPU': 0}
+    uint8_t config[] = {0x0a, 0x07, 0x0a, 0x03, 0x47, 0x50, 0x55, 0x10, 0x00};
+    TF_SetConfig(sessionOptions, (void *)config, sizeof(config), optionsStatus);
+
+    if (TF_GetCode(optionsStatus) != TF_OK) {
+      RAI_SetError(error, RAI_EMODELCONFIGURE,
+                   RedisModule_Strdup(TF_Message(optionsStatus)));
+      // TODO: free memory
+      return NULL;
+    }
+
+    if (opts.backends_intra_op_parallelism > 0) {
+      uint8_t proto[] = {0x10, (uint8_t)opts.backends_intra_op_parallelism};
+      TF_SetConfig(sessionOptions, proto, sizeof(proto), optionsStatus);
+      if (TF_GetCode(optionsStatus) != TF_OK) {
+        RAI_SetError(error, RAI_EMODELCONFIGURE,
+                     RedisModule_Strdup(TF_Message(optionsStatus)));
+        // TODO: free memory
+        return NULL;
+      }
+    }
+
+    if (opts.backends_inter_op_parallelism > 0) {
+      uint8_t proto1[] = {0x28, (uint8_t)opts.backends_inter_op_parallelism};
+      TF_SetConfig(sessionOptions, proto1, sizeof(proto1), optionsStatus);
+      if (TF_GetCode(optionsStatus) != TF_OK) {
+        RAI_SetError(error, RAI_EMODELCONFIGURE,
+                     RedisModule_Strdup(TF_Message(optionsStatus)));
+        // TODO: free memory
+        return NULL;
+      }
+    }
+  } else if (device == RAI_DEVICE_GPU) {
     if (deviceid == -1) {
       // Set
       // config.gpu_options.allow_growth = True
       uint8_t config[4] = {0x32, 0x02, 0x20, 0x01};
-      TF_SetConfig(sessionOptions, (void *)config, 4, status);
+      TF_SetConfig(sessionOptions, (void *)config, 4, optionsStatus);
     }
     else {
       // Set
       // config.gpu_options.allow_growth = True
       // config.gpu_options.visible_device_list = '<deviceid>'
       uint8_t config[7] = {0x32, 0x05, 0x20, 0x01, 0x2a, 0x01, 0x30};
       config[6] += (uint8_t)deviceid;
-      TF_SetConfig(sessionOptions, (void *)config, 7, status);
+      TF_SetConfig(sessionOptions, (void *)config, 7, optionsStatus);
     }
   }
 
   if (TF_GetCode(optionsStatus) != TF_OK) {
-    RAI_SetError(error, RAI_EMODELCONFIGURE, RedisModule_Strdup(TF_Message(status)));
+    RAI_SetError(error, RAI_EMODELCONFIGURE, RedisModule_Strdup(TF_Message(optionsStatus)));
     // TODO: free memory
     return NULL;
   }
@@ -437,7 +466,7 @@ int RAI_ModelRunTF(RAI_ModelRunCtx** mctxs, RAI_Error *error) {
 
   size_t batch_sizes[nbatches];
   size_t batch_offsets[nbatches];
-  if (array_len(mctxs[0]->inputs) > 0) {
+  if (ninputs > 0) {
     for (size_t b=0; b<nbatches; ++b) {
       batch_sizes[b] = RAI_TensorDim(mctxs[b]->inputs[0].tensor, 0);
     }
@@ -453,7 +482,6 @@ int RAI_ModelRunTF(RAI_ModelRunCtx** mctxs, RAI_Error *error) {
     for (size_t b=0; b<nbatches; ++b) {
       batched_input_tensors[b] = mctxs[b]->inputs[i].tensor;
     }
-    // inputTensorsValues[i] = RAI_TFTensorFromTensor(mctx->inputs[i].tensor);
     inputTensorsValues[i] = RAI_TFTensorFromTensors(batched_input_tensors, nbatches);
     TF_Output port;
     port.oper = TF_GraphOperationByName(mctxs[0]->model->model, mctxs[0]->inputs[i].name);
@@ -495,21 +523,11 @@ int RAI_ModelRunTF(RAI_ModelRunCtx** mctxs, RAI_Error *error) {
 
   for(size_t i=0; i<noutputs; ++i) {
     for (size_t b=0; b<nbatches; b++) {
-      RAI_Tensor* output_tensor = RAI_TensorCreateFromTFTensor(outputTensorsValues[i], batch_offsets[b], batch_sizes[b]);
-      mctxs[b]->outputs[i].tensor = RAI_TensorGetShallowCopy(output_tensor);
-      RAI_TensorFree(output_tensor);
+      mctxs[b]->outputs[i].tensor = RAI_TensorCreateFromTFTensor(outputTensorsValues[i], batch_offsets[b], batch_sizes[b]);
     }
     TF_DeleteTensor(outputTensorsValues[i]);
   }
 
-  // TODO: add (make sure we deallocate once)
-  // for (size_t i=0 ; i<array_len(mctx->inputs); ++i) {
-  //   TF_DeleteTensor(inputTensorsValues[i]);
-  // }
-  // for (size_t i=0 ; i<array_len(mctx->outputs); ++i) {
-  //   TF_DeleteTensor(outputTensorsValues[i]);
-  // }
-
   TF_DeleteStatus(status);
 
   return 0;
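
The intra-op and inter-op parallelism options above are applied by appending hand-serialized tensorflow.ConfigProto bytes to the session options: 0x10 is the protobuf tag for field 2 (intra_op_parallelism_threads) and 0x28 the tag for field 5 (inter_op_parallelism_threads), each followed by the value as a varint. A minimal standalone C sketch of that encoding (not RedisAI code; the thread counts are illustrative):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Protobuf tag byte = (field_number << 3) | wire_type. Both fields are
   * varints (wire type 0): field 2 -> 0x10, field 5 -> 0x28. */
  int intra_op = 2;  /* illustrative thread counts */
  int inter_op = 4;

  uint8_t intra_proto[] = {0x10, (uint8_t)intra_op};  /* intra_op_parallelism_threads */
  uint8_t inter_proto[] = {0x28, (uint8_t)inter_op};  /* inter_op_parallelism_threads */

  /* Each pair is what the backend hands to TF_SetConfig(). A single byte only
   * encodes varint values 0..127, so larger thread counts would need
   * multi-byte varint encoding. */
  printf("intra: 0x%02x 0x%02x\n", intra_proto[0], intra_proto[1]);
  printf("inter: 0x%02x 0x%02x\n", inter_proto[0], inter_proto[1]);
  return 0;
}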

src/backends/util.c

Lines changed: 6 additions & 7 deletions
@@ -1,21 +1,20 @@
 #include "backends/util.h"
 
-int parseDeviceStr(const char* devicestr, RAI_Device* device, int64_t* deviceid) {
+int parseDeviceStr(const char* devicestr, RAI_Device* device,
+                   int64_t* deviceid) {
   if (strcasecmp(devicestr, "CPU") == 0) {
     *device = RAI_DEVICE_CPU;
     *deviceid = -1;
-  }
-  else if (strcasecmp(devicestr, "GPU") == 0) {
+  } else if (strcasecmp(devicestr, "GPU") == 0) {
     *device = RAI_DEVICE_GPU;
     *deviceid = -1;
-  }
-  else if (strncasecmp(devicestr, "GPU:", 4) == 0) {
+  } else if (strncasecmp(devicestr, "GPU:", 4) == 0) {
     *device = RAI_DEVICE_GPU;
     sscanf(devicestr, "GPU:%lld", deviceid);
-  }
-  else {
+  } else {
     return 0;
   }
 
   return 1;
 }
+
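
A small usage sketch for parseDeviceStr, assuming it is compiled inside the module tree so that backends/util.h (and, through config.h, the RAI_Device enum) is available; the device strings are illustrative:

#include <stdint.h>
#include <stdio.h>

#include "backends/util.h"

static void demo(const char *devicestr) {
  RAI_Device device;
  int64_t deviceid;
  if (parseDeviceStr(devicestr, &device, &deviceid)) {
    printf("%-6s -> device=%d deviceid=%lld\n",
           devicestr, (int)device, (long long)deviceid);
  } else {
    printf("%-6s -> unrecognized device string\n", devicestr);
  }
}

int main(void) {
  demo("cpu");    /* case-insensitive match: RAI_DEVICE_CPU, deviceid = -1 */
  demo("GPU");    /* RAI_DEVICE_GPU, deviceid = -1 (no explicit index)     */
  demo("GPU:1");  /* RAI_DEVICE_GPU, deviceid = 1                          */
  demo("TPU");    /* returns 0: unsupported device string                  */
  return 0;
}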

src/backends/util.h

Lines changed: 4 additions & 2 deletions
@@ -1,11 +1,13 @@
 #ifndef SRC_BACKENDS_UTIL_H_
 #define SRC_BACKENDS_UTIL_H_
 
-#include "config.h"
 #include <stdint.h>
 #include <stdio.h>
 #include <strings.h>
 
-int parseDeviceStr(const char* devicestr, RAI_Device* device, int64_t* deviceid);
+#include "config.h"
+
+int parseDeviceStr(const char* devicestr, RAI_Device* device,
+                   int64_t* deviceid);
 
 #endif /* SRC_BACKENDS_UTIL_H_ */
