2 changes: 2 additions & 0 deletions README.md
@@ -9,6 +9,8 @@
 > [!NOTE]
 > Be mindful to install the backend before the Context Chat php app (the Context Chat php app sends all user-accessible files to the backend for indexing in the background; it is not an issue if a request reaches an uninitialised backend, since those files will be retried in the next background job run).
 >
+> The CPU (or the virtual CPU) should support AVX2 instructions for the embedder/LLM to work (a quick check is sketched after this diff).
+>
**Review comment (Member):** Needs to be added to the docs as well

 > The HTTP request timeout is 50 minutes for all requests and can be changed with the `request_timeout` app config for the php app `context_chat` using the occ command (`occ config:app:set context_chat request_timeout --value=3000`, value is in seconds). The same also needs to be done for the docker socket proxy. See [Slow responding ExApps](https://github.com/cloud-py-api/docker-socket-proxy?tab=readme-ov-file#slow-responding-exapps)
 >
 > An end-to-end example of how to build and register the backend manually (with CUDA) is at the end of this readme
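The AVX2 requirement added above can be verified on Linux by looking for the `avx2` flag in `/proc/cpuinfo`. A minimal sketch (the `cpu_has_avx2` helper is illustrative, not part of this repo):

```python
def cpu_has_avx2() -> bool:
    """Return True if the host CPU advertises AVX2 (Linux-only check)."""
    try:
        with open("/proc/cpuinfo") as f:
            # The "flags" line lists all CPU feature flags, e.g. "... avx avx2 ..."
            return any("avx2" in line for line in f if line.startswith("flags"))
    except OSError:
        return False

if __name__ == "__main__":
    print("AVX2 supported" if cpu_has_avx2()
          else "AVX2 missing: the embedder/LLM will not work")
```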
5 changes: 3 additions & 2 deletions config.cpu.yaml
@@ -20,13 +20,14 @@ embedding:
   port: 5000
   workers: 1
   offload_after_mins: 15 # in minutes
-  request_timeout: 1800 # in seconds
+  request_timeout: 1680 # in seconds
   llama:
     # 'model_alias' is reserved
     # 'embedding' is always set to True
     model: multilingual-e5-large-instruct-q6_k.gguf
-    n_batch: 16
+    n_batch: 512
     n_ctx: 8192
+    logits_all: false
 
 llm:
   nc_texttotext:
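For context on the changed keys: assuming the backend forwards the `llama` section to llama-cpp-python's `Llama` constructor (a plausible reading of the keys and the `model_alias`/`embedding` comments, not verified against the backend source), the new CPU values would map roughly as follows. A larger `n_batch` raises prompt-evaluation throughput at the cost of memory, and `logits_all: false` skips storing per-token logits, which embeddings do not need.

```python
from llama_cpp import Llama

# Sketch only: how the `llama` section of config.cpu.yaml might map onto
# llama-cpp-python kwargs if passed through near-verbatim (an assumption).
embedder = Llama(
    model_path="multilingual-e5-large-instruct-q6_k.gguf",
    embedding=True,    # the backend always forces this on, per the config comment
    n_ctx=8192,        # context window in tokens
    n_batch=512,       # tokens per batch; 512 vs 16 cuts prompt-eval round trips
    logits_all=False,  # per-token logits are not needed for embeddings; saves memory
)
vec = embedder.embed("hello world")  # returns the embedding vector
```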
5 changes: 3 additions & 2 deletions config.gpu.yaml
@@ -20,14 +20,15 @@ embedding:
   port: 5000
   workers: 1
   offload_after_mins: 15 # in minutes
-  request_timeout: 1800 # in seconds
+  request_timeout: 1680 # in seconds
   llama:
     # 'model_alias' is reserved
     # 'embedding' is always set to True
     model: multilingual-e5-large-instruct-q6_k.gguf
-    n_batch: 16
+    n_batch: 512
     n_ctx: 8192
     n_gpu_layers: -1
+    logits_all: false
 
 llm:
   nc_texttotext:
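The GPU config differs from the CPU one only by `n_gpu_layers: -1`; under the same assumption about how the section is consumed, that kwarg asks llama.cpp to offload all model layers to the GPU (and requires a CUDA-enabled build of llama-cpp-python):

```python
from llama_cpp import Llama

# Same sketch with the GPU config's extra key: n_gpu_layers=-1 offloads
# every layer to the GPU (needs a CUDA/cuBLAS build of llama-cpp-python).
embedder_gpu = Llama(
    model_path="multilingual-e5-large-instruct-q6_k.gguf",
    embedding=True,
    n_ctx=8192,
    n_batch=512,
    n_gpu_layers=-1,   # -1 = offload all layers to the GPU
    logits_all=False,
)
```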