Commit bfd0ba2

aashipandya authored and kaustubh-darekar committed
gcs file existence check and reprocess from last processed position check updates (#917)
Co-authored-by: kaustubh-darekar <[email protected]>
1 parent 642cac4 commit bfd0ba2

5 files changed: +31 additions, -24 deletions

README.md

Lines changed: 11 additions & 1 deletion
@@ -161,7 +161,17 @@ Allow unauthenticated request : Yes
 | LLM_MODEL_CONFIG_ollama_<model_name> | Optional | | Set ollama config as - model_name,model_local_url for local deployments |
 | RAGAS_EMBEDDING_MODEL | Optional | openai | embedding model used by ragas evaluation framework |
 
-
+## LLMs Supported
+1. OpenAI
+2. Gemini
+3. Azure OpenAI(dev)
+4. Anthropic(dev)
+5. Fireworks(dev)
+6. Groq(dev)
+7. Amazon Bedrock(dev)
+8. Ollama(dev)
+9. Diffbot
+10. Other OpenAI compatible baseurl models(dev)
 
 ## For local llms (Ollama)
 1. Pull the docker image of ollama
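
The Ollama row in the table above shows the expected value format: `model_name,model_local_url`. A hypothetical example for a llama3 model served by a local Ollama instance on its default port (the model name and URL are illustrative, not from the commit):

LLM_MODEL_CONFIG_ollama_llama3="llama3,http://localhost:11434"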

backend/src/document_sources/gcs_bucket.py

Lines changed: 13 additions & 6 deletions
@@ -59,8 +59,14 @@ def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, g
 
     if access_token is None:
         storage_client = storage.Client(project=gcs_project_id)
-        loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=load_document_content)
-        pages = loader.load()
+        bucket = storage_client.bucket(gcs_bucket_name)
+        blob = bucket.blob(blob_name)
+
+        if blob.exists():
+            loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=load_document_content)
+            pages = loader.load()
+        else:
+            raise Exception('File does not exist. Please re-upload the file and try again.')
     else:
         creds= Credentials(access_token)
         storage_client = storage.Client(project=gcs_project_id, credentials=creds)
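
The new guard uses the google-cloud-storage `Blob.exists()` call, which issues a metadata request without downloading the object, so a missing file fails fast with a clear message instead of erroring inside the loader. A minimal sketch of the pattern, with a hypothetical helper name and without the project's custom `loader_func`:

from google.cloud import storage
from langchain_community.document_loaders import GCSFileLoader

def load_gcs_blob(project_id: str, bucket_name: str, blob_name: str):
    """Load a GCS object as documents, failing fast if it is missing."""
    client = storage.Client(project=project_id)
    blob = client.bucket(bucket_name).blob(blob_name)
    if not blob.exists():  # metadata GET only; the object body is not fetched
        raise FileNotFoundError(f"gs://{bucket_name}/{blob_name} does not exist")
    loader = GCSFileLoader(project_name=project_id, bucket=bucket_name, blob=blob_name)
    return loader.load()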
@@ -77,7 +83,7 @@ def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, g
             text += page.extract_text()
         pages = [Document(page_content = text)]
     else:
-        raise Exception('Blob Not Found')
+        raise Exception(f'File Not Found in GCS bucket - {gcs_bucket_name}')
     return gcs_blob_filename, pages
 
 def upload_file_to_gcs(file_chunk, chunk_number, original_file_name, bucket_name, folder_name_sha1_hashed):
@@ -141,8 +147,9 @@ def copy_failed_file(source_bucket_name,dest_bucket_name,folder_name, file_name)
         storage_client = storage.Client()
         bucket = storage_client.bucket(source_bucket_name)
         folder_file_name = folder_name +'/'+file_name
-        source_blob = bucket.blob(folder_file_name)
-        bucket.copy_blob(source_blob,dest_bucket_name,file_name)
-        logging.info(f'Failed file {file_name} copied to {dest_bucket_name} from {source_bucket_name} in GCS successfully')
+        source_blob = source_bucket.blob(folder_file_name)
+        if source_blob.exists():
+            source_bucket.copy_blob(source_blob, dest_bucket, file_name)
+            logging.info(f'Failed file {file_name} copied to {dest_bucket_name} from {source_bucket_name} in GCS successfully')
     except Exception as e:
         raise Exception(e)
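
In google-cloud-storage, `Bucket.copy_blob()` takes a destination `Bucket` object and an optional new name; the rewritten lines pass bucket objects (`source_bucket`, `dest_bucket`, presumably defined in context not shown in this hunk) where the old code passed the bucket name string. A minimal sketch of the guarded copy, with hypothetical names:

from google.cloud import storage
import logging

def copy_if_exists(src_bucket_name: str, dst_bucket_name: str, blob_path: str, new_name: str):
    """Copy a blob between buckets only when the source object actually exists."""
    client = storage.Client()
    src_bucket = client.bucket(src_bucket_name)
    dst_bucket = client.bucket(dst_bucket_name)  # copy_blob wants a Bucket, not a name
    blob = src_bucket.blob(blob_path)
    if blob.exists():
        src_bucket.copy_blob(blob, dst_bucket, new_name)
        logging.info(f'Copied {blob_path} from {src_bucket_name} to {dst_bucket_name}')
    else:
        logging.warning(f'{blob_path} not found in {src_bucket_name}; skipping copy')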

backend/src/main.py

Lines changed: 6 additions & 5 deletions
@@ -522,8 +522,8 @@ def get_chunkId_chunkDoc_list(graph, file_name, pages, retry_condition):
     chunkId_chunkDoc_list=[]
     chunks = graph.query(QUERY_TO_GET_CHUNKS, params={"filename":file_name})
 
-    if chunks[0]['text'] is None or chunks[0]['text']=="" :
-        raise Exception(f"Chunks are not created for {file_name}. Please re-upload file and try.")
+    if chunks[0]['text'] is None or chunks[0]['text']=="" or not chunks :
+        raise Exception(f"Chunks are not created for {file_name}. Please re-upload file and try again.")
     else:
         for chunk in chunks:
             chunk_doc = Document(page_content=chunk['text'], metadata={'id':chunk['id'], 'position':chunk['position']})
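
One caveat with the new condition: `or` evaluates left to right, so the appended `not chunks` guard only runs after `chunks[0]` has already been indexed, and an empty query result would still raise IndexError before reaching it. A sketch of the ordering that makes the guard effective:

# check emptiness first so chunks[0] is never touched on an empty result
if not chunks or chunks[0]['text'] is None or chunks[0]['text'] == "":
    raise Exception(f"Chunks are not created for {file_name}. Please re-upload the file and try again.")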
@@ -532,15 +532,16 @@ def get_chunkId_chunkDoc_list(graph, file_name, pages, retry_condition):
     if retry_condition == START_FROM_LAST_PROCESSED_POSITION:
         logging.info(f"Retry : start_from_last_processed_position")
         starting_chunk = graph.query(QUERY_TO_GET_LAST_PROCESSED_CHUNK_POSITION, params={"filename":file_name})
-        if starting_chunk[0]["position"] < len(chunkId_chunkDoc_list):
+
+        if starting_chunk and starting_chunk[0]["position"] < len(chunkId_chunkDoc_list):
             return len(chunks), chunkId_chunkDoc_list[starting_chunk[0]["position"] - 1:]
 
-        elif starting_chunk[0]["position"] == len(chunkId_chunkDoc_list):
+        elif starting_chunk and starting_chunk[0]["position"] == len(chunkId_chunkDoc_list):
             starting_chunk = graph.query(QUERY_TO_GET_LAST_PROCESSED_CHUNK_WITHOUT_ENTITY, params={"filename":file_name})
             return len(chunks), chunkId_chunkDoc_list[starting_chunk[0]["position"] - 1:]
 
         else:
-            raise Exception(f"All chunks of {file_name} are already processed. If you want to re-process, please start from the beginning.")
+            raise Exception(f"All chunks of the file are already processed. If you want to re-process, please start from the beginning.")
 
     else:
         logging.info(f"Retry : start_from_beginning with chunks {len(chunkId_chunkDoc_list)}")
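
The resume positions appear to be 1-indexed: if the last processed chunk sits at position p, slicing from p - 1 restarts at that same chunk rather than skipping it. A small worked example under that assumption:

# stand-ins for chunk Documents, in position order (positions 1..5)
chunk_list = ['c1', 'c2', 'c3', 'c4', 'c5']
last_position = 3  # 1-indexed position returned by the graph query

resume = chunk_list[last_position - 1:]  # re-includes the chunk at position 3
print(resume)  # ['c3', 'c4', 'c5']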

backend/src/shared/constants.py

Lines changed: 1 addition & 11 deletions
@@ -1,14 +1,4 @@
-MODEL_VERSIONS = {
-    "openai-gpt-3.5": "gpt-3.5-turbo-0125",
-    "gemini-1.0-pro": "gemini-1.0-pro-001",
-    "gemini-1.5-pro": "gemini-1.5-pro-002",
-    "gemini-1.5-flash": "gemini-1.5-flash-002",
-    "openai-gpt-4": "gpt-4-turbo-2024-04-09",
-    "diffbot" : "gpt-4-turbo-2024-04-09",
-    "openai-gpt-4o-mini": "gpt-4o-mini-2024-07-18",
-    "openai-gpt-4o":"gpt-4o-2024-08-06",
-    "groq-llama3" : "llama3-70b-8192"
-}
+
 OPENAI_MODELS = ["openai-gpt-3.5", "openai-gpt-4o", "openai-gpt-4o-mini"]
 GEMINI_MODELS = ["gemini-1.0-pro", "gemini-1.5-pro", "gemini-1.5-flash"]
 GROQ_MODELS = ["groq-llama3"]
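
With the hard-coded `MODEL_VERSIONS` map removed, concrete version strings presumably come from the `LLM_MODEL_CONFIG_*` environment variables, and the remaining lists only identify the provider family. A hypothetical dispatch helper using those lists (illustrative, not in the repo):

OPENAI_MODELS = ["openai-gpt-3.5", "openai-gpt-4o", "openai-gpt-4o-mini"]
GEMINI_MODELS = ["gemini-1.0-pro", "gemini-1.5-pro", "gemini-1.5-flash"]
GROQ_MODELS = ["groq-llama3"]

def model_family(model_name: str) -> str:
    """Map a UI model name to its provider family (hypothetical helper)."""
    if model_name in OPENAI_MODELS:
        return 'openai'
    if model_name in GEMINI_MODELS:
        return 'gemini'
    if model_name in GROQ_MODELS:
        return 'groq'
    return 'other'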

backend/src/shared/schema_extraction.py

Lines changed: 0 additions & 1 deletion
@@ -2,7 +2,6 @@
 #from langchain_core.pydantic_v1 import BaseModel, Field
 from pydantic.v1 import BaseModel, Field
 from src.llm import get_llm
-from src.shared.constants import MODEL_VERSIONS
 from langchain_core.prompts import ChatPromptTemplate
 
 class Schema(BaseModel):
