Commit bfd0ba2

aashipandya authored and kaustubh-darekar committed
gcs file existence check and reprocess from last processed position check updates (#917)
Co-authored-by: kaustubh-darekar <[email protected]>
1 parent 642cac4 commit bfd0ba2

5 files changed: +31 additions, -24 deletions

README.md

Lines changed: 11 additions & 1 deletion
@@ -161,7 +161,17 @@ Allow unauthenticated request : Yes
 | LLM_MODEL_CONFIG_ollama_<model_name> | Optional | | Set ollama config as - model_name,model_local_url for local deployments |
 | RAGAS_EMBEDDING_MODEL | Optional | openai | embedding model used by ragas evaluation framework |
 
-
+## LLMs Supported
+1. OpenAI
+2. Gemini
+3. Azure OpenAI(dev)
+4. Anthropic(dev)
+5. Fireworks(dev)
+6. Groq(dev)
+7. Amazon Bedrock(dev)
+8. Ollama(dev)
+9. Diffbot
+10. Other OpenAI compatible baseurl models(dev)
 
 ## For local llms (Ollama)
 1. Pull the docker image of ollama
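
The Ollama row in the table above shows the expected value format: `model_name,model_local_url`. A hypothetical example for a llama3 model served by a local Ollama instance on its default port (the model name and URL are illustrative, not from the commit):

LLM_MODEL_CONFIG_ollama_llama3="llama3,http://localhost:11434"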

backend/src/document_sources/gcs_bucket.py

Lines changed: 13 additions & 6 deletions
@@ -59,8 +59,14 @@ def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, g
 
     if access_token is None:
         storage_client = storage.Client(project=gcs_project_id)
-        loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=load_document_content)
-        pages = loader.load()
+        bucket = storage_client.bucket(gcs_bucket_name)
+        blob = bucket.blob(blob_name)
+
+        if blob.exists():
+            loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=load_document_content)
+            pages = loader.load()
+        else:
+            raise Exception('File does not exist. Please re-upload the file and try again.')
     else:
         creds= Credentials(access_token)
         storage_client = storage.Client(project=gcs_project_id, credentials=creds)
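
The new guard uses the google-cloud-storage `Blob.exists()` call, which issues a metadata request without downloading the object, so a missing file fails fast with a clear message instead of erroring inside the loader. A minimal sketch of the pattern, with a hypothetical helper name and without the project's custom `loader_func`:

from google.cloud import storage
from langchain_community.document_loaders import GCSFileLoader

def load_gcs_blob(project_id: str, bucket_name: str, blob_name: str):
    """Load a GCS object as documents, failing fast if it is missing."""
    client = storage.Client(project=project_id)
    blob = client.bucket(bucket_name).blob(blob_name)
    if not blob.exists():  # metadata GET only; the object body is not fetched
        raise FileNotFoundError(f"gs://{bucket_name}/{blob_name} does not exist")
    loader = GCSFileLoader(project_name=project_id, bucket=bucket_name, blob=blob_name)
    return loader.load()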
@@ -77,7 +83,7 @@ def get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, g
             text += page.extract_text()
         pages = [Document(page_content = text)]
     else:
-        raise Exception('Blob Not Found')
+        raise Exception(f'File Not Found in GCS bucket - {gcs_bucket_name}')
     return gcs_blob_filename, pages
 
 def upload_file_to_gcs(file_chunk, chunk_number, original_file_name, bucket_name, folder_name_sha1_hashed):
@@ -141,8 +147,9 @@ def copy_failed_file(source_bucket_name,dest_bucket_name,folder_name, file_name)
         storage_client = storage.Client()
         bucket = storage_client.bucket(source_bucket_name)
         folder_file_name = folder_name +'/'+file_name
-        source_blob = bucket.blob(folder_file_name)
-        bucket.copy_blob(source_blob,dest_bucket_name,file_name)
-        logging.info(f'Failed file {file_name} copied to {dest_bucket_name} from {source_bucket_name} in GCS successfully')
+        source_blob = source_bucket.blob(folder_file_name)
+        if source_blob.exists():
+            source_bucket.copy_blob(source_blob, dest_bucket, file_name)
+            logging.info(f'Failed file {file_name} copied to {dest_bucket_name} from {source_bucket_name} in GCS successfully')
     except Exception as e:
         raise Exception(e)
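
In google-cloud-storage, `Bucket.copy_blob()` takes a destination `Bucket` object and an optional new name; the rewritten lines pass bucket objects (`source_bucket`, `dest_bucket`, presumably defined in context not shown in this hunk) where the old code passed the bucket name string. A minimal sketch of the guarded copy, with hypothetical names:

from google.cloud import storage
import logging

def copy_if_exists(src_bucket_name: str, dst_bucket_name: str, blob_path: str, new_name: str):
    """Copy a blob between buckets only when the source object actually exists."""
    client = storage.Client()
    src_bucket = client.bucket(src_bucket_name)
    dst_bucket = client.bucket(dst_bucket_name)  # copy_blob wants a Bucket, not a name
    blob = src_bucket.blob(blob_path)
    if blob.exists():
        src_bucket.copy_blob(blob, dst_bucket, new_name)
        logging.info(f'Copied {blob_path} from {src_bucket_name} to {dst_bucket_name}')
    else:
        logging.warning(f'{blob_path} not found in {src_bucket_name}; skipping copy')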

backend/src/main.py

Lines changed: 6 additions & 5 deletions
@@ -522,8 +522,8 @@ def get_chunkId_chunkDoc_list(graph, file_name, pages, retry_condition):
     chunkId_chunkDoc_list=[]
     chunks = graph.query(QUERY_TO_GET_CHUNKS, params={"filename":file_name})
 
-    if chunks[0]['text'] is None or chunks[0]['text']=="" :
-        raise Exception(f"Chunks are not created for {file_name}. Please re-upload file and try.")
+    if chunks[0]['text'] is None or chunks[0]['text']=="" or not chunks :
+        raise Exception(f"Chunks are not created for {file_name}. Please re-upload file and try again.")
     else:
         for chunk in chunks:
             chunk_doc = Document(page_content=chunk['text'], metadata={'id':chunk['id'], 'position':chunk['position']})
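
One caveat with the new condition: `or` evaluates left to right, so the appended `not chunks` guard only runs after `chunks[0]` has already been indexed, and an empty query result would still raise IndexError before reaching it. A sketch of the ordering that makes the guard effective:

# check emptiness first so chunks[0] is never touched on an empty result
if not chunks or chunks[0]['text'] is None or chunks[0]['text'] == "":
    raise Exception(f"Chunks are not created for {file_name}. Please re-upload the file and try again.")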
@@ -532,15 +532,16 @@ def get_chunkId_chunkDoc_list(graph, file_name, pages, retry_condition):
     if retry_condition == START_FROM_LAST_PROCESSED_POSITION:
         logging.info(f"Retry : start_from_last_processed_position")
         starting_chunk = graph.query(QUERY_TO_GET_LAST_PROCESSED_CHUNK_POSITION, params={"filename":file_name})
-        if starting_chunk[0]["position"] < len(chunkId_chunkDoc_list):
+
+        if starting_chunk and starting_chunk[0]["position"] < len(chunkId_chunkDoc_list):
             return len(chunks), chunkId_chunkDoc_list[starting_chunk[0]["position"] - 1:]
 
-        elif starting_chunk[0]["position"] == len(chunkId_chunkDoc_list):
+        elif starting_chunk and starting_chunk[0]["position"] == len(chunkId_chunkDoc_list):
             starting_chunk = graph.query(QUERY_TO_GET_LAST_PROCESSED_CHUNK_WITHOUT_ENTITY, params={"filename":file_name})
             return len(chunks), chunkId_chunkDoc_list[starting_chunk[0]["position"] - 1:]
 
         else:
-            raise Exception(f"All chunks of {file_name} are already processed. If you want to re-process, please start from the beginning.")
+            raise Exception(f"All chunks of the file are already processed. If you want to re-process, please start from the beginning.")
 
     else:
         logging.info(f"Retry : start_from_beginning with chunks {len(chunkId_chunkDoc_list)}")
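
The resume positions appear to be 1-indexed: if the last processed chunk sits at position p, slicing from p - 1 restarts at that same chunk rather than skipping it. A small worked example under that assumption:

# stand-ins for chunk Documents, in position order (positions 1..5)
chunk_list = ['c1', 'c2', 'c3', 'c4', 'c5']
last_position = 3  # 1-indexed position returned by the graph query

resume = chunk_list[last_position - 1:]  # re-includes the chunk at position 3
print(resume)  # ['c3', 'c4', 'c5']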

backend/src/shared/constants.py

Lines changed: 1 addition & 11 deletions
@@ -1,14 +1,4 @@
-MODEL_VERSIONS = {
-    "openai-gpt-3.5": "gpt-3.5-turbo-0125",
-    "gemini-1.0-pro": "gemini-1.0-pro-001",
-    "gemini-1.5-pro": "gemini-1.5-pro-002",
-    "gemini-1.5-flash": "gemini-1.5-flash-002",
-    "openai-gpt-4": "gpt-4-turbo-2024-04-09",
-    "diffbot" : "gpt-4-turbo-2024-04-09",
-    "openai-gpt-4o-mini": "gpt-4o-mini-2024-07-18",
-    "openai-gpt-4o":"gpt-4o-2024-08-06",
-    "groq-llama3" : "llama3-70b-8192"
-}
+
 OPENAI_MODELS = ["openai-gpt-3.5", "openai-gpt-4o", "openai-gpt-4o-mini"]
 GEMINI_MODELS = ["gemini-1.0-pro", "gemini-1.5-pro", "gemini-1.5-flash"]
 GROQ_MODELS = ["groq-llama3"]
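
With the hard-coded `MODEL_VERSIONS` map removed, concrete version strings presumably come from the `LLM_MODEL_CONFIG_*` environment variables, and the remaining lists only identify the provider family. A hypothetical dispatch helper using those lists (illustrative, not in the repo):

OPENAI_MODELS = ["openai-gpt-3.5", "openai-gpt-4o", "openai-gpt-4o-mini"]
GEMINI_MODELS = ["gemini-1.0-pro", "gemini-1.5-pro", "gemini-1.5-flash"]
GROQ_MODELS = ["groq-llama3"]

def model_family(model_name: str) -> str:
    """Map a UI model name to its provider family (hypothetical helper)."""
    if model_name in OPENAI_MODELS:
        return 'openai'
    if model_name in GEMINI_MODELS:
        return 'gemini'
    if model_name in GROQ_MODELS:
        return 'groq'
    return 'other'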

backend/src/shared/schema_extraction.py

Lines changed: 0 additions & 1 deletion
@@ -2,7 +2,6 @@
 #from langchain_core.pydantic_v1 import BaseModel, Field
 from pydantic.v1 import BaseModel, Field
 from src.llm import get_llm
-from src.shared.constants import MODEL_VERSIONS
 from langchain_core.prompts import ChatPromptTemplate
 
 class Schema(BaseModel):
