diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 1260a5f..0ab2bfa 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -79,7 +79,7 @@ jobs:
       - name: Run coverage separately
         if: matrix.test-env == 'py312'
         run: |
-          uv run --frozen pytest test/test_client.py test/test_integration.py --cov=src/apihub_client --cov-report=xml --cov-report=html --cov-fail-under=85
+          uv run --frozen pytest test/ --cov=src/apihub_client --cov-report=xml --cov-report=html --cov-fail-under=85

       - name: Render the report to the PR
         if: matrix.test-env == 'py312' && github.event_name == 'pull_request'
diff --git a/CLAUDE.md b/CLAUDE.md
index b79daab..f748a8c 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -65,3 +65,5 @@ This document contains critical information about working with this codebase. Fo
 - Follow existing patterns
 - Test thoroughly
 - Use Context7 mcp server if available to get latest documents.
+
+- Don't mention Claude Code anywhere in code, commit messages, PR descriptions, or comments.
diff --git a/README.md b/README.md
index 649051a..cbaea6f 100644
--- a/README.md
+++ b/README.md
@@ -63,6 +63,312 @@ print(result)
 ## 🛠️ Common Use Cases

+### Document Splitter API
+
+Split documents into smaller parts using the doc-splitter service:
+
+```python
+from apihub_client import DocSplitterClient
+
+# Initialize the doc-splitter client
+doc_client = DocSplitterClient(
+    api_key="your-api-key-here",
+    base_url="http://localhost:8005"
+)
+
+# Simple upload and wait for completion
+result = doc_client.upload(
+    file_path="large_document.pdf",
+    wait_for_completion=True,
+    polling_interval=5  # Check status every 5 seconds
+)
+
+# Download the split result
+output_file = doc_client.download_result(
+    job_id=result["job_id"],
+    output_path="split_result.zip"
+)
+print(f"Downloaded result to: {output_file}")
+```
+
+#### Step-by-Step Doc-Splitter Processing
+
+```python
+# Step 1: Upload document
+upload_result = doc_client.upload(file_path="document.pdf")
+job_id = upload_result["job_id"]
+print(f"Upload completed. Job ID: {job_id}")
+
+# Step 2: Monitor status manually
+status = doc_client.get_job_status(job_id)
+print(f"Current status: {status['status']}")
+
+# Step 3: Wait for completion (with custom timeout)
+final_result = doc_client.wait_for_completion(
+    job_id=job_id,
+    timeout=600,  # Wait up to 10 minutes
+    polling_interval=3  # Check every 3 seconds
+)
+
+# Step 4: Download the processed result
+downloaded_file = doc_client.download_result(
+    job_id=job_id,
+    output_path="processed_document.zip"
+)
+print(f"Processing complete! Downloaded: {downloaded_file}")
+```
+
+#### Batch Processing with Doc-Splitter
+
+```python
+from pathlib import Path
+
+def process_documents_batch(file_paths):
+    """Process multiple documents with doc-splitter."""
+    results = []
+
+    for file_path in file_paths:
+        try:
+            print(f"Processing {file_path}...")
+
+            # Upload and wait for completion
+            result = doc_client.upload(
+                file_path=file_path,
+                wait_for_completion=True,
+                polling_interval=5
+            )
+
+            # Generate output filename
+            input_name = Path(file_path).stem
+            output_path = f"{input_name}_split.zip"
+
+            # Download result
+            downloaded_file = doc_client.download_result(
+                job_id=result["job_id"],
+                output_path=output_path
+            )
+
+            results.append({
+                "input": file_path,
+                "output": downloaded_file,
+                "job_id": result["job_id"],
+                "success": True
+            })
+
+        except Exception as e:
+            print(f"Failed to process {file_path}: {e}")
+            results.append({
+                "input": file_path,
+                "error": str(e),
+                "success": False
+            })
+
+    return results
+
+# Process multiple files
+files = ["document1.pdf", "document2.pdf", "document3.pdf"]
+results = process_documents_batch(files)
+
+# Summary
+successful = [r for r in results if r["success"]]
+failed = [r for r in results if not r["success"]]
+print(f"Processed: {len(successful)} successful, {len(failed)} failed")
+```
+
+### Generic Unstract API
+
+Process documents using dynamic endpoints such as invoice, contract, and receipt:
+
+```python
+from apihub_client import GenericUnstractClient
+
+# Initialize the generic client
+client = GenericUnstractClient(
+    api_key="your-api-key-here",
+    base_url="http://localhost:8005"
+)
+
+# Simple processing with automatic completion waiting
+result = client.process(
+    endpoint="invoice",
+    file_path="invoice.pdf",
+    wait_for_completion=True,
+    polling_interval=5  # Check status every 5 seconds
+)
+print("Invoice processing completed:", result)
+```
+
+#### Step-by-Step Generic API Processing
+
+```python
+# Step 1: Start processing
+process_result = client.process(
+    endpoint="contract",
+    file_path="contract.pdf"
+)
+execution_id = process_result["execution_id"]
+print(f"Processing started. 
Execution ID: {execution_id}") + +# Step 2: Check status manually +status = client.check_status("contract", execution_id) +print(f"Current status: {status}") + +# Step 3: Wait for completion (with custom timeout) +final_result = client.wait_for_completion( + endpoint="contract", + execution_id=execution_id, + timeout=600, # Wait up to 10 minutes + polling_interval=3 # Check every 3 seconds +) + +# Step 4: Get result later (if needed) +result = client.get_result("contract", execution_id) +print("Processing complete:", result) +``` + +#### Batch Processing with Generic APIs + +```python +def process_documents_batch(endpoint, file_paths): + """Process multiple documents with the same endpoint.""" + results = [] + + for file_path in file_paths: + try: + print(f"Processing {file_path} with {endpoint} endpoint...") + + # Process and wait for completion + result = client.process( + endpoint=endpoint, + file_path=file_path, + wait_for_completion=True, + polling_interval=5 + ) + + results.append({ + "input": file_path, + "execution_id": result["execution_id"], + "result": result, + "success": True + }) + + except Exception as e: + print(f"Failed to process {file_path}: {e}") + results.append({ + "input": file_path, + "error": str(e), + "success": False + }) + + return results + +# Process multiple invoices +invoice_files = ["invoice1.pdf", "invoice2.pdf", "invoice3.pdf"] +results = process_documents_batch("invoice", invoice_files) + +# Process multiple contracts +contract_files = ["contract1.pdf", "contract2.pdf"] +contract_results = process_documents_batch("contract", contract_files) + +# Summary +successful = [r for r in results if r["success"]] +failed = [r for r in results if not r["success"]] +print(f"Processed: {len(successful)} successful, {len(failed)} failed") +``` + +### Integration: Doc-Splitter + Extraction APIs + +Combine doc-splitter with extraction APIs for complete document processing: + +```python +from apihub_client import ApiHubClient, DocSplitterClient + +# Initialize both clients +api_client = ApiHubClient( + api_key="your-api-key", + base_url="https://api-hub.us-central.unstract.com/api/v1" +) + +doc_splitter = DocSplitterClient( + api_key="your-api-key", + base_url="http://localhost:8005" +) + +# Step 1: Split the large document +split_result = doc_splitter.upload( + file_path="large_contract.pdf", + wait_for_completion=True +) + +# Step 2: Download split result +doc_splitter.download_result( + job_id=split_result["job_id"], + output_path="split_documents.zip" +) + +# Step 3: Process individual documents (example with one document) +# (assuming you extract individual PDFs from the zip) +table_result = api_client.extract( + endpoint="bank_statement", + vertical="table", + sub_vertical="bank_statement", + file_path="individual_page.pdf", + wait_for_completion=True +) +print("Extracted data:", table_result) +``` + +### Complete Workflow: All Three Clients + +```python +from apihub_client import ApiHubClient, DocSplitterClient, GenericUnstractClient + +# Initialize all clients +api_client = ApiHubClient( + api_key="your-api-key", + base_url="https://api-hub.us-central.unstract.com/api/v1" +) + +doc_splitter = DocSplitterClient( + api_key="your-api-key", + base_url="http://localhost:8005" +) + +generic_client = GenericUnstractClient( + api_key="your-api-key", + base_url="http://localhost:8005" +) + +# Workflow: Split → Extract → Process with Generic API +# Step 1: Split large document +split_result = doc_splitter.upload( + file_path="large_document.pdf", + wait_for_completion=True 
+) + +# Step 2: Extract tables from split documents +# (after extracting individual files from the zip) +table_result = api_client.extract( + endpoint="discover_tables", + vertical="table", + sub_vertical="discover_tables", + file_path="split_page_1.pdf", + wait_for_completion=True +) + +# Step 3: Process with generic invoice API +invoice_result = generic_client.process( + endpoint="invoice", + file_path="split_page_2.pdf", + wait_for_completion=True +) + +print("Complete workflow finished!") +print("Tables extracted:", len(table_result.get('data', []))) +print("Invoice processed:", invoice_result.get('execution_id')) +``` + ### All Table Extraction API ```python @@ -208,6 +514,32 @@ client = ApiHubClient(api_key: str, base_url: str) - `api_key` (str): Your API key for authentication - `base_url` (str): The base URL of the ApiHub service +### DocSplitterClient + +Client for interacting with doc-splitter APIs for document splitting operations. + +```python +doc_client = DocSplitterClient(api_key: str, base_url: str) +``` + +**Parameters:** + +- `api_key` (str): Your API key for authentication +- `base_url` (str): The base URL of the doc-splitter service + +### GenericUnstractClient + +Client for interacting with generic Unstract APIs using dynamic endpoints. + +```python +generic_client = GenericUnstractClient(api_key: str, base_url: str) +``` + +**Parameters:** + +- `api_key` (str): Your API key for authentication +- `base_url` (str): The base URL of the Unstract service + #### Methods ##### extract() @@ -300,18 +632,200 @@ wait_for_complete( - `ApiHubClientException`: If processing fails or times out +#### DocSplitterClient Methods + +##### upload() + +Upload a document for splitting. + +```python +upload( + file_path: str, + wait_for_completion: bool = False, + polling_interval: int = 5, +) -> dict +``` + +**Parameters:** + +- `file_path` (str): Path to the file to upload +- `wait_for_completion` (bool): If True, polls until completion and returns final result +- `polling_interval` (int): Seconds between status checks when waiting (default: 5) + +**Returns:** + +- `dict`: Response containing job_id and status information + +##### get_job_status() + +Check the status of a splitting job. + +```python +get_job_status(job_id: str) -> dict +``` + +**Parameters:** + +- `job_id` (str): The job ID to check status for + +**Returns:** + +- `dict`: Status information including current processing state + +##### download_result() + +Download the result of a completed splitting job. + +```python +download_result( + job_id: str, + output_path: str | None = None +) -> str +``` + +**Parameters:** + +- `job_id` (str): The job ID to download results for +- `output_path` (str, optional): Path where to save the downloaded file. If None, uses 'result\_{job_id}.zip' + +**Returns:** + +- `str`: Path to the downloaded file + +##### wait_for_completion() + +Wait for a splitting job to complete by polling its status. + +```python +wait_for_completion( + job_id: str, + timeout: int = 600, + polling_interval: int = 3 +) -> dict +``` + +**Parameters:** + +- `job_id` (str): The job ID to wait for +- `timeout` (int): Maximum time to wait in seconds (default: 600) +- `polling_interval` (int): Seconds between status checks (default: 3) + +**Returns:** + +- `dict`: Final job status information when completed + +**Raises:** + +- `ApiHubClientException`: If processing fails or times out + +#### GenericUnstractClient Methods + +##### process() + +Process a document using the specified endpoint. 
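+
+For example, a minimal call (illustrative values, assuming a locally running service with a deployed `invoice` endpoint) looks like this; the full signature follows:
+
+```python
+result = generic_client.process(endpoint="invoice", file_path="invoice.pdf")
+```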
+ +```python +process( + endpoint: str, + file_path: str, + wait_for_completion: bool = False, + polling_interval: int = 5, + timeout: int = 600, +) -> dict +``` + +**Parameters:** + +- `endpoint` (str): The endpoint name (e.g., 'invoice', 'contract', 'receipt') +- `file_path` (str): Path to the file to upload +- `wait_for_completion` (bool): If True, polls until completion and returns final result +- `polling_interval` (int): Seconds between status checks when waiting (default: 5) +- `timeout` (int): Maximum time to wait for completion in seconds (default: 600) + +**Returns:** + +- `dict`: Response containing execution_id and processing information + +##### get_result() + +Get the result of a processing operation. + +```python +get_result(endpoint: str, execution_id: str) -> dict +``` + +**Parameters:** + +- `endpoint` (str): The endpoint name used for processing +- `execution_id` (str): The execution ID to get results for + +**Returns:** + +- `dict`: Processing result or status information + +##### wait_for_completion() + +Wait for a processing operation to complete by polling its status. + +```python +wait_for_completion( + endpoint: str, + execution_id: str, + timeout: int = 600, + polling_interval: int = 3, +) -> dict +``` + +**Parameters:** + +- `endpoint` (str): The endpoint name used for processing +- `execution_id` (str): The execution ID to wait for +- `timeout` (int): Maximum time to wait in seconds (default: 600) +- `polling_interval` (int): Seconds between status checks (default: 3) + +**Returns:** + +- `dict`: Final processing result when completed + +##### check_status() + +Check the current status of a processing operation. + +```python +check_status(endpoint: str, execution_id: str) -> str | None +``` + +**Parameters:** + +- `endpoint` (str): The endpoint name used for processing +- `execution_id` (str): The execution ID to check status for + +**Returns:** + +- `str | None`: Current status string, or None if not available + +**Raises:** + +- `ApiHubClientException`: If processing fails or times out + ### Exception Handling +All clients (`ApiHubClient`, `DocSplitterClient`, and `GenericUnstractClient`) use the same exception handling: + ```python -from apihub_client import ApiHubClientException +from apihub_client import ApiHubClientException, GenericUnstractClient + +generic_client = GenericUnstractClient(api_key="key", base_url="http://localhost:8005") try: - result = client.extract( - endpoint="bank_statement", - vertical="table", - sub_vertical="bank_statement", - file_path="document.pdf" + result = generic_client.process( + endpoint="invoice", + file_path="invoice.pdf", + wait_for_completion=True ) + + print("Processing completed:", result["execution_id"]) + except ApiHubClientException as e: print(f"Error: {e.message}") print(f"Status Code: {e.status_code}") diff --git a/src/apihub_client/__init__.py b/src/apihub_client/__init__.py index da0a5c8..5cc9b3a 100644 --- a/src/apihub_client/__init__.py +++ b/src/apihub_client/__init__.py @@ -6,9 +6,16 @@ """ from .client import ApiHubClient, ApiHubClientException +from .doc_splitter import DocSplitterClient +from .generic_client import GenericUnstractClient __version__ = "0.1.1" __author__ = "Unstract Team" __email__ = "support@unstract.com" -__all__ = ["ApiHubClient", "ApiHubClientException"] +__all__ = [ + "ApiHubClient", + "ApiHubClientException", + "DocSplitterClient", + "GenericUnstractClient", +] diff --git a/src/apihub_client/doc_splitter.py b/src/apihub_client/doc_splitter.py new file mode 100644 index 
0000000..250fbd2 --- /dev/null +++ b/src/apihub_client/doc_splitter.py @@ -0,0 +1,201 @@ +import logging +import time +from pathlib import Path + +import requests + +from .client import ApiHubClientException + + +class DocSplitterClient: + """ + Client for interacting with doc-splitter APIs. + + Handles document splitting operations including file upload, + job status monitoring, and result download. + """ + + logger = logging.getLogger(__name__) + + def __init__( + self, + api_key: str, + base_url: str, + ) -> None: + """ + Initialize the DocSplitterClient. + + Args: + api_key: API key for authentication + base_url: Base URL of the doc-splitter service + """ + self.api_key = api_key + self.base_url = base_url.rstrip("/") + self.headers = {"apikey": self.api_key} + + def upload( + self, + file_path: str, + wait_for_completion: bool = False, + polling_interval: int = 5, + ) -> dict: + """ + Upload a document for splitting. + + Args: + file_path: Path to the file to upload + wait_for_completion: If True, polls for completion and returns final result + polling_interval: Seconds to wait between status checks (default: 5) + + Returns: + dict: Response containing job_id and status information + + Raises: + ApiHubClientException: If upload fails + """ + url = f"{self.base_url}/api/v1/doc-splitter/documents/upload" + + self.logger.info("Uploading file for splitting: %s", file_path) + + try: + with open(file_path, "rb") as file: + files = {"file": file} + response = requests.post(url, headers=self.headers, files=files) + except FileNotFoundError as e: + raise ApiHubClientException(f"File not found: {file_path}", None) from e + + self.logger.debug("Request Headers Sent: %s", response.request.headers) + self.logger.debug("Request URL: %s", response.request.url) + + if response.status_code not in [200, 202]: + self.logger.error("Upload failed: %s", response.text) + raise ApiHubClientException(response.text, response.status_code) + + data = response.json() + # Extract job_id from the nested data structure + if "data" in data and isinstance(data["data"], dict): + job_id = data["data"].get("job_id") + else: + job_id = data.get("job_id") + self.logger.info("Upload completed successfully. Job ID: %s", job_id) + + # If wait_for_completion is True, poll for status and return final result + if wait_for_completion: + if not job_id: + self.logger.warning("No job_id in response, returning initial data") + return data + + return self.wait_for_completion(job_id, polling_interval=polling_interval) + + return data + + def get_job_status(self, job_id: str) -> dict: + """ + Check the status of a splitting job. + + Args: + job_id: The job ID to check status for + + Returns: + dict: Status information + + Raises: + ApiHubClientException: If status check fails + """ + url = f"{self.base_url}/api/v1/doc-splitter/jobs/status" + params = {"job_id": job_id} + + self.logger.debug("Checking status for job ID: %s", job_id) + response = requests.get(url, headers=self.headers, params=params) + + if response.status_code != 200: + raise ApiHubClientException(response.text, response.status_code) + + return response.json() + + def download_result(self, job_id: str, output_path: str | None = None) -> str: + """ + Download the result of a completed splitting job. + + Args: + job_id: The job ID to download results for + output_path: Path where to save the downloaded file. 
+ If None, uses 'result_{job_id}.zip' + + Returns: + str: Path to the downloaded file + + Raises: + ApiHubClientException: If download fails + """ + url = f"{self.base_url}/api/v1/doc-splitter/jobs/download" + params = {"job_id": job_id} + + if output_path is None: + output_path = f"result_{job_id}.zip" + + self.logger.info("Downloading result for job ID: %s to %s", job_id, output_path) + response = requests.get(url, headers=self.headers, params=params, stream=True) + + if response.status_code != 200: + raise ApiHubClientException(response.text, response.status_code) + + # Ensure output directory exists + Path(output_path).parent.mkdir(parents=True, exist_ok=True) + + # Write the file + with open(output_path, "wb") as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + + self.logger.info("Download completed: %s", output_path) + return output_path + + def wait_for_completion( + self, job_id: str, timeout: int = 600, polling_interval: int = 3 + ) -> dict: + """ + Wait for a splitting job to complete by polling its status. + + Args: + job_id: The job ID to wait for + timeout: Maximum time to wait in seconds (default: 600) + polling_interval: Seconds to wait between status checks (default: 3) + + Returns: + dict: Final job status information when completed + + Raises: + ApiHubClientException: If processing fails or times out + """ + self.logger.info( + "Waiting for completion. Polling every %d seconds", polling_interval + ) + start_time = time.time() + + while time.time() - start_time < timeout: + status_result = self.get_job_status(job_id) + # Extract status from nested data structure + if "data" in status_result and isinstance(status_result["data"], dict): + status = status_result["data"].get("status") + else: + status = status_result.get("status") + self.logger.info("Current status: %s", status) + + if status and status.upper() == "COMPLETED": + self.logger.info("Processing completed") + return status_result + elif status and status.upper() == "FAILED": + self.logger.error("Processing failed") + raise ApiHubClientException( + f"Processing failed for job_id: {job_id}", + None, + ) + + time.sleep(polling_interval) + + # If we reach here, we've timed out + raise ApiHubClientException( + f"Timeout waiting for completion. Job ID: {job_id}", + None, + ) diff --git a/src/apihub_client/generic_client.py b/src/apihub_client/generic_client.py new file mode 100644 index 0000000..9e89482 --- /dev/null +++ b/src/apihub_client/generic_client.py @@ -0,0 +1,207 @@ +import logging +import time + +import requests + +from .client import ApiHubClientException + + +class GenericUnstractClient: + """ + Client for interacting with generic Unstract APIs. + + Handles dynamic endpoint processing operations including file upload + and result retrieval using execution_id tracking. + """ + + logger = logging.getLogger(__name__) + + def __init__( + self, + api_key: str, + base_url: str, + ) -> None: + """ + Initialize the GenericUnstractClient. + + Args: + api_key: API key for authentication + base_url: Base URL of the Unstract service + """ + self.api_key = api_key + self.base_url = base_url.rstrip("/") + self.headers = {"apikey": self.api_key} + + def process( + self, + endpoint: str, + file_path: str, + wait_for_completion: bool = False, + polling_interval: int = 5, + timeout: int = 600, + ) -> dict: + """ + Process a document using the specified endpoint. 
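+        The endpoint name is interpolated into the request URL as
+        {base_url}/api/v1/{endpoint}, so new document types exposed by the
+        service can be processed without changes to this client.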
+ + Args: + endpoint: The endpoint name (e.g., 'invoice', 'contract', 'receipt') + file_path: Path to the file to upload + wait_for_completion: If True, polls for completion and returns final result + polling_interval: Seconds to wait between status checks (default: 5) + timeout: Maximum time to wait for completion in seconds (default: 600) + + Returns: + dict: Response containing execution_id and processing information + + Raises: + ApiHubClientException: If upload fails + """ + url = f"{self.base_url}/api/v1/{endpoint}" + + self.logger.info("Processing file with endpoint %s: %s", endpoint, file_path) + + try: + with open(file_path, "rb") as file: + files = {"files": file} + response = requests.post(url, headers=self.headers, files=files) + except FileNotFoundError as e: + raise ApiHubClientException(f"File not found: {file_path}", None) from e + + self.logger.debug("Request Headers Sent: %s", response.request.headers) + self.logger.debug("Request URL: %s", response.request.url) + + if response.status_code != 200: + self.logger.error("Processing failed: %s", response.text) + raise ApiHubClientException(response.text, response.status_code) + + data = response.json() + execution_id = data.get("execution_id") + self.logger.info( + "Processing started successfully. Execution ID: %s", execution_id + ) + + # If wait_for_completion is True, poll for status and return final result + if wait_for_completion: + if not execution_id: + self.logger.warning( + "No execution_id in response, returning initial data" + ) + return data + + return self.wait_for_completion( + endpoint, + execution_id, + polling_interval=polling_interval, + timeout=timeout, + ) + + return data + + def get_result(self, endpoint: str, execution_id: str) -> dict: + """ + Get the result of a processing operation. + + Args: + endpoint: The endpoint name used for processing + execution_id: The execution ID to get results for + + Returns: + dict: Processing result or status information + + Raises: + ApiHubClientException: If result retrieval fails + """ + url = f"{self.base_url}/api/v1/{endpoint}" + params = {"execution_id": execution_id} + + self.logger.debug( + "Getting result for endpoint %s, execution ID: %s", endpoint, execution_id + ) + response = requests.get(url, headers=self.headers, params=params) + + if response.status_code != 200: + raise ApiHubClientException(response.text, response.status_code) + + return response.json() + + def wait_for_completion( + self, + endpoint: str, + execution_id: str, + timeout: int = 600, + polling_interval: int = 3, + ) -> dict: + """ + Wait for a processing operation to complete by polling its status. + + Args: + endpoint: The endpoint name used for processing + execution_id: The execution ID to wait for + timeout: Maximum time to wait in seconds (default: 600) + polling_interval: Seconds to wait between status checks (default: 3) + + Returns: + dict: Final processing result when completed + + Raises: + ApiHubClientException: If processing fails or times out + """ + self.logger.info( + "Waiting for completion. 
Polling every %d seconds", polling_interval + ) + start_time = time.time() + + while time.time() - start_time < timeout: + result = self.get_result(endpoint, execution_id) + status = result.get("status") + self.logger.info("Current status: %s", status) + + # Check for completion - different APIs may use different status values + if status in ["COMPLETED", "SUCCESS", "FINISHED"]: + self.logger.info("Processing completed") + return result + elif status in ["FAILED", "ERROR"]: + self.logger.error("Processing failed") + error_message = result.get("error", "Unknown error") + raise ApiHubClientException( + ( + f"Processing failed for execution_id: {execution_id}. " + f"Error: {error_message}" + ), + None, + ) + elif status in ["PROCESSING", "IN_PROGRESS", "RUNNING"]: + # Continue polling + pass + else: + # For unknown status, assume it's still processing + self.logger.debug("Unknown status: %s, continuing to poll", status) + + time.sleep(polling_interval) + + # If we reach here, we've timed out + raise ApiHubClientException( + f"Timeout waiting for completion. Execution ID: {execution_id}", + None, + ) + + def check_status(self, endpoint: str, execution_id: str) -> str | None: + """ + Check the current status of a processing operation. + + Args: + endpoint: The endpoint name used for processing + execution_id: The execution ID to check status for + + Returns: + str | None: Current status string, or None if not available + + Raises: + ApiHubClientException: If status check fails + """ + try: + result = self.get_result(endpoint, execution_id) + return result.get("status") + except ApiHubClientException: + # Re-raise the exception to let caller handle it + raise diff --git a/test/test_client.py b/test/test_client.py index 67f901a..2975203 100644 --- a/test/test_client.py +++ b/test/test_client.py @@ -5,7 +5,6 @@ import pytest import requests_mock - from apihub_client.client import ApiHubClient, ApiHubClientException @@ -427,3 +426,64 @@ def test_extract_real_polling_timing(self, client, mock_file_content): # Should have completed quickly due to short polling interval assert (end_time - start_time) < 2.0 # Should complete within 2 seconds assert result["result"] == "final_data" + + def test_wait_for_complete_standalone_success(self, client): + """Test wait_for_complete method called standalone.""" + with requests_mock.Mocker() as m: + # Mock status responses (first PROCESSING, then COMPLETED) + m.get( + "https://api.test.com/status?file_hash=standalone_hash", + [ + {"json": {"status": "PROCESSING"}, "status_code": 200}, + {"json": {"status": "COMPLETED"}, "status_code": 200}, + ], + ) + + # Mock retrieve response + m.get( + "https://api.test.com/retrieve?file_hash=standalone_hash", + json={ + "file_hash": "standalone_hash", + "status": "COMPLETED", + "result": {"data": "standalone_result"}, + }, + status_code=200, + ) + + with patch("time.sleep") as mock_sleep: + result = client.wait_for_complete( + "standalone_hash", timeout=300, polling_interval=2 + ) + + assert result["status"] == "COMPLETED" + assert result["result"]["data"] == "standalone_result" + mock_sleep.assert_called_with(2) + + def test_wait_for_complete_timeout_exception(self, client): + """Test wait_for_complete method timeout exception.""" + with requests_mock.Mocker() as m: + # Mock status responses that never complete + m.get( + "https://api.test.com/status?file_hash=timeout_hash", + json={"status": "PROCESSING"}, + status_code=200, + ) + + with patch("time.sleep"): + # Use return_value instead of side_effect for timeout simulation 
+ with patch("time.time") as mock_time: + # First call returns 0, subsequent calls return 601 (timeout) + mock_time.side_effect = [0, 601] + with pytest.raises(ApiHubClientException) as exc_info: + client.wait_for_complete("timeout_hash", timeout=600) + + assert "Timeout waiting for completion" in exc_info.value.message + assert "timeout_hash" in exc_info.value.message + assert exc_info.value.status_code is None + + def test_client_initialization_with_trailing_slash(self): + """Test client initialization removes trailing slash from base_url.""" + client = ApiHubClient(api_key="test_key", base_url="https://api.test.com/") + assert client.base_url == "https://api.test.com" + assert client.api_key == "test_key" + assert client.headers == {"apikey": "test_key"} diff --git a/test/test_doc_splitter.py b/test/test_doc_splitter.py new file mode 100644 index 0000000..7b801c0 --- /dev/null +++ b/test/test_doc_splitter.py @@ -0,0 +1,483 @@ +"""Comprehensive test cases for DocSplitterClient.""" + +import time +from unittest.mock import mock_open, patch + +import pytest +import requests_mock +from apihub_client.client import ApiHubClientException +from apihub_client.doc_splitter import DocSplitterClient + + +class TestDocSplitterClient: + """Test cases for DocSplitterClient.""" + + @pytest.fixture + def client(self): + """Create a test client instance.""" + return DocSplitterClient( + api_key="test_api_key", base_url="http://localhost:8005" + ) + + @pytest.fixture + def mock_file_content(self): + """Mock file content for testing.""" + return b"test pdf content" + + def test_client_initialization(self, client): + """Test client initialization.""" + assert client.api_key == "test_api_key" + assert client.base_url == "http://localhost:8005" + assert client.headers == {"apikey": "test_api_key"} + + def test_upload_success(self, client, mock_file_content): + """Test successful file upload.""" + with requests_mock.Mocker() as m: + # Mock successful upload response + m.post( + "http://localhost:8005/api/v1/doc-splitter/documents/upload", + json={"job_id": "test-job-123", "status": "PROCESSING"}, + status_code=200, + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + result = client.upload(file_path="/test/document.pdf") + + assert result["job_id"] == "test-job-123" + assert result["status"] == "PROCESSING" + + # Verify request details + assert len(m.request_history) == 1 + request = m.request_history[0] + assert ( + request.url + == "http://localhost:8005/api/v1/doc-splitter/documents/upload" + ) + assert request.headers["apikey"] == "test_api_key" + + def test_upload_success_202_nested_response(self, client, mock_file_content): + """Test successful file upload with 202 status and nested response.""" + with requests_mock.Mocker() as m: + # Mock successful upload response with 202 and nested data structure + m.post( + "http://localhost:8005/api/v1/doc-splitter/documents/upload", + json={ + "data": { + "filename": "test.pdf", + "job_id": "93bbb7ab-3291-429c-8923-3b179e7ae5bf", + "pages": 2, + "size_bytes": 63580, + "status": "queued", + "upload_timestamp": "2025-08-23T04:28:32.424535Z", + "user_limits": { + "current_jobs": 1, + "jobs_today": 3, + "max_jobs_per_day": 5000, + "max_parallel_jobs": 5, + }, + }, + "request_id": "eb75ac1e-a224-4624-a4f2-6c813ddc2b3c", + "success": True, + "timestamp": "2025-08-23T04:28:32.424612", + }, + status_code=202, + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + result = client.upload(file_path="/test/document.pdf") + + # 
Verify we got the nested response structure + assert result["data"]["job_id"] == "93bbb7ab-3291-429c-8923-3b179e7ae5bf" + assert result["data"]["status"] == "queued" + assert result["success"] is True + + def test_upload_file_not_found(self, client): + """Test upload with non-existent file.""" + with patch("builtins.open", side_effect=FileNotFoundError("File not found")): + with pytest.raises(ApiHubClientException) as exc_info: + client.upload(file_path="/nonexistent/file.pdf") + + assert "File not found: /nonexistent/file.pdf" in exc_info.value.message + + def test_upload_failure(self, client, mock_file_content): + """Test upload with API failure.""" + with requests_mock.Mocker() as m: + m.post( + "http://localhost:8005/api/v1/doc-splitter/documents/upload", + text="Bad Request", + status_code=400, + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + with pytest.raises(ApiHubClientException) as exc_info: + client.upload(file_path="/test/document.pdf") + + assert exc_info.value.message == "Bad Request" + assert exc_info.value.status_code == 400 + + def test_upload_with_wait_for_completion_success(self, client, mock_file_content): + """Test upload with wait_for_completion=True.""" + with requests_mock.Mocker() as m: + # Mock upload response + m.post( + "http://localhost:8005/api/v1/doc-splitter/documents/upload", + json={"job_id": "test-job-456", "status": "PROCESSING"}, + status_code=200, + ) + + # Mock status responses (first PROCESSING, then COMPLETED) + m.get( + "http://localhost:8005/api/v1/doc-splitter/jobs/status?job_id=test-job-456", + [ + { + "json": {"status": "PROCESSING", "job_id": "test-job-456"}, + "status_code": 200, + }, + { + "json": {"status": "PROCESSING", "job_id": "test-job-456"}, + "status_code": 200, + }, + { + "json": {"status": "COMPLETED", "job_id": "test-job-456"}, + "status_code": 200, + }, + ], + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + with patch("time.sleep") as mock_sleep: + result = client.upload( + file_path="/test/document.pdf", + wait_for_completion=True, + polling_interval=1, + ) + + # Verify final result + assert result["status"] == "COMPLETED" + assert result["job_id"] == "test-job-456" + + # Verify sleep was called with correct interval + assert mock_sleep.call_count >= 1 + mock_sleep.assert_called_with(1) + + def test_upload_with_wait_for_completion_failed_status( + self, client, mock_file_content + ): + """Test upload with wait_for_completion when processing fails.""" + with requests_mock.Mocker() as m: + # Mock upload response + m.post( + "http://localhost:8005/api/v1/doc-splitter/documents/upload", + json={"job_id": "test-job-fail", "status": "PROCESSING"}, + status_code=200, + ) + + # Mock status response with FAILED status + m.get( + "http://localhost:8005/api/v1/doc-splitter/jobs/status?job_id=test-job-fail", + json={"status": "FAILED", "job_id": "test-job-fail"}, + status_code=200, + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + with patch("time.sleep"): + with pytest.raises(ApiHubClientException) as exc_info: + client.upload( + file_path="/test/document.pdf", + wait_for_completion=True, + ) + + assert ( + "Processing failed for job_id: test-job-fail" in exc_info.value.message + ) + + def test_upload_with_wait_for_completion_no_job_id(self, client, mock_file_content): + """Test upload with wait_for_completion when no job_id in response.""" + with requests_mock.Mocker() as m: + # Mock upload response without job_id + m.post( + 
"http://localhost:8005/api/v1/doc-splitter/documents/upload", + json={"status": "PROCESSING"}, + status_code=200, + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + result = client.upload( + file_path="/test/document.pdf", + wait_for_completion=True, + ) + + # Should return the initial response without polling + assert result["status"] == "PROCESSING" + assert "job_id" not in result + + def test_get_job_status_success(self, client): + """Test get_job_status method success.""" + with requests_mock.Mocker() as m: + m.get( + "http://localhost:8005/api/v1/doc-splitter/jobs/status?job_id=test-job", + json={"status": "COMPLETED", "job_id": "test-job"}, + status_code=200, + ) + + result = client.get_job_status("test-job") + + assert result["status"] == "COMPLETED" + assert result["job_id"] == "test-job" + + def test_get_job_status_failure(self, client): + """Test get_job_status method with API failure.""" + with requests_mock.Mocker() as m: + m.get( + "http://localhost:8005/api/v1/doc-splitter/jobs/status?job_id=test-job", + text="Not Found", + status_code=404, + ) + + with pytest.raises(ApiHubClientException) as exc_info: + client.get_job_status("test-job") + + assert exc_info.value.message == "Not Found" + assert exc_info.value.status_code == 404 + + def test_download_result_success(self, client, tmp_path): + """Test download_result method success.""" + # Create a temporary output path + output_path = tmp_path / "result.zip" + + mock_file_content = b"mock zip file content" + + with requests_mock.Mocker() as m: + m.get( + "http://localhost:8005/api/v1/doc-splitter/jobs/download?job_id=test-job", + content=mock_file_content, + status_code=200, + ) + + result_path = client.download_result("test-job", str(output_path)) + + # Verify file was written correctly + assert result_path == str(output_path) + assert output_path.exists() + assert output_path.read_bytes() == mock_file_content + + def test_download_result_default_filename(self, client): + """Test download_result with default filename.""" + mock_file_content = b"mock zip file content" + + with requests_mock.Mocker() as m: + m.get( + "http://localhost:8005/api/v1/doc-splitter/jobs/download?job_id=test-job-123", + content=mock_file_content, + status_code=200, + ) + + with patch("builtins.open", mock_open()) as mock_file: + result_path = client.download_result("test-job-123") + + assert result_path == "result_test-job-123.zip" + mock_file.assert_called_once_with("result_test-job-123.zip", "wb") + + def test_download_result_failure(self, client): + """Test download_result method with API failure.""" + with requests_mock.Mocker() as m: + m.get( + "http://localhost:8005/api/v1/doc-splitter/jobs/download?job_id=test-job", + text="Service Unavailable", + status_code=503, + ) + + with pytest.raises(ApiHubClientException) as exc_info: + client.download_result("test-job", "output.zip") + + assert exc_info.value.message == "Service Unavailable" + assert exc_info.value.status_code == 503 + + def test_wait_for_completion_success(self, client): + """Test wait_for_completion method success.""" + with requests_mock.Mocker() as m: + # Mock status responses (first PROCESSING, then COMPLETED) + m.get( + "http://localhost:8005/api/v1/doc-splitter/jobs/status?job_id=test-job", + [ + { + "json": {"status": "PROCESSING", "job_id": "test-job"}, + "status_code": 200, + }, + { + "json": {"status": "COMPLETED", "job_id": "test-job"}, + "status_code": 200, + }, + ], + ) + + with patch("time.sleep") as mock_sleep: + result = 
client.wait_for_completion("test-job", polling_interval=1) + + assert result["status"] == "COMPLETED" + assert result["job_id"] == "test-job" + mock_sleep.assert_called_with(1) + + def test_wait_for_completion_timeout(self, client): + """Test wait_for_completion with timeout.""" + with requests_mock.Mocker() as m: + # Mock status responses that never complete + m.get( + "http://localhost:8005/api/v1/doc-splitter/jobs/status?job_id=test-job", + json={"status": "PROCESSING", "job_id": "test-job"}, + status_code=200, + ) + + with patch("time.sleep"): + with pytest.raises(ApiHubClientException) as exc_info: + client.wait_for_completion( + "test-job", timeout=1, polling_interval=0.1 + ) + + assert "Timeout waiting for completion" in exc_info.value.message + assert "test-job" in exc_info.value.message + + def test_wait_for_completion_failed_status(self, client): + """Test wait_for_completion with failed job.""" + with requests_mock.Mocker() as m: + # Mock status response with FAILED status + m.get( + "http://localhost:8005/api/v1/doc-splitter/jobs/status?job_id=test-job", + json={"status": "FAILED", "job_id": "test-job"}, + status_code=200, + ) + + with patch("time.sleep"): + with pytest.raises(ApiHubClientException) as exc_info: + client.wait_for_completion("test-job") + + assert "Processing failed for job_id: test-job" in exc_info.value.message + + def test_wait_for_completion_nested_response(self, client): + """Test wait_for_completion with nested response structure.""" + with requests_mock.Mocker() as m: + # Mock status responses with nested structure (processing, then completed) + m.get( + "http://localhost:8005/api/v1/doc-splitter/jobs/status?job_id=test-nested", + [ + { + "json": { + "data": { + "status": "processing", + "job_id": "test-nested", + "current_step": "page_image_gen", + }, + "success": True, + }, + "status_code": 200, + }, + { + "json": { + "data": { + "status": "completed", + "job_id": "test-nested", + "finished_at": "2025-08-23T04:40:00.000000Z", + }, + "success": True, + }, + "status_code": 200, + }, + ], + ) + + with patch("time.sleep") as mock_sleep: + result = client.wait_for_completion("test-nested", polling_interval=1) + + # Should successfully complete and return the nested structure + assert result["data"]["status"] == "completed" + assert result["data"]["job_id"] == "test-nested" + assert result["success"] is True + mock_sleep.assert_called_with(1) + + def test_logging_output(self, client, caplog, mock_file_content): + """Test that appropriate logging messages are generated.""" + with requests_mock.Mocker() as m: + m.post( + "http://localhost:8005/api/v1/doc-splitter/documents/upload", + json={"job_id": "test-job-log"}, + status_code=200, + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + with caplog.at_level("INFO"): + client.upload(file_path="/test/document.pdf") + + # Check for expected log messages + log_messages = [record.message for record in caplog.records] + assert any("Uploading file for splitting" in msg for msg in log_messages) + assert any("Upload completed successfully" in msg for msg in log_messages) + + @pytest.mark.parametrize( + "base_url", + [ + "http://localhost:8005", + "http://localhost:8005/", + "https://api.example.com", + "https://api.example.com/", + ], + ) + def test_different_base_urls(self, base_url, mock_file_content): + """Test client with different base URL formats.""" + client = DocSplitterClient(api_key="test", base_url=base_url) + + # Base URL should be normalized (trailing slash removed) + expected_base 
= base_url.rstrip("/") + assert client.base_url == expected_base + + with requests_mock.Mocker() as m: + expected_url = f"{expected_base}/api/v1/doc-splitter/documents/upload" + m.post( + expected_url, + json={"job_id": "test-job"}, + status_code=200, + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + result = client.upload(file_path="/test/document.pdf") + + assert result["job_id"] == "test-job" + + def test_upload_real_timing(self, client, mock_file_content): + """Test that polling respects the specified interval.""" + with requests_mock.Mocker() as m: + # Mock upload response + m.post( + "http://localhost:8005/api/v1/doc-splitter/documents/upload", + json={"job_id": "timing-test"}, + status_code=200, + ) + + # Mock status responses + m.get( + "http://localhost:8005/api/v1/doc-splitter/jobs/status?job_id=timing-test", + [ + { + "json": {"status": "PROCESSING", "job_id": "timing-test"}, + "status_code": 200, + }, + { + "json": {"status": "COMPLETED", "job_id": "timing-test"}, + "status_code": 200, + }, + ], + ) + + start_time = time.time() + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + result = client.upload( + file_path="/test/document.pdf", + wait_for_completion=True, + polling_interval=0.1, # Very short interval for testing + ) + + end_time = time.time() + + # Should have completed quickly due to short polling interval + assert (end_time - start_time) < 2.0 # Should complete within 2 seconds + assert result["status"] == "COMPLETED" diff --git a/test/test_generic_client.py b/test/test_generic_client.py new file mode 100644 index 0000000..2a12b1b --- /dev/null +++ b/test/test_generic_client.py @@ -0,0 +1,512 @@ +"""Comprehensive test cases for GenericUnstractClient.""" + +import time +from unittest.mock import mock_open, patch + +import pytest +import requests_mock +from apihub_client.client import ApiHubClientException +from apihub_client.generic_client import GenericUnstractClient + + +class TestGenericUnstractClient: + """Test cases for GenericUnstractClient.""" + + @pytest.fixture + def client(self): + """Create a test client instance.""" + return GenericUnstractClient( + api_key="test_api_key", base_url="http://localhost:8005" + ) + + @pytest.fixture + def mock_file_content(self): + """Mock file content for testing.""" + return b"test pdf content" + + def test_client_initialization(self, client): + """Test client initialization.""" + assert client.api_key == "test_api_key" + assert client.base_url == "http://localhost:8005" + assert client.headers == {"apikey": "test_api_key"} + + def test_process_success(self, client, mock_file_content): + """Test successful file processing.""" + with requests_mock.Mocker() as m: + # Mock successful processing response + m.post( + "http://localhost:8005/api/v1/invoice", + json={"execution_id": "test-exec-123", "status": "PROCESSING"}, + status_code=200, + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + result = client.process( + endpoint="invoice", file_path="/test/invoice.pdf" + ) + + assert result["execution_id"] == "test-exec-123" + assert result["status"] == "PROCESSING" + + # Verify request details + assert len(m.request_history) == 1 + request = m.request_history[0] + assert request.url == "http://localhost:8005/api/v1/invoice" + assert request.headers["apikey"] == "test_api_key" + + def test_process_file_not_found(self, client): + """Test process with non-existent file.""" + with patch("builtins.open", side_effect=FileNotFoundError("File not found")): + with 
pytest.raises(ApiHubClientException) as exc_info: + client.process(endpoint="invoice", file_path="/nonexistent/file.pdf") + + assert "File not found: /nonexistent/file.pdf" in exc_info.value.message + + def test_process_failure(self, client, mock_file_content): + """Test process with API failure.""" + with requests_mock.Mocker() as m: + m.post( + "http://localhost:8005/api/v1/invoice", + text="Bad Request", + status_code=400, + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + with pytest.raises(ApiHubClientException) as exc_info: + client.process(endpoint="invoice", file_path="/test/invoice.pdf") + + assert exc_info.value.message == "Bad Request" + assert exc_info.value.status_code == 400 + + def test_process_with_wait_for_completion_success(self, client, mock_file_content): + """Test process with wait_for_completion=True.""" + with requests_mock.Mocker() as m: + # Mock process response + m.post( + "http://localhost:8005/api/v1/contract", + json={"execution_id": "test-exec-456", "status": "PROCESSING"}, + status_code=200, + ) + + # Mock get_result responses (first PROCESSING, then COMPLETED) + m.get( + "http://localhost:8005/api/v1/contract?execution_id=test-exec-456", + [ + { + "json": { + "status": "PROCESSING", + "execution_id": "test-exec-456", + }, + "status_code": 200, + }, + { + "json": { + "status": "PROCESSING", + "execution_id": "test-exec-456", + }, + "status_code": 200, + }, + { + "json": { + "status": "COMPLETED", + "execution_id": "test-exec-456", + "result": {"extracted_data": "contract_data"}, + }, + "status_code": 200, + }, + ], + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + with patch("time.sleep") as mock_sleep: + result = client.process( + endpoint="contract", + file_path="/test/contract.pdf", + wait_for_completion=True, + polling_interval=1, + ) + + # Verify final result + assert result["status"] == "COMPLETED" + assert result["execution_id"] == "test-exec-456" + assert result["result"]["extracted_data"] == "contract_data" + + # Verify sleep was called with correct interval + assert mock_sleep.call_count >= 1 + mock_sleep.assert_called_with(1) + + def test_process_with_wait_for_completion_failed_status( + self, client, mock_file_content + ): + """Test process with wait_for_completion when processing fails.""" + with requests_mock.Mocker() as m: + # Mock process response + m.post( + "http://localhost:8005/api/v1/invoice", + json={"execution_id": "test-exec-fail", "status": "PROCESSING"}, + status_code=200, + ) + + # Mock get_result response with FAILED status + m.get( + "http://localhost:8005/api/v1/invoice?execution_id=test-exec-fail", + json={ + "status": "FAILED", + "execution_id": "test-exec-fail", + "error": "Processing error occurred", + }, + status_code=200, + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + with patch("time.sleep"): + with pytest.raises(ApiHubClientException) as exc_info: + client.process( + endpoint="invoice", + file_path="/test/invoice.pdf", + wait_for_completion=True, + ) + + assert ( + "Processing failed for execution_id: test-exec-fail" + in exc_info.value.message + ) + assert "Processing error occurred" in exc_info.value.message + + def test_process_with_wait_for_completion_no_execution_id( + self, client, mock_file_content + ): + """Test process with wait_for_completion when no execution_id in response.""" + with requests_mock.Mocker() as m: + # Mock process response without execution_id + m.post( + "http://localhost:8005/api/v1/invoice", + 
json={"status": "PROCESSING"}, + status_code=200, + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + result = client.process( + endpoint="invoice", + file_path="/test/invoice.pdf", + wait_for_completion=True, + ) + + # Should return the initial response without polling + assert result["status"] == "PROCESSING" + assert "execution_id" not in result + + def test_get_result_success(self, client): + """Test get_result method success.""" + with requests_mock.Mocker() as m: + m.get( + "http://localhost:8005/api/v1/receipt?execution_id=test-exec", + json={ + "status": "COMPLETED", + "execution_id": "test-exec", + "result": {"total_amount": 123.45}, + }, + status_code=200, + ) + + result = client.get_result("receipt", "test-exec") + + assert result["status"] == "COMPLETED" + assert result["execution_id"] == "test-exec" + assert result["result"]["total_amount"] == 123.45 + + def test_get_result_failure(self, client): + """Test get_result method with API failure.""" + with requests_mock.Mocker() as m: + m.get( + "http://localhost:8005/api/v1/invoice?execution_id=test-exec", + text="Not Found", + status_code=404, + ) + + with pytest.raises(ApiHubClientException) as exc_info: + client.get_result("invoice", "test-exec") + + assert exc_info.value.message == "Not Found" + assert exc_info.value.status_code == 404 + + def test_wait_for_completion_success(self, client): + """Test wait_for_completion method success.""" + with requests_mock.Mocker() as m: + # Mock get_result responses (first PROCESSING, then COMPLETED) + m.get( + "http://localhost:8005/api/v1/invoice?execution_id=test-exec", + [ + { + "json": {"status": "PROCESSING", "execution_id": "test-exec"}, + "status_code": 200, + }, + { + "json": { + "status": "COMPLETED", + "execution_id": "test-exec", + "result": {"data": "final_result"}, + }, + "status_code": 200, + }, + ], + ) + + with patch("time.sleep") as mock_sleep: + result = client.wait_for_completion( + "invoice", "test-exec", polling_interval=1 + ) + + assert result["status"] == "COMPLETED" + assert result["execution_id"] == "test-exec" + assert result["result"]["data"] == "final_result" + mock_sleep.assert_called_with(1) + + def test_wait_for_completion_timeout(self, client): + """Test wait_for_completion with timeout.""" + with requests_mock.Mocker() as m: + # Mock get_result responses that never complete + m.get( + "http://localhost:8005/api/v1/invoice?execution_id=test-exec", + json={"status": "PROCESSING", "execution_id": "test-exec"}, + status_code=200, + ) + + with patch("time.sleep"): + with pytest.raises(ApiHubClientException) as exc_info: + client.wait_for_completion( + "invoice", "test-exec", timeout=1, polling_interval=0.1 + ) + + assert "Timeout waiting for completion" in exc_info.value.message + assert "test-exec" in exc_info.value.message + + def test_wait_for_completion_failed_status(self, client): + """Test wait_for_completion with failed processing.""" + with requests_mock.Mocker() as m: + # Mock get_result response with FAILED status + m.get( + "http://localhost:8005/api/v1/contract?execution_id=test-exec", + json={ + "status": "ERROR", + "execution_id": "test-exec", + "error": "Validation failed", + }, + status_code=200, + ) + + with patch("time.sleep"): + with pytest.raises(ApiHubClientException) as exc_info: + client.wait_for_completion("contract", "test-exec") + + assert ( + "Processing failed for execution_id: test-exec" + in exc_info.value.message + ) + assert "Validation failed" in exc_info.value.message + + def 
test_check_status_success(self, client): + """Test check_status method success.""" + with requests_mock.Mocker() as m: + m.get( + "http://localhost:8005/api/v1/invoice?execution_id=test-exec", + json={"status": "IN_PROGRESS", "execution_id": "test-exec"}, + status_code=200, + ) + + status = client.check_status("invoice", "test-exec") + assert status == "IN_PROGRESS" + + def test_check_status_failure(self, client): + """Test check_status method with API failure.""" + with requests_mock.Mocker() as m: + m.get( + "http://localhost:8005/api/v1/invoice?execution_id=test-exec", + text="Internal Server Error", + status_code=500, + ) + + with pytest.raises(ApiHubClientException): + client.check_status("invoice", "test-exec") + + def test_logging_output(self, client, caplog, mock_file_content): + """Test that appropriate logging messages are generated.""" + with requests_mock.Mocker() as m: + m.post( + "http://localhost:8005/api/v1/invoice", + json={"execution_id": "test-exec-log"}, + status_code=200, + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + with caplog.at_level("INFO"): + client.process(endpoint="invoice", file_path="/test/invoice.pdf") + + # Check for expected log messages + log_messages = [record.message for record in caplog.records] + assert any("Processing file with endpoint" in msg for msg in log_messages) + assert any("Processing started successfully" in msg for msg in log_messages) + + @pytest.mark.parametrize( + "endpoint", + ["invoice", "contract", "receipt", "purchase_order", "bank_statement"], + ) + def test_different_endpoints(self, client, endpoint, mock_file_content): + """Test process method with different endpoint configurations.""" + with requests_mock.Mocker() as m: + m.post( + f"http://localhost:8005/api/v1/{endpoint}", + json={"execution_id": f"exec_{endpoint}"}, + status_code=200, + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + result = client.process( + endpoint=endpoint, + file_path="/test/document.pdf", + ) + + assert result["execution_id"] == f"exec_{endpoint}" + + @pytest.mark.parametrize( + "base_url", + [ + "http://localhost:8005", + "http://localhost:8005/", + "https://api.example.com", + "https://api.example.com/", + ], + ) + def test_different_base_urls(self, base_url, mock_file_content): + """Test client with different base URL formats.""" + client = GenericUnstractClient(api_key="test", base_url=base_url) + + # Base URL should be normalized (trailing slash removed) + expected_base = base_url.rstrip("/") + assert client.base_url == expected_base + + with requests_mock.Mocker() as m: + expected_url = f"{expected_base}/api/v1/invoice" + m.post( + expected_url, + json={"execution_id": "test-exec"}, + status_code=200, + ) + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + result = client.process( + endpoint="invoice", file_path="/test/invoice.pdf" + ) + + assert result["execution_id"] == "test-exec" + + @pytest.mark.parametrize( + "status_value,expected_completion", + [ + ("COMPLETED", True), + ("SUCCESS", True), + ("FINISHED", True), + ("FAILED", False), + ("ERROR", False), + ("PROCESSING", "continue"), + ("IN_PROGRESS", "continue"), + ("RUNNING", "continue"), + ("UNKNOWN_STATUS", "continue"), + ], + ) + def test_status_handling_in_wait_for_completion( + self, client, status_value, expected_completion + ): + """Test different status values in wait_for_completion.""" + with requests_mock.Mocker() as m: + if expected_completion is True: + # Should complete successfully + m.get( + 
"http://localhost:8005/api/v1/invoice?execution_id=test-exec", + json={ + "status": status_value, + "execution_id": "test-exec", + "result": {"data": "completed"}, + }, + status_code=200, + ) + + with patch("time.sleep"): + result = client.wait_for_completion("invoice", "test-exec") + assert result["status"] == status_value + + elif expected_completion is False: + # Should fail + m.get( + "http://localhost:8005/api/v1/invoice?execution_id=test-exec", + json={ + "status": status_value, + "execution_id": "test-exec", + "error": "Test error", + }, + status_code=200, + ) + + with patch("time.sleep"): + with pytest.raises(ApiHubClientException): + client.wait_for_completion("invoice", "test-exec") + + else: # expected_completion == "continue" + # Should continue polling (we'll timeout quickly for test) + m.get( + "http://localhost:8005/api/v1/invoice?execution_id=test-exec", + json={"status": status_value, "execution_id": "test-exec"}, + status_code=200, + ) + + with patch("time.sleep"): + with pytest.raises(ApiHubClientException) as exc_info: + client.wait_for_completion( + "invoice", "test-exec", timeout=0.1, polling_interval=0.05 + ) + assert "Timeout waiting for completion" in exc_info.value.message + + def test_process_real_timing(self, client, mock_file_content): + """Test that polling respects the specified interval.""" + with requests_mock.Mocker() as m: + # Mock process response + m.post( + "http://localhost:8005/api/v1/invoice", + json={"execution_id": "timing-test"}, + status_code=200, + ) + + # Mock get_result responses + m.get( + "http://localhost:8005/api/v1/invoice?execution_id=timing-test", + [ + { + "json": {"status": "PROCESSING", "execution_id": "timing-test"}, + "status_code": 200, + }, + { + "json": { + "status": "COMPLETED", + "execution_id": "timing-test", + "result": {"data": "final"}, + }, + "status_code": 200, + }, + ], + ) + + start_time = time.time() + + with patch("builtins.open", mock_open(read_data=mock_file_content)): + result = client.process( + endpoint="invoice", + file_path="/test/invoice.pdf", + wait_for_completion=True, + polling_interval=0.1, # Very short interval for testing + ) + + end_time = time.time() + + # Should have completed quickly due to short polling interval + assert (end_time - start_time) < 2.0 # Should complete within 2 seconds + assert result["status"] == "COMPLETED" diff --git a/test/test_imports.py b/test/test_imports.py new file mode 100644 index 0000000..93cc5ee --- /dev/null +++ b/test/test_imports.py @@ -0,0 +1,151 @@ +"""Test module imports and package-level functionality.""" + + +class TestPackageImports: + """Test cases for package imports.""" + + def test_main_package_imports(self): + """Test importing main classes from the package.""" + # This should import all main classes and trigger __init__.py coverage + from apihub_client import ( + ApiHubClient, + ApiHubClientException, + DocSplitterClient, + GenericUnstractClient, + ) + + # Verify classes are importable and are actually classes + assert ApiHubClient is not None + assert ApiHubClientException is not None + assert DocSplitterClient is not None + assert GenericUnstractClient is not None + + # Verify they are actually classes/exceptions + assert callable(ApiHubClient) + assert callable(ApiHubClientException) + assert callable(DocSplitterClient) + assert callable(GenericUnstractClient) + + def test_package_metadata(self): + """Test package metadata is accessible.""" + import apihub_client + + # Check metadata attributes exist + assert hasattr(apihub_client, "__version__") + 
assert hasattr(apihub_client, "__author__") + assert hasattr(apihub_client, "__email__") + assert hasattr(apihub_client, "__all__") + + # Check metadata values + assert apihub_client.__version__ == "0.1.1" + assert apihub_client.__author__ == "Unstract Team" + assert apihub_client.__email__ == "support@unstract.com" + + # Check __all__ contains expected items + expected_all = [ + "ApiHubClient", + "ApiHubClientException", + "DocSplitterClient", + "GenericUnstractClient", + ] + assert apihub_client.__all__ == expected_all + + def test_direct_module_imports(self): + """Test direct module imports work.""" + from apihub_client.client import ApiHubClient, ApiHubClientException + from apihub_client.doc_splitter import DocSplitterClient + from apihub_client.generic_client import GenericUnstractClient + + # Verify classes are importable + assert ApiHubClient is not None + assert ApiHubClientException is not None + assert DocSplitterClient is not None + assert GenericUnstractClient is not None + + def test_client_instantiation(self): + """Test that clients can be instantiated from package imports.""" + from apihub_client import ( + ApiHubClient, + DocSplitterClient, + GenericUnstractClient, + ) + + # Test ApiHubClient instantiation + api_client = ApiHubClient(api_key="test_key", base_url="https://test.com") + assert api_client.api_key == "test_key" + assert api_client.base_url == "https://test.com" + + # Test DocSplitterClient instantiation + doc_client = DocSplitterClient(api_key="test_key", base_url="https://test.com") + assert doc_client.api_key == "test_key" + assert doc_client.base_url == "https://test.com" + + # Test GenericUnstractClient instantiation + generic_client = GenericUnstractClient( + api_key="test_key", base_url="https://test.com" + ) + assert generic_client.api_key == "test_key" + assert generic_client.base_url == "https://test.com" + + def test_exception_instantiation(self): + """Test that exception can be instantiated from package imports.""" + from apihub_client import ApiHubClientException + + # Test exception creation + exc = ApiHubClientException("Test message", 400) + assert exc.message == "Test message" + assert exc.status_code == 400 + + # Test exception string representation + str_repr = str(exc) + assert "Test message" in str_repr + assert "400" in str_repr + + def test_star_import(self): + """Test that star import works correctly.""" + # This imports everything in __all__ + exec("from apihub_client import *") # noqa: S102 + + # Check that the main classes are available in local scope + locals_dict = locals() + assert "ApiHubClient" in locals_dict + assert "ApiHubClientException" in locals_dict + assert "DocSplitterClient" in locals_dict + assert "GenericUnstractClient" in locals_dict + + def test_package_docstring(self): + """Test package docstring is accessible.""" + import apihub_client + + assert apihub_client.__doc__ is not None + assert "Unstract API Hub Python Client" in apihub_client.__doc__ + assert "dynamic, extensible Python client" in apihub_client.__doc__ + + def test_import_order_independence(self): + """Test that imports work regardless of order.""" + # Import in different order + from apihub_client import ( + ApiHubClient, # noqa: F401 + ApiHubClientException, # noqa: F401 + DocSplitterClient, # noqa: F401 + GenericUnstractClient, + ) + + # Should work fine + client = GenericUnstractClient(api_key="test", base_url="https://test.com") + assert client.api_key == "test" + + def test_submodule_access(self): + """Test that submodules are accessible through 
the package.""" + import apihub_client + + # Should be able to access submodules + assert hasattr(apihub_client, "client") + assert hasattr(apihub_client, "doc_splitter") + assert hasattr(apihub_client, "generic_client") + + # Should be able to access classes through submodules + assert hasattr(apihub_client.client, "ApiHubClient") + assert hasattr(apihub_client.client, "ApiHubClientException") + assert hasattr(apihub_client.doc_splitter, "DocSplitterClient") + assert hasattr(apihub_client.generic_client, "GenericUnstractClient") diff --git a/test/test_performance.py b/test/test_performance.py deleted file mode 100644 index 2fb91f7..0000000 --- a/test/test_performance.py +++ /dev/null @@ -1,334 +0,0 @@ -"""Performance tests for ApiHubClient.""" - -from unittest.mock import mock_open, patch - -import pytest -import requests_mock - -from apihub_client.client import ApiHubClient - - -class TestApiHubClientPerformance: - """Performance tests for ApiHubClient operations.""" - - @pytest.fixture - def performance_client(self): - """Create client for performance testing.""" - return ApiHubClient( - api_key="performance_test_key", - base_url="https://api.performance.test", - ) - - @pytest.fixture - def large_mock_file_content(self): - """Create large mock file content for performance testing.""" - return b"Large PDF content for performance testing " * 10000 - - def test_extract_upload_performance( - self, performance_client, large_mock_file_content, benchmark - ): - """Test performance of file upload during extract operation.""" - with requests_mock.Mocker() as m: - m.post( - "https://api.performance.test/extract/performance_test", - json={"file_hash": "perf_hash_123", "status": "PROCESSING"}, - status_code=200, - ) - - def upload_operation(): - with patch( - "builtins.open", mock_open(read_data=large_mock_file_content) - ): - return performance_client.extract( - endpoint="performance_test", - vertical="table", - sub_vertical="performance_test", - file_path="/test/large_file.pdf", - ) - - result = benchmark(upload_operation) - assert result["file_hash"] == "perf_hash_123" - - def test_polling_performance_fast_completion( - self, performance_client, large_mock_file_content, benchmark - ): - """Test polling performance when processing completes quickly.""" - with requests_mock.Mocker() as m: - # Mock extract - m.post( - "https://api.performance.test/extract/fast_process", - json={"file_hash": "fast_hash_123", "status": "PROCESSING"}, - status_code=200, - ) - - # Mock immediate completion - m.get( - "https://api.performance.test/status?file_hash=fast_hash_123", - json={"status": "COMPLETED"}, - status_code=200, - ) - - # Mock retrieve - m.get( - "https://api.performance.test/retrieve?file_hash=fast_hash_123", - json={"result": "fast_completion_data"}, - status_code=200, - ) - - def fast_completion_workflow(): - with patch( - "builtins.open", mock_open(read_data=large_mock_file_content) - ): - with patch("time.sleep"): - return performance_client.extract( - endpoint="fast_process", - vertical="table", - sub_vertical="fast_process", - file_path="/test/file.pdf", - wait_for_completion=True, - polling_interval=0.1, - ) - - result = benchmark(fast_completion_workflow) - assert result["result"] == "fast_completion_data" - - def test_polling_performance_slow_completion( - self, performance_client, large_mock_file_content, benchmark - ): - """Test polling performance with multiple status checks.""" - with requests_mock.Mocker() as m: - # Mock extract - m.post( - 
"https://api.performance.test/extract/slow_process", - json={"file_hash": "slow_hash_456", "status": "PROCESSING"}, - status_code=200, - ) - - # Mock multiple processing status responses - status_responses = [] - for i in range(5): # 5 polling cycles - status_responses.append( - { - "json": {"status": "PROCESSING", "progress": i * 20}, - "status_code": 200, - } - ) - status_responses.append( - {"json": {"status": "COMPLETED", "progress": 100}, "status_code": 200} - ) - - for response in status_responses: - m.get( - "https://api.performance.test/status?file_hash=slow_hash_456", - **response, - ) - - # Mock retrieve - m.get( - "https://api.performance.test/retrieve?file_hash=slow_hash_456", - json={"result": "slow_completion_data"}, - status_code=200, - ) - - def slow_completion_workflow(): - with patch( - "builtins.open", mock_open(read_data=large_mock_file_content) - ): - with patch("time.sleep"): # Mock sleep to avoid actual delays - return performance_client.extract( - endpoint="slow_process", - vertical="table", - sub_vertical="slow_process", - file_path="/test/file.pdf", - wait_for_completion=True, - polling_interval=0.1, - ) - - result = benchmark(slow_completion_workflow) - assert result["result"] == "slow_completion_data" - - def test_multiple_sequential_requests_performance( - self, performance_client, large_mock_file_content, benchmark - ): - """Test performance of multiple sequential API requests.""" - with requests_mock.Mocker() as m: - # Mock multiple different endpoints - endpoints = ["discover", "extract", "process", "analyze"] - - for i, endpoint in enumerate(endpoints): - m.post( - f"https://api.performance.test/extract/{endpoint}", - json={"file_hash": f"hash_{i}", "status": "PROCESSING"}, - status_code=200, - ) - - m.get( - f"https://api.performance.test/status?file_hash=hash_{i}", - json={"status": "COMPLETED"}, - status_code=200, - ) - - m.get( - f"https://api.performance.test/retrieve?file_hash=hash_{i}", - json={"result": f"data_{endpoint}"}, - status_code=200, - ) - - def sequential_requests(): - results = [] - with patch( - "builtins.open", mock_open(read_data=large_mock_file_content) - ): - with patch("time.sleep"): - for endpoint in endpoints: - result = performance_client.extract( - endpoint=endpoint, - vertical="table", - sub_vertical=endpoint, - file_path="/test/file.pdf", - wait_for_completion=True, - polling_interval=0.1, - ) - results.append(result) - return results - - results = benchmark(sequential_requests) - assert len(results) == 4 - for i, result in enumerate(results): - assert result["result"] == f"data_{endpoints[i]}" - - def test_memory_usage_large_response( - self, performance_client, large_mock_file_content, benchmark - ): - """Test memory efficiency with large API responses.""" - # Create a large mock response - large_response = { - "file_hash": "large_response_hash", - "result": { - "data": ["row_" + str(i) for i in range(10000)], # Large dataset - "metadata": {"size": "large", "processing_time": 60}, - }, - } - - with requests_mock.Mocker() as m: - m.post( - "https://api.performance.test/extract/large_response", - json={"file_hash": "large_response_hash", "status": "PROCESSING"}, - status_code=200, - ) - - m.get( - "https://api.performance.test/status?file_hash=large_response_hash", - json={"status": "COMPLETED"}, - status_code=200, - ) - - m.get( - "https://api.performance.test/retrieve?file_hash=large_response_hash", - json=large_response, - status_code=200, - ) - - def large_response_workflow(): - with patch( - "builtins.open", 
mock_open(read_data=large_mock_file_content) - ): - with patch("time.sleep"): - return performance_client.extract( - endpoint="large_response", - vertical="table", - sub_vertical="large_response", - file_path="/test/file.pdf", - wait_for_completion=True, - polling_interval=0.1, - ) - - result = benchmark(large_response_workflow) - assert len(result["result"]["data"]) == 10000 - - def test_api_request_overhead(self, performance_client, benchmark): - """Test the overhead of API request setup and teardown.""" - with requests_mock.Mocker() as m: - m.get( - "https://api.performance.test/status?file_hash=overhead_test", - json={"status": "COMPLETED"}, - status_code=200, - ) - - def simple_status_check(): - return performance_client.get_status("overhead_test") - - result = benchmark(simple_status_check) - assert result["status"] == "COMPLETED" - - def test_concurrent_status_checks(self, performance_client, benchmark): - """Test performance of rapid consecutive status checks.""" - with requests_mock.Mocker() as m: - # Mock status endpoint - m.get( - "https://api.performance.test/status", - json={"status": "PROCESSING"}, - status_code=200, - ) - - def rapid_status_checks(): - file_hashes = [f"hash_{i}" for i in range(10)] - results = [] - for hash_id in file_hashes: - result = performance_client.get_status(hash_id) - results.append(result) - return results - - results = benchmark(rapid_status_checks) - assert len(results) == 10 - assert all(r["status"] == "PROCESSING" for r in results) - - @pytest.mark.parametrize("file_size_multiplier", [1, 10, 100]) - def test_file_size_impact_on_performance( - self, performance_client, file_size_multiplier, benchmark - ): - """Test how file size affects upload performance.""" - file_content = b"Content " * (1000 * file_size_multiplier) - - with requests_mock.Mocker() as m: - m.post( - "https://api.performance.test/extract/size_test", - json={"file_hash": f"size_hash_{file_size_multiplier}"}, - status_code=200, - ) - - def upload_sized_file(): - with patch("builtins.open", mock_open(read_data=file_content)): - return performance_client.extract( - endpoint="size_test", - vertical="table", - sub_vertical="size_test", - file_path="/test/sized_file.pdf", - ) - - result = benchmark(upload_sized_file) - assert result["file_hash"] == f"size_hash_{file_size_multiplier}" - - def test_error_handling_performance(self, performance_client, benchmark): - """Test performance impact of error handling.""" - with requests_mock.Mocker() as m: - m.post( - "https://api.performance.test/extract/error_test", - text="Internal Server Error", - status_code=500, - ) - - def error_handling_operation(): - try: - with patch("builtins.open", mock_open(read_data=b"test")): - performance_client.extract( - endpoint="error_test", - vertical="table", - sub_vertical="error_test", - file_path="/test/file.pdf", - ) - except Exception: - return "error_handled" - - result = benchmark(error_handling_operation) - assert result == "error_handled" diff --git a/tox.ini b/tox.ini index 598855e..2d2978b 100644 --- a/tox.ini +++ b/tox.ini @@ -11,21 +11,21 @@ deps = requests-mock>=1.9.0 python-dotenv>=1.0.0 commands = - pytest {posargs} test/test_client.py test/test_integration.py -v + pytest {posargs} test/ -v [testenv:lint] deps = ruff>=0.1.0 commands = - ruff check src/ test/ - ruff format --check src/ test/ + ruff check src/ + ruff format --check src/ [testenv:format] deps = ruff>=0.1.0 commands = - ruff format src/ test/ - ruff check --fix src/ test/ + ruff format src/ + ruff check --fix src/ 
[testenv:type-check] deps = @@ -42,7 +42,7 @@ deps = requests-mock>=1.9.0 python-dotenv>=1.0.0 commands = - pytest test/test_client.py test/test_integration.py --cov=src/apihub_client --cov-report=term-missing --cov-report=html --cov-report=xml --cov-fail-under=85 + pytest test/ --cov=src/apihub_client --cov-report=term-missing --cov-report=html --cov-report=xml --cov-fail-under=85 [testenv:docs] deps =
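
The parametrized `test_status_handling_in_wait_for_completion` cases above pin down a three-way polling contract: `COMPLETED`, `SUCCESS`, and `FINISHED` return the result payload; `FAILED` and `ERROR` raise `ApiHubClientException`; any other status keeps polling until "Timeout waiting for completion" is raised. A minimal sketch of a loop satisfying those cases is shown below. It is illustrative only, not the library's actual implementation; the `get_result` callable is an assumption, while the two-argument exception signature is taken from `test_exception_instantiation`.

```python
import time

from apihub_client import ApiHubClientException

# Terminal states taken from the parametrized cases above.
SUCCESS_STATES = {"COMPLETED", "SUCCESS", "FINISHED"}
FAILURE_STATES = {"FAILED", "ERROR"}


def wait_for_completion_sketch(get_result, timeout=300, polling_interval=5):
    """Poll get_result() until a terminal status is seen or timeout expires.

    get_result is any zero-argument callable returning a dict with a
    "status" key, for example a closure around the status endpoint.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        result = get_result()
        status = result.get("status")
        if status in SUCCESS_STATES:
            return result
        if status in FAILURE_STATES:
            raise ApiHubClientException(f"Processing failed with status {status}", 200)
        # PROCESSING, IN_PROGRESS, RUNNING, or anything unknown: keep polling.
        time.sleep(polling_interval)
    raise ApiHubClientException("Timeout waiting for completion", None)
```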
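
`test_process_real_timing` also relies on requests_mock's response-list form of `m.get(url, [...])`, where successive requests matching the same URL consume successive entries. A stand-alone sketch of that pattern, using a hypothetical execution id, might look like this:

```python
import requests
import requests_mock

URL = "http://localhost:8005/api/v1/invoice?execution_id=demo"  # hypothetical id

with requests_mock.Mocker() as m:
    # Register two responses: the first GET sees PROCESSING, the second sees COMPLETED.
    m.get(
        URL,
        [
            {"json": {"status": "PROCESSING"}, "status_code": 200},
            {"json": {"status": "COMPLETED", "result": {"data": "final"}}, "status_code": 200},
        ],
    )

    assert requests.get(URL).json()["status"] == "PROCESSING"
    assert requests.get(URL).json()["status"] == "COMPLETED"
```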