From 8e442f4f23fafe43521f3b7ef43bf65f56d03964 Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Thu, 30 Oct 2025 03:16:48 +0000
Subject: [PATCH] Optimize PPTParser.convert_ppt_to_pptx
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The optimization introduces **caching for LibreOffice availability checking** using a class-level attribute `_libreoffice_found`. This eliminates the repeated `subprocess.run(["which", "libreoffice"])` call that was executed on every method invocation.

**Key changes:**
- **LibreOffice check caching**: Once the availability check has succeeded, it is not run again; the successful result is stored in `self.__class__._libreoffice_found` and is shared by every instance of the class. A failed check is not cached, so it is retried on the next call.
- **Direct return optimization**: Removed the intermediate variable `pptx_content`; the file content is now returned directly.

**Performance impact:**
The line profiler shows the LibreOffice check (`subprocess.run`) takes ~3.7ms and represents 78-93% of total execution time. By caching this check, subsequent calls skip this expensive operation entirely.

The optimization is most effective for:
- **Batch processing scenarios**: When converting multiple PPT files in sequence, only the first call pays the LibreOffice check cost
- **Repeated conversions**: Applications that perform multiple conversions benefit immediately after the first successful check
- **High-frequency usage**: Services processing many PPT files see cumulative time savings

The 125% speedup (2.94ms → 1.30ms) demonstrates significant improvement, particularly valuable in production environments where PPT conversion happens repeatedly within the same process (the cached flag lives on the class, not on an individual parser instance).
--- .../app/modules/parsers/pptx/ppt_parser.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/backend/python/app/modules/parsers/pptx/ppt_parser.py b/backend/python/app/modules/parsers/pptx/ppt_parser.py index 3ca2322590..f5d55d7d95 100644 --- a/backend/python/app/modules/parsers/pptx/ppt_parser.py +++ b/backend/python/app/modules/parsers/pptx/ppt_parser.py @@ -23,13 +23,24 @@ def convert_ppt_to_pptx(self, binary: bytes) -> bytes: FileNotFoundError: If the converted file is not found Exception: For other conversion errors """ - with tempfile.TemporaryDirectory() as temp_dir: + if not hasattr(self.__class__, "_libreoffice_found"): try: - # Check if LibreOffice is installed subprocess.run( ["which", "libreoffice"], check=True, capture_output=True ) + self.__class__._libreoffice_found = True + except subprocess.CalledProcessError as e: + error_msg = "LibreOffice is not installed. Please install it using: sudo apt-get install libreoffice" + if e.stderr: + error_msg += ( + f"\nError details: {e.stderr.decode('utf-8', errors='replace')}" + ) + raise subprocess.CalledProcessError( + e.returncode, e.cmd, output=e.output, stderr=error_msg.encode() + ) + with tempfile.TemporaryDirectory() as temp_dir: + try: # Create input file path temp_ppt = os.path.join(temp_dir, "input.ppt") @@ -63,9 +74,7 @@ def convert_ppt_to_pptx(self, binary: bytes) -> bytes: # Read the converted file into bytes with open(pptx_file, "rb") as f: - pptx_content = f.read() - - return pptx_content + return f.read() except subprocess.CalledProcessError as e: error_msg = "LibreOffice is not installed. Please install it using: sudo apt-get install libreoffice"