15,667 changes: 15,667 additions & 0 deletions docs/ebooks_percival_keene.txt

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion pageindex/config.yaml
@@ -5,4 +5,8 @@ max_token_num_each_node: 20000
if_add_node_id: "yes"
if_add_node_summary: "no"
if_add_doc_description: "yes"
if_add_node_text: "no"
if_add_node_text: "no"
txt_page_method: "token" # "char" or "token"
txt_tokens_per_page: 512
txt_chars_per_page: 2048
txt_tokenizer: "gpt2"
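
For reference, a minimal sketch of how the new keys could be read and forwarded; the yaml/SimpleNamespace loading shown here is an assumption for illustration, not necessarily the project's actual config path, while the getattr defaults mirror the fallbacks used in page_index_main below:

import yaml
from types import SimpleNamespace

# Hypothetical loader: parse config.yaml into an options object.
with open("pageindex/config.yaml") as f:
    opt = SimpleNamespace(**yaml.safe_load(f))

# Defaults mirror the getattr fallbacks in page_index_main.
method = getattr(opt, "txt_page_method", "token")        # "char" or "token"
tokens_per_page = getattr(opt, "txt_tokens_per_page", 512)
chars_per_page = getattr(opt, "txt_chars_per_page", 2048)
tokenizer_name = getattr(opt, "txt_tokenizer", "gpt2")
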
33 changes: 24 additions & 9 deletions pageindex/page_index.py
@@ -971,7 +971,7 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
accuracy, incorrect_results = await verify_toc(page_list, toc_with_page_number, start_index=start_index, model=opt.model)

logger.info({
'mode': 'process_toc_with_page_numbers',
'mode': mode,
'accuracy': accuracy,
'incorrect_results': incorrect_results
})
@@ -1058,15 +1058,30 @@ async def tree_parser(page_list, opt, doc=None, logger=None):
def page_index_main(doc, opt=None):
logger = JsonLogger(doc)

is_valid_pdf = (
(isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or
is_valid_document = (
(isinstance(doc, str) and os.path.isfile(doc) and
(doc.lower().endswith(".pdf") or doc.lower().endswith(".txt"))) or
isinstance(doc, BytesIO)
)
if not is_valid_pdf:
raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.")
if not is_valid_document:
raise ValueError("Unsupported input type. Expected a PDF or TXT file path, or BytesIO object.")

print('Parsing PDF...')
page_list = get_page_tokens(doc)
# Determine file type for processing message
if isinstance(doc, str) and doc.lower().endswith(".txt"):
print('Parsing TXT...')
else:
print('Parsing PDF...')

# Pass TXT parameters to get_page_tokens
page_list = get_page_tokens(
doc,
model=opt.model if opt else "gpt-4o-2024-11-20",
txt_method=getattr(opt, 'txt_page_method', 'token'),
txt_tokens_per_page=getattr(opt, 'txt_tokens_per_page', 512),
txt_chars_per_page=getattr(opt, 'txt_chars_per_page', 2048),
txt_tokenizer=getattr(opt, 'txt_tokenizer', 'gpt2'),
txt_chunk_overlap=getattr(opt, 'txt_chunk_overlap', 10)
)

logger.info({'total_page_number': len(page_list)})
logger.info({'total_token': sum([page[1] for page in page_list])})
@@ -1085,12 +1100,12 @@ def page_index_main(doc, opt=None):
if opt.if_add_doc_description == 'yes':
doc_description = generate_doc_description(structure, model=opt.model)
return {
'doc_name': get_pdf_name(doc),
'doc_name': get_document_name(doc),
'doc_description': doc_description,
'structure': structure,
}
return {
'doc_name': get_pdf_name(doc),
'doc_name': get_document_name(doc),
'structure': structure,
}

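With these changes a plain-text file can be fed to the pipeline directly. A minimal usage sketch, assuming the options object is a SimpleNamespace whose fields follow pageindex/config.yaml (the real project may build it differently, and additional options may be required):

from types import SimpleNamespace
from pageindex.page_index import page_index_main

# Hypothetical options; field names follow pageindex/config.yaml.
opt = SimpleNamespace(
    model="gpt-4o-2024-11-20",
    max_token_num_each_node=20000,
    if_add_node_id="yes",
    if_add_node_summary="no",
    if_add_doc_description="yes",
    if_add_node_text="no",
    txt_page_method="token",      # "char" or "token"
    txt_tokens_per_page=512,
    txt_chars_per_page=2048,
    txt_tokenizer="gpt2",
)

result = page_index_main("docs/ebooks_percival_keene.txt", opt)
print(result["doc_name"], len(result["structure"]))
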
141 changes: 123 additions & 18 deletions pageindex/utils.py
@@ -2,6 +2,7 @@
import openai
import logging
import os
import re
from datetime import datetime
import time
import json
@@ -17,6 +18,14 @@
from pathlib import Path
from types import SimpleNamespace as config

# Add LlamaIndex import
try:
from llama_index.core.node_parser.text import TokenTextSplitter
LLAMA_INDEX_AVAILABLE = True
except ImportError:
LLAMA_INDEX_AVAILABLE = False
print("Warning: llama-index not available. Token-based TXT splitting will fall back to character-based.")

CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")


@@ -292,25 +301,26 @@ def sanitize_filename(filename, replacement='-'):
# Null can't be represented in strings, so we only handle '/'.
return filename.replace('/', replacement)

def get_pdf_name(pdf_path):
# Extract PDF name
if isinstance(pdf_path, str):
pdf_name = os.path.basename(pdf_path)
elif isinstance(pdf_path, BytesIO):
pdf_reader = PyPDF2.PdfReader(pdf_path)
def get_document_name(doc_path):
# Extract document name for both PDF and TXT files
if isinstance(doc_path, str):
doc_name = os.path.basename(doc_path)
elif isinstance(doc_path, BytesIO):
# For BytesIO, assume it's a PDF and try to get title from metadata
pdf_reader = PyPDF2.PdfReader(doc_path)
meta = pdf_reader.metadata
pdf_name = meta.title if meta and meta.title else 'Untitled'
pdf_name = sanitize_filename(pdf_name)
return pdf_name
doc_name = meta.title if meta and meta.title else 'Untitled'
doc_name = sanitize_filename(doc_name)
return doc_name


class JsonLogger:
def __init__(self, file_path):
# Extract PDF name for logger name
pdf_name = get_pdf_name(file_path)
# Extract document name for logger name
doc_name = get_document_name(file_path)

current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
self.filename = f"{pdf_name}_{current_time}.json"
self.filename = f"{doc_name}_{current_time}.json"
os.makedirs("./logs", exist_ok=True)
# Initialize empty list to store all messages
self.log_data = []
@@ -408,10 +418,105 @@ def add_preface_if_needed(data):



def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
def get_txt_page_tokens(txt_path, model="gpt-4o-2024-11-20", max_chars_per_page=2048,
method="token", tokens_per_page=512, tokenizer_name="gpt2", chunk_overlap=10):
"""
Split TXT file into logical pages using character or token-based segmentation.

Args:
txt_path: Path to TXT file
model: Model name for token counting (for final token count)
max_chars_per_page: Maximum characters per page (for char method)
method: Segmentation method ("char" or "token")
tokens_per_page: Number of tokens per page (for token method)
tokenizer_name: Name of the tokenizer encoding (e.g., "gpt2", "cl100k_base")
chunk_overlap: Number of tokens to overlap between chunks (for token method)

Returns:
List of (page_text, token_count) tuples
"""
enc = tiktoken.encoding_for_model(model)

with open(txt_path, 'r', encoding='utf-8', errors='ignore') as f:
full_text = f.read()

if method == "token" and LLAMA_INDEX_AVAILABLE:
# Use token-based segmentation with LlamaIndex
try:
tokenizer = tiktoken.get_encoding(tokenizer_name)
text_splitter = TokenTextSplitter(
chunk_size=tokens_per_page,
chunk_overlap=chunk_overlap,
tokenizer=tokenizer.encode
)
chunks = text_splitter.split_text(full_text)

page_list = []
for chunk in chunks:
token_length = len(enc.encode(chunk))
page_list.append((chunk.strip(), token_length))

print(f"{len(page_list)} pages created using token-based segmentation (tokens_per_page = {tokens_per_page})")
return page_list

except Exception as e:
print(f"Token-based segmentation failed: {e}")
print("Falling back to character-based segmentation...")
method = "char"

# Character-based segmentation (fallback or explicit choice)
if method == "char" or not LLAMA_INDEX_AVAILABLE:
page_list = []
for i in range(0, len(full_text), max_chars_per_page):
page_text = full_text[i:i + max_chars_per_page]
# Try to end at a word boundary
if i + max_chars_per_page < len(full_text):
last_space = page_text.rfind(' ')
if last_space > max_chars_per_page * 0.8: # Only break at space if it's not too early
page_text = page_text[:last_space]

token_length = len(enc.encode(page_text))
page_list.append((page_text.strip(), token_length))

print(f"{len(page_list)} pages created using character-based segmentation (max_chars_per_page = {max_chars_per_page})")
return page_list


def get_page_tokens(doc_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2",
txt_method="token", txt_tokens_per_page=512, txt_chars_per_page=2048,
txt_tokenizer="gpt2", txt_chunk_overlap=10):
"""
Extract pages and token counts from PDF or TXT files.

Args:
doc_path: Path to PDF or TXT file, or BytesIO object
model: Model name for token counting
pdf_parser: PDF parser to use ("PyPDF2" or "PyMuPDF")
txt_method: TXT segmentation method ("char" or "token")
txt_tokens_per_page: Number of tokens per page for TXT files
txt_chars_per_page: Number of characters per page for TXT files
txt_tokenizer: Tokenizer encoding name for TXT files
txt_chunk_overlap: Token overlap between chunks for TXT files

Returns:
List of (page_text, token_count) tuples
"""
# Handle TXT files
if isinstance(doc_path, str) and doc_path.lower().endswith(".txt"):
return get_txt_page_tokens(
doc_path,
model=model,
max_chars_per_page=txt_chars_per_page,
method=txt_method,
tokens_per_page=txt_tokens_per_page,
tokenizer_name=txt_tokenizer,
chunk_overlap=txt_chunk_overlap
)

# Handle PDF files (existing logic)
enc = tiktoken.encoding_for_model(model)
if pdf_parser == "PyPDF2":
pdf_reader = PyPDF2.PdfReader(pdf_path)
pdf_reader = PyPDF2.PdfReader(doc_path)
page_list = []
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
@@ -420,11 +525,11 @@ def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
page_list.append((page_text, token_length))
return page_list
elif pdf_parser == "PyMuPDF":
if isinstance(pdf_path, BytesIO):
pdf_stream = pdf_path
if isinstance(doc_path, BytesIO):
pdf_stream = doc_path
doc = pymupdf.open(stream=pdf_stream, filetype="pdf")
elif isinstance(pdf_path, str) and os.path.isfile(pdf_path) and pdf_path.lower().endswith(".pdf"):
doc = pymupdf.open(pdf_path)
elif isinstance(doc_path, str) and os.path.isfile(doc_path) and doc_path.lower().endswith(".pdf"):
doc = pymupdf.open(doc_path)
page_list = []
for page in doc:
page_text = page.get_text()
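
As a quick illustration of the two TXT segmentation modes, a standalone sketch using only tiktoken and llama-index-core (pinned below); the file path is the sample e-book added in this PR:

import tiktoken
from llama_index.core.node_parser.text import TokenTextSplitter

with open("docs/ebooks_percival_keene.txt", encoding="utf-8", errors="ignore") as f:
    full_text = f.read()

# Token-based pages: ~512 GPT-2 tokens each, with a 10-token overlap between pages.
gpt2 = tiktoken.get_encoding("gpt2")
splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=10, tokenizer=gpt2.encode)
token_pages = splitter.split_text(full_text)

# Character-based fallback: fixed 2048-character windows (word-boundary trimming omitted here).
char_pages = [full_text[i:i + 2048] for i in range(0, len(full_text), 2048)]

print(f"token method: {len(token_pages)} pages, char method: {len(char_pages)} pages")
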
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,3 +4,4 @@ PyPDF2==3.0.1
python-dotenv==1.1.0
tiktoken==0.7.0
pyyaml==6.0.2
llama-index-core==0.12.41