15,667 changes: 15,667 additions & 0 deletions docs/ebooks_percival_keene.txt

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion pageindex/config.yaml
@@ -5,4 +5,8 @@ max_token_num_each_node: 20000
if_add_node_id: "yes"
if_add_node_summary: "no"
if_add_doc_description: "yes"
if_add_node_text: "no"
if_add_node_text: "no"
txt_page_method: "token" # "char" or "token"
txt_tokens_per_page: 512
txt_chars_per_page: 2048
txt_tokenizer: "gpt2"
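
For reference, a minimal sketch of how the new keys could be read and forwarded; the yaml/SimpleNamespace loading shown here is an assumption for illustration, not necessarily the project's actual config path, while the getattr defaults mirror the fallbacks used in page_index_main below:

import yaml
from types import SimpleNamespace

# Hypothetical loader: parse config.yaml into an options object.
with open("pageindex/config.yaml") as f:
    opt = SimpleNamespace(**yaml.safe_load(f))

# Defaults mirror the getattr fallbacks in page_index_main.
method = getattr(opt, "txt_page_method", "token")        # "char" or "token"
tokens_per_page = getattr(opt, "txt_tokens_per_page", 512)
chars_per_page = getattr(opt, "txt_chars_per_page", 2048)
tokenizer_name = getattr(opt, "txt_tokenizer", "gpt2")
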
33 changes: 24 additions & 9 deletions pageindex/page_index.py
@@ -971,7 +971,7 @@ async def meta_processor(page_list, mode=None, toc_content=None, toc_page_list=N
accuracy, incorrect_results = await verify_toc(page_list, toc_with_page_number, start_index=start_index, model=opt.model)

logger.info({
'mode': 'process_toc_with_page_numbers',
'mode': mode,
'accuracy': accuracy,
'incorrect_results': incorrect_results
})
@@ -1058,15 +1058,30 @@ async def tree_parser(page_list, opt, doc=None, logger=None):
def page_index_main(doc, opt=None):
logger = JsonLogger(doc)

is_valid_pdf = (
(isinstance(doc, str) and os.path.isfile(doc) and doc.lower().endswith(".pdf")) or
is_valid_document = (
(isinstance(doc, str) and os.path.isfile(doc) and
(doc.lower().endswith(".pdf") or doc.lower().endswith(".txt"))) or
isinstance(doc, BytesIO)
)
if not is_valid_pdf:
raise ValueError("Unsupported input type. Expected a PDF file path or BytesIO object.")
if not is_valid_document:
raise ValueError("Unsupported input type. Expected a PDF or TXT file path, or BytesIO object.")

print('Parsing PDF...')
page_list = get_page_tokens(doc)
# Determine file type for processing message
if isinstance(doc, str) and doc.lower().endswith(".txt"):
print('Parsing TXT...')
else:
print('Parsing PDF...')

# Pass TXT parameters to get_page_tokens
page_list = get_page_tokens(
doc,
model=opt.model if opt else "gpt-4o-2024-11-20",
txt_method=getattr(opt, 'txt_page_method', 'token'),
txt_tokens_per_page=getattr(opt, 'txt_tokens_per_page', 512),
txt_chars_per_page=getattr(opt, 'txt_chars_per_page', 2048),
txt_tokenizer=getattr(opt, 'txt_tokenizer', 'gpt2'),
txt_chunk_overlap=getattr(opt, 'txt_chunk_overlap', 10)
)

logger.info({'total_page_number': len(page_list)})
logger.info({'total_token': sum([page[1] for page in page_list])})
@@ -1085,12 +1100,12 @@ def page_index_main(doc, opt=None):
if opt.if_add_doc_description == 'yes':
doc_description = generate_doc_description(structure, model=opt.model)
return {
'doc_name': get_pdf_name(doc),
'doc_name': get_document_name(doc),
'doc_description': doc_description,
'structure': structure,
}
return {
'doc_name': get_pdf_name(doc),
'doc_name': get_document_name(doc),
'structure': structure,
}

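With these changes a plain-text file can be fed to the pipeline directly. A minimal usage sketch, assuming the options object is a SimpleNamespace whose fields follow pageindex/config.yaml (the real project may build it differently, and additional options may be required):

from types import SimpleNamespace
from pageindex.page_index import page_index_main

# Hypothetical options; field names follow pageindex/config.yaml.
opt = SimpleNamespace(
    model="gpt-4o-2024-11-20",
    max_token_num_each_node=20000,
    if_add_node_id="yes",
    if_add_node_summary="no",
    if_add_doc_description="yes",
    if_add_node_text="no",
    txt_page_method="token",      # "char" or "token"
    txt_tokens_per_page=512,
    txt_chars_per_page=2048,
    txt_tokenizer="gpt2",
)

result = page_index_main("docs/ebooks_percival_keene.txt", opt)
print(result["doc_name"], len(result["structure"]))
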
141 changes: 123 additions & 18 deletions pageindex/utils.py
@@ -2,6 +2,7 @@
import openai
import logging
import os
import re
from datetime import datetime
import time
import json
@@ -17,6 +18,14 @@
from pathlib import Path
from types import SimpleNamespace as config

# Add LlamaIndex import
try:
from llama_index.core.node_parser.text import TokenTextSplitter
LLAMA_INDEX_AVAILABLE = True
except ImportError:
LLAMA_INDEX_AVAILABLE = False
print("Warning: llama-index not available. Token-based TXT splitting will fall back to character-based.")

CHATGPT_API_KEY = os.getenv("CHATGPT_API_KEY")


@@ -292,25 +301,26 @@ def sanitize_filename(filename, replacement='-'):
# Null can't be represented in strings, so we only handle '/'.
return filename.replace('/', replacement)

def get_pdf_name(pdf_path):
# Extract PDF name
if isinstance(pdf_path, str):
pdf_name = os.path.basename(pdf_path)
elif isinstance(pdf_path, BytesIO):
pdf_reader = PyPDF2.PdfReader(pdf_path)
def get_document_name(doc_path):
# Extract document name for both PDF and TXT files
if isinstance(doc_path, str):
doc_name = os.path.basename(doc_path)
elif isinstance(doc_path, BytesIO):
# For BytesIO, assume it's a PDF and try to get title from metadata
pdf_reader = PyPDF2.PdfReader(doc_path)
meta = pdf_reader.metadata
pdf_name = meta.title if meta and meta.title else 'Untitled'
pdf_name = sanitize_filename(pdf_name)
return pdf_name
doc_name = meta.title if meta and meta.title else 'Untitled'
doc_name = sanitize_filename(doc_name)
return doc_name


class JsonLogger:
def __init__(self, file_path):
# Extract PDF name for logger name
pdf_name = get_pdf_name(file_path)
# Extract document name for logger name
doc_name = get_document_name(file_path)

current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
self.filename = f"{pdf_name}_{current_time}.json"
self.filename = f"{doc_name}_{current_time}.json"
os.makedirs("./logs", exist_ok=True)
# Initialize empty list to store all messages
self.log_data = []
@@ -408,10 +418,105 @@ def add_preface_if_needed(data):



def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
def get_txt_page_tokens(txt_path, model="gpt-4o-2024-11-20", max_chars_per_page=2048,
method="token", tokens_per_page=512, tokenizer_name="gpt2", chunk_overlap=10):
"""
Split TXT file into logical pages using character or token-based segmentation.

Args:
txt_path: Path to TXT file
model: Model name for token counting (for final token count)
max_chars_per_page: Maximum characters per page (for char method)
method: Segmentation method ("char" or "token")
tokens_per_page: Number of tokens per page (for token method)
tokenizer_name: Name of the tokenizer encoding (e.g., "gpt2", "cl100k_base")
chunk_overlap: Number of tokens to overlap between chunks (for token method)

Returns:
List of (page_text, token_count) tuples
"""
enc = tiktoken.encoding_for_model(model)

with open(txt_path, 'r', encoding='utf-8', errors='ignore') as f:
full_text = f.read()

if method == "token" and LLAMA_INDEX_AVAILABLE:
# Use token-based segmentation with LlamaIndex
try:
tokenizer = tiktoken.get_encoding(tokenizer_name)
text_splitter = TokenTextSplitter(
chunk_size=tokens_per_page,
chunk_overlap=chunk_overlap,
tokenizer=tokenizer.encode
)
chunks = text_splitter.split_text(full_text)

page_list = []
for chunk in chunks:
token_length = len(enc.encode(chunk))
page_list.append((chunk.strip(), token_length))

print(f"{len(page_list)} pages created using token-based segmentation (tokens_per_page = {tokens_per_page})")
return page_list

except Exception as e:
print(f"Token-based segmentation failed: {e}")
print("Falling back to character-based segmentation...")
method = "char"

# Character-based segmentation (fallback or explicit choice)
if method == "char" or not LLAMA_INDEX_AVAILABLE:
page_list = []
for i in range(0, len(full_text), max_chars_per_page):
page_text = full_text[i:i + max_chars_per_page]
# Try to end at a word boundary
if i + max_chars_per_page < len(full_text):
last_space = page_text.rfind(' ')
if last_space > max_chars_per_page * 0.8: # Only break at space if it's not too early
page_text = page_text[:last_space]

token_length = len(enc.encode(page_text))
page_list.append((page_text.strip(), token_length))

print(f"{len(page_list)} pages created using character-based segmentation (max_chars_per_page = {max_chars_per_page})")
return page_list


def get_page_tokens(doc_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2",
txt_method="token", txt_tokens_per_page=512, txt_chars_per_page=2048,
txt_tokenizer="gpt2", txt_chunk_overlap=10):
"""
Extract pages and token counts from PDF or TXT files.

Args:
doc_path: Path to PDF or TXT file, or BytesIO object
model: Model name for token counting
pdf_parser: PDF parser to use ("PyPDF2" or "PyMuPDF")
txt_method: TXT segmentation method ("char" or "token")
txt_tokens_per_page: Number of tokens per page for TXT files
txt_chars_per_page: Number of characters per page for TXT files
txt_tokenizer: Tokenizer encoding name for TXT files
txt_chunk_overlap: Token overlap between chunks for TXT files

Returns:
List of (page_text, token_count) tuples
"""
# Handle TXT files
if isinstance(doc_path, str) and doc_path.lower().endswith(".txt"):
return get_txt_page_tokens(
doc_path,
model=model,
max_chars_per_page=txt_chars_per_page,
method=txt_method,
tokens_per_page=txt_tokens_per_page,
tokenizer_name=txt_tokenizer,
chunk_overlap=txt_chunk_overlap
)

# Handle PDF files (existing logic)
enc = tiktoken.encoding_for_model(model)
if pdf_parser == "PyPDF2":
pdf_reader = PyPDF2.PdfReader(pdf_path)
pdf_reader = PyPDF2.PdfReader(doc_path)
page_list = []
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
@@ -420,11 +525,11 @@ def get_page_tokens(pdf_path, model="gpt-4o-2024-11-20", pdf_parser="PyPDF2"):
page_list.append((page_text, token_length))
return page_list
elif pdf_parser == "PyMuPDF":
if isinstance(pdf_path, BytesIO):
pdf_stream = pdf_path
if isinstance(doc_path, BytesIO):
pdf_stream = doc_path
doc = pymupdf.open(stream=pdf_stream, filetype="pdf")
elif isinstance(pdf_path, str) and os.path.isfile(pdf_path) and pdf_path.lower().endswith(".pdf"):
doc = pymupdf.open(pdf_path)
elif isinstance(doc_path, str) and os.path.isfile(doc_path) and doc_path.lower().endswith(".pdf"):
doc = pymupdf.open(doc_path)
page_list = []
for page in doc:
page_text = page.get_text()
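
As a quick illustration of the two TXT segmentation modes, a standalone sketch using only tiktoken and llama-index-core (pinned below); the file path is the sample e-book added in this PR:

import tiktoken
from llama_index.core.node_parser.text import TokenTextSplitter

with open("docs/ebooks_percival_keene.txt", encoding="utf-8", errors="ignore") as f:
    full_text = f.read()

# Token-based pages: ~512 GPT-2 tokens each, with a 10-token overlap between pages.
gpt2 = tiktoken.get_encoding("gpt2")
splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=10, tokenizer=gpt2.encode)
token_pages = splitter.split_text(full_text)

# Character-based fallback: fixed 2048-character windows (word-boundary trimming omitted here).
char_pages = [full_text[i:i + 2048] for i in range(0, len(full_text), 2048)]

print(f"token method: {len(token_pages)} pages, char method: {len(char_pages)} pages")
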
1 change: 1 addition & 0 deletions requirements.txt
@@ -4,3 +4,4 @@ PyPDF2==3.0.1
python-dotenv==1.1.0
tiktoken==0.7.0
pyyaml==6.0.2
llama-index-core==0.12.41