#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LEO PDF Processor

Processes PDF files for intent generation.
"""

import os
import logging
import re
from collections import Counter

import fitz  # PyMuPDF; provides the load_page()/get_text() API used below
class PDFProcessor:
    """Processes PDF files for intent generation."""

    def __init__(self):
        """Initialize the PDF processor."""
        # Progress and status callbacks; no-ops by default so callers can override them.
        self.on_progress = lambda p: None
        self.on_status = lambda s: None
    def process_file(self, file_path):
        """Process a PDF file.

        Args:
            file_path (str): Path to the PDF file

        Returns:
            dict: Processed data
        """
        try:
            self.on_status(f"Processing PDF file: {os.path.basename(file_path)}")
            self.on_progress(10)

            # Open the document
            doc = fitz.open(file_path)
            self.on_progress(20)

            # Extract document metadata
            info = doc.metadata
            self.on_progress(30)
            self.on_progress(40)
            # Extract text
            self.on_status("Extracting text...")
            text = ""
            for page_num in range(len(doc)):
                self.on_status(f"Processing page {page_num + 1} of {len(doc)}...")
                page = doc.load_page(page_num)
                text += page.get_text()
                self.on_progress(40 + int(50 * (page_num + 1) / len(doc)))
            doc.close()

            # Split into sentences and extract key phrases (helpers defined below)
            sentences = self._split_sentences(text)
            key_phrases = self._extract_key_phrases(sentences)
            self.on_progress(90)
            # Combine results
            result = {
                'info': info,
                'text': text,
                'sentences': sentences,
                'key_phrases': key_phrases
            }
            self.on_progress(100)
            self.on_status("PDF processing complete")
            return result
        except Exception as e:
            logging.error(f"Error processing PDF file: {str(e)}", exc_info=True)
            raise
    def _split_sentences(self, text):
        """Split text into sentences.

        Args:
            text (str): Text to split

        Returns:
            list: List of sentences
        """
        # Clean text
        text = re.sub(r'\s+', ' ', text)
        # Split on sentence-ending punctuation followed by whitespace
        sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
        return sentences
    def _extract_key_phrases(self, sentences):
        """Extract key phrases from sentences.

        Args:
            sentences (list): List of sentences

        Returns:
            list: List of key phrases
        """
        # Try to use spaCy if available
        try:
            import spacy
            # Assumes the small English model is installed:
            #   python -m spacy download en_core_web_sm
            nlp = spacy.load("en_core_web_sm")
            key_phrases = []
            for sentence in sentences[:100]:  # Limit to first 100 sentences
                doc = nlp(sentence)
                key_phrases.extend(chunk.text.lower() for chunk in doc.noun_chunks)
            # Count frequencies
            phrase_counts = Counter(key_phrases)
            return [phrase for phrase, _ in phrase_counts.most_common(20)]
        except ImportError:
            # Fallback to simple approach if spaCy is not available
            logging.warning("spaCy not available, using simple key phrase extraction")
            # Tokenize
            words = []
            for sentence in sentences[:100]:  # Limit to first 100 sentences
                words.extend(sentence.lower().split())
            # Count adjacent word pairs as crude key phrases
            bigrams = [' '.join(pair) for pair in zip(words, words[1:])]
            bigram_counts = Counter(bigrams)
            return [phrase for phrase, _ in bigram_counts.most_common(20)]
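

# Usage sketch (not part of the original module): wires simple print-based
# callbacks into the processor and runs it on a PDF path taken from the command
# line. The fallback file name "sample.pdf" is a placeholder.
if __name__ == "__main__":
    import sys

    processor = PDFProcessor()
    processor.on_status = lambda s: print(f"[status] {s}")
    processor.on_progress = lambda p: print(f"[progress] {p}%")

    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "sample.pdf"
    data = processor.process_file(pdf_path)
    print(f"Extracted {len(data['sentences'])} sentences")
    print(f"Top key phrases: {data['key_phrases'][:5]}")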