PDF Processor

The LEO PDF Processor module processes PDF files for intent generation by extracting metadata, text, and key phrases. It uses the PyMuPDF library for PDF handling and can optionally use spaCy for richer key-phrase extraction. Progress is reported through callback hooks, and errors raised during processing are logged and re-raised.
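
For orientation, process() returns a dictionary with the following shape (field names taken from the code below; the values here are placeholders):

result = {
    'info': {...},        # PDF metadata: title, author, subject, keywords, num_pages, format
    'text': '...',        # full extracted text
    'sentences': [...],   # text split on sentence boundaries
    'key_phrases': [...]  # up to 30 frequent phrases
}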


#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
LEO PDF Processor

This module processes PDF files for intent generation.


"""

import os
import logging
import re
from collections import Counter

class PDFProcessor:
    """Processes PDF files for intent generation."""

    def __init__(self):
        """Initialize the PDF processor."""
        # Callback hooks; callers can override these to receive updates.
        self.on_progress = lambda p: None
        self.on_status = lambda s: None

    def process(self, file_path):
        """
        Process a PDF file.

        Args:
            file_path (str): Path to the PDF file

        Returns:
            dict: Processed data
        """
        try:
            self.on_status(f"Processing PDF file: {os.path.basename(file_path)}")
            self.on_progress(10)

            # Try to import PyMuPDF
            try:
                import fitz  # PyMuPDF
            except ImportError:
                raise ImportError(
                    "PyMuPDF (fitz) is required for PDF processing. "
                    "Please install it with 'pip install PyMuPDF'."
                )

            self.on_progress(20)

            # Open the PDF
            self.on_status("Opening PDF...")
            doc = fitz.open(file_path)

            self.on_progress(30)

            # Extract basic information
            self.on_status("Extracting document information...")
            info = {
                'title': doc.metadata.get('title', ''),
                'author': doc.metadata.get('author', ''),
                'subject': doc.metadata.get('subject', ''),
                'keywords': doc.metadata.get('keywords', ''),
                'num_pages': len(doc),
                'format': doc.metadata.get('format', '')
            }

            self.on_progress(40)

            # Extract text
            self.on_status("Extracting text...")
            text = ""
            for page_num in range(len(doc)):
                self.on_status(f"Processing page {page_num + 1} of {len(doc)}...")
                page = doc.load_page(page_num)
                text += page.get_text()
                self.on_progress(40 + int(50 * (page_num + 1) / len(doc)))

            # Split into sentences
            self.on_status("Splitting into sentences...")
            sentences = self._split_into_sentences(text)

            self.on_progress(90)

            # Extract key phrases
            self.on_status("Extracting key phrases...")
            key_phrases = self._extract_key_phrases(sentences)

            # Close the document
            doc.close()

            # Combine results
            result = {
                'info': info,
                'text': text,
                'sentences': sentences,
                'key_phrases': key_phrases
            }

            self.on_progress(100)
            self.on_status("PDF processing complete")

            return result

        except Exception as e:
            logging.error(f"Error processing PDF file: {str(e)}", exc_info=True)
            raise

    def _split_into_sentences(self, text):
        """
        Split text into sentences.

        Args:
            text (str): Text to split

        Returns:
            list: List of sentences
        """
        # Collapse all runs of whitespace into single spaces
        text = re.sub(r'\s+', ' ', text)

        # Simple sentence splitting: break after ., ! or ? followed by whitespace
        sentences = re.split(r'(?<=[.!?])\s+', text)

        # Filter out empty sentences
        sentences = [s.strip() for s in sentences if s.strip()]

        return sentences
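
    # Illustrative behavior of the splitter above (comment only, not executed):
    #   _split_into_sentences("It works. Really!  Done?")
    #   -> ['It works.', 'Really!', 'Done?']
    # Note that abbreviations such as "Dr." also trigger a split; that is a
    # known limitation of this simple regex-based approach.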

    def _extract_key_phrases(self, sentences):
        """
        Extract key phrases from sentences.

        Args:
            sentences (list): List of sentences

        Returns:
            list: List of key phrases
        """
        # Try to use spaCy if available
        try:
            import spacy

            # Load spaCy model (raises OSError if the model is not installed)
            nlp = spacy.load("en_core_web_sm")

            key_phrases = []

            for sentence in sentences[:100]:  # Limit to first 100 sentences
                doc = nlp(sentence)

                # Extract noun phrases
                for chunk in doc.noun_chunks:
                    if len(chunk.text.split()) > 1:  # Only multi-word phrases
                        key_phrases.append(chunk.text)

                # Extract verb phrases
                for token in doc:
                    if token.pos_ == "VERB":
                        phrase = token.text
                        for child in token.children:
                            if child.dep_ in ["dobj", "pobj"]:
                                phrase += " " + child.text
                        key_phrases.append(phrase)

            # Count frequencies
            phrase_counts = Counter(key_phrases)

            # Return top phrases
            return [phrase for phrase, count in phrase_counts.most_common(30)]

        except (ImportError, OSError):
            # Fall back to a simple approach if spaCy or its model is unavailable
            logging.warning("spaCy (or its model) not available, using simple key phrase extraction")

            # Tokenize
            words = []
            for sentence in sentences[:100]:  # Limit to first 100 sentences
                words.extend(sentence.lower().split())

            # Count word frequencies
            word_counts = Counter(words)

            # Get common bigrams
            bigrams = []
            for i in range(len(words) - 1):
                bigrams.append(words[i] + " " + words[i + 1])

            bigram_counts = Counter(bigrams)

            # Return top phrases
            return [phrase for phrase, count in bigram_counts.most_common(30)]
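
A minimal usage sketch (the file name sample.pdf is a hypothetical placeholder; the callback attributes are the on_progress/on_status hooks defined in __init__):

if __name__ == "__main__":
    processor = PDFProcessor()
    processor.on_status = lambda s: print(f"[status] {s}")
    processor.on_progress = lambda p: print(f"[progress] {p}%")

    result = processor.process("sample.pdf")  # hypothetical input path
    print(result['info'])
    print(f"Extracted {len(result['sentences'])} sentences")
    print("Top key phrases:", result['key_phrases'][:10])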
