PDF Processor

The LEO PDF Processor module processes PDF files for intent generation by extracting metadata, text, and key phrases. It uses the PyMuPDF library for PDF handling and can optionally use spaCy for richer key-phrase extraction. Progress is reported through callback hooks, and errors raised during processing are logged and re-raised.
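
For orientation, process() returns a dictionary with the following shape (field names taken from the code below; the values here are placeholders):

result = {
    'info': {...},        # PDF metadata: title, author, subject, keywords, num_pages, format
    'text': '...',        # full extracted text
    'sentences': [...],   # text split on sentence boundaries
    'key_phrases': [...]  # up to 30 frequent phrases
}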


#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
LEO PDF Processor

This module processes PDF files for intent generation.


"""

import os
import logging
import re
from collections import Counter

class PDFProcessor:
    """Processes PDF files for intent generation."""

    def __init__(self):
        """Initialize the PDF processor."""
        # Callback hooks; callers can override these to receive updates.
        self.on_progress = lambda p: None
        self.on_status = lambda s: None

    def process(self, file_path):
        """
        Process a PDF file.

        Args:
            file_path (str): Path to the PDF file

        Returns:
            dict: Processed data
        """
        try:
            self.on_status(f"Processing PDF file: {os.path.basename(file_path)}")
            self.on_progress(10)

            # Try to import PyMuPDF
            try:
                import fitz  # PyMuPDF
            except ImportError:
                raise ImportError(
                    "PyMuPDF (fitz) is required for PDF processing. "
                    "Please install it with 'pip install PyMuPDF'."
                )

            self.on_progress(20)

            # Open the PDF
            self.on_status("Opening PDF...")
            doc = fitz.open(file_path)

            self.on_progress(30)

            # Extract basic information
            self.on_status("Extracting document information...")
            info = {
                'title': doc.metadata.get('title', ''),
                'author': doc.metadata.get('author', ''),
                'subject': doc.metadata.get('subject', ''),
                'keywords': doc.metadata.get('keywords', ''),
                'num_pages': len(doc),
                'format': doc.metadata.get('format', '')
            }

            self.on_progress(40)

            # Extract text
            self.on_status("Extracting text...")
            text = ""
            for page_num in range(len(doc)):
                self.on_status(f"Processing page {page_num + 1} of {len(doc)}...")
                page = doc.load_page(page_num)
                text += page.get_text()
                self.on_progress(40 + int(50 * (page_num + 1) / len(doc)))

            # Split into sentences
            self.on_status("Splitting into sentences...")
            sentences = self._split_into_sentences(text)

            self.on_progress(90)

            # Extract key phrases
            self.on_status("Extracting key phrases...")
            key_phrases = self._extract_key_phrases(sentences)

            # Close the document
            doc.close()

            # Combine results
            result = {
                'info': info,
                'text': text,
                'sentences': sentences,
                'key_phrases': key_phrases
            }

            self.on_progress(100)
            self.on_status("PDF processing complete")

            return result

        except Exception as e:
            logging.error(f"Error processing PDF file: {str(e)}", exc_info=True)
            raise

    def _split_into_sentences(self, text):
        """
        Split text into sentences.

        Args:
            text (str): Text to split

        Returns:
            list: List of sentences
        """
        # Collapse all runs of whitespace into single spaces
        text = re.sub(r'\s+', ' ', text)

        # Simple sentence splitting: break after ., ! or ? followed by whitespace
        sentences = re.split(r'(?<=[.!?])\s+', text)

        # Filter out empty sentences
        sentences = [s.strip() for s in sentences if s.strip()]

        return sentences
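
    # Illustrative behavior of the splitter above (comment only, not executed):
    #   _split_into_sentences("It works. Really!  Done?")
    #   -> ['It works.', 'Really!', 'Done?']
    # Note that abbreviations such as "Dr." also trigger a split; that is a
    # known limitation of this simple regex-based approach.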

    def _extract_key_phrases(self, sentences):
        """
        Extract key phrases from sentences.

        Args:
            sentences (list): List of sentences

        Returns:
            list: List of key phrases
        """
        # Try to use spaCy if available
        try:
            import spacy

            # Load spaCy model (raises OSError if the model is not installed)
            nlp = spacy.load("en_core_web_sm")

            key_phrases = []

            for sentence in sentences[:100]:  # Limit to first 100 sentences
                doc = nlp(sentence)

                # Extract noun phrases
                for chunk in doc.noun_chunks:
                    if len(chunk.text.split()) > 1:  # Only multi-word phrases
                        key_phrases.append(chunk.text)

                # Extract verb phrases
                for token in doc:
                    if token.pos_ == "VERB":
                        phrase = token.text
                        for child in token.children:
                            if child.dep_ in ["dobj", "pobj"]:
                                phrase += " " + child.text
                        key_phrases.append(phrase)

            # Count frequencies
            phrase_counts = Counter(key_phrases)

            # Return top phrases
            return [phrase for phrase, count in phrase_counts.most_common(30)]

        except (ImportError, OSError):
            # Fall back to a simple approach if spaCy or its model is unavailable
            logging.warning("spaCy (or its model) not available, using simple key phrase extraction")

            # Tokenize
            words = []
            for sentence in sentences[:100]:  # Limit to first 100 sentences
                words.extend(sentence.lower().split())

            # Count word frequencies
            word_counts = Counter(words)

            # Get common bigrams
            bigrams = []
            for i in range(len(words) - 1):
                bigrams.append(words[i] + " " + words[i + 1])

            bigram_counts = Counter(bigrams)

            # Return top phrases
            return [phrase for phrase, count in bigram_counts.most_common(30)]
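
A minimal usage sketch (the file name sample.pdf is a hypothetical placeholder; the callback attributes are the on_progress/on_status hooks defined in __init__):

if __name__ == "__main__":
    processor = PDFProcessor()
    processor.on_status = lambda s: print(f"[status] {s}")
    processor.on_progress = lambda p: print(f"[progress] {p}%")

    result = processor.process("sample.pdf")  # hypothetical input path
    print(result['info'])
    print(f"Extracted {len(result['sentences'])} sentences")
    print("Top key phrases:", result['key_phrases'][:10])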
