import PyPDF2
import re
from collections import defaultdict
class PDFReader:
    """Extract text from a PDF and index it by chapter and numbered topic.

    Typical use is ``PDFReader(path).process_pdf()`` followed by one or more
    ``answer_question(...)`` calls.

    Chapters are labelled ``'Chapter 1'``, ``'Chapter 2'``, ... in the order
    their headings appear in the text; the number printed in the heading
    itself is not used for the label.
    """

    # Example regex patterns, adjust according to your PDF structure.
    _CHAPTER_RE = re.compile(r'Chapter \d+:')
    # Adjust according to the topics structure of your PDF.
    # re.MULTILINE makes ``$`` match at the end of every line, so a numbered
    # topic is still captured when the following line is not itself numbered.
    _TOPIC_RE = re.compile(r'\d+\.\s+(.*?)(?=\n\d+\.\s+|$)', re.MULTILINE)
    _NOT_FOUND = "No relevant information found."

    def __init__(self, file_path: str) -> None:
        """Remember the PDF location; nothing is read until ``process_pdf``.

        Args:
            file_path: Path of the PDF file to read.
        """
        self.file_path = file_path
        # Chapter label -> stripped chapter text.
        self.chapters = defaultdict(list)
        # Chapter label -> list of topic titles found in that chapter.
        self.topics = defaultdict(list)

    def extract_text(self) -> str:
        """Return the text of every page, each page followed by a newline.

        Returns:
            The concatenated page text.
        """
        with open(self.file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Join inside the ``with`` block: the file must still be open
            # while pages are read. ``or ''`` guards against a page that
            # yields no text object.
            return ''.join(
                (page.extract_text() or '') + '\n' for page in reader.pages
            )

    def store_chapters(self, text: str) -> None:
        """Split ``text`` into chapters and record each chapter's topics.

        Everything before the first chapter heading is ignored. Calling this
        again replaces the stored text and topics of each chapter label it
        encounters, so re-processing does not duplicate topics.

        Args:
            text: Full document text, e.g. as returned by ``extract_text``.
        """
        sections = self._CHAPTER_RE.split(text)
        # sections[0] is the introduction or other non-chapter content.
        for index, section in enumerate(sections[1:], start=1):
            label = f'Chapter {index}'
            self.chapters[label] = section.strip()
            titles = (title.strip() for title in self._TOPIC_RE.findall(section))
            # Drop blank captures; they carry no searchable information.
            self.topics[label] = [title for title in titles if title]

    def sort_data(self) -> None:
        """Sort the topic titles of every chapter in place."""
        for titles in self.topics.values():
            titles.sort()

    def answer_question(self, question: str) -> list:
        """Find stored topics containing ``question`` as a phrase.

        The match is a case-insensitive substring search over topic titles
        only; surrounding whitespace in ``question`` is ignored.

        Args:
            question: Phrase to look for.

        Returns:
            One ``'Found in <chapter>: <topic>'`` string per matching topic,
            or a single-element list with a "not found" message when nothing
            matches or the question is blank.
        """
        phrase = question.strip()
        if not phrase:
            # An empty pattern would match every topic.
            return [self._NOT_FOUND]

        # Escape so the question is treated literally, and compile once
        # instead of once per topic.
        matcher = re.compile(re.escape(phrase), re.IGNORECASE)
        response = []
        for chapter, topics in self.topics.items():
            response.extend(
                f'Found in {chapter}: {topic}'
                for topic in topics
                if matcher.search(topic)
            )
        return response or [self._NOT_FOUND]

    def process_pdf(self) -> None:
        """Run the full pipeline: extract text, index chapters, sort topics."""
        text = self.extract_text()
        self.store_chapters(text)
        self.sort_data()
# Example usage: python <this_script> [pdf_path] [question words ...]
if __name__ == "__main__":
    import sys

    # Fall back to the sample document and question when no arguments are given.
    pdf_path = (
        sys.argv[1]
        if len(sys.argv) > 1
        else 'C:/Videos/363007BUSC01282_CDFE02_117.pdf'
    )
    question = ' '.join(sys.argv[2:]) or 'lifting devices need to be re-certified'

    pdf_reader = PDFReader(pdf_path)
    pdf_reader.process_pdf()

    for answer in pdf_reader.answer_question(question):
        print(answer)