0% found this document useful (0 votes)
2 views

Python script for PDF_reading

The document outlines a Python class, PDFReader, designed to extract and organize text from PDF files into chapters and topics. It includes methods for extracting text, storing chapters and topics, sorting them, and answering questions based on keyword searches. The example usage demonstrates how to instantiate the class, process a PDF, and query for specific information.

Uploaded by

Anonymous YBAHVQ
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
2 views

Python script for PDF_reading

The document outlines a Python class, PDFReader, designed to extract and organize text from PDF files into chapters and topics. It includes methods for extracting text, storing chapters and topics, sorting them, and answering questions based on keyword searches. The example usage demonstrates how to instantiate the class, process a PDF, and query for specific information.

Uploaded by

Anonymous YBAHVQ
Copyright
© All Rights Reserved
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1 / 2

import PyPDF2

import re
from collections import defaultdict

class PDFReader:
    """Extract the text of a PDF and index it by chapter and numbered topic.

    Attributes:
        file_path: Path of the PDF, exactly as passed to the constructor.
        chapters: Maps ``'Chapter <i>'`` to the stripped text of the i-th
            chapter found by ``store_chapters``.
        topics: Maps ``'Chapter <i>'`` to the list of numbered topics found
            in that chapter.
    """

    # Example regex patterns, adjust according to your PDF structure.
    _CHAPTER_PATTERN = re.compile(r'Chapter \d+:')
    # A topic starts at "<number>. " and runs up to the next numbered item
    # (or the end of the chapter).  DOTALL is required so that a topic whose
    # text continues on following lines is captured instead of being dropped.
    _TOPIC_PATTERN = re.compile(r'\d+\.\s+(.*?)(?=\n\d+\.\s+|$)', re.DOTALL)

    def __init__(self, file_path):
        """Remember ``file_path``; the file is not opened until extraction."""
        self.file_path = file_path
        # NOTE: values stored in ``chapters`` are strings; the ``list``
        # default factory is kept only so lookups of unknown chapters keep
        # returning an empty, falsy value as before.
        self.chapters = defaultdict(list)
        self.topics = defaultdict(list)

    def extract_text(self):
        """Return the text of every page, each page followed by a newline."""
        with open(self.file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Pages are read while the file is still open.  ``or ''`` guards
            # against a page that yields no text object at all.
            return ''.join(
                (page.extract_text() or '') + '\n' for page in reader.pages
            )

    def store_chapters(self, text):
        """Split ``text`` into chapters and collect each chapter's topics.

        The index is rebuilt from scratch on every call, so processing the
        same document twice does not duplicate topics.

        Args:
            text: Full document text, e.g. the result of ``extract_text``.
        """
        self.chapters.clear()
        self.topics.clear()

        sections = self._CHAPTER_PATTERN.split(text)
        # Section 0 is whatever precedes the first chapter heading
        # (introduction or other non-chapter content), so it is skipped.
        # Chapters are labelled by their position in the document.
        for index, body in enumerate(sections[1:], start=1):
            label = f'Chapter {index}'
            self.chapters[label] = body.strip()
            found = [topic.strip() for topic in self._TOPIC_PATTERN.findall(body)]
            if found:
                self.topics[label] = found

    def sort_data(self):
        """Sort the topics of every chapter in place."""
        for chapter_topics in self.topics.values():
            chapter_topics.sort()

    def answer_question(self, question):
        """Return the stored topics that contain ``question`` as a phrase.

        Matching is case-insensitive.  Any run of whitespace in the question
        matches any run of whitespace in a topic, so a phrase that the PDF
        wrapped across lines is still found.

        Args:
            question: Phrase to look for.

        Returns:
            A list of ``'Found in <chapter>: <topic>'`` strings, or
            ``["No relevant information found."]`` when nothing matches or
            the question contains no words.
        """
        no_match = ["No relevant information found."]
        words = question.split()
        if not words:
            # An empty pattern would match every topic; treat it as no query.
            return no_match

        pattern = re.compile(r'\s+'.join(map(re.escape, words)), re.IGNORECASE)
        response = [
            f'Found in {chapter}: {topic}'
            for chapter, topics in self.topics.items()
            for topic in topics
            if pattern.search(topic)
        ]
        return response if response else no_match

    def process_pdf(self):
        """Extract the PDF's text, index chapters and topics, then sort topics."""
        text = self.extract_text()
        self.store_chapters(text)
        self.sort_data()

# Example usage
if __name__ == "__main__":
    import sys

    # Optional command-line overrides: the first argument is the PDF path and
    # any remaining arguments form the question.  Without arguments the
    # original example values are used.
    pdf_path = (
        sys.argv[1] if len(sys.argv) > 1
        else 'C:/Videos/363007BUSC01282_CDFE02_117.pdf'
    )
    pdf_reader = PDFReader(pdf_path)
    pdf_reader.process_pdf()

    # Example question
    question = ' '.join(sys.argv[2:]) or 'lifting devices need to be re-certified'
    answers = pdf_reader.answer_question(question)
    for answer in answers:
        print(answer)

You might also like