Claude Comparet DB

The document presents a Python script that processes a PDF (RADAR) file to extract and chunk its text, generate sentence embeddings, and match each section against regulation embeddings stored in a database using cosine similarity above a configurable threshold. It includes memory-management safeguards, parallel batch processing for efficiency, and logging for tracking performance and errors. Key functionality covers one-time initialization of the embedding model, text extraction and chunking, and retrieval of stored embeddings from the database.
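
As a quick illustration of that matching step, the self-contained sketch below (separate from the full script that follows) encodes two small sets of placeholder sentences with the same model the script uses, computes their pairwise cosine similarities, and keeps the pairs that clear the script's 0.755 threshold; the example sentences are invented for illustration only.

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

radar_sections = ["Example text from the uploaded RADAR document.",
                  "Another extracted section."]
regulation_sections = ["Example clause from the regulations table.",
                       "An unrelated clause."]

# Normalized embeddings, as in the script's get_embeddings_batch()
radar_vecs = model.encode(radar_sections, normalize_embeddings=True)
reg_vecs = model.encode(regulation_sections, normalize_embeddings=True)

sims = cosine_similarity(radar_vecs, reg_vecs)  # shape: (n_radar, n_regulations)
rows, cols = np.where(sims > 0.755)             # same threshold as the script
for r, c in zip(rows, cols):
    print(f"{radar_sections[r]!r} ~ {regulation_sections[c]!r} ({sims[r, c]:.4f})")

The full script does the same thing at scale, adding batching, threading, and memory monitoring around this core step.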


import os

import sys
import logging
import datetime
import psutil
from takeTime import Timer
import pretty_errors
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import fitz
from text_processing import clear_text, is_radar_header
from semantic_text_splitter import TextSplitter
from result_printer import res_printer
from sklearn.metrics.pairwise import cosine_similarity
import gc
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

logging.basicConfig(
filename="./data/logs/compareT_db_memory_usage.log",
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)

splitter = TextSplitter((260, 1000))

pretty_errors.configure(
display_timestamp=1,
timestamp_function=lambda: datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
lines_before=2,
lines_after=1,
display_locals=1,
)

# Configure memory threshold based on system capabilities
MAX_MEMORY = 0.75 * psutil.virtual_memory().total  # Use 75% of total system memory

# Global model initialization to avoid reloading


EMBEDDING_MODEL = None

def initialize_model():
"""Initialize the embedding model with optimal settings once globally."""
global EMBEDDING_MODEL
if EMBEDDING_MODEL is None:
# Check if GPU is available and set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {device}")

EMBEDDING_MODEL = SentenceTransformer(
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
device=device
)
return EMBEDDING_MODEL
def log_memory_usage(step=""):
"""Logs the memory usage of the script with an optional step name."""
process = psutil.Process(os.getpid())
memory_used = process.memory_info().rss / 1024**2 # Convert bytes to MB
logging.info(f"Memory Usage after {step}: {memory_used:.2f} MB")

# Monitor system memory and trigger GC if needed


if process.memory_info().rss > MAX_MEMORY:
logging.warning(f"Memory threshold exceeded. Triggering garbage
collection.")
gc.collect()

def extract_list_blocks_sem_chunker(file):
"""Extract text chunks from PDF using LangChain with optimized settings."""
log_memory_usage("Before extracting text")

# Ensure the file path is correct


file_path = file if os.path.exists(file) else os.path.join('uploaded', file)

# Check if file exists before processing


if not os.path.exists(file_path):
logging.error(f"File not found: {file_path}")
return []

try:
loader = PyPDFLoader(file_path)

# Optimize chunk settings for better performance and relevance


text_splitter = RecursiveCharacterTextSplitter(
chunk_size=256, # Increased for better context
chunk_overlap=20, # Reduced overlap to minimize redundancy
length_function=len,
separators=["\n\n", "\n", ".", " "]
)

# Extract documents
documents = loader.load_and_split(text_splitter=text_splitter)

# Extract text only


text_chunks = [doc.page_content for doc in documents]

log_memory_usage("After extracting text")

# Process chunks in memory-efficient way


clean_chunks = merge_text_if_needed(text_chunks)

# Free memory
del documents
del text_chunks
gc.collect()

return clean_chunks

except Exception as e:
logging.error(f"Error extracting text: {e}")
return []

def merge_text_if_needed(chunks):
"""
Optimize the text merging process by improving boundary detection
and reducing memory allocations.
"""
if not chunks:
return []

# Pre-compile regex pattern for better performance


sentence_end_pattern = re.compile(r'[.!?]')

# Create a new list to avoid modifying the input list during iteration
merged_chunks = chunks.copy()

for i in range(1, len(merged_chunks)):


current_chunk_text = merged_chunks[i].strip()
prev_chunk_text = merged_chunks[i-1].strip()

# Skip empty chunks


if not current_chunk_text:
continue

# If the text starts with a lowercase letter


if current_chunk_text[0].islower():
# Find the first sentence-ending punctuation
match = sentence_end_pattern.search(current_chunk_text)
if match:
cutoff_position = match.end()
text_to_move = current_chunk_text[:cutoff_position].strip()

# Update chunks more efficiently


merged_chunks[i-1] = prev_chunk_text + ' ' + text_to_move
merged_chunks[i] = current_chunk_text[cutoff_position:].strip()

# Remove empty chunks to save memory


merged_chunks = [chunk for chunk in merged_chunks if chunk.strip()]

return merged_chunks

def get_optimal_batch_size(texts):
"""Dynamically determine optimal batch size based on text length and available
memory."""
avg_length = sum(len(text) for text in texts) / len(texts) if texts else 0

# Base batch size on average text length


if avg_length > 1000:
return 16 # Smaller batches for very long texts
elif avg_length > 500:
return 32 # Medium batches
else:
return 64 # Larger batches

def get_embeddings_batch(batch):
"""Process a single batch of embeddings."""
model = initialize_model()
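# Note: get_embeddings_parallel() calls initialize_model() before submitting any
# batches, so worker threads reuse the already-loaded global model here instead
# of loading it concurrently.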

try:
embeddings = model.encode(batch, normalize_embeddings=True,
show_progress_bar=False)
return embeddings
except Exception as e:
logging.error(f"Error generating embeddings: {e}")
# Return empty embeddings for failed batch
return np.zeros((len(batch), model.get_sentence_embedding_dimension()))

def get_embeddings_parallel(texts, max_workers=4):


"""
Generate embeddings for texts using parallel processing.

Args:
texts (list of str): List of text sections to encode
max_workers (int): Maximum number of parallel workers

Returns:
numpy.ndarray: Array of embedding vectors
"""
# Ensure model is initialized first
initialize_model()

if not texts:
return []

# Determine optimal batch size for the texts


batch_size = get_optimal_batch_size(texts)
logging.info(f"Using batch size: {batch_size}")

# Create batches
batches = [texts[i:i+batch_size] for i in range(0, len(texts), batch_size)]
all_embeddings = []

# Process batches in parallel


with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all batch processing tasks
future_to_batch = {executor.submit(get_embeddings_batch, batch): i
for i, batch in enumerate(batches)}

# Process completed batches


for future in as_completed(future_to_batch):
batch_idx = future_to_batch[future]
try:
batch_embeddings = future.result()
all_embeddings.append(batch_embeddings)
logging.info(f"Processed batch {batch_idx+1}/{len(batches)}")

# Force garbage collection after each batch


gc.collect()

except Exception as e:
logging.error(f"Error processing batch {batch_idx}: {e}")

# Combine all embeddings


if all_embeddings:
return np.vstack(all_embeddings)
return []

def calculate_cosine_similarity(query_v, doc_v, similarity_threshold):


"""Calculate cosine similarity with optimized memory usage."""
log_memory_usage("Before cosine similarity calculation")
# Convert inputs to numpy arrays if they aren't already
if not isinstance(query_v, np.ndarray):
query_v = np.array(query_v)
if not isinstance(doc_v, np.ndarray):
doc_v = np.array(doc_v)

# Process similarity in batches to reduce memory usage


batch_size = 1000 # Adjust based on your memory constraints
num_queries = query_v.shape[0]
result_indices = []
result_similarities = []

for i in range(0, num_queries, batch_size):


end_idx = min(i + batch_size, num_queries)
batch_queries = query_v[i:end_idx]

# Calculate similarities for this batch


similarities = cosine_similarity(batch_queries, doc_v)

# Find matches above threshold


matches = np.where(similarities > similarity_threshold)

# Adjust indices to account for batching


batch_matches = (matches[0] + i, matches[1])

# Store results
for q_idx, d_idx in zip(*batch_matches):
sim_value = similarities[q_idx - i, d_idx]
if sim_value < 1.0: # Exclude exact matches (self-matches)
result_indices.append((q_idx, d_idx))
result_similarities.append(sim_value)

# Clear batch data


del similarities
gc.collect()

# Convert to format expected by calling code


if result_indices:
result_matrix = tuple(zip(*result_indices))
similarity_values = np.zeros((num_queries, doc_v.shape[0]))
for (q_idx, d_idx), sim in zip(result_indices, result_similarities):
similarity_values[q_idx, d_idx] = sim
else:
result_matrix = (np.array([]), np.array([]))
similarity_values = np.zeros((num_queries, doc_v.shape[0]))

log_memory_usage("After cosine similarity calculation")


return result_matrix, similarity_values

def db_connect():
"""Database connection function - imported from db_worker module."""
from db_worker import db_connect
return db_connect()

def db_get_embeddings(cursor, table_name):


"""Get embeddings from database - imported from db_worker module."""
from db_worker import db_get_embeddings
return db_get_embeddings(cursor, table_name)
def main():
with Timer() as timer:
if len(sys.argv) < 2:
logging.error("Missing RADAR file argument. Usage: script.py
<radar_file_path>")
sys.exit(1)

radar_file = sys.argv[1]
if not os.path.exists(radar_file):
logging.error("Compare - Radar file not found.")
sys.exit(1)

log_memory_usage("Start of main()")

# Initialize model at start


initialize_model()

# Extract text chunks from radar file


radar_blocks = extract_list_blocks_sem_chunker(radar_file)
if not radar_blocks:
logging.error("No text blocks extracted from radar file.")
sys.exit(1)

logging.info(f"Extracted {len(radar_blocks)} text blocks from radar file")

# Generate embeddings in parallel with optimized batch processing


log_memory_usage("Before generating embeddings")
# Determine optimal number of workers based on CPU cores
num_workers = min(4, os.cpu_count() or 4)
logging.info(f"Using {num_workers} workers for parallel processing")

radar_embeddings = get_embeddings_parallel(radar_blocks,
max_workers=num_workers)
log_memory_usage("After generating embeddings")

# Get database embeddings


db_data_list = []
conn = None
try:
conn = db_connect()
cursor = conn.cursor()
table_name = "Regulations_embeddings"
db_data_list = db_get_embeddings(cursor, table_name)
logging.info(f"Retrieved {len(db_data_list)} embeddings from database")
except Exception as e:
logging.error(f"Database error: {e}")
finally:
if conn:
conn.close()

if not db_data_list:
logging.error("No embeddings retrieved from database.")
sys.exit(1)

# Extract embeddings and metadata


checker_embeddings = np.array([row[2] for row in db_data_list])
auth_roles = [row[3] for row in db_data_list]

# Calculate similarity with memory optimization


similarity_threshold = 0.755
result_matrix, similarity_values = calculate_cosine_similarity(
radar_embeddings, checker_embeddings, similarity_threshold
)

# Process matches more efficiently


matches = {}
for file_one_idx, file_two_idx in zip(*result_matrix):
key = radar_blocks[file_one_idx]

# Create match entry only once


if key not in matches:
matches[key] = []

# Create match object


dict_to_append = {
"Section_matched": str(db_data_list[file_two_idx][1]),
"Section_pdf_origin": str(db_data_list[file_two_idx][0]),
"Similarity": round(float(similarity_values[file_one_idx,
file_two_idx]), 4),
"Authority": auth_roles[file_two_idx]
}

# Check for duplicates efficiently


if not any(dict_to_append["Section_matched"] ==
existing["Section_matched"] and
dict_to_append["Section_pdf_origin"] ==
existing["Section_pdf_origin"]
for existing in matches[key]):
matches[key].append(dict_to_append)

# Clear large data structures to free memory


del radar_embeddings
del checker_embeddings
del db_data_list
del radar_blocks
gc.collect()
log_memory_usage("After similarity calculations")

# Process and output results


try:
results = {}
index_no = 0
if not matches:
results["nomatch"] = f"No matches found. Minimum similarity threshold
is {similarity_threshold}. Try lowering the threshold."
else:
for key, matches_list in matches.items():
index_no += 1
new_key = f"RADAR match section no.{index_no}"
results[new_key] = {
"Radar_section": key,
"Matches with": sorted(
matches_list, key=lambda x: x["Similarity"], reverse=True
),
}

# Clear matches to free memory


del matches
log_memory_usage("End of main()")

# Output results
res_printer(results, "compareT_db", "RALF")

except Exception as e:
logging.error(f"Error during result processing or printing: {e}")
print(f"Error during result processing or printing: {e}")

if __name__ == "__main__":
main()
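
# Example invocation, assuming the script is saved as compareT_db.py and the
# helper modules (takeTime, text_processing, result_printer, db_worker) are
# importable; the PDF file name below is hypothetical:
#   python compareT_db.py ./uploaded/radar_report.pdf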
