Claude Comparet DB

The document presents a Python script that processes a PDF (RADAR) file to extract and chunk its text, generate sentence embeddings, and match each section against regulation embeddings stored in a database using cosine similarity above a configurable threshold. It includes memory-management safeguards, parallel batch processing for efficiency, and logging for tracking performance and errors. Key functionality covers one-time initialization of the embedding model, text extraction and chunking, and retrieval of stored embeddings from the database.
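
As a quick illustration of that matching step, the self-contained sketch below (separate from the full script that follows) encodes two small sets of placeholder sentences with the same model the script uses, computes their pairwise cosine similarities, and keeps the pairs that clear the script's 0.755 threshold; the example sentences are invented for illustration only.

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

radar_sections = ["Example text from the uploaded RADAR document.",
                  "Another extracted section."]
regulation_sections = ["Example clause from the regulations table.",
                       "An unrelated clause."]

# Normalized embeddings, as in the script's get_embeddings_batch()
radar_vecs = model.encode(radar_sections, normalize_embeddings=True)
reg_vecs = model.encode(regulation_sections, normalize_embeddings=True)

sims = cosine_similarity(radar_vecs, reg_vecs)  # shape: (n_radar, n_regulations)
rows, cols = np.where(sims > 0.755)             # same threshold as the script
for r, c in zip(rows, cols):
    print(f"{radar_sections[r]!r} ~ {regulation_sections[c]!r} ({sims[r, c]:.4f})")

The full script does the same thing at scale, adding batching, threading, and memory monitoring around this core step.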


import os

import sys
import logging
import datetime
import psutil
from takeTime import Timer
import pretty_errors
from sentence_transformers import SentenceTransformer
import torch
import numpy as np
import fitz
from text_processing import clear_text, is_radar_header
from semantic_text_splitter import TextSplitter
from result_printer import res_printer
from sklearn.metrics.pairwise import cosine_similarity
import gc
from concurrent.futures import ThreadPoolExecutor, as_completed
import re
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

logging.basicConfig(
filename="./data/logs/compareT_db_memory_usage.log",
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)

splitter = TextSplitter((260, 1000))

pretty_errors.configure(
display_timestamp=1,
timestamp_function=lambda: datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
lines_before=2,
lines_after=1,
display_locals=1,
)

# Configure memory threshold based on system capabilities
MAX_MEMORY = 0.75 * psutil.virtual_memory().total  # Use 75% of total system memory

# Global model initialization to avoid reloading


EMBEDDING_MODEL = None

def initialize_model():
"""Initialize the embedding model with optimal settings once globally."""
global EMBEDDING_MODEL
if EMBEDDING_MODEL is None:
# Check if GPU is available and set device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info(f"Using device: {device}")

EMBEDDING_MODEL = SentenceTransformer(
"sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
device=device
)
return EMBEDDING_MODEL
def log_memory_usage(step=""):
"""Logs the memory usage of the script with an optional step name."""
process = psutil.Process(os.getpid())
memory_used = process.memory_info().rss / 1024**2 # Convert bytes to MB
logging.info(f"Memory Usage after {step}: {memory_used:.2f} MB")

# Monitor system memory and trigger GC if needed


if process.memory_info().rss > MAX_MEMORY:
logging.warning(f"Memory threshold exceeded. Triggering garbage
collection.")
gc.collect()

def extract_list_blocks_sem_chunker(file):
"""Extract text chunks from PDF using LangChain with optimized settings."""
log_memory_usage("Before extracting text")

# Ensure the file path is correct


file_path = file if os.path.exists(file) else os.path.join('uploaded', file)

# Check if file exists before processing


if not os.path.exists(file_path):
logging.error(f"File not found: {file_path}")
return []

try:
loader = PyPDFLoader(file_path)

# Optimize chunk settings for better performance and relevance


text_splitter = RecursiveCharacterTextSplitter(
chunk_size=256, # Increased for better context
chunk_overlap=20, # Reduced overlap to minimize redundancy
length_function=len,
separators=["\n\n", "\n", ".", " "]
)

# Extract documents
documents = loader.load_and_split(text_splitter=text_splitter)

# Extract text only


text_chunks = [doc.page_content for doc in documents]

log_memory_usage("After extracting text")

# Process chunks in memory-efficient way


clean_chunks = merge_text_if_needed(text_chunks)

# Free memory
del documents
del text_chunks
gc.collect()

return clean_chunks

except Exception as e:
logging.error(f"Error extracting text: {e}")
return []

def merge_text_if_needed(chunks):
"""
Optimize the text merging process by improving boundary detection
and reducing memory allocations.
"""
if not chunks:
return []

# Pre-compile regex pattern for better performance


sentence_end_pattern = re.compile(r'[.!?]')

# Create a new list to avoid modifying the input list during iteration
merged_chunks = chunks.copy()

for i in range(1, len(merged_chunks)):


current_chunk_text = merged_chunks[i].strip()
prev_chunk_text = merged_chunks[i-1].strip()

# Skip empty chunks


if not current_chunk_text:
continue

# If the text starts with a lowercase letter


if current_chunk_text[0].islower():
# Find the first sentence-ending punctuation
match = sentence_end_pattern.search(current_chunk_text)
if match:
cutoff_position = match.end()
text_to_move = current_chunk_text[:cutoff_position].strip()

# Update chunks more efficiently


merged_chunks[i-1] = prev_chunk_text + ' ' + text_to_move
merged_chunks[i] = current_chunk_text[cutoff_position:].strip()

# Remove empty chunks to save memory


merged_chunks = [chunk for chunk in merged_chunks if chunk.strip()]

return merged_chunks

def get_optimal_batch_size(texts):
"""Dynamically determine optimal batch size based on text length and available
memory."""
avg_length = sum(len(text) for text in texts) / len(texts) if texts else 0

# Base batch size on average text length


if avg_length > 1000:
return 16 # Smaller batches for very long texts
elif avg_length > 500:
return 32 # Medium batches
else:
return 64 # Larger batches

def get_embeddings_batch(batch):
"""Process a single batch of embeddings."""
model = initialize_model()
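# Note: get_embeddings_parallel() calls initialize_model() before submitting any
# batches, so worker threads reuse the already-loaded global model here instead
# of loading it concurrently.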

try:
embeddings = model.encode(batch, normalize_embeddings=True,
show_progress_bar=False)
return embeddings
except Exception as e:
logging.error(f"Error generating embeddings: {e}")
# Return empty embeddings for failed batch
return np.zeros((len(batch), model.get_sentence_embedding_dimension()))

def get_embeddings_parallel(texts, max_workers=4):


"""
Generate embeddings for texts using parallel processing.

Args:
texts (list of str): List of text sections to encode
max_workers (int): Maximum number of parallel workers

Returns:
numpy.ndarray: Array of embedding vectors
"""
# Ensure model is initialized first
initialize_model()

if not texts:
return []

# Determine optimal batch size for the texts


batch_size = get_optimal_batch_size(texts)
logging.info(f"Using batch size: {batch_size}")

# Create batches
batches = [texts[i:i+batch_size] for i in range(0, len(texts), batch_size)]
all_embeddings = []

# Process batches in parallel


with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all batch processing tasks
future_to_batch = {executor.submit(get_embeddings_batch, batch): i
for i, batch in enumerate(batches)}

# Process completed batches


for future in as_completed(future_to_batch):
batch_idx = future_to_batch[future]
try:
batch_embeddings = future.result()
all_embeddings.append(batch_embeddings)
logging.info(f"Processed batch {batch_idx+1}/{len(batches)}")

# Force garbage collection after each batch


gc.collect()

except Exception as e:
logging.error(f"Error processing batch {batch_idx}: {e}")

# Combine all embeddings


if all_embeddings:
return np.vstack(all_embeddings)
return []

def calculate_cosine_similarity(query_v, doc_v, similarity_threshold):


"""Calculate cosine similarity with optimized memory usage."""
log_memory_usage("Before cosine similarity calculation")
# Convert inputs to numpy arrays if they aren't already
if not isinstance(query_v, np.ndarray):
query_v = np.array(query_v)
if not isinstance(doc_v, np.ndarray):
doc_v = np.array(doc_v)

# Process similarity in batches to reduce memory usage


batch_size = 1000 # Adjust based on your memory constraints
num_queries = query_v.shape[0]
result_indices = []
result_similarities = []

for i in range(0, num_queries, batch_size):


end_idx = min(i + batch_size, num_queries)
batch_queries = query_v[i:end_idx]

# Calculate similarities for this batch


similarities = cosine_similarity(batch_queries, doc_v)

# Find matches above threshold


matches = np.where(similarities > similarity_threshold)

# Adjust indices to account for batching


batch_matches = (matches[0] + i, matches[1])

# Store results
for q_idx, d_idx in zip(*batch_matches):
sim_value = similarities[q_idx - i, d_idx]
if sim_value < 1.0: # Exclude exact matches (self-matches)
result_indices.append((q_idx, d_idx))
result_similarities.append(sim_value)

# Clear batch data


del similarities
gc.collect()

# Convert to format expected by calling code


if result_indices:
result_matrix = tuple(zip(*result_indices))
similarity_values = np.zeros((num_queries, doc_v.shape[0]))
for (q_idx, d_idx), sim in zip(result_indices, result_similarities):
similarity_values[q_idx, d_idx] = sim
else:
result_matrix = (np.array([]), np.array([]))
similarity_values = np.zeros((num_queries, doc_v.shape[0]))

log_memory_usage("After cosine similarity calculation")


return result_matrix, similarity_values

def db_connect():
"""Database connection function - imported from db_worker module."""
from db_worker import db_connect
return db_connect()

def db_get_embeddings(cursor, table_name):


"""Get embeddings from database - imported from db_worker module."""
from db_worker import db_get_embeddings
return db_get_embeddings(cursor, table_name)
def main():
with Timer() as timer:
if len(sys.argv) < 2:
logging.error("Missing RADAR file argument. Usage: script.py
<radar_file_path>")
sys.exit(1)

radar_file = sys.argv[1]
if not os.path.exists(radar_file):
logging.error("Compare - Radar file not found.")
sys.exit(1)

log_memory_usage("Start of main()")

# Initialize model at start


initialize_model()

# Extract text chunks from radar file


radar_blocks = extract_list_blocks_sem_chunker(radar_file)
if not radar_blocks:
logging.error("No text blocks extracted from radar file.")
sys.exit(1)

logging.info(f"Extracted {len(radar_blocks)} text blocks from radar file")

# Generate embeddings in parallel with optimized batch processing


log_memory_usage("Before generating embeddings")
# Determine optimal number of workers based on CPU cores
num_workers = min(4, os.cpu_count() or 4)
logging.info(f"Using {num_workers} workers for parallel processing")

radar_embeddings = get_embeddings_parallel(radar_blocks,
max_workers=num_workers)
log_memory_usage("After generating embeddings")

# Get database embeddings


db_data_list = []
conn = None
try:
conn = db_connect()
cursor = conn.cursor()
table_name = "Regulations_embeddings"
db_data_list = db_get_embeddings(cursor, table_name)
logging.info(f"Retrieved {len(db_data_list)} embeddings from database")
except Exception as e:
logging.error(f"Database error: {e}")
finally:
if conn:
conn.close()

if not db_data_list:
logging.error("No embeddings retrieved from database.")
sys.exit(1)

# Extract embeddings and metadata


checker_embeddings = np.array([row[2] for row in db_data_list])
auth_roles = [row[3] for row in db_data_list]

# Calculate similarity with memory optimization


similarity_threshold = 0.755
result_matrix, similarity_values = calculate_cosine_similarity(
radar_embeddings, checker_embeddings, similarity_threshold
)

# Process matches more efficiently


matches = {}
for file_one_idx, file_two_idx in zip(*result_matrix):
key = radar_blocks[file_one_idx]

# Create match entry only once


if key not in matches:
matches[key] = []

# Create match object


dict_to_append = {
"Section_matched": str(db_data_list[file_two_idx][1]),
"Section_pdf_origin": str(db_data_list[file_two_idx][0]),
"Similarity": round(float(similarity_values[file_one_idx,
file_two_idx]), 4),
"Authority": auth_roles[file_two_idx]
}

# Check for duplicates efficiently


if not any(dict_to_append["Section_matched"] ==
existing["Section_matched"] and
dict_to_append["Section_pdf_origin"] ==
existing["Section_pdf_origin"]
for existing in matches[key]):
matches[key].append(dict_to_append)

# Clear large data structures to free memory


del radar_embeddings
del checker_embeddings
del db_data_list
del radar_blocks
gc.collect()
log_memory_usage("After similarity calculations")

# Process and output results


try:
results = {}
index_no = 0
if not matches:
results["nomatch"] = f"No matches found. Minimum similarity threshold
is {similarity_threshold}. Try lowering the threshold."
else:
for key, matches_list in matches.items():
index_no += 1
new_key = f"RADAR match section no.{index_no}"
results[new_key] = {
"Radar_section": key,
"Matches with": sorted(
matches_list, key=lambda x: x["Similarity"], reverse=True
),
}

# Clear matches to free memory


del matches
log_memory_usage("End of main()")

# Output results
res_printer(results, "compareT_db", "RALF")

except Exception as e:
logging.error(f"Error during result processing or printing: {e}")
print(f"Error during result processing or printing: {e}")

if __name__ == "__main__":
main()
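
# Example invocation, assuming the script is saved as compareT_db.py and the
# helper modules (takeTime, text_processing, result_printer, db_worker) are
# importable; the PDF file name below is hypothetical:
#   python compareT_db.py ./uploaded/radar_report.pdf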
