text_processor

The LEO Text Processor module processes text files to generate intents by cleaning the text, splitting it into sentences, and extracting key phrases. It includes methods for reading files, basic preprocessing, and utilizing spaCy for advanced phrase extraction, with a fallback to a simple approach if spaCy is unavailable. The module also provides progress and status updates during processing.

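For reference, a minimal usage sketch (assuming the module is saved as text_processor.py and that sample.txt is a text file on disk; both names are illustrative):

from text_processor import TextProcessor

processor = TextProcessor()
processor.on_status = print                           # print status messages as they arrive
processor.on_progress = lambda p: print(f"{p}%")      # print progress percentages

result = processor.process("sample.txt")              # path is illustrative
print(f"{len(result['sentences'])} sentences found")
print("Top key phrases:", result['key_phrases'][:5])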

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
LEO Text Processor

This module processes text files for intent generation.
"""

import os
import logging
import re
from collections import Counter

class TextProcessor:
    """Processes text files for intent generation."""

    def __init__(self):
        """Initialize the text processor."""
        # Callbacks for progress (0-100) and status messages; no-ops by default
        self.on_progress = lambda p: None
        self.on_status = lambda s: None

    def process(self, file_path):
        """
        Process a text file.

        Args:
            file_path (str): Path to the text file

        Returns:
            dict: Cleaned text, sentences, and key phrases
        """
        try:
            self.on_status(f"Processing text file: {os.path.basename(file_path)}")
            self.on_progress(10)

            # Read file
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                text = f.read()

            self.on_progress(30)

            # Basic preprocessing
            self.on_status("Cleaning text...")

            # Remove extra whitespace
            text = re.sub(r'\s+', ' ', text)

            # Split into sentences
            self.on_status("Splitting into sentences...")
            sentences = self._split_into_sentences(text)

            self.on_progress(70)

            # Extract key phrases
            self.on_status("Extracting key phrases...")
            key_phrases = self._extract_key_phrases(sentences)

            self.on_progress(90)

            # Combine results
            result = {
                'text': text,
                'sentences': sentences,
                'key_phrases': key_phrases
            }

            self.on_progress(100)
            self.on_status("Text processing complete")

            return result

        except Exception as e:
            logging.error(f"Error processing text file: {str(e)}", exc_info=True)
            raise

    def _split_into_sentences(self, text):
        """
        Split text into sentences.

        Args:
            text (str): Text to split

        Returns:
            list: List of sentences
        """
        # Simple sentence splitting
        sentences = re.split(r'(?<=[.!?])\s+', text)

        # Filter out empty sentences
        sentences = [s.strip() for s in sentences if s.strip()]

        return sentences

    def _extract_key_phrases(self, sentences):
        """
        Extract key phrases from sentences.

        Args:
            sentences (list): List of sentences

        Returns:
            list: List of key phrases
        """
        # Try to use spaCy if available
        try:
            import spacy

            # Load spaCy model
            nlp = spacy.load("en_core_web_sm")

            key_phrases = []

            for sentence in sentences:
                doc = nlp(sentence)

                # Extract noun phrases
                for chunk in doc.noun_chunks:
                    if len(chunk.text.split()) > 1:  # Only multi-word phrases
                        key_phrases.append(chunk.text)

                # Extract verb phrases
                for token in doc:
                    if token.pos_ == "VERB":
                        phrase = token.text
                        for child in token.children:
                            if child.dep_ in ["dobj", "pobj"]:
                                phrase += " " + child.text
                        key_phrases.append(phrase)

            return key_phrases

        except ImportError:
            # Fallback to simple approach if spaCy is not available
            logging.warning("spaCy not available, using simple key phrase extraction")

            # Tokenize
            words = []
            for sentence in sentences:
                words.extend(sentence.lower().split())

            # Count word frequencies
            word_counts = Counter(words)

            # Get common bigrams
            bigrams = []
            for i in range(len(words) - 1):
                bigrams.append(words[i] + " " + words[i + 1])

            bigram_counts = Counter(bigrams)

            # Return top phrases
            return [phrase for phrase, count in bigram_counts.most_common(20)]
