#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LEO Text Processor
import os
import logging
import re
from collections import Counter


class TextProcessor:
    """Processes text files for intent generation."""

    def __init__(self):
        """Initialize the text processor."""
        # Progress (0-100) and status-message callbacks; no-ops by default
        # so callers can replace them with their own handlers.
        self.on_progress = lambda p: None
        self.on_status = lambda s: None

    def process_file(self, file_path):
        """Process a text file.

        Args:
            file_path (str): Path to the text file

        Returns:
            dict: Processed text, its sentences, and extracted key phrases
        """
        try:
            self.on_status(f"Processing text file: {os.path.basename(file_path)}")
            self.on_progress(10)

            # Read file
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                text = f.read()
            self.on_progress(30)

            # Basic preprocessing: collapse whitespace runs (assumed cleanup step)
            self.on_status("Cleaning text...")
            text = re.sub(r'\s+', ' ', text).strip()

            # Split into sentences, then extract key phrases from them
            sentences = self.split_sentences(text)
            self.on_progress(70)
            key_phrases = self.extract_key_phrases(sentences)
            self.on_progress(90)
            # Combine results
            result = {
                'text': text,
                'sentences': sentences,
                'key_phrases': key_phrases
            }
            self.on_progress(100)
            self.on_status("Text processing complete")
            return result
        except Exception as e:
            logging.error(f"Error processing text file: {str(e)}", exc_info=True)
            raise

    def split_sentences(self, text):
        """Split text into sentences.

        Args:
            text (str): Text to split

        Returns:
            list: List of sentences
        """
        # Simple sentence splitting on whitespace after ., !, or ?
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return sentences

    def extract_key_phrases(self, sentences):
        """Extract key phrases from sentences.

        Args:
            sentences (list): List of sentences

        Returns:
            list: List of key phrases
        """
        # Try to use spaCy if available
        try:
            import spacy

            # Collect noun chunks as key phrases (assumed approach; requires
            # a downloaded model such as en_core_web_sm)
            nlp = spacy.load("en_core_web_sm")
            key_phrases = []
            for sentence in sentences:
                key_phrases.extend(
                    chunk.text for chunk in nlp(sentence).noun_chunks)
            return key_phrases
        except ImportError:
            # Fallback to simple approach if spaCy is not available
            logging.warning(
                "spaCy not available, using simple key phrase extraction")

            # Tokenize
            words = []
            for sentence in sentences:
                words.extend(sentence.lower().split())

            # Count adjacent-word bigrams and keep the most frequent ones
            # (top 10 is an assumed cutoff)
            bigrams = list(zip(words, words[1:]))
            bigram_counts = Counter(bigrams)
            return [' '.join(pair) for pair, _ in bigram_counts.most_common(10)]
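

# Minimal usage sketch (an assumed entry point, not part of the original
# module): wires print handlers into the progress/status callbacks and runs
# the processor over a file path taken from the command line.
if __name__ == "__main__":
    import sys

    processor = TextProcessor()
    processor.on_progress = lambda p: print(f"progress: {p}%")
    processor.on_status = lambda s: print(f"status: {s}")

    result = processor.process_file(sys.argv[1])
    print(f"{len(result['sentences'])} sentences, "
          f"{len(result['key_phrases'])} key phrases")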