#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LEO Dataset Manager

Manages datasets for training and evaluating the intent classifier.
"""
import os
import json
import logging
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
class DatasetManager:
    """Manages datasets for training and evaluation."""

    def __init__(self, data_dir):
        """Initialize the manager.

        Args:
            data_dir (str): Directory for storing datasets
        """
        self.data_dir = data_dir
        # Index of dataset metadata keyed by dataset ID; populated by
        # load_available_datasets() from data_dir/datasets.json.
        self.datasets = {}
        # ID of the most recently loaded dataset, or None.
        self.current_dataset = None
def load_available_datasets(self):
"""Load available datasets."""
try:
datasets_path = os.path.join(self.data_dir, 'datasets.json')
if os.path.exists(datasets_path):
with open(datasets_path, 'r') as f:
self.datasets = json.load(f)
else:
self.datasets = {}
except Exception as e:
logging.error(f"Error loading datasets: {str(e)}")
self.datasets = {}
def save_datasets_index(self):
"""Save the datasets index."""
try:
datasets_path = os.path.join(self.data_dir, 'datasets.json')
with open(datasets_path, 'w') as f:
json.dump(self.datasets, f, indent=2)
except Exception as e:
logging.error(f"Error saving datasets index: {str(e)}")
Args:
name (str): Dataset name
description (str): Dataset description
Returns:
str: Dataset ID
"""
# Generate ID
dataset_id = f"dataset_{len(self.datasets) +
1}_{int(datetime.now().timestamp())}"
return dataset_id
Args:
dataset_id (str): Dataset ID
Returns:
dict: Dataset
"""
if dataset_id not in self.datasets:
raise ValueError(f"Dataset {dataset_id} not found")
try:
with open(self.datasets[dataset_id]['file_path'], 'r') as f:
dataset = json.load(f)
# Set as current dataset
self.current_dataset = dataset_id
return dataset
except Exception as e:
logging.error(f"Error loading dataset {dataset_id}: {str(e)}")
raise
Args:
dataset_id (str): Dataset ID
examples (list): List of examples
Returns:
int: Number of examples added
"""
if dataset_id not in self.datasets:
raise ValueError(f"Dataset {dataset_id} not found")
try:
# Load dataset
dataset = self.load_dataset(dataset_id)
# Add examples
dataset['examples'].extend(examples)
# Update metadata
dataset['metadata']['num_examples'] = len(dataset['examples'])
dataset['metadata']['num_intents'] = len(set(ex['intent'] for ex in
dataset['examples']))
dataset['metadata']['updated_at'] = datetime.now().isoformat()
# Save dataset
with open(self.datasets[dataset_id]['file_path'], 'w') as f:
json.dump(dataset, f, indent=2)
return len(examples)
except Exception as e:
logging.error(f"Error adding examples to dataset {dataset_id}:
{str(e)}")
raise
def split_dataset(self, dataset_id, test_size=0.2, random_state=42):
    """Split a dataset into stratified train/test sets.

    Args:
        dataset_id (str): Dataset ID
        test_size (float): Test set size
        random_state (int): Random state
        # NOTE(review): defaults 0.2 / 42 assumed — the original def line
        # was lost; confirm against callers.

    Returns:
        tuple: (X_train, X_test, y_train, y_test)

    Raises:
        ValueError: If the dataset ID is not in the index.
    """
    if dataset_id not in self.datasets:
        raise ValueError(f"Dataset {dataset_id} not found")
    try:
        # Load dataset
        dataset = self.load_dataset(dataset_id)
        # Extract examples
        X = [ex['text'] for ex in dataset['examples']]
        y = [ex['intent'] for ex in dataset['examples']]
        # Stratify on intent so both splits keep the class distribution
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )
        return X_train, X_test, y_train, y_test
    except Exception as e:
        logging.error(f"Error splitting dataset {dataset_id}: {str(e)}")
        raise
Args:
dataset_id (str): Dataset ID
Returns:
bool: True if successful, False otherwise
"""
if dataset_id not in self.datasets:
raise ValueError(f"Dataset {dataset_id} not found")
try:
# Delete dataset file
if os.path.exists(self.datasets[dataset_id]['file_path']):
os.remove(self.datasets[dataset_id]['file_path'])
return True
except Exception as e:
logging.error(f"Error deleting dataset {dataset_id}: {str(e)}")
return False
def import_from_command_history(self, confidence_threshold=0.7):
"""
Import examples from command history.
Args:
confidence_threshold (float): Minimum confidence for examples
Returns:
tuple: (dataset_id, num_examples)
"""
try:
# Load command history
history_path = os.path.join(self.data_dir, 'command_history.json')
if not os.path.exists(history_path):
return None, 0
# Filter examples
examples = []
for entry in history:
if 'command' in entry and 'intent' in entry and
entry.get('confidence', 0) >= confidence_threshold:
examples.append({
'text': entry['command'],
'intent': entry['intent'],
'confidence': entry['confidence'],
'timestamp': entry.get('timestamp',
datetime.now().isoformat())
})
if not examples:
return None, 0
# Create dataset
dataset_id = self.create_dataset(
f"Command History ({datetime.now().strftime('%Y-%m-%d')})",
f"Imported from command history with confidence >=
{confidence_threshold}"
)
# Add examples
self.add_examples(dataset_id, examples)
Args:
intents (dict): Intents dictionary
Returns:
tuple: (dataset_id, num_examples)
"""
try:
# Create examples
examples = []
for intent_name, intent_data in intents.items():
for pattern in intent_data.get('patterns', []):
examples.append({
'text': pattern,
'intent': intent_name,
'confidence': 1.0,
'timestamp': datetime.now().isoformat()
})
if not examples:
return None, 0
# Create dataset
dataset_id = self.create_dataset(
f"Intents ({datetime.now().strftime('%Y-%m-%d')})",
f"Imported from intents"
)
# Add examples
self.add_examples(dataset_id, examples)