Dataset Manager

The LEO Dataset Manager module manages the datasets used for training and evaluation, letting users create, load, delete, and add examples to datasets. It can import examples from command history and from intent definitions, and it splits data into training and test sets. Dataset indices are persisted as JSON, and errors are handled through logging.
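
Below is a minimal usage sketch (the module file name dataset_manager.py and the example data are illustrative assumptions, not part of the source); the full module source follows it.

from dataset_manager import DatasetManager

# Create a manager backed by the local 'data' directory
manager = DatasetManager(data_dir='data')

# Create a dataset and add a few labelled examples
dataset_id = manager.create_dataset('greetings', description='Toy intent data')
manager.add_examples(dataset_id, [
    {'text': 'hello there', 'intent': 'greet'},
    {'text': 'hi', 'intent': 'greet'},
    {'text': 'goodbye', 'intent': 'farewell'},
    {'text': 'see you later', 'intent': 'farewell'},
])

# Stratified train/test split; test_size=0.5 so each intent lands in both splits
X_train, X_test, y_train, y_test = manager.get_training_data(dataset_id, test_size=0.5)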


#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
LEO Dataset Manager

This module manages datasets for training and evaluation.


"""

import os
import json
import logging
from datetime import datetime

from sklearn.model_selection import train_test_split

class DatasetManager:
    """Manages datasets for training and evaluation."""

    def __init__(self, data_dir='data'):
        """
        Initialize the dataset manager.

        Args:
            data_dir (str): Directory for storing datasets
        """
        self.data_dir = data_dir
        self.datasets = {}
        self.current_dataset = None

        # Create data directory if it doesn't exist
        os.makedirs(data_dir, exist_ok=True)

        # Load available datasets
        self.load_available_datasets()

    def load_available_datasets(self):
        """Load available datasets."""
        try:
            datasets_path = os.path.join(self.data_dir, 'datasets.json')
            if os.path.exists(datasets_path):
                with open(datasets_path, 'r') as f:
                    self.datasets = json.load(f)
            else:
                self.datasets = {}
        except Exception as e:
            logging.error(f"Error loading datasets: {str(e)}")
            self.datasets = {}

    def save_datasets_index(self):
        """Save the datasets index."""
        try:
            datasets_path = os.path.join(self.data_dir, 'datasets.json')
            with open(datasets_path, 'w') as f:
                json.dump(self.datasets, f, indent=2)
        except Exception as e:
            logging.error(f"Error saving datasets index: {str(e)}")

    def create_dataset(self, name, description=''):
        """
        Create a new dataset.

        Args:
            name (str): Dataset name
            description (str): Dataset description

        Returns:
            str: Dataset ID
        """
        # Generate ID
        dataset_id = f"dataset_{len(self.datasets) + 1}_{int(datetime.now().timestamp())}"

        # Create dataset metadata
        self.datasets[dataset_id] = {
            'name': name,
            'description': description,
            'created_at': datetime.now().isoformat(),
            'updated_at': datetime.now().isoformat(),
            'num_examples': 0,
            'num_intents': 0,
            'file_path': os.path.join(self.data_dir, f"{dataset_id}.json")
        }

        # Create empty dataset file
        dataset = {
            'metadata': self.datasets[dataset_id],
            'examples': []
        }

        with open(self.datasets[dataset_id]['file_path'], 'w') as f:
            json.dump(dataset, f, indent=2)

        # Save datasets index
        self.save_datasets_index()

        # Set as current dataset
        self.current_dataset = dataset_id

        return dataset_id

    def load_dataset(self, dataset_id):
        """
        Load a dataset.

        Args:
            dataset_id (str): Dataset ID

        Returns:
            dict: Dataset
        """
        if dataset_id not in self.datasets:
            raise ValueError(f"Dataset {dataset_id} not found")

        try:
            with open(self.datasets[dataset_id]['file_path'], 'r') as f:
                dataset = json.load(f)

            # Set as current dataset
            self.current_dataset = dataset_id

            return dataset
        except Exception as e:
            logging.error(f"Error loading dataset {dataset_id}: {str(e)}")
            raise

    def add_examples(self, dataset_id, examples):
        """
        Add examples to a dataset.

        Args:
            dataset_id (str): Dataset ID
            examples (list): List of examples

        Returns:
            int: Number of examples added
        """
        if dataset_id not in self.datasets:
            raise ValueError(f"Dataset {dataset_id} not found")

        try:
            # Load dataset
            dataset = self.load_dataset(dataset_id)

            # Add examples
            dataset['examples'].extend(examples)

            # Update metadata
            dataset['metadata']['num_examples'] = len(dataset['examples'])
            dataset['metadata']['num_intents'] = len(
                set(ex['intent'] for ex in dataset['examples'])
            )
            dataset['metadata']['updated_at'] = datetime.now().isoformat()

            # Update datasets index
            self.datasets[dataset_id] = dataset['metadata']

            # Save dataset
            with open(self.datasets[dataset_id]['file_path'], 'w') as f:
                json.dump(dataset, f, indent=2)

            # Save datasets index
            self.save_datasets_index()

            return len(examples)
        except Exception as e:
            logging.error(f"Error adding examples to dataset {dataset_id}: {str(e)}")
            raise

    def get_training_data(self, dataset_id, test_size=0.2, random_state=42):
        """
        Get training and test data from a dataset.

        Args:
            dataset_id (str): Dataset ID
            test_size (float): Test set size
            random_state (int): Random state

        Returns:
            tuple: (X_train, X_test, y_train, y_test)
        """
        if dataset_id not in self.datasets:
            raise ValueError(f"Dataset {dataset_id} not found")

        try:
            # Load dataset
            dataset = self.load_dataset(dataset_id)

            # Extract examples
            X = [ex['text'] for ex in dataset['examples']]
            y = [ex['intent'] for ex in dataset['examples']]

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=random_state, stratify=y
            )

            return X_train, X_test, y_train, y_test
        except Exception as e:
            logging.error(f"Error getting training data from dataset {dataset_id}: {str(e)}")
            raise
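    # Note: because train_test_split is called with stratify=y, every intent
    # must appear at least twice in the dataset and the test partition must
    # be large enough to hold at least one example per intent; otherwise
    # sklearn raises a ValueError.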

    def delete_dataset(self, dataset_id):
        """
        Delete a dataset.

        Args:
            dataset_id (str): Dataset ID

        Returns:
            bool: True if successful, False otherwise
        """
        if dataset_id not in self.datasets:
            raise ValueError(f"Dataset {dataset_id} not found")

        try:
            # Delete dataset file
            if os.path.exists(self.datasets[dataset_id]['file_path']):
                os.remove(self.datasets[dataset_id]['file_path'])

            # Remove from datasets index
            del self.datasets[dataset_id]

            # Save datasets index
            self.save_datasets_index()

            # Reset current dataset if it was deleted
            if self.current_dataset == dataset_id:
                self.current_dataset = None

            return True
        except Exception as e:
            logging.error(f"Error deleting dataset {dataset_id}: {str(e)}")
            return False

    def import_from_command_history(self, confidence_threshold=0.7):
        """
        Import examples from command history.

        Args:
            confidence_threshold (float): Minimum confidence for examples

        Returns:
            tuple: (dataset_id, num_examples)
        """
        try:
            # Load command history
            history_path = os.path.join(self.data_dir, 'command_history.json')
            if not os.path.exists(history_path):
                return None, 0

            with open(history_path, 'r') as f:
                history = json.load(f)

            # Filter examples
            examples = []
            for entry in history:
                if ('command' in entry and 'intent' in entry
                        and entry.get('confidence', 0) >= confidence_threshold):
                    examples.append({
                        'text': entry['command'],
                        'intent': entry['intent'],
                        'confidence': entry['confidence'],
                        'timestamp': entry.get('timestamp', datetime.now().isoformat())
                    })

            if not examples:
                return None, 0

            # Create dataset
            dataset_id = self.create_dataset(
                f"Command History ({datetime.now().strftime('%Y-%m-%d')})",
                f"Imported from command history with confidence >= {confidence_threshold}"
            )

            # Add examples
            self.add_examples(dataset_id, examples)

            return dataset_id, len(examples)
        except Exception as e:
            logging.error(f"Error importing from command history: {str(e)}")
            return None, 0
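    # The command history file is expected to be a JSON list of entries with
    # 'command', 'intent', 'confidence', and optionally 'timestamp' keys,
    # e.g. (illustrative values, not from the source):
    #   {"command": "open the browser", "intent": "open_app",
    #    "confidence": 0.92, "timestamp": "2024-01-01T12:00:00"}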

    def import_from_intents(self, intents):
        """
        Import examples from intents.

        Args:
            intents (dict): Intents dictionary

        Returns:
            tuple: (dataset_id, num_examples)
        """
        try:
            # Create examples
            examples = []
            for intent_name, intent_data in intents.items():
                for pattern in intent_data.get('patterns', []):
                    examples.append({
                        'text': pattern,
                        'intent': intent_name,
                        'confidence': 1.0,
                        'timestamp': datetime.now().isoformat()
                    })

            if not examples:
                return None, 0

            # Create dataset
            dataset_id = self.create_dataset(
                f"Intents ({datetime.now().strftime('%Y-%m-%d')})",
                "Imported from intents"
            )

            # Add examples
            self.add_examples(dataset_id, examples)

            return dataset_id, len(examples)
        except Exception as e:
            logging.error(f"Error importing from intents: {str(e)}")
            return None, 0
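
For reference, the datasets.json index written by save_datasets_index() maps each dataset ID to its metadata. A sketch of one entry is shown below; the ID, timestamps, and counts are illustrative, not taken from a real run:

{
  "dataset_1_1700000000": {
    "name": "Command History (2023-11-14)",
    "description": "Imported from command history with confidence >= 0.7",
    "created_at": "2023-11-14T22:13:20",
    "updated_at": "2023-11-14T22:13:25",
    "num_examples": 42,
    "num_intents": 7,
    "file_path": "data/dataset_1_1700000000.json"
  }
}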
