#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
LEO Dataset Manager

Manages datasets for training and evaluating the intent classifier.
"""
import os
import json
import logging
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
class DatasetManager:
    """Manages datasets for training and evaluation."""

    def __init__(self, data_dir):
        """Initialize the manager.

        Args:
            data_dir (str): Directory for storing datasets
        """
        self.data_dir = data_dir
        # Index of dataset metadata keyed by dataset ID; populated by
        # load_available_datasets() from data_dir/datasets.json.
        self.datasets = {}
        # ID of the most recently loaded dataset, or None.
        self.current_dataset = None
def load_available_datasets(self):
"""Load available datasets."""
try:
datasets_path = os.path.join(self.data_dir, 'datasets.json')
if os.path.exists(datasets_path):
with open(datasets_path, 'r') as f:
self.datasets = json.load(f)
else:
self.datasets = {}
except Exception as e:
logging.error(f"Error loading datasets: {str(e)}")
self.datasets = {}
def save_datasets_index(self):
"""Save the datasets index."""
try:
datasets_path = os.path.join(self.data_dir, 'datasets.json')
with open(datasets_path, 'w') as f:
json.dump(self.datasets, f, indent=2)
except Exception as e:
logging.error(f"Error saving datasets index: {str(e)}")
Args:
name (str): Dataset name
description (str): Dataset description
Returns:
str: Dataset ID
"""
# Generate ID
dataset_id = f"dataset_{len(self.datasets) +
1}_{int(datetime.now().timestamp())}"
return dataset_id
Args:
dataset_id (str): Dataset ID
Returns:
dict: Dataset
"""
if dataset_id not in self.datasets:
raise ValueError(f"Dataset {dataset_id} not found")
try:
with open(self.datasets[dataset_id]['file_path'], 'r') as f:
dataset = json.load(f)
# Set as current dataset
self.current_dataset = dataset_id
return dataset
except Exception as e:
logging.error(f"Error loading dataset {dataset_id}: {str(e)}")
raise
Args:
dataset_id (str): Dataset ID
examples (list): List of examples
Returns:
int: Number of examples added
"""
if dataset_id not in self.datasets:
raise ValueError(f"Dataset {dataset_id} not found")
try:
# Load dataset
dataset = self.load_dataset(dataset_id)
# Add examples
dataset['examples'].extend(examples)
# Update metadata
dataset['metadata']['num_examples'] = len(dataset['examples'])
dataset['metadata']['num_intents'] = len(set(ex['intent'] for ex in
dataset['examples']))
dataset['metadata']['updated_at'] = datetime.now().isoformat()
# Save dataset
with open(self.datasets[dataset_id]['file_path'], 'w') as f:
json.dump(dataset, f, indent=2)
return len(examples)
except Exception as e:
logging.error(f"Error adding examples to dataset {dataset_id}:
{str(e)}")
raise
def split_dataset(self, dataset_id, test_size=0.2, random_state=42):
    """Split a dataset into stratified train/test sets.

    Args:
        dataset_id (str): Dataset ID
        test_size (float): Test set size
        random_state (int): Random state
        # NOTE(review): defaults 0.2 / 42 assumed — the original def line
        # was lost; confirm against callers.

    Returns:
        tuple: (X_train, X_test, y_train, y_test)

    Raises:
        ValueError: If the dataset ID is not in the index.
    """
    if dataset_id not in self.datasets:
        raise ValueError(f"Dataset {dataset_id} not found")
    try:
        # Load dataset
        dataset = self.load_dataset(dataset_id)
        # Extract examples
        X = [ex['text'] for ex in dataset['examples']]
        y = [ex['intent'] for ex in dataset['examples']]
        # Stratify on intent so both splits keep the class distribution
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )
        return X_train, X_test, y_train, y_test
    except Exception as e:
        logging.error(f"Error splitting dataset {dataset_id}: {str(e)}")
        raise
Args:
dataset_id (str): Dataset ID
Returns:
bool: True if successful, False otherwise
"""
if dataset_id not in self.datasets:
raise ValueError(f"Dataset {dataset_id} not found")
try:
# Delete dataset file
if os.path.exists(self.datasets[dataset_id]['file_path']):
os.remove(self.datasets[dataset_id]['file_path'])
return True
except Exception as e:
logging.error(f"Error deleting dataset {dataset_id}: {str(e)}")
return False
def import_from_command_history(self, confidence_threshold=0.7):
"""
Import examples from command history.
Args:
confidence_threshold (float): Minimum confidence for examples
Returns:
tuple: (dataset_id, num_examples)
"""
try:
# Load command history
history_path = os.path.join(self.data_dir, 'command_history.json')
if not os.path.exists(history_path):
return None, 0
# Filter examples
examples = []
for entry in history:
if 'command' in entry and 'intent' in entry and
entry.get('confidence', 0) >= confidence_threshold:
examples.append({
'text': entry['command'],
'intent': entry['intent'],
'confidence': entry['confidence'],
'timestamp': entry.get('timestamp',
datetime.now().isoformat())
})
if not examples:
return None, 0
# Create dataset
dataset_id = self.create_dataset(
f"Command History ({datetime.now().strftime('%Y-%m-%d')})",
f"Imported from command history with confidence >=
{confidence_threshold}"
)
# Add examples
self.add_examples(dataset_id, examples)
Args:
intents (dict): Intents dictionary
Returns:
tuple: (dataset_id, num_examples)
"""
try:
# Create examples
examples = []
for intent_name, intent_data in intents.items():
for pattern in intent_data.get('patterns', []):
examples.append({
'text': pattern,
'intent': intent_name,
'confidence': 1.0,
'timestamp': datetime.now().isoformat()
})
if not examples:
return None, 0
# Create dataset
dataset_id = self.create_dataset(
f"Intents ({datetime.now().strftime('%Y-%m-%d')})",
f"Imported from intents"
)
# Add examples
self.add_examples(dataset_id, examples)