LSTM From Scratch in Python
May 6, 2024
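The class below relies on a small WeightInitializer helper that is referenced but not shown in this excerpt. A minimal sketch of such a helper, assuming only the 'xavier' scheme named in the default argument (the original implementation may differ), could look like this:

import numpy as np

class WeightInitializer:
    """Create a weight matrix of a given (rows, cols) shape."""
    def __init__(self, method='xavier'):
        self.method = method

    def initialize(self, shape):
        if self.method == 'xavier':
            # Xavier/Glorot uniform: limit = sqrt(6 / (fan_in + fan_out))
            limit = np.sqrt(6.0 / (shape[0] + shape[1]))
            return np.random.uniform(-limit, limit, size=shape)
        # Fallback: small random normal values
        return np.random.randn(*shape) * 0.01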
# Imports used throughout (the original import cell is not part of this excerpt)
import numpy as np
import pandas as pd


class LSTM:
    """
    A basic LSTM network implemented from scratch with NumPy.
    Parameters:
    - input_size: int, dimensionality of input space
    - hidden_size: int, number of LSTM units
    - output_size: int, dimensionality of output space
    - init_method: str, weight initialization method (default: 'xavier')
    """
    def __init__(self, input_size, hidden_size, output_size, init_method='xavier'):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.weight_initializer = WeightInitializer(method=init_method)
        # Initialize gate weights; each acts on the stacked [h_prev; x_t] vector
        self.wf = self.weight_initializer.initialize((hidden_size, hidden_size + input_size))
        self.wi = self.weight_initializer.initialize((hidden_size, hidden_size + input_size))
        self.wo = self.weight_initializer.initialize((hidden_size, hidden_size + input_size))
        self.wc = self.weight_initializer.initialize((hidden_size, hidden_size + input_size))
        # Initialize biases
        self.bf = np.zeros((hidden_size, 1))
        self.bi = np.zeros((hidden_size, 1))
        self.bo = np.zeros((hidden_size, 1))
        self.bc = np.zeros((hidden_size, 1))
        # Output layer parameters (used by forward/backward but missing from the
        # extracted listing)
        self.why = self.weight_initializer.initialize((output_size, hidden_size))
        self.by = np.zeros((output_size, 1))
    @staticmethod
    def sigmoid(z):
        """
        Sigmoid activation function.
        Parameters:
        - z: np.ndarray, input to the activation function
        Returns:
        - np.ndarray, output of the activation function
        """
        return 1 / (1 + np.exp(-z))

    @staticmethod
    def dsigmoid(y):
        """
        Derivative of the sigmoid activation function.
        Parameters:
        - y: np.ndarray, output of the sigmoid activation function
        Returns:
        - np.ndarray, derivative of the sigmoid function
        """
        return y * (1 - y)

    @staticmethod
    def dtanh(y):
        """
        Derivative of the hyperbolic tangent activation function.
        Parameters:
        - y: np.ndarray, output of the hyperbolic tangent activation function
        Returns:
        - np.ndarray, derivative of the hyperbolic tangent function
        """
        return 1 - y * y
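Note that dsigmoid and dtanh take the activation's output rather than its input. Spelling out the identities they rely on makes this explicit:

\[
\sigma'(z) = \sigma(z)\,\bigl(1 - \sigma(z)\bigr) = y\,(1 - y),
\qquad
\tanh'(z) = 1 - \tanh^2(z) = 1 - y^2,
\]

where \(y\) is the value the forward activation already produced, so no extra exponentials are needed during backpropagation.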
    def forward(self, x):
        """
        Forward pass through the network over a full input sequence.
        Parameters:
        - x: np.ndarray, input to the network
        Returns:
        - np.ndarray, output of the network
        - list, caches containing intermediate values for backpropagation
        """
        caches = []
        h_prev = np.zeros((self.hidden_size, 1))
        c_prev = np.zeros((self.hidden_size, 1))
        h = h_prev
        c = c_prev
        for t in range(x.shape[0]):
            x_t = x[t].reshape(-1, 1)
            combined = np.vstack((h_prev, x_t))
            # Gates: forget, input, output, and the candidate cell state
            f = self.sigmoid(np.dot(self.wf, combined) + self.bf)
            i = self.sigmoid(np.dot(self.wi, combined) + self.bi)
            o = self.sigmoid(np.dot(self.wo, combined) + self.bo)
            c_ = np.tanh(np.dot(self.wc, combined) + self.bc)
            # New cell state and hidden state
            c = f * c_prev + i * c_
            h = o * np.tanh(c)
            # Cache everything the backward pass needs for this time step
            caches.append((combined, f, i, o, c_, c, c_prev))
            h_prev, c_prev = h, c
        # The output is computed from the final hidden state only
        y = np.dot(self.why, h) + self.by
        return y, caches
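For reference, the loop above implements the standard LSTM recurrences, with \([h_{t-1}; x_t]\) denoting the stacked vector built by np.vstack:

\[
\begin{aligned}
f_t &= \sigma(W_f\,[h_{t-1}; x_t] + b_f), &
i_t &= \sigma(W_i\,[h_{t-1}; x_t] + b_i), \\
o_t &= \sigma(W_o\,[h_{t-1}; x_t] + b_o), &
\tilde{c}_t &= \tanh(W_c\,[h_{t-1}; x_t] + b_c), \\
c_t &= f_t \odot c_{t-1} + i_t \odot \tilde{c}_t, &
h_t &= o_t \odot \tanh(c_t),
\end{aligned}
\]

and the prediction is read off the last hidden state, \(y = W_{hy}\,h_T + b_y\).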
    def backward(self, dy, caches, clip_value=1.0):
        """
        Backward pass through the network (backpropagation through time).
        Parameters:
        - dy: np.ndarray, gradient of the loss with respect to the output
        - caches: list, caches from the forward pass
        - clip_value: float, value to clip gradients to (default: 1.0)
        Returns:
        - tuple, gradients of the loss with respect to the parameters
        """
        dWf, dWi, dWo, dWc = [np.zeros_like(w) for w in (self.wf, self.wi, self.wo, self.wc)]
        dbf, dbi, dbo, dbc = [np.zeros_like(b) for b in (self.bf, self.bi, self.bo, self.bc)]
        # Output layer: y = why @ h_T + by, where h_T is the last hidden state
        _, _, _, o_T, _, c_T, _ = caches[-1]
        h_T = o_T * np.tanh(c_T)
        dWhy = np.dot(dy, h_T.T)
        dby = dy.copy()
        dh_next = np.dot(self.why.T, dy)
        dc_next = np.zeros_like(dh_next)
        # Walk backwards through the cached time steps
        for combined, f, i, o, c_, c, c_prev in reversed(caches):
            dh = dh_next
            dc = dc_next + dh * o * self.dtanh(np.tanh(c))
            df = dc * c_prev * self.dsigmoid(f)
            di = dc * c_ * self.dsigmoid(i)
            do = dh * np.tanh(c) * self.dsigmoid(o)
            dc_ = dc * i * self.dtanh(c_)
            # Accumulate parameter gradients
            dWf += np.dot(df, combined.T); dbf += df
            dWi += np.dot(di, combined.T); dbi += di
            dWo += np.dot(do, combined.T); dbo += do
            dWc += np.dot(dc_, combined.T); dbc += dc_
            # Propagate to the previous time step
            dcombined = (np.dot(self.wf.T, df) + np.dot(self.wi.T, di)
                         + np.dot(self.wo.T, do) + np.dot(self.wc.T, dc_))
            dh_next = dcombined[:self.hidden_size, :]
            dc_next = f * dc
        gradients = (dWf, dWi, dWo, dWc, dbf, dbi, dbo, dbc, dWhy, dby)
        # Gradient clipping (in place)
        for grad in gradients:
            np.clip(grad, -clip_value, clip_value, out=grad)
        return gradients
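The per-step gradients used in the loop follow from differentiating those recurrences, with \(\odot\) denoting element-wise multiplication. Here \(dh_T = W_{hy}^\top\,dy\), and for earlier steps \(dh_t\) is the hidden part of the gradient flowing back through the stacked \([h_{t-1}; x_t]\) vector:

\[
\begin{aligned}
dc_t &= dc_{t+1} \odot f_{t+1} + dh_t \odot o_t \odot \bigl(1 - \tanh^2 c_t\bigr), \\
do_t &= dh_t \odot \tanh(c_t) \odot o_t(1 - o_t), \qquad
df_t = dc_t \odot c_{t-1} \odot f_t(1 - f_t), \\
di_t &= dc_t \odot \tilde{c}_t \odot i_t(1 - i_t), \qquad
d\tilde{c}_t = dc_t \odot i_t \odot \bigl(1 - \tilde{c}_t^{\,2}\bigr), \\
dW_\ast &\mathrel{+}= d\ast_t\,[h_{t-1}; x_t]^\top, \qquad
db_\ast \mathrel{+}= d\ast_t .
\end{aligned}
\]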
    def update_params(self, grads, learning_rate):
        """
        Update the network parameters with simple gradient descent.
        Parameters:
        - grads: tuple, gradients of the loss with respect to the parameters
        - learning_rate: float, learning rate
        """
        dWf, dWi, dWo, dWc, dbf, dbi, dbo, dbc, dWhy, dby = grads
        self.wf -= learning_rate * dWf
        self.wi -= learning_rate * dWi
        self.wo -= learning_rate * dWo
        self.wc -= learning_rate * dWc
        self.bf -= learning_rate * dbf
        self.bi -= learning_rate * dbi
        self.bo -= learning_rate * dbo
        self.bc -= learning_rate * dbc
        self.why -= learning_rate * dWhy
        self.by -= learning_rate * dby
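With the class in place, a single forward/backward/update step can be exercised on random data. This smoke test is only a sketch: it assumes a squared-error loss (so the output gradient is simply y_pred - y_true) and the WeightInitializer sketched near the top of the post.

np.random.seed(0)
model = LSTM(input_size=1, hidden_size=16, output_size=1)

x = np.random.randn(5, 1)            # a sequence of 5 time steps with 1 feature each
y_true = np.array([[0.5]])           # target for the final output

y_pred, caches = model.forward(x)
dy = y_pred - y_true                 # gradient of 0.5 * (y_pred - y_true)^2
grads = model.backward(dy, caches, clip_value=1.0)
model.update_params(grads, learning_rate=0.01)
print(y_pred.shape)                  # -> (1, 1)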
class LSTMTrainer:  # class name assumed; the definition line is missing from this excerpt
    """
    Trainer for the LSTM network.
    Parameters:
    - model: LSTM, the LSTM network to train
    - learning_rate: float, learning rate for the optimizer
    - patience: int, number of epochs to wait before early stopping
    - verbose: bool, whether to print training information
    - delta: float, minimum change in validation loss to qualify as an improvement
    """
    def __init__(self, model, learning_rate=0.01, patience=7, verbose=True, delta=0):
        self.model = model
        self.learning_rate = learning_rate
        self.train_losses = []
        self.val_losses = []
        self.early_stopping = EarlyStopping(patience, verbose, delta)
"""
Train the LSTM network.
Parameters:
- X_train: np.ndarray, training data
- y_train: np.ndarray, training labels
- X_val: np.ndarray, validation data
- y_val: np.ndarray, validation labels
- epochs: int, number of training epochs
- batch_size: int, size of mini-batches
- clip_value: float, value to clip gradients to
"""
for epoch in range(epochs):
epoch_losses = []
6
for i in range(0, len(X_train), batch_size):
batch_X = X_train[i:i + batch_size]
batch_y = y_train[i:i + batch_size]
losses = []
self.model.update_params(grads, self.learning_rate)
batch_loss = np.mean(losses)
epoch_losses.append(batch_loss)
avg_epoch_loss = np.mean(epoch_losses)
self.train_losses.append(avg_epoch_loss)
if epoch % 10 == 0:
print(f'Epoch {epoch + 1}/{epochs} - Loss: {avg_epoch_loss:.
↪5f}, Val Loss: {val_loss:.5f}')
"""
Validate the model on a separate set of data.
"""
val_losses = []
for x, y_true in zip(X_val, y_val):
y_pred, _ = self.model.forward(x)
loss = self.compute_loss(y_pred, y_true.reshape(-1, 1))
val_losses.append(loss)
return np.mean(val_losses)
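The trainer delegates the stopping decision to an EarlyStopping helper that is constructed in __init__ but not defined in this excerpt. A minimal sketch with the patience/verbose/delta interface used above (the call signature is an assumption) could be:

class EarlyStopping:
    """Flag when the validation loss has stopped improving for `patience` epochs."""
    def __init__(self, patience=7, verbose=True, delta=0):
        self.patience = patience
        self.verbose = verbose        # could gate diagnostic printing
        self.delta = delta
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        # An improvement must beat the best loss seen so far by at least `delta`
        if self.best_loss is None or val_loss < self.best_loss - self.delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True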
class StockDataLoader:  # class name assumed; the definition line is missing from this excerpt
    """
    Load and prepare stock price data for the LSTM.
    Parameters:
    - start_date: str, start date for data retrieval
    - end_date: str, end date for data retrieval
    - look_back: int, number of previous time steps to include in each sample
    - train_size: float, proportion of data to use for training
    """
    def __init__(self, start_date, end_date, look_back=1, train_size=0.67):
        self.start_date = start_date
        self.end_date = end_date
        self.look_back = look_back
        self.train_size = train_size
    def load_data(self):
        """
        Load the stock data from a local CSV file.
        Returns:
        - np.ndarray, training data
        - np.ndarray, testing data
        """
        df = pd.read_csv('data/google.csv')
        df = df[(df['Date'] >= self.start_date) & (df['Date'] <= self.end_date)]
        df = df.sort_values('Date')
        df = df[['Close']].astype(float)      # use the closing price only
        data = self.MinMaxScaler(df.values)   # scale to [0, 1] as a numpy array
        train_size = int(len(data) * self.train_size)
        train, test = data[0:train_size, :], data[train_size:len(data), :]
        return train, test
    def MinMaxScaler(self, data):
        """
        Min-max scaling of the data.
        Parameters:
        - data: np.ndarray, input data
        Returns:
        - np.ndarray, data scaled column-wise to the [0, 1] range
        """
        numerator = data - np.min(data, 0)
        denominator = np.max(data, 0) - np.min(data, 0)
        return numerator / (denominator + 1e-7)
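In formula form, each column is mapped to the unit interval:

\[
x' = \frac{x - \min(x)}{\max(x) - \min(x) + 10^{-7}},
\]

with the minimum and maximum taken per column (axis 0); the small constant avoids division by zero when a column is constant.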
    def create_dataset(self, dataset):
        """
        Convert the series into look-back windows and next-step targets.
        Parameters:
        - dataset: np.ndarray, input data
        Returns:
        - np.ndarray, input data
        - np.ndarray, output data
        """
        dataX, dataY = [], []
        for i in range(len(dataset) - self.look_back):
            a = dataset[i:(i + self.look_back), 0]
            dataX.append(a)
            dataY.append(dataset[i + self.look_back, 0])
        return np.array(dataX), np.array(dataY)
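A tiny illustration of the sliding window, using the class name assumed above and placeholder dates that do not matter for this call:

loader = StockDataLoader('2010-01-01', '2020-12-31', look_back=2)
X, Y = loader.create_dataset(np.array([[0.1], [0.2], [0.3], [0.4], [0.5]]))
# X -> [[0.1, 0.2], [0.2, 0.3], [0.3, 0.4]]
# Y -> [0.3, 0.4, 0.5]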
    def get_train_test(self):
        """
        Get the training and testing data.
        Returns:
        - np.ndarray, training input
        - np.ndarray, training output
        - np.ndarray, testing input
        - np.ndarray, testing output
        """
        train, test = self.load_data()
        trainX, trainY = self.create_dataset(train)
        testX, testY = self.create_dataset(test)
        return trainX, trainY, testX, testY
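The code that actually builds the dataset is not part of this excerpt. A sketch, using the class name assumed above and placeholder dates, would be:

loader = StockDataLoader(start_date='2010-01-01', end_date='2020-12-31', look_back=1)
trainX, trainY, testX, testY = loader.get_train_test()
print(trainX.shape, testX.shape)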
# Plot the data: combine the scaled train and test targets into one series
import matplotlib.pyplot as plt
combined = np.concatenate((trainY, testY))
plt.plot(combined)
plt.show()
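The model and trainer construction is likewise not shown in this excerpt. A sketch that would produce a log like the one below, with the test split doubling as validation data, might be (only epochs=1000 is visible in the output; the other hyperparameters are placeholders):

model = LSTM(input_size=1, hidden_size=64, output_size=1)
trainer = LSTMTrainer(model, learning_rate=0.01)
trainer.train(trainX, trainY, testX, testY, epochs=1000, batch_size=32, clip_value=1.0)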
Epoch 1/1000 - Loss: 0.24629, Val Loss: 0.53735
Epoch 11/1000 - Loss: 0.07889, Val Loss: 0.11416
Epoch 21/1000 - Loss: 0.06242, Val Loss: 0.05693
Epoch 31/1000 - Loss: 0.05286, Val Loss: 0.03841
Epoch 41/1000 - Loss: 0.04575, Val Loss: 0.02845
Epoch 51/1000 - Loss: 0.04046, Val Loss: 0.02222
Epoch 61/1000 - Loss: 0.03655, Val Loss: 0.01831
Epoch 71/1000 - Loss: 0.03364, Val Loss: 0.01598
Epoch 81/1000 - Loss: 0.03149, Val Loss: 0.01472
Epoch 91/1000 - Loss: 0.02989, Val Loss: 0.01420
Epoch 101/1000 - Loss: 0.02870, Val Loss: 0.01417
Epoch 111/1000 - Loss: 0.02782, Val Loss: 0.01444
Epoch 121/1000 - Loss: 0.02717, Val Loss: 0.01490
Epoch 131/1000 - Loss: 0.02668, Val Loss: 0.01546
Early stopping