Distributed Fine-Tuning With The Transformers API by HuggingFace
NUM_WORKERS = 4

def get_gpus_per_worker(_):
    import torch
    return torch.cuda.device_count()

NUM_GPUS_PER_WORKER = sc.parallelize(range(4), 4).map(get_gpus_per_worker).collect()[0]
USE_GPU = NUM_GPUS_PER_WORKER > 0
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
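The Trainer defined later expects a model object whose definition is not part of this excerpt. A minimal sketch, assuming a DistilBERT sequence-classification head with two labels (positive/negative IMDB sentiment):

from transformers import AutoModelForSequenceClassification

# Binary sentiment classifier; num_labels=2 matches the IMDB labels used below.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)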
import pandas as pd
from datasets import load_dataset

imdb = load_dataset("imdb")
train = pd.DataFrame(imdb["train"])
test = pd.DataFrame(imdb["test"])

texts = train["text"].tolist()
labels = train["label"].tolist()
import torch

class ImdbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Required by the Trainer: return one tokenized example with its label.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
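The Trainer also references tokenized_train and tokenized_test, whose construction is not shown in this excerpt. A minimal sketch of how they could be built from the tokenizer and the ImdbDataset wrapper above:

# Tokenize the raw reviews and wrap them in the torch Dataset defined above.
train_encodings = tokenizer(texts, truncation=True)
test_encodings = tokenizer(test["text"].tolist(), truncation=True)
tokenized_train = ImdbDataset(train_encodings, labels)
tokenized_test = ImdbDataset(test_encodings, test["label"].tolist())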
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
    load_accuracy = load_metric("accuracy")
    load_f1 = load_metric("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}
def train_model():
    from transformers import TrainingArguments, Trainer

    training_args = TrainingArguments(
        output_dir=output_dir,  # output_dir is defined earlier in the notebook (not shown in this excerpt)
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        save_strategy="epoch",
        report_to=[],  # REMOVE MLFLOW INTEGRATION FOR NOW
        push_to_hub=False,  # DO NOT PUSH TO MODEL HUB FOR NOW
        load_best_model_at_end=True,  # RECOMMENDED
        metric_for_best_model="eval_loss",  # RECOMMENDED
        evaluation_strategy="epoch",  # RECOMMENDED
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    return trainer.state.best_model_checkpoint
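The cells below call a test_model helper that is not defined in this excerpt. A hypothetical sketch of such a helper, which loads a saved checkpoint into a text-classification pipeline and prints predictions for a few held-out reviews:

from transformers import pipeline

def test_model(ckpt_path):
    # Load the fine-tuned checkpoint produced by train_model().
    classifier = pipeline(
        "text-classification",
        model=ckpt_path,
        tokenizer=tokenizer,
        device=0 if USE_GPU else -1,
    )
    # Spot-check predictions on a few test reviews.
    for review in test["text"].head(3):
        print(classifier(review, truncation=True))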
# Single-node run on the driver; train_model() returns the best checkpoint path.
single_node_ckpt_path = train_model()
test_model(single_node_ckpt_path)
from pyspark.ml.torch.distributor import TorchDistributor

NUM_PROCESSES = torch.cuda.device_count()
print(f"We're using {NUM_PROCESSES} GPUs")
single_node_multi_gpu_ckpt_path = TorchDistributor(
    num_processes=NUM_PROCESSES, local_mode=True, use_gpu=USE_GPU
).run(train_model)
test_model(single_node_multi_gpu_ckpt_path)
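The multi_node_ckpt_path used below comes from a multi-node run that is not shown in this excerpt. A sketch of what that launch would look like, assuming one process per GPU across all NUM_WORKERS workers and local_mode=False so training runs on the Spark workers rather than the driver:

multi_node_ckpt_path = TorchDistributor(
    num_processes=NUM_WORKERS * NUM_GPUS_PER_WORKER,
    local_mode=False,
    use_gpu=USE_GPU,
).run(train_model)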
test_model(multi_node_ckpt_path)