@yukiarimo
Created November 28, 2023 19:25
BART-large-CNN fine-tuning
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartForConditionalGeneration, BartTokenizer
from torch.optim import AdamW
import os
# Disable the MPS memory high-watermark limit so PyTorch can use all available memory on Apple Silicon
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
# Define your dataset class
class CustomDataset(Dataset):
    def __init__(self, csv_path, tokenizer):
        self.data = pd.read_csv(csv_path)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        task = self.data.iloc[idx]['task']
        summary = self.data.iloc[idx]['summary']
        # Tokenize the input and target sequences
        tokenized_inputs = self.tokenizer.encode_plus(task, padding='max_length', truncation=True, max_length=512)
        tokenized_summary = self.tokenizer.encode_plus(summary, padding='max_length', truncation=True, max_length=512)
        # Convert the tokenized sequences to tensors
        input_ids = torch.tensor(tokenized_inputs['input_ids'])
        attention_mask = torch.tensor(tokenized_inputs['attention_mask'])
        summary_ids = torch.tensor(tokenized_summary['input_ids'])
        # Replace padding token ids in the labels with -100 so they are ignored by the loss
        summary_ids[summary_ids == self.tokenizer.pad_token_id] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'summary_ids': summary_ids
        }
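# Illustrative only: the dataset above expects a CSV with two text columns, 'task' and 'summary'.
# A minimal sketch of a compatible file (the file name here is just a placeholder):
#   import pandas as pd
#   pd.DataFrame({'task': ['long input text ...'],
#                 'summary': ['short target summary ...']}).to_csv('example.csv', index=False)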
# Set the path to your CSV file
csv_path = 'yuna copy.csv'
# Set up the tokenizer
tokenizer = BartTokenizer.from_pretrained('./bart-large-cnn/')
# Create an instance of the custom dataset
dataset = CustomDataset(csv_path, tokenizer)
# Define hyperparameters and training configurations
batch_size = 1
num_epochs = 50
learning_rate = 1e-5
# Create a data loader for the dataset
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Load the pretrained BART model
model = BartForConditionalGeneration.from_pretrained('./bart-large-cnn/')
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
print(device)
model.to(device)
# Set the model to training mode
model.train()
# Define the optimizer (total_steps is available if a learning rate scheduler is added; see the sketch below)
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(dataloader) * num_epochs
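# Optional (a sketch, not in the original script): total_steps could drive a linear warmup/decay
# schedule; scheduler.step() would then be called right after optimizer.step() in the loop below.
#   from transformers import get_linear_schedule_with_warmup
#   scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)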
# Set the directory to save the models
save_dir = 'Yuna-trained'
# Generation and model hyperparameters for summarization
length_penalty = 1.0
max_length = 1024
min_length = 120
dropout = 0.01
# Set the generation defaults on the model config
# (note: changing config.dropout after loading does not rebuild the already-instantiated layers)
model.config.length_penalty = length_penalty
model.config.max_length = max_length
model.config.min_length = min_length
model.config.dropout = dropout
# Set the task-specific parameters for summarization
task_specific_params = {
    'summarization': {
        'early_stopping': True,
        'length_penalty': length_penalty,
        'max_length': max_length,
        'min_length': min_length,
        'no_repeat_ngram_size': 3,
        'num_beams': 4
    }
}
# Update the model configuration with task-specific parameters
if model.config.task_specific_params is None:
    model.config.task_specific_params = {}
for task, params in task_specific_params.items():
    if task in model.config.task_specific_params:
        model.config.task_specific_params[task].update(params)
    else:
        model.config.task_specific_params[task] = params
# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        # Zero the gradients
        optimizer.zero_grad()
        # Forward pass
        outputs = model(input_ids=batch['input_ids'].to(device),
                        attention_mask=batch['attention_mask'].to(device),
                        labels=batch['summary_ids'].to(device))
        # Compute the loss
        loss = outputs.loss
        # Backpropagation
        loss.backward()
        # Update the model parameters
        optimizer.step()
        total_loss += loss.item()
    # Print the average loss for the epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")
    # Save the model every 5th epoch
    if (epoch + 1) % 5 == 0:
        model.save_pretrained(os.path.join(save_dir, f'fine_tuned_model_epoch{epoch+1}'))
        tokenizer.save_pretrained(os.path.join(save_dir, f'fine_tuned_model_epoch{epoch+1}'))
# Save the final fine-tuned model
model.save_pretrained(os.path.join(save_dir, 'fine_tuned_model_final'))
tokenizer.save_pretrained(os.path.join(save_dir, 'fine_tuned_model_final'))
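# Optional sanity check (a minimal sketch, not part of the original training loop): reload the
# final checkpoint and summarize the first 'task' row from the CSV, using only paths, columns,
# and generation settings already defined above.
final_dir = os.path.join(save_dir, 'fine_tuned_model_final')
eval_model = BartForConditionalGeneration.from_pretrained(final_dir).to(device)
eval_model.eval()
sample_task = pd.read_csv(csv_path).iloc[0]['task']
inputs = tokenizer(sample_task, return_tensors='pt', truncation=True, max_length=512).to(device)
with torch.no_grad():
    generated_ids = eval_model.generate(
        **inputs,
        num_beams=4,
        no_repeat_ngram_size=3,
        early_stopping=True,
        length_penalty=length_penalty,
        max_length=max_length,
        min_length=min_length,
    )
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))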