# Created November 28, 2023 19:25
# BART Large CNN fine-tuning
import os

import pandas as pd
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import BartForConditionalGeneration, BartTokenizer
# Define your dataset class
class CustomDataset(Dataset):
    """Dataset of (task, summary) text pairs read from a CSV file.

    The CSV must contain a ``task`` column (the model input text) and a
    ``summary`` column (the target text). Each item is tokenized on access
    and padded/truncated to 512 tokens.
    """

    def __init__(self, csv_path, tokenizer):
        """Load the CSV and remember the tokenizer.

        Args:
            csv_path: Path (or file-like object) accepted by ``pd.read_csv``.
            tokenizer: Object exposing ``encode_plus(text, padding=...,
                truncation=..., max_length=...)`` that returns a mapping with
                ``'input_ids'`` and ``'attention_mask'`` entries.
        """
        self.data = pd.read_csv(csv_path)
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized input/target tensors for row ``idx``.

        Returns:
            A dict with ``'input_ids'`` and ``'attention_mask'`` for the task
            text and ``'summary_ids'`` holding the token ids of the summary.
        """
        # Fetch the row once instead of indexing the frame per column.
        row = self.data.iloc[idx]
        task = row['task']
        summary = row['summary']
        # Tokenize the input and target sequences
        tokenized_inputs = self.tokenizer.encode_plus(
            task, padding='max_length', truncation=True, max_length=512
        )
        tokenized_summary = self.tokenizer.encode_plus(
            summary, padding='max_length', truncation=True, max_length=512
        )
        # Convert the tokenized sequences to tensors
        input_ids = torch.tensor(tokenized_inputs['input_ids'])
        attention_mask = torch.tensor(tokenized_inputs['attention_mask'])
        summary_ids = torch.tensor(tokenized_summary['input_ids'])
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'summary_ids': summary_ids,
        }
# Training hyperparameters
batch_size = 1
num_epochs = 50
learning_rate = 1e-5
# Location of the CSV file holding the task/summary pairs
csv_path = 'yuna copy.csv'
# Tokenizer loaded from the local BART checkpoint directory
tokenizer = BartTokenizer.from_pretrained('./bart-large-cnn/')
# Wrap the CSV in the custom dataset and serve it in shuffled batches
dataset = CustomDataset(csv_path, tokenizer)
dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)
# Load the pretrained BART model
model = BartForConditionalGeneration.from_pretrained('./bart-large-cnn/')
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model before the optimizer is built so the optimizer is created
# over the on-device parameters; the training batches are sent to `device` too.
model.to(device)
# Set the model to training mode (from_pretrained hands the model back in
# evaluation mode, i.e. with dropout disabled)
model.train()
# Define the optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)
# Number of optimizer steps over the whole run. No learning-rate scheduler is
# configured in the visible script, so this value is informational only.
total_steps = len(dataloader) * num_epochs
# Set the directory to save the models
save_dir = 'Yuna-trained'
# Set the task-specific parameters for summarization
length_penalty = 1.0
max_length = 1024
min_length = 120
dropout = 0.01
# Set the model parameters for summarization
model.config.length_penalty = length_penalty
model.config.max_length = max_length
model.config.min_length = min_length
# NOTE(review): this only updates the config object; the already-instantiated
# layers presumably captured their dropout rate at construction time, so this
# may not change the dropout actually applied during training - confirm.
model.config.dropout = dropout
# Generation defaults to store under the 'summarization' task entry
task_specific_params = {
    'summarization': {
        'early_stopping': True,
        'length_penalty': length_penalty,
        'max_length': max_length,
        'min_length': min_length,
        'no_repeat_ngram_size': 3,
        'num_beams': 4,
    },
}
# Update the model configuration with task-specific parameters. Only tasks the
# checkpoint already declares are overwritten; a config without any
# task-specific parameters (None) is left untouched instead of raising.
existing_task_params = model.config.task_specific_params or {}
for task, params in task_specific_params.items():
    if task in existing_task_params:
        existing_task_params[task] = params
# Training loop
# Make sure the model sits on the device the batches are sent to
# (a no-op when it is already there).
model.to(device)
for epoch in range(num_epochs):
    # Keep dropout active for every training epoch
    model.train()
    total_loss = 0
    for batch in dataloader:
        # Zero the gradients left over from the previous step
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['summary_ids'].to(device)
        # Summaries are padded to a fixed length; mark padded positions with
        # -100 so they are ignored by the loss instead of dominating it.
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)
        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        # Compute the loss
        loss = outputs.loss
        # Backpropagation
        loss.backward()
        # Update the model parameters
        optimizer.step()
        total_loss += loss.item()
    # Print the average loss for the epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")
    # Save the model every 5th epoch
    if (epoch + 1) % 5 == 0:
        checkpoint_dir = os.path.join(save_dir, f'fine_tuned_model_epoch{epoch+1}')
        model.save_pretrained(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)
# Save the final fine-tuned model
final_dir = os.path.join(save_dir, 'fine_tuned_model_final')
model.save_pretrained(final_dir)
tokenizer.save_pretrained(final_dir)
