Skip to content

Instantly share code, notes, and snippets.

View BramVanroy's full-sized avatar

Bram Vanroy BramVanroy

View GitHub Profile
@BramVanroy
BramVanroy / benchmark.py
Last active May 29, 2024 11:33
Fast method of "first-fit-decreasing" packing benchmark. Around 5x faster than baseline. Baseline taken from https://huggingface.co/DiscoResearch/Llama3-German-8B#document-packing. Note that memory usage will be higher in the optimized version.
import gc
import numpy as np
import time
import pandas as pd
from tqdm import tqdm
def pack_documents_original(tokenized_documents, block_size: int = 8192, use_tqdm=True):
#!/usr/bin/env python
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
@BramVanroy
BramVanroy / convert_to_safetensors.py
Last active August 3, 2023 09:51
Convert a given (local) model to safetensors format
import importlib
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
from transformers import HfArgumentParser, AutoConfig, AutoTokenizer
@dataclass
class ScriptArguments:
@BramVanroy
BramVanroy / gpu-error-log.sh
Last active July 18, 2023 07:24
Log ssh GPU errors
# If nvidia-smi reports an error ("ERR" anywhere in its output), write that
# output plus the nvcc version info to ~/gpu-errors/<hostname>-error.txt.
nvidia_smi_output=$(nvidia-smi)
# Test the captured *output* (note the "$"), not the literal variable name.
if printf '%s\n' "$nvidia_smi_output" | grep -q "ERR"; then
    fname=~/gpu-errors/$(hostname)-error.txt
    pdir=$(dirname "$fname")
    # Create the log directory on first use.
    mkdir -p "$pdir"
    nvcc_output=$(nvcc --version)
    # nvidia-smi output first, then the nvcc version info on the following lines.
    printf '%s\n%s\n' "$nvidia_smi_output" "$nvcc_output" > "$fname"
fi
@BramVanroy
BramVanroy / set_seed.py
Last active March 30, 2023 07:42
Setting deterministic seeds
def set_seed(seed: Optional[int]):
    """Seed the PyTorch, NumPy and Python random number generators.

    Also switches cuDNN to deterministic mode (and turns off its benchmark
    mode) and exports ``PYTHONHASHSEED``.

    Args:
        seed: The seed to apply. When ``None``, nothing is changed.
    """
    if seed is None:
        return

    # PyTorch: seed the default generator and the generators of all CUDA devices.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Ask cuDNN for deterministic behaviour and disable its benchmark mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # NumPy's legacy global RNG and Python's built-in `random` module.
    np.random.seed(seed)
    random.seed(seed)

    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it
    # here only influences processes spawned after this call.
    os.environ["PYTHONHASHSEED"] = str(seed)
@BramVanroy
BramVanroy / run.py
Last active July 19, 2023 09:22
Overwrite HfArgumentParser config options with CLI arguments
# See https://gist.github.com/BramVanroy/f78530673b1437ed0d6be7c61cdbdd7c
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, HyperOptArguments))

# Assumes that the first CLI argument ending in ".json" is the config file;
# `config_file` is None when no such argument was given.
config_file = next((arg for arg in sys.argv if arg.endswith(".json")), None)

run_name_specified = False
@BramVanroy
BramVanroy / vsc-lmod-deepspeed.bashrc
Last active January 23, 2023 08:37
Combining LMOD with DeepSpeed. As a bonus, also add a command to automatically generate a hostfile.
# When the current host name begins with "gpu" (e.g. gpu512.dodrio.os),
# load the PyTorch (CUDA) and pdsh modules.
# This makes sure that deepspeed/pdsh work in multi node settings
case "$(hostname)" in
    gpu*)
        module load PyTorch/1.12.0-foss-2022a-CUDA-11.7.0
        module load pdsh/2.34-GCCcore-11.3.0
        ;;
esac
# Automatically generates a hostfile for the current job in the current directory,
@BramVanroy
BramVanroy / get_memory_usage.py
Created December 12, 2022 10:07
Print out CPU/GPU memory usage (basic)
import math
import psutil
from pynvml import nvmlDeviceGetCount, nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
def format_bytes(nbytes: int) -> str:
if nbytes == 0:
return "0 B"
unit = ("B", "kB", "MB", "GB", "TB")
@BramVanroy
BramVanroy / get_words_of_tokens.py
Created June 15, 2022 13:44
Get original words of tokens in HF Tokenizers
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# The sentence is already tokenized: splitting on whitespace yields the words.
text = "It 's a pre-tokenized , silly sentence !"
words = text.split()
encoded = tokenizer(words, is_split_into_words=True)

# Print every token next to the original word it was produced from.
for token, wordid in zip(encoded.tokens(), encoded.word_ids()):
    if wordid is None:
        # No source word for this token (presumably a special token such as
        # [CLS]/[SEP] -- confirm against the tokenizer docs).
        continue
    print(token, words[wordid])
from typing import List
import spacy
from spacy import Language, Vocab
from spacy.tokens import Doc
def load_nlp(model_name: str = "en_core_web_sm",
is_tokenized: bool = False,
exclude: List[str] = None):
"""Load a spaCy model. Disable sentence segmentation and tokenization with is_tokenized.