Fine-tuning code-generation large language models (LLMs) such as Codex, StarCoder, and Code Llama can make them significantly more useful for your organization's coding standards and proprietary libraries. However, fine-tuning is resource-intensive. This guide shows how to fine-tune a code LLM on a private code base efficiently, using a single GPU.
We use the top 10 public Hugging Face repositories on GitHub as training data, excluding non-code files such as images and audio. The resulting dataset, available as smangrul/hf-stack-v1, includes repo IDs, file paths, and file contents.
We'll fine-tune bigcode/starcoderbase-1b, a 1B-parameter model trained on 80+ programming languages. It is a gated model, so request access on its Hugging Face model page before proceeding.
!pip install -q transformers datasets accelerate peft bitsandbytes flash-attn
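Since StarCoder is a gated model, log in to the Hugging Face Hub before downloading the checkpoint (shown here with notebook_login, assuming a notebook environment; huggingface-cli login works from a terminal):
from huggingface_hub import notebook_login

notebook_login()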
MODEL = "bigcode/starcoderbase-1b"       # base model to fine-tune
DATASET = "smangrul/hf-stack-v1"         # dataset of Hugging Face repository code
DATA_COLUMN = "content"                  # column holding the raw file contents
SEQ_LENGTH = 2048                        # sequence length (in tokens)
MAX_STEPS = 500                          # total training steps
BATCH_SIZE = 16                          # per-device batch size
LR = 5e-4                                # learning rate
OUTPUT_DIR = "peft-starcoder-lora-a100"  # local output dir and Hub repo name
BF16 = True                              # train with bfloat16 mixed precision
Load the dataset in streaming mode to handle large data efficiently. Reserve the first 4000 examples for validation and the rest for training.
from datasets import load_dataset
dataset = load_dataset(DATASET, data_dir="data", split="train", streaming=True)
valid_data = dataset.take(4000)
train_data = dataset.skip(4000)
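To sanity-check the stream, you can peek at a single example; this step is optional, and the columns beyond content follow the dataset description above:
sample = next(iter(valid_data))
print(list(sample.keys()))        # repo id, file path, and content columns
print(sample[DATA_COLUMN][:200])  # first 200 characters of one file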
Estimate the dataset's character-to-token ratio; it is used below to size the character buffer when packing fixed-length token sequences.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
total_characters, total_tokens = 0, 0
for _, example in zip(range(nb_examples), iter(dataset)):
total_characters += len(example[data_column])
total_tokens += len(tokenizer(example[data_column]).tokens())
return total_characters / total_tokens
chars_per_token = chars_token_ratio(train_data, tokenizer, DATA_COLUMN)
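As a quick optional diagnostic, print the estimated ratio:
print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")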
Fill-in-the-middle (FIM) transformations teach the model to infill code (predicting a missing middle span from its prefix and suffix) rather than only generating text left to right.
import functools
import numpy as np
@functools.lru_cache(maxsize=None)
def get_fim_token_ids(tokenizer):
    # StarCoder exposes its FIM control tokens as additional special tokens;
    # look up their vocabulary ids once and cache the result.
    FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = tokenizer.special_tokens_map["additional_special_tokens"][1:5]
    return (
        tokenizer.vocab[FIM_SUFFIX],
        tokenizer.vocab[FIM_PREFIX],
        tokenizer.vocab[FIM_MIDDLE],
        tokenizer.vocab[FIM_PAD],
    )
def permute(sample, np_rng, suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id, fim_rate=0.5, fim_spm_rate=0.5):
    # With probability fim_rate, split the token sequence at two random boundaries
    # into prefix/middle/suffix and rearrange it into a FIM training example.
    if np_rng.binomial(1, fim_rate):
        boundaries = sorted(np_rng.randint(0, len(sample) + 1, size=2))
        prefix, middle, suffix = np.array_split(sample, [boundaries[0], boundaries[1]])
        if np_rng.binomial(1, fim_spm_rate):
            # SPM variant: <prefix_tok><suffix_tok> suffix <middle_tok> prefix middle
            new_sample = np.concatenate([[prefix_tok_id, suffix_tok_id], suffix, [middle_tok_id], prefix, middle])
        else:
            # PSM variant: <prefix_tok> prefix <suffix_tok> suffix <middle_tok> middle
            new_sample = np.concatenate([[prefix_tok_id], prefix, [suffix_tok_id], suffix, [middle_tok_id], middle])
    else:
        # Otherwise leave the sample as a plain left-to-right example.
        new_sample = sample
    return list(new_sample), np_rng
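To see how these helpers fit together, here is a minimal illustrative sketch (not part of the training pipeline below) that applies a FIM permutation to one tokenized snippet; the example snippet and seed are arbitrary:
suffix_id, prefix_id, middle_id, pad_id = get_fim_token_ids(tokenizer)
np_rng = np.random.RandomState(seed=0)
token_ids = tokenizer("def add(a, b):\n    return a + b\n")["input_ids"]
fim_sample, np_rng = permute(token_ids, np_rng, suffix_id, prefix_id, middle_id, pad_id)
# With probability fim_rate the tokens come back in FIM order, otherwise unchanged.
print(tokenizer.decode([int(t) for t in fim_sample]))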
Wrap tokenization in a ConstantLengthDataset that buffers streamed files, tokenizes them, and yields constant-length chunks of tokens. Only __iter__ was given originally; the __init__ below is a minimal sketch, assuming the tokenizer's EOS token as the concatenation token and a buffer of roughly 1024 sequences.
import random
import torch
from torch.utils.data import IterableDataset

class ConstantLengthDataset(IterableDataset):
    def __init__(self, tokenizer, dataset, seq_length, chars_per_token, content_field=DATA_COLUMN, num_of_sequences=1024, seed=0):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id  # assumed: EOS separates concatenated files
        self.dataset = dataset
        self.seq_length = seq_length
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
        self.content_field = content_field
        self.seed = seed

    def __iter__(self):
        iterator = iter(self.dataset)
        np_rng = np.random.RandomState(seed=self.seed)  # reserved for FIM permutation (not applied in this simplified version)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while buffer_len < self.max_buffer_size:
                try:
                    buffer.append(next(iterator)[self.content_field])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    more_examples = False  # stream exhausted: process this last buffer, then stop
                    break
            # tokenize the buffer, join files with the separator token, and slice into fixed-length chunks
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = [token for tokens in tokenized_inputs for token in tokens + [self.concat_token_id]]
            examples = [all_token_ids[i : i + self.seq_length] for i in range(0, len(all_token_ids), self.seq_length)]
            random.shuffle(examples)
            for example in examples:
                yield {"input_ids": torch.LongTensor(example), "labels": torch.LongTensor(example)}
train_dataset = ConstantLengthDataset(tokenizer, train_data, SEQ_LENGTH, chars_per_token)
eval_dataset = ConstantLengthDataset(tokenizer, valid_data, SEQ_LENGTH, chars_per_token)
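As an optional sanity check, pull one chunk from the training set and confirm its length matches SEQ_LENGTH:
example = next(iter(train_dataset))
print(example["input_ids"].shape)  # typically torch.Size([2048]) for SEQ_LENGTH = 2048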
Load the base model with 4-bit NF4 quantization via bitsandbytes so it fits on a single GPU, then prepare it for k-bit training.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=True
)
model = AutoModelForCausalLM.from_pretrained(
MODEL, quantization_config=bnb_config, device_map={"": 0}, use_cache=False, trust_remote_code=True, use_flash_attention_2=True
)
model = prepare_model_for_kbit_training(model)
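Optionally, check how much GPU memory the quantized model occupies before attaching the adapters:
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")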
Configure and train the model using LoRA for parameter-efficient fine-tuning.
from peft import LoraConfig, get_peft_model
lora_config = LoraConfig(
r=8, lora_alpha=32, lora_dropout=0.0, target_modules=["c_proj", "c_attn", "q_attn", "c_fc"], task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
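Only the LoRA adapter weights are trainable; you can confirm how small that fraction is with PEFT's built-in summary:
model.print_trainable_parameters()  # trainable params are a small fraction of the 1B total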
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LR,
    max_steps=MAX_STEPS,
    output_dir=OUTPUT_DIR,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    logging_steps=25,
    bf16=BF16,
    push_to_hub=True,
)
trainer = Trainer(
model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, tokenizer=tokenizer
)
trainer.train()
After training, the LoRA adapter is automatically pushed to the Hugging Face Hub (in this example it is available as zulqarnain-kernel/peft-starcoder-lora-a100). You can then load the adapter on top of the base model and generate code in the style of your dataset.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel, PeftConfig
import torch

prompt = "Generate a function that converts a hex char[] to its equivalent ASCII character and returns it as unsigned char[]."

peft_model_id = "zulqarnain-kernel/peft-starcoder-lora-a100"
config = PeftConfig.from_pretrained(peft_model_id)
base_model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)  # bigcode/starcoderbase-1b
model = PeftModel.from_pretrained(base_model, peft_model_id)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_length=500)
response = tokenizer.decode(outputs[0], clean_up_tokenization_spaces=False)
print(response)