This is the code I use to train a reward model:
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
    HfArgumentParser
)
import pandas as pd
from peft import LoraConfig, TaskType
from trl import RewardConfig, RewardTrainer
df = pd.read_csv('data.csv')
raw_dataset = Dataset.from_pandas(df[:3])
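# data.csv is expected to contain "prompt", "chosen" and "rejected" columns
# (these are the fields used in formatting_func below).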
model_id = 'meta-llama/Llama-2-7b-hf'
compute_dtype = getattr(torch, "float16")
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)
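# Note: quant_config above is defined but not passed to from_pretrained below;
# hf_auth is my Hugging Face access token (defined elsewhere in the notebook).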
model = AutoModelForCausalLM.from_pretrained(model_id, use_auth_token=hf_auth)
model.config.use_cache = False
model.config.pretraining_tp = 1
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_auth)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
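# A new [PAD] token is added to the vocabulary here. Note that the script never
# calls model.resize_token_embeddings(len(tokenizer)) afterwards, which is the
# step usually paired with adding new special tokens.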
def formatting_func(examples):
    kwargs = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
        "return_tensors": "pt"
    }
    # Prepend the prompt and a line break to the chosen and rejected responses.
    prompt_plus_chosen_response = examples["prompt"] + "\n" + examples["chosen"]
    prompt_plus_rejected_response = examples["prompt"] + "\n" + examples["rejected"]
    # Tokenize the modified fields.
    tokens_chosen = tokenizer.encode_plus(prompt_plus_chosen_response, **kwargs)
    tokens_rejected = tokenizer.encode_plus(prompt_plus_rejected_response, **kwargs)
    return {
        "input_ids": tokens_chosen["input_ids"][0],
        "attention_mask": tokens_chosen["attention_mask"][0],
        "labels": tokens_rejected["input_ids"][0],  # Use rejected as labels for causal LM
        "input_ids_chosen": tokens_chosen["input_ids"][0],
        "attention_mask_chosen": tokens_chosen["attention_mask"][0],
        "input_ids_rejected": tokens_rejected["input_ids"][0],
        "attention_mask_rejected": tokens_rejected["attention_mask"][0],
    }
raw_datasets = raw_dataset.map(formatting_func)
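# As far as I understand, RewardTrainer only reads the input_ids_chosen /
# attention_mask_chosen / input_ids_rejected / attention_mask_rejected columns;
# the plain input_ids / attention_mask / labels keys are extra.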
OUTPUT_DIR = "/kaggle/working/"
training_args = RewardConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    no_cuda=False,
    report_to="wandb",
    run_name="reward_model",
)
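# Note: fp16 and bf16 are both disabled here, even though the BitsAndBytesConfig
# above sets bnb_4bit_compute_dtype to float16.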
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
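# Note: the LoRA config uses TaskType.SEQ_CLS, but the base model above was loaded
# with AutoModelForCausalLM; the TRL reward-modeling examples I have seen load
# AutoModelForSequenceClassification with num_labels=1 instead.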
trainer = RewardTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=raw_datasets,
    peft_config=peft_config,
    # max_length=None
)
trainer.train()
This code gives IndexError: index out of range in self on Google Colab. I am also using Kaggle notebooks with two T4 GPUs, and I cannot load the model on either setup. Can anyone tell me what the issue is?
On Kaggle I get:
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
When I load the model on the CPU without quantization, it then shows:
IndexError: index out of range in self
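One thing I suspect is that the [PAD] token added above gets id 32000 (Llama-2's vocabulary has 32000 tokens, ids 0-31999), while the model's embedding matrix is never resized, so any sequence containing the pad id triggers an out-of-range embedding lookup. That would explain both errors: on CPU the lookup raises IndexError: index out of range in self, and on CUDA it surfaces as a device-side assert. Below is a minimal check I can run with the model and tokenizer defined above; this is just a sketch of my guess, not a confirmed fix:

# Compare the tokenizer's vocabulary size with the model's embedding table.
num_embeddings = model.get_input_embeddings().weight.shape[0]
print("embedding rows:", num_embeddings)        # 32000 for Llama-2-7b
print("len(tokenizer):", len(tokenizer))        # 32001 after adding [PAD]
print("pad_token_id:", tokenizer.pad_token_id)  # 32000 -> one past the last row

# If the tokenizer produces ids >= num_embeddings, the embedding lookup fails
# ("index out of range in self" on CPU, device-side assert on CUDA).
if len(tokenizer) > num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

Is this the actual cause, or is something else wrong with how I set up RewardTrainer?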