In [21]:
import os
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import numpy as np
import evaluate

# prevent annoying warnings
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}


def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

# full bert is about 2x slower 
model_name="distilbert/distilbert-base-cased"#distilbert/distilbert-base-uncased"

# dataset -- has text and label divided into "train" and "test" segments
imdb = load_dataset("imdb")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# collator -- handles batching
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
accuracy = evaluate.load("accuracy")



In [22]:
# it might take 30 min or so per epoch to train distillbert all 25k on my laptop; this takes about 70s
# it takes about 8m on full bert to just do a decode on laptop
train_dataset=imdb["train"].shuffle(seed=42).select(range(1000))
eval_dataset=imdb["test"].shuffle(seed=42).select(range(100))

In [23]:
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

In [24]:
eval_dataset[0]

{'text': "<br /><br />When I unsuspectedly rented A Thousand Acres, I thought I was in for an entertaining King Lear story and of course Michelle Pfeiffer was in it, so what could go wrong?<br /><br />Very quickly, however, I realized that this story was about A Thousand Other Things besides just Acres. I started crying and couldn't stop until long after the movie ended. Thank you Jane, Laura and Jocelyn, for bringing us such a wonderfully subtle and compassionate movie! Thank you cast, for being involved and portraying the characters with such depth and gentleness!<br /><br />I recognized the Angry sister; the Runaway sister and the sister in Denial. I recognized the Abusive Husband and why he was there and then the Father, oh oh the Father... all superbly played. I also recognized myself and this movie was an eye-opener, a relief, a chance to face my OWN truth and finally doing something about it. I truly hope A Thousand Acres has had the same effect on some others out there.<br /><b

In [36]:
# Let's see what happens when we change characteristics of the data. Does it affect performance?
def do_cap(examples):
    return {"text": [x.upper() for x in examples["text"]]}
cap_eval_dataset = eval_dataset.map(do_cap, batched=True)
cap_train_dataset = train_dataset.map(do_cap, batched=True)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 12433.09 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 68832.43 examples/s]


In [37]:
cap_eval_dataset[0]

{'text': "<BR /><BR />WHEN I UNSUSPECTEDLY RENTED A THOUSAND ACRES, I THOUGHT I WAS IN FOR AN ENTERTAINING KING LEAR STORY AND OF COURSE MICHELLE PFEIFFER WAS IN IT, SO WHAT COULD GO WRONG?<BR /><BR />VERY QUICKLY, HOWEVER, I REALIZED THAT THIS STORY WAS ABOUT A THOUSAND OTHER THINGS BESIDES JUST ACRES. I STARTED CRYING AND COULDN'T STOP UNTIL LONG AFTER THE MOVIE ENDED. THANK YOU JANE, LAURA AND JOCELYN, FOR BRINGING US SUCH A WONDERFULLY SUBTLE AND COMPASSIONATE MOVIE! THANK YOU CAST, FOR BEING INVOLVED AND PORTRAYING THE CHARACTERS WITH SUCH DEPTH AND GENTLENESS!<BR /><BR />I RECOGNIZED THE ANGRY SISTER; THE RUNAWAY SISTER AND THE SISTER IN DENIAL. I RECOGNIZED THE ABUSIVE HUSBAND AND WHY HE WAS THERE AND THEN THE FATHER, OH OH THE FATHER... ALL SUPERBLY PLAYED. I ALSO RECOGNIZED MYSELF AND THIS MOVIE WAS AN EYE-OPENER, A RELIEF, A CHANCE TO FACE MY OWN TRUTH AND FINALLY DOING SOMETHING ABOUT IT. I TRULY HOPE A THOUSAND ACRES HAS HAD THE SAME EFFECT ON SOME OTHERS OUT THERE.<BR /><B

In [38]:
tokenized_cap_eval_dataset = cap_eval_dataset.map(preprocess_function, batched=True)
tokenized_cap_train_dataset = cap_train_dataset.map(preprocess_function, batched=True)

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1291.60 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1335.01 examples/s]


In [28]:
# load the model. It is a generic BERT (like) model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# let's look at the architecture
model

# Vanilla Distilbert

In [30]:
# output directory will store the model. 
# batch size is "t"
# epochs is times through the data
# optimizer is "AdamW"; learning_rate is initial rate; weight_decay is another hyperparam of adamw
training_args = TrainingArguments(
    output_dir="test",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="steps", # can also do "epoch" 
    eval_steps=20,
    save_strategy="steps",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [31]:
# should be random
trainer.evaluate(tokenized_eval_dataset)

{'eval_loss': 0.6957502961158752,
 'eval_model_preparation_time': 0.0008,
 'eval_accuracy': 0.53,
 'eval_runtime': 2.3972,
 'eval_samples_per_second': 41.715,
 'eval_steps_per_second': 2.92}

In [32]:
trainer.evaluate(tokenized_cap_eval_dataset)

{'eval_loss': 0.6954565644264221,
 'eval_model_preparation_time': 0.0008,
 'eval_accuracy': 0.53,
 'eval_runtime': 1.4346,
 'eval_samples_per_second': 69.707,
 'eval_steps_per_second': 4.88}

In [33]:
trainer.train()

Step,Training Loss,Validation Loss,Model Preparation Time,Accuracy
50,No log,0.552961,0.0008,0.8
100,No log,0.39821,0.0008,0.84


TrainOutput(global_step=126, training_loss=0.48765127999441965, metrics={'train_runtime': 79.6421, 'train_samples_per_second': 25.112, 'train_steps_per_second': 1.582, 'total_flos': 262556593545504.0, 'train_loss': 0.48765127999441965, 'epoch': 2.0})

In [34]:
trainer.evaluate(tokenized_eval_dataset)

{'eval_loss': 0.3923421800136566,
 'eval_model_preparation_time': 0.0008,
 'eval_accuracy': 0.85,
 'eval_runtime': 1.1875,
 'eval_samples_per_second': 84.208,
 'eval_steps_per_second': 5.895,
 'epoch': 2.0}

In [39]:
trainer.evaluate(tokenized_cap_eval_dataset)

{'eval_loss': 0.82340407371521,
 'eval_model_preparation_time': 0.0008,
 'eval_accuracy': 0.53,
 'eval_runtime': 1.3675,
 'eval_samples_per_second': 73.127,
 'eval_steps_per_second': 5.119,
 'epoch': 2.0}

In [45]:
preds = trainer.predict(tokenized_eval_dataset)

In [54]:
tokenized_train_dataset.set_format("pt", columns=["input_ids",  "attention_mask"], output_all_columns=True)

print(len(tokenized_train_dataset[1]["input_ids"]))
print(len(tokenized_cap_train_dataset[1]["input_ids"]))


166
354


In [55]:
# what happens if we don't have a gpu?
training_args = TrainingArguments(
    output_dir="slow_test",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="steps", # can also do "epoch" 
    eval_steps=20,
    save_strategy="steps",
    load_best_model_at_end=True,
    use_cpu=True,
    use_mps_device=False,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#slow! -- 18:49 for 2 epochs
trainer.train()

In [88]:
# for distilbert -- we will use lora on these parts of the model
target_modules=[
"q_lin",
"k_lin",
"v_lin",
"out_lin",
]

In [89]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [90]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

# PEFT

In [91]:
from peft import get_peft_model
from peft import LoraConfig, TaskType

# dx4 + 4xd matrix used instead of dxd to save memory and time (might be bad performance though
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=4,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=target_modules
)

peft_model = get_peft_model(model, peft_config)

In [92]:
# notice the savings
peft_model.print_trainable_parameters()

trainable params: 739,586 || all params: 66,522,628 || trainable%: 1.1118


In [93]:
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(28996, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768

In [97]:
training_args = TrainingArguments(
    output_dir="test",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
   # use_cpu=True,
   # use_mps_device=False,
)

In [98]:
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [99]:
# 15 minutes. Also it takes less memory. Still best to not use the cpu
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.685648,0.55
2,No log,0.683444,0.54


TrainOutput(global_step=126, training_loss=0.6884631202334449, metrics={'train_runtime': 72.6358, 'train_samples_per_second': 27.535, 'train_steps_per_second': 1.735, 'total_flos': 267059820282432.0, 'train_loss': 0.6884631202334449, 'epoch': 2.0})

In [60]:
trainer.evaluate(tokenized_eval_dataset)

{'eval_loss': 0.6999419927597046,
 'eval_model_preparation_time': 0.013,
 'eval_accuracy': 0.46,
 'eval_runtime': 2.2831,
 'eval_samples_per_second': 43.8,
 'eval_steps_per_second': 3.066}

In [61]:
trainer.evaluate(tokenized_cap_eval_dataset)

{'eval_loss': 0.7067443132400513,
 'eval_model_preparation_time': 0.013,
 'eval_accuracy': 0.46,
 'eval_runtime': 2.0594,
 'eval_samples_per_second': 48.558,
 'eval_steps_per_second': 3.399}

In [62]:
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy
1,No log,0.694186,0.013,0.42
2,No log,0.69292,0.013,0.48


TrainOutput(global_step=126, training_loss=0.7009014553493924, metrics={'train_runtime': 129.1552, 'train_samples_per_second': 15.485, 'train_steps_per_second': 0.976, 'total_flos': 524201321214528.0, 'train_loss': 0.7009014553493924, 'epoch': 2.0})

In [63]:
trainer.evaluate(tokenized_eval_dataset)

{'eval_loss': 0.6929197907447815,
 'eval_model_preparation_time': 0.013,
 'eval_accuracy': 0.48,
 'eval_runtime': 2.3495,
 'eval_samples_per_second': 42.562,
 'eval_steps_per_second': 2.979,
 'epoch': 2.0}

In [64]:
trainer.evaluate(tokenized_cap_eval_dataset)

{'eval_loss': 0.6966588497161865,
 'eval_model_preparation_time': 0.013,
 'eval_accuracy': 0.46,
 'eval_runtime': 2.1934,
 'eval_samples_per_second': 45.592,
 'eval_steps_per_second': 3.191,
 'epoch': 2.0}