from transformers import pipeline

# to start with, we will classify a single sentence
classifier = pipeline("sentiment-analysis")
print(classifier("I watched a good movie yesterday"))
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
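To avoid this warning, the checkpoint (and optionally the revision) named in the log above can be pinned explicitly; a small sketch using the defaults reported there:

# pin the default checkpoint explicitly (name and revision taken from the warning above)
classifier = pipeline("sentiment-analysis",
                      model="distilbert-base-uncased-finetuned-sst-2-english",
                      revision="af0f99b")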
# zero-shot classification
# we haven't trained the model on the labels we are using
zero_shot_classifier = pipeline("zero-shot-classification")
zero_shot_classifier("this is a very interesting course on algebra",
                     candidate_labels=["mathematics", "physics", "chemistry", "biology"])
No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
{'sequence': 'this is a very interesting course on algebra',
'labels': ['mathematics', 'biology', 'physics', 'chemistry'],
'scores': [0.9866520166397095,
0.004770115949213505,
0.004415604285895824,
0.004162236116826534]}
# text generation
generator = pipeline('text-generation')
generator("newtons first law states that")
No model was supplied, defaulted to gpt2 and revision 6c0e608 (https://huggingface.co/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
/Users/achinta/miniforge3/envs/ml/lib/python3.9/site-packages/transformers/generation_utils.py:1296: UserWarning: Neither `max_length` nor `max_new_tokens` has been set, `max_length` will default to 50 (`self.config.max_length`). Controlling `max_length` via the config is deprecated and `max_length` will be removed from the config in v5 of Transformers -- we recommend using `max_new_tokens` to control the maximum length of the generation.
warnings.warn(
[{'generated_text': 'newtons first law states that students under the age of 18 "shall be admitted to any of these colleges and universities which may be established" with their respective states of residence and where they may obtain college degrees or certificates." Section 1-106 sets forth'}]
# text generation by specifying a model
generator = pipeline('text-generation', model='distilgpt2')
generator('in this course, we will teach you how to', max_length=30, num_return_sequences=3)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
[{'generated_text': 'in this course, we will teach you how to use your hands to manipulate your feet to push the button in many directions, and try to help you'},
{'generated_text': 'in this course, we will teach you how to use them in a way for your students. So if you have any questions or concerns, do you'},
{'generated_text': 'in this course, we will teach you how to be very confident in what you do during training sessions, during training sessions, and in your training sessions'}]
# fill-mask
unmasker = pipeline('fill-mask')
unmasker("sun rises in the <mask>", top_k=3)
No model was supplied, defaulted to distilroberta-base and revision ec58a5b (https://huggingface.co/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
[{'score': 0.296585351228714,
'token': 6360,
'token_str': ' sky',
'sequence': 'sun rises in the sky'},
{'score': 0.06161285191774368,
'token': 12351,
'token_str': ' Arctic',
'sequence': 'sun rises in the Arctic'},
{'score': 0.0532655231654644,
'token': 3778,
'token_str': ' sun',
'sequence': 'sun rises in the sun'}]
# named entity recognition
ner = pipeline('ner', grouped_entities=True)
ner("My name is Kiran and I work at Amazon")
No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
/Users/achinta/miniforge3/envs/ml/lib/python3.9/site-packages/transformers/pipelines/token_classification.py:135: UserWarning: `grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to `aggregation_strategy="simple"` instead.
warnings.warn(
# question answering
question_answerer = pipeline('question-answering')
question_answerer(question="What is the capital of India?",
                  context="India is a country in South Asia. Its capital is New Delhi")
No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
In encoder models, the attention layers can access all the words in the input sequence. Pretraining typically involves corrupting the input sequence (for example, by masking random words) and asking the model to reconstruct the original sequence. Encoder models are best suited for tasks that require a full understanding of the entire sequence, such as sentence classification.
Decoder models use only the decoder of the Transformer architecture. At each stage, the attention layer can access only the words positioned before it in the sentence. These models are called auto-regressive models.
Sequence-to-sequence models use both parts of the Transformer architecture. At each stage, the attention layers of the encoder can access all the words in the initial sentence, whereas the attention layers of the decoder can only access the words positioned before a given word in the input.
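These three families correspond to different Auto classes in transformers. A minimal sketch (not part of the original notebook) loading one representative checkpoint of each kind:

from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM

# encoder-only model (e.g. BERT) - suited for classification-style tasks
encoder_model = AutoModel.from_pretrained("bert-base-uncased")

# decoder-only, auto-regressive model (e.g. GPT-2) - suited for text generation
decoder_model = AutoModelForCausalLM.from_pretrained("gpt2")

# encoder-decoder (sequence-to-sequence) model (e.g. T5) - suited for translation and summarization
seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")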
Loading a model
from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)
Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
from transformers import AutoTokenizer

raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
inputs
# if the task is sequence classification
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
print(outputs.logits.shape)
print(f'outputs - {outputs.logits}')

# to get probabilities, we need to pass the logits through a softmax
import torch
probs = torch.softmax(outputs.logits, dim=-1)
print(f'probabilities - {probs}')
print(model.config.id2label)
Subword tokenization algorithms rely on the principle that frequently used words should not be split into smaller subwords, but rare words should be decomposed into meaningful subwords.
Tokenizers
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
text = "The medicine is arsenic album, ars albaam, allium sepa"
input_ids = tokenizer(text, return_tensors="pt").input_ids
print(input_ids[0])

# create the text from input ids
print(tokenizer.decode(input_ids[0]))

# print each token
tokens = [token for token in tokenizer.convert_ids_to_tokens(input_ids[0])]
print(tokens)

# print the vocab size
print(tokenizer.vocab_size)
We noticed that the tokenizer splits unknown words into subwords that may not be meaningful on their own. For example, the word “huggingface” is split into “hug”, “##ging”, “##face”, because the vocabulary the tokenizer was trained with does not contain the word “huggingface”.
But there is a simpler way to get the tokens.
tokens = tokenizer.tokenize(text)
print(tokens)

# convert to ids
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)
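To see the subword behaviour described above on an out-of-vocabulary word, we can tokenize it directly (a small added check, not in the original notebook; the exact split depends on the tokenizer's vocabulary):

# a rare or unknown word gets decomposed into subword pieces
print(tokenizer.tokenize("huggingface"))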
Attention masks are tensors with the exact same shape as the input IDs tensor, filled with 0s and 1s: 1s indicate the corresponding tokens should be attended to, and 0s indicate the corresponding tokens should not be attended to (i.e., they should be ignored by the attention layers of the model).
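As a small illustration (added here, not part of the original outputs), padding a batch of two sentences of different lengths shows the padded positions marked with 0 in the attention mask:

padded = tokenizer(["Hello world", "A much longer sentence than the first one"],
                   padding=True, return_tensors="pt")
print(padded["input_ids"])
print(padded["attention_mask"])  # 1 = real token, 0 = padding to be ignored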
3.2 Processing the data
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    'I have been waiting for a Hugging face course my whole life.',
    'This course is amazing',
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors='pt')
print(batch.keys())
batch['labels'] = torch.tensor([1, 1])
optimizer = AdamW(model.parameters(), lr=5e-5)
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
/Users/achinta/miniforge3/envs/ml/lib/python3.9/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
loss = model(**batch).loss
loss.backward()
optimizer.step()
# download and cache datasets
# the mrpc dataset is a dataset for paraphrase detection
from datasets import load_dataset

raw_datasets = load_dataset('glue', 'mrpc')
raw_datasets
Found cached dataset glue (/Users/achinta/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
'label': 1,
'idx': 0}
from transformers import AutoTokenizer

checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_datasets['train']['sentence1'])
print(tokenized_sentences_1['input_ids'][0])

# we need sentence pairs, and the tokenizer understands that
inputs = tokenizer("This is the first sentence.", "This is the second one.")
print(inputs.keys())
print(tokenizer.convert_ids_to_tokens(inputs['input_ids']))
# here token_type_ids is used to separate the two sentences
# we can tokenize the entire dataset in one go like this
tokenized_dataset = tokenizer(raw_datasets['train']['sentence1'],
                              raw_datasets['train']['sentence2'],
                              padding=True, truncation=True)

# to tokenize in batches, we define a function to be applied to each batch of samples.
# We can also handle any other preprocessing we want to do here
def tokenize_function(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], padding=True, truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(f"raw train dataset has keys - {raw_datasets['train'][0].keys()}")
print(f"tokenized train dataset has keys - {tokenized_datasets['train'][0].keys()}")
Loading cached processed dataset at /Users/achinta/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-7570128720f579c6.arrow
Loading cached processed dataset at /Users/achinta/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-7b7428962528a4a4.arrow
Loading cached processed dataset at /Users/achinta/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-3dcdb0892050254a.arrow
raw train dataset has keys - dict_keys(['sentence1', 'sentence2', 'label', 'idx'])
tokenized train dataset has keys - dict_keys(['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'])
# let's find the size of input_ids for a few samples
import random

samples = random.sample(list(tokenized_datasets["train"]), k=10)
samples = [{k: v for k, v in sample.items() if k not in ["idx", "sentence1", "sentence2"]} for sample in samples]
[len(sample['input_ids']) for sample in samples]
[96, 100, 96, 89, 89, 89, 96, 89, 103, 103]
# let's use the data collator to dynamically pad each batch to the length of its longest sample
# (rather than padding everything to one fixed length)
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
batch = data_collator(samples)
print(batch.keys())
[len(batch['input_ids'][i]) for i in range(len(batch['input_ids']))]
{k: v.shape for k, v in batch.items()}
Found cached dataset glue (/Users/achinta/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
loading configuration file config.json from cache at /Users/achinta/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
"_name_or_path": "bert-base-uncased",
"architectures": [
"BertForMaskedLM"
],
"attention_probs_dropout_prob": 0.1,
"classifier_dropout": null,
"gradient_checkpointing": false,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
"position_embedding_type": "absolute",
"transformers_version": "4.23.1",
"type_vocab_size": 2,
"use_cache": true,
"vocab_size": 30522
}
loading file vocab.txt from cache at /Users/achinta/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/vocab.txt
loading file tokenizer.json from cache at /Users/achinta/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /Users/achinta/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/tokenizer_config.json
loading configuration file config.json from cache at /Users/achinta/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Loading cached processed dataset at /Users/achinta/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-df486dc2eb69ac71.arrow
Loading cached processed dataset at /Users/achinta/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-782aec776148990b.arrow
Loading cached processed dataset at /Users/achinta/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-8daf6f80b8639cc1.arrow
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file config.json from cache at /Users/achinta/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
loading weights file pytorch_model.bin from cache at /Users/achinta/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/pytorch_model.bin
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We get a warning after instantiating this pretrained model. This is because BERT has not been pretrained on classifying pairs of sentences, so the head of the pretrained model has been discarded and a new head suitable for sequence classification has been added instead. The warnings indicate that some weights were not used (the ones corresponding to the dropped pretraining head) and that some others were randomly initialized (the ones for the new head).
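The code cell that produced the logs above is not included in these notes. A minimal sketch of what it likely contained (the output directory name is hypothetical; num_labels=2 matches MRPC's two classes):

from transformers import TrainingArguments, AutoModelForSequenceClassification

training_args = TrainingArguments("test-trainer")  # hypothetical output directory
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# the sequence-classification head (classifier.weight / classifier.bias) is newly
# initialized, which is what triggers the warning about training on a downstream task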
5 The datasets library
!export TOKENIZERS_PARALLELISM=true
from datasets import load_dataset

# !wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
# !wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz
squad_it_dataset = load_dataset('json', data_files='SQuAD_it-train.json.gz', field='data')
squad_it_dataset

# we can include both the train and test data in squad_it_dataset
data_files = {"train": "SQuAD_it-train.json.gz", "test": "SQuAD_it-test.json.gz"}
squad_it_dataset = load_dataset('json', data_files=data_files, field='data')
squad_it_dataset
# we can also pass remote URLs
Using custom data configuration default-bbdcaac21d7e3d0b
Found cached dataset json (/Users/achinta/.cache/huggingface/datasets/json/default-bbdcaac21d7e3d0b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
Using custom data configuration default-80fa3afbe58e2f42
Found cached dataset json (/Users/achinta/.cache/huggingface/datasets/json/default-80fa3afbe58e2f42/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
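For example (following the same SQuAD-it URLs that appear in the commented-out wget commands above), the data_files dictionary can point directly at remote files:

url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")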
-rw-r--r-- 1 achinta staff 80M Oct 2 2018 drugsComTrain_raw.tsv
-rw-r--r-- 1 achinta staff 27M Oct 2 2018 drugsComTest_raw.tsv
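The listing above shows the Drug Review TSV files, but the cell that loads them is not included in these notes. A plausible reconstruction (the drug_dataset name is taken from the cells below; the tab delimiter is an assumption for .tsv files):

data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")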
Using custom data configuration default-4eaca5caac99961c
Found cached dataset csv (/Users/achinta/.cache/huggingface/datasets/csv/default-4eaca5caac99961c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
CPU times: user 69.1 ms, sys: 17.9 ms, total: 87 ms
Wall time: 1.52 s
# look at a sample of the data
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
drug_sample[:3]
Loading cached shuffled indices for dataset at /Users/achinta/.cache/huggingface/datasets/csv/default-4eaca5caac99961c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-a77308af6ca3c149.arrow
{'Unnamed: 0': [87571, 178045, 80482],
'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
'review': ['"like the previous person mention, I'm a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
'"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
'"I have been taking Mobic for over a year with no side effects other than an elevated blood pressure. I had severe knee and ankle pain which completely went away after taking Mobic. I attempted to stop the medication however pain returned after a few days."'],
'rating': [9.0, 3.0, 10.0],
'date': ['September 2, 2015', 'November 7, 2011', 'June 5, 2013'],
'usefulCount': [36, 13, 128]}
# check that the "Unnamed: 0" is unique key in the datasetfor split in drug_dataset.keys():assertlen(drug_dataset[split]) ==len(drug_dataset[split].unique("Unnamed: 0"))# lets rename the columndrug_dataset = drug_dataset.rename_column( original_column_name="Unnamed: 0", new_column_name="patient_id")drug_dataset
Loading cached sorted indices for dataset at /Users/achinta/.cache/huggingface/datasets/csv/default-4eaca5caac99961c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-75752611952acc02.arrow