BERT PyTorch Code Example
Setup
-
Download the Avro data file. Law Insider will copy it to Google Cloud or other storage for you. The file contains the contracts and other documents you have uploaded to a Law Insider Private Repository, exported in the Avro file format.
-
Install the required Python packages:
- avro
- pandas
- beautifulsoup4
- transformers
- numpy
- torch
- tqdm
Complete Python Code
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter
import json
import pandas as pd
from bs4 import BeautifulSoup
from transformers import BertTokenizer
import numpy as np
import torch
from tqdm import tqdm
# Sequence-length constant. It is not referenced by the code shown in this
# example; the tokenizer calls pass their own max_length.
MAX_SEQ_LENGTH = 128
# The two tokenizer output fields that train() reads from each batch.
MODEL_INPUTS = frozenset(('input_ids', 'attention_mask'))
def html_to_text(html):
    """Parse *html* with BeautifulSoup and return its text, stripped of
    leading and trailing whitespace."""
    parsed = BeautifulSoup(html, 'html.parser')
    return parsed.get_text().strip()
# Path to one shard of the exported Avro data; change it to wherever the
# file was copied.
AVRO_PATH = "/Users/walkerrowe/Documents/avro/auto-contractfinder-20210512..20211129-00000-of-00128"
# Upper bound on how many documents are read from the shard.
MAX_DOCUMENTS = 50

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Maps the category name stored in the DataFrame to its integer class id.
labels = {
    "contract": 1,
    "not a contract": 0,
}

labels_with_text = []
cnt = 0
# Open the file in a `with` block so the handle is closed even if reading
# or HTML parsing raises part-way through.
with open(AVRO_PATH, "rb") as avro_file:
    avro_reader = DataFileReader(avro_file, DatumReader())
    for document in tqdm(avro_reader):
        # Check the limit before parsing so no HTML is converted for a
        # document that would be discarded anyway.
        if cnt == MAX_DOCUMENTS:
            break
        cnt += 1
        # A truthy 'contract' entry in the document's labels marks a contract.
        if document['labels'].get('contract'):
            category = 'contract'
        else:
            category = 'not a contract'
        labels_with_text.append((category, html_to_text(document['body'])))

df = pd.DataFrame(labels_with_text, columns=['category', 'text'])
print(df)
# Shuffle the rows once with a fixed seed, then cut them 80% / 10% / 10%
# into train / validation / test.
np.random.seed(112)  # kept as-is; sample() below is seeded by random_state
shuffled_df = df.sample(frac=1, random_state=42)
train_end = int(.8 * len(df))
val_end = int(.9 * len(df))
# Positional slices give the same three pieces as
# np.split(shuffled_df, [train_end, val_end]) without handing a DataFrame
# to NumPy's array splitter.
# NOTE(review): newer pandas releases reportedly warn on np.split(DataFrame);
# confirm against the installed version.
df_train = shuffled_df.iloc[:train_end]
df_val = shuffled_df.iloc[train_end:val_end]
df_test = shuffled_df.iloc[val_end:]
print(len(df_train), len(df_val), len(df_test))
class Dataset(torch.utils.data.Dataset):
    """Tokenized documents paired with integer class ids.

    Each row of the input frame becomes one sample: the ``text`` column is
    tokenized with the module-level ``tokenizer`` and the ``category``
    column is mapped to an integer through the module-level ``labels`` dict.
    """

    def __init__(self, df, max_length=512):
        """Tokenize every text in *df* up front.

        Args:
            df: DataFrame with a ``category`` column whose values are keys
                of ``labels`` and a ``text`` column of strings.
            max_length: Length every text is padded or truncated to.
        """
        self.labels = [labels[label] for label in df['category']]
        # return_tensors="pt" makes each field a tensor with a leading
        # dimension of 1; the training loop squeezes it away again.
        self.texts = [
            tokenizer(text, padding='max_length', max_length=max_length,
                      truncation=True, return_tensors="pt")
            for text in df['text']
        ]

    def classes(self):
        """Return the list of integer class ids, one per sample."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the class id at *idx* wrapped in a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at *idx*."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, label)`` pair at *idx*."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y
# This repeats the 80% / 10% / 10% split performed earlier in the script;
# because random_state is fixed, the result is identical.
np.random.seed(112)  # kept as-is; sample() below is seeded by random_state
shuffled_df = df.sample(frac=1, random_state=42)
train_end = int(.8 * len(df))
val_end = int(.9 * len(df))
# Positional slices give the same three pieces as
# np.split(shuffled_df, [train_end, val_end]) without handing a DataFrame
# to NumPy's array splitter.
# NOTE(review): newer pandas releases reportedly warn on np.split(DataFrame);
# confirm against the installed version.
df_train = shuffled_df.iloc[:train_end]
df_val = shuffled_df.iloc[train_end:val_end]
df_test = shuffled_df.iloc[val_end:]
print(len(df_train), len(df_val), len(df_test))
from torch import nn
from transformers import BertModel
class BertClassifier(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a linear head."""

    def __init__(self, dropout=0.5, num_classes=2):
        """Load the pre-trained encoder and build the classification head.

        Args:
            dropout: Drop probability applied to BERT's pooled output.
            num_classes: Number of output scores. Defaults to 2, matching
                the two categories ("contract" / "not a contract") this
                example classifies.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        # Read the encoder width from the loaded model's config rather
        # than hard-coding 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return one score per class for each sequence in the batch.

        Args:
            input_id: Token ids, passed to BERT as ``input_ids``.
            mask: Attention mask, passed to BERT as ``attention_mask``.
        """
        # With return_dict=False BERT returns a tuple; only its second
        # item (the pooled output) is used here.
        _, pooled_output = self.bert(
            input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # NOTE(review): ReLU clamps negative scores to zero before the loss.
        # Kept unchanged here; consider returning linear_output directly.
        final_layer = self.relu(linear_output)
        return final_layer
def train(model, train_data, val_data, learning_rate, epochs):
    """Fine-tune *model* and print loss and accuracy after every epoch.

    Args:
        model: Classifier called as ``model(input_id, mask)``. It must
            already be on the device to train on; batches are moved to
            wherever its parameters live.
        train_data: DataFrame used for the gradient updates.
        val_data: DataFrame evaluated after every epoch.
        learning_rate: Learning rate passed to Adam.
        epochs: Number of passes over ``train_data``.
    """
    train_set, val_set = Dataset(train_data), Dataset(val_data)
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=2)

    # Follow the model's device instead of hard-coding one, so the inputs
    # always land where the weights are.
    device = next(model.parameters()).device
    criterion = nn.CrossEntropyLoss().to(device)
    # torch.optim.Adam is reached through the already-imported torch module.
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Guard the per-sample averages against an empty frame.
    n_train = max(len(train_data), 1)
    n_val = max(len(val_data), 1)

    def run_batch(batch_input, batch_label):
        """Run one batch; return (mean loss, correct count, batch size)."""
        batch_label = batch_label.to(device).long()
        # Each tokenized sample carries an extra dimension of size 1;
        # drop it from both tensors so they have the same shape.
        mask = batch_input['attention_mask'].squeeze(1).to(device)
        input_id = batch_input['input_ids'].squeeze(1).to(device)
        output = model(input_id, mask)
        loss = criterion(output, batch_label)
        correct = (output.argmax(dim=1) == batch_label).sum().item()
        return loss, correct, batch_label.size(0)

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0
        model.train()  # enable dropout for the training pass
        for train_input, train_label in tqdm(train_dataloader):
            batch_loss, correct, batch_size = run_batch(train_input, train_label)
            # The criterion returns the batch mean; weight it by the batch
            # size so the epoch total divides cleanly by the sample count.
            total_loss_train += batch_loss.item() * batch_size
            total_acc_train += correct
            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0
        model.eval()  # disable dropout while validating
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                batch_loss, correct, batch_size = run_batch(val_input, val_label)
                total_loss_val += batch_loss.item() * batch_size
                total_acc_val += correct

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / n_train: .3f} '
            f'| Train Accuracy: {total_acc_train / n_train: .3f} '
            f'| Val Loss: {total_loss_val / n_val: .3f} '
            f'| Val Accuracy: {total_acc_val / n_val: .3f}')
# Hyperparameters for the fine-tuning run.
LR = 1e-6
EPOCHS = 1

# Build the classifier, move it to the "mps" device, then train it on the
# training frame while validating on the validation frame.
model = BertClassifier()
train(
    model=model.to(torch.device("mps")),
    train_data=df_train,
    val_data=df_val,
    learning_rate=LR,
    epochs=EPOCHS,
)
Code Sections Explained
This code is not in the program, but it shows how the tokenizer encodes text.
# Standalone demo: encode one sentence and print the three fields the
# tokenizer returns.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
example_text = 'some text to encode'
bert_input = tokenizer(
    example_text,
    padding='max_length',
    max_length=10,
    truncation=True,
    return_tensors="pt",
)
for field in ('input_ids', 'token_type_ids', 'attention_mask'):
    print(bert_input[field])
Then it responds with:
tensor([[ 101, 1199, 3087, 1106, 4035, 13775, 102, 0, 0, 0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])
Reading from top to bottom, these are:
- input ids—numbers representing each token. Here the last 3 are padding (0), since the text encodes to 7 tokens and we requested max_length = 10
- token type ids—shows which sequence a token belongs to.
- attention mask—if 1 it is a word, [CLS] or [SEP]. 0 means padding.
In this example we told the tokenizer to return PyTorch tensors (pt). You can also request TensorFlow tensors (tf).
Dataset Class
We need a Dataset class that extends torch.utils.data.Dataset.
# Excerpt from the main script: the labelled texts are loaded into a DataFrame.
df=pd.DataFrame(labels_with_text,columns=['category','text'])
# Excerpt from train(): the training and validation frames are each wrapped
# in a Dataset (train_data and val_data are train()'s parameters).
train, val = Dataset(train_data), Dataset(val_data)
class Dataset(torch.utils.data.Dataset):
    """Tokenized documents paired with integer class ids.

    Each row of the input frame becomes one sample: the ``text`` column is
    tokenized with the module-level ``tokenizer`` and the ``category``
    column is mapped to an integer through the module-level ``labels`` dict.
    """

    def __init__(self, df, max_length=512):
        """Tokenize every text in *df* up front.

        Args:
            df: DataFrame with a ``category`` column whose values are keys
                of ``labels`` and a ``text`` column of strings.
            max_length: Length every text is padded or truncated to.
        """
        self.labels = [labels[label] for label in df['category']]
        # return_tensors="pt" makes each field a tensor with a leading
        # dimension of 1; the training loop squeezes it away again.
        self.texts = [
            tokenizer(text, padding='max_length', max_length=max_length,
                      truncation=True, return_tensors="pt")
            for text in df['text']
        ]

    def classes(self):
        """Return the list of integer class ids, one per sample."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the class id at *idx* wrapped in a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output stored at *idx*."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, label)`` pair at *idx*."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y
The model wraps the pre-trained BERT encoder (its self-attention and feed-forward neural-network layers) and adds a dropout layer and a linear classification layer on top. It is trained on the tokenized text of the training set and checked against the validation set:
from torch import nn
from transformers import BertModel
class BertClassifier(nn.Module):
    """Pre-trained BERT encoder followed by dropout and a linear head."""

    def __init__(self, dropout=0.5, num_classes=2):
        """Load the pre-trained encoder and build the classification head.

        Args:
            dropout: Drop probability applied to BERT's pooled output.
            num_classes: Number of output scores. Defaults to 2, matching
                the two categories ("contract" / "not a contract") this
                example classifies.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        # Read the encoder width from the loaded model's config rather
        # than hard-coding 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return one score per class for each sequence in the batch.

        Args:
            input_id: Token ids, passed to BERT as ``input_ids``.
            mask: Attention mask, passed to BERT as ``attention_mask``.
        """
        # With return_dict=False BERT returns a tuple; only its second
        # item (the pooled output) is used here.
        _, pooled_output = self.bert(
            input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # NOTE(review): ReLU clamps negative scores to zero before the loss.
        # Kept unchanged here; consider returning linear_output directly.
        final_layer = self.relu(linear_output)
        return final_layer
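The forward pass returns the final layer's output: one score per output class, from which the predicted category (contract or not a contract) is chosen by taking the highest score.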
Train the Model
The next code trains the model created with model = BertClassifier().
Note that this code uses the mps device because it was run on a Mac, where the GPU is accessed through PyTorch's MPS (Metal Performance Shaders) backend. On other hardware, use cuda or cpu instead.
from torch.optim import Adam
from tqdm import tqdm
def train(model, train_data, val_data, learning_rate, epochs):
    """Fine-tune *model* and print loss and accuracy after every epoch.

    Args:
        model: Classifier called as ``model(input_id, mask)``. It must
            already be on the device to train on; batches are moved to
            wherever its parameters live.
        train_data: DataFrame used for the gradient updates.
        val_data: DataFrame evaluated after every epoch.
        learning_rate: Learning rate passed to Adam.
        epochs: Number of passes over ``train_data``.
    """
    train_set, val_set = Dataset(train_data), Dataset(val_data)
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=2)

    # Follow the model's device instead of hard-coding one, so the inputs
    # always land where the weights are.
    device = next(model.parameters()).device
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    # Guard the per-sample averages against an empty frame.
    n_train = max(len(train_data), 1)
    n_val = max(len(val_data), 1)

    def run_batch(batch_input, batch_label):
        """Run one batch; return (mean loss, correct count, batch size)."""
        batch_label = batch_label.to(device).long()
        # Each tokenized sample carries an extra dimension of size 1;
        # drop it from both tensors so they have the same shape.
        mask = batch_input['attention_mask'].squeeze(1).to(device)
        input_id = batch_input['input_ids'].squeeze(1).to(device)
        output = model(input_id, mask)
        loss = criterion(output, batch_label)
        correct = (output.argmax(dim=1) == batch_label).sum().item()
        return loss, correct, batch_label.size(0)

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0
        model.train()  # enable dropout for the training pass
        for train_input, train_label in tqdm(train_dataloader):
            batch_loss, correct, batch_size = run_batch(train_input, train_label)
            # The criterion returns the batch mean; weight it by the batch
            # size so the epoch total divides cleanly by the sample count.
            total_loss_train += batch_loss.item() * batch_size
            total_acc_train += correct
            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0
        model.eval()  # disable dropout while validating
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                batch_loss, correct, batch_size = run_batch(val_input, val_label)
                total_loss_val += batch_loss.item() * batch_size
                total_acc_val += correct

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / n_train: .3f} '
            f'| Train Accuracy: {total_acc_train / n_train: .3f} '
            f'| Val Loss: {total_loss_val / n_val: .3f} '
            f'| Val Accuracy: {total_acc_val / n_val: .3f}')
# Hyperparameters for the fine-tuning run.
LR = 1e-6
EPOCHS = 2

# Build the classifier, move it to the "mps" device, then train it on the
# training frame while validating on the validation frame.
model = BertClassifier()
train(
    model=model.to(torch.device("mps")),
    train_data=df_train,
    val_data=df_val,
    learning_rate=LR,
    epochs=EPOCHS,
)
Evaluate the Model
While we did not code that here, you can identify which tokens in the text contribute most to a classification by looking at gradients. This is useful because the problem with all neural networks is that you cannot easily inspect the weights and coefficients: there are too many of them, in multiple dimensions. So you can't easily explain why the model made the prediction it did.
We want to answer:
- why did the model classify the text as it did?
- what words in the text are most relevant for this classification?
- how accurate is our model?
References
https://huggingface.co/docs/transformers/main_classes/trainer#trainingarguments
https://huggingface.co/docs/transformers/model_doc/bert#bertforsequenceclassification
https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial
https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f
Updated over 1 year ago