BERT PyTorch Code Example

Setup

  1. Download the Avro data file. Law Insider will copy it to Google Cloud or other storage for you. The file contains the contracts and other documents you uploaded to a Law Insider Private Repository, exported in the Avro file format.

  2. Install the required Python packages:

  • avro
  • pandas
  • beautifulsoup4
  • transformers
  • numpy
  • torch
  • tqdm (used for the progress bars in the code below)
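
For example, with pip (any Python package manager works):

pip install avro pandas beautifulsoup4 transformers numpy torch tqdm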

Complete Python Code

import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter
import json
import pandas as pd
from bs4 import BeautifulSoup
from transformers import BertTokenizer
import numpy as np
import torch
from torch.optim import Adam
from tqdm import tqdm


# Define a function to convert HTML to plain text
def html_to_text(html):
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.get_text()
    return text.strip()

# Open the Avro file exported from Law Insider (change the path to wherever you downloaded it)
avro_reader = DataFileReader(
    open("/Users/walkerrowe/Documents/avro/auto-contractfinder-20210512..20211129-00000-of-00128", "rb"),
    DatumReader())

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

labels = {"contract":1,
          "not a contract":0
          }

labels_with_text = []

# Read the first 50 documents, convert each HTML body to plain text,
# and record whether the Avro record labels it as a contract.
cnt = 0
for document in tqdm(avro_reader):
    if cnt == 50:
        break
    cnt += 1
    document_text = html_to_text(document['body'])
    if document['labels'].get('contract'):
        labels_with_text.append(('contract', document_text))
    else:
        labels_with_text.append(('not a contract', document_text))


df = pd.DataFrame(labels_with_text, columns=['category', 'text'])
print(df)



# Shuffle the DataFrame and split it 80/10/10 into train, validation, and test sets
np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))



class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):
        # Map category names to numeric labels and tokenize every document up front
        self.labels = [labels[label] for label in df['category']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y


from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 2)  # two output classes: contract and not a contract
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer


def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

  
    device = torch.device("mps")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)
    criterion = criterion.to(device)

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):
                
               
                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
EPOCHS = 1
model = BertClassifier()
LR = 1e-6
              
train(model.to(torch.device("mps")), df_train, df_val, LR, EPOCHS)


Code Sections Explained

The following code is not part of the program; it just shows how the tokenizer encodes a piece of text.


from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

example_text = 'some text to encode'  
bert_input = tokenizer(example_text,padding='max_length', max_length = 10,  
                       truncation=True, return_tensors="pt")

print(bert_input['input_ids'])
print(bert_input['token_type_ids'])
print(bert_input['attention_mask'])

It prints:

tensor([[  101,  1199,  3087,  1106,  4035, 13775,   102,     0,     0,     0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])

Reading from top to bottom, these are:

  • input ids—numbers representing each token. Here the last three positions are zero padding because the encoded text only needs 7 of the 10 positions we requested with max_length = 10.
  • token type ids—shows which sequence each token belongs to (this matters when you encode a pair of sequences).
  • attention mask—1 means the position holds a real token ([CLS], [SEP], or a word piece); 0 means padding.

In this example we told the tokenizer to return PyTorch tensors (return_tensors="pt"). You can also request TensorFlow tensors with "tf".
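
A quick way to see what those numbers stand for (not part of the program, just a check) is to map the ids back to word pieces:

print(tokenizer.convert_ids_to_tokens(bert_input['input_ids'][0].tolist()))

The list starts with [CLS], ends the real text with [SEP], and fills the remaining positions with [PAD], which lines up with the zeros in the attention mask.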

Dataset Class

We need a Dataset class that extends torch.utils.data.Dataset. It wraps the DataFrame we built earlier from the Avro records:

df = pd.DataFrame(labels_with_text, columns=['category', 'text'])

Inside the train() function (shown further down) the training and validation splits are wrapped like this:

train, val = Dataset(train_data), Dataset(val_data)

The class converts each category name to its numeric label and tokenizes every document up front:

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):
        # Map category names to numeric labels and tokenize every document up front
        self.labels = [labels[label] for label in df['category']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['text']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y
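
To see what the class produces (a small check we are adding here, not part of the original program), look at one item. Each item is the tokenizer output plus a numeric label; the input_ids tensor for a single document has shape [1, 512], so a DataLoader batch has shape [batch_size, 1, 512]; that extra dimension is why the training loop calls .squeeze(1) on input_ids.

sample = Dataset(df_train)
texts, label = sample[0]
print(texts['input_ids'].shape)       # torch.Size([1, 512])
print(texts['attention_mask'].shape)  # torch.Size([1, 512])
print(label)                          # 0 or 1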

The classifier wraps the pretrained BERT model (its embedding, self-attention, and feed-forward layers process the tokenized text) and adds a dropout layer plus a linear classification layer on top of BERT's pooled output:

from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 2)  # two output classes: contract and not a contract
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

The forward pass therefore returns two scores per document, one for each category: contract and not a contract.
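
Once the model is trained (next section), those two scores can be mapped back to a category name. A rough sketch we are adding for illustration, not part of the original program:

def classify(model, text, device):
    # Tokenize one document the same way the Dataset class does
    enc = tokenizer(text, padding='max_length', max_length=512,
                    truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(enc['input_ids'].to(device), enc['attention_mask'].to(device))
    # Index 1 is "contract", index 0 is "not a contract" (see the labels dictionary)
    return 'contract' if output.argmax(dim=1).item() == 1 else 'not a contract'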

Train the Model

The next block of code creates the model (model = BertClassifier()) and trains it.

Note that this code uses the mps device because it was run on a Mac; on Apple silicon the GPU is reached through PyTorch's Metal Performance Shaders (MPS) backend.
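
If you are running somewhere else, cuda or plain cpu works instead. A portable way to pick the device (a small addition, not in the original code):

device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)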

from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

  
    device = torch.device("mps")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)
    criterion = criterion.to(device)

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):
                
               
                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
EPOCHS = 1
model = BertClassifier()
LR = 1e-6
              
train(model.to(torch.device("mps")), df_train, df_val, LR, EPOCHS)

Evaluate the Model

While we did not code it here, you can identify which tokens in the text contribute most to a classification by looking at gradients. This is useful because the problem with all neural networks is that you cannot easily inspect the weights and coefficients, since there are far too many of them across many dimensions, so you cannot easily explain why the model made the prediction it did.
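
A rough sketch of that idea (added here for illustration, not part of the original program; it reuses the model, tokenizer, and device from above): score each token by the size of the gradient of the winning class with respect to that token's embedding.

def top_tokens(model, text, device, k=10):
    model.eval()
    enc = tokenizer(text, padding='max_length', max_length=512,
                    truncation=True, return_tensors="pt")
    input_ids = enc['input_ids'].to(device)
    mask = enc['attention_mask'].to(device)

    # Feed BERT the word embeddings directly so we can ask for their gradient
    embeds = model.bert.get_input_embeddings()(input_ids)
    embeds.retain_grad()
    _, pooled = model.bert(inputs_embeds=embeds, attention_mask=mask,
                           return_dict=False)
    output = model.relu(model.linear(model.dropout(pooled)))

    # Backpropagate from the winning class back to the embeddings
    output[0, output.argmax(dim=1)].backward()

    # One score per position: the norm of its embedding gradient
    scores = embeds.grad.norm(dim=-1).squeeze(0)
    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0).tolist())
    ranked = sorted(zip(tokens, scores.tolist()), key=lambda p: -p[1])
    return [t for t, s in ranked if t not in ('[PAD]', '[CLS]', '[SEP]')][:k]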

We want to answer:

  • Why did the model classify the text as it did?
  • What words in the text are most relevant to this classification?
  • How accurate is our model?
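
The last question is easy to answer with the held-out test split (df_test) created earlier. A minimal sketch, added here for illustration and reusing the Dataset class from above:

def evaluate(model, test_data, device):
    test_dataloader = torch.utils.data.DataLoader(Dataset(test_data), batch_size=2)
    model.eval()
    total_acc_test = 0
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)
            output = model(input_id, mask)
            total_acc_test += (output.argmax(dim=1) == test_label).sum().item()
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

evaluate(model, df_test, torch.device("mps"))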

References

https://huggingface.co/docs/transformers/main_classes/trainer#trainingarguments

https://huggingface.co/docs/transformers/model_doc/bert#bertforsequenceclassification

https://neptune.ai/blog/how-to-code-bert-using-pytorch-tutorial

https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f