<a href="https://colab.research.google.com/github/napsternxg/TRAC2020/blob/master/notebooks/TRAC2020_savedmodel_inference_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%bash
pip install torch transformers



In [0]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
import torch
from pathlib import Path
from scipy.special import softmax
import numpy as np
import pandas as pd

from collections import defaultdict

In [0]:
lang, task, base_model = "ALL", "Sub-task C", "bert-base-multilingual-uncased"
# socialmediaie/TRAC2020_ALL_C_bert-base-multilingual-uncased
# Since all models are on https://huggingface.co/socialmediaie
# You need not require using the databank_model
# The databank model includes:
# the model predictions on TRAC datasets and eval metrics
# Tensorboard events file.
databank_model = False 

tar_file = Path(f"./{lang}_{task}_{base_model}.tar.gz")
if databank_model:
  databank_url = "https://databank.illinois.edu/datafiles/sk3r0/download"
  !mkdir -p "databank_model"
  if not tar_file.exists():
    !curl -JLO "{databank_url}"
  print(tar_file.exists(), tar_file.absolute())
  ! tar -xzf "./{tar_file}" -C "./databank_model"
  ! pwd

In [0]:
TASK_LABEL_IDS = {
    "Sub-task A": ["OAG", "NAG", "CAG"],
    "Sub-task B": ["GEN", "NGEN"],
    "Sub-task C": ["OAG-GEN", "OAG-NGEN", "NAG-GEN", "NAG-NGEN", "CAG-GEN", "CAG-NGEN"]
}

def get_model(lang, task, base_model, databank_model=False):
  # other option is hugging face library
  if databank_model:
      # Make sure you have downloaded the required model file from https://databank.illinois.edu/datasets/IDB-8882752
      # Unzip the file at some model_path (we are using: "databank_model")
      model_path = Path(f"./databank_model/{lang}/{task}/output/{base_model}/model")
      print(model_path)
      # Assuming you get the following type of structure inside "databank_model"
      # 'databank_model/ALL/Sub-task C/output/bert-base-multilingual-uncased/model'
      #_, lang, task, _, base_model, _ = model_path.parts
      tokenizer = AutoTokenizer.from_pretrained(base_model)
      model = AutoModelForSequenceClassification.from_pretrained(model_path)
  else:
      #lang, task, base_model = "ALL", "Sub-task C", "bert-base-multilingual-uncased"
      base_model = f"socialmediaie/TRAC2020_{lang}_{task.split()[-1]}_{base_model}"
      print(base_model)
      tokenizer = AutoTokenizer.from_pretrained(base_model)
      model = AutoModelForSequenceClassification.from_pretrained(base_model)
  return model, tokenizer


def compute_dict_marginals(pred_probs, task_labels):
  task_preds = defaultdict(lambda: defaultdict(float))
  for l, p in zip(task_labels, preds_probs):
    for i, lt in enumerate(l.split("-")):
      task_preds[f"task_{i}"][lt] += p
  task_preds["task_joint"] = dict(zip(task_labels, pred_probs))
  return task_preds

def add_marginals(df):
  df["task_0"] = df.sum(axis=1)
  df.loc["task_1"] = df.sum(axis=0)
  return df

def show_marginal_probs(pred_probs, task_labels):
  df_t = pd.DataFrame({
    "labels": task_labels,
    "probs": pred_probs
  }).assign(
      task_0=lambda x: x["labels"].str.split("-", expand=True)[0],
      task_1=lambda x: x["labels"].str.split("-", expand=True)[1]
  ).drop("labels", axis=1).pivot_table(index="task_0", columns="task_1", values="probs", aggfunc="first").pipe(add_marginals)
  return df_t

In [5]:
model, tokenizer = get_model(lang, task, base_model, databank_model=databank_model)
# For doing inference set model in eval mode
model.eval();

socialmediaie/TRAC2020_ALL_C_bert-base-multilingual-uncased


In [0]:
#sentence = "This is a good cat and this is a bad dog."
sentence = "What a vacuum minded witch, product of May be so called Ranga-Billa. Such mean people gets Bookers Award, Disgusting!"

In [0]:
# If you want to further fine-tune the model you can reset the model to model.train()
task_labels = TASK_LABEL_IDS[task]

processed_sentence = f"{tokenizer.cls_token} {sentence}"
tokens = tokenizer.tokenize(sentence)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
tokens_tensor = torch.tensor([indexed_tokens])

In [8]:
with torch.no_grad():
  logits, = model(tokens_tensor, labels=None)
logits

tensor([[ 4.4276,  0.9031,  0.1941, -1.8775, -0.0150, -1.7862]])

In [9]:
preds = logits.detach().cpu().numpy()
preds_probs = softmax(preds, axis=1)
preds = np.argmax(preds_probs, axis=1)
preds_labels = np.array(task_labels)[preds]
print(f"Predicted: {preds_labels[0]}")
print(f"Probabilities: ")
dict(zip(task_labels, preds_probs[0]))

Predicted: OAG-GEN
Probabilities: 


{'CAG-GEN': 0.011104056,
 'CAG-NGEN': 0.0018891948,
 'NAG-GEN': 0.013686359,
 'NAG-NGEN': 0.0017242465,
 'OAG-GEN': 0.9437853,
 'OAG-NGEN': 0.027810896}

In [10]:
compute_dict_marginals(preds_probs[0], task_labels)

defaultdict(<function __main__.compute_dict_marginals.<locals>.<lambda>>,
            {'task_0': defaultdict(float,
                         {'OAG': array([0.9437853 , 0.0278109 , 0.01368636, 0.00172425, 0.01110406,
                                 0.00188919], dtype=float32)}),
             'task_1': defaultdict(float,
                         {'GEN': array([0.9437853 , 0.0278109 , 0.01368636, 0.00172425, 0.01110406,
                                 0.00188919], dtype=float32)}),
             'task_joint': {'CAG-GEN': 0.011104056,
              'CAG-NGEN': 0.0018891948,
              'NAG-GEN': 0.013686359,
              'NAG-NGEN': 0.0017242465,
              'OAG-GEN': 0.9437853,
              'OAG-NGEN': 0.027810896}})

In [11]:
df_preds = show_marginal_probs(preds_probs[0], task_labels)
df_preds.style.background_gradient(cmap='viridis')

task_1,GEN,NGEN,task_0
task_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CAG,0.011104,0.001889,0.012993
NAG,0.013686,0.001724,0.015411
OAG,0.943785,0.027811,0.971596
task_1,0.968576,0.031424,1.0


In [12]:
pd.DataFrame({
    "labels": task_labels,
    "probs": preds_probs[0]
}).assign(
    task_0=lambda x: x["labels"].str.split("-", expand=True)[0],
    task_1=lambda x: x["labels"].str.split("-", expand=True)[1]
).drop("labels", axis=1).pivot_table(index="task_0", columns="task_1", values="probs", aggfunc="first").pipe(add_marginals)

task_1,GEN,NGEN,task_0
task_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CAG,0.011104,0.001889,0.012993
NAG,0.013686,0.001724,0.015411
OAG,0.943785,0.027811,0.971596
task_1,0.968576,0.031424,1.0
