t_wsd

In [ ]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#from tqdm import tqdm  
from tqdm.notebook import tqdm # for progress bars in notebooks
from random import shuffle
import os
import sys
import time

In [ ]:

Naming conventions¶

sentences are already segmented into words (with a rule-based tokenizer)
but are not segmented into subwords yet
we use “word” or “w” for the tokens obtained after pre-segmentation
and “token” for units obtained after BERT-like tokenization (BPE ou WordPiece etc…)
in variable names, we distinguish
integer identifiers for symbols (for the token vocabulary, the frame vocabulary …)
versus the rank of a unit (either word or token) within a sequence
tid => token identifier
trk / wrk => token rank / rank of a word in a sequence
tg => “target”, so
tg_wrk = rank of the target word
tg_trk = rank of the first token of the target

In [ ]:

if torch.cuda.is_available():
    # objet torch.device          
    DEVICE = torch.device("cuda")
        
    print('There are %d GPU(s) available.' % torch.cuda.device_count())    
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print("We will use GPU %d (%s) of compute capability %d.%d with "
          "%.2fGb total memory.\n" % 
          (device_id,
          gpu_properties.name,
          gpu_properties.major,
          gpu_properties.minor,
          gpu_properties.total_memory / 1e9))

else:
    print('No GPU available, using the CPU instead.')
    DEVICE = torch.device("cpu")

There are 1 GPU(s) available.
We will use GPU 0 (Tesla P100-PCIE-16GB) of compute capability 6.0 with 17.07Gb total memory.

“ASFALDA” dataset¶

A French FrameNet, comprisong about 16000 annotated targets, into about 100 distinct frames, along with their semantic role annotations.

Fetching the data¶

In [ ]:

if not os.path.exists('./asfalda_data_for_wsd/'):
  # shell commands can be run using !
  !pip install wget
  import wget
  
  # The URL for the dataset zip file.
  url = 'http://www.linguist.univ-paris-diderot.fr/~mcandito/divers/asfalda_data_for_wsd.tgz'

  
  if not os.path.exists('./asfalda_data_for_wsd.tgz'):
    print('Downloading dataset')
    wget.download(url, './asfalda_data_for_wsd.tgz')
    !tar zxf asfalda_data_for_wsd.tgz

Data loading method¶

In [ ]:

def load_asfalda_data(gold_data_file, split_info_file):
    """
        Inputs: - asfalda gold data file
                - file indicating the corpus type for each sentence id

        Returns 3 dictionaries (whose keys are corpus types (train/dev/test/val))
        - sentences
        - list of rank of target word in each sentence
        - gold labels

        Example:
        sentences['train'] = [['Le', 'code', 'comprend', 'des', 'erreurs','.'],
                              ['Comprends', '-tu', '?']]
         # the targets are the 3rd and first words                     
        tg_wrks['train'] = [2, 0]
        tg_lemmas['train'] = ['comprendre', 'comprendre']
        labels = ['frame1', 'frame2']
                                
    """
    # load the usual split into train / dev / test
    s = open(split_info_file)
    lines = [ l[:-1].split('\t') for l in s.readlines() ]
    split_info_dic = { line[0]:line[1] for line in lines }

    # dev / train / test sentences
    sentences = {'dev':[], 'train':[], 'test':[]}
    # the word ranks (wrk) for the target words
    tg_wrks = {'dev':[], 'train':[], 'test':[]}
    # target lemmas
    tg_lemmas = {'dev':[], 'train':[], 'test':[]}
    # the labels of targets (= frames)
    labels = {'dev':[], 'train':[], 'test':[]}
    tg_poss = {'dev':[], 'train':[], 'test':[]}
    max_sent_len = {'dev':0, 'train':0, 'test':0}
    max_tg_wrk = {'dev':0, 'train':0, 'test':0}

    stream = open(gold_data_file)
    for line in stream.readlines():
        if line.startswith('#'):
            continue
        line = line.strip()
        (sentid, tg_wrk, frame_name, tg_lemma, tg_pos, rest) = line.split('\t',5)
        # role annotation is ignored
        # sentences are pre-segmented into space-separated words
        # => we split, to use the is_split_into_words=True mode of the FlauBERT tokenizer
        sentence = rest.split("\t")[-1].split(' ')
        part = split_info_dic[sentid]
        tg_wrk = int(tg_wrk)

        l = len(sentence)
        sentences[part].append(sentence)
        labels[part].append(frame_name)
        tg_wrks[part].append(tg_wrk)
        tg_lemmas[part].append(tg_lemma)
        tg_poss[part].append(tg_pos)
        if max_sent_len[part] < l: 
            max_sent_len[part] = l 
        if max_tg_wrk[part] < tg_wrk: 
            max_tg_wrk[part] = tg_wrk 
    print("Max sentence length:", max_sent_len)
    print("Max target rank (in words):", max_tg_wrk)
    
    return sentences, tg_wrks, tg_lemmas, labels,tg_poss

Data loading and defining ids for labels¶

In [ ]:

MAX_LENGTH = 100
gold_data_file = './asfalda_data_for_wsd/sequoiaftb.asfalda_1_3.gold.uniq.nofullant.txt'

# usual split train / dev / test for this corpus
split_info_file = './asfalda_data_for_wsd/sequoiaftb_split_info'

sentences, tg_wrks, tg_lemmas, label_strs,tg_pos = load_asfalda_data(gold_data_file,split_info_file)


for p in sentences.keys():
    avgl = sum([len(s) for s in sentences[p]])/len(sentences[p])
    print("%s : %d sentences, average lentgh=%3.2f" 
          %(p, len(sentences[p]), avgl))

# creating label ids for frames seen in training set
i2label = list(set(label_strs['train']))
# id for unknown frame (for dev and test)
i2label.append('*UNK*')

label2i = {x:i for i,x in enumerate(i2label)}
# id of special frame "Other_sense"
i_OTHER_SENSE = label2i['Other_sense']

# sequence of gold labels 
# for each sub-corpus (key = dev/train/test)
labels = {}
for p in label_strs.keys():
    labels[p] = [label2i[x] if x in label2i else i2label[-1] for x in label_strs[p]]

i2pos = list(set(tg_pos['train']))
i2pos.append('*UNK*')
pos2i = {x:i for i,x in enumerate(i2pos)}

Max sentence length: {'dev': 115, 'train': 271, 'test': 140}
Max target rank (in words): {'dev': 96, 'train': 267, 'test': 115}
dev : 2688 sentences, average lentgh=38.03
train : 18657 sentences, average lentgh=38.99
test : 3447 sentences, average lentgh=38.45

Data encoding¶

FlauBERT tokenization¶

We use the FlauBERT model, using the Huggingface “transformers” module.

In [ ]:

try:
  import transformers
except ImportError:
  !pip install transformers
  
from transformers import AutoModel, AutoTokenizer, AutoConfig

In [ ]:

# We choose the FlauBERT model

# we load tokenizer and config for now
flaubert_tokenizer = AutoTokenizer.from_pretrained("flaubert/flaubert_base_cased")
flaubert_config = AutoConfig.from_pretrained("flaubert/flaubert_base_cased")

ENCODING¶

In [ ]:

'''
some target ranks (before or after BPE tokenization) are bigger than the max length of the tid_seq.
to solve this problem, some samples with out-of-range rank positions will be discarded, since this operation will mess up the original order, 
the encode function needs the original lemma and label lists and its outputs also will include new lemma and label lists 
'''
class WSDEncoder:
    def __init__(self, tokenizer, config):
        self.tokenizer = tokenizer
        self.config = config # to get indices of special tokens

    def encode(self, sentences, tg_wrks, lemmas = None,labels = None,max_length=100, verbose=False, is_split_into_words=True):
 
      if is_split_into_words:
        sentences_ = sentences
        sentences_join = [' '.join(sentence) for sentence in sentences]
        tid_seqs_ = flaubert_tokenizer(sentences_join,truncation=True,padding=True,max_length=max_length,add_special_tokens=True)['input_ids']
      else:
        sentences_ = [sentence.split(' ') for sentence in sentences] 
        tid_seqs_ = flaubert_tokenizer(sentences,truncation=True,padding=True,max_length=max_length,add_special_tokens=True)['input_ids']
      first_trk_of_targets = []
      tid_seqs = []
      tid_lemmas = []
      tid_labels = []
      discard_counter = 0
      discarded = ''
      if lemmas and labels:
        for sentence,wrk,seq,lem,lab in zip(sentences_,tg_wrks,tid_seqs_,lemmas,labels):
          target = sentence[wrk]
          if target[-1] == "'":
            target = target[:-1]
          encoded_word = flaubert_tokenizer.encode(target)[1:-1][0]
          if encoded_word in seq:
            encoded_word_index = seq.index(encoded_word)
            first_trk_of_targets.append(encoded_word_index)
            tid_seqs.append(seq)
            tid_lemmas.append(lem)
            tid_labels.append(lab)
          else :
            discard_counter += 1
            discarded = discarded + ' '  + sentence[wrk]
            sys.stdout.write('\rRank position is bigger than max length,this sample will be discarded : ' + discarded)
            sys.stdout.flush()

# the encode function did not tokenize word per word, instead, it tokenized the whole sentence.
# So, in order to get the target rank after tokenization, only the target word is tokenized to get its encoded id 
# and seq.index function is used to get the correct target rank in the tokenized sentence


      elif lemmas == None and labels == None:
          for sentence,wrk,seq in zip(sentences_,tg_wrks,tid_seqs_):
            target = sentence[wrk]
# some target word like " d' " is end with " ' ",which the sentence tokenizer will treat it as two tokens : " d "  and " ' "  
# but the word tokenizer treat it as a single word         
            if target[-1] == "'": 
              target = target[:-1]
            encoded_word = flaubert_tokenizer.encode(target)[1:-1][0]
            if encoded_word in seq:
              encoded_word_index = seq.index(encoded_word)
              first_trk_of_targets.append(encoded_word_index)
              tid_seqs.append(seq)
            else :
              discard_counter += 1
              print('\rRank position is bigger than max length,this sample will be discarded')
              print(sentence[wrk])
              print('==============================')          
      print(f"\n{discard_counter}/{len(sentences_)} samples has been discarded ")
      if labels and lemmas :
        assert len(tid_seqs) == len(first_trk_of_targets) == len(tid_lemmas) == len(tid_labels)
        result = (tid_seqs,first_trk_of_targets,tid_lemmas,tid_labels)
      else:
        assert len(tid_seqs) == len(first_trk_of_targets) 
        result = (tid_seqs,first_trk_of_targets)
      return result

Encoding test¶

In [ ]:

encoder = WSDEncoder(flaubert_tokenizer, flaubert_config)

# test encoder
test_sents = ["Conséquemment , nous comprendrions .",
              "Le code comprend des erreurs .",
            "J' essaie de comprendre les transformers .",  
            "Il n' a pas bien compris le code !"]
# target words are the occurrences of "comprendre"
test_tg_wrks = [3, 2, 3, 5]
max_length=10

tid_seqs, first_trk_of_targets = encoder.encode(test_sents, test_tg_wrks, max_length= 10, verbose=True,is_split_into_words=False)

for tid_seq, ft in zip(tid_seqs, first_trk_of_targets):
    print("Len = %d target token rank = %d tid_seq = %s" % (len(tid_seq), ft, str(tid_seq)))
    print(flaubert_tokenizer.convert_ids_to_tokens(tid_seq)[ft])
#flaubert_tokenizer.convert_ids_to_tokens(tid_seqs[0])

0/4 samples has been discarded 
Len = 10 target token rank = 6 tid_seq = [0, 1198, 17358, 13299, 14, 65, 18719, 1999, 19614, 1]
compr
Len = 10 target token rank = 3 tid_seq = [0, 55, 1138, 976, 23, 3842, 16, 1, 2, 2]
comprend</w>
Len = 10 target token rank = 5 tid_seq = [0, 2684, 68, 5213, 15, 965, 22, 14659, 896, 1]
comprendre</w>
Len = 10 target token rank = 7 tid_seq = [0, 59, 261, 68, 34, 42, 83, 681, 20, 1]
compris</w>

Full encoding and batch production¶

In [ ]:

import random
class WSDData:
    def __init__(self, corpus_type, sentences, tg_wrks, tg_lemmas, labels, encoder, max_length=100):
        """
        Inputs:
        - corpus type string (train/dev/test/val)
        - list of sentences (each sentence = list of word strings)
        - list of target word ranks : one per sentence
        - list of gold label id
        - encoder = instance of WSDEncoder

        - max_length = size of encoded sequences, in nb of bert tokens 
                      (padded / truncated via encoder.encode)
    
        Encodes all the data using the relevant identifiers
        """
        
        self.corpus_type = corpus_type # train / dev / test / val
        self.size = len(sentences)
        self.encoder = encoder

        
        self.sentences = sentences # list of list of word strings
        
        
        tid_seqs, tg_trks,lemmas,labels = encoder.encode(sentences, tg_wrks,tg_lemmas,labels, max_length = max_length ,is_split_into_words=True)

        self.tid_seqs = tid_seqs  # sequences of token ids
        self.tg_trks = tg_trks    # target token ranks
        self.tg_lemmas = lemmas
        self.labels = labels       # gold label ids

    def shuffle(self):
      seq_tg_pairs = list(zip(self.tid_seqs,self.tg_trks,self.tg_lemmas,self.labels))
      random.shuffle(seq_tg_pairs)
      ts,tt,tle,tla = zip(*seq_tg_pairs)
      self.tid_seqs,self.tg_trks,self.tg_lemmas,self.labels= list(ts),list(tt),list(tle),list(tla)
      assert len(self.tid_seqs) == len(self.tg_trks) == len(self.tg_lemmas) == len(self.labels)
      """
      Rearranges all the data in a new random order
      (sentences, tg_lemmas, tg_trks, tid_seqs, labels)

      NB: ** original order is lost **
      """

    # production of a batch
    def make_batches(self, batch_size, shuffle_data=False):
      assert len(self.tid_seqs) == len(self.tg_trks) == len(self.tg_lemmas) == len(self.labels)
      if shuffle_data:
        self.shuffle()
      if batch_size > len(self.tid_seqs):
        raise ValueError('Batch size is bigger than data size!!')
      for x in range(0,len(self.tid_seqs),batch_size):
        yield (self.tid_seqs[x:x+batch_size],self.tg_trks[x:x+batch_size],self.tg_lemmas[x:x+batch_size],self.labels[x:x+batch_size])

In [ ]:

MAX_LENGTH = 100
wsd_data = {}
# key = part of the split corpus (train/test/dev)
for p in sentences.keys():
    print("Encoding part %s ..." % p)
    wsd_data[p] = WSDData(p, sentences[p], tg_wrks[p], tg_lemmas[p], labels[p], 
                          encoder, max_length=MAX_LENGTH)
    # we check that encoding provides the right lengths
    for i, s in enumerate(wsd_data[p].tid_seqs):
        if len(s) != MAX_LENGTH:
            print("Size bug:", i, s)

Encoding part dev ...
Rank position is bigger than max length,this sample will be discarded :  produit considérés réalisées
3/2688 samples has been discarded 
Encoding part train ...
Rank position is bigger than max length,this sample will be discarded :  produit résultant concevez dites opposition faire semble impulsion sait applaudir conviction coûtent ouverte reconnue vu vu contraints accusent créations escomptées Paye laissant répondent conditions laissent contraintes dites entamée reprenant déclinait interprétation assureront répondront écrit réclamations régler demandes perdre raisons fonctions invité sentiment vu ordre accréditer idée conséquences refusa toucher soupçonnant fonctions vu anathème prononcer discours
55/18657 samples has been discarded 
Encoding part test ...
Rank position is bigger than max length,this sample will be discarded :  assurer cité céder ventes fondent favorables décidées attirer
8/3447 samples has been discarded

WSDClassifier class: the network for WSD¶

Base architecture =

the FlauBERT model
plus linear layer + softmax

The network

In [ ]:

flaubert_model = AutoModel.from_pretrained("flaubert/flaubert_base_cased", return_dict=True)

In [ ]:

class WSDClassifier(nn.Module):

    def __init__(self, num_labels, device, bert_model, bert_config,freeze_bert = True,MLP = False):
        super(WSDClassifier, self).__init__()

        self.device = device
                
        # the full *BERT*-like model
        # the .to(device) triggers the copy towards the relevant device 
        # (possibly a GPU)
        self.bert_layer = bert_model.to(device)
        # config will allow to get the hidden vectors' size
        self.bert_config = bert_config
        self.num_labels = num_labels
        self.emb_size = self.bert_config.emb_dim
        
        self.distribution = nn.Sequential (*[nn.Linear(in_features = self.emb_size,out_features=self.num_labels),
                                            nn.LogSoftmax(dim = -1)
                                           
        ]).to(device)
        if MLP:
          self.distribution = nn.Sequential (*[nn.Linear(in_features = self.emb_size,out_features=50),
                                            nn.ReLU(),
                                            nn.Linear(in_features = 50,out_features=50),
                                            nn.ReLU(),
                                            nn.Linear(in_features = 50,out_features=self.num_labels),
                                            nn.LogSoftmax(dim = -1)]).to(device)
        
        if freeze_bert:
          for param in self.bert_layer.parameters():
            param.requires_grad = False

    # make a mask matrix (batch_size,nb_class)
    # where only the seen in train frames are 0s and the not seen in train frame are 1e-45 (or -inf)
    def make_mask(self,nb_class,lemmas,seen_in_X):
      batch_size = len(lemmas)
      zeros = torch.zeros(batch_size,nb_class,device = self.device,requires_grad=False)
      zeros = zeros + 1e-45 #(-1)*float('inf')
      for i in range(batch_size):
        seen = seen_in_X[lemmas[i]]
        for j in seen:
          zeros[i][j] = 0
      return zeros




    def forward(self, b_tid_seq, b_tg_trk,lemmas = None,seen_in_X = None):
      bert_out = self.bert_layer(b_tid_seq,return_dict =True).last_hidden_state
      #b_tg_trk = torch.tensor(b_tg_trk, device=self.device)
      target = bert_out[torch.arange(bert_out.size(0)),b_tg_trk]
      out = self.distribution(target)
      if lemmas and seen_in_X:
    # if lemmas and seen_in_X is provided,mask the output
    # so only the seen in train frames position has predicted valued
    # other position are -inf or some very extreme negative values
        mask = self.make_mask(self.num_labels,lemmas,seen_in_X)
        #l1_norm = (-1*(torch.norm(out+mask,p=1, dim=-1))).view(1,-1)
        exp = torch.exp(out+mask) # turn it back to the normal probability to calculate the l1 norm
        l1_norm = torch.sum(exp,dim = -1).view(1,-1) 
        out = ((exp)/(l1_norm.unsqueeze(2))).squeeze() # use the l1 norm to replace an another log softmax for re-normalization
        out = torch.log(out)
        #out = self.log_softmax(out+mask)

      return out
    """
    Inputs: (all are tensors, on the relevant device)
        - a batch of sentences = a batch of token id sequences 
          (as output in 'input_ids' member of tokenizer output)
        - a batch of target token rank = for each of the sentences, 
          the rank of first token of the target word to disambiguate

    Output: log_softmax scores for the whole batch (batch_size x num_labels)
    """


            
    def run_on_dataset(self, wsd_data, return_loss = False,batch_size=32):
      pred_labels = []
      gold = []
      loss  = []
      self.eval()
      with torch.no_grad():
        for b_tid_seqs, b_tg_trks,_,b_labels in wsd_data.make_batches(batch_size, shuffle_data=False):
          b_tid_seqs = torch.tensor(b_tid_seqs, device=self.device)
          b_tg_trks = torch.tensor(b_tg_trks, device=self.device)
          b_labels = torch.tensor(b_labels, device=self.device)
          log_probs = self.forward(b_tid_seqs, b_tg_trks)
          pred_label = torch.argmax(log_probs,dim = -1)
          pred_labels.append(pred_label)
          gold.append(b_labels)
          if return_loss:
            loss.append(torch.nn.functional.nll_loss(log_probs,b_labels).item())
            
      if return_loss:
        return pred_labels,gold,loss
      else:
        return pred_labels,gold

      """
        Run classifier on wsd_data and compute accuracy
        Inputs = 
         - wsd_data (WSDDataset instance)
         - batch_size
        Returns:
         - list of predicted label ids
      """


    def evaluate(self,wsd_data,return_loss = False,batch_size=32):
        """ returns accuracy, nb_correct, nb_total """
        nb_correct = 0
        nb_total = 0
        result = self.run_on_dataset(wsd_data,return_loss ,batch_size)
        for p,g in zip(result[0],result[1]):
          nb_correct += torch.sum(p==g)
          nb_total += len(p)
        if return_loss:
          return float(nb_correct/nb_total),float(len(result[2])/sum(result[2]))
        else:
          return float(nb_correct/nb_total)

In [ ]:

# an instance of WSDClassifier
num_labels = len(i2label)
DEVICE = 'cuda'
classifier = WSDClassifier(num_labels, DEVICE, flaubert_model, flaubert_config)

# uncomment to see the huge nb of parameters ...
#for name, param in classifier.named_parameters():
   #print("PARAM named %s, of shape %s" % (name, str(param.shape)))
   #print(param)

Test of forward propagation¶

In [ ]:

# useless to compute gradients when testing
with torch.no_grad():
    # toggle train mode off
    classifier.eval()
    for b_tid_seqs, b_tg_trks,b_lemmas,b_labels in wsd_data['train'].make_batches(32, shuffle_data=True):
        b_tid_seqs = torch.tensor(b_tid_seqs, device=classifier.device)
        b_tg_trks = torch.tensor(b_tg_trks, device=classifier.device)
        #log_probs = classifier(b_tid_seqs, b_tg_trks)
        mask = b_lemmas
        log_probs = classifier(b_tid_seqs,b_tg_trks)
        gold = b_labels[0] #.item()
        gold_lemma = b_lemmas[0]
        print(f'first ex : {gold_lemma}')
        print("GOLD LABEL of first ex %d ( = %s)" % (gold, i2label[gold]))
        print("LOG_PROBS before training: %s\n\n" % str(log_probs[0]))
        break

first ex : s'allier
GOLD LABEL of first ex 8 ( = Make_agreement_on_action)
LOG_PROBS before training: tensor([-6.6512, -7.3057, -5.7636, -4.9219, -6.8631, -7.7480, -5.4645, -6.6998,
        -3.6390, -6.2735, -3.7086, -7.2166, -6.8038, -5.0077, -4.6372, -6.7378,
        -6.0674, -6.4070, -4.7714, -5.9692, -5.2273, -6.4065, -4.0394, -2.2338,
        -4.8827, -5.9193, -5.4862, -4.4570, -5.4531, -5.9702, -4.5276, -4.4424,
        -4.9234, -5.7870, -5.0041, -6.1230, -4.4383, -4.7733, -5.6809, -4.4642,
        -2.4661, -6.7032, -5.8991, -6.1683, -6.8110, -3.3326, -3.8445, -7.9213,
        -4.0945, -4.6576, -7.1300, -5.5480, -4.3570, -4.3896, -4.5131, -4.0048,
        -6.3958, -4.8002, -5.4079, -7.4110, -6.5508, -4.2832, -3.7300, -5.5400,
        -6.1726, -4.8362, -4.4931, -6.2256, -5.8992, -7.2413, -5.7279, -4.9946,
        -5.8178, -5.2648, -5.9453, -7.0204, -4.9609, -4.3784, -6.1446, -4.5937,
        -4.4972, -5.9254, -6.3766, -4.2953, -5.9825, -5.0797, -4.8794, -6.1832,
        -4.5445, -5.6917, -6.2342, -3.2223, -5.3736, -5.9121, -6.9648, -2.9838,
        -4.6563, -5.3115, -5.5499, -5.9452, -5.0163, -7.4239, -5.3953, -6.8540,
        -4.8919, -2.6048, -6.5160], device='cuda:0')

Training¶

In [ ]:

# training

import time
BATCH_SIZE = 32
LR = 0.0005
nb_epoch = 30
epoch_id = 0

time_0 = time.time()
num_labels = len(i2label)
DEVICE = 'cuda'
classifier = WSDClassifier(num_labels, DEVICE, flaubert_model, flaubert_config,MLP = False)
# ATTENTION ,USE MLP WILL SOME TIME RAISE OUT OF CUDA MEMORY ERROR

loss_function = nn.NLLLoss()
# SGD is quicker (more convenient for debug phase)
#optimizer = optim.SGD(classifier.parameters(), lr=LR)
optimizer = optim.Adam(classifier.parameters(), lr=LR)

config_name = 'sequoiaftb.asfalda_1_3.wsd.lr' + 'Adam' + str(LR) + '_bs' + str(BATCH_SIZE)
out_model_file = './' + config_name + '.model'
out_log_file = './' + config_name + '.log'


# losses at each epoch (on train / on validation set)
train_losses = []
val_losses = []
min_val_loss = None
min_val_accuracy = 0
# to speed up during debug: train on dev
#train_data = wsd_data['dev'] # data['train']
train_data = wsd_data['train']
val_data = wsd_data['dev']
val_batch_size = len(wsd_data['dev'].tid_seqs)
lemmas = None





while(epoch_id < nb_epoch):
  classifier.train()
  time_0 = time.time()
  
  for b_tid_seqs, b_tg_trks,b_lemmas,b_labels in train_data.make_batches(BATCH_SIZE, shuffle_data=True):
        b_tid_seqs = torch.tensor(b_tid_seqs, device=classifier.device)
        b_tg_trks = torch.tensor(b_tg_trks, device=classifier.device)
        gold = torch.tensor(b_labels, device=classifier.device)
        lemmas = b_lemmas
        optimizer.zero_grad()
        log_probs = classifier(b_tid_seqs,b_tg_trks)
        loss = loss_function(log_probs,gold)
        train_losses.append(loss.item())

        loss.backward()
        optimizer.step()


    

  val_accuracy,avg_val_loss = classifier.evaluate(val_data,return_loss = True,batch_size = val_batch_size)
  print(f"-- END OF EPOCH {epoch_id}.")
  print(f"Average loss on training set: {sum(train_losses) / len(train_losses)}.")
  print(f"Average loss on dev set: {avg_val_loss}.")
  print(f"Average accuracy on dev set: {val_accuracy}.")
  if val_accuracy <= min_val_accuracy:  #early stopping and roll back
    print('-- Aaccuracy down,roll back and stop --')
    classifier.load_state_dict(torch.load('classifier_params.pt'))
    break
  else:
    min_val_accuracy = val_accuracy
    torch.save(classifier.state_dict(), 'classifier_params.pt')
    

  duration = time.time() - time_0
  print(f"{duration} s elapsed (i.e. {duration / (epoch_id + 1)} s/epoch)")
  epoch_loss = []
  epoch_id += 1

Evaluation¶

In [ ]:

test_data = wsd_data['dev']
batch_size = len(test_data.tid_seqs)
classifier.evaluate(test_data,batch_size)
print()

Out[ ]:

(0.0029795158188790083, 0.18642462971275686)

Lan's LAND

Using BERT embedding for word sense disambiguation

Naming conventions¶

“ASFALDA” dataset¶

Fetching the data¶

Data loading method¶

Data loading and defining ids for labels¶

Data encoding¶

FlauBERT tokenization¶

ENCODING¶

Encoding test¶

Full encoding and batch production¶

WSDClassifier class: the network for WSD¶

Test of forward propagation¶

Training¶

Evaluation¶