import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#from tqdm import tqdm
from tqdm.notebook import tqdm # for progress bars in notebooks
from random import shuffle
import os
import sys
import time
Naming conventions¶
sentences are already segmented into words (with a rule-based tokenizer)
but are not segmented into subwords yet
we use “word” or “w” for the tokens obtained after pre-segmentation
and “token” for units obtained after BERT-like tokenization (BPE ou WordPiece etc…)
in variable names, we distinguish
integer identifiers for symbols (for the token vocabulary, the frame vocabulary …)
versus the rank of a unit (either word or token) within a sequence
tid => token identifier
trk / wrk => token rank / rank of a word in a sequence
tg => “target”, so
tg_wrk = rank of the target word
tg_trk = rank of the first token of the target
if torch.cuda.is_available():
# objet torch.device
DEVICE = torch.device("cuda")
print('There are %d GPU(s) available.' % torch.cuda.device_count())
device_id = torch.cuda.current_device()
gpu_properties = torch.cuda.get_device_properties(device_id)
print("We will use GPU %d (%s) of compute capability %d.%d with "
"%.2fGb total memory.\n" %
(device_id,
gpu_properties.name,
gpu_properties.major,
gpu_properties.minor,
gpu_properties.total_memory / 1e9))
else:
print('No GPU available, using the CPU instead.')
DEVICE = torch.device("cpu")
There are 1 GPU(s) available. We will use GPU 0 (Tesla P100-PCIE-16GB) of compute capability 6.0 with 17.07Gb total memory.
“ASFALDA” dataset¶
A French FrameNet, comprisong about 16000 annotated targets, into about 100 distinct frames, along with their semantic role annotations.
Fetching the data¶
if not os.path.exists('./asfalda_data_for_wsd/'):
# shell commands can be run using !
!pip install wget
import wget
# The URL for the dataset zip file.
url = 'http://www.linguist.univ-paris-diderot.fr/~mcandito/divers/asfalda_data_for_wsd.tgz'
if not os.path.exists('./asfalda_data_for_wsd.tgz'):
print('Downloading dataset')
wget.download(url, './asfalda_data_for_wsd.tgz')
!tar zxf asfalda_data_for_wsd.tgz
Data loading method¶
def load_asfalda_data(gold_data_file, split_info_file):
"""
Inputs: - asfalda gold data file
- file indicating the corpus type for each sentence id
Returns 3 dictionaries (whose keys are corpus types (train/dev/test/val))
- sentences
- list of rank of target word in each sentence
- gold labels
Example:
sentences['train'] = [['Le', 'code', 'comprend', 'des', 'erreurs','.'],
['Comprends', '-tu', '?']]
# the targets are the 3rd and first words
tg_wrks['train'] = [2, 0]
tg_lemmas['train'] = ['comprendre', 'comprendre']
labels = ['frame1', 'frame2']
"""
# load the usual split into train / dev / test
s = open(split_info_file)
lines = [ l[:-1].split('\t') for l in s.readlines() ]
split_info_dic = { line[0]:line[1] for line in lines }
# dev / train / test sentences
sentences = {'dev':[], 'train':[], 'test':[]}
# the word ranks (wrk) for the target words
tg_wrks = {'dev':[], 'train':[], 'test':[]}
# target lemmas
tg_lemmas = {'dev':[], 'train':[], 'test':[]}
# the labels of targets (= frames)
labels = {'dev':[], 'train':[], 'test':[]}
tg_poss = {'dev':[], 'train':[], 'test':[]}
max_sent_len = {'dev':0, 'train':0, 'test':0}
max_tg_wrk = {'dev':0, 'train':0, 'test':0}
stream = open(gold_data_file)
for line in stream.readlines():
if line.startswith('#'):
continue
line = line.strip()
(sentid, tg_wrk, frame_name, tg_lemma, tg_pos, rest) = line.split('\t',5)
# role annotation is ignored
# sentences are pre-segmented into space-separated words
# => we split, to use the is_split_into_words=True mode of the FlauBERT tokenizer
sentence = rest.split("\t")[-1].split(' ')
part = split_info_dic[sentid]
tg_wrk = int(tg_wrk)
l = len(sentence)
sentences[part].append(sentence)
labels[part].append(frame_name)
tg_wrks[part].append(tg_wrk)
tg_lemmas[part].append(tg_lemma)
tg_poss[part].append(tg_pos)
if max_sent_len[part] < l:
max_sent_len[part] = l
if max_tg_wrk[part] < tg_wrk:
max_tg_wrk[part] = tg_wrk
print("Max sentence length:", max_sent_len)
print("Max target rank (in words):", max_tg_wrk)
return sentences, tg_wrks, tg_lemmas, labels,tg_poss
Data loading and defining ids for labels¶
MAX_LENGTH = 100
gold_data_file = './asfalda_data_for_wsd/sequoiaftb.asfalda_1_3.gold.uniq.nofullant.txt'
# usual split train / dev / test for this corpus
split_info_file = './asfalda_data_for_wsd/sequoiaftb_split_info'
sentences, tg_wrks, tg_lemmas, label_strs,tg_pos = load_asfalda_data(gold_data_file,split_info_file)
for p in sentences.keys():
avgl = sum([len(s) for s in sentences[p]])/len(sentences[p])
print("%s : %d sentences, average lentgh=%3.2f"
%(p, len(sentences[p]), avgl))
# creating label ids for frames seen in training set
i2label = list(set(label_strs['train']))
# id for unknown frame (for dev and test)
i2label.append('*UNK*')
label2i = {x:i for i,x in enumerate(i2label)}
# id of special frame "Other_sense"
i_OTHER_SENSE = label2i['Other_sense']
# sequence of gold labels
# for each sub-corpus (key = dev/train/test)
labels = {}
for p in label_strs.keys():
labels[p] = [label2i[x] if x in label2i else i2label[-1] for x in label_strs[p]]
i2pos = list(set(tg_pos['train']))
i2pos.append('*UNK*')
pos2i = {x:i for i,x in enumerate(i2pos)}
Max sentence length: {'dev': 115, 'train': 271, 'test': 140} Max target rank (in words): {'dev': 96, 'train': 267, 'test': 115} dev : 2688 sentences, average lentgh=38.03 train : 18657 sentences, average lentgh=38.99 test : 3447 sentences, average lentgh=38.45
Data encoding¶
FlauBERT tokenization¶
We use the FlauBERT model, using the Huggingface “transformers” module.
try:
import transformers
except ImportError:
!pip install transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig
# We choose the FlauBERT model
# we load tokenizer and config for now
flaubert_tokenizer = AutoTokenizer.from_pretrained("flaubert/flaubert_base_cased")
flaubert_config = AutoConfig.from_pretrained("flaubert/flaubert_base_cased")
ENCODING¶
'''
some target ranks (before or after BPE tokenization) are bigger than the max length of the tid_seq.
to solve this problem, some samples with out-of-range rank positions will be discarded, since this operation will mess up the original order,
the encode function needs the original lemma and label lists and its outputs also will include new lemma and label lists
'''
class WSDEncoder:
def __init__(self, tokenizer, config):
self.tokenizer = tokenizer
self.config = config # to get indices of special tokens
def encode(self, sentences, tg_wrks, lemmas = None,labels = None,max_length=100, verbose=False, is_split_into_words=True):
if is_split_into_words:
sentences_ = sentences
sentences_join = [' '.join(sentence) for sentence in sentences]
tid_seqs_ = flaubert_tokenizer(sentences_join,truncation=True,padding=True,max_length=max_length,add_special_tokens=True)['input_ids']
else:
sentences_ = [sentence.split(' ') for sentence in sentences]
tid_seqs_ = flaubert_tokenizer(sentences,truncation=True,padding=True,max_length=max_length,add_special_tokens=True)['input_ids']
first_trk_of_targets = []
tid_seqs = []
tid_lemmas = []
tid_labels = []
discard_counter = 0
discarded = ''
if lemmas and labels:
for sentence,wrk,seq,lem,lab in zip(sentences_,tg_wrks,tid_seqs_,lemmas,labels):
target = sentence[wrk]
if target[-1] == "'":
target = target[:-1]
encoded_word = flaubert_tokenizer.encode(target)[1:-1][0]
if encoded_word in seq:
encoded_word_index = seq.index(encoded_word)
first_trk_of_targets.append(encoded_word_index)
tid_seqs.append(seq)
tid_lemmas.append(lem)
tid_labels.append(lab)
else :
discard_counter += 1
discarded = discarded + ' ' + sentence[wrk]
sys.stdout.write('\rRank position is bigger than max length,this sample will be discarded : ' + discarded)
sys.stdout.flush()
# the encode function did not tokenize word per word, instead, it tokenized the whole sentence.
# So, in order to get the target rank after tokenization, only the target word is tokenized to get its encoded id
# and seq.index function is used to get the correct target rank in the tokenized sentence
elif lemmas == None and labels == None:
for sentence,wrk,seq in zip(sentences_,tg_wrks,tid_seqs_):
target = sentence[wrk]
# some target word like " d' " is end with " ' ",which the sentence tokenizer will treat it as two tokens : " d " and " ' "
# but the word tokenizer treat it as a single word
if target[-1] == "'":
target = target[:-1]
encoded_word = flaubert_tokenizer.encode(target)[1:-1][0]
if encoded_word in seq:
encoded_word_index = seq.index(encoded_word)
first_trk_of_targets.append(encoded_word_index)
tid_seqs.append(seq)
else :
discard_counter += 1
print('\rRank position is bigger than max length,this sample will be discarded')
print(sentence[wrk])
print('==============================')
print(f"\n{discard_counter}/{len(sentences_)} samples has been discarded ")
if labels and lemmas :
assert len(tid_seqs) == len(first_trk_of_targets) == len(tid_lemmas) == len(tid_labels)
result = (tid_seqs,first_trk_of_targets,tid_lemmas,tid_labels)
else:
assert len(tid_seqs) == len(first_trk_of_targets)
result = (tid_seqs,first_trk_of_targets)
return result
Encoding test¶
encoder = WSDEncoder(flaubert_tokenizer, flaubert_config)
# test encoder
test_sents = ["Conséquemment , nous comprendrions .",
"Le code comprend des erreurs .",
"J' essaie de comprendre les transformers .",
"Il n' a pas bien compris le code !"]
# target words are the occurrences of "comprendre"
test_tg_wrks = [3, 2, 3, 5]
max_length=10
tid_seqs, first_trk_of_targets = encoder.encode(test_sents, test_tg_wrks, max_length= 10, verbose=True,is_split_into_words=False)
for tid_seq, ft in zip(tid_seqs, first_trk_of_targets):
print("Len = %d target token rank = %d tid_seq = %s" % (len(tid_seq), ft, str(tid_seq)))
print(flaubert_tokenizer.convert_ids_to_tokens(tid_seq)[ft])
#flaubert_tokenizer.convert_ids_to_tokens(tid_seqs[0])
0/4 samples has been discarded Len = 10 target token rank = 6 tid_seq = [0, 1198, 17358, 13299, 14, 65, 18719, 1999, 19614, 1] compr Len = 10 target token rank = 3 tid_seq = [0, 55, 1138, 976, 23, 3842, 16, 1, 2, 2] comprend</w> Len = 10 target token rank = 5 tid_seq = [0, 2684, 68, 5213, 15, 965, 22, 14659, 896, 1] comprendre</w> Len = 10 target token rank = 7 tid_seq = [0, 59, 261, 68, 34, 42, 83, 681, 20, 1] compris</w>
Full encoding and batch production¶
import random
class WSDData:
def __init__(self, corpus_type, sentences, tg_wrks, tg_lemmas, labels, encoder, max_length=100):
"""
Inputs:
- corpus type string (train/dev/test/val)
- list of sentences (each sentence = list of word strings)
- list of target word ranks : one per sentence
- list of gold label id
- encoder = instance of WSDEncoder
- max_length = size of encoded sequences, in nb of bert tokens
(padded / truncated via encoder.encode)
Encodes all the data using the relevant identifiers
"""
self.corpus_type = corpus_type # train / dev / test / val
self.size = len(sentences)
self.encoder = encoder
self.sentences = sentences # list of list of word strings
tid_seqs, tg_trks,lemmas,labels = encoder.encode(sentences, tg_wrks,tg_lemmas,labels, max_length = max_length ,is_split_into_words=True)
self.tid_seqs = tid_seqs # sequences of token ids
self.tg_trks = tg_trks # target token ranks
self.tg_lemmas = lemmas
self.labels = labels # gold label ids
def shuffle(self):
seq_tg_pairs = list(zip(self.tid_seqs,self.tg_trks,self.tg_lemmas,self.labels))
random.shuffle(seq_tg_pairs)
ts,tt,tle,tla = zip(*seq_tg_pairs)
self.tid_seqs,self.tg_trks,self.tg_lemmas,self.labels= list(ts),list(tt),list(tle),list(tla)
assert len(self.tid_seqs) == len(self.tg_trks) == len(self.tg_lemmas) == len(self.labels)
"""
Rearranges all the data in a new random order
(sentences, tg_lemmas, tg_trks, tid_seqs, labels)
NB: ** original order is lost **
"""
# production of a batch
def make_batches(self, batch_size, shuffle_data=False):
assert len(self.tid_seqs) == len(self.tg_trks) == len(self.tg_lemmas) == len(self.labels)
if shuffle_data:
self.shuffle()
if batch_size > len(self.tid_seqs):
raise ValueError('Batch size is bigger than data size!!')
for x in range(0,len(self.tid_seqs),batch_size):
yield (self.tid_seqs[x:x+batch_size],self.tg_trks[x:x+batch_size],self.tg_lemmas[x:x+batch_size],self.labels[x:x+batch_size])
MAX_LENGTH = 100
wsd_data = {}
# key = part of the split corpus (train/test/dev)
for p in sentences.keys():
print("Encoding part %s ..." % p)
wsd_data[p] = WSDData(p, sentences[p], tg_wrks[p], tg_lemmas[p], labels[p],
encoder, max_length=MAX_LENGTH)
# we check that encoding provides the right lengths
for i, s in enumerate(wsd_data[p].tid_seqs):
if len(s) != MAX_LENGTH:
print("Size bug:", i, s)
Encoding part dev ... Rank position is bigger than max length,this sample will be discarded : produit considérés réalisées 3/2688 samples has been discarded Encoding part train ... Rank position is bigger than max length,this sample will be discarded : produit résultant concevez dites opposition faire semble impulsion sait applaudir conviction coûtent ouverte reconnue vu vu contraints accusent créations escomptées Paye laissant répondent conditions laissent contraintes dites entamée reprenant déclinait interprétation assureront répondront écrit réclamations régler demandes perdre raisons fonctions invité sentiment vu ordre accréditer idée conséquences refusa toucher soupçonnant fonctions vu anathème prononcer discours 55/18657 samples has been discarded Encoding part test ... Rank position is bigger than max length,this sample will be discarded : assurer cité céder ventes fondent favorables décidées attirer 8/3447 samples has been discarded
WSDClassifier class: the network for WSD¶
Base architecture =
- the FlauBERT model
- plus linear layer + softmax
The network
flaubert_model = AutoModel.from_pretrained("flaubert/flaubert_base_cased", return_dict=True)
class WSDClassifier(nn.Module):
def __init__(self, num_labels, device, bert_model, bert_config,freeze_bert = True,MLP = False):
super(WSDClassifier, self).__init__()
self.device = device
# the full *BERT*-like model
# the .to(device) triggers the copy towards the relevant device
# (possibly a GPU)
self.bert_layer = bert_model.to(device)
# config will allow to get the hidden vectors' size
self.bert_config = bert_config
self.num_labels = num_labels
self.emb_size = self.bert_config.emb_dim
self.distribution = nn.Sequential (*[nn.Linear(in_features = self.emb_size,out_features=self.num_labels),
nn.LogSoftmax(dim = -1)
]).to(device)
if MLP:
self.distribution = nn.Sequential (*[nn.Linear(in_features = self.emb_size,out_features=50),
nn.ReLU(),
nn.Linear(in_features = 50,out_features=50),
nn.ReLU(),
nn.Linear(in_features = 50,out_features=self.num_labels),
nn.LogSoftmax(dim = -1)]).to(device)
if freeze_bert:
for param in self.bert_layer.parameters():
param.requires_grad = False
# make a mask matrix (batch_size,nb_class)
# where only the seen in train frames are 0s and the not seen in train frame are 1e-45 (or -inf)
def make_mask(self,nb_class,lemmas,seen_in_X):
batch_size = len(lemmas)
zeros = torch.zeros(batch_size,nb_class,device = self.device,requires_grad=False)
zeros = zeros + 1e-45 #(-1)*float('inf')
for i in range(batch_size):
seen = seen_in_X[lemmas[i]]
for j in seen:
zeros[i][j] = 0
return zeros
def forward(self, b_tid_seq, b_tg_trk,lemmas = None,seen_in_X = None):
bert_out = self.bert_layer(b_tid_seq,return_dict =True).last_hidden_state
#b_tg_trk = torch.tensor(b_tg_trk, device=self.device)
target = bert_out[torch.arange(bert_out.size(0)),b_tg_trk]
out = self.distribution(target)
if lemmas and seen_in_X:
# if lemmas and seen_in_X is provided,mask the output
# so only the seen in train frames position has predicted valued
# other position are -inf or some very extreme negative values
mask = self.make_mask(self.num_labels,lemmas,seen_in_X)
#l1_norm = (-1*(torch.norm(out+mask,p=1, dim=-1))).view(1,-1)
exp = torch.exp(out+mask) # turn it back to the normal probability to calculate the l1 norm
l1_norm = torch.sum(exp,dim = -1).view(1,-1)
out = ((exp)/(l1_norm.unsqueeze(2))).squeeze() # use the l1 norm to replace an another log softmax for re-normalization
out = torch.log(out)
#out = self.log_softmax(out+mask)
return out
"""
Inputs: (all are tensors, on the relevant device)
- a batch of sentences = a batch of token id sequences
(as output in 'input_ids' member of tokenizer output)
- a batch of target token rank = for each of the sentences,
the rank of first token of the target word to disambiguate
Output: log_softmax scores for the whole batch (batch_size x num_labels)
"""
def run_on_dataset(self, wsd_data, return_loss = False,batch_size=32):
pred_labels = []
gold = []
loss = []
self.eval()
with torch.no_grad():
for b_tid_seqs, b_tg_trks,_,b_labels in wsd_data.make_batches(batch_size, shuffle_data=False):
b_tid_seqs = torch.tensor(b_tid_seqs, device=self.device)
b_tg_trks = torch.tensor(b_tg_trks, device=self.device)
b_labels = torch.tensor(b_labels, device=self.device)
log_probs = self.forward(b_tid_seqs, b_tg_trks)
pred_label = torch.argmax(log_probs,dim = -1)
pred_labels.append(pred_label)
gold.append(b_labels)
if return_loss:
loss.append(torch.nn.functional.nll_loss(log_probs,b_labels).item())
if return_loss:
return pred_labels,gold,loss
else:
return pred_labels,gold
"""
Run classifier on wsd_data and compute accuracy
Inputs =
- wsd_data (WSDDataset instance)
- batch_size
Returns:
- list of predicted label ids
"""
def evaluate(self,wsd_data,return_loss = False,batch_size=32):
""" returns accuracy, nb_correct, nb_total """
nb_correct = 0
nb_total = 0
result = self.run_on_dataset(wsd_data,return_loss ,batch_size)
for p,g in zip(result[0],result[1]):
nb_correct += torch.sum(p==g)
nb_total += len(p)
if return_loss:
return float(nb_correct/nb_total),float(len(result[2])/sum(result[2]))
else:
return float(nb_correct/nb_total)
# an instance of WSDClassifier
num_labels = len(i2label)
DEVICE = 'cuda'
classifier = WSDClassifier(num_labels, DEVICE, flaubert_model, flaubert_config)
# uncomment to see the huge nb of parameters ...
#for name, param in classifier.named_parameters():
#print("PARAM named %s, of shape %s" % (name, str(param.shape)))
#print(param)
Test of forward propagation¶
# useless to compute gradients when testing
with torch.no_grad():
# toggle train mode off
classifier.eval()
for b_tid_seqs, b_tg_trks,b_lemmas,b_labels in wsd_data['train'].make_batches(32, shuffle_data=True):
b_tid_seqs = torch.tensor(b_tid_seqs, device=classifier.device)
b_tg_trks = torch.tensor(b_tg_trks, device=classifier.device)
#log_probs = classifier(b_tid_seqs, b_tg_trks)
mask = b_lemmas
log_probs = classifier(b_tid_seqs,b_tg_trks)
gold = b_labels[0] #.item()
gold_lemma = b_lemmas[0]
print(f'first ex : {gold_lemma}')
print("GOLD LABEL of first ex %d ( = %s)" % (gold, i2label[gold]))
print("LOG_PROBS before training: %s\n\n" % str(log_probs[0]))
break
first ex : s'allier GOLD LABEL of first ex 8 ( = Make_agreement_on_action) LOG_PROBS before training: tensor([-6.6512, -7.3057, -5.7636, -4.9219, -6.8631, -7.7480, -5.4645, -6.6998, -3.6390, -6.2735, -3.7086, -7.2166, -6.8038, -5.0077, -4.6372, -6.7378, -6.0674, -6.4070, -4.7714, -5.9692, -5.2273, -6.4065, -4.0394, -2.2338, -4.8827, -5.9193, -5.4862, -4.4570, -5.4531, -5.9702, -4.5276, -4.4424, -4.9234, -5.7870, -5.0041, -6.1230, -4.4383, -4.7733, -5.6809, -4.4642, -2.4661, -6.7032, -5.8991, -6.1683, -6.8110, -3.3326, -3.8445, -7.9213, -4.0945, -4.6576, -7.1300, -5.5480, -4.3570, -4.3896, -4.5131, -4.0048, -6.3958, -4.8002, -5.4079, -7.4110, -6.5508, -4.2832, -3.7300, -5.5400, -6.1726, -4.8362, -4.4931, -6.2256, -5.8992, -7.2413, -5.7279, -4.9946, -5.8178, -5.2648, -5.9453, -7.0204, -4.9609, -4.3784, -6.1446, -4.5937, -4.4972, -5.9254, -6.3766, -4.2953, -5.9825, -5.0797, -4.8794, -6.1832, -4.5445, -5.6917, -6.2342, -3.2223, -5.3736, -5.9121, -6.9648, -2.9838, -4.6563, -5.3115, -5.5499, -5.9452, -5.0163, -7.4239, -5.3953, -6.8540, -4.8919, -2.6048, -6.5160], device='cuda:0')
Training¶
# training
import time
BATCH_SIZE = 32
LR = 0.0005
nb_epoch = 30
epoch_id = 0
time_0 = time.time()
num_labels = len(i2label)
DEVICE = 'cuda'
classifier = WSDClassifier(num_labels, DEVICE, flaubert_model, flaubert_config,MLP = False)
# ATTENTION ,USE MLP WILL SOME TIME RAISE OUT OF CUDA MEMORY ERROR
loss_function = nn.NLLLoss()
# SGD is quicker (more convenient for debug phase)
#optimizer = optim.SGD(classifier.parameters(), lr=LR)
optimizer = optim.Adam(classifier.parameters(), lr=LR)
config_name = 'sequoiaftb.asfalda_1_3.wsd.lr' + 'Adam' + str(LR) + '_bs' + str(BATCH_SIZE)
out_model_file = './' + config_name + '.model'
out_log_file = './' + config_name + '.log'
# losses at each epoch (on train / on validation set)
train_losses = []
val_losses = []
min_val_loss = None
min_val_accuracy = 0
# to speed up during debug: train on dev
#train_data = wsd_data['dev'] # data['train']
train_data = wsd_data['train']
val_data = wsd_data['dev']
val_batch_size = len(wsd_data['dev'].tid_seqs)
lemmas = None
while(epoch_id < nb_epoch):
classifier.train()
time_0 = time.time()
for b_tid_seqs, b_tg_trks,b_lemmas,b_labels in train_data.make_batches(BATCH_SIZE, shuffle_data=True):
b_tid_seqs = torch.tensor(b_tid_seqs, device=classifier.device)
b_tg_trks = torch.tensor(b_tg_trks, device=classifier.device)
gold = torch.tensor(b_labels, device=classifier.device)
lemmas = b_lemmas
optimizer.zero_grad()
log_probs = classifier(b_tid_seqs,b_tg_trks)
loss = loss_function(log_probs,gold)
train_losses.append(loss.item())
loss.backward()
optimizer.step()
val_accuracy,avg_val_loss = classifier.evaluate(val_data,return_loss = True,batch_size = val_batch_size)
print(f"-- END OF EPOCH {epoch_id}.")
print(f"Average loss on training set: {sum(train_losses) / len(train_losses)}.")
print(f"Average loss on dev set: {avg_val_loss}.")
print(f"Average accuracy on dev set: {val_accuracy}.")
if val_accuracy <= min_val_accuracy: #early stopping and roll back
print('-- Aaccuracy down,roll back and stop --')
classifier.load_state_dict(torch.load('classifier_params.pt'))
break
else:
min_val_accuracy = val_accuracy
torch.save(classifier.state_dict(), 'classifier_params.pt')
duration = time.time() - time_0
print(f"{duration} s elapsed (i.e. {duration / (epoch_id + 1)} s/epoch)")
epoch_loss = []
epoch_id += 1
Evaluation¶
test_data = wsd_data['dev']
batch_size = len(test_data.tid_seqs)
classifier.evaluate(test_data,batch_size)
print()
(0.0029795158188790083, 0.18642462971275686)