Issue
In the HuggingFace tokenizer, the max_length argument specifies the length of the tokenized text. I believe that with truncation=True it truncates the sequence to max_length-2 content tokens (leaving room for the special tokens) by cutting the excess tokens from the right. For the purposes of utterance classification, I need to cut the excess tokens from the left, i.e. from the start of the sequence, in order to preserve the last tokens. How can I do that?
from transformers import AutoTokenizer
train_texts = ['text 1', ...]
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
encodings = tokenizer(train_texts, max_length=128, truncation=True)
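A quick way to see this default behaviour is to truncate a single sentence to a small max_length and decode it back (the sentence and the value 6 below are arbitrary; the exact tokens depend on the model):

ids = tokenizer('Do not meddle in the affairs of wizards.', max_length=6, truncation=True)['input_ids']
print(tokenizer.decode(ids))  # the special tokens are kept, the end of the sentence is cut off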
Solution
I wrote a solution, but it is not very robust and I am still looking for a better way. It is tested with the models mentioned in the code.
from typing import Tuple
from transformers import AutoTokenizer
# also tested with: ufal/robeczech-base, Seznam/small-e-czech
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', use_fast=False)
texts = ["Do not meddle in the affairs of wizards for they are unpredictable.", "Did you meddle?"]
encoded_input = tokenizer(texts)

def cut_seq_left(seq: list, max_length: int, special_ids: dict) -> Tuple[int, int]:
    # cut from the left if longer; keep the leading special tokens
    normal_idx = 0
    while seq[normal_idx] in special_ids and normal_idx < len(seq) - 1:
        normal_idx += 1
    if normal_idx >= len(seq) - 1:
        normal_idx = 1
        # raise Exception('normal_idx longer for seq:' + str(seq))
    rest_idx = normal_idx + len(seq) - max_length
    seq[:] = seq[0:normal_idx] + seq[rest_idx:]
    return normal_idx, rest_idx
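
# For illustration only (made-up token ids; 0 and 2 stand in for <s> and </s>):
# ids = [0, 10, 11, 12, 13, 2]
# cut_seq_left(ids, 4, {0: '<s>', 2: '</s>'})  # returns (1, 3)
# ids is now [0, 12, 13, 2]: the leading special token and the last tokens survive.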

def pad_seq_right(seq: list, max_length: int, pad_id: int):
    # pad on the right if shorter
    seq.extend(pad_id for _ in range(max_length - len(seq)))

def get_pad_token(tokenizerr) -> str:
    specials = [t.lower() for t in tokenizerr.all_special_tokens]
    pad_candidates = [t for t in specials if 'pad' in t]
    if len(pad_candidates) < 1:
        raise Exception('Cannot find PAD token in: ' + str(tokenizerr.all_special_tokens))
    return tokenizerr.all_special_tokens[specials.index(pad_candidates[0])]

def cut_pad_encodings_left(encodingz, tokenizerr, max_length: int):
    specials = dict(zip(tokenizerr.all_special_ids, tokenizerr.all_special_tokens))
    pad_code = get_pad_token(tokenizerr)
    padd_idx = tokenizerr.all_special_tokens.index(pad_code)
    for i, e in enumerate(encodingz.data['input_ids']):
        if len(e) < max_length:
            pad_seq_right(e, max_length, tokenizerr.all_special_ids[padd_idx])
            pad_seq_right(encodingz.data['attention_mask'][i], max_length, 0)
            if 'token_type_ids' in encodingz.data:
                pad_seq_right(encodingz.data['token_type_ids'][i], max_length, 0)
        elif len(e) > max_length:
            fro, to = cut_seq_left(e, max_length, specials)
            encodingz.data['attention_mask'][i] = encodingz.data['attention_mask'][i][:fro] \
                + encodingz.data['attention_mask'][i][to:]
            if 'token_type_ids' in encodingz.data:
                encodingz.data['token_type_ids'][i] = encodingz.data['token_type_ids'][i][:fro] \
                    + encodingz.data['token_type_ids'][i][to:]

cut_pad_encodings_left(encoded_input, tokenizer, 10) # returns nothing: works in-place
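As a quick sanity check (a sketch reusing the max_length of 10 from the call above), the processed batch can be decoded to confirm that every sequence now has the target length and that long inputs were shortened from the left:

# every sequence is now exactly 10 tokens long; long inputs lose tokens from
# the start (after the leading special token), short ones are padded on the right
for ids in encoded_input['input_ids']:
    assert len(ids) == 10
    print(tokenizer.decode(ids))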
Answered By - Ondrej Sotolar