Issue
I want to use the paraphrase-multilingual-mpnet-base-v2 model to build embeddings, but I get this error:
RuntimeError: CUDA error: device-side assert triggered
The error occurs when executing string = {k: v.to(device=device) for k, v in string.items()}.
Why do I get this error?
I work in Google Colab with 12.7 GB of RAM and 16 GB of GPU RAM.
The goal of the code is to generate sentence embeddings; with some customization, chunk-wise execution is also possible.
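For reference, the standard non-chunked recipe for this checkpoint, roughly what the chunk-wise code below builds on (the two example sentences are made up), is:

encoded = tokenizer(["Ein Beispielsatz.", "Noch ein Satz."],
                    padding=True, truncation=True, return_tensors='pt').to(device)
with torch.no_grad():
    output = model(**encoded)
embeddings = mean_pooling(output, encoded['attention_mask'])  # one 768-dim vector per sentence

This uses the tokenizer, model, device and mean_pooling defined in the code further down.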
The complete error message:
RuntimeError                              Traceback (most recent call last)
<ipython-input-17-8e6bf00d9e24> in <cell line: 104>()
102 return np.nan
103
--> 104 processed_data = processDataRAG(df[5000:], tokenizer, model)
4 frames
<ipython-input-17-8e6bf00d9e24> in processDataRAG(data, tokenizer, model)
10 sents = [str(sentences[0]) for sentences in article_sentences]
11 number_of_article =[sentences[1] for sentences in article_sentences]
---> 12 embedded_sentencs = [embeddChunkwise(sentence, tokenizer, model, 512) for sentence in tqdm(sents, desc = "Create chunk-wise embeddings")]
13 return pd.DataFrame({
14 "sentences": sents,
<ipython-input-17-8e6bf00d9e24> in <listcomp>(.0)
10 sents = [str(sentences[0]) for sentences in article_sentences]
11 number_of_article =[sentences[1] for sentences in article_sentences]
---> 12 embedded_sentencs = [embeddChunkwise(sentence, tokenizer, model, 512) for sentence in tqdm(sents, desc = "Create chunk-wise embeddings")]
13 return pd.DataFrame({
14 "sentences": sents,
<ipython-input-17-8e6bf00d9e24> in embeddChunkwise(string, tokenizer, model, chunk_size)
55 #encoded_input = tokenizer(tokenizer.detokenize(tokenized_chunk))
56 if len(encoded_chunk) > 0:
---> 57 embedded_chunk = createEmbeddings(
58 tokenizer(tokenizer.decode(encoded_chunk, skip_special_tokens = True), return_tensors='pt', add_special_tokens=False),
59 model
<ipython-input-17-8e6bf00d9e24> in createEmbeddings(string, model)
77 #print("Length of input_ids: ", len(string["input_ids"][0]))
78 if "
input_ids" in string.keys():
---> 79 string = {k: v.to(device=device) for k, v in string.items()}
80 with torch.no_grad():
81
<ipython-input-17-8e6bf00d9e24> in <dictcomp>(.0)
77 #print("Length of input_ids: ", len(string["input_ids"][0]))
78 if "input_ids" in string.keys():
---> 79 string = {k: v.to(device=device) for k, v in string.items()}
80 with torch.no_grad():
81
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
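As the message suggests, a more informative error can usually be obtained by forcing synchronous kernel launches or by running the failing batch on the CPU. A minimal sketch (the environment variable must be set before the first CUDA call, e.g. in the first notebook cell):

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # set before any CUDA work, then re-run the failing cell
# Alternatively, load the model on the CPU once and push the failing input through it;
# an out-of-range embedding index then shows up as a plain IndexError
# ("index out of range in self") instead of a device-side assert.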
I run this code:
from transformers import AutoTokenizer, AutoModel
import torch
from torch import cuda
import pandas as pd
import numpy as np
from tqdm import tqdm

tqdm.pandas()  # enables DataFrame.progress_apply used below
# nlp_de (presumably a spaCy German pipeline) and file_path are defined elsewhere in the notebook

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Select device globally
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
                                  device_map=device)

df = pd.read_json(file_path)

def processDataRAG(data, tokenizer, model):
    article_sentences = data.content.progress_apply(lambda x: list(nlp_de(x).sents))
    #tokenized_articles = data.content.progress_apply(lambda article: tokenizeChunkwise(article, tokenizer, 512))
    article_sentences = [
        (sentences, idx) for idx, article in tqdm(enumerate(list(article_sentences)), desc="Loop over articles with index")
        for sentences in article
    ]
    sents = [str(sentences[0]) for sentences in article_sentences]
    number_of_article = [sentences[1] for sentences in article_sentences]
    embedded_sentencs = [embeddChunkwise(sentence, tokenizer, model, 512) for sentence in tqdm(sents, desc="Create chunk-wise embeddings")]
    return pd.DataFrame({
        "sentences": sents,
        "embeddings": embedded_sentencs,
        "article": number_of_article
    })

def embeddChunkwise(string, tokenizer, model, chunk_size):
    decreasing_by_special_tokens = 0  # Because of special tokens at the beginning and end
    encoded_string = tokenizer(string, add_special_tokens=False)
    if len(encoded_string["input_ids"])/chunk_size > 1:
        print("Tokenized_string:", encoded_string)
        print("Total tokens: ", str(len(encoded_string["input_ids"])))
        print("Tokenized string in chunks: ", str(len(encoded_string["input_ids"])/chunk_size), " --- ", str(len(encoded_string["input_ids"])//chunk_size + 1))
    embedded_chunks = []
    for idx in list(range(len(encoded_string["input_ids"])//chunk_size + 1)):
        encoded_chunk = None
        if (chunk_size-decreasing_by_special_tokens)*(idx+1) < len(encoded_string["input_ids"]):  # e.g. a long sentence that spans several full chunks
            start_idx, end_idx = (chunk_size*idx - decreasing_by_special_tokens*idx, chunk_size*(idx+1) - decreasing_by_special_tokens*(idx+1))
            encoded_chunk = encoded_string["input_ids"][start_idx:end_idx]
        else:  # e.g. a short sentence that fits into the last (or only) chunk
            if chunk_size-decreasing_by_special_tokens > len(encoded_string["input_ids"]):
                encoded_chunk = encoded_string["input_ids"][chunk_size*(idx) - decreasing_by_special_tokens*(idx):]
            else:
                encoded_chunk = encoded_string["input_ids"][-(chunk_size*(idx) - decreasing_by_special_tokens*(idx)):]
        if len(encoded_chunk) > 0:
            embedded_chunk = createEmbeddings(
                tokenizer(tokenizer.decode(encoded_chunk, skip_special_tokens=True), return_tensors='pt', add_special_tokens=False),
                model
            )
            if isinstance(embedded_chunk, list):
                embedded_chunks.append(embedded_chunk[0])
    if len(embedded_chunks) > 1:
        return embedded_chunks
    elif len(embedded_chunks) == 0:
        return np.nan
    else:
        return embedded_chunks[0]

def createEmbeddings(string, model):
    if "input_ids" in string.keys():
        string = {k: v.to(device=device) for k, v in string.items()}
        with torch.no_grad():
            try:
                model_output = model(**string)
            except Exception as ex:
                print("--- Error by creating Embeddings ---")
                print("Error: ", str(ex))
                return np.nan
        # Perform pooling. In this case, average pooling
        try:
            sentence_embeddings = mean_pooling(model_output, string['attention_mask'])
        except Exception as ex:
            print("--- Error by pooling embeddings ---")
            print("Model output: ", str(model_output))
            print("Attention_mask: ", str(string['attention_mask']))
            print("Error: ", str(ex))
            return np.nan
        sentence_embeddings = sentence_embeddings.detach().cpu().numpy()
        return sentence_embeddings
    else:
        return np.nan
Solution
I have found the cause of the error in my case: it is the input size.
The model accepts an input size of at most 512 tokens, but I passed 513 tokens.
The cause of the too-long input is this line of code:
encoded_chunk = encoded_string["input_ids"][-(chunk_size*(idx) - decreasing_by_special_tokens*(idx)):]
I have to add a -1:
encoded_chunk = encoded_string["input_ids"][-(chunk_size*(idx) - decreasing_by_special_tokens*(idx)-1):]
All in all, the cause was the wrong input size.
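Why does a single extra token trigger a device-side assert? Most likely the position index of token 513 falls outside the model's position-embedding table, and on the GPU an out-of-range embedding lookup is only reported as this generic assert. A simpler guard than adjusting the slice, sketched here rather than taken from the original code, is to let the tokenizer truncate when re-encoding a chunk (decoding and re-encoding token ids is not always a perfect round trip, so a chunk of 512 ids can come back longer):

encoded_input = tokenizer(
    tokenizer.decode(encoded_chunk, skip_special_tokens=True),
    return_tensors='pt',
    add_special_tokens=False,
    truncation=True,
    max_length=512,  # the limit identified above
)
assert encoded_input["input_ids"].shape[1] <= 512
embedded_chunk = createEmbeddings(encoded_input, model)

Alternatively, the sentence-transformers library's SentenceTransformer.encode handles truncation and mean pooling for this model automatically.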
Answered By - Christian01