Issue
I recently moved from TensorFlow to PyTorch, and today I ran into a small problem.
I run my scripts with Slurm on an HPC cluster, and all the stdout is redirected to a text file. The problem is that this afternoon it stopped writing to that file during training (it should write the training progress), but the training is still going on: the Slurm job is still running and it saves model checkpoints every epoch.
The code of the training loop is the following:
import json
import math
import time

import pandas as pd
import torch
from torch import nn

# `Dataset` is the project's custom torch.utils.data.Dataset, defined elsewhere.


class TrainingParams:
    def __init__(self, model_params):
        self.BATCH_SIZE = 320
        self.LEARNING_RATE = 6e-5
        self.EPOCHS = 8
        # Class weights for the weighted cross-entropy loss, loaded from disk.
        with open('weights/full2021/classes/classes_weights.json', 'r') as fp:
            classes_weights = json.load(fp)
        classes_weights = torch.as_tensor(list(classes_weights.values()), dtype=torch.float)
        self.loss = nn.CrossEntropyLoss(weight=classes_weights)
        self.optimizer = torch.optim.Adamax(model_params, lr=self.LEARNING_RATE)


def get_pbar(progress):
    # Build a 25-character text progress bar for the given percentage.
    trattini = []
    progress = math.ceil(progress)
    for i in range(0, math.floor(progress / 4)):
        trattini.append('-')
    for i in range(0, 25 - (math.floor(progress / 4))):
        trattini.append(' ')
    pbar = '[' + ''.join(trattini) + ']'
    return pbar


def train(model: nn.Module, train_paths: dict, val_paths: dict,
          resume=False, units=None, starting_epoch=1):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    model.cuda()

    if resume:
        checkpoint = torch.load(f"weights/full2021/unfreezed_cp_{units}/checkpoint_epoch-{starting_epoch - 1}")
        model.load_state_dict(checkpoint['model'])
        training_params = TrainingParams(model.parameters())
        optimizer = training_params.optimizer
        optimizer.load_state_dict(checkpoint['optimizer'])
        scaler = torch.cuda.amp.GradScaler()
        scaler.load_state_dict(checkpoint['scaler'])
        print(f'Correctly restored checkpoint of network with units {units} at epoch {starting_epoch - 1}')
    else:
        training_params = TrainingParams(model.parameters())
        optimizer = training_params.optimizer
        scaler = torch.cuda.amp.GradScaler()

    loss = training_params.loss.cuda()

    train_data = Dataset(train_paths['ids'], train_paths['attention'], train_paths['labels'])
    val_data = Dataset(val_paths['ids'], val_paths['attention'], val_paths['labels'])
    train_len = len(train_data)
    val_len = len(val_data)
    train_data = torch.utils.data.DataLoader(train_data, batch_size=training_params.BATCH_SIZE, shuffle=True)
    val_data = torch.utils.data.DataLoader(val_data, batch_size=training_params.BATCH_SIZE)

    history = {
        'epoch': [],
        'loss': [],
        'accuracy': [],
        'val_loss': [],
        'val_accuracy': []
    }

    print(f'Starting training from epoch {starting_epoch}...')
    for epoch in range(0, training_params.EPOCHS):
        start_time = time.time()
        train_accuracy = 0
        train_loss = 0
        val_accuracy = 0
        val_loss = 0

        for i, data in enumerate(train_data, 0):
            input, label = data
            model.zero_grad(set_to_none=True)
            with torch.cuda.amp.autocast():
                label = label.long().to(device)
                input_ids = input['input_ids'].squeeze(1).to(device)
                attention_mask = input['attention_mask'].to(device)
                output = model(input_ids, attention_mask)
                batch_loss = loss(output, label)
            scaler.scale(batch_loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=3.0)
            scaler.step(optimizer)
            scaler.update()

            train_loss += batch_loss.item()
            batch_accuracy = (output.argmax(dim=1) == label).sum().item()
            train_accuracy += batch_accuracy

            # Per-iteration progress line: this is what should appear in the redirected log file.
            progress = 100 * i / (train_len // training_params.BATCH_SIZE)
            print(f'Epoch: {starting_epoch + epoch} | '
                  f'It: {i: 5d}/{train_len // training_params.BATCH_SIZE} | '
                  f'Elapsed: {time.time() - start_time: .2f}s | '
                  f'Progress: {get_pbar(progress)} {progress: .2f}% | '
                  f'Loss: {train_loss / (i + 1): .3f} | '
                  f'Accuracy: {train_accuracy / (training_params.BATCH_SIZE * (i + 1)): .3f} |'
                  )

        with torch.no_grad():
            for input, label in val_data:
                with torch.cuda.amp.autocast():
                    label = label.long().cuda()
                    input_ids = input['input_ids'].squeeze(1).cuda()
                    attention_mask = input['attention_mask'].cuda()
                    output = model(input_ids, attention_mask)
                    batch_loss = loss(output, label)
                val_loss += batch_loss.item()
                batch_accuracy = (output.argmax(dim=1) == label).sum().item()
                val_accuracy += batch_accuracy

        print_train_loss = train_loss / (train_len // training_params.BATCH_SIZE)
        print_train_accuracy = train_accuracy / train_len
        print_val_loss = val_loss / (val_len // training_params.BATCH_SIZE)
        print_val_accuracy = val_accuracy / val_len
        print(f'-----> Epoch {starting_epoch + epoch} ended. '
              f'Total elapsed time: {(time.time() - start_time) / 60: .2f}m | '
              f'| Training Loss: {print_train_loss: .3f} '
              f'| Training Accuracy: {print_train_accuracy: .3f} '
              f'| Validation Loss: {print_val_loss: .3f} '
              f'| Validation Accuracy: {print_val_accuracy: .3f}'
              )

        history['epoch'].append(starting_epoch + epoch)
        history['loss'].append(print_train_loss)
        history['accuracy'].append(print_train_accuracy)
        history['val_loss'].append(print_val_loss)
        history['val_accuracy'].append(print_val_accuracy)

        # Save a checkpoint every epoch (these keep appearing even while the log file is not updating).
        checkpoint = {"model": model.state_dict(),
                      "optimizer": optimizer.state_dict(),
                      "scaler": scaler.state_dict()}
        torch.save(checkpoint, f"weights/full2021/unfreezed_cp_{units}/checkpoint_epoch-{starting_epoch + epoch}")

    history = pd.DataFrame.from_dict(history)
    history.to_csv(f'history_log-dir/full2021/history_{units}_epoch-{starting_epoch + training_params.EPOCHS - 1}',
                   index=False)
Basically it skips all the "print" statements; every time, the only thing I see in the file is these lines:
2022-06-15 12:08:28.989315: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Some weights of the model checkpoint at Bert/ were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
I can't understand why this is happening; it had always worked correctly until this afternoon.
PS: it has now updated one of the output files (I'm training on two different nodes with two models), and it printed ~2000 iterations in one shot, so it now writes output only after a certain number of iterations. This is strange, because until this afternoon it was updating the text file at every iteration (e.g. if I pressed F5 continuously, I could see the file being updated constantly).
What can be the cause?
Solution
The print statements are being written into a buffer, which is only flushed once it fills up. To force the output to be printed immediately, try running your code with the -u flag, e.g. python -u my_code.py. Alternatively, adding flush=True to the print statements should accomplish the same.
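As a minimal sketch of the flush-based option (the messages and loop here are just placeholders, not the original training code):

    import sys
    import time

    # Option 1 (Python 3.7+): switch stdout to line buffering once, at the top of the script,
    # so every subsequent print is written to the redirected file as soon as the line is complete.
    sys.stdout.reconfigure(line_buffering=True)

    for i in range(3):
        time.sleep(1)
        # Option 2: flush this specific print explicitly so it reaches the log file immediately.
        print(f'It: {i}', flush=True)

Setting the environment variable PYTHONUNBUFFERED=1 in the Slurm batch script has the same effect as the -u flag, without changing the Python code.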
Answered By - Sean