[Fix] fix interntrain's tokenizer truncate (#1605)

Co-authored-by: x54-729 <xingshuhao.dispatch@pjlab.org.cn>
This commit is contained in: authored by x54-729 on 2024-10-15 16:03:57 +08:00, committed by GitHub.
parent 8aba547e06
commit 2b1afa7d1e
No known key found for this signature in the database (GPG Key ID: B5690EEEBB952194).

View File

@ -318,9 +318,14 @@ class InternTrain(BaseModel):
# keep same with InternTrain's default value
min_out_len = 1
tokens = self.batch_encode(inputs,
self.max_seq_len - max_out_len,
left_padding=True)
if self.mode == 'none':
tokens = self.batch_encode(inputs,
self.max_seq_len,
left_padding=True)
else:
tokens = self.batch_encode(inputs,
self.max_seq_len - max_out_len,
left_padding=True)
# random seed for pass@k
seed = torch.tensor(time.time(), dtype=torch.int64).cuda()