summary.py
import torch
from vncorenlp import VnCoreNLP
from transformers import AutoTokenizer, EncoderDecoderModel
def predict(text):
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the fine-tuned encoder-decoder checkpoint and move it to the target device
    model = EncoderDecoderModel.from_pretrained('./training/checkpoint-10000')
    model.to(device)

    # Word-segment the Vietnamese input with VnCoreNLP (PhoBERT expects word-segmented text)
    rdrsegmenter = VnCoreNLP('./vncorenlp/VnCoreNLP-1.1.1.jar', annotators="wseg", max_heap_size="-Xmx500m")
    text = rdrsegmenter.tokenize(text)
    text = ' '.join([' '.join(x) for x in text])

    # Tokenize with the PhoBERT tokenizer and build the model inputs
    tokenizer = AutoTokenizer.from_pretrained('vinai/phobert-base', use_fast=False)
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    # Generate the summary with beam search
    outputs = model.generate(input_ids,
                             attention_mask=attention_mask,
                             max_length=256,
                             early_stopping=True,
                             num_beams=4,
                             no_repeat_ngram_size=3)

    # All special tokens (e.g. padding) are removed during decoding
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return output_str[0]
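

if __name__ == "__main__":
    # Minimal usage sketch (assumed entry point, not part of the original script):
    # pass any Vietnamese article text to predict() and print the generated summary.
    article = "Hôm nay trời đẹp."  # placeholder input text for illustration only
    print(predict(article))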