-
Notifications
You must be signed in to change notification settings - Fork 1
/
full_summarize.py
executable file
·74 lines (64 loc) · 2.69 KB
/
full_summarize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import argparse
import json
from numba.cuda import jit
from os.path import exists
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
# Command-line interface: one positional argument naming both the dataset
# directory under ./data/ and the basename of the JSON file inside it.
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("datafile", type=str, help="Input file")
args = arg_parser.parse_args()

# Input corpus and the summarized output file derived from it
# (both are line-delimited JSON).
DATA_FILE = f"./data/{args.datafile}/{args.datafile}.json"
OUTPUT_FILE = f"./data/{args.datafile}/full_summarized_{args.datafile}.json"
def sum():
    """Summarize every record in ``json_read_in_objects`` with Pegasus,
    appending each summary string to ``json_write_out_objects``.

    NOTE(review): this function shadows the builtin ``sum`` (name kept for
    backward compatibility), is never called from this file, and duplicates
    the loop in the ``__main__`` block.  It relies on the module-level
    globals ``json_read_in_objects``, ``json_write_out_objects``,
    ``pegasus_tokenizer`` and ``pegasus_model`` being initialised first.

    BUG FIX: the former ``@jit(forceobj=True)`` decorator was removed.
    ``jit`` here is ``numba.cuda.jit``, which compiles CUDA kernels and does
    not take ``numba.jit``'s ``forceobj`` flag, so the decoration cannot
    work as intended -- and a CUDA kernel could not call the transformers
    Python API in any case.
    """
    for i, record in enumerate(json_read_in_objects):
        # Tokenize the document text to input ids (truncated to the model's
        # maximum input length).
        token = pegasus_tokenizer.encode(record["TEXT"], return_tensors="pt", truncation=True)
        # Generate the summary token ids.
        encode_token = pegasus_model.generate(token)
        # Decode the generated ids back into a plain-text summary.
        decode_token = pegasus_tokenizer.decode(encode_token[0], skip_special_tokens=True)
        print(i)  # progress indicator
        json_write_out_objects.append(decode_token)
if __name__ == "__main__":
    # Refuse to clobber an existing summary file; the user must delete it
    # explicitly to force re-summarization.
    if exists(OUTPUT_FILE):
        print("Translated docs already exist: " + OUTPUT_FILE)
        print("Manually delete this file if you want to re-summarize the docs.")
        raise SystemExit(1)

    # Read the corpus: one JSON object per line, each expected to carry a
    # "TEXT" field with the document body.
    print("Reading data from: " + DATA_FILE)
    json_read_in_objects = []
    with open(DATA_FILE, "r") as f:
        for line in f:
            json_read_in_objects.append(json.loads(line))

    # Load the Pegasus summarization model and its tokenizer.
    model_name = "google/pegasus-xsum"
    pegasus_tokenizer = PegasusTokenizer.from_pretrained(model_name)
    pegasus_model = PegasusForConditionalGeneration.from_pretrained(model_name)

    # Summarize every document in order.
    json_write_out_objects = []
    for i, record in enumerate(json_read_in_objects):
        # Tokenize the text (truncated to the model's maximum input length).
        token = pegasus_tokenizer(record["TEXT"], return_tensors="pt", truncation=True)
        # Generate the summary token ids.
        encode_token = pegasus_model.generate(**token)
        # Decode the generated ids back into a plain-text summary.
        decode_token = pegasus_tokenizer.decode(encode_token[0], skip_special_tokens=True)
        print(i)  # progress indicator
        json_write_out_objects.append(decode_token)

    # Write one JSON object per line: {"DOCID": "<n>", "TEXT": "<summary>"}.
    # BUG FIX: build each line with json.dumps instead of manual string
    # concatenation -- the old code emitted invalid JSON whenever a summary
    # contained a double quote or backslash.  json.dumps' default separators
    # produce the same layout as the old hand-built format.
    with open(OUTPUT_FILE, "w") as f:
        for doc_id, summary in enumerate(json_write_out_objects, start=1):
            f.write(json.dumps({"DOCID": str(doc_id), "TEXT": summary}) + "\n")
            print("Current ", doc_id)