# pos_tagging.py
# Forked from ARBML/Taqyim.
import taqyim as tq
def map_fn(sample):
    """Convert one sample's tokens and UPOS label ids into text fields.

    Args:
        sample: Mapping with a "tokens" sequence and a parallel "upos"
            sequence of tag label ids. An id may be an integer or its
            decimal string form (``3`` and ``"3"`` are treated alike).

    Returns:
        The same ``sample`` object, updated in place:
        "upos" becomes a string holding one ``TOKEN:TAG`` line per kept
        token (every line, including the last, ends with a newline), and
        "tokens" becomes the kept tokens joined by single spaces.
        Tokens whose tag is ``_`` are dropped from both fields.

    Raises:
        KeyError: If a label id has no entry in the tag list.
    """
    # The position of a tag in this list is its label id.
    # NOTE(review): presumably mirrors the dataset's label order — verify.
    tags = [
        "NOUN", "PUNCT", "ADP", "NUM", "SYM", "SCONJ", "ADJ", "PART", "DET",
        "CCONJ", "PROPN", "PRON", "X", "_", "ADV", "INTJ", "VERB", "AUX",
    ]
    tag_labels_to_names = {str(label): name for label, name in enumerate(tags)}

    # Normalise every id with str() before the lookup: the table is keyed by
    # string, so a raw integer id would otherwise raise KeyError.
    mapped_tags = [tag_labels_to_names[str(id_)] for id_ in sample["upos"]]

    # Keep token/tag pairs together so both output fields drop the same items.
    kept = [
        (token, tag)
        for token, tag in zip(sample["tokens"], mapped_tags)
        if tag != "_"
    ]

    sample["upos"] = "".join(f"{token}:{tag}\n" for token, tag in kept)
    sample["tokens"] = " ".join(token for token, _ in kept)
    return sample
# Instruction text handed to the pipeline as its `prompt`. It is built from
# four lines joined by newlines, with no leading or trailing whitespace.
# The wording is runtime text; do not edit it for grammar or style.
prompt = "\n".join(
    (
        "I wish you can generate a table of Arabic POS tags following Universal Dependencies tagset in the following format:",
        "TOKEN:POS",
        "Please note that I tokenized the sentence for you. Please do not change, add, combine, merge or remove any of these tokens such as ب and ه. Please consider punctuation marks as separate tokens, always. Split them as two separate tokens if they come together and classify each of them independently.",
        "Please give me the generated table and that is it. No further discussion, explanation or extrapolation required.",
    )
)
# Configure the evaluation pipeline, run it, and print what it reports.
# NOTE(review): '<openai-key>' is a placeholder; supply a real key at run time
# and avoid committing it.
pipeline = tq.Pipeline(
    # Evaluation identity
    eval_name="padt-test",
    task_class="pos_tagging",
    task_desc="Arabic text PoS tagging",
    # Dataset selection and the columns read from each sample
    dataset_name="universal_dependencies",
    subset="ar_padt",
    train_split="train",
    test_split="test",
    input_column_name="tokens",
    target_column_name="upos",
    preprocessing_fn=map_fn,
    # Model and prompt
    model_name="gpt-3.5-turbo-0301",
    prompt=prompt,
    api_key="<openai-key>",
    # presumably caps how many samples are evaluated — TODO confirm
    max_samples=2,
)

pipeline.run()
results = pipeline.show_results()
print(results)