tokenize_policies.py
import stanza
import glob
import pathlib
import re

# Build a tokenize-only English pipeline, reusing models that have already been downloaded
nlp = stanza.Pipeline('en', download_method=stanza.DownloadMethod.REUSE_RESOURCES, processors='tokenize')


def get_sentences(text: str) -> list[str]:
    """Split a block of text into sentences using the stanza tokenizer."""
    sentences: list[str] = []
    processed_text: stanza.Document = nlp(text)
    for sentence in processed_text.sentences:
        sentences.append(sentence.text)
    return sentences
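
# Illustrative only (hypothetical input; exact splits depend on the stanza model):
#   get_sentences("We collect data. We never sell it.")
#   -> ["We collect data.", "We never sell it."]
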
for markdown_doc in glob.glob("converted_markdown/*.md"):
    # Calculate the target filename from the source filename
    target_doc = pathlib.Path.cwd() / "tokenized_markdown" / pathlib.Path(markdown_doc).name
    # Open and read the source file
    with open(markdown_doc) as raw_doc:
        policy_lines = raw_doc.readlines()
    line_count = len(policy_lines)
    # Precompile a regex we'll be using repeatedly:
    # the regex for a markdown link
    md_link_re = re.compile(r"\[(?P<link>.*?)\]\((?P<target>.*?)\)(?P<text>.*)")
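    # For example (hypothetical input), md_link_re.match(
    #     "[Privacy Policy](https://example.com/privacy). See section 2.")
    # captures link="Privacy Policy", target="https://example.com/privacy",
    # and text=". See section 2.".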
    processed_lines: list[str] = []
    for index, line in enumerate(policy_lines):
        print(f"Processing line {index + 1} of {line_count} from {markdown_doc}.")
        # Check for blank lines (lines with only '\n') and just add them to the processed_lines list
        # Skip the rest of the processing for these lines
        if len(line) == 1:
            processed_lines.append("")
            continue
        # Strip the trailing newline from the line
        if line[-1] == "\n":
            line = line[:-1]
        # Don't process section headers
        if line[0] == "#":
            processed_lines.append(line)
        # Don't process image links if they are on their own line
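        # (e.g. "![logo](images/logo.png)", or "![logo](images/logo.png){width=50%}"
        # with pandoc-style attributes - hypothetical examples)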
        elif line[0] == "!" and line[-1] in [")", "}"]:
            processed_lines.append(line)
        # Process lines starting with links "[" separately
        elif line[0] == "[":
            processed_lines.append(line)
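            # (e.g. a bare reference line such as "[Terms of Service](terms.md)" - hypothetical example)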
            # **This code isn't quite ready yet** - based on manual review, we can skip the links.
            # line_matches = md_link_re.match(line)
            # if line_matches is not None:
            #     first_line = ""
            #     matched_elements = line_matches.groupdict()
            #     if matched_elements["link"] != "":
            #         first_line += f"[{matched_elements['link']}]"
            #     if matched_elements["target"] != "":
            #         first_line += f"({matched_elements['target']})"
            #     if matched_elements["text"] != "":
            #         sentences = get_sentences(text=matched_elements["text"])
            #         # # Keep the link together with the first sentence of the trailing text
            #         first_line += sentences[0]
            #         processed_lines.append(first_line)
            #         # # Add the rest of the sentences immediately after
            #         processed_lines.extend(sentences[1:])
            #     else:
            #         processed_lines.append(first_line)
            # else:
            #     # # This line starts with '[' but is not a link - just copy it like a regular line
            #     processed_lines.extend(get_sentences(text=line))
        # If the line looks like a table
        elif line[0] == "|":
            # TODO: Implement HTML tables for this - it turns out there's nothing interesting in the tables, so just skipping
            processed_lines.append(line)
            # This doesn't quite work yet - needs to be implemented as an HTML table
            # # Split the line into columns
            # columns = line.split("|")
            # # For each column, split into sentences - reassemble with a sentence per line.
            # # Takes advantage of the fact that MD counts a single \n as equivalent to a space. * TODO: This doesn't work
            # for column in columns:
            #     sentences = get_sentences(text=column)
            #     # # split() strips the separator, so we'll need to add it back at the beginning of the first sentence
            #     if len(sentences) > 0:
            #         sentences[0] = "|" + sentences[0]
            #         processed_lines.extend(sentences)
            # # Add the terminating '|' at the end of the last column
            # processed_lines[-1] = processed_lines[-1] + "|"
        # Preserve empty lines - insert a blank string
        # (in practice this branch is unreachable: blank lines are handled at the
        # top of the loop and trailing newlines are stripped; kept as a safety net)
        elif line[0] == "\n":
            processed_lines.append("")
        # If we get here, we're probably dealing with a regular line of text
        else:
            processed_lines.extend(get_sentences(text=line))
    # Make sure the output directory exists, then write out the tokenized file, one string per line
    target_doc.parent.mkdir(parents=True, exist_ok=True)
    target_doc.write_text("\n".join(processed_lines), encoding="utf-8")
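
# Assumed directory layout (inferred from the paths above):
#   converted_markdown/  - source markdown files
#   tokenized_markdown/  - output files, one sentence per line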