forked from lancopku/AAPR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean_latex.py
139 lines (119 loc) · 4.06 KB
/
clean_latex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 29 18:23:37 2017
@author: ypc
"""
import re
import os
import codecs
import tarfile
import json as js
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# English stop-word list from NLTK, loaded once at import time; clean_str()
# drops every token that appears in it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# available locally -- confirm before running on a fresh machine.
english_stopwords = stopwords.words('english')
def clean_math(string):
    """
    Strip inline and display math from a piece of LaTeX text.

    Input:
        string: Text that may contain ``$...$`` or ``$$...$$`` formulas.
    Return:
        The text with every complete formula removed. When at least one
        formula was removed the result is also stripped of leading and
        trailing whitespace; otherwise the input is returned unchanged.
    """
    # Display math is tried first so that "$$x$$" is removed as one formula
    # instead of being read as two empty "$$" pairs with "x" left behind.
    # The look-behinds keep an escaped "\$" (a literal dollar sign) from
    # being mistaken for a math delimiter.
    pattern = (r'(?<!\\)\$\$.*?(?<!\\)\$\$'
               r'|(?<!\\)\$.*?(?<!\\)\$')
    cleaned, removed = re.subn(pattern, '', string, flags=re.DOTALL)
    return cleaned.strip() if removed else string
def clean_str(string):
    """
    Clean one line of a latex file.

    Input:
        string: One line in a latex file.
    Return:
        The line lower-cased, with formulas, citation/reference commands,
        English stopwords and punctuation characters removed; the remaining
        tokens are joined by spaces.
    """
    # Remove mathematical formulas between $$
    string = clean_math(string)
    # Remove "ref"-style commands. Each pattern stops at the FIRST closing
    # brace; a greedy ".*" would run to the last "}" on the line and delete
    # ordinary text lying between two commands.
    for pattern in (r'~[^}]*}',
                    r'\\cite[^}]*}',
                    r'\\newcite[^}]*}',
                    r'\\ref[^}]*}'):
        string = re.sub(pattern, '', string)
    # Remove stopwords
    tokens = [word.lower() for word in word_tokenize(string)]
    kept = [word for word in tokens if word not in english_stopwords]
    # Delete leftover punctuation / LaTeX markup characters in a single pass.
    punctuation = str.maketrans('', '', ',.?!/$~\\{}#&@%^*-=[]+()')
    return ' '.join(kept).translate(punctuation)
def process_text_list(text_list):
    """
    Clean the lines of a latex file and merge them into one string.

    Input:
        text_list: Content of a latex file and each element represents a line.
    Return:
        A string holding the cleaned text of every kept line, with the lines
        separated by single spaces.
    """
    cleaned = []
    for line in text_list:
        line = line.strip()
        # Skip blank lines, comments ("%"), commands ("\") and lines that
        # begin with a digit.
        if not line or line.startswith(('%', '\\')) or line[0].isdigit():
            continue
        text = clean_str(line)
        if text:
            cleaned.append(text)
    # Join with a space so the last word of one line does not fuse with the
    # first word of the next.
    return ' '.join(cleaned)
# Extract Introduction, related work, etc.================================================================
def split(tex_list, start_char, end_char):
    """
    Return the lines lying between a start marker and the next end marker.

    Input:
        tex_list: Sequence of lines.
        start_char: Prefix that marks the line just before the wanted span.
        end_char: Prefix that marks the first line after the wanted span.
    Return:
        The lines strictly after the first line starting with start_char, up
        to (not including) the next line starting with end_char, or up to the
        end of tex_list when no such line follows. An empty slice is
        returned when no line starts with start_char.
    """
    start = None
    for index, line in enumerate(tex_list):
        if start is None:
            if line.startswith(start_char):
                start = index + 1
        elif line.startswith(end_char):
            return tex_list[start:index]
    if start is None:
        # Marker never seen: the section is absent, so return nothing rather
        # than letting a [None:None] slice hand back the whole document.
        return tex_list[:0]
    return tex_list[start:]
def extract(tex_list, segment=False):
    """
    Split a latex document into four cleaned text sections.

    Input:
        tex_list: Lines of a latex file.
        segment: When true, nothing is produced and None is returned.
    Return:
        A list of four cleaned strings in the order
        [introduction, related work, methods, conclusion], where "methods" is
        whatever remains of the document after the other three section texts
        have been removed. None when segment is true.
    """
    text = ' '.join(tex_list)
    # Backslashes are written as "\\": a bare "\s" is an invalid escape
    # sequence that current Python versions warn about.
    intro = ' '.join(split(tex_list, '\\section{Intro', '\\section{'))
    related = ' '.join(split(tex_list, '\\section{Related', '\\section{'))
    conclusion = ' '.join(split(tex_list, '\\section{Conclu', '\\section{'))
    methods = text.replace(intro, '').replace(related, '').replace(conclusion, '')
    if segment:
        # No segmented output is produced; keep returning None explicitly.
        return None
    sections = (intro, related, methods, conclusion)
    return [process_text_list(section.split('\n')) for section in sections]
def main(file_dir):
    """
    Extract the cleaned sections of every entry in a directory.

    Input:
        file_dir: Directory whose entries are each handed to make_single_tex.
    Return:
        A dict mapping each entry name to the result of extract(); entries
        that raise while being processed are left out.
    """
    result = {}
    for file_name in os.listdir(file_dir):
        try:
            tex_list = make_single_tex(os.path.join(file_dir, file_name))
            result[file_name] = extract(tex_list)
        except Exception:
            # Best-effort: skip entries that cannot be processed, but let
            # KeyboardInterrupt / SystemExit propagate (a bare "except:"
            # would swallow those too).
            # NOTE(review): make_single_tex is not defined in the visible
            # source; if it is missing at runtime the NameError is skipped
            # here as well and the result stays empty -- confirm.
            continue
    return result