-
Notifications
You must be signed in to change notification settings - Fork 2
/
indexing.py
74 lines (60 loc) · 2.34 KB
/
indexing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
import json, TED, path
import xml.etree.ElementTree as ET
import collections
# In this file: Everything related to Indexing Tables
# Folder holding the XML corpus to index; a relative path, so it is resolved
# against the current working directory.
directory = 'Documents'
def process_XML(filename, indexing_table):
    """Add the terms of one XML document to a term -> filenames index.

    The document is read from ``directory``, parsed, passed through
    ``TED.preprocessing`` and then ``path.TC`` to obtain its terms
    (presumably term/context paths -- confirm against the ``path`` module).

    Args:
        filename: Name of the XML file inside ``directory``.
        indexing_table: Dict mapping each term to the list of filenames
            containing it. It is updated in place.

    Returns:
        A ``collections.OrderedDict`` copy of ``indexing_table`` with its
        entries sorted by term.
    """
    # The context manager guarantees the handle is closed; a bare open()
    # would leak one file descriptor per indexed document.
    with open(directory + "/" + filename, 'r') as doc:
        root = ET.parse(doc).getroot()
    tree = TED.preprocessing(root)
    terms = path.TC(tree)
    for term in terms:
        postings = indexing_table.setdefault(term, [])
        # List each document at most once per term.
        if filename not in postings:
            postings.append(filename)
    return collections.OrderedDict(sorted(indexing_table.items()))
def save_toJSON(indexing_table):
    """Write the indexing table to 'IndexingTable.json' as indented JSON.

    Args:
        indexing_table: JSON-serialisable mapping of terms to filename lists.
    """
    serialized = json.dumps(indexing_table, indent=2)
    with open('IndexingTable.json', 'w') as out_file:
        out_file.write(serialized)
def process_XML_TB(filename, indexing_table):
    """Add the tag-based terms of one XML document to a term -> filenames index.

    The document is read from ``directory``, parsed, passed through
    ``TED.preprocessing`` and then ``path.tag_based`` to obtain its terms.

    Args:
        filename: Name of the XML file inside ``directory``.
        indexing_table: Dict mapping each term to a list of filenames.
            It is updated in place.

    Returns:
        A ``collections.OrderedDict`` copy of ``indexing_table`` with its
        entries sorted by term.
    """
    # The context manager guarantees the handle is closed; a bare open()
    # would leak one file descriptor per indexed document.
    with open(directory + "/" + filename, 'r') as doc:
        root = ET.parse(doc).getroot()
    tree = TED.preprocessing(root)
    terms = path.tag_based(tree)
    for term in terms:
        # The filename is appended once per yielded term with no duplicate
        # check, so a term yielded several times lists the file several times.
        # NOTE(review): confirm whether this is intentional (e.g. frequency).
        indexing_table.setdefault(term, []).append(filename)
    return collections.OrderedDict(sorted(indexing_table.items()))
def save_toJSON_TB(indexing_table):
    """Write the tag-based indexing table to 'IndexingTableTags.json'.

    Args:
        indexing_table: JSON-serialisable mapping of terms to filename lists.
    """
    serialized = json.dumps(indexing_table, indent=2)
    with open('IndexingTableTags.json', 'w') as out_file:
        out_file.write(serialized)
# Add file to directory -> Update Indexing Table
def add(filename):
    """Index one more document and rewrite 'IndexingTable.json'.

    Loads the stored table, merges in the terms of ``filename`` through
    ``process_XML`` and saves the sorted result with ``save_toJSON``.

    Args:
        filename: Name of the XML file inside the corpus directory.
    """
    with open('IndexingTable.json', 'r') as index_file:
        current_table = json.load(index_file)
    save_toJSON(process_XML(filename, current_table))
# Delete file from directory -> Update Indexing Table
def delete(filename):
    """Remove a document from every entry of 'IndexingTable.json'.

    Loads the stored table, drops ``filename`` from each term's list and
    saves the table back with ``save_toJSON``. Terms whose list becomes
    empty are kept with an empty list.

    Args:
        filename: Name of the document to remove from the index.
    """
    with open('IndexingTable.json', 'r') as f:
        indexing_table = json.load(f)
    for postings in indexing_table.values():
        # Each value is a list of filenames, so it must be searched by
        # membership; indexing or pop() with a string raises TypeError.
        if filename in postings:
            postings[:] = [name for name in postings if name != filename]
    save_toJSON(indexing_table)
# Compute both indexing tables from the XML corpus available in the "Documents" directory
def compute_indexing_table():
    """Build both indexing tables from the corpus and save them as JSON.

    Every file in ``directory`` whose name ends in ".xml" or ".XML" is fed
    to ``process_XML`` and ``process_XML_TB``. The resulting tables are
    written with ``save_toJSON`` and ``save_toJSON_TB``.
    """
    indexing_table1 = {}
    indexing_table2 = {}
    # Start from empty results so that a corpus without any XML file still
    # produces valid (empty) index files instead of failing on unbound names.
    tr = collections.OrderedDict()
    tb = collections.OrderedDict()
    for filename in os.listdir(directory):
        if filename.endswith((".xml", ".XML")):
            # Both helpers accumulate into the dict they are given and return
            # a sorted copy of everything indexed so far.
            tr = process_XML(filename, indexing_table1)
            tb = process_XML_TB(filename, indexing_table2)
    save_toJSON(tr)
    save_toJSON_TB(tb)
# Runs at module top level, so both index files are rebuilt whenever this
# module is executed or imported.
compute_indexing_table()