This repository has been archived by the owner on Sep 28, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
consolidate.py
157 lines (121 loc) · 6.45 KB
/
consolidate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Script to take the output of take_inventory.py or extract_metadata.py (.csv files), and
# consolidate the entries by filename, making columns for the different terms involved (and count).
# The terms are loaded from terms.txt and are used to make the columns.
#
# The terms are converted into allowable Python identifiers, which must use letters, numbers, and underscores.
# This conversion is done for the sake of apply-filters.txt, in which you express filters using column names
# and those names have to be allowable identifiers. Thus we use the make_identifier function in utility.py
# to produce the column names.
#
# Other fields are left as-is.
#
# take_inventory.py invokes this script automatically at the end of its processing using the output from
# extract_metadata.py.
#
# Note that this script depends on the CSV file being sorted by filename, as take_inventory.py produces.
import sys
import json
from utilities import parse_config_arguments, make_identifier, TAGS, COLUMNS
def consolidate(config, input_file, output_file):
print("consolidate, INFO, Starting consolidation, {}".format(input_file))
prefix = input_file.split('_')[0].lower()
terms = None
for content_set in config["inventory"]:
if content_set["name"].lower() == prefix:
terms = content_set["terms"]
break
if terms is None:
print("consolidate, ERROR, Could not find terms for {}, {}".format(prefix, input_file))
sys.exit(1)
tags = list(TAGS.values())
with open(input_file, encoding='utf-8') as f_in:
import csv
reader = csv.reader(f_in)
headers = next(reader)
# We'll replace the "Term" column with individual terms; Tags is also expanded to the distinct
# classification tags. We also remove Line and Extract because they're no longer meaningful.
index_filename = headers.index(COLUMNS["file"])
index_term = headers.index(COLUMNS["term"])
index_tag = headers.index(COLUMNS["tag"])
index_line = headers.index(COLUMNS["line"])
index_extract = headers.index(COLUMNS["extract"])
headers.remove(COLUMNS["term"])
headers.remove(COLUMNS["tag"])
headers.remove(COLUMNS["line"])
headers.remove(COLUMNS["extract"])
# Insert columns for each of the terms, plus a "Term_Total" column. Also insert columns for each of
# the tags.
#
# NOTE: all of these columns should be named with valid Python identifiers, which the make_identifier
# function guarantees.
for i in range(0, len(terms)):
headers.insert(index_term + i, make_identifier(terms[i]))
headers.insert(index_term + i + 1, COLUMNS["term_total"])
index_tags_new = index_term + i + 2 # Index for inserting tabs after term counts
for i in range(0, len(tags)):
headers.insert(index_tags_new + i, make_identifier(tags[i]))
index_filename_count = len(tags) - 1
with open(output_file, 'w', encoding='utf-8', newline='') as f_out:
writer = csv.writer(f_out)
writer.writerow(headers)
term_counts = [0] * len(terms)
tag_counts = [0] * len(tags)
try:
current_row = next(reader)
except:
current_row = None
while current_row is not None:
if current_row == None:
break
next_row = next(reader, None) # Assigns None if we're at the end
term = current_row[index_term]
term_counts[terms.index(term)] += 1
tag = current_row[index_tag]
tag_counts[tags.index(tag)] += 1
# If the filename in the next row is the same, continue with the next line
if next_row != None and current_row[1] == next_row[1]:
current_row = next_row
continue
# Otherwise, write the term count columns. But first, remove the Extract, Line, Tag, and Term columns,
# which are no longer relevant. We do this in reverse order because we're using indices. Note that
# this does assume the ordering generated by take_inventory.py and extract_metadata.py.
current_row.pop(index_extract)
current_row.pop(index_line)
current_row.pop(index_tag)
current_row.pop(index_term)
total = 0
for i in range(0, len(terms)):
current_row.insert(index_term + i, term_counts[i])
total += term_counts[i]
# Add one more row with the total count of terms, which accommodates sorting
current_row.insert(index_term + i + 1, total)
# Add the tag count columns after patching up the in_filename count,
# which we have to do separately.
# Add the filename occurrence count
filename = current_row[index_filename].lower()
filename_total = sum(filename.count(term.lower()) for term in terms)
tag_counts[index_filename_count] = filename_total
for i in range(0, len(tags)):
current_row.insert(index_tags_new + i, tag_counts[i])
# Write the row
writer.writerow(current_row)
# Reset state for the next round, including the counts
current_row = next_row
term_counts = [0] * len(terms)
tag_counts = [0] * len(tags)
print("consolidate, INFO, Consolidation complete, ,")
if __name__ == "__main__":
config_file, args = parse_config_arguments(sys.argv[1:])
if config_file == None:
print("Usage: python consolidate.py --config=<config_file> <input_csv_file.csv>")
print("<input_csv_file.csv> is the output from take_inventory.py or extract_metadata.py and should be sorted by filename.")
print("<config_file> should be the same one given to take_inventory.py.")
sys.exit(2)
# Making the output filename assumes the input filename has only one .
input_file = args[0]
elements = input_file.split('.')
output_file = elements[0] + '-consolidated.' + elements[1]
config = None
with open(config_file) as config_file:
config = json.load(config_file)
consolidate(config, input_file, output_file)