-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path__main__.py
293 lines (261 loc) · 16.8 KB
/
__main__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
from confluence.api import ConfluenceAPI
from database.api import DatabaseAPI
from flatdict import FlatDict
import pandas as pd
import config_parser
import os
import re
import logging
import datetime
import argparse
# Application version string, reported by the --version CLI flag below.
VERSION = '1.3.4'
def _strip_list_indices(key):
    """Remove FlatDict list markers (':0', ':1', ...) from a flattened key.

    FlatDict encodes list elements by appending ':<index>' segments; stripping
    them stores every list element under the same base key.
    """
    # A single pass over ':<digits>' covers both trailing ':0' and embedded
    # ':0:' markers (the trailing ':' of an embedded marker survives, which
    # matches the net effect of the previous two-step substitution, whose
    # second step could never match and was dead code).
    return re.sub(r':\d+', '', key)


# noinspection PyTypeChecker,PyShadowingNames
def child_page_recursive(pages, space_id, parent_page_id, table_prefix, recheck_pages_meet_criteria=False,
                         config_modified=False):
    """Recursively inserts page information into the database after making requests to the Confluence API.

    Args:
        pages (dict): A dictionary of pages to crawl through, have a look at the example config for more information.
        space_id (int): The top level space_id that the information relates to.
        parent_page_id (int): The current pages parent page id.
        table_prefix (str): The current database table name prefix.
        recheck_pages_meet_criteria (bool): Ensures that all current pages meet the criteria set out in the config file.
            If this is False, it will assume that all pages in the database meet the criteria and will only take delta changes for these.
        config_modified (bool): Whether the config has been modified since last launch.
    """
    # If a child page has not been updated since we last stored its
    # information, there is no need to re-check labels/title.
    for page_type in pages:
        for page_identifier in pages[page_type]:
            # Create tables to store the pages in and the information they contain.
            # Table name = prefix (spaces stripped) + '_' + first 5 chars of the
            # identifier (underscores/spaces removed), lower-cased.
            table = table_prefix.replace(
                ' ', '') + '_' + page_identifier.replace('_', '').replace(' ', '')[:5].lower()
            DatabaseAPI.create_table(table)
            info_table = table + '__info'
            DatabaseAPI.create_table(info_table, True)
            try:
                child_pages = ConfluenceAPI.get_child_page_ids(parent_page_id)
            except Exception:
                # Narrowed from a bare except so Ctrl-C/SystemExit still abort.
                logger.warning(
                    'child_page_recursive: Unable to get child pages for: %s' % str(parent_page_id))
                continue
            for child_page_id in child_pages:
                # Decision tree to see if the current page meets the criteria provided in the config file.
                # If we are not forced to recheck, trust the rows already in the
                # database table; else check title/label criteria explicitly.
                page_meets_criteria = False
                if not recheck_pages_meet_criteria and not config_modified:
                    if DatabaseAPI.check_data_exists(table, parent_page_id, child_page_id):
                        # If the page already exists in the database ignore
                        # checking the page meets the criteria, unless forced to.
                        page_meets_criteria = True
                else:
                    if page_type == 'titles':
                        if child_pages[child_page_id]['name'] == page_identifier:
                            page_meets_criteria = True
                    elif page_type == 'labels':
                        try:
                            if page_identifier in ConfluenceAPI.get_page_labels(child_page_id):
                                # Check that the page meets the criteria given,
                                # i.e. it is labelled as something/title is something and needs to be updated.
                                page_meets_criteria = True
                        except Exception:
                            logger.warning(
                                'child_page_recursive: Unable to retrieve labels for: %s' % str(child_page_id))
                            continue
                if page_meets_criteria:
                    page_updated = DatabaseAPI.insert_or_update(
                        table, parent_page_id, child_page_id, child_pages[child_page_id]['name'],
                        child_pages[child_page_id]['last_updated'], True)
                    # If the current page information was updated since the last run,
                    # delete all children information and re-fill it.
                    page_content = ''
                    if page_updated or config_modified:
                        logger.info('Updating information in space %s for page: %s' % (
                            str(space_id), child_pages[child_page_id]['name']))
                        DatabaseAPI.delete(info_table, child_page_id)
                        try:
                            page_content = ConfluenceAPI.get_page_content(
                                child_page_id)
                        except Exception:
                            logger.warning(
                                'child_page_recursive: Unable to get page content for: %s' % str(child_page_id))
                            continue
                    for page_info_type in pages[page_type][page_identifier]:
                        if page_info_type == 'pages':
                            # Recurse into nested page definitions one level down.
                            child_page_recursive(pages[page_type][page_identifier][page_info_type], space_id,
                                                 child_page_id, table, recheck_pages_meet_criteria, config_modified)
                        else:
                            if page_updated or config_modified:
                                try:
                                    if page_info_type == 'panels':
                                        for panel_identifier in pages[page_type][page_identifier][page_info_type]:
                                            panel = FlatDict(ConfluenceAPI.get_panel(
                                                page_content, panel_identifier, space_id))
                                            for k, v in panel.items():
                                                # Remove FlatDict list numbers from the key.
                                                k = _strip_list_indices(k)
                                                DatabaseAPI.insert_or_update(
                                                    info_table, child_page_id, k, v,
                                                    child_pages[child_page_id]['last_updated'])
                                    elif page_info_type == 'page_properties':
                                        # Get all page properties and put the values into the database.
                                        page_properties = ConfluenceAPI.get_page_properties(
                                            child_page_id, space_id, pages[page_type][page_identifier][page_info_type])
                                        for page_property in page_properties:
                                            for val in page_properties[page_property]:
                                                DatabaseAPI.insert_or_update(
                                                    info_table, child_page_id, page_property, val,
                                                    child_pages[child_page_id]['last_updated'])
                                    elif page_info_type == 'headings':
                                        for heading_identifier in pages[page_type][page_identifier][page_info_type]:
                                            heading = FlatDict(ConfluenceAPI.get_heading(
                                                page_content, heading_identifier))
                                            for k, v in heading.items():
                                                # Remove FlatDict list numbers from the key.
                                                k = _strip_list_indices(k)
                                                DatabaseAPI.insert_or_update(
                                                    info_table, child_page_id, k, v,
                                                    child_pages[child_page_id]['last_updated'])
                                    elif page_info_type == 'page':
                                        page_information = FlatDict(ConfluenceAPI.get_page(
                                            page_content, child_pages[child_page_id]['name']))
                                        for k, v in page_information.items():
                                            # Remove FlatDict list numbers from the key.
                                            k = _strip_list_indices(k)
                                            DatabaseAPI.insert_or_update(
                                                info_table, child_page_id, k, v,
                                                child_pages[child_page_id]['last_updated'])
                                    elif page_info_type == 'url':
                                        for url_type in pages[page_type][page_identifier][page_info_type]:
                                            url = ConfluenceAPI.get_page_urls(
                                                child_page_id, url_type)
                                            DatabaseAPI.insert_or_update(
                                                info_table, child_page_id, url_type, url,
                                                child_pages[child_page_id]['last_updated'])
                                    else:
                                        logger.warning(
                                            'child_page_recursive: Unknown page information retrieval type: %s' % page_info_type)
                                except Exception:
                                    # logger.exception keeps the traceback that the
                                    # old logger.error call discarded.
                                    logger.exception(
                                        'child_page_recursive: Error inserting data for page with id: %s, name: %s' % (
                                            str(child_page_id), child_pages[child_page_id]['name']))
                else:
                    # Cleanup the ignore, info and default table by removing any information associated with page.
                    # Child pages get cleaned up by the cleanup method.
                    DatabaseAPI.delete(table, parent_page_id, child_page_id)
                    DatabaseAPI.delete(info_table, child_page_id)
def recursive_db_cleanup(pages, space_id, table_prefix, mode):
    """Recursively remove page information from the database by checking if the current page still exists in the database.

    Args:
        pages (dict): A dictionary of pages to crawl through, have a look at the example config for more information.
        space_id (int): The top level space_id that the information relates to.
        table_prefix (str): The current database table name prefix.
        mode (bool): Only perform cleanup during a full sync.
    """
    if not mode:
        return  # Cleanup is restricted to full-sync runs.
    stripped_prefix = table_prefix.replace(' ', '')
    for page_type, identifiers in pages.items():
        for page_identifier in identifiers:
            # Rebuild the table name the crawler used for this identifier.
            suffix = page_identifier.replace('_', '').replace(' ', '')[:5].lower()
            table = stripped_prefix + '_' + suffix
            info_table = table + '__info'
            # Check every stored child page: a row survives only while both
            # its parent row and the wiki page itself still exist.
            for row in DatabaseAPI.select(table):
                parent_exists = True
                if row['parent'] != space_id:
                    # A missing parent row means this child is orphaned.
                    parent_rows = DatabaseAPI.select(
                        stripped_prefix, None, row['parent'])
                    parent_exists = len(parent_rows) != 0
                exists = ConfluenceAPI.check_page_exists(row['key'])
                if not (parent_exists and exists):
                    # Drop the page row along with its info rows.
                    logger.info("recursive_db_cleanup: Deleting page with id: %s, Name: %s" % (
                        str(row['key']), row['value']))
                    DatabaseAPI.delete(table, row['parent'], row['key'])
                    DatabaseAPI.delete(info_table, row['key'])
            # Descend into any nested page definitions and clean those too.
            for info_type in identifiers[page_identifier]:
                if info_type == 'pages':
                    recursive_db_cleanup(
                        identifiers[page_identifier][info_type], space_id, table, mode)
def dump_application_inventory(mode):
    """Dump the full application inventory table to a timestamped CSV file.

    Args:
        mode (bool): Only dump during a full sync; no-op otherwise.
    """
    if not mode:
        return
    logger.info("dump_application_inventory: Creating CSV dump file.")
    dump = pd.DataFrame(DatabaseAPI.select('wiki_app_info_full'))
    dump_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            'application_inventory', 'dump')
    # Create the target directory if needed so to_csv does not fail on a
    # fresh checkout.
    os.makedirs(dump_dir, exist_ok=True)
    # Use '-' separators in the timestamp: ':' is invalid in Windows
    # file names (was '%H:%M:%S').
    stamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    # index=False (was index=None) to omit the DataFrame index column.
    dump.to_csv(os.path.join(dump_dir, stamp + '.csv'), index=False)
def run(conf, mode, conf_modified):
    """Synchronise every configured Confluence space into the database.

    Args:
        conf (dict): Parsed application configuration; reads
            conf['wiki']['spaces'] and conf['mysql']['table_prefix'].
        mode (bool): True for a full sync (recheck criteria and run DB
            cleanup), False for a half sync (delta updates only).
        conf_modified (bool): Whether the config file has changed since the
            last run; forces stored page information to be refreshed.
    """
    for space, value in conf['wiki']['spaces'].items():
        try:
            space_id = ConfluenceAPI.get_homepage_id_of_space(space)
            DatabaseAPI.update_spaces(
                space_id, space, ConfluenceAPI.get_last_update_time_of_content(space_id))
            child_page_recursive(value['pages'], space_id, space_id,
                                 conf['mysql']['table_prefix'], mode, conf_modified)
            recursive_db_cleanup(value['pages'], space_id,
                                 conf['mysql']['table_prefix'], mode)
            # dump_application_inventory(mode)
        except Exception:
            # Narrowed from a bare except so Ctrl-C/SystemExit still abort;
            # logger.exception records the traceback the old logger.error lost.
            logger.exception('run: Error retrieving information for space: %s' % space)
if __name__ == '__main__':
    # Argument parsing.
    parser = argparse.ArgumentParser(description='Conflex')
    # NOTE(review): --application-inventory is parsed but never referenced in
    # this block — presumably handled elsewhere or vestigial; verify.
    parser.add_argument('--application-inventory', action='store_true',
                        help='run the application inventory existing database update script.')
    # parser.add_argument('--bigquery', action='store_true', help='run the Google Big Query update application.')
    parser.add_argument('--config', action='store',
                        help='the location of the configuration file to run the application with.')
    # parser.add_argument('--datastore', action='store_true', help='run the Google DataStore update application.')
    parser.add_argument('--full-sync', action='store_true',
                        help='runs the application in full sync mode. i.e. pages are checked to ensure they meet the criteria in the config file.')
    parser.add_argument('--half-sync', action='store_true',
                        help='runs the application in half sync mode. i.e. no new pages will be added to the database.')
    parser.add_argument('--version', action='version',
                        version='Conflex Version: ' + VERSION)
    args = parser.parse_args()
    # Get configuration file: --config if given, else config.yaml next to
    # this script.
    config = config_parser.parse(args.config or os.path.abspath(
        os.path.join(os.path.dirname(__file__), 'config.yaml')))
    # Setup logging.
    # NOTE(review): no basicConfig()/handlers are set up here, so INFO
    # messages may be dropped unless config_parser configures logging —
    # confirm.
    logger = logging.getLogger(__name__)
    # Connect to database and setup tables. Order matters: connect before
    # creating tables, and both before the sync runs below.
    DatabaseAPI.connect(config)
    DatabaseAPI.create_spaces_table()
    DatabaseAPI.create_application_table()
    # Setup the confluence API.
    ConfluenceAPI.setup(config)
    # Store last config modified time in database and compare it with the
    # current file's mtime to detect config changes since the previous run.
    config_data = DatabaseAPI.update_conflex_application(
        'last_config_change', str(config['config_modified_time']))
    config_modified = False
    if config_data:
        if float(config_data['value']) != config['config_modified_time']:
            # The configuration has been modified since last time.
            logger.info('Configuration file has been updated!')
            config_modified = True
    # Run the datastore sync application.
    # if args.datastore:
    #     datastore.run(config)
    # Run the main application in the appropriate mode. If neither flag is
    # given, no sync runs; if both are given, both runs execute in turn.
    if args.full_sync:
        logger.info('Application starting at: %s, running in full sync mode.' %
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        run(config, True, config_modified)
    if args.half_sync:
        logger.info('Application starting at: %s, running in half sync mode.' %
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        run(config, False, config_modified)
    # Disconnect from the database.
    DatabaseAPI.disconnect()
    logger.info('Application finished updating at: %s' %
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))