From 2faa9aed3ba4ee747e843d12439ea7b7c7e259ff Mon Sep 17 00:00:00 2001 From: Adrian Gaudebert Date: Tue, 16 May 2017 14:52:42 +0200 Subject: [PATCH] Fixes bug 1362048 - Redacted away StackTraces from raw_crash in ES. (#3771) * Fixes bug 1362048 - Redacted away StackTraces from raw_crash in ES. The raw_crash nowadays contain a key called "StackTraces" that contains a pretty big string. Storing it in Elasticsearch has a pretty high cost in terms of disk space and memory. This thus adds a new redactor, specifically made for the raw_crash, that we want to use every time we send a crash to Elasticsearch. . * Better variable usage. --- socorro/external/crashstorage_base.py | 15 ++- socorro/external/es/crashstorage.py | 41 ++++++- socorro/unittest/external/es/base.py | 57 ++++++---- .../unittest/external/es/test_crashstorage.py | 104 +++++++++++++++++- 4 files changed, 183 insertions(+), 34 deletions(-) diff --git a/socorro/external/crashstorage_base.py b/socorro/external/crashstorage_base.py index ce8c75beb9..8971428949 100644 --- a/socorro/external/crashstorage_base.py +++ b/socorro/external/crashstorage_base.py @@ -589,6 +589,7 @@ def save_raw_and_processed(self, raw_crash, dump, processed_crash, # processed crash which is a SocorroDotDict into a pure python # dict which we can more easily copy.deepcopy() operate on. processed_crash_as_dict = socorrodotdict_to_dict(processed_crash) + raw_crash_as_dict = socorrodotdict_to_dict(raw_crash) for a_store in self.stores.itervalues(): self.quit_check() @@ -601,14 +602,20 @@ def save_raw_and_processed(self, raw_crash, dump, processed_crash, # you can't deepcopy those, so we deepcopy the # pure dict version and then dress it back up as a # DotDict. - crash = SocorroDotDict(copy.deepcopy(processed_crash_as_dict)) + my_processed_crash = SocorroDotDict( + copy.deepcopy(processed_crash_as_dict) + ) + my_raw_crash = SocorroDotDict( + copy.deepcopy(raw_crash_as_dict) + ) else: - crash = processed_crash + my_processed_crash = processed_crash + my_raw_crash = raw_crash a_store.save_raw_and_processed( - raw_crash, + my_raw_crash, dump, - crash, + my_processed_crash, crash_id ) except Exception: diff --git a/socorro/external/es/crashstorage.py b/socorro/external/es/crashstorage.py index 4247825675..6f48cda271 100644 --- a/socorro/external/es/crashstorage.py +++ b/socorro/external/es/crashstorage.py @@ -17,8 +17,27 @@ from socorro.external.crashstorage_base import Redactor +class RawCrashRedactor(Redactor): + """Remove some specific keys from a dict. The dict is modified. + + This is a special Redactor used on raw crashes before we send them + to our Elasticsearch database. It is used to remove fields that we don't + need to store, in order mostly to save some disk space and memory. + + Not that this overwrites the list of forbidden_keys that would be defined + through configuration. That list is hard-coded in the __init__ function. + """ + def __init__(self, config): + super(RawCrashRedactor, self).__init__(config) + + # Overwrite the list of fields to redact away. + self.forbidden_keys = [ + 'StackTraces', + ] + + class ESCrashStorage(CrashStorageBase): - """This sends processed crash reports to Elasticsearch.""" + """This sends raw and processed crash reports to Elasticsearch.""" required_config = Namespace() required_config.add_option( @@ -236,24 +255,40 @@ class ESCrashStorageRedactedSave(ESCrashStorage): "upload_file_minidump_browser.json_dump" ) + required_config.namespace('raw_crash_es_redactor') + required_config.raw_crash_es_redactor.add_option( + name="redactor_class", + doc="the redactor class to use on the raw_crash", + default='socorro.external.es.crashstorage.RawCrashRedactor', + from_string_converter=class_converter, + ) + def __init__(self, config, quit_check_callback=None): super(ESCrashStorageRedactedSave, self).__init__( config, quit_check_callback ) self.redactor = config.es_redactor.redactor_class(config.es_redactor) + self.raw_crash_redactor = config.raw_crash_es_redactor.redactor_class( + config.raw_crash_es_redactor + ) self.config.logger.warning( "Beware, this crashstorage class is destructive to the " "processed crash - if you're using a polycrashstore you may " "find the modified processed crash saved to the other crashstores." ) + def is_mutator(self): + # This crash storage mutates the crash, so we mark it as such. + return True + def save_raw_and_processed(self, raw_crash, dumps, processed_crash, crash_id): """This is the only write mechanism that is actually employed in normal usage. """ self.redactor.redact(processed_crash) + self.raw_crash_redactor.redact(raw_crash) super(ESCrashStorageRedactedSave, self).save_raw_and_processed( raw_crash, @@ -298,10 +333,6 @@ class ESCrashStorageRedactedJsonDump(ESCrashStorageRedactedSave): ) ) - def is_mutator(self): - # This crash storage mutates the crash, so we mark it as such. - return True - def save_raw_and_processed(self, raw_crash, dumps, processed_crash, crash_id): """This is the only write mechanism that is actually employed in normal diff --git a/socorro/unittest/external/es/base.py b/socorro/unittest/external/es/base.py index 98bfe09b89..7f5f981fd9 100644 --- a/socorro/unittest/external/es/base.py +++ b/socorro/unittest/external/es/base.py @@ -791,7 +791,39 @@ def get(self, **kwargs): return super(SuperSearchWithFields, self).get(**kwargs) -class ElasticsearchTestCase(TestCase): +class TestCaseWithConfig(TestCase): + """A simple TestCase class that can create configuration objects. + """ + + def get_tuned_config(self, sources, extra_values=None): + if not isinstance(sources, (list, tuple)): + sources = [sources] + + mock_logging = mock.Mock() + + config_definitions = [] + for source in sources: + conf = source.get_required_config() + conf.add_option('logger', default=mock_logging) + config_definitions.append(conf) + + values_source = {'logger': mock_logging} + if extra_values: + values_source.update(extra_values) + + config_manager = ConfigurationManager( + config_definitions, + app_name='testapp', + app_version='1.0', + app_description='Elasticsearch integration tests', + values_source_list=[environment, values_source], + argv_source=[], + ) + + return config_manager.get_config() + + +class ElasticsearchTestCase(TestCaseWithConfig): """Base class for Elastic Search related unit tests. """ def __init__(self, *args, **kwargs): @@ -842,33 +874,14 @@ def get_url(self): return self.config.elasticsearch.elasticsearch_urls[0] def get_tuned_config(self, sources, extra_values=None): - if not isinstance(sources, (list, tuple)): - sources = [sources] - - mock_logging = mock.Mock() - - config_definitions = [] - for source in sources: - conf = source.get_required_config() - conf.add_option('logger', default=mock_logging) - config_definitions.append(conf) - values_source = DEFAULT_VALUES.copy() - values_source.update({'logger': mock_logging}) if extra_values: values_source.update(extra_values) - config_manager = ConfigurationManager( - config_definitions, - app_name='testapp', - app_version='1.0', - app_description='Elasticsearch integration tests', - values_source_list=[environment, values_source], - argv_source=[], + return super(ElasticsearchTestCase, self).get_tuned_config( + sources, values_source ) - return config_manager.get_config() - def get_base_config(self, es_index=None): extra_values = None if es_index: diff --git a/socorro/unittest/external/es/test_crashstorage.py b/socorro/unittest/external/es/test_crashstorage.py index 1f38f35abe..4506aebe9f 100644 --- a/socorro/unittest/external/es/test_crashstorage.py +++ b/socorro/unittest/external/es/test_crashstorage.py @@ -2,8 +2,8 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. -import mock import elasticsearch +import mock from nose.tools import eq_, ok_, assert_raises @@ -16,10 +16,14 @@ ESCrashStorage, ESCrashStorageRedactedSave, ESCrashStorageRedactedJsonDump, - ESBulkCrashStorage + ESBulkCrashStorage, + RawCrashRedactor, ) -from socorro.unittest.external.es.base import ElasticsearchTestCase from socorro.lib.datetimeutil import string_to_datetime +from socorro.unittest.external.es.base import ( + ElasticsearchTestCase, + TestCaseWithConfig, +) # Uncomment these lines to decrease verbosity of the elasticsearch library @@ -131,6 +135,36 @@ } +class TestRawCrashRedactor(TestCaseWithConfig): + """Test the custom RawCrashRedactor class does indeed redact crashes. + """ + + def __init__(self, *args, **kwargs): + super(TestRawCrashRedactor, self).__init__(*args, **kwargs) + self.config = self.get_tuned_config(RawCrashRedactor) + + def test_redact_raw_crash(self): + redactor = RawCrashRedactor(self.config) + crash = { + 'Key1': 'value', + 'Key2': [12, 23, 34], + 'StackTraces': 'foo:bar', + 'Key3': { + 'a': 1, + }, + } + expected_crash = { + 'Key1': 'value', + 'Key2': [12, 23, 34], + 'Key3': { + 'a': 1, + }, + } + + redactor.redact(crash) + eq_(crash, expected_crash) + + class IntegrationTestESCrashStorage(ElasticsearchTestCase): """These tests interact with Elasticsearch (or some other external resource). @@ -377,6 +411,9 @@ def test_success_with_no_stackwalker_class(self, espy_mock): "upload_file_minidump_flash2.json_dump, " "upload_file_minidump_browser.json_dump" ) + modified_config.raw_crash_es_redactor = DotDict() + modified_config.raw_crash_es_redactor.redactor_class = RawCrashRedactor + modified_config.raw_crash_es_redactor.forbidden_keys = 'unsused' # It's mocks all the way down. sub_mock = mock.MagicMock() @@ -439,6 +476,9 @@ def test_success_with_limited_json_dump_class(self, espy_mock): "upload_file_minidump_flash2.json_dump, " "upload_file_minidump_browser.json_dump" ) + modified_config.raw_crash_es_redactor = DotDict() + modified_config.raw_crash_es_redactor.redactor_class = RawCrashRedactor + modified_config.raw_crash_es_redactor.forbidden_keys = 'unsused' # It's mocks all the way down. sub_mock = mock.MagicMock() @@ -491,6 +531,64 @@ def test_success_with_limited_json_dump_class(self, espy_mock): **additional ) + @mock.patch('socorro.external.es.connection_context.elasticsearch') + def test_success_with_redacted_raw_crash(self, espy_mock): + """Test a successful index of a crash report. + """ + modified_config = deepcopy(self.config) + modified_config.es_redactor = DotDict() + modified_config.es_redactor.redactor_class = Redactor + modified_config.es_redactor.forbidden_keys = 'unsused' + modified_config.raw_crash_es_redactor = DotDict() + modified_config.raw_crash_es_redactor.redactor_class = RawCrashRedactor + modified_config.raw_crash_es_redactor.forbidden_keys = 'unsused' + + # It's mocks all the way down. + sub_mock = mock.MagicMock() + espy_mock.Elasticsearch.return_value = sub_mock + + es_storage = ESCrashStorageRedactedSave(config=modified_config) + + crash_id = a_processed_crash['uuid'] + + # Add a 'StackTraces' field to be redacted. + raw_crash = deepcopy(a_raw_crash) + raw_crash['StackTraces'] = 'something' + + # Submit a crash like normal, except that the back-end ES object is + # mocked (see the decorator above). + es_storage.save_raw_and_processed( + raw_crash=raw_crash, + dumps=None, + processed_crash=deepcopy(a_processed_crash), + crash_id=crash_id, + ) + + # Ensure that the ES objects were instantiated by ConnectionContext. + ok_(espy_mock.Elasticsearch.called) + + # Ensure that the IndicesClient was also instantiated (this happens in + # IndexCreator but is part of the crashstorage workflow). + ok_(espy_mock.client.IndicesClient.called) + + # The actual call to index the document (crash). + document = { + 'crash_id': crash_id, + 'processed_crash': a_processed_crash, + 'raw_crash': a_raw_crash # Note here we expect the original dict + } + + additional = { + 'doc_type': 'crash_reports', + 'id': crash_id, + 'index': 'socorro_integration_test_reports' + } + + sub_mock.index.assert_called_with( + body=document, + **additional + ) + @mock.patch('socorro.external.es.connection_context.elasticsearch') def test_fatal_failure(self, espy_mock): """Test an index attempt that fails catastrophically.