Skip to content

Commit

Permalink
Fixes bug 1362048 - Redacted away StackTraces from raw_crash in ES. (#…
Browse files Browse the repository at this point in the history
…3771)

* Fixes bug 1362048 - Redacted away StackTraces from raw_crash in ES.
The raw_crash nowadays contain a key called "StackTraces" that contains a pretty big string. Storing it in Elasticsearch has a pretty high cost in terms of disk space and memory. This thus adds a new redactor, specifically made for the raw_crash, that we want to use every time we send a crash to Elasticsearch. .

* Better variable usage.
  • Loading branch information
adngdb authored May 16, 2017
1 parent 3e61f2c commit 2faa9ae
Show file tree
Hide file tree
Showing 4 changed files with 183 additions and 34 deletions.
15 changes: 11 additions & 4 deletions socorro/external/crashstorage_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,7 @@ def save_raw_and_processed(self, raw_crash, dump, processed_crash,
# processed crash which is a SocorroDotDict into a pure python
# dict which we can more easily copy.deepcopy() operate on.
processed_crash_as_dict = socorrodotdict_to_dict(processed_crash)
raw_crash_as_dict = socorrodotdict_to_dict(raw_crash)

for a_store in self.stores.itervalues():
self.quit_check()
Expand All @@ -601,14 +602,20 @@ def save_raw_and_processed(self, raw_crash, dump, processed_crash,
# you can't deepcopy those, so we deepcopy the
# pure dict version and then dress it back up as a
# DotDict.
crash = SocorroDotDict(copy.deepcopy(processed_crash_as_dict))
my_processed_crash = SocorroDotDict(
copy.deepcopy(processed_crash_as_dict)
)
my_raw_crash = SocorroDotDict(
copy.deepcopy(raw_crash_as_dict)
)
else:
crash = processed_crash
my_processed_crash = processed_crash
my_raw_crash = raw_crash

a_store.save_raw_and_processed(
raw_crash,
my_raw_crash,
dump,
crash,
my_processed_crash,
crash_id
)
except Exception:
Expand Down
41 changes: 36 additions & 5 deletions socorro/external/es/crashstorage.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,27 @@
from socorro.external.crashstorage_base import Redactor


class RawCrashRedactor(Redactor):
"""Remove some specific keys from a dict. The dict is modified.
This is a special Redactor used on raw crashes before we send them
to our Elasticsearch database. It is used to remove fields that we don't
need to store, in order mostly to save some disk space and memory.
Not that this overwrites the list of forbidden_keys that would be defined
through configuration. That list is hard-coded in the __init__ function.
"""
def __init__(self, config):
super(RawCrashRedactor, self).__init__(config)

# Overwrite the list of fields to redact away.
self.forbidden_keys = [
'StackTraces',
]


class ESCrashStorage(CrashStorageBase):
"""This sends processed crash reports to Elasticsearch."""
"""This sends raw and processed crash reports to Elasticsearch."""

required_config = Namespace()
required_config.add_option(
Expand Down Expand Up @@ -236,24 +255,40 @@ class ESCrashStorageRedactedSave(ESCrashStorage):
"upload_file_minidump_browser.json_dump"
)

required_config.namespace('raw_crash_es_redactor')
required_config.raw_crash_es_redactor.add_option(
name="redactor_class",
doc="the redactor class to use on the raw_crash",
default='socorro.external.es.crashstorage.RawCrashRedactor',
from_string_converter=class_converter,
)

def __init__(self, config, quit_check_callback=None):
super(ESCrashStorageRedactedSave, self).__init__(
config,
quit_check_callback
)
self.redactor = config.es_redactor.redactor_class(config.es_redactor)
self.raw_crash_redactor = config.raw_crash_es_redactor.redactor_class(
config.raw_crash_es_redactor
)
self.config.logger.warning(
"Beware, this crashstorage class is destructive to the "
"processed crash - if you're using a polycrashstore you may "
"find the modified processed crash saved to the other crashstores."
)

def is_mutator(self):
# This crash storage mutates the crash, so we mark it as such.
return True

def save_raw_and_processed(self, raw_crash, dumps, processed_crash,
crash_id):
"""This is the only write mechanism that is actually employed in normal
usage.
"""
self.redactor.redact(processed_crash)
self.raw_crash_redactor.redact(raw_crash)

super(ESCrashStorageRedactedSave, self).save_raw_and_processed(
raw_crash,
Expand Down Expand Up @@ -298,10 +333,6 @@ class ESCrashStorageRedactedJsonDump(ESCrashStorageRedactedSave):
)
)

def is_mutator(self):
# This crash storage mutates the crash, so we mark it as such.
return True

def save_raw_and_processed(self, raw_crash, dumps, processed_crash,
crash_id):
"""This is the only write mechanism that is actually employed in normal
Expand Down
57 changes: 35 additions & 22 deletions socorro/unittest/external/es/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -791,7 +791,39 @@ def get(self, **kwargs):
return super(SuperSearchWithFields, self).get(**kwargs)


class ElasticsearchTestCase(TestCase):
class TestCaseWithConfig(TestCase):
"""A simple TestCase class that can create configuration objects.
"""

def get_tuned_config(self, sources, extra_values=None):
if not isinstance(sources, (list, tuple)):
sources = [sources]

mock_logging = mock.Mock()

config_definitions = []
for source in sources:
conf = source.get_required_config()
conf.add_option('logger', default=mock_logging)
config_definitions.append(conf)

values_source = {'logger': mock_logging}
if extra_values:
values_source.update(extra_values)

config_manager = ConfigurationManager(
config_definitions,
app_name='testapp',
app_version='1.0',
app_description='Elasticsearch integration tests',
values_source_list=[environment, values_source],
argv_source=[],
)

return config_manager.get_config()


class ElasticsearchTestCase(TestCaseWithConfig):
"""Base class for Elastic Search related unit tests. """

def __init__(self, *args, **kwargs):
Expand Down Expand Up @@ -842,33 +874,14 @@ def get_url(self):
return self.config.elasticsearch.elasticsearch_urls[0]

def get_tuned_config(self, sources, extra_values=None):
if not isinstance(sources, (list, tuple)):
sources = [sources]

mock_logging = mock.Mock()

config_definitions = []
for source in sources:
conf = source.get_required_config()
conf.add_option('logger', default=mock_logging)
config_definitions.append(conf)

values_source = DEFAULT_VALUES.copy()
values_source.update({'logger': mock_logging})
if extra_values:
values_source.update(extra_values)

config_manager = ConfigurationManager(
config_definitions,
app_name='testapp',
app_version='1.0',
app_description='Elasticsearch integration tests',
values_source_list=[environment, values_source],
argv_source=[],
return super(ElasticsearchTestCase, self).get_tuned_config(
sources, values_source
)

return config_manager.get_config()

def get_base_config(self, es_index=None):
extra_values = None
if es_index:
Expand Down
104 changes: 101 additions & 3 deletions socorro/unittest/external/es/test_crashstorage.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import mock
import elasticsearch
import mock

from nose.tools import eq_, ok_, assert_raises

Expand All @@ -16,10 +16,14 @@
ESCrashStorage,
ESCrashStorageRedactedSave,
ESCrashStorageRedactedJsonDump,
ESBulkCrashStorage
ESBulkCrashStorage,
RawCrashRedactor,
)
from socorro.unittest.external.es.base import ElasticsearchTestCase
from socorro.lib.datetimeutil import string_to_datetime
from socorro.unittest.external.es.base import (
ElasticsearchTestCase,
TestCaseWithConfig,
)


# Uncomment these lines to decrease verbosity of the elasticsearch library
Expand Down Expand Up @@ -131,6 +135,36 @@
}


class TestRawCrashRedactor(TestCaseWithConfig):
"""Test the custom RawCrashRedactor class does indeed redact crashes.
"""

def __init__(self, *args, **kwargs):
super(TestRawCrashRedactor, self).__init__(*args, **kwargs)
self.config = self.get_tuned_config(RawCrashRedactor)

def test_redact_raw_crash(self):
redactor = RawCrashRedactor(self.config)
crash = {
'Key1': 'value',
'Key2': [12, 23, 34],
'StackTraces': 'foo:bar',
'Key3': {
'a': 1,
},
}
expected_crash = {
'Key1': 'value',
'Key2': [12, 23, 34],
'Key3': {
'a': 1,
},
}

redactor.redact(crash)
eq_(crash, expected_crash)


class IntegrationTestESCrashStorage(ElasticsearchTestCase):
"""These tests interact with Elasticsearch (or some other external
resource).
Expand Down Expand Up @@ -377,6 +411,9 @@ def test_success_with_no_stackwalker_class(self, espy_mock):
"upload_file_minidump_flash2.json_dump, "
"upload_file_minidump_browser.json_dump"
)
modified_config.raw_crash_es_redactor = DotDict()
modified_config.raw_crash_es_redactor.redactor_class = RawCrashRedactor
modified_config.raw_crash_es_redactor.forbidden_keys = 'unsused'

# It's mocks all the way down.
sub_mock = mock.MagicMock()
Expand Down Expand Up @@ -439,6 +476,9 @@ def test_success_with_limited_json_dump_class(self, espy_mock):
"upload_file_minidump_flash2.json_dump, "
"upload_file_minidump_browser.json_dump"
)
modified_config.raw_crash_es_redactor = DotDict()
modified_config.raw_crash_es_redactor.redactor_class = RawCrashRedactor
modified_config.raw_crash_es_redactor.forbidden_keys = 'unsused'

# It's mocks all the way down.
sub_mock = mock.MagicMock()
Expand Down Expand Up @@ -491,6 +531,64 @@ def test_success_with_limited_json_dump_class(self, espy_mock):
**additional
)

@mock.patch('socorro.external.es.connection_context.elasticsearch')
def test_success_with_redacted_raw_crash(self, espy_mock):
"""Test a successful index of a crash report.
"""
modified_config = deepcopy(self.config)
modified_config.es_redactor = DotDict()
modified_config.es_redactor.redactor_class = Redactor
modified_config.es_redactor.forbidden_keys = 'unsused'
modified_config.raw_crash_es_redactor = DotDict()
modified_config.raw_crash_es_redactor.redactor_class = RawCrashRedactor
modified_config.raw_crash_es_redactor.forbidden_keys = 'unsused'

# It's mocks all the way down.
sub_mock = mock.MagicMock()
espy_mock.Elasticsearch.return_value = sub_mock

es_storage = ESCrashStorageRedactedSave(config=modified_config)

crash_id = a_processed_crash['uuid']

# Add a 'StackTraces' field to be redacted.
raw_crash = deepcopy(a_raw_crash)
raw_crash['StackTraces'] = 'something'

# Submit a crash like normal, except that the back-end ES object is
# mocked (see the decorator above).
es_storage.save_raw_and_processed(
raw_crash=raw_crash,
dumps=None,
processed_crash=deepcopy(a_processed_crash),
crash_id=crash_id,
)

# Ensure that the ES objects were instantiated by ConnectionContext.
ok_(espy_mock.Elasticsearch.called)

# Ensure that the IndicesClient was also instantiated (this happens in
# IndexCreator but is part of the crashstorage workflow).
ok_(espy_mock.client.IndicesClient.called)

# The actual call to index the document (crash).
document = {
'crash_id': crash_id,
'processed_crash': a_processed_crash,
'raw_crash': a_raw_crash # Note here we expect the original dict
}

additional = {
'doc_type': 'crash_reports',
'id': crash_id,
'index': 'socorro_integration_test_reports'
}

sub_mock.index.assert_called_with(
body=document,
**additional
)

@mock.patch('socorro.external.es.connection_context.elasticsearch')
def test_fatal_failure(self, espy_mock):
"""Test an index attempt that fails catastrophically.
Expand Down

0 comments on commit 2faa9ae

Please sign in to comment.