diff --git a/pyhindsight/analysis.py b/pyhindsight/analysis.py index 8ec0e1e..b395882 100644 --- a/pyhindsight/analysis.py +++ b/pyhindsight/analysis.py @@ -905,8 +905,8 @@ def generate_excel(self, output_object): s = workbook.add_worksheet('Storage') # Title bar s.merge_range('A1:G1', f'Hindsight Internet History Forensics (v{__version__})', title_header_format) - s.merge_range('H1:J1', 'Backing Database Specific', center_header_format) - s.merge_range('K1:M1', 'FileSystem Specific', center_header_format) + s.merge_range('H1:K1', 'Backing Database Specific', center_header_format) + s.merge_range('L1:N1', 'FileSystem Specific', center_header_format) # Write column headers s.write(1, 0, 'Type', header_format) @@ -917,11 +917,12 @@ def generate_excel(self, output_object): s.write(1, 5, 'Interpretation', header_format) s.write(1, 6, 'Profile', header_format) s.write(1, 7, 'Source Path', header_format) - s.write(1, 8, 'Sequence', header_format) - s.write(1, 9, 'State', header_format) - s.write(1, 10, 'File Exists?', header_format) - s.write(1, 11, 'File Size (bytes)', header_format) - s.write(1, 12, 'File Type (Confidence %)', header_format) + s.write(1, 8, 'Database', header_format) + s.write(1, 9, 'Sequence', header_format) + s.write(1, 10, 'State', header_format) + s.write(1, 11, 'File Exists?', header_format) + s.write(1, 12, 'File Size (bytes)', header_format) + s.write(1, 13, 'File Type (Confidence %)', header_format) # Set column widths s.set_column('A:A', 16) # Type @@ -932,11 +933,12 @@ def generate_excel(self, output_object): s.set_column('F:F', 50) # Interpretation s.set_column('G:G', 50) # Profile s.set_column('H:H', 50) # Source Path - s.set_column('I:I', 8) # Seq - s.set_column('J:J', 8) # State - s.set_column('K:K', 8) # Exists - s.set_column('L:L', 16) # Size - s.set_column('M:M', 25) # Type + s.set_column('I:I', 16) # Database + s.set_column('J:J', 8) # Seq + s.set_column('K:K', 8) # State + s.set_column('L:L', 8) # Exists + s.set_column('M:M', 16) # Size + s.set_column('N:N', 25) # Type # Start at the row after the headers, and begin writing out the items in parsed_artifacts row_number = 2 @@ -951,11 +953,11 @@ def generate_excel(self, output_object): s.write(row_number, 5, item.interpretation, black_value_format) s.write(row_number, 6, item.profile, black_value_format) s.write(row_number, 7, item.source_path, black_value_format) - s.write_number(row_number, 8, item.seq, black_value_format) - s.write_string(row_number, 9, item.state, black_value_format) - s.write(row_number, 10, item.file_exists, black_value_format) - s.write(row_number, 11, item.file_size, black_value_format) - s.write(row_number, 12, item.magic_results, black_value_format) + s.write_number(row_number, 9, item.seq, black_value_format) + s.write_string(row_number, 10, item.state, black_value_format) + s.write(row_number, 11, item.file_exists, black_value_format) + s.write(row_number, 12, item.file_size, black_value_format) + s.write(row_number, 13, item.magic_results, black_value_format) elif item.row_type.startswith(("local storage", "session storage")): s.write_string(row_number, 0, item.row_type, black_type_format) @@ -966,9 +968,19 @@ def generate_excel(self, output_object): s.write(row_number, 5, item.interpretation, black_value_format) s.write(row_number, 6, item.profile, black_value_format) s.write(row_number, 7, item.source_path, black_value_format) - s.write_number(row_number, 8, item.seq, black_value_format) - s.write_string(row_number, 9, item.state, black_value_format) + s.write_number(row_number, 
9, item.seq, black_value_format) + s.write_string(row_number, 10, item.state, black_value_format) + elif item.row_type.startswith("indexeddb"): + s.write_string(row_number, 0, item.row_type, black_type_format) + s.write_string(row_number, 1, item.origin, black_url_format) + s.write_string(row_number, 2, item.key, black_field_format) + s.write_string(row_number, 3, item.value, black_value_format) + s.write(row_number, 5, item.interpretation, black_value_format) + s.write(row_number, 6, item.profile, black_value_format) + s.write(row_number, 7, item.source_path, black_value_format) + s.write(row_number, 8, item.database, black_value_format) + s.write_number(row_number, 9, item.seq, black_value_format) except Exception as e: log.error(f'Failed to write row to XLSX: {e}') diff --git a/pyhindsight/browsers/chrome.py b/pyhindsight/browsers/chrome.py index ed13381..b558264 100644 --- a/pyhindsight/browsers/chrome.py +++ b/pyhindsight/browsers/chrome.py @@ -15,7 +15,7 @@ import urllib import base64 -import pyhindsight.lib.ccl_chrome_indexeddb.ccl_blink_value_deserializer +from pyhindsight.lib.ccl_chrome_indexeddb import ccl_chromium_indexeddb from pyhindsight.browsers.webbrowser import WebBrowser from pyhindsight import utils @@ -1105,7 +1105,7 @@ def get_session_storage(self, path, dir_name): try: ss_ldb_records = ccl_chromium_sessionstorage.SessionStoreDb(pathlib.Path(ss_path)) except ValueError as e: - log.warning(f' - Error reading records; possible LevelDB corruption') + log.warning(f' - Error reading records ({e}); possible LevelDB corruption') self.artifacts_counts['Session Storage'] = 'Failed' if ss_ldb_records: @@ -1128,6 +1128,63 @@ def get_session_storage(self, path, dir_name): log.info(f' - Parsed {len(results)} Session Storage items') self.parsed_storage.extend(results) + def get_indexeddb(self, path, dir_name): + results = [] + + # Grab file list of 'IndexedDB' directory + idb_path = os.path.join(path, dir_name) + log.info('IndexedDB:') + log.info(f' - Reading from {idb_path}') + + idb_storage_listing = os.listdir(idb_path) + log.debug(f' - {len(idb_storage_listing)} files in IndexedDB directory') + + for storage_directory in idb_storage_listing: + if not storage_directory.endswith('.leveldb'): + continue + + # The Ghostery extension has 1M+ records in it; skip for now. 
+ if storage_directory == 'chrome-extension_mlomiejdfkolichcflejclcbmpeaniij_0.indexeddb.leveldb': + continue + + origin = storage_directory.split('.indexeddb')[0] + blob_directory = None + blob_path = os.path.join(idb_path, f'{origin}.indexeddb.blob') + if os.path.exists(blob_path): + blob_directory = blob_path + + try: + origin_idb = ccl_chromium_indexeddb.WrappedIndexDB( + leveldb_dir=os.path.join(idb_path, f'{origin}.indexeddb.leveldb'), leveldb_blob_dir=blob_directory) + except ValueError as e: + log.error(f' - {e} when processing {storage_directory}') + continue + + except Exception as e: + log.error(f' - Unexpected Exception ({e}) when processing {storage_directory}') + continue + + for database_id in origin_idb.database_ids: + database = origin_idb[database_id.dbid_no] + for obj_store_name in database.object_store_names: + obj_store = database.get_object_store_by_name(obj_store_name) + try: + for record in obj_store.iterate_records(): + results.append(Chrome.IndexedDBItem( + self.profile_path, origin, str(record.key.value), str(record.value), + int(record.sequence_number), str(database.name), storage_directory)) + except FileNotFoundError as e: + log.error(f' - File ({e}) not found while processing {database}') + + except ValueError as e: + log.error(f' - ValueError ({e}) when processing {database}') + + except Exception as e: + log.error(f' - Unexpected Exception: {e}') + + self.artifacts_counts['IndexedDB'] = len(results) + log.info(f' - Parsed {len(results)} items from {len(idb_storage_listing)} files') + self.parsed_storage.extend(results) def get_extensions(self, path, dir_name): results = [] @@ -1575,7 +1632,7 @@ def zoom_level_to_zoom_factor(zoom_level): if prefs.get('password_manager'): if prefs['password_manager'].get('profile_store_date_last_used_for_filling'): timestamped_preference_item = Chrome.SiteSetting( - self.profile_path, url=None, + self.profile_path, url='', timestamp=utils.to_datetime( prefs['password_manager']['profile_store_date_last_used_for_filling'], self.timezone), key=f'profile_store_date_last_used_for_filling [in {preferences_file}.password_manager]', @@ -2482,6 +2539,13 @@ def process(self): self.artifacts_display['Archived History'], self.artifacts_counts.get('Archived History', '0'))) + if 'IndexedDB' in input_listing: + self.get_indexeddb(self.profile_path, 'IndexedDB') + self.artifacts_display['IndexedDB'] = 'IndexedDB records' + print(self.format_processing_output( + self.artifacts_display['IndexedDB'], + self.artifacts_counts.get('IndexedDB', '0'))) + if 'Media History' in input_listing: self.get_media_history(self.profile_path, 'Media History', self.version, 'media (playback end)') self.artifacts_display['Media History'] = "Media History records" diff --git a/pyhindsight/browsers/webbrowser.py b/pyhindsight/browsers/webbrowser.py index 9e94665..674f630 100644 --- a/pyhindsight/browsers/webbrowser.py +++ b/pyhindsight/browsers/webbrowser.py @@ -384,6 +384,29 @@ def __init__(self, profile, origin, key, value, seq, state, source_path): self.state = state self.source_path = source_path + class IndexedDBItem(StorageItem): + def __init__(self, profile, origin, key, value, seq, database, source_path): + """ + + :param profile: The path to the browser profile this item is part of. + :param origin: The web origin this IndexedDBItem item belongs to. + :param key: The key of the IndexedDBItem item. + :param value: The value of the IndexedDBItem item. + :param seq: The sequence number. 
+ :param database: The database within the IndexedDB file the record is part of. + :param source_path: The path to the source of the record. + """ + super(WebBrowser.IndexedDBItem, self).__init__( + 'indexeddb', profile=profile, origin=origin, key=key, value=value, seq=seq, + source_path=source_path) + self.profile = profile + self.origin = origin + self.key = key + self.value = value + self.seq = seq + self.database = database + self.source_path = source_path + class FileSystemItem(StorageItem): def __init__(self, profile, origin, key, value, seq, state, source_path, last_modified=None, file_exists=None, file_size=None, magic_results=None): diff --git a/pyhindsight/lib/ccl_chrome_indexeddb/ccl_blink_value_deserializer.py b/pyhindsight/lib/ccl_chrome_indexeddb/ccl_blink_value_deserializer.py index ca1aa8a..80faf9d 100644 --- a/pyhindsight/lib/ccl_chrome_indexeddb/ccl_blink_value_deserializer.py +++ b/pyhindsight/lib/ccl_chrome_indexeddb/ccl_blink_value_deserializer.py @@ -28,6 +28,8 @@ from pyhindsight.lib.ccl_chrome_indexeddb import ccl_v8_value_deserializer # See: https://chromium.googlesource.com/chromium/src/third_party/+/master/blink/renderer/bindings/core/v8/serialization +# https://source.chromium.org/chromium/chromium/src/+/main:third_party/blink/renderer/bindings/modules/v8/serialization/v8_script_value_serializer_for_modules.cc + # WebCoreStrings are read as (length:uint32_t, string:UTF8[length]). # RawStrings are read as (length:uint32_t, string:UTF8[length]). @@ -48,11 +50,11 @@ # GenerateFreshObjectTag/GenerateFreshArrayTag); these reference IDs are then # used with ObjectReferenceTag to tie the recursive knot. -__version__ = "0.1" +__version__ = "0.2" __description__ = "Partial reimplementation of the Blink Javascript Object Serialization" __contact__ = "Alex Caithness" -__DEBUG = True +__DEBUG = False def log(msg, debug_only=True): @@ -73,6 +75,19 @@ class BlobIndex: index_id: int +@dataclass(frozen=True) +class CryptoKey: + sub_type: "V8CryptoKeySubType" + algorithm_type: typing.Optional["V8CryptoKeyAlgorithm"] + hash_type: typing.Optional["V8CryptoKeyAlgorithm"] + asymmetric_key_type: typing.Optional["V8AsymmetricCryptoKeyType"] + byte_length: typing.Optional[int] + public_exponent: typing.Optional[bytes] + named_curve_type: typing.Optional["V8CryptoNamedCurve"] + key_usage: "V8CryptoKeyUsage" + key_data: bytes + + class Constants: tag_kMessagePortTag = b"M" # index:int -> MessagePort. Fills the result with # transferred MessagePort. @@ -148,20 +163,179 @@ class Constants: tag_kDOMExceptionTag = b"x" # name:String,message:String,stack:String tag_kVersionTag = b"\xff" # version:uint32_t -> Uses this as the file version. + tag_kTrailerOffsetTag = b"\xfe" # offset:uint64_t (fixed width, network order) from buffer, start size:uint32_t (fixed width, network order) + tag_kTrailerRequiresInterfacesTag = b"\xA0" + + +class V8CryptoKeySubType(enum.IntEnum): + """ + See: third_party/blink/renderer/bindings/modules/v8/serialization/web_crypto_sub_tags.h + Used by the kCryptoKeyTag type + """ + AesKey = 1 + HmacKey = 2 + # ID 3 was used by RsaKeyTag, while still behind experimental flag. + RsaHashedKey = 4 + EcKey = 5 + NoParamsKey = 6 + + +class V8CryptoKeyAlgorithm(enum.IntEnum): + """ + See: third_party/blink/renderer/bindings/modules/v8/serialization/web_crypto_sub_tags.h + Used by the kCryptoKeyTag type + """ + AesCbcTag = 1 + HmacTag = 2 + RsaSsaPkcs1v1_5Tag = 3 + # ID 4 was used by RsaEs, while still behind experimental flag. 
+ Sha1Tag = 5 + Sha256Tag = 6 + Sha384Tag = 7 + Sha512Tag = 8 + AesGcmTag = 9 + RsaOaepTag = 10 + AesCtrTag = 11 + AesKwTag = 12 + RsaPssTag = 13 + EcdsaTag = 14 + EcdhTag = 15 + HkdfTag = 16 + Pbkdf2Tag = 17 + + +class V8AsymmetricCryptoKeyType(enum.IntEnum): + Public = 1 + Private = 2 + + +class V8CryptoNamedCurve(enum.IntEnum): + """ + See: third_party/blink/renderer/bindings/modules/v8/serialization/web_crypto_sub_tags.h + Used by the kCryptoKeyTag type + """ + P256 = 1 + P384 = 2 + P521 = 3 + + +class V8CryptoKeyUsage(enum.IntFlag): + """ + See: third_party/blink/renderer/bindings/modules/v8/serialization/web_crypto_sub_tags.h + Used by the kCryptoKeyTag type + """ + kExtractableUsage = 1 << 0 + kEncryptUsage = 1 << 1 + kDecryptUsage = 1 << 2 + kSignUsage = 1 << 3 + kVerifyUsage = 1 << 4 + kDeriveKeyUsage = 1 << 5 + kWrapKeyUsage = 1 << 6 + kUnwrapKeyUsage = 1 << 7 + kDeriveBitsUsage = 1 << 8 class BlinkV8Deserializer: def _read_varint(self, stream) -> int: return ccl_v8_value_deserializer.read_le_varint(stream)[0] + def _read_varint32(self, stream) -> int: + return ccl_v8_value_deserializer.read_le_varint(stream, is_32bit=True)[0] + + # def _read_uint32(self, stream: typing.BinaryIO) -> int: + # raw = stream.read(4) + # if len(raw) < 4: + # raise ValueError("Could not read enough data when reading int32") + # return struct.unpack(" BlobIndex: return BlobIndex(BlobIndexType.File, self._read_varint(stream)) + def _read_blob_index(self, stream: typing.BinaryIO) -> BlobIndex: + return BlobIndex(BlobIndexType.Blob, self._read_varint(stream)) + def _read_file_list_index(self, stream: typing.BinaryIO) -> typing.Iterable[BlobIndex]: length = self._read_varint(stream) result = [self._read_file_index(stream) for _ in range(length)] return result + def _read_crypto_key(self, stream: typing.BinaryIO): + sub_type = V8CryptoKeySubType(stream.read(1)[0]) + + if sub_type == V8CryptoKeySubType.AesKey: + algorithm_id = V8CryptoKeyAlgorithm(self._read_varint32(stream)) + byte_length = self._read_varint32(stream) + params = { + "algorithm_type": algorithm_id, + "byte_length": byte_length, + "hash_type": None, + "named_curve_type": None, + "asymmetric_key_type": None, + "public_exponent": None + } + elif sub_type == V8CryptoKeySubType.HmacKey: + byte_length = self._read_varint32(stream) + hash_id = V8CryptoKeyAlgorithm(self._read_varint32(stream)) + params = { + "byte_length": byte_length, + "hash_type": hash_id, + "algorithm_type": None, + "named_curve_type": None, + "asymmetric_key_type": None, + "public_exponent": None + } + elif sub_type == V8CryptoKeySubType.RsaHashedKey: + algorithm_id = V8CryptoKeyAlgorithm(self._read_varint32(stream)) + asymmetric_key_type = V8AsymmetricCryptoKeyType(stream.read(1)[0]) + length_bytes = self._read_varint32(stream) + public_exponent_length = self._read_varint32(stream) + public_exponent = stream.read(public_exponent_length) + if len(public_exponent) != public_exponent_length: + raise ValueError(f"Could not read all of public exponent data") + hash_id = V8CryptoKeyAlgorithm(self._read_varint32(stream)) + params = { + "algorithm_type": algorithm_id, + "asymmetric_key_type": asymmetric_key_type, + "byte_length": length_bytes, + "public_exponent": public_exponent, + "hash_type": hash_id, + "named_curve_type": None + } + + elif sub_type == V8CryptoKeySubType.EcKey: + algorithm_id = V8CryptoKeyAlgorithm(self._read_varint32(stream)) + asymmetric_key_type = V8AsymmetricCryptoKeyType(stream.read(1)[0]) + named_curve = V8CryptoNamedCurve(self._read_varint32(stream)) + params 
= { + "algorithm_type": algorithm_id, + "asymmetric_key_type": asymmetric_key_type, + "named_curve_type": named_curve, + "hash_type": None, + "byte_length": None, + "public_exponent": None + } + elif sub_type == V8CryptoKeySubType.NoParamsKey: + algorithm_id = V8CryptoKeyAlgorithm(self._read_varint32(stream)) + params = { + "algorithm_type": algorithm_id, + "hash_type": None, + "asymmetric_key_type": None, + "byte_length": None, + "named_curve_type": None, + "public_exponent": None + } + else: + raise ValueError(f"Unknown V8CryptoKeySubType {sub_type}") + + params["key_usage"] = V8CryptoKeyUsage(self._read_varint32(stream)) + key_length = self._read_varint32(stream) + key_data = stream.read(key_length) + if len(key_data) < key_length: + raise ValueError("Could not read all key data") + + params["key_data"] = key_data + return CryptoKey(sub_type, **params) + def _not_implemented(self, stream): raise NotImplementedError() @@ -172,7 +346,7 @@ def read(self, stream: typing.BinaryIO) -> typing.Any: Constants.tag_kMessagePortTag: lambda x: self._not_implemented(x), Constants.tag_kMojoHandleTag: lambda x: self._not_implemented(x), Constants.tag_kBlobTag: lambda x: self._not_implemented(x), - Constants.tag_kBlobIndexTag: lambda x: self._not_implemented(x), + Constants.tag_kBlobIndexTag: lambda x: self._read_blob_index(x), Constants.tag_kFileTag: lambda x: self._not_implemented(x), Constants.tag_kFileIndexTag: lambda x: self._read_file_index(x), Constants.tag_kDOMFileSystemTag: lambda x: self._not_implemented(x), @@ -196,7 +370,7 @@ def read(self, stream: typing.BinaryIO) -> typing.Any: Constants.tag_kDOMMatrixReadOnlyTag: lambda x: self._not_implemented(x), Constants.tag_kDOMMatrix2DTag: lambda x: self._not_implemented(x), Constants.tag_kDOMMatrix2DReadOnlyTag: lambda x: self._not_implemented(x), - Constants.tag_kCryptoKeyTag: lambda x: self._not_implemented(x), + Constants.tag_kCryptoKeyTag: lambda x: self._read_crypto_key(x), Constants.tag_kRTCCertificateTag: lambda x: self._not_implemented(x), Constants.tag_kRTCEncodedAudioFrameTag: lambda x: self._not_implemented(x), Constants.tag_kRTCEncodedVideoFrameTag: lambda x: self._not_implemented(x), diff --git a/pyhindsight/lib/ccl_chrome_indexeddb/ccl_chromium_indexeddb.py b/pyhindsight/lib/ccl_chrome_indexeddb/ccl_chromium_indexeddb.py index de59c0c..eae9f1a 100644 --- a/pyhindsight/lib/ccl_chrome_indexeddb/ccl_chromium_indexeddb.py +++ b/pyhindsight/lib/ccl_chrome_indexeddb/ccl_chromium_indexeddb.py @@ -1,5 +1,5 @@ """ -Copyright 2020-2021, CCL Forensics +Copyright 2020-2023, CCL Forensics Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -31,11 +31,11 @@ import types import typing -import ccl_leveldb -import ccl_v8_value_deserializer -import ccl_blink_value_deserializer +from pyhindsight.lib.ccl_chrome_indexeddb import ccl_leveldb +from pyhindsight.lib.ccl_chrome_indexeddb import ccl_v8_value_deserializer +from pyhindsight.lib.ccl_chrome_indexeddb import ccl_blink_value_deserializer -__version__ = "0.6" +__version__ = "0.16" __description__ = "Module for reading Chromium IndexedDB LevelDB databases." 
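Aside, on the CryptoKey support added to ccl_blink_value_deserializer.py in the hunk above: the new reader walks the WebCrypto sub-tag layout (sub-type, algorithm/curve parameters, usages, then the raw key bytes). Below is a minimal sketch of what it returns for a hand-built AES-CBC payload; the bytes are hypothetical test data rather than anything captured from a real profile, and calling the private `_read_crypto_key()` directly is only for illustration.

```python
import io
from pyhindsight.lib.ccl_chrome_indexeddb import ccl_blink_value_deserializer as bvd

# Hypothetical payload for an extractable AES-CBC key, matching the read order above:
# sub_type=AesKey (0x01), algorithm=AesCbcTag (0x01), byte_length=16 (0x10),
# usages=kExtractableUsage|kEncryptUsage|kDecryptUsage (0x07), key length=16 (0x10), key bytes.
payload = bytes([0x01, 0x01, 0x10, 0x07, 0x10]) + bytes(16)

key = bvd.BlinkV8Deserializer()._read_crypto_key(io.BytesIO(payload))
assert key.sub_type == bvd.V8CryptoKeySubType.AesKey
assert key.algorithm_type == bvd.V8CryptoKeyAlgorithm.AesCbcTag
assert key.key_usage & bvd.V8CryptoKeyUsage.kExtractableUsage
print(key.byte_length, len(key.key_data))  # 16 16
```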
__contact__ = "Alex Caithness" @@ -44,7 +44,7 @@ # (it should sit behind a switch for integers, fixed for most other stuff) -def _read_le_varint(stream: typing.BinaryIO, *, is_google_32bit=False): +def _read_le_varint(stream: typing.BinaryIO, *, is_google_32bit=False) -> typing.Optional[tuple[int, bytes]]: # this only outputs unsigned i = 0 result = 0 @@ -64,7 +64,7 @@ def _read_le_varint(stream: typing.BinaryIO, *, is_google_32bit=False): return result, bytes(underlying_bytes) -def read_le_varint(stream: typing.BinaryIO, *, is_google_32bit=False): +def read_le_varint(stream: typing.BinaryIO, *, is_google_32bit=False) -> typing.Optional[int]: x = _read_le_varint(stream, is_google_32bit=is_google_32bit) if x is None: return None @@ -72,16 +72,30 @@ def read_le_varint(stream: typing.BinaryIO, *, is_google_32bit=False): return x[0] -def _le_varint_from_bytes(data: bytes): +def _le_varint_from_bytes(data: bytes) -> typing.Optional[tuple[int, bytes]]: with io.BytesIO(data) as buff: return _read_le_varint(buff) -def le_varint_from_bytes(data: bytes): +def le_varint_from_bytes(data: bytes) -> typing.Optional[int]: with io.BytesIO(data) as buff: return read_le_varint(buff) +def decode_truncated_int(data: bytes) -> int: + # See: /content/browser/indexed_db/indexed_db_leveldb_coding.h EncodeInt() + # "// Unlike EncodeVarInt, this is a 'dumb' implementation of a variable int + # // encoder. It writes, little-endian', until there are no more '1' bits in the + # // number. The Decoder must know how to calculate the size of the encoded int, + # // typically by having this reside at the end of the value or key." + if len(data) == 0: + raise ValueError("No data to decode") + result = 0 + for i, b in enumerate(data): + result |= (b << (i * 8)) + return result + + class IdbKeyType(enum.IntEnum): Null = 0 String = 1 @@ -151,7 +165,10 @@ def __eq__(self, other): return self.raw_key == other.raw_key def __ne__(self, other): - return not self == other + return not (self == other) + + def __hash__(self): + return self.raw_key.__hash__() class IndexedDBExternalObjectType(enum.IntEnum): @@ -229,7 +246,7 @@ def __init__(self, raw_meta_dict: dict): db_name_length = read_le_varint(buff) db_name = buff.read(db_name_length * 2).decode("utf-16-be") - db_id_no = le_varint_from_bytes(dbid_rec.value) + db_id_no = decode_truncated_int(dbid_rec.value) dbids.append(DatabaseId(db_id_no, origin, db_name)) @@ -255,7 +272,7 @@ def get_meta(self, db_id: int, meta_type: DatabaseMetadataType) -> typing.Option return None if meta_type == DatabaseMetadataType.MaximumObjectStoreId: - return le_varint_from_bytes(record.value) + return decode_truncated_int(record.value) # TODO raise NotImplementedError() @@ -289,10 +306,31 @@ def get_meta(self, db_id: int, obj_store_id: int, meta_type: ObjectStoreMetadata raise NotImplementedError() +@dataclasses.dataclass(frozen=True) +class BlinkTrailer: + # third_party/blink/renderer/bindings/core/v8/serialization/trailer_reader.h + offset: int + length: int + + TRAILER_SIZE: typing.ClassVar[int] = 13 + MIN_WIRE_FORMAT_VERSION_FOR_TRAILER: typing.ClassVar[int] = 21 + + @classmethod + def from_buffer(cls, buffer, trailer_offset: int): + tag, offset, length = struct.unpack(">cQI", buffer[trailer_offset: trailer_offset + BlinkTrailer.TRAILER_SIZE]) + if tag != ccl_blink_value_deserializer.Constants.tag_kTrailerOffsetTag: + raise ValueError( + f"Trailer doesn't start with kTrailerOffsetTag " + f"(expected: 0x{ccl_blink_value_deserializer.Constants.tag_kTrailerOffsetTag.hex()}; " + f"got: 0x{tag.hex()}") + 
+ return BlinkTrailer(offset, length) + + class IndexedDbRecord: def __init__( self, owner: "IndexedDb", db_id: int, obj_store_id: int, key: IdbKey, - value: typing.Any, is_live: bool, ldb_seq_no: int): + value: typing.Any, is_live: bool, ldb_seq_no: int, external_value_path: typing.Optional[str] = None): self.owner = owner self.db_id = db_id self.obj_store_id = obj_store_id @@ -300,6 +338,7 @@ def __init__( self.value = value self.is_live = is_live self.sequence_number = ldb_seq_no + self.external_value_path = external_value_path def resolve_blob_index(self, blob_index: ccl_blink_value_deserializer.BlobIndex) -> IndexedDBExternalObject: """Resolve a ccl_blink_value_deserializer.BlobIndex to its IndexedDBExternalObject @@ -317,20 +356,44 @@ class IndexedDb: # Of note, the first byte of the key defines the length of the db_id, obj_store_id and index_id in bytes: # 0b xxxyyyzz (x = db_id size - 1, y = obj_store size - 1, z = index_id - 1) - # Currently I just assume that everything falls between 1 and 127 for simplicity as it makes scanning the keys - # lots easier. + def __init__(self, leveldb_dir: os.PathLike, leveldb_blob_dir: os.PathLike = None): self._db = ccl_leveldb.RawLevelDb(leveldb_dir) self._blob_dir = leveldb_blob_dir - self.global_metadata = GlobalMetadata(self._get_raw_global_metadata()) - self.database_metadata = DatabaseMetadata(self._get_raw_database_metadata()) - self.object_store_meta = ObjectStoreMetadata(self._get_raw_object_store_metadata()) - + self.global_metadata = None + self.database_metadata = None + self.object_store_meta = None + self._cache_records() + self._fetch_meta_data() self._blob_lookup_cache = {} + def _cache_records(self): + self._fetched_records = [] + # Fetch the records only once + for record in self._db.iterate_records_raw(): + self._fetched_records.append(record) + + def _fetch_meta_data(self): + global_metadata_raw = {} + database_metadata_raw = {} + objectstore_metadata_raw = {} + # Fetch metadata + global_metadata_raw = self._get_raw_global_metadata() + self.global_metadata = GlobalMetadata(global_metadata_raw) + database_metadata_raw = self._get_raw_database_metadata() + self.database_metadata = DatabaseMetadata(database_metadata_raw) + objectstore_metadata_raw = self._get_raw_object_store_metadata() + self.object_store_meta = ObjectStoreMetadata(objectstore_metadata_raw) + @staticmethod - def make_prefix(db_id: int, obj_store_id: int, index_id: int) -> bytes: + def make_prefix( + db_id: int, obj_store_id: int, index_id: int, end: typing.Optional[typing.Sequence[int]]=None) -> bytes: + if end is None: + end = [] + def count_bytes(val): + if val == 0: + return 1 i = 0 while val > 0: i += 1 @@ -338,11 +401,13 @@ def count_bytes(val): return i def yield_le_bytes(val): + if val == 0: + yield 0 if val < 0: raise ValueError while val > 0: yield val & 0xff - val >> 8 + val = val >> 8 db_id_size = count_bytes(db_id) obj_store_id_size = count_bytes(obj_store_id) @@ -351,8 +416,38 @@ def yield_le_bytes(val): if db_id_size > 8 or obj_store_id_size > 8 or index_id_size > 4: raise ValueError("id sizes are too big") - byte_one = ((db_id_size - 1) << 5) | ((obj_store_id_size - 1) << 2) | index_id_size - return bytes([byte_one, *yield_le_bytes(db_id), *yield_le_bytes(obj_store_id), *yield_le_bytes(index_id)]) + byte_one = ((db_id_size - 1) << 5) | ((obj_store_id_size - 1) << 2) | (index_id_size - 1) + # print([byte_one, *yield_le_bytes(db_id), *yield_le_bytes(obj_store_id), *yield_le_bytes(index_id), *end]) + return bytes([byte_one, *yield_le_bytes(db_id), 
*yield_le_bytes(obj_store_id), *yield_le_bytes(index_id), *end]) + + @staticmethod + def read_prefix(stream: typing.BinaryIO) -> tuple[int, int, int, int]: + """ + :param stream: file-like to read the prefix from + :return: a tuple of db_id, object_store_id, index_id, length of the prefix + """ + lengths_bytes = stream.read(1) + if not lengths_bytes: + raise ValueError("Couldn't get enough data when reading prefix length") + lengths = lengths_bytes[0] + db_id_size = ((lengths >> 5) & 0x07) + 1 + object_store_size = ((lengths >> 2) & 0x07) + 1 + index_size = (lengths & 0x03) + 1 + + db_id_raw = stream.read(db_id_size) + object_store_raw = stream.read(object_store_size) + index_raw = stream.read(index_size) + + if (len(db_id_raw) != db_id_size or + len(object_store_raw) != object_store_size or + len(index_raw) != index_size): + raise ValueError("Couldn't read enough bytes for the prefix") + + db_id = int.from_bytes(db_id_raw, "little") + object_store_id = int.from_bytes(object_store_raw, "little") + index_id = int.from_bytes(index_raw, "little") + + return db_id, object_store_id, index_id, (db_id_size + object_store_size + index_size + 1) def get_database_metadata(self, db_id: int, meta_type: DatabaseMetadataType): return self.database_metadata.get_meta(db_id, meta_type) @@ -365,7 +460,7 @@ def _get_raw_global_metadata(self, live_only=True) -> typing.Dict[bytes, ccl_lev if not live_only: raise NotImplementedError("Deleted metadata not implemented yet") meta = {} - for record in self._db.iterate_records_raw(reverse=True): + for record in reversed(self._fetched_records): if record.key.startswith(b"\x00\x00\x00\x00") and record.state == ccl_leveldb.KeyState.Live: # we only want live keys and the newest version thereof (highest seq) if record.key not in meta or meta[record.key].seq < record.seq: @@ -380,11 +475,9 @@ def _get_raw_database_metadata(self, live_only=True): db_meta = {} for db_id in self.global_metadata.db_ids: - if db_id.dbid_no > 0x7f: - raise NotImplementedError("there could be this many dbs, but I don't support it yet") - prefix = bytes([0, db_id.dbid_no, 0, 0]) - for record in self._db.iterate_records_raw(reverse=True): + prefix = IndexedDb.make_prefix(db_id.dbid_no, 0, 0) + for record in reversed(self._fetched_records): if record.key.startswith(prefix) and record.state == ccl_leveldb.KeyState.Live: # we only want live keys and the newest version thereof (highest seq) meta_type = record.key[len(prefix)] @@ -401,12 +494,10 @@ def _get_raw_object_store_metadata(self, live_only=True): os_meta = {} for db_id in self.global_metadata.db_ids: - if db_id.dbid_no > 0x7f: - raise NotImplementedError("there could be this many dbs, but I don't support it yet") - prefix = bytes([0, db_id.dbid_no, 0, 0, 50]) + prefix = IndexedDb.make_prefix(db_id.dbid_no, 0, 0, [50]) - for record in self._db.iterate_records_raw(reverse=True): + for record in reversed(self._fetched_records): if record.key.startswith(prefix) and record.state == ccl_leveldb.KeyState.Live: # we only want live keys and the newest version thereof (highest seq) objstore_id, varint_raw = _le_varint_from_bytes(record.key[len(prefix):]) @@ -419,43 +510,95 @@ def _get_raw_object_store_metadata(self, live_only=True): return os_meta + def read_record_precursor( + self, key: IdbKey, db_id: int, store_id: int, buffer: bytes, + bad_deserializer_data_handler: typing.Callable[[IdbKey, bytes], typing.Any], + external_data_path: typing.Optional[str] = None): + val_idx = 0 + trailer = None + blink_type_tag = buffer[val_idx] + if blink_type_tag != 
0xff: + # TODO: probably don't want to fail hard here long term... + if bad_deserializer_data_handler is not None: + bad_deserializer_data_handler(key, buffer) + return None + else: + raise ValueError("Blink type tag not present") + + val_idx += 1 + + blink_version, varint_raw = _le_varint_from_bytes(buffer[val_idx:]) + + val_idx += len(varint_raw) + + # Peek the next byte to work out if the data is held externally: + # third_party/blink/renderer/modules/indexeddb/idb_value_wrapping.cc + if buffer[val_idx] == 0x01: # kReplaceWithBlob + val_idx += 1 + externally_serialized_blob_size, varint_raw = _le_varint_from_bytes(buffer[val_idx:]) + val_idx += len(varint_raw) + externally_serialized_blob_index, varint_raw = _le_varint_from_bytes(buffer[val_idx:]) + val_idx += len(varint_raw) + + try: + info = self.get_blob_info(db_id, store_id, key.raw_key, externally_serialized_blob_index) + except KeyError: + info = None + + if info is not None: + data_path = pathlib.Path(str(db_id), f"{info.blob_number >> 8:02x}", f"{info.blob_number:x}") + try: + blob = self.get_blob(db_id, store_id, key.raw_key, externally_serialized_blob_index).read() + except FileNotFoundError: + if bad_deserializer_data_handler is not None: + bad_deserializer_data_handler(key, buffer) + return None + raise + + return self.read_record_precursor( + key, db_id, store_id, + blob, + bad_deserializer_data_handler, str(data_path)) + else: + return None + else: + if blink_version >= BlinkTrailer.MIN_WIRE_FORMAT_VERSION_FOR_TRAILER: + trailer = BlinkTrailer.from_buffer(buffer, val_idx) # TODO: do something with the trailer + val_idx += BlinkTrailer.TRAILER_SIZE + + obj_raw = io.BytesIO(buffer[val_idx:]) + + return blink_version, obj_raw, trailer, external_data_path + def iterate_records( self, db_id: int, store_id: int, *, live_only=False, bad_deserializer_data_handler: typing.Callable[[IdbKey, bytes], typing.Any] = None): - if db_id > 0x7f or store_id > 0x7f: - raise NotImplementedError("there could be this many dbs or object stores, but I don't support it yet") - blink_deserializer = ccl_blink_value_deserializer.BlinkV8Deserializer() # goodness me this is a slow way of doing things - prefix = bytes([0, db_id, store_id, 1]) - for record in self._db.iterate_records_raw(): + prefix = IndexedDb.make_prefix(db_id, store_id, 1) + + for record in self._fetched_records: if record.key.startswith(prefix): key = IdbKey(record.key[len(prefix):]) if not record.value: # empty values will obviously fail, returning None is probably better than dying. - return key, None + yield IndexedDbRecord(self, db_id, store_id, key, None, + record.state == ccl_leveldb.KeyState.Live, record.seq) + continue value_version, varint_raw = _le_varint_from_bytes(record.value) val_idx = len(varint_raw) # read the blink envelope - blink_type_tag = record.value[val_idx] - if blink_type_tag != 0xff: - # TODO: probably don't want to fail hard here long term... 
- if bad_deserializer_data_handler is not None: - bad_deserializer_data_handler(key, record.value) - continue - else: - raise ValueError("Blink type tag not present") - val_idx += 1 - - blink_version, varint_raw = _le_varint_from_bytes(record.value[val_idx:]) + precursor = self.read_record_precursor( + key, db_id, store_id, record.value[val_idx:], bad_deserializer_data_handler) + if precursor is None: + continue # only returns None on error, handled in the function if bad_deserializer_data_handler can - val_idx += len(varint_raw) + blink_version, obj_raw, trailer, external_path = precursor - obj_raw = io.BytesIO(record.value[val_idx:]) - deserializer = ccl_v8_value_deserializer.Deserializer( - obj_raw, host_object_delegate=blink_deserializer.read) try: + deserializer = ccl_v8_value_deserializer.Deserializer( + obj_raw, host_object_delegate=blink_deserializer.read) value = deserializer.read() except Exception: if bad_deserializer_data_handler is not None: @@ -463,27 +606,31 @@ def iterate_records( continue raise yield IndexedDbRecord(self, db_id, store_id, key, value, - record.state == ccl_leveldb.KeyState.Live, record.seq) + record.state == ccl_leveldb.KeyState.Live, + record.seq, external_path) def get_blob_info(self, db_id: int, store_id: int, raw_key: bytes, file_index: int) -> IndexedDBExternalObject: - if db_id > 0x7f or store_id > 0x7f: - raise NotImplementedError("there could be this many dbs, but I don't support it yet") + # if db_id > 0x7f or store_id > 0x7f: + # raise NotImplementedError("there could be this many dbs, but I don't support it yet") if result := self._blob_lookup_cache.get((db_id, store_id, raw_key, file_index)): return result # goodness me this is a slow way of doing things, # TODO: we should at least cache along the way to our record - prefix = bytes([0, db_id, store_id, 3]) - for record in self._db.iterate_records_raw(): - if record.key.startswith(prefix): + # prefix = bytes([0, db_id, store_id, 3]) + prefix = IndexedDb.make_prefix(db_id, store_id, 3) + for record in self._fetched_records: + if record.user_key.startswith(prefix): + this_raw_key = record.user_key[len(prefix):] buff = io.BytesIO(record.value) idx = 0 while buff.tell() < len(record.value): blob_info = IndexedDBExternalObject.from_stream(buff) - self._blob_lookup_cache[(db_id, store_id, raw_key, idx)] = blob_info + self._blob_lookup_cache[(db_id, store_id, this_raw_key, idx)] = blob_info idx += 1 - break + # if this_raw_key == raw_key: + # break if result := self._blob_lookup_cache.get((db_id, store_id, raw_key, file_index)): return result @@ -498,26 +645,118 @@ def get_blob(self, db_id: int, store_id: int, raw_key: bytes, file_index: int) - # path will be: origin.blob/database id/top 16 bits of blob number with two digits/blob number # TODO: check if this is still the case on non-windows systems - path = pathlib.Path(self._blob_dir, str(db_id), f"{info.blob_number >> 8:02x}", f"{info.blob_number:x}") + path = pathlib.Path(self._blob_dir, f"{db_id:x}", f"{info.blob_number >> 8:02x}", f"{info.blob_number:x}") if path.exists(): return path.open("rb") raise FileNotFoundError(path) + def get_undo_task_scopes(self): + # https://github.com/chromium/chromium/blob/master/components/services/storage/indexed_db/scopes/leveldb_scopes_coding.cc + + # Prefix will be 00 00 00 00 32 (00|01|02) (varint of scope number) 00 + # 00 00 00 00 = Global metadata + # 32 = kScopesPrefixByte from indexed_db_leveldb_coding.cc + # (00|01|02) = one of: kGlobalMetadataByte, kScopesMetadataByte or kLogByte from 
leveldb_scopes_coding.h + # (varint of scope) + # 00 = kUndoTasksByte from leveldb_scopes_coding.h + + # This is a slow way of doing this: + prefix = bytes.fromhex("00 00 00 00 32") + for record in self._fetched_records: + if record.state != ccl_leveldb.KeyState.Live: + continue + if record.user_key.startswith(prefix): + # process the key first as they define what we'll do later + o = len(prefix) + metadata_byte = record.user_key[o] + assert metadata_byte in (0, 1, 2) # TODO: replace with real exception + + o += 1 + + if metadata_byte == 0: # global meta + # print(f"Global metadata:\t{record.user_key.hex(' ')}") + continue # Don't currently think I need this to do the work + elif metadata_byte == 1: # scope meta + # print(f"Scope metadata:\t{record.user_key.hex(' ')}") + # scope_number, varint_bytes = _le_varint_from_bytes(record.user_key) + # o += len(varint_bytes) + continue # Don't currently think I need this to do the work + elif metadata_byte == 2: # log + scope_number, varint_bytes = _le_varint_from_bytes(record.user_key) + o += len(varint_bytes) + undo_byte = record.key[o] + if undo_byte != 0: + continue + o += 1 + # print(f"Log\t{record.user_key.hex(' ')}") + undo_sequence_number, = struct.unpack(">q", record.user_key[o:o + 8]) + + # Value should be a LevelDBScopesUndoTask protobuf + # (indexed_db_components\indexed_db\scopes\scopes_metadata.proto). + # We're looking for a "Put" protobuf (first and only tag should be a Message numbered 1, with two + # bytes values numbered 1 and 2 which are the original key and value respectively. + # To reduce the need for dependencies, as they are so simple, the protobuf can be decoded "manually" + with io.BytesIO(record.value) as value_stream: + root_tag_raw = read_le_varint(value_stream) + root_number = root_tag_raw >> 3 + if root_tag_raw & 0x07 != 2 or root_number != 1: + assert root_number in (2, 3) # TODO: remove? + continue # I don't think I need to raise an exception here? + data_length = read_le_varint(value_stream) + inner_value_bytes = value_stream.read(data_length) + if len(inner_value_bytes) != data_length: + raise ValueError("Couldn't get all data when reading the LevelDBScopesUndoTask") + + record_key_raw = None + record_value_raw = None + with io.BytesIO(inner_value_bytes) as inner_value_stream: + while inner_value_stream.tell() < len(inner_value_bytes) and ( + record_key_raw is None or record_value_raw is None): + tag_raw = read_le_varint(inner_value_stream) + assert tag_raw & 0x07 == 2 + tag_number = tag_raw >> 3 + data_length = read_le_varint(inner_value_stream) + data = inner_value_stream.read(data_length) + if len(data) != data_length: + raise ValueError("Couldn't get enough from the protobuf in LevelDBScopesUndoTask") + if tag_number == 1: + record_key_raw = data + elif tag_number == 2: + record_value_raw = data + else: + raise ValueError("Unexpected message in LevelDBScopesUndoTask") + + if not record_value_raw: + continue # I don't think we need to go further here + + with io.BytesIO(record_key_raw) as record_key_stream: + db_id, object_store, index_id, length = IndexedDb.read_prefix(record_key_stream) + if db_id < 1 or object_store < 1 or index_id < 1: + continue # only work with indexeddb record records + + key = IdbKey(record_key_stream.read()) + + yield key, record_value_raw + @property def database_path(self): return self._db.in_dir_path class WrappedObjectStore: + """ + A wrapper class around a "raw" IndexedDb which simplifies accessing records related to an object store. Usually only + created by a WrappedDatabase. 
+ """ def __init__(self, raw_db: IndexedDb, dbid_no: int, obj_store_id: int): self._raw_db = raw_db self._dbid_no = dbid_no self._obj_store_id = obj_store_id @property - def object_store_id(self): + def object_store_id(self) -> int: return self._obj_store_id @property @@ -530,6 +769,14 @@ def _log_error(key: IdbKey, data: bytes): sys.stderr.write(f"ERROR decoding key: {key}\n") def get_blob(self, raw_key: bytes, file_index: int) -> typing.BinaryIO: + """ + Deprecated: use IndexedDbRecord.get_blob_stream + + :param raw_key: raw key of the record from which the blob originates + :param file_index: the file/blob index from a ccl_blink_value_deserializer.BlobIndex + :return: a file-like object of the blob + """ + return self._raw_db.get_blob(self._dbid_no, self.object_store_id, raw_key, file_index) # def __iter__(self): @@ -556,6 +803,10 @@ def __repr__(self): class WrappedDatabase: + """ + A wrapper class around the raw "IndexedDb" class which simplifies access to a Database in the IndexedDb. Usually + only created by WrappedIndexedDb. + """ def __init__(self, raw_db: IndexedDb, dbid: DatabaseId): self._raw_db = raw_db self._dbid = dbid @@ -572,18 +823,32 @@ def __init__(self, raw_db: IndexedDb, dbid: DatabaseId): @property def name(self) -> str: + """ + :return: the name of this WrappedDatabase + """ return self._dbid.name @property def origin(self) -> str: + """ + :return: the origin (host name) for this WrappedDatabase + """ return self._dbid.origin @property def db_number(self) -> int: + """ + :return: the numerical ID assigned to this WrappedDatabase + """ return self._dbid.dbid_no @property def object_store_count(self) -> int: + """ + :return: the "MaximumObjectStoreId" value fot this database; NB this may not be the *actual* number of object + stores which can be read - it is possible that some object stores may be deleted. Use len() to check the + number of object stores you can actually access + """ # NB obj store ids are enumerated from 1. 
return self._raw_db.get_database_metadata( self.db_number, @@ -591,26 +856,55 @@ def object_store_count(self) -> int: @property def object_store_names(self) -> typing.Iterable[str]: + """ + :return: yields the names of the object stores in this WrappedDatabase + """ yield from self._obj_store_names def get_object_store_by_id(self, obj_store_id: int) -> WrappedObjectStore: + """ + :param obj_store_id: the numerical ID for an object store in this WrappedDatabase + :return: the WrappedObjectStore with the ID provided + """ if obj_store_id > 0 and obj_store_id <= self.object_store_count: return self._obj_stores[obj_store_id - 1] raise ValueError("obj_store_id must be greater than zero and less or equal to object_store_count " "NB object stores are enumerated from 1 - there is no store with id 0") def get_object_store_by_name(self, name: str) -> WrappedObjectStore: + """ + :param name: the name of an object store in this WrappedDatabase + :return: the WrappedObjectStore with the name provided + """ if name in self: return self.get_object_store_by_id(self._obj_store_names.index(name) + 1) raise KeyError(f"{name} is not an object store in this database") - def __len__(self): - len(self._obj_stores) - - def __contains__(self, item): + def __iter__(self) -> typing.Iterable[WrappedObjectStore]: + """ + :return: yields the object stores in this WrappedDatabase + """ + yield from self._obj_stores + + def __len__(self) -> int: + """ + :return: the number of object stores accessible in this WrappedDatabase + """ + return len(self._obj_stores) + + def __contains__(self, item: str) -> bool: + """ + :param item: the name of an object store in this WrappedDatabase + :return: True if the name provided matches one of the Object stores in this WrappedDatabase + """ return item in self._obj_store_names - def __getitem__(self, item) -> WrappedObjectStore: + def __getitem__(self, item: typing.Union[int, str]) -> WrappedObjectStore: + """ + :param item: either the numerical ID of an object store (as an int) or the name of an object store in this + WrappedDatabase + :return: + """ if isinstance(item, int): return self.get_object_store_by_id(item) elif isinstance(item, str): @@ -622,6 +916,10 @@ def __repr__(self): class WrappedIndexDB: + """ + A wrapper object around the "raw" IndexedDb class. This should be used in most cases as the code required to use it + is simpler and more pythonic. 
+ """ def __init__(self, leveldb_dir: os.PathLike, leveldb_blob_dir: os.PathLike = None): self._raw_db = IndexedDb(leveldb_dir, leveldb_blob_dir) self._multiple_origins = len(set(x.origin for x in self._raw_db.global_metadata.db_ids)) > 1 @@ -635,21 +933,35 @@ def __init__(self, leveldb_dir: os.PathLike, leveldb_blob_dir: os.PathLike = Non for x in self._db_number_lookup.values()} @property - def database_count(self): + def database_count(self) -> int: + """ + :return: The number of databases in this IndexedDB + """ return len(self._db_number_lookup) @property - def database_ids(self): + def database_ids(self) -> typing.Iterable[DatabaseId]: + """ + :return: yields DatabaseId objects which define the databases in this IndexedDb + """ yield from self._raw_db.global_metadata.db_ids @property - def has_multiple_origins(self): + def has_multiple_origins(self) -> bool: return self._multiple_origins def __len__(self): + """ + :return: the number of databases in this IndexedDb + """ len(self._db_number_lookup) - def __contains__(self, item): + def __contains__(self, item: typing.Union[str, int, tuple[str, str]]): + """ + :param item: either a database id number, the name of a database (as a string), or (if the database has multiple + origins), a tuple of database name and origin + :return: True if this IndexedDb contains the referenced database identifier + """ if isinstance(item, str): if self.has_multiple_origins: raise ValueError( @@ -666,6 +978,12 @@ def __contains__(self, item): raise TypeError("keys must be provided as a tuple of (name, origin) or a str (if only single origin) or int") def __getitem__(self, item: typing.Union[int, str, typing.Tuple[str, str]]) -> WrappedDatabase: + """ + + :param item: either a database id number, the name of a database (as a string), or (if the database has multiple + origins), a tuple of database name and origin + :return: the WrappedDatabase referenced by the id in item + """ if isinstance(item, int): if item in self._db_number_lookup: return self._db_number_lookup[item] diff --git a/pyhindsight/lib/ccl_chrome_indexeddb/ccl_chromium_sessionstorage.py b/pyhindsight/lib/ccl_chrome_indexeddb/ccl_chromium_sessionstorage.py index 8bc6982..6b7affc 100644 --- a/pyhindsight/lib/ccl_chrome_indexeddb/ccl_chromium_sessionstorage.py +++ b/pyhindsight/lib/ccl_chrome_indexeddb/ccl_chromium_sessionstorage.py @@ -25,9 +25,9 @@ import dataclasses from types import MappingProxyType -from . 
import ccl_leveldb +from pyhindsight.lib.ccl_chrome_indexeddb import ccl_leveldb -__version__ = "0.1" +__version__ = "0.2.1" __description__ = "Module for reading the Chromium leveldb sessionstorage format" __contact__ = "Alex Caithness" @@ -141,24 +141,25 @@ def __init__(self, in_dir: pathlib.Path): try: value = rec.value.decode("UTF-16-LE") except UnicodeDecodeError: - # print(f"Error decoding value for {key}") - # print(f"Raw Value: {rec.value}") + print(f"Error decoding value for {key}") + print(f"Raw Value: {rec.value}") continue - #guid_host_pair = self._map_id_to_host_guid.get(map_id) host = self._map_id_to_host.get(map_id) - #if not guid_host_pair: if not host: self._orphans.append((ss_key, SessionStoreValue(value, None, rec.seq))) else: - #guid, host = guid_host_pair self._host_lookup.setdefault(host, {}) self._host_lookup[host].setdefault(ss_key, []) self._host_lookup[host][ss_key].append(SessionStoreValue(value, None, rec.seq)) def __contains__(self, item: typing.Union[str, typing.Tuple[str, str]]) -> bool: - """if item is a str, returns true if that host is present - if item is a tuple of (str, str), returns True if that host and key pair are present""" + """ + :param item: either the host as a str or a tuple of the host and a key (both str) + :return: if item is a str, returns true if that host is present, if item is a tuple of (str, str), returns True + if that host and key pair are present + """ + if isinstance(item, str): return item in self._host_lookup elif isinstance(item, tuple) and len(item) == 2: @@ -168,9 +169,17 @@ def __contains__(self, item: typing.Union[str, typing.Tuple[str, str]]) -> bool: raise TypeError("item must be a string or a tuple of (str, str)") def iter_hosts(self) -> typing.Iterable[str]: + """ + :return: yields the hosts present in this SessionStorage + """ yield from self._host_lookup.keys() - def get_all_for_host(self, host): + def get_all_for_host(self, host: str) -> dict[str, tuple[SessionStoreValue, ...]]: + """ + :param host: the host (domain name) for the session storage + :return: a dictionary where the keys are storage keys and the values are tuples of SessionStoreValue objects + for that key. Multiple values may be returned as deleted or old values may be recovered. + """ if host not in self: return {} result_raw = dict(self._host_lookup[host]) @@ -178,15 +187,27 @@ def get_all_for_host(self, host): result_raw[ss_key] = tuple(result_raw[ss_key]) return result_raw - def get_session_storage_key(self, host, key): + def get_session_storage_key(self, host, key) -> tuple[SessionStoreValue, ...]: + """ + :param host: the host (domain name) for the session storage + :param key: the storage key + :return: a tuple of SessionStoreValue matching the host and key. Multiple values may be returned as deleted or + old values may be recovered. + """ if (host, key) not in self: return tuple() return tuple(self._host_lookup[host][key]) - def iter_orphans(self): + def iter_orphans(self) -> typing.Iterable[tuple[str, SessionStoreValue]]: + """ + Returns records which have been orphaned from their host (domain name) where it cannot be recovered. The keys + may be named uniquely enough that the host may be inferred. 
+ :return: yields tuples of (session key, SessionStoreValue) + """ yield from self._orphans - def __getitem__(self, item: typing.Union[str, typing.Tuple[str, str]]): + def __getitem__(self, item: typing.Union[str, typing.Tuple[str, str]]) -> typing.Union[ + dict[str, tuple[SessionStoreValue, ...]], tuple[SessionStoreValue, ...]]: if item not in self: raise KeyError(item) @@ -197,13 +218,21 @@ def __getitem__(self, item: typing.Union[str, typing.Tuple[str, str]]): else: raise TypeError("item must be a string or a tuple of (str, str)") - def __iter__(self): - """iterates the hosts present""" + def __iter__(self) -> typing.Iterable[str]: + """ + iterates the hosts present + """ return self.iter_hosts() def close(self): self._ldb.close() + def __enter__(self) -> "SessionStoreDb": + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.close() + def main(args): ldb_in_dir = pathlib.Path(args[0]) diff --git a/pyhindsight/lib/ccl_chrome_indexeddb/ccl_leveldb.py b/pyhindsight/lib/ccl_chrome_indexeddb/ccl_leveldb.py index e6c5e3b..eee1978 100644 --- a/pyhindsight/lib/ccl_chrome_indexeddb/ccl_leveldb.py +++ b/pyhindsight/lib/ccl_chrome_indexeddb/ccl_leveldb.py @@ -70,7 +70,8 @@ def read_le_varint(stream: typing.BinaryIO, *, is_google_32bit=False) -> typing. return x[0] -def read_length_prefixed_blob(stream: typing.BinaryIO): +def read_length_prefixed_blob(stream: typing.BinaryIO) -> bytes: + """Reads a blob of data which is prefixed with a varint length""" length = read_le_varint(stream) data = stream.read(length) if len(data) != length: @@ -86,11 +87,11 @@ class BlockHandle: length: int @classmethod - def from_stream(cls, stream: typing.BinaryIO): + def from_stream(cls, stream: typing.BinaryIO) -> "BlockHandle": return cls(read_le_varint(stream), read_le_varint(stream)) @classmethod - def from_bytes(cls, data: bytes): + def from_bytes(cls, data: bytes) -> "BlockHandle": with io.BytesIO(data) as stream: return BlockHandle.from_stream(stream) @@ -129,7 +130,8 @@ class Record: was_compressed: bool @property - def user_key(self): + def user_key(self) -> bytes: + """Returns the "userkey" which omits the metadata bytes which may reside at the end of the raw key""" if self.file_type == FileType.Ldb: if len(self.key) < 8: return self.key @@ -138,10 +140,9 @@ def user_key(self): else: return self.key - @classmethod def ldb_record(cls, key: bytes, value: bytes, origin_file: os.PathLike, - offset: int, was_compressed: bool): + offset: int, was_compressed: bool) -> "Record": seq = (struct.unpack("> 8 if len(key) > 8: state = KeyState.Deleted if key[-8] == 0 else KeyState.Live @@ -151,7 +152,7 @@ def ldb_record(cls, key: bytes, value: bytes, origin_file: os.PathLike, @classmethod def log_record(cls, key: bytes, value: bytes, seq: int, state: KeyState, - origin_file: os.PathLike, offset: int): + origin_file: os.PathLike, offset: int) -> "Record": return cls(key, value, seq, state, FileType.Log, origin_file, offset, False) @@ -223,7 +224,7 @@ def __init__(self, file: pathlib.Path): self._index = self._read_index() - def _read_block(self, handle: BlockHandle): + def _read_block(self, handle: BlockHandle) -> Block: # block is the size in the blockhandle plus the trailer # the trailer is 5 bytes long. 
# idx size meaning diff --git a/pyhindsight/lib/ccl_chrome_indexeddb/ccl_simplesnappy.py b/pyhindsight/lib/ccl_chrome_indexeddb/ccl_simplesnappy.py index ff11f30..eee1d11 100644 --- a/pyhindsight/lib/ccl_chrome_indexeddb/ccl_simplesnappy.py +++ b/pyhindsight/lib/ccl_chrome_indexeddb/ccl_simplesnappy.py @@ -26,7 +26,7 @@ import typing import enum -__version__ = "0.1" +__version__ = "0.3" __description__ = "Pure Python reimplementation of Google's Snappy decompression" __contact__ = "Alex Caithness" @@ -78,17 +78,17 @@ def read_le_varint(stream: typing.BinaryIO) -> typing.Optional[int]: def read_uint16(stream: typing.BinaryIO) -> int: - """Reads a Uint16 from stream""" + """Reads an Uint16 from stream""" return struct.unpack(" int: - """Reads a Uint24 from stream""" + """Reads an Uint24 from stream""" return struct.unpack(" int: - """Reads a Uint32 from stream""" + """Reads an Uint32 from stream""" return struct.unpack(" bytes: raise ValueError("Offset cannot be 0") actual_offset = out.tell() - offset - log(f"Current Outstream Length: {out.tell()}") - log(f"Backreference length: {length}") - log(f"Backreference relative offset: {offset}") - log(f"Backreference absolute offset: {actual_offset}") + # log(f"Current Outstream Length: {out.tell()}") + # log(f"Backreference length: {length}") + # log(f"Backreference relative offset: {offset}") + # log(f"Backreference absolute offset: {actual_offset}") # have to read incrementally because you might have to read data that you've just written - # this is probably a really slow way of doing this. - for i in range(length): - out.write(out.getbuffer()[actual_offset + i: actual_offset + i + 1].tobytes()) + # for i in range(length): + # out.write(out.getbuffer()[actual_offset + i: actual_offset + i + 1].tobytes()) + buffer = out.getbuffer()[actual_offset: actual_offset + length].tobytes() + if offset - length <= 0: + # better safe than sorry, this way we're sure to extend it + # as much as needed without doing some extra calculations + buffer = (buffer * length)[:length] + out.write(buffer) result = out.getvalue() if uncompressed_length != len(result): diff --git a/pyhindsight/lib/ccl_chrome_indexeddb/ccl_v8_value_deserializer.py b/pyhindsight/lib/ccl_chrome_indexeddb/ccl_v8_value_deserializer.py index d80750d..9619b85 100644 --- a/pyhindsight/lib/ccl_chrome_indexeddb/ccl_v8_value_deserializer.py +++ b/pyhindsight/lib/ccl_chrome_indexeddb/ccl_v8_value_deserializer.py @@ -27,7 +27,7 @@ import typing import re -__version__ = "0.1" +__version__ = "0.1.1" __description__ = "Partial reimplementation of the V8 Javascript Object Serialization" __contact__ = "Alex Caithness" @@ -46,12 +46,13 @@ def log(msg, debug_only=True): print(f"{caller_name} ({caller_line}):\t{msg}") -def read_le_varint(stream: typing.BinaryIO) -> typing.Optional[typing.Tuple[int, bytes]]: +def read_le_varint(stream: typing.BinaryIO, is_32bit=False) -> typing.Optional[typing.Tuple[int, bytes]]: # this only outputs unsigned + limit = 5 if is_32bit else 10 i = 0 result = 0 underlying_bytes = [] - while i < 10: # 64 bit max possible? + while i < limit: # 64 bit max possible? 
raw = stream.read(1) if len(raw) < 1: return None @@ -80,9 +81,20 @@ def __str__(self): return "" +class SharedObject: + def __init__(self, object_id: int): + self.id = object_id + + def __repr__(self): + return f"" + + def __str__(self): + return f"" + + class Constants: # Constants - kLatestVersion = 13 + kLatestVersion = 15 # version:uint32_t (if at beginning of data, sets version > 0) token_kVersion = b"\xFF" @@ -163,6 +175,8 @@ class Constants: token_kArrayBufferView = b"V" # Shared array buffer. transferID:uint32_t token_kSharedArrayBuffer = b"u" + # A HeapObject shared across Isolates.sharedValueID: uint32_t + token_kSharedObject = 'p' # A wasm module object transfer. next value is its index. token_kWasmModuleTransfer = b"w" # The delegate is responsible for processing all following data. @@ -284,6 +298,9 @@ def _read_zigzag(self) -> int: else: return unsigned >> 1 + def _read_unit32(self) -> int: + return self._read_le_varint()[0] + def _read_double(self) -> float: return struct.unpack(f"{self._endian}d", self._read_raw(8))[0] @@ -512,6 +529,10 @@ def _wrap_js_array_buffer_view(self, raw: bytes) -> tuple: if byte_offset + byte_length > len(raw): raise ValueError("Not enough data in the raw data to hold the defined data") + # See: https://github.com/v8/v8/blob/4d34ea98bb655295ab1f9003f6783bd509b7ccb3/src/objects/value-serializer.cc#L1967 + if self.version >= 14: + flags = self._read_le_varint()[0] + log(f"ArrayBufferView: tag: {tag}; byte_offset: {byte_offset}; byte_length: {byte_length}") fmt = ArrayBufferViewTag.STRUCT_LOOKUP[tag] @@ -529,6 +550,10 @@ def _read_host_object(self) -> typing.Any: self._objects.append(result) return result + def _read_shared_object(self) -> SharedObject: + shobj_id = self._read_le_varint()[0] + return SharedObject(shobj_id) + def _not_implemented(self): raise NotImplementedError("Todo") @@ -544,7 +569,7 @@ def _read_object_internal(self) -> typing.Tuple[bytes, typing.Any]: Constants.token_kTrueObject: lambda: Deserializer.__ODDBALLS[Constants.token_kTrue], Constants.token_kFalseObject: lambda: Deserializer.__ODDBALLS[Constants.token_kFalse], Constants.token_kNumberObject: self._read_double, - Constants.token_kUint32: self._read_le_varint, + Constants.token_kUint32: self._read_unit32, Constants.token_kInt32: self._read_zigzag, Constants.token_kDouble: self._read_double, Constants.token_kDate: self._read_date, @@ -557,6 +582,7 @@ def _read_object_internal(self) -> typing.Tuple[bytes, typing.Any]: Constants.token_kRegExp: self._read_js_regex, Constants.token_kObjectReference: self._read_object_by_reference, Constants.token_kBeginJSObject: self._read_js_object, + Constants.token_kSharedObject: self._read_shared_object, Constants.token_kBeginSparseJSArray: self._read_js_sparse_array, Constants.token_kBeginDenseJSArray: self._read_js_dense_array, Constants.token_kBeginJSMap: self._read_js_map, diff --git a/pyhindsight/lib/ccl_chrome_indexeddb/dump_indexeddb_details.py b/pyhindsight/lib/ccl_chrome_indexeddb/dump_indexeddb_details.py deleted file mode 100644 index 287c005..0000000 --- a/pyhindsight/lib/ccl_chrome_indexeddb/dump_indexeddb_details.py +++ /dev/null @@ -1,37 +0,0 @@ -import sys -import pathlib -import ccl_chromium_indexeddb - - -def main(args): - ldb_path = pathlib.Path(args[0]) - wrapper = ccl_chromium_indexeddb.WrappedIndexDB(ldb_path) - - for db_info in wrapper.database_ids: - db = wrapper[db_info.dbid_no] - print("------Database------") - print(f"db_number={db.db_number}; name={db.name}; origin={db.origin}") - print() - print("\t---Object 
Stores---") - for obj_store_name in db.object_store_names: - obj_store = db[obj_store_name] - print(f"\tobject_store_id={obj_store.object_store_id}; name={obj_store.name}") - try: - one_record = next(obj_store.iterate_records()) - except StopIteration: - one_record = None - if one_record is not None: - print("\tExample record:") - print(f"\tkey: {one_record.key}") - print(f"\tvalue: {one_record.value}") - else: - print("\tNo records") - print() - print() - - -if __name__ == '__main__': - if len(sys.argv) < 2: - print(f"USAGE: {pathlib.Path(sys.argv[0]).name} ") - exit(1) - main(sys.argv[1:]) diff --git a/pyhindsight/plugins/chrome_extensions.py b/pyhindsight/plugins/chrome_extensions.py index cbb4094..a9ff606 100644 --- a/pyhindsight/plugins/chrome_extensions.py +++ b/pyhindsight/plugins/chrome_extensions.py @@ -12,11 +12,11 @@ # Config friendlyName = "Chrome Extension Names" description = "Adds the name and description of each Chrome extension found to the Interpretation field" -artifactTypes = ("url", "local storage") +artifactTypes = ("url", "local storage", "indexeddb") remoteLookups = 0 browser = "Chrome" browserVersion = 1 -version = "20210424" +version = "20240428" parsedItems = 0 diff --git a/pyhindsight/plugins/generic_timestamps.py b/pyhindsight/plugins/generic_timestamps.py index 717265b..f0630cd 100644 --- a/pyhindsight/plugins/generic_timestamps.py +++ b/pyhindsight/plugins/generic_timestamps.py @@ -10,11 +10,11 @@ # Config friendlyName = "Generic Timestamp Decoder" description = "Attempts to detect and decode potential epoch second, epoch millisecond, and Webkit timestamps" -artifactTypes = ("cookie (created)", "cookie (accessed)", "local storage") +artifactTypes = ("cookie (created)", "cookie (accessed)", "local storage", "indexeddb") remoteLookups = 0 browser = "Chrome" browserVersion = 1 -version = "20160907" +version = "20240428" parsedItems = 0