Skip to content

Commit

Permalink
Merge pull request #163 from obsidianforensics/issue-162
Browse files — browse the repository at this point in the history
Add try/except around urllib parsing. Modify the HSTS domain parsing …
  • Loading branch information
obsidianforensics authored Mar 18, 2024
2 parents f186013 + ba0c482 commit 6adb28d
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 12 deletions.
38 changes: 27 additions & 11 deletions pyhindsight/browsers/chrome.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2203,21 +2203,37 @@ def get_site_characteristics(self, path, dir_name):
def build_hsts_domain_hashes(self):
domains = set()
for artifact in self.parsed_artifacts:
if isinstance(artifact, self.HistoryItem):
artifact_url = artifact.url
if not isinstance(artifact, self.HistoryItem):
continue

if not artifact_url:
continue
if not artifact.url:
continue

artifact_url = artifact.url

# Some artifact "URLs" will be in invalid forms, which urllib (rightly)
# won't parse. Modify these URLs so they will parse properly.
# Examples:
# Cookie: ".example.com",
# Preferences (cookie_controls_metadata): "https://[*.]example.com"
prefixes = ('.', 'https://[*.]', 'http://[*.]')

# Cookie artifact's "URLs" will be in the form ".example.com",
# which won't parse, so modify it so it will
if artifact_url and artifact_url.startswith('.'):
artifact_url = 'http://' + artifact_url[1:]
for prefix in prefixes:
if artifact_url.startswith(prefix):
artifact_url = 'http://' + artifact_url[len(prefix):]

if artifact_url.endswith(',*'):
artifact_url = artifact_url[:-2]

try:
domain = urllib.parse.urlparse(artifact_url).hostname
# Some URLs don't have a domain, like local PDF files
if domain:
domains.add(domain)
except ValueError as e:
log.warning(f'Error when parsing domain from {artifact_url}; {e}')
continue

# Some URLs don't have a domain, like local PDF files
if domain:
domains.add(domain)

for domain in domains:

Expand Down
6 changes: 5 additions & 1 deletion pyhindsight/browsers/webbrowser.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -114,7 +114,11 @@ def dict_factory(cursor, row):
def build_md5_hash_list_of_origins(self):
for artifact in self.parsed_artifacts:
if isinstance(artifact, self.HistoryItem):
domain = urllib.parse.urlparse(artifact.url).hostname
try:
domain = urllib.parse.urlparse(artifact.url).hostname
except ValueError as e:
log.warning(f'Error when parsing domain from {artifact.url}; {e}')
continue
# Some URLs don't have a domain, like local PDF files
if domain:
self.origin_hashes[hashlib.md5(domain.encode()).hexdigest()] = domain
Expand Down

0 comments on commit 6adb28d

Please sign in to comment.