Merge pull request #163 from obsidianforensics/issue-162

Add try/except around urllib parsing. Modify the HSTS domain parsing to handle more invalid URL forms (e.g. "https://[*.]example.com" and a trailing ",*").
obsidianforensics · Mar 18, 2024 · 6adb28d
2 parents f186013 + ba0c482
commit 6adb28d
Show file tree

Hide file tree

Showing 2 changed files with 32 additions and 12 deletions.
diff --git a/pyhindsight/browsers/chrome.py b/pyhindsight/browsers/chrome.py
@@ -2203,21 +2203,37 @@ def get_site_characteristics(self, path, dir_name):
     def build_hsts_domain_hashes(self):
         domains = set()
         for artifact in self.parsed_artifacts:
-            if isinstance(artifact, self.HistoryItem):
-                artifact_url = artifact.url
+            if not isinstance(artifact, self.HistoryItem):
+                continue
 
-                if not artifact_url:
-                    continue
+            if not artifact.url:
+                continue
+
+            artifact_url = artifact.url
+
+            # Some artifact "URLs" will be in invalid forms, which urllib (rightly)
+            # won't parse. Modify these URLs so they will parse properly.
+            # Examples:
+            #   Cookie: ".example.com",
+            #   Preferences (cookie_controls_metadata): "https://[*.]example.com"
+            prefixes = ('.', 'https://[*.]', 'http://[*.]')
 
-                # Cookie artifact's "URLs" will be in the form ".example.com",
-                # which won't parse, so modify it so it will
-                if artifact_url and artifact_url.startswith('.'):
-                    artifact_url = 'http://' + artifact_url[1:]
+            for prefix in prefixes:
+                if artifact_url.startswith(prefix):
+                    artifact_url = 'http://' + artifact_url[len(prefix):]
 
+            if artifact_url.endswith(',*'):
+                artifact_url = artifact_url[:-2]
+
+            try:
                 domain = urllib.parse.urlparse(artifact_url).hostname
-                # Some URLs don't have a domain, like local PDF files
-                if domain:
-                    domains.add(domain)
+            except ValueError as e:
+                log.warning(f'Error when parsing domain from {artifact_url}; {e}')
+                continue
+
+            # Some URLs don't have a domain, like local PDF files
+            if domain:
+                domains.add(domain)
 
         for domain in domains:
 

diff --git a/pyhindsight/browsers/webbrowser.py b/pyhindsight/browsers/webbrowser.py
@@ -114,7 +114,11 @@ def dict_factory(cursor, row):
     def build_md5_hash_list_of_origins(self):
         for artifact in self.parsed_artifacts:
             if isinstance(artifact, self.HistoryItem):
-                domain = urllib.parse.urlparse(artifact.url).hostname
+                try:
+                    domain = urllib.parse.urlparse(artifact.url).hostname
+                except ValueError as e:
+                    log.warning(f'Error when parsing domain from {artifact.url}; {e}')
+                    continue
                 # Some URLs don't have a domain, like local PDF files
                 if domain:
                     self.origin_hashes[hashlib.md5(domain.encode()).hexdigest()] = domain