diff --git a/demo.py b/demo.py index 4267faf43..7feaac7e2 100644 --- a/demo.py +++ b/demo.py @@ -65,7 +65,6 @@ def callback(success: bool, val: str = site) -> None: command_sequence = CommandSequence( site, site_rank=index, - reset=True, callback=callback, ) @@ -74,5 +73,5 @@ def callback(success: bool, val: str = site) -> None: # Have a look at custom_command.py to see how to implement your own command command_sequence.append_command(LinkCountingCommand()) - # Run commands across the three browsers (simple parallelization) + # Run commands across all browsers (simple parallelization) manager.execute_command_sequence(command_sequence) diff --git a/docs/Configuration.md b/docs/Configuration.md index 7d9e42b1a..433817516 100644 --- a/docs/Configuration.md +++ b/docs/Configuration.md @@ -249,11 +249,6 @@ TODO # Browser Profile Support -**WARNING: Stateful crawls are currently not supported. Attempts to run -stateful crawls will throw `NotImplementedError`s. The work required to -restore support is tracked in -[this project](https://github.com/mozilla/OpenWPM/projects/2).** - ## Stateful vs Stateless crawls By default OpenWPM performs a "stateful" crawl, in that it keeps a consistent @@ -329,7 +324,6 @@ but will not be used during crash recovery. Specifically: profile specified by `seed_tar`. If OpenWPM determines that Firefox needs to restart for some reason during the crawl, it will use the profile from the most recent page visit (pre-crash) rather than the `seed_tar` profile. -Note that stateful crawls are currently [unsupported](https://github.com/mozilla/OpenWPM/projects/2)). * For stateless crawls, the initial `seed_tar` will be loaded during each new page visit. 
Note that this means the profile will very likely be _incomplete_, as cookies or storage may have been set or changed during the diff --git a/docs/Release-Checklist.md b/docs/Release-Checklist.md index 8bca1c5b3..8eb45d1c9 100644 --- a/docs/Release-Checklist.md +++ b/docs/Release-Checklist.md @@ -1,6 +1,6 @@ # Release Checklist -We aim to release a new version of OpenWPM with each new Firefox release (~1 release per month). The following steps are necessary for a release +We aim to release a new version of OpenWPM with each new Firefox release (~1 release per month). The following steps are necessary for a release: 1. Upgrade Firefox to the newest version. 1. Go to: https://hg.mozilla.org/releases/mozilla-release/tags. @@ -11,10 +11,11 @@ We aim to release a new version of OpenWPM with each new Firefox release (~1 rel 2. Run `npm update` in `openwpm/Extension/webext-instrumentation`. 3. Run `npm update` in the base directory 3. Update python and system dependencies by following the ["managing requirements" instructions](../CONTRIBUTING.md#managing-requirements). -4. Increment the version number in [VERSION](../VERSION) -5. Add a summary of changes since the last version to [CHANGELOG](../CHANGELOG.md) -6. Squash and merge the release PR to master. -7. Publish a new release from https://github.com/mozilla/OpenWPM/releases: +4. If a new version of geckodriver is used, check whether the default geckodriver browser preferences in [`openwpm/deploy_browsers/configure_firefox.py`](../openwpm/deploy_browsers/configure_firefox.py#L8-L65) need to be updated. +5. Increment the version number in [VERSION](../VERSION) +6. Add a summary of changes since the last version to [CHANGELOG](../CHANGELOG.md) +7. Squash and merge the release PR to master. +8. Publish a new release from https://github.com/mozilla/OpenWPM/releases: 1. Click "Draft a new release". 2. Enter the "Tag version" and "Release title" as `vX.X.X`. 3. 
In the description: diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 544b8d817..1341b7432 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -5,9 +5,11 @@ import shutil import signal import sys +import tempfile import threading import time import traceback +from pathlib import Path from queue import Empty as EmptyQueue from typing import Optional, Union @@ -16,6 +18,7 @@ from selenium.common.exceptions import WebDriverException from tblib import pickling_support +from .commands.profile_commands import dump_profile from .commands.types import BaseCommand, ShutdownSignal from .config import BrowserParamsInternal, ManagerParamsInternal from .deploy_browsers import deploy_firefox @@ -33,7 +36,7 @@ class Browser: """ - The Browser class is responsbile for holding all of the + The Browser class is responsible for holding all of the configuration and status information on BrowserManager process it corresponds to. It also includes a set of methods for managing the BrowserManager process and its child processes/threads. 
@@ -52,7 +55,7 @@ def __init__( self._UNSUCCESSFUL_SPAWN_LIMIT = 4 # manager parameters - self.current_profile_path = None + self.current_profile_path: Optional[Path] = None self.db_socket_address = manager_params.storage_controller_address assert browser_params.browser_id is not None self.browser_id: BrowserId = browser_params.browser_id @@ -62,7 +65,7 @@ def __init__( # Queues and process IDs for BrowserManager - # thread to run commands issues from TaskManager + # thread to run commands issued from TaskManager self.command_thread: Optional[threading.Thread] = None # queue for passing command tuples to BrowserManager self.command_queue: Optional[Queue] = None @@ -75,7 +78,7 @@ def __init__( # the port of the display for the Xvfb display (if it exists) self.display_port: Optional[int] = None - # boolean that says if the BrowserManager new (to optimize restarts) + # boolean that says if the BrowserManager is new (to optimize restarts) self.is_fresh = True # boolean indicating if the browser should be restarted self.restart_required = False @@ -97,29 +100,29 @@ def launch_browser_manager(self): sets up the BrowserManager and gets the process id, browser pid and, if applicable, screen pid. loads associated user profile if necessary """ - # Unsupported. 
See https://github.com/mozilla/OpenWPM/projects/2 # if this is restarting from a crash, update the tar location # to be a tar of the crashed browser's history - """ if self.current_profile_path is not None: # tar contents of crashed profile to a temp dir - tempdir = tempfile.mkdtemp(prefix="owpm_profile_archive_") + "/" - profile_commands.dump_profile( - self.current_profile_path, - self.manager_params, - self.browser_params, - tempdir, - close_webdriver=False, + tempdir = tempfile.mkdtemp(prefix="openwpm_profile_archive_") + tar_path = Path(tempdir) / "profile.tar" + + dump_profile( + browser_profile_path=self.current_profile_path, + tar_path=tar_path, + compress=False, + browser_params=self.browser_params, ) + # make sure browser loads crashed profile - self.browser_params.recovery_tar = tempdir + self.browser_params.recovery_tar = tar_path crash_recovery = True else: - """ + tempdir = None + crash_recovery = False + self.logger.info("BROWSER %i: Launching browser..." % self.browser_id) - tempdir = None - crash_recovery = False self.is_fresh = not crash_recovery # Try to spawn the browser within the timelimit @@ -159,8 +162,8 @@ def check_queue(launch_status): # Read success status of browser manager launch_status = dict() try: - # 1. Selenium profile created - spawned_profile_path = check_queue(launch_status) + # 1. Browser profile created + browser_profile_path = check_queue(launch_status) # 2. Profile tar loaded (if necessary) check_queue(launch_status) # 3. Display launched (if necessary) @@ -170,7 +173,7 @@ def check_queue(launch_status): # 5. 
Browser launched self.geckodriver_pid = check_queue(launch_status) - (driver_profile_path, ready) = check_queue(launch_status) + ready = check_queue(launch_status) if ready != "READY": self.logger.error( "BROWSER %i: Mismatch of status queue return values, " @@ -183,7 +186,6 @@ def check_queue(launch_status): unsuccessful_spawns += 1 error_string = "" status_strings = [ - "Proxy Ready", "Profile Created", "Profile Tar", "Display", @@ -202,17 +204,15 @@ def check_queue(launch_status): ) self.close_browser_manager() if "Profile Created" in launch_status: - shutil.rmtree(spawned_profile_path, ignore_errors=True) + shutil.rmtree(browser_profile_path, ignore_errors=True) # If the browser spawned successfully, we should update the # current profile path class variable and clean up the tempdir # and previous profile path. if success: - self.logger.debug("BROWSER %i: Browser spawn sucessful!" % self.browser_id) + self.logger.debug("BROWSER %i: Browser spawn successful!" % self.browser_id) previous_profile_path = self.current_profile_path - self.current_profile_path = driver_profile_path - if driver_profile_path != spawned_profile_path: - shutil.rmtree(spawned_profile_path, ignore_errors=True) + self.current_profile_path = browser_profile_path if previous_profile_path is not None: shutil.rmtree(previous_profile_path, ignore_errors=True) if tempdir is not None: @@ -360,7 +360,7 @@ def kill_browser_manager(self): os.kill(self.display_pid, signal.SIGKILL) except OSError: self.logger.debug( - "BROWSER %i: Display process does not " "exit" % self.browser_id + "BROWSER %i: Display process does not exit" % self.browser_id ) pass except TypeError: @@ -368,7 +368,7 @@ def kill_browser_manager(self): "BROWSER %i: PID may not be the correct " "type %s" % (self.browser_id, str(self.display_pid)) ) - if self.display_port is not None: # xvfb diplay lock + if self.display_port is not None: # xvfb display lock lockfile = "/tmp/.X%s-lock" % self.display_port try: os.remove(lockfile) @@ 
-394,33 +394,27 @@ def shutdown_browser(self, during_init: bool, force: bool = False) -> None: self.close_browser_manager(force=force) # Archive browser profile (if requested) - if not during_init and self.browser_params.profile_archive_dir is not None: - self.logger.warning( - "BROWSER %i: Archiving the browser profile directory is " - "currently unsupported. " - "See: https://github.com/mozilla/OpenWPM/projects/2" % self.browser_id - ) - """ self.logger.debug( - "BROWSER %i: during_init=%s | profile_archive_dir=%s" % ( - self.browser_id, str(during_init), - self.browser_params.profile_archive_dir) + "BROWSER %i: during_init=%s | profile_archive_dir=%s" + % ( + self.browser_id, + str(during_init), + self.browser_params.profile_archive_dir, + ) ) - if (not during_init and - self.browser_params.profile_archive_dir is not None): + if not during_init and self.browser_params.profile_archive_dir is not None: self.logger.debug( - "BROWSER %i: Archiving browser profile directory to %s" % ( - self.browser_id, - self.browser_params.profile_archive_dir)) - profile_commands.dump_profile( - self.current_profile_path, - self.manager_params, - self.browser_params, - self.browser_params.profile_archive_dir, - close_webdriver=False, - compress=True + "BROWSER %i: Archiving browser profile directory to %s" + % (self.browser_id, self.browser_params.profile_archive_dir) + ) + tar_path = self.browser_params.profile_archive_dir / "profile.tar.gz" + assert self.current_profile_path is not None + dump_profile( + browser_profile_path=self.current_profile_path, + tar_path=tar_path, + compress=True, + browser_params=self.browser_params, ) - """ # Clean up temporary files if self.current_profile_path is not None: @@ -441,22 +435,20 @@ def BrowserManager( display = None try: # Start Xvfb (if necessary), webdriver, and browser - driver, prof_folder, display = deploy_firefox.deploy_firefox( + driver, browser_profile_path, display = deploy_firefox.deploy_firefox( status_queue, browser_params, 
manager_params, crash_recovery ) - if prof_folder[-1] != "/": - prof_folder += "/" # Read the extension port -- if extension is enabled # TODO: Initial communication from extension to TM should use sockets if browser_params.extension_enabled: logger.debug( "BROWSER %i: Looking for extension port information " - "in %s" % (browser_params.browser_id, prof_folder) + "in %s" % (browser_params.browser_id, browser_profile_path) ) elapsed = 0 port = None - ep_filename = os.path.join(prof_folder, "extension_port.txt") + ep_filename = browser_profile_path / "extension_port.txt" while elapsed < 5: try: with open(ep_filename, "rt") as f: @@ -483,10 +475,9 @@ def BrowserManager( logger.debug("BROWSER %i: BrowserManager ready." % browser_params.browser_id) - # passes the profile folder back to the - # TaskManager to signal a successful startup - status_queue.put(("STATUS", "Browser Ready", (prof_folder, "READY"))) - browser_params.profile_path = prof_folder + # passes "READY" to the TaskManager to signal a successful startup + status_queue.put(("STATUS", "Browser Ready", "READY")) + browser_params.profile_path = browser_profile_path # starts accepting arguments until told to die while True: @@ -498,12 +489,6 @@ def BrowserManager( command: Union[ShutdownSignal, BaseCommand] = command_queue.get() if type(command) is ShutdownSignal: - # Geckodriver creates a copy of the profile (and the original - # temp file created by FirefoxProfile() is deleted). 
- # We clear the profile attribute here to prevent prints from: - # https://github.com/SeleniumHQ/selenium/blob/4e4160dd3d2f93757cafb87e2a1c20d6266f5554/py/selenium/webdriver/firefox/webdriver.py#L193-L199 - if driver.profile and not os.path.isdir(driver.profile.path): - driver.profile = None driver.quit() status_queue.put("OK") return diff --git a/openwpm/command_sequence.py b/openwpm/command_sequence.py index a5eca5b0c..ac79dce55 100644 --- a/openwpm/command_sequence.py +++ b/openwpm/command_sequence.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Callable, List, Tuple from .commands.browser_commands import ( @@ -10,6 +11,7 @@ SaveScreenshotCommand, ScreenshotFullPageCommand, ) +from .commands.profile_commands import DumpProfileCommand from .commands.types import BaseCommand from .errors import CommandExecutionError @@ -18,7 +20,7 @@ class CommandSequence: """A CommandSequence wraps a series of commands to be performed on a visit to one top-level site into one logical "site visit," keyed by a visit id. An example of a CommandSequence - that visits a page and dumps cookies modified on that visit would be: + that visits a page and saves a screenshot of it would be: sequence = CommandSequence(url) sequence.get() @@ -87,15 +89,15 @@ def browse(self, num_links=2, sleep=0, timeout=60): self.contains_get_or_browse = True def dump_profile( - self, dump_folder, close_webdriver=False, compress=True, timeout=120 - ): + self, + tar_path: Path, + close_webdriver: bool = False, + compress: bool = True, + timeout: int = 120, + ) -> None: """ dumps from the profile path to a given file (absolute path) """ - raise NotImplementedError( - "Profile saving is currently unsupported. " - "See: https://github.com/mozilla/OpenWPM/projects/2." 
- ) self.total_timeout += timeout - command = DumpProfCommand(dump_folder, close_webdriver, compress) + command = DumpProfileCommand(tar_path, close_webdriver, compress) self._commands_with_timeout.append((command, timeout)) def save_screenshot(self, suffix="", timeout=30): @@ -103,7 +105,7 @@ def save_screenshot(self, suffix="", timeout=30): self.total_timeout += timeout if not self.contains_get_or_browse: raise CommandExecutionError( - "No get or browse request preceding " "the save screenshot command", + "No get or browse request preceding the save screenshot command", self, ) command = SaveScreenshotCommand(suffix) @@ -131,7 +133,7 @@ def screenshot_full_page(self, suffix="", timeout=30): self.total_timeout += timeout if not self.contains_get_or_browse: raise CommandExecutionError( - "No get or browse request preceding " "the dump page source command", + "No get or browse request preceding the screenshot full page command", self, ) command = ScreenshotFullPageCommand(suffix) @@ -142,7 +144,7 @@ def dump_page_source(self, suffix="", timeout=30): self.total_timeout += timeout if not self.contains_get_or_browse: raise CommandExecutionError( - "No get or browse request preceding " "the dump page source command", + "No get or browse request preceding the dump page source command", self, ) command = DumpPageSourceCommand(suffix) @@ -171,7 +173,8 @@ def recursive_dump_page_source(self, suffix="", timeout=30): self.total_timeout += timeout if not self.contains_get_or_browse: raise CommandExecutionError( - "No get or browse request preceding " "the dump page source command", + "No get or browse request preceding the recursive dump" + " page source command", self, ) command = RecursiveDumpPageSourceCommand(suffix) @@ -188,7 +191,6 @@ def get_commands_with_timeout(self) -> List[Tuple[BaseCommand, int]]: """Returns a list of all commands in the command_sequence appended by a finalize command """ - commands = list(self._commands_with_timeout) commands.insert(0, 
(InitializeCommand(), 10)) commands.append((FinalizeCommand(sleep=5), 10)) diff --git a/openwpm/commands/profile_commands.py b/openwpm/commands/profile_commands.py index d44781d49..f4483e9d2 100644 --- a/openwpm/commands/profile_commands.py +++ b/openwpm/commands/profile_commands.py @@ -15,23 +15,91 @@ logger = logging.getLogger("openwpm") +def dump_profile( + browser_profile_path: Path, + tar_path: Path, + compress: bool, + browser_params: BrowserParamsInternal, +) -> None: + """Dumps a browser profile to a tar file.""" + assert browser_params.browser_id is not None + + # Creating the folders if need be + tar_path.parent.mkdir(exist_ok=True, parents=True) + + # see if this file exists first + # if it does, delete it before we try to save the current session + if tar_path.exists(): + tar_path.unlink() + + # backup and tar profile + if compress: + tar = tarfile.open(tar_path, "w:gz", errorlevel=1) + else: + tar = tarfile.open(tar_path, "w", errorlevel=1) + logger.debug( + "BROWSER %i: Backing up full profile from %s to %s" + % (browser_params.browser_id, browser_profile_path, tar_path) + ) + + storage_vector_files = [ + "cookies.sqlite", # cookies + "cookies.sqlite-shm", + "cookies.sqlite-wal", + "places.sqlite", # history + "places.sqlite-shm", + "places.sqlite-wal", + "webappsstore.sqlite", # localStorage + "webappsstore.sqlite-shm", + "webappsstore.sqlite-wal", + ] + storage_vector_dirs = [ + "webapps", # related to localStorage? + "storage", # directory for IndexedDB + ] + for item in storage_vector_files: + full_path = browser_profile_path / item + if ( + not full_path.is_file() + and not full_path.name.endswith("shm") + and not full_path.name.endswith("wal") + ): + logger.critical( + "BROWSER %i: %s NOT FOUND IN profile folder, skipping." 
+ % (browser_params.browser_id, full_path) + ) + elif not full_path.is_file() and ( + full_path.name.endswith("shm") or full_path.name.endswith("wal") + ): + continue # These are just checkpoint files + tar.add(full_path, arcname=item) + for item in storage_vector_dirs: + full_path = browser_profile_path / item + if not full_path.is_dir(): + logger.warning( + "BROWSER %i: %s NOT FOUND IN profile folder, skipping." + % (browser_params.browser_id, full_path) + ) + continue + tar.add(full_path, arcname=item) + tar.close() + + class DumpProfileCommand(BaseCommand): """ - Dumps a browser profile currently stored in to - + Dumps a browser profile currently stored in to + . """ - def __init__(self, tar_path: Path, close_webdriver: bool, compress: bool) -> None: + def __init__( + self, tar_path: Path, close_webdriver: bool, compress: bool = True + ) -> None: self.tar_path = tar_path self.close_webdriver = close_webdriver self.compress = compress - raise NotImplementedError( - "Profile dumping is currently unsupported. " - "See: https://github.com/mozilla/OpenWPM/projects/2." 
- ) def __repr__(self) -> str: - return "DumpProfCommand({},{},{})".format( + return "DumpProfileCommand({},{},{})".format( self.tar_path, self.close_webdriver, self.compress ) @@ -42,110 +110,40 @@ def execute( manager_params: ManagerParamsInternal, extension_socket: ClientSocket, ) -> None: - browser_profile_folder = browser_params.profile_path - assert browser_profile_folder is not None - - # Creating the folders if need be - self.tar_path.parent.mkdir(exist_ok=True, parents=True) - - # see if this file exists first - # if it does, delete it before we try to save the current session - if self.tar_path.exists(): - self.tar_path.unlink() # IDK why it's called like this # if this is a dump on close, close the webdriver and wait for checkpoint if self.close_webdriver: webdriver.close() - sleep_until_sqlite_checkpoint(browser_profile_folder) - - # backup and tar profile - if self.compress: - tar = tarfile.open(self.tar_path, "w:gz", errorlevel=1) - else: - tar = tarfile.open(self.tar_path, "w", errorlevel=1) - logger.debug( - "BROWSER %i: Backing up full profile from %s to %s" - % ( - self.browser_id, - browser_profile_folder, - self.tar_path, - ) + sleep_until_sqlite_checkpoint(browser_params.profile_path) + + assert browser_params.profile_path is not None + dump_profile( + browser_params.profile_path, + self.tar_path, + self.compress, + browser_params, ) - storage_vector_files = [ - "cookies.sqlite", # cookies - "cookies.sqlite-shm", - "cookies.sqlite-wal", - "places.sqlite", # history - "places.sqlite-shm", - "places.sqlite-wal", - "webappsstore.sqlite", # localStorage - "webappsstore.sqlite-shm", - "webappsstore.sqlite-wal", - ] - storage_vector_dirs = [ - "webapps", # related to localStorage? 
- "storage", # directory for IndexedDB - ] - for item in storage_vector_files: - full_path = browser_profile_folder / item - if ( - not full_path.is_file() - and not full_path.name.endswith("shm") - and not full_path.name.endswith("wal") - ): - logger.critical( - "BROWSER %i: %s NOT FOUND IN profile folder, skipping." - % (self.browser_id, full_path) - ) - elif not full_path.is_file() and ( - full_path.name.endswith("shm") or full_path.name.endswith("wal") - ): - continue # These are just checkpoint files - tar.add(full_path, arcname=item) - for item in storage_vector_dirs: - full_path = browser_profile_folder / item - if not full_path.is_dir(): - logger.warning( - "BROWSER %i: %s NOT FOUND IN profile folder, skipping." - % (self.browser_id, full_path) - ) - continue - tar.add(full_path, arcname=item) - tar.close() def load_profile( - browser_profile_folder: Path, + browser_profile_path: Path, manager_params: ManagerParamsInternal, browser_params: BrowserParamsInternal, tar_path: Path, ) -> None: """ - loads a zipped cookie-based profile stored at <tar_path> and - unzips it to <browser_profile_folder>. - The tar will remain unmodified. + Loads a zipped cookie-based profile stored at <tar_path> and unzips + it to <browser_profile_path>. The tar will remain unmodified. 
""" - - assert tar_path.is_file() assert browser_params.browser_id is not None try: - # Copy and untar the loaded profile - logger.debug( - "BROWSER %i: Copying profile tar from %s to %s" - % ( - browser_params.browser_id, - tar_path, - browser_profile_folder, - ) - ) - shutil.copy(tar_path, browser_profile_folder) - tar_path = browser_profile_folder / tar_path.name + assert tar_path.is_file() + # Untar the loaded profile if tar_path.name.endswith("tar.gz"): f = tarfile.open(tar_path, "r:gz", errorlevel=1) else: f = tarfile.open(tar_path, "r", errorlevel=1) - f.extractall(browser_profile_folder) + f.extractall(browser_profile_path) f.close() - tar_path.unlink() logger.debug("BROWSER %i: Tarfile extracted" % browser_params.browser_id) except Exception as ex: diff --git a/openwpm/config.py b/openwpm/config.py index d8f4a4777..4b64f5784 100644 --- a/openwpm/config.py +++ b/openwpm/config.py @@ -97,7 +97,9 @@ class BrowserParams(DataClassJsonMixin): prefs: dict = field(default_factory=dict) tp_cookies: str = "always" bot_mitigation: bool = False - profile_archive_dir: Optional[str] = None + profile_archive_dir: Optional[Path] = field( + default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path) + ) recovery_tar: Optional[Path] = None donottrack: bool = False tracking_protection: bool = False diff --git a/openwpm/deploy_browsers/configure_firefox.py b/openwpm/deploy_browsers/configure_firefox.py index c8367d165..ce6d39fe1 100644 --- a/openwpm/deploy_browsers/configure_firefox.py +++ b/openwpm/deploy_browsers/configure_firefox.py @@ -1,7 +1,105 @@ """ Set prefs and load extensions in Firefox """ +import json +import re +from pathlib import Path +from typing import Any, Dict + +from ..config import BrowserParams + +# TODO: Remove hardcoded geckodriver default preferences. 
See +# https://github.com/mozilla/OpenWPM/issues/867 +# Source of preferences: +# https://hg.mozilla.org/mozilla-central/file/tip/testing/geckodriver/src/prefs.rs +# https://hg.mozilla.org/mozilla-central/file/tip/testing/geckodriver/src/marionette.rs +DEFAULT_GECKODRIVER_PREFS = { + "app.normandy.api_url": "", + "app.update.checkInstallTime": False, + "app.update.disabledForTesting": True, + "app.update.auto": False, + "browser.dom.window.dump.enabled": True, + "devtools.console.stdout.chrome": True, + "browser.safebrowsing.blockedURIs.enabled": False, + "browser.safebrowsing.downloads.enabled": False, + "browser.safebrowsing.passwords.enabled": False, + "browser.safebrowsing.malware.enabled": False, + "browser.safebrowsing.phishing.enabled": False, + "browser.sessionstore.resume_from_crash": False, + "browser.shell.checkDefaultBrowser": False, + "browser.startup.homepage_override.mstone": "ignore", + "browser.startup.page": 0, + "browser.tabs.closeWindowWithLastTab": False, + "browser.tabs.warnOnClose": False, + "browser.uitour.enabled": False, + "browser.warnOnQuit": False, + "datareporting.healthreport.documentServerURI": "http://%(server)s/dummy/healthreport/", + "datareporting.healthreport.logging.consoleEnabled": False, + "datareporting.healthreport.service.enabled": False, + "datareporting.healthreport.service.firstRun": False, + "datareporting.healthreport.uploadEnabled": False, + "datareporting.policy.dataSubmissionEnabled": False, + "datareporting.policy.dataSubmissionPolicyBypassNotification": True, + "dom.ipc.reportProcessHangs": False, + "extensions.autoDisableScopes": 0, + "extensions.enabledScopes": 5, + "extensions.installDistroAddons": False, + "extensions.update.enabled": False, + "extensions.update.notifyUser": False, + "focusmanager.testmode": True, + "general.useragent.updates.enabled": False, + "geo.provider.testing": True, + "geo.wifi.scan": False, + "hangmonitor.timeout": 0, + "idle.lastDailyNotification": -1, + 
"javascript.options.showInConsole": True, + "media.gmp-manager.updateEnabled": False, + "media.sanity-test.disabled": True, + "network.http.phishy-userpass-length": 255, + "network.manage-offline-status": False, + "network.sntp.pools": "%(server)s", + "plugin.state.flash": 0, + "security.certerrors.mitm.priming.enabled": False, + "services.settings.server": "http://%(server)s/dummy/blocklist/", + "startup.homepage_welcome_url": "about:blank", + "startup.homepage_welcome_url.additional": "", + "toolkit.startup.max_resumed_crashes": -1, + "marionette.log.level": "Info", +} + + +def load_existing_prefs(browser_profile_path: Path) -> Dict[str, Any]: + """Load existing user preferences. + + If the browser profile contains a user.js file, load the preferences + specified inside it into a dictionary. + """ + prefs: Dict[str, Any] = {} + prefs_path = browser_profile_path / "user.js" + if not prefs_path.is_file(): + return prefs + # Regular expression from https://stackoverflow.com/a/24563687 + r = re.compile(r"\s*user_pref\(([\"'])(.+?)\1,\s*(.+?)\);") + with open(prefs_path, "r") as f: + for line in f: + m = r.match(line) + if m: + key, value = m.group(2), m.group(3) + prefs[key] = json.loads(value) + return prefs + + +def save_prefs_to_profile(prefs: Dict[str, Any], browser_profile_path: Path) -> None: + """Save all preferences to the browser profile. + + Write preferences from the prefs dictionary to a user.js file in the + profile directory. + """ + with open(browser_profile_path / "user.js", "w") as f: + for key, value in prefs.items(): + f.write('user_pref("%s", %s);\n' % (key, json.dumps(value))) + -def privacy(browser_params, fp, fo, root_dir, browser_profile_path): +def privacy(browser_params: BrowserParams, prefs: Dict[str, Any]) -> None: """ Configure the privacy settings in Firefox. 
This includes: * DNT @@ -12,15 +110,15 @@ def privacy(browser_params, fp, fo, root_dir, browser_profile_path): # Turns on Do Not Track if browser_params.donottrack: - fo.set_preference("privacy.donottrackheader.enabled", True) + prefs["privacy.donottrackheader.enabled"] = True # Sets the third party cookie setting if browser_params.tp_cookies.lower() == "never": - fo.set_preference("network.cookie.cookieBehavior", 1) + prefs["network.cookie.cookieBehavior"] = 1 elif browser_params.tp_cookies.lower() == "from_visited": - fo.set_preference("network.cookie.cookieBehavior", 3) + prefs["network.cookie.cookieBehavior"] = 3 else: # always allow third party cookies - fo.set_preference("network.cookie.cookieBehavior", 0) + prefs["network.cookie.cookieBehavior"] = 0 # Tracking Protection if browser_params.tracking_protection: @@ -31,7 +129,7 @@ def privacy(browser_params, fp, fo, root_dir, browser_profile_path): ) -def optimize_prefs(fo): +def optimize_prefs(prefs: Dict[str, Any]) -> None: """ Disable various features and checks the browser will do on startup. Some of these (e.g. 
disabling the newtab page) are required to prevent @@ -42,113 +140,113 @@ def optimize_prefs(fo): * https://github.com/pyllyukko/user.js/blob/master/user.js """ # noqa # Startup / Speed - fo.set_preference("browser.shell.checkDefaultBrowser", False) - fo.set_preference("browser.slowStartup.notificationDisabled", True) - fo.set_preference("browser.slowStartup.maxSamples", 0) - fo.set_preference("browser.slowStartup.samples", 0) - fo.set_preference("extensions.checkCompatibility.nightly", False) - fo.set_preference("browser.rights.3.shown", True) - fo.set_preference("reader.parse-on-load.enabled", False) - fo.set_preference("browser.pagethumbnails.capturing_disabled", True) - fo.set_preference("browser.uitour.enabled", False) - fo.set_preference("dom.flyweb.enabled", False) + prefs["browser.shell.checkDefaultBrowser"] = False + prefs["browser.slowStartup.notificationDisabled"] = True + prefs["browser.slowStartup.maxSamples"] = 0 + prefs["browser.slowStartup.samples"] = 0 + prefs["extensions.checkCompatibility.nightly"] = False + prefs["browser.rights.3.shown"] = True + prefs["reader.parse-on-load.enabled"] = False + prefs["browser.pagethumbnails.capturing_disabled"] = True + prefs["browser.uitour.enabled"] = False + prefs["dom.flyweb.enabled"] = False # Disable health reports / telemetry / crash reports - fo.set_preference("datareporting.policy.dataSubmissionEnabled", False) - fo.set_preference("datareporting.healthreport.uploadEnabled", False) - fo.set_preference("datareporting.healthreport.service.enabled", False) - fo.set_preference("toolkit.telemetry.archive.enabled", False) - fo.set_preference("toolkit.telemetry.enabled", False) - fo.set_preference("toolkit.telemetry.unified", False) - fo.set_preference("breakpad.reportURL", "") - fo.set_preference("dom.ipc.plugins.reportCrashURL", False) - fo.set_preference("browser.selfsupport.url", "") - fo.set_preference("browser.tabs.crashReporting.sendReport", False) - 
fo.set_preference("browser.crashReports.unsubmittedCheck.enabled", False) - fo.set_preference("dom.ipc.plugins.flash.subprocess.crashreporter.enabled", False) + prefs["datareporting.policy.dataSubmissionEnabled"] = False + prefs["datareporting.healthreport.uploadEnabled"] = False + prefs["datareporting.healthreport.service.enabled"] = False + prefs["toolkit.telemetry.archive.enabled"] = False + prefs["toolkit.telemetry.enabled"] = False + prefs["toolkit.telemetry.unified"] = False + prefs["breakpad.reportURL"] = "" + prefs["dom.ipc.plugins.reportCrashURL"] = False + prefs["browser.selfsupport.url"] = "" + prefs["browser.tabs.crashReporting.sendReport"] = False + prefs["browser.crashReports.unsubmittedCheck.enabled"] = False + prefs["dom.ipc.plugins.flash.subprocess.crashreporter.enabled"] = False # Predictive Actions / Prefetch - fo.set_preference("network.predictor.enabled", False) - fo.set_preference("network.dns.disablePrefetch", True) - fo.set_preference("network.prefetch-next", False) - fo.set_preference("browser.search.suggest.enabled", False) - fo.set_preference("network.http.speculative-parallel-limit", 0) - fo.set_preference("keyword.enabled", False) # location bar using search - fo.set_preference("browser.urlbar.userMadeSearchSuggestionsChoice", True) - fo.set_preference("browser.casting.enabled", False) + prefs["network.predictor.enabled"] = False + prefs["network.dns.disablePrefetch"] = True + prefs["network.prefetch-next"] = False + prefs["browser.search.suggest.enabled"] = False + prefs["network.http.speculative-parallel-limit"] = 0 + prefs["keyword.enabled"] = False # location bar using search + prefs["browser.urlbar.userMadeSearchSuggestionsChoice"] = True + prefs["browser.casting.enabled"] = False # Disable pinging Mozilla for geoip - fo.set_preference("browser.search.geoip.url", "") - fo.set_preference("browser.search.countryCode", "US") - fo.set_preference("browser.search.region", "US") + prefs["browser.search.geoip.url"] = "" + 
prefs["browser.search.countryCode"] = "US" + prefs["browser.search.region"] = "US" # Disable pinging Mozilla for geo-specific search - fo.set_preference("browser.search.geoSpecificDefaults", False) - fo.set_preference("browser.search.geoSpecificDefaults.url", "") + prefs["browser.search.geoSpecificDefaults"] = False + prefs["browser.search.geoSpecificDefaults.url"] = "" # Disable auto-updating - fo.set_preference("app.update.enabled", False) # browser - fo.set_preference("app.update.url", "") # browser - fo.set_preference("browser.search.update", False) # search - fo.set_preference("extensions.update.enabled", False) # extensions - fo.set_preference("extensions.update.autoUpdateDefault", False) - fo.set_preference("extensions.getAddons.cache.enabled", False) - fo.set_preference("lightweightThemes.update.enabled", False) # Personas + prefs["app.update.enabled"] = False # browser + prefs["app.update.url"] = "" # browser + prefs["browser.search.update"] = False # search + prefs["extensions.update.enabled"] = False # extensions + prefs["extensions.update.autoUpdateDefault"] = False + prefs["extensions.getAddons.cache.enabled"] = False + prefs["lightweightThemes.update.enabled"] = False # Personas # Disable Safebrowsing and other security features # that require on remote content - fo.set_preference("browser.safebrowsing.phising.enabled", False) - fo.set_preference("browser.safebrowsing.malware.enabled", False) - fo.set_preference("browser.safebrowsing.downloads.enabled", False) - fo.set_preference("browser.safebrowsing.downloads.remote.enabled", False) - fo.set_preference("browser.safebrowsing.blockedURIs.enabled", False) - fo.set_preference("browser.safebrowsing.provider.mozilla.gethashURL", "") - fo.set_preference("browser.safebrowsing.provider.google.gethashURL", "") - fo.set_preference("browser.safebrowsing.provider.google4.gethashURL", "") - fo.set_preference("browser.safebrowsing.provider.mozilla.updateURL", "") - 
fo.set_preference("browser.safebrowsing.provider.google.updateURL", "") - fo.set_preference("browser.safebrowsing.provider.google4.updateURL", "") - fo.set_preference("browser.safebrowsing.provider.mozilla.lists", "") # TP - fo.set_preference("browser.safebrowsing.provider.google.lists", "") # TP - fo.set_preference("browser.safebrowsing.provider.google4.lists", "") # TP - fo.set_preference("extensions.blocklist.enabled", False) # extensions - fo.set_preference("security.OCSP.enabled", 0) + prefs["browser.safebrowsing.phishing.enabled"] = False + prefs["browser.safebrowsing.malware.enabled"] = False + prefs["browser.safebrowsing.downloads.enabled"] = False + prefs["browser.safebrowsing.downloads.remote.enabled"] = False + prefs["browser.safebrowsing.blockedURIs.enabled"] = False + prefs["browser.safebrowsing.provider.mozilla.gethashURL"] = "" + prefs["browser.safebrowsing.provider.google.gethashURL"] = "" + prefs["browser.safebrowsing.provider.google4.gethashURL"] = "" + prefs["browser.safebrowsing.provider.mozilla.updateURL"] = "" + prefs["browser.safebrowsing.provider.google.updateURL"] = "" + prefs["browser.safebrowsing.provider.google4.updateURL"] = "" + prefs["browser.safebrowsing.provider.mozilla.lists"] = "" # TP + prefs["browser.safebrowsing.provider.google.lists"] = "" # TP + prefs["browser.safebrowsing.provider.google4.lists"] = "" # TP + prefs["extensions.blocklist.enabled"] = False # extensions + prefs["security.OCSP.enabled"] = 0 # Disable Content Decryption Module and OpenH264 related downloads - fo.set_preference("media.gmp-manager.url", "") - fo.set_preference("media.gmp-provider.enabled", False) - fo.set_preference("media.gmp-widevinecdm.enabled", False) - fo.set_preference("media.gmp-widevinecdm.visible", False) - fo.set_preference("media.gmp-gmpopenh264.enabled", False) + prefs["media.gmp-manager.url"] = "" + prefs["media.gmp-provider.enabled"] = False + prefs["media.gmp-widevinecdm.enabled"] = False + prefs["media.gmp-widevinecdm.visible"] =
False + prefs["media.gmp-gmpopenh264.enabled"] = False # Disable Experiments - fo.set_preference("experiments.enabled", False) - fo.set_preference("experiments.manifest.uri", "") - fo.set_preference("experiments.supported", False) - fo.set_preference("experiments.activeExperiment", False) - fo.set_preference("network.allow-experiments", False) + prefs["experiments.enabled"] = False + prefs["experiments.manifest.uri"] = "" + prefs["experiments.supported"] = False + prefs["experiments.activeExperiment"] = False + prefs["network.allow-experiments"] = False # Disable pinging Mozilla for newtab - fo.set_preference("browser.newtabpage.directory.ping", "") - fo.set_preference("browser.newtabpage.directory.source", "") - fo.set_preference("browser.newtabpage.enabled", False) - fo.set_preference("browser.newtabpage.enhanced", False) - fo.set_preference("browser.newtabpage.introShown", True) - fo.set_preference("browser.aboutHomeSnippets.updateUrl", "") + prefs["browser.newtabpage.directory.ping"] = "" + prefs["browser.newtabpage.directory.source"] = "" + prefs["browser.newtabpage.enabled"] = False + prefs["browser.newtabpage.enhanced"] = False + prefs["browser.newtabpage.introShown"] = True + prefs["browser.aboutHomeSnippets.updateUrl"] = "" # Disable Pocket - fo.set_preference("extensions.pocket.enabled", False) + prefs["extensions.pocket.enabled"] = False # Disable Shield - fo.set_preference("app.shield.optoutstudies.enabled", False) - fo.set_preference("extensions.shield-recipe-client.enabled", False) + prefs["app.shield.optoutstudies.enabled"] = False + prefs["extensions.shield-recipe-client.enabled"] = False - # Disable Source Pragams + # Disable Source Pragmas # As per https://bugzilla.mozilla.org/show_bug.cgi?id=1628853 # sourceURL can be used to obfuscate the original origin of # a script, we disable it. 
- fo.set_preference("javascript.options.source_pragmas", False) + prefs["javascript.options.source_pragmas"] = False # Enable extensions and disable extension signing - fo.set_preference("extensions.experiments.enabled", True) - fo.set_preference("xpinstall.signatures.required", False) + prefs["extensions.experiments.enabled"] = True + prefs["xpinstall.signatures.required"] = False diff --git a/openwpm/deploy_browsers/deploy_firefox.py b/openwpm/deploy_browsers/deploy_firefox.py index 862525689..826b8830a 100755 --- a/openwpm/deploy_browsers/deploy_firefox.py +++ b/openwpm/deploy_browsers/deploy_firefox.py @@ -1,6 +1,8 @@ import json import logging import os.path +import socket +import tempfile from pathlib import Path from typing import Any, Dict, Optional, Tuple @@ -8,7 +10,6 @@ from multiprocess import Queue from pyvirtualdisplay import Display from selenium import webdriver -from selenium.webdriver.firefox.firefox_profile import FirefoxProfile from ..commands.profile_commands import load_profile from ..config import BrowserParamsInternal, ConfigEncoder, ManagerParamsInternal @@ -25,7 +26,7 @@ def deploy_firefox( browser_params: BrowserParamsInternal, manager_params: ManagerParamsInternal, crash_recovery: bool, -) -> Tuple[webdriver.Firefox, str, Optional[Display]]: +) -> Tuple[webdriver.Firefox, Path, Optional[Display]]: """ launches a firefox instance with parameters set by the input dictionary """ @@ -33,14 +34,20 @@ def deploy_firefox( root_dir = os.path.dirname(__file__) # directory of this file - fp = FirefoxProfile() - browser_profile_path = Path(fp.path) + browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_")) status_queue.put(("STATUS", "Profile Created", browser_profile_path)) # Use Options instead of FirefoxProfile to set preferences since the # Options method has no "frozen"/restricted options. 
# https://github.com/SeleniumHQ/selenium/issues/2106#issuecomment-320238039 fo = Options() + # Set a custom profile that is used in-place and is not deleted by geckodriver. + # https://firefox-source-docs.mozilla.org/testing/geckodriver/CrashReports.html + # Using FirefoxProfile breaks stateful crawling: + # https://github.com/mozilla/OpenWPM/issues/423#issuecomment-521018093 + fo.add_argument("-profile") + fo.add_argument(str(browser_profile_path)) + assert browser_params.browser_id is not None if browser_params.seed_tar and not crash_recovery: logger.info( @@ -110,16 +117,32 @@ def deploy_firefox( # TODO restore detailed logging # fo.set_preference("extensions.@openwpm.sdk.console.logLevel", "all") + # Geckodriver currently places the user.js file in the wrong profile + # directory, so we have to create it manually here. + # TODO: See https://github.com/mozilla/OpenWPM/issues/867 for when + # to remove this workaround. + # Load existing preferences from the profile's user.js file + prefs = configure_firefox.load_existing_prefs(browser_profile_path) + # Load default geckodriver preferences + prefs.update(configure_firefox.DEFAULT_GECKODRIVER_PREFS) + # Pick an available port for Marionette (https://stackoverflow.com/a/2838309) + # This has a race condition, as another process may get the port + # before Marionette, but we don't expect it to happen often + s = socket.socket() + s.bind(("", 0)) + marionette_port = s.getsockname()[1] + s.close() + prefs["marionette.port"] = marionette_port + # Configure privacy settings - configure_firefox.privacy(browser_params, fp, fo, root_dir, browser_profile_path) + configure_firefox.privacy(browser_params, prefs) # Set various prefs to improve speed and eliminate traffic to Mozilla - configure_firefox.optimize_prefs(fo) + configure_firefox.optimize_prefs(prefs) # Intercept logging at the Selenium level and redirect it to the - # main logger. This will also inform us where the real profile - # directory is hiding. 
- interceptor = FirefoxLogInterceptor(browser_params.browser_id, browser_profile_path) + # main logger. + interceptor = FirefoxLogInterceptor(browser_params.browser_id) interceptor.start() # Set custom prefs. These are set after all of the default prefs to allow @@ -129,16 +152,21 @@ def deploy_firefox( "BROWSER %i: Setting custom preference: %s = %s" % (browser_params.browser_id, name, value) ) - fo.set_preference(name, value) + prefs[name] = value + + # Write all preferences to the profile's user.js file + configure_firefox.save_prefs_to_profile(prefs, browser_profile_path) # Launch the webdriver status_queue.put(("STATUS", "Launch Attempted", None)) fb = FirefoxBinary(firefox_path=firefox_binary_path) driver = webdriver.Firefox( - firefox_profile=fp, firefox_binary=fb, - firefox_options=fo, + options=fo, log_path=interceptor.fifo, + # TODO: See https://github.com/mozilla/OpenWPM/issues/867 for + # when to remove this + service_args=["--marionette-port", str(marionette_port)], ) # Add extension @@ -165,4 +193,4 @@ def deploy_firefox( status_queue.put(("STATUS", "Browser Launched", int(pid))) - return driver, driver.capabilities["moz:profile"], display + return driver, browser_profile_path, display diff --git a/openwpm/deploy_browsers/selenium_firefox.py b/openwpm/deploy_browsers/selenium_firefox.py index 67d1450b3..77938c688 100644 --- a/openwpm/deploy_browsers/selenium_firefox.py +++ b/openwpm/deploy_browsers/selenium_firefox.py @@ -46,15 +46,13 @@ class FirefoxLogInterceptor(threading.Thread): """ Intercept logs from Selenium and/or geckodriver, using a named pipe and a detached thread, and feed them to the primary logger for this - instance. Also responsible for extracting the _real_ profile location - from geckodriver's log output (geckodriver copies the profile). + instance. 
""" - def __init__(self, browser_id, profile_path): + def __init__(self, browser_id): threading.Thread.__init__(self, name="log-interceptor-%i" % browser_id) self.browser_id = browser_id self.fifo = mktempfifo(suffix=".log", prefix="owpm_driver_") - self.profile_path = profile_path self.daemon = True self.logger = logging.getLogger("openwpm") @@ -68,11 +66,6 @@ def run(self): self.logger.debug( "BROWSER %i: driver: %s" % (self.browser_id, line.strip()) ) - if "Using profile path" in line: - self.profile_path = line.partition("Using profile path")[ - -1 - ].strip() - if self.fifo is not None: os.unlink(self.fifo) self.fifo = None @@ -83,7 +76,7 @@ def run(self): self.fifo = None -class PatchedGeckoDriverService(BaseService): +class PatchedGeckoDriverService(FirefoxDriverModule.Service): """Object that manages the starting and stopping of the GeckoDriver. Modified from the original (selenium.webdriver.firefox.service.Service) for Py3 compat in the presence of log FIFOs, and for potential future @@ -128,11 +121,5 @@ def __init__( ) self.service_args = service_args or [] - def command_line_args(self): - return ["--port", "%d" % self.port] - - def send_remote_shutdown_command(self): - pass - FirefoxDriverModule.Service = PatchedGeckoDriverService diff --git a/openwpm/task_manager.py b/openwpm/task_manager.py index e61d98f04..934b4707c 100644 --- a/openwpm/task_manager.py +++ b/openwpm/task_manager.py @@ -305,8 +305,8 @@ def _shutdown_manager( Parameters ---------- during_init : - flag to indicator if this shutdown is occuring during - the TaskManager initialization + flag to indicate if this shutdown is occuring during + the TaskManager initialization relaxed : If `True` the function will wait for all active `CommandSequences` to finish before shutting down @@ -434,17 +434,6 @@ def _issue_command( assert browser.browser_id is not None assert browser.curr_visit_id is not None reset = command_sequence.reset - if not reset: - self.logger.warning( - "BROWSER %i: Browser 
will not reset after CommandSequence " - "executes. OpenWPM does not currently support stateful crawls " - "(see: https://github.com/mozilla/OpenWPM/projects/2). " - "The next command issued to this browser may or may not " - "use the same profile (depending on the failure status of " - "this command). To prevent this warning, initialize the " - "CommandSequence with `reset` set to `True` to use a fresh " - "profile for each command." % browser.browser_id - ) self.logger.info( "Starting to work on CommandSequence with " "visit_id %d on browser with id %d", diff --git a/test/conftest.py b/test/conftest.py index e33801938..fe9cd9d70 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -24,13 +24,17 @@ pytest_plugins = "test.storage.fixtures" -@pytest.fixture(scope="session") def xpi(): # Creates a new xpi using npm run build. print("Building new xpi") subprocess.check_call(["npm", "run", "build"], cwd=EXTENSION_DIR) +@pytest.fixture(name="xpi", scope="session") +def xpi_fixture(): + return xpi() + + @pytest.fixture(scope="session") def server(): """Run an HTTP server during the tests.""" diff --git a/test/manual_test.py b/test/manual_test.py index 0f5afb347..3fcdf9c16 100644 --- a/test/manual_test.py +++ b/test/manual_test.py @@ -1,11 +1,15 @@ import atexit +import shutil import subprocess +import tempfile from os.path import dirname, join, realpath +from pathlib import Path import click import IPython from selenium import webdriver from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from selenium.webdriver.firefox.options import Options from openwpm import js_instrumentation as jsi from openwpm.config import BrowserParams @@ -88,7 +92,7 @@ def start_webdriver( Set to True to load browser_params browser_params_file : string Specify the browser_params.json to load. - If None, default params form openwpm/config.py::BrowserParams will be loaded. + If None, default params from openwpm/config.py::BrowserParams will be loaded. 
Returns ------- @@ -110,16 +114,35 @@ def cleanup_server(): print("...server shutdown") driver.quit() print("...webdriver closed") + shutil.rmtree(driver.capabilities["moz:profile"], ignore_errors=True) + print("...browser profile removed") atexit.register(cleanup_server) return driver - fp = webdriver.FirefoxProfile() + browser_profile_path = Path(tempfile.mkdtemp(prefix="firefox_profile_")) + fo = Options() + fo.add_argument("-profile") + fo.add_argument(str(browser_profile_path)) + # TODO: See https://github.com/mozilla/OpenWPM/issues/867 for when + # to remove manually creating user.js + prefs = configure_firefox.load_existing_prefs(browser_profile_path) + prefs.update(configure_firefox.DEFAULT_GECKODRIVER_PREFS) + if with_extension: # TODO: Restore preference for log level in a way that works in Fx 57+ # fp.set_preference("extensions.@openwpm.sdk.console.logLevel", "all") - configure_firefox.optimize_prefs(fp) - driver = webdriver.Firefox(firefox_binary=fb, firefox_profile=fp) + configure_firefox.optimize_prefs(prefs) + + configure_firefox.save_prefs_to_profile(prefs, browser_profile_path) + driver = webdriver.Firefox( + firefox_binary=fb, + options=fo, + # Use the default Marionette port. + # TODO: See https://github.com/mozilla/OpenWPM/issues/867 for + # when to remove this + service_args=["--marionette-port", "2828"], + ) if load_browser_params is True: # There's probably more we could do here # to set more preferences and better emulate @@ -134,8 +157,7 @@ def cleanup_server(): js_request_as_string = jsi.clean_js_instrumentation_settings(js_request) browser_params.js_instrument_settings = js_request_as_string - profile_dir = driver.capabilities["moz:profile"] - with open(join(profile_dir, "browser_params.json"), "w") as f: + with open(browser_profile_path / "browser_params.json", "w") as f: f.write(browser_params.to_json()) if with_extension: @@ -192,9 +214,9 @@ def start_webext(): "--browser-params-file", help=""" Specify a browser_params.json file. 
If none provided and - --browser-params is enabled. Default browser_params.json - will be used. Pass an absolute path or a path relative - to the test directory.""", + --browser-params is enabled the default params from + openwpm/config.py::BrowserParams will be loaded. Pass an + absolute path or a path relative to the test directory.""", ) def main(selenium, no_extension, browser_params, browser_params_file): diff --git a/test/test_callback.py b/test/test_callback.py index 472307632..51a6c3513 100644 --- a/test/test_callback.py +++ b/test/test_callback.py @@ -7,7 +7,7 @@ def test_local_callbacks(default_params, task_manager_creator): - """Test test the storage controller as well as the entire callback machinery + """Test the storage controller as well as the entire callback machinery to see if all callbacks get correctly called""" manager, _ = task_manager_creator(default_params) TEST_SITE = BASE_TEST_URL + "/test_pages/simple_a.html" @@ -17,7 +17,7 @@ def callback(argument: List[int], success: bool) -> None: my_list: List[int] = [] sequence = CommandSequence( - TEST_SITE, reset=True, blocking=True, callback=partial(callback, my_list) + TEST_SITE, blocking=True, callback=partial(callback, my_list) ) sequence.get() diff --git a/test/test_crawl.py b/test/test_crawl.py index c65a2ab2b..d9235edec 100644 --- a/test/test_crawl.py +++ b/test/test_crawl.py @@ -1,18 +1,19 @@ -# type:ignore -# As this file is no longer maintained, mypy shouldn't check this +"""Runs a short test crawl. + +This should be used to test any features that require real crawl data. +This should be avoided if possible, as controlled tests will be easier +to debug. 
+""" + +import json import os import tarfile -from pathlib import Path -from typing import List, Tuple import domain_utils as du import pytest -from openwpm.config import BrowserParams, ManagerParams from openwpm.utilities import db_utils -from .openwpmtest import OpenWPMTest - TEST_SITES = [ "http://google.com", "http://facebook.com", @@ -37,122 +38,105 @@ ] -def get_public_suffix(url): - url_parts = du.hostname_subparts(url, include_ps=True) - return url_parts[-1] - +@pytest.mark.skipif( + "CI" not in os.environ or os.environ["CI"] == "false", + reason="Makes remote connections", +) +@pytest.mark.slow +def test_browser_profile_coverage(default_params, task_manager_creator): + """Test the coverage of the browser's profile. -class TestCrawl(OpenWPMTest): - """Runs a short test crawl. - - This should be used to test any features that require real - crawl data. This should be avoided if possible, as controlled - tests will be easier to debug + This verifies that Firefox's places.sqlite database contains all + visited sites (with a few exceptions). If it does not, it is likely + the profile is lost at some point during the crawl. """ - - def get_config( - self, data_dir: Path = None - ) -> Tuple[ManagerParams, List[BrowserParams]]: - manager_params, browser_params = self.get_test_config(data_dir) - browser_params[0].profile_archive_dir = os.path.join( - manager_params.data_directory, "browser_profile" - ) - browser_params[0].http_instrument = True - return manager_params, browser_params - - @pytest.mark.xfail(run=False) - @pytest.mark.slow - def test_browser_profile_coverage(self, tmpdir: Path, task_manager_creator) -> None: - """Test the coverage of the browser's profile - - This verifies that Firefox's places.sqlite database contains - all visited sites (with a few exceptions). 
If it does not, - it is likely the profile is lost at some point during the crawl - """ - # Run the test crawl - data_dir = tmpdir / "data_dir" - manager_params, browser_params = self.get_config(data_dir) - manager, crawl_db = task_manager_creator((manager_params, browser_params)) - for site in TEST_SITES: - manager.get(site) - ff_db_tar = os.path.join( - browser_params[0].profile_archive_dir, "profile.tar.gz" + # Run the test crawl + manager_params, browser_params = default_params + manager_params.num_browsers = 1 + browser_params[0].profile_archive_dir = ( + manager_params.data_directory / "browser_profile" + ) + browser_params[0].http_instrument = True + manager, crawl_db = task_manager_creator((manager_params, browser_params[:1])) + for site in TEST_SITES: + manager.get(site) + manager.close() + + # Extract crawl profile + ff_db_tar = browser_params[0].profile_archive_dir / "profile.tar.gz" + with tarfile.open(ff_db_tar) as tar: + tar.extractall(browser_params[0].profile_archive_dir) + + # Output databases + ff_db = browser_params[0].profile_archive_dir / "places.sqlite" + + # Grab urls from crawl database + rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests") + req_ps = set() # visited domains from http_requests table + for (url,) in rows: + req_ps.add(du.get_ps_plus_1(url)) + + hist_ps = set() # visited domains from crawl_history Table + statuses = dict() + rows = db_utils.query_db( + crawl_db, + "SELECT arguments, command_status FROM crawl_history WHERE" + " command='GetCommand'", + ) + for arguments, command_status in rows: + url = json.loads(arguments)["url"] + ps = du.get_ps_plus_1(url) + hist_ps.add(ps) + statuses[ps] = command_status + + # Grab urls from Firefox database + profile_ps = set() # visited domains from firefox profile + rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places") + for (host,) in rows: + try: + profile_ps.add(du.get_ps_plus_1(host)) + except AttributeError: + pass + + # We expect a url to be in the Firefox 
profile if: + # 1. We've made requests to it + # 2. The url is a top_url we entered into the address bar + # 3. The url successfully loaded (see: Issue #40) + # 4. The site does not respond to the initial request with a 204 + # (won't show in FF DB) + missing_urls = req_ps.intersection(hist_ps).difference(profile_ps) + unexpected_missing_urls = set() + for url in missing_urls: + if statuses[url] != "ok": + continue + + # Get the visit id for the url + rows = db_utils.query_db( + crawl_db, + "SELECT visit_id FROM site_visits WHERE site_url = ?", + ("http://" + url,), ) - manager.close() - - # Extract crawl profile - with tarfile.open(ff_db_tar) as tar: - tar.extractall(browser_params[0].profile_archive_dir) - - # Output databases - ff_db = os.path.join(browser_params[0].profile_archive_dir, "places.sqlite") + visit_id = rows[0][0] - # Grab urls from crawl database - rows = db_utils.query_db(crawl_db, "SELECT url FROM http_requests") - req_ps = set() # visited domains from http_requests table - for (url,) in rows: - req_ps.add(get_public_suffix(url)) + rows = db_utils.query_db( + crawl_db, + "SELECT COUNT(*) FROM http_responses WHERE visit_id = ?", + (visit_id,), + ) + if rows[0][0] > 1: + continue - hist_ps = set() # visited domains from crawl_history Table - statuses = dict() rows = db_utils.query_db( crawl_db, - "SELECT arguments, command_status " - "FROM crawl_history WHERE command='GET'", + "SELECT response_status, location FROM " + "http_responses WHERE visit_id = ?", + (visit_id,), ) - for url, command_status in rows: - ps = get_public_suffix(url) - hist_ps.add(ps) - statuses[ps] = command_status - - # Grab urls from Firefox database - profile_ps = set() # visited domains from firefox profile - rows = db_utils.query_db(ff_db, "SELECT url FROM moz_places") - for (host,) in rows: - try: - profile_ps.add(get_public_suffix(host)) - except AttributeError: - pass - - # We expect urls to be in the Firefox profile if: - # 1. We've made requests to it - # 2.
The url is a top_url we entered into the address bar - # 3. The url successfully loaded (see: Issue #40) - # 4. The site does not respond to the initial request with a 204 - # (won't show in FF DB) - missing_urls = req_ps.intersection(hist_ps).difference(profile_ps) - unexpected_missing_urls = set() - for url in missing_urls: - if command_status[url] != "ok": - continue - - # Get the visit id for the url - rows = db_utils.query_db( - crawl_db, - "SELECT visit_id FROM site_visits " "WHERE site_url = ?", - ("http://" + url,), - ) - visit_id = rows[0] - - rows = db_utils.query_db( - crawl_db, - "SELECT COUNT(*) FROM http_responses " "WHERE visit_id = ?", - (visit_id,), - ) - if rows[0] > 1: - continue - - rows = db_utils.query_db( - crawl_db, - "SELECT response_status, location FROM " - "http_responses WHERE visit_id = ?", - (visit_id,), - ) - response_status, location = rows[0] - if response_status == 204: - continue - if location == "http://": # site returned a blank redirect - continue - unexpected_missing_urls.add(url) - - assert len(unexpected_missing_urls) == 0 + response_status, location = rows[0] + if response_status == 204: + continue + if location == "http://": # site returned a blank redirect + continue + unexpected_missing_urls.add(url) + + assert len(unexpected_missing_urls) == 0 diff --git a/test/test_profile.py b/test/test_profile.py index 85c05c80e..32839fa4b 100644 --- a/test/test_profile.py +++ b/test/test_profile.py @@ -1,86 +1,86 @@ -from os.path import isfile, join from pathlib import Path -from typing import Any, List, Optional, Tuple +from typing import Any import pytest from openwpm.command_sequence import CommandSequence from openwpm.commands.types import BaseCommand -from openwpm.config import BrowserParams, ManagerParams from openwpm.errors import CommandExecutionError, ProfileLoadError -from openwpm.task_manager import TaskManager from openwpm.utilities import db_utils -from .openwpmtest import OpenWPMTest +from .utilities import 
BASE_TEST_URL # TODO update these tests to make use of blocking commands -class TestProfile(OpenWPMTest): - def get_config( - self, data_dir: Optional[Path] - ) -> Tuple[ManagerParams, List[BrowserParams]]: - manager_params, browser_params = self.get_test_config(data_dir) - browser_params[0].profile_archive_dir = join( - manager_params.data_directory, "browser_profile" - ) - return manager_params, browser_params - - @pytest.mark.xfail(run=False) - def test_saving(self): - manager_params, browser_params = self.get_config() - manager = TaskManager(manager_params, browser_params) - manager.get("http://example.com") - manager.close() - assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz")) - - @pytest.mark.xfail(run=False) - def test_crash_profile(self): - manager_params, browser_params = self.get_config() - manager_params.failure_limit = 2 - manager = TaskManager(manager_params, browser_params) - try: - manager.get("http://example.com") # So we have a profile - manager.get("example.com") # Selenium requires scheme prefix - manager.get("example.com") # Selenium requires scheme prefix - manager.get("example.com") # Selenium requires scheme prefix - manager.get("example.com") # Requires two commands to shut down - except CommandExecutionError: - pass - assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz")) - - @pytest.mark.xfail(run=False) - def test_profile_error(self): - manager_params, browser_params = self.get_config() - browser_params[0].seed_tar = "/tmp/NOTREAL" - with pytest.raises(ProfileLoadError): - TaskManager(manager_params, browser_params) # noqa - - @pytest.mark.skip(reason="proxy no longer supported, need to update") - def test_profile_saved_when_launch_crashes(self): - manager_params, browser_params = self.get_config() - browser_params[0].proxy = True - browser_params[0].save_content = "script" - manager = TaskManager(manager_params, browser_params) - manager.get("http://example.com") - - # Kill the 
LevelDBAggregator - # This will cause the proxy launch to crash - manager.ldb_status_queue.put("DIE") - manager.browsers[0]._SPAWN_TIMEOUT = 2 # Have timeout occur quickly - manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2 # Quick timeout - manager.get("example.com") # Cause a selenium crash - - # The browser will fail to launch due to the proxy crashes - try: - manager.get("http://example.com") - except CommandExecutionError: - pass - manager.close() - assert isfile(join(browser_params[0].profile_archive_dir, "profile.tar.gz")) - - -def test_seed_persistance(default_params, task_manager_creator): +def test_saving(default_params, task_manager_creator): + manager_params, browser_params = default_params + manager_params.num_browsers = 1 + browser_params[0].profile_archive_dir = ( + manager_params.data_directory / "browser_profile" + ) + manager, _ = task_manager_creator((manager_params, browser_params[:1])) + manager.get(BASE_TEST_URL) + manager.close() + assert (browser_params[0].profile_archive_dir / "profile.tar.gz").is_file() + + +def test_crash_profile(default_params, task_manager_creator): + manager_params, browser_params = default_params + manager_params.num_browsers = 1 + manager_params.failure_limit = 2 + browser_params[0].profile_archive_dir = ( + manager_params.data_directory / "browser_profile" + ) + manager, _ = task_manager_creator((manager_params, browser_params[:1])) + try: + manager.get(BASE_TEST_URL) # So we have a profile + manager.get("example.com") # Selenium requires scheme prefix + manager.get("example.com") # Selenium requires scheme prefix + manager.get("example.com") # Selenium requires scheme prefix + manager.get("example.com") # Requires two commands to shut down + except CommandExecutionError: + pass + assert (browser_params[0].profile_archive_dir / "profile.tar.gz").is_file() + + +def test_profile_error(default_params, task_manager_creator): + manager_params, browser_params = default_params + manager_params.num_browsers = 1 + 
browser_params[0].seed_tar = Path("/tmp/NOTREAL") + with pytest.raises(ProfileLoadError): + task_manager_creator((manager_params, browser_params[:1])) + + +@pytest.mark.skip(reason="proxy no longer supported, need to update") +def test_profile_saved_when_launch_crashes(default_params, task_manager_creator): + manager_params, browser_params = default_params + manager_params.num_browsers = 1 + browser_params[0].profile_archive_dir = ( + manager_params.data_directory / "browser_profile" + ) + browser_params[0].proxy = True + browser_params[0].save_content = "script" + manager, _ = task_manager_creator((manager_params, browser_params[:1])) + manager.get(BASE_TEST_URL) + + # Kill the LevelDBAggregator + # This will cause the proxy launch to crash + manager.ldb_status_queue.put("DIE") + manager.browsers[0]._SPAWN_TIMEOUT = 2 # Have timeout occur quickly + manager.browsers[0]._UNSUCCESSFUL_SPAWN_LIMIT = 2 # Quick timeout + manager.get("example.com") # Cause a selenium crash + + # The browser will fail to launch due to the proxy crashes + try: + manager.get(BASE_TEST_URL) + except CommandExecutionError: + pass + manager.close() + assert (browser_params[0].profile_archive_dir / "profile.tar.gz").is_file() + + +def test_seed_persistence(default_params, task_manager_creator): manager_params, browser_params = default_params p = Path("profile.tar.gz") for browser_param in browser_params: @@ -89,7 +89,7 @@ def test_seed_persistance(default_params, task_manager_creator): command_sequences = [] for _ in range(2): - cs = CommandSequence(url="https://example.com", reset=True) + cs = CommandSequence(url=BASE_TEST_URL) cs.get() cs.append_command(AssertConfigSetCommand("test_pref", True)) command_sequences.append(cs)