Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Restore stateful crawling support #864

Merged
merged 22 commits into from
Mar 29, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ def callback(success: bool, val: str = site) -> None:
command_sequence = CommandSequence(
site,
site_rank=index,
reset=True,
callback=callback,
)

Expand Down
6 changes: 0 additions & 6 deletions docs/Configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -249,11 +249,6 @@ TODO

# Browser Profile Support

**WARNING: Stateful crawls are currently not supported. Attempts to run
stateful crawls will throw `NotImplementedError`s. The work required to
restore support is tracked in
[this project](https://github.com/mozilla/OpenWPM/projects/2).**

## Stateful vs Stateless crawls

By default OpenWPM performs a "stateful" crawl, in that it keeps a consistent
Expand Down Expand Up @@ -323,7 +318,6 @@ but will not be used during crash recovery. Specifically:
profile specified by `seed_tar`. If OpenWPM determines that Firefox needs to
restart for some reason during the crawl, it will use the profile from
the most recent page visit (pre-crash) rather than the `seed_tar` profile.
Note that stateful crawls are currently [unsupported](https://github.com/mozilla/OpenWPM/projects/2).
* For stateless crawls, the initial `seed_tar` will be loaded during each
new page visit. Note that this means the profile will very likely be
_incomplete_, as cookies or storage may have been set or changed during the
Expand Down
90 changes: 44 additions & 46 deletions openwpm/browser_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@
import shutil
import signal
import sys
import tempfile
import threading
import time
import traceback
from pathlib import Path
from queue import Empty as EmptyQueue
from typing import Optional, Union

Expand All @@ -16,6 +18,7 @@
from selenium.common.exceptions import WebDriverException
from tblib import pickling_support

from .commands.profile_commands import DumpProfileCommand
from .commands.types import BaseCommand, ShutdownSignal
from .config import BrowserParamsInternal, ManagerParamsInternal
from .deploy_browsers import deploy_firefox
Expand All @@ -33,7 +36,7 @@

class Browser:
"""
The Browser class is responsbile for holding all of the
The Browser class is responsible for holding all of the
configuration and status information on BrowserManager process
it corresponds to. It also includes a set of methods for managing
the BrowserManager process and its child processes/threads.
Expand All @@ -52,7 +55,7 @@ def __init__(
self._UNSUCCESSFUL_SPAWN_LIMIT = 4

# manager parameters
self.current_profile_path = None
self.current_profile_path: Optional[Path] = None
self.db_socket_address = manager_params.storage_controller_address
assert browser_params.browser_id is not None
self.browser_id: BrowserId = browser_params.browser_id
Expand Down Expand Up @@ -97,29 +100,33 @@ def launch_browser_manager(self):
sets up the BrowserManager and gets the process id, browser pid and,
if applicable, screen pid. loads associated user profile if necessary
"""
# Unsupported. See https://github.com/mozilla/OpenWPM/projects/2
# if this is restarting from a crash, update the tar location
# to be a tar of the crashed browser's history
"""
if self.current_profile_path is not None:
# tar contents of crashed profile to a temp dir
tempdir = tempfile.mkdtemp(prefix="owpm_profile_archive_") + "/"
profile_commands.dump_profile(
self.current_profile_path,
self.manager_params,
self.browser_params,
tempdir,
close_webdriver=False,
tempdir = tempfile.mkdtemp(prefix="owpm_profile_archive_")
vringar marked this conversation as resolved.
Show resolved Hide resolved
tar_path = Path(tempdir) / "profile.tar.gz"

self.browser_params.profile_path = self.current_profile_path
dump_profile_command = DumpProfileCommand(
tar_path=tar_path, close_webdriver=False, compress=True
boolean5 marked this conversation as resolved.
Show resolved Hide resolved
)
dump_profile_command.execute(
webdriver=None,
browser_params=self.browser_params,
manager_params=self.manager_params,
extension_socket=None,
)

# make sure browser loads crashed profile
self.browser_params.recovery_tar = tempdir
self.browser_params.recovery_tar = tar_path

crash_recovery = True
else:
"""
tempdir = None
crash_recovery = False

self.logger.info("BROWSER %i: Launching browser..." % self.browser_id)
tempdir = None
crash_recovery = False
self.is_fresh = not crash_recovery

# Try to spawn the browser within the timelimit
Expand Down Expand Up @@ -394,33 +401,32 @@ def shutdown_browser(self, during_init: bool, force: bool = False) -> None:
self.close_browser_manager(force=force)

# Archive browser profile (if requested)
if not during_init and self.browser_params.profile_archive_dir is not None:
self.logger.warning(
"BROWSER %i: Archiving the browser profile directory is "
"currently unsupported. "
"See: https://github.com/mozilla/OpenWPM/projects/2" % self.browser_id
)
"""
self.logger.debug(
"BROWSER %i: during_init=%s | profile_archive_dir=%s" % (
self.browser_id, str(during_init),
self.browser_params.profile_archive_dir)
"BROWSER %i: during_init=%s | profile_archive_dir=%s"
% (
self.browser_id,
str(during_init),
self.browser_params.profile_archive_dir,
)
)
if (not during_init and
self.browser_params.profile_archive_dir is not None):
if not during_init and self.browser_params.profile_archive_dir is not None:
self.logger.debug(
"BROWSER %i: Archiving browser profile directory to %s" % (
self.browser_id,
self.browser_params.profile_archive_dir))
profile_commands.dump_profile(
self.current_profile_path,
self.manager_params,
self.browser_params,
self.browser_params.profile_archive_dir,
"BROWSER %i: Archiving browser profile directory to %s"
% (self.browser_id, self.browser_params.profile_archive_dir)
)
tar_path = self.browser_params.profile_archive_dir / "profile.tar.gz"
self.browser_params.profile_path = self.current_profile_path
dump_profile_command = DumpProfileCommand(
tar_path=tar_path,
close_webdriver=False,
compress=True
compress=True,
boolean5 marked this conversation as resolved.
Show resolved Hide resolved
)
dump_profile_command.execute(
webdriver=None,
browser_params=self.browser_params,
manager_params=self.manager_params,
extension_socket=None,
)
boolean5 marked this conversation as resolved.
Show resolved Hide resolved
"""

# Clean up temporary files
if self.current_profile_path is not None:
Expand All @@ -444,8 +450,6 @@ def BrowserManager(
driver, prof_folder, display = deploy_firefox.deploy_firefox(
status_queue, browser_params, manager_params, crash_recovery
)
if prof_folder[-1] != "/":
prof_folder += "/"

# Read the extension port -- if extension is enabled
# TODO: Initial communication from extension to TM should use sockets
Expand All @@ -456,7 +460,7 @@ def BrowserManager(
)
elapsed = 0
port = None
ep_filename = os.path.join(prof_folder, "extension_port.txt")
ep_filename = prof_folder / "extension_port.txt"
while elapsed < 5:
try:
with open(ep_filename, "rt") as f:
Expand Down Expand Up @@ -498,12 +502,6 @@ def BrowserManager(
command: Union[ShutdownSignal, BaseCommand] = command_queue.get()

if type(command) is ShutdownSignal:
# Geckodriver creates a copy of the profile (and the original
# temp file created by FirefoxProfile() is deleted).
# We clear the profile attribute here to prevent prints from:
# https://github.com/SeleniumHQ/selenium/blob/4e4160dd3d2f93757cafb87e2a1c20d6266f5554/py/selenium/webdriver/firefox/webdriver.py#L193-L199
if driver.profile and not os.path.isdir(driver.profile.path):
driver.profile = None
driver.quit()
status_queue.put("OK")
return
Expand Down
13 changes: 4 additions & 9 deletions openwpm/command_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
SaveScreenshotCommand,
ScreenshotFullPageCommand,
)
from .commands.profile_commands import DumpProfileCommand
from .commands.types import BaseCommand
from .errors import CommandExecutionError

Expand Down Expand Up @@ -86,16 +87,10 @@ def browse(self, num_links=2, sleep=0, timeout=60):
self._commands_with_timeout.append((command, timeout))
self.contains_get_or_browse = True

def dump_profile(
self, dump_folder, close_webdriver=False, compress=True, timeout=120
):
def dump_profile(self, tar_path, close_webdriver=False, compress=True, timeout=120):
""" dumps from the profile path to a given file (absolute path) """
raise NotImplementedError(
"Profile saving is currently unsupported. "
"See: https://github.com/mozilla/OpenWPM/projects/2."
)
self.total_timeout += timeout
command = DumpProfCommand(dump_folder, close_webdriver, compress)
command = DumpProfileCommand(tar_path, close_webdriver, compress)
self._commands_with_timeout.append((command, timeout))

def save_screenshot(self, suffix="", timeout=30):
Expand Down Expand Up @@ -131,7 +126,7 @@ def screenshot_full_page(self, suffix="", timeout=30):
self.total_timeout += timeout
if not self.contains_get_or_browse:
raise CommandExecutionError(
"No get or browse request preceding " "the dump page source command",
"No get or browse request preceding the dump page source command",
self,
)
command = ScreenshotFullPageCommand(suffix)
Expand Down
18 changes: 8 additions & 10 deletions openwpm/commands/profile_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import shutil
import tarfile
from pathlib import Path
from typing import Optional

from selenium.webdriver import Firefox

Expand All @@ -25,13 +26,9 @@ def __init__(self, tar_path: Path, close_webdriver: bool, compress: bool) -> Non
self.tar_path = tar_path
self.close_webdriver = close_webdriver
self.compress = compress
raise NotImplementedError(
"Profile dumping is currently unsupported. "
"See: https://github.com/mozilla/OpenWPM/projects/2."
)

def __repr__(self) -> str:
return "DumpProfCommand({},{},{})".format(
return "DumpProfileCommand({},{},{})".format(
self.tar_path, self.close_webdriver, self.compress
)

Expand All @@ -40,10 +37,11 @@ def execute(
webdriver: Firefox,
browser_params: BrowserParamsInternal,
manager_params: ManagerParamsInternal,
extension_socket: ClientSocket,
extension_socket: Optional[ClientSocket],
) -> None:
browser_profile_folder = browser_params.profile_path
assert browser_profile_folder is not None
assert browser_params.browser_id is not None

# Creating the folders if need be
self.tar_path.parent.mkdir(exist_ok=True, parents=True)
Expand All @@ -65,7 +63,7 @@ def execute(
logger.debug(
"BROWSER %i: Backing up full profile from %s to %s"
% (
self.browser_id,
browser_params.browser_id,
browser_profile_folder,
self.tar_path,
)
Expand Down Expand Up @@ -94,7 +92,7 @@ def execute(
):
logger.critical(
"BROWSER %i: %s NOT FOUND IN profile folder, skipping."
% (self.browser_id, full_path)
% (browser_params.browser_id, full_path)
)
elif not full_path.is_file() and (
full_path.name.endswith("shm") or full_path.name.endswith("wal")
Expand All @@ -106,7 +104,7 @@ def execute(
if not full_path.is_dir():
logger.warning(
"BROWSER %i: %s NOT FOUND IN profile folder, skipping."
% (self.browser_id, full_path)
% (browser_params.browser_id, full_path)
)
continue
tar.add(full_path, arcname=item)
Expand All @@ -125,9 +123,9 @@ def load_profile(
The tar will remain unmodified.
"""

assert tar_path.is_file()
assert browser_params.browser_id is not None
try:
assert tar_path.is_file()
# Copy and untar the loaded profile
logger.debug(
"BROWSER %i: Copying profile tar from %s to %s"
Expand Down
4 changes: 3 additions & 1 deletion openwpm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ class BrowserParams(DataClassJsonMixin):
prefs: dict = field(default_factory=dict)
tp_cookies: str = "always"
bot_mitigation: bool = False
profile_archive_dir: Optional[str] = None
profile_archive_dir: Optional[Path] = field(
default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path)
)
recovery_tar: Optional[Path] = None
donottrack: bool = False
tracking_protection: bool = False
Expand Down
Loading