Skip to content

Commit

Permalink
Issue #98 - Framework for dockerization
Browse files Browse the repository at this point in the history
  • Loading branch information
Mattm27 committed Oct 7, 2024
1 parent 45599d5 commit 4296d0a
Show file tree
Hide file tree
Showing 6 changed files with 176 additions and 0 deletions.
50 changes: 50 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# Test image for the crawler: Apache, MariaDB, Node.js, Firefox ESR,
# Geckodriver and Selenium on top of a systemd-capable Debian base.

# Base image with systemd support.
# NOTE(review): the image is untagged, so builds float with the publisher's
# default tag. Pin a release tag or digest once the intended Debian release
# is confirmed.
FROM jrei/systemd-debian

# Run every RUN step under bash with pipefail so that a failed download on the
# left-hand side of a pipe (the NodeSource setup below) fails the build
# instead of being silently ignored.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Prevent interactive prompts during package installation. Declared as ARG so
# the setting applies to the build only and is not baked into the runtime
# environment of the container.
ARG DEBIAN_FRONTEND=noninteractive
ENV TERM=linux

# Install core utilities needed to fetch repositories and keys, and create
# the keyring directory. The apt lists are removed in the same layer; every
# later step that needs them refreshes them itself.
RUN apt-get update && \
    apt-get -y install \
        apt-utils \
        curl \
        file \
        gnupg \
        lsb-release \
        python3-pip \
        wget && \
    install -d -m 0755 /etc/apt/keyrings && \
    rm -rf /var/lib/apt/lists/*

# Set up Node.js repository
RUN curl -fsSL https://deb.nodesource.com/setup_21.x | bash -

# Download and install MySQL repository configuration. The .deb is deleted in
# the same layer so it does not remain in the image.
RUN curl -fsSL https://dev.mysql.com/get/mysql-apt-config_0.8.22-1_all.deb -o /tmp/mysql.deb && \
    dpkg -i /tmp/mysql.deb && \
    rm -f /tmp/mysql.deb

# Set up Firefox repository from Mozilla and add GPG key. The key is written
# straight to disk (no pipe), so a failed download fails the build and the key
# is not echoed into the build log.
RUN wget -q https://packages.mozilla.org/apt/repo-signing-key.gpg -O /etc/apt/keyrings/packages.mozilla.org.asc && \
    echo "deb [signed-by=/etc/apt/keyrings/packages.mozilla.org.asc] https://packages.mozilla.org/apt mozilla main" >> /etc/apt/sources.list.d/mozilla.list

# Pin packages from the Mozilla repository
RUN printf '%s\n' \
        'Package: *' \
        'Pin: origin packages.mozilla.org' \
        'Pin-Priority: 1000' \
        > /etc/apt/preferences.d/mozilla

# Re-add required keys to resolve any missing key issues for the repositories
# NOTE(review): apt-key is deprecated upstream; consider moving this key into
# /etc/apt/keyrings with a signed-by entry.
RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com --recv-keys B7B3B788A8D3785C

# Update package lists and install necessary packages (firefox-esr instead of
# firefox, mariadb-server instead of mysql-server), then clean up in the same
# layer.
RUN apt-get update && \
    apt-get -y install \
        apache2 \
        firefox-esr \
        mariadb-server \
        nodejs \
        zip && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Manually install Geckodriver for Selenium
RUN npm install -g geckodriver

# Install Selenium using pip3 (no pip cache kept in the layer)
RUN pip3 install --no-cache-dir selenium

# Document the ports used by Apache (80), MySQL/MariaDB (3306) and the REST
# API (8080, the port scripts/test.sh publishes).
EXPOSE 80 3306 8080

# Keep the container running indefinitely to facilitate testing; the scripts
# are run afterwards via `docker exec`.
CMD ["sleep", "infinity"]


26 changes: 26 additions & 0 deletions scripts/build-extension.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
#
# Build the extension in /srv/analysis/gpc-analysis-extension, add the
# Firefox-specific settings to the generated manifest, package the Firefox
# build as myextension.xpi and copy it into the crawler directory.

# Stop at the first failing command (including a failed `cd`) so a broken
# build is never zipped and copied; trace each command as it runs.
set -e
set -x

cd /srv/analysis/gpc-analysis-extension

npm install -g pretty-js
npm install -g rimraf
npm run build

# Firefox build at dist/firefox
cd dist/firefox

# add the browser specific settings to the JSON file:
# sed inserts the heredoc text (read from stdin) after the line that contains
# "incognito": "spanning".
sed -i '/"incognito": "spanning"/r /dev/stdin' manifest.json << 'EXTRALINES'
, "browser_specific_settings": {
"gecko": {
"id": "{daf44bf7-a45e-4450-979c-91cf07434c3d}"
}
}
EXTRALINES

# Re-format the manifest after the raw text insertion above.
pretty-js --in-place manifest.json

zip -1 -r myextension.xpi *
cp myextension.xpi /srv/analysis/selenium-optmeowt-crawler
67 changes: 67 additions & 0 deletions scripts/rest-api.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/bin/bash
#
# Start MariaDB, create the `analysis` database and its tables, write the
# .env file used by the REST API, install its npm dependencies and launch it
# in the background (output goes to restapi.log).

# Enable fail-fast before changing directory so a missing directory aborts
# the script instead of writing .env and running npm in the wrong place.
set -e

cd /srv/analysis/rest-api
echo "Running $0 in $(pwd)"

set -x

# Start the MariaDB service
service mariadb start
service mariadb status &> /dev/null

# Configure the MariaDB database using compatible commands
# Update the root password and create the `analysis` database and tables.
# Setup stays best-effort (for example, a second run connects without the
# password set here and may be rejected), but a failure is now reported
# instead of being silently discarded.
mysql -u root << 'SQLCOMMANDS' || echo "WARNING: database setup reported an error (see above); continuing." >&2
ALTER USER 'root'@'localhost' IDENTIFIED BY 'toor';
FLUSH PRIVILEGES;
CREATE DATABASE IF NOT EXISTS analysis;
USE analysis;
CREATE TABLE IF NOT EXISTS entries (
    id INTEGER PRIMARY KEY AUTO_INCREMENT,
    site_id INTEGER,
    domain varchar(255),
    sent_gpc BOOLEAN,
    gpp_version TEXT,
    uspapi_before_gpc varchar(255),
    uspapi_after_gpc varchar(255),
    usp_cookies_before_gpc varchar(255),
    usp_cookies_after_gpc varchar(255),
    OptanonConsent_before_gpc varchar(255),
    OptanonConsent_after_gpc varchar(255),
    gpp_before_gpc TEXT,
    gpp_after_gpc TEXT,
    urlClassification TEXT,
    OneTrustWPCCPAGoogleOptOut_before_gpc BOOLEAN,
    OneTrustWPCCPAGoogleOptOut_after_gpc BOOLEAN,
    OTGPPConsent_before_gpc TEXT,
    OTGPPConsent_after_gpc TEXT
);
CREATE TABLE IF NOT EXISTS debug (
    id INTEGER PRIMARY KEY AUTO_INCREMENT,
    domain varchar(255),
    a varchar(4000),
    b varchar(4000)
);
SQLCOMMANDS

# Create a .env file with database credentials for the REST API.
# DB_CONNECTION stays `mysql`, because MariaDB uses the same driver. That
# remark is kept here rather than inside the generated file so the value does
# not depend on how the .env parser treats inline comments.
# NOTE(review): the credentials are hard-coded test values.
cat << 'ENVFILE' > .env
DB_CONNECTION=mysql
DB_HOST=localhost
DB_DATABASE=analysis
DB_USERNAME=root
DB_PASSWORD=toor
ENVFILE

# Install dependencies for the REST API using npm
npm install

# Start the REST API using Node.js directly instead of systemd
# This step replaces `systemctl` usage since Docker containers typically don't use systemd
nohup node index.js debug > restapi.log 2>&1 &

set +x
echo '--------------------------------------------------'
echo "REST API started at http://localhost:8080/analysis"
echo '--------------------------------------------------'
6 changes: 6 additions & 0 deletions scripts/run-crawl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
#
# Install the crawler's npm dependencies and run the local crawler.
# https://github.com/privacy-tech-lab/gpc-web-crawler/blob/main/README.md

# Abort if the directory is missing or dependency installation fails, rather
# than starting the crawler from the wrong place or with missing modules.
set -e

cd /srv/analysis/selenium-optmeowt-crawler
npm install
node local-crawler.js
27 changes: 27 additions & 0 deletions scripts/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash
#
# Build the crawl_test image, start a fresh crawl_test container with the
# repository mounted at /srv/analysis, then run the extension build, the REST
# API setup and the crawl inside it.

# Work from the repository root (the parent of this script's directory).
# Quoted so paths containing spaces work. A script's `cd` does not affect the
# caller's shell, so no directory stack bookkeeping is needed.
cd "$(dirname "$0")/.." || exit 1

# Report an unusable Docker installation and abort.
dockerfail() {
    echo "Docker not found. Check that Docker is installed and running." >&2
    exit 1
}
docker ps &> /dev/null || dockerfail

set -e
set -x

# Remove any container left over from a previous run; failures are expected
# when none exists.
docker stop crawl_test || true
docker rm crawl_test || true
docker build --tag=crawl_test .

# A failure to start the container aborts here (set -e) with Docker's own
# error, instead of surfacing later as a confusing `docker exec` failure.
docker run -d --name crawl_test --privileged \
    -v /sys/fs/cgroup:/sys/fs/cgroup:ro \
    -v "$(pwd)":/srv/analysis \
    -p 8080:8080 \
    crawl_test

# Request an interactive TTY only when one is attached, so the script also
# works when run without a terminal (e.g. from CI).
exec_flags=()
if [ -t 0 ] && [ -t 1 ]; then
    exec_flags=(-it)
fi

docker exec "${exec_flags[@]}" crawl_test /srv/analysis/scripts/build-extension.sh
docker exec "${exec_flags[@]}" crawl_test /srv/analysis/scripts/rest-api.sh
docker exec "${exec_flags[@]}" crawl_test /srv/analysis/scripts/run-crawl.sh
Binary file modified selenium-optmeowt-crawler/myextension.xpi
Binary file not shown.

0 comments on commit 4296d0a

Please sign in to comment.