-
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Issue #98 - Framework for dockerization
- Loading branch information
Showing
6 changed files
with
176 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# Test image for the crawler: a Debian base with Apache, MariaDB, Node.js,
# Firefox ESR, geckodriver and Selenium preinstalled.

# Base image with systemd support
# NOTE(review): the base image is untagged, so every build tracks whatever
# `latest` currently points to. Pin a tag or digest once the intended Debian
# release is confirmed.
FROM jrei/systemd-debian

# Several RUN steps below pipe a download into another command. Without
# pipefail only the last command's status counts, so a failed download would
# go unnoticed and the build would continue with an empty or partial input.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# TERM is kept in the runtime environment for shells started with `docker exec`
ENV TERM=linux

# Prevent interactive prompts during package installations. Declared as ARG so
# it applies to the RUN steps below without leaking into the running container.
ARG DEBIAN_FRONTEND=noninteractive

# Install core utilities and create the keyring directory used by the
# third-party repositories configured below
RUN apt-get update && \
    apt-get -y install apt-utils curl file gnupg lsb-release wget python3-pip && \
    install -d -m 0755 /etc/apt/keyrings

# Set up Node.js repository
RUN curl -fsSL https://deb.nodesource.com/setup_21.x | bash -

# Download and install MySQL repository configuration.
# The .deb is removed in the same layer; deleting it in a later layer would
# not reclaim its space in the image.
RUN curl -fsSL https://dev.mysql.com/get/mysql-apt-config_0.8.22-1_all.deb -o /tmp/mysql.deb && \
    dpkg -i /tmp/mysql.deb && \
    rm -f /tmp/mysql.deb

# Set up Firefox repository from Mozilla and add GPG key
# (tee output is discarded so the key material is not echoed into the build log)
RUN wget -q https://packages.mozilla.org/apt/repo-signing-key.gpg -O- | tee /etc/apt/keyrings/packages.mozilla.org.asc > /dev/null && \
    echo "deb [signed-by=/etc/apt/keyrings/packages.mozilla.org.asc] https://packages.mozilla.org/apt mozilla main" | tee -a /etc/apt/sources.list.d/mozilla.list > /dev/null

# Pin packages from the Mozilla repository above the distribution's own
RUN echo "Package: *" > /etc/apt/preferences.d/mozilla && \
    echo "Pin: origin packages.mozilla.org" >> /etc/apt/preferences.d/mozilla && \
    echo "Pin-Priority: 1000" >> /etc/apt/preferences.d/mozilla

# Re-add required keys to resolve any missing key issues for the repositories
# NOTE(review): apt-key is deprecated; consider a keyring file referenced via
# signed-by, as done for the Mozilla repository.
RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com --recv-keys B7B3B788A8D3785C

# Update package lists and install necessary packages
# (firefox-esr is used instead of firefox, mariadb-server instead of mysql-server)
RUN apt-get update && \
    apt-get -y install apache2 firefox-esr mariadb-server nodejs zip && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Manually install Geckodriver for Selenium
RUN npm install -g geckodriver

# Install Selenium using pip3 (without keeping pip's download cache in the layer)
RUN pip3 install --no-cache-dir selenium

# Document the ports used inside the container: Apache (80), MariaDB (3306)
# and the REST API (8080), which the launcher script publishes to the host.
EXPOSE 80 3306 8080

# Keep the container running indefinitely to facilitate testing
CMD ["sleep", "infinity"]
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
#!/bin/bash
#
# Builds the GPC analysis extension, adds the Firefox-specific add-on id to
# the built manifest, packages the result as an .xpi and copies it into the
# crawler directory.

# Stop at the first failing step so a failed `cd` or build cannot lead to a
# stale or partial extension being packaged and copied.
set -e
set -x

cd /srv/analysis/gpc-analysis-extension

npm install -g pretty-js
npm install -g rimraf
npm run build

# Firefox build at dist/firefox
cd dist/firefox

# Add the browser specific settings to the JSON file: sed appends the heredoc
# (read from stdin) after the line containing the "incognito" entry.
# The delimiter is quoted so the shell never expands anything in the snippet.
sed -i '/"incognito": "spanning"/r /dev/stdin' manifest.json << 'EXTRALINES'
, "browser_specific_settings": {
    "gecko": {
        "id": "{daf44bf7-a45e-4450-979c-91cf07434c3d}"
    }
}
EXTRALINES

# Re-indent the manifest after the raw text insertion above
pretty-js --in-place manifest.json

# Package the build (fastest compression) and hand it to the crawler
zip -1 -r myextension.xpi *
cp myextension.xpi /srv/analysis/selenium-optmeowt-crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
#!/bin/bash
#
# Starts MariaDB, creates the `analysis` database and its tables, writes the
# REST API's .env file, installs the API's npm dependencies and launches the
# API in the background.

# Enable fail-fast before changing directory: if the directory is missing,
# nothing below (notably writing .env) should run somewhere else.
set -e

cd /srv/analysis/rest-api
echo "Running $0 in $(pwd)"

set -x

# Start the MariaDB service; the status call aborts the script (via set -e)
# if the service did not come up
service mariadb start
service mariadb status &> /dev/null

# Configure the MariaDB database using compatible commands
# Update the root password and create the `analysis` database and tables.
# Failures are deliberately tolerated (`|| true`), e.g. when the script is run
# again after the root password has already been changed.
mysql -u root << SQLCOMMANDS || true
ALTER USER 'root'@'localhost' IDENTIFIED BY 'toor';
FLUSH PRIVILEGES;
CREATE DATABASE IF NOT EXISTS analysis;
USE analysis;
CREATE TABLE IF NOT EXISTS entries (
    id INTEGER PRIMARY KEY AUTO_INCREMENT,
    site_id INTEGER,
    domain varchar(255),
    sent_gpc BOOLEAN,
    gpp_version TEXT,
    uspapi_before_gpc varchar(255),
    uspapi_after_gpc varchar(255),
    usp_cookies_before_gpc varchar(255),
    usp_cookies_after_gpc varchar(255),
    OptanonConsent_before_gpc varchar(255),
    OptanonConsent_after_gpc varchar(255),
    gpp_before_gpc TEXT,
    gpp_after_gpc TEXT,
    urlClassification TEXT,
    OneTrustWPCCPAGoogleOptOut_before_gpc BOOLEAN,
    OneTrustWPCCPAGoogleOptOut_after_gpc BOOLEAN,
    OTGPPConsent_before_gpc TEXT,
    OTGPPConsent_after_gpc TEXT
);
CREATE TABLE IF NOT EXISTS debug (
    id INTEGER PRIMARY KEY AUTO_INCREMENT,
    domain varchar(255),
    a varchar(4000),
    b varchar(4000)
);
SQLCOMMANDS

# Create a .env file with database credentials for the REST API.
# The explanatory comment sits on its own line: some dotenv parsers treat
# text after the value on the same line as part of the value.
cat << 'ENVFILE' > .env
# Keep DB_CONNECTION as `mysql`, because MariaDB uses the same driver
DB_CONNECTION=mysql
DB_HOST=localhost
DB_DATABASE=analysis
DB_USERNAME=root
DB_PASSWORD=toor
ENVFILE

# Install dependencies for the REST API using npm
npm install

# Start the REST API using Node.js directly instead of systemd.
# nohup + & keep it running in the background; output goes to restapi.log.
nohup node index.js debug > restapi.log 2>&1 &

set +x
echo '--------------------------------------------------'
echo "REST API started at http://localhost:8080/analysis"
echo '--------------------------------------------------'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
#!/bin/bash
#
# Installs the crawler's npm dependencies and runs the local crawl.
# https://github.com/privacy-tech-lab/gpc-web-crawler/blob/main/README.md

# Abort if the crawler directory is missing; otherwise `npm install` and
# `node` would run in whatever directory the script was started from.
cd /srv/analysis/selenium-optmeowt-crawler || exit 1

npm install
node local-crawler.js
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#!/bin/bash
#
# Builds the `crawl_test` image from the directory above this script, starts
# a fresh container from it with that directory mounted at /srv/analysis, and
# runs the extension build, REST API setup and crawl inside the container.

# Remember the caller's directory and restore it on exit
trap popd EXIT
pushd "$PWD" &> /dev/null

# Work from the parent of the directory containing this script.
# Quoted so that paths containing spaces work.
cd "$(dirname "$0")"
cd ..

# Print a hint and abort when the Docker daemon cannot be reached
dockerfail() {
    echo "Docker not found. Check that Docker is installed and running."
    exit 1
}
docker ps &> /dev/null || dockerfail

set -e
set -x

# Remove any container left over from a previous run; it is fine if none exists
docker stop crawl_test || true
docker rm crawl_test || true

docker build --tag=crawl_test .

# A failure to start the container is no longer ignored: every step below
# needs the container to be running, so stop here with docker's own error.
docker run -d --name crawl_test --privileged \
    -v /sys/fs/cgroup:/sys/fs/cgroup:ro \
    -v "$(pwd)":/srv/analysis \
    -p 8080:8080 \
    crawl_test

# Allocate a TTY only when one is attached; `docker exec -t` fails when the
# script runs without a terminal (e.g. from CI or a cron job).
exec_flags=(-i)
if [ -t 0 ] && [ -t 1 ]; then
    exec_flags+=(-t)
fi

docker exec "${exec_flags[@]}" crawl_test /srv/analysis/scripts/build-extension.sh
docker exec "${exec_flags[@]}" crawl_test /srv/analysis/scripts/rest-api.sh
docker exec "${exec_flags[@]}" crawl_test /srv/analysis/scripts/run-crawl.sh
Binary file not shown.