Skip to content

Commit

Permalink
Add breach data model and sync command
Browse files Browse the repository at this point in the history
Fixes #13611, #13612
  • Loading branch information
robhudson committed Oct 12, 2023
1 parent 2a87ea6 commit ec37e43
Show file tree
Hide file tree
Showing 12 changed files with 664 additions and 0 deletions.
3 changes: 3 additions & 0 deletions bedrock/products/management/commands/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
27 changes: 27 additions & 0 deletions bedrock/products/management/commands/sync_breaches.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

from django.core.management.base import BaseCommand

from bedrock.products.models import Breach
from bedrock.utils.management.decorators import alert_sentry_on_exception


@alert_sentry_on_exception
class Command(BaseCommand):
    """Management command that syncs breach rows and then their logos.

    Runs ``Breach.objects.sync_db()`` followed by ``Breach.objects.sync_logos()``
    and reports how many breaches were added/updated unless ``--quiet`` is given.
    """

    help = "Sync breach data into the database and upload any missing breach logos."

    # Class-level default so ``output`` is safe to call before ``handle`` has
    # parsed the command-line options.
    quiet = False

    def add_arguments(self, parser):
        """Register the ``-q/--quiet`` flag on the argument parser."""
        parser.add_argument(
            "-q",
            "--quiet",
            action="store_true",
            dest="quiet",
            default=False,
            help="If no error occurs, swallow all output.",
        )

    def output(self, msg):
        """Write ``msg`` to the command's stdout unless running in quiet mode."""
        if not self.quiet:
            # Use the command's own stream (not ``print``) so callers of
            # ``call_command(..., stdout=...)`` can capture the output.
            self.stdout.write(msg)

    def handle(self, *args, **options):
        """Sync breach data first, then the logos for the synced breaches."""
        self.quiet = options["quiet"]

        added, updated = Breach.objects.sync_db()
        self.output(f"Breaches added: {added}")
        self.output(f"Breaches updated: {updated}")

        Breach.objects.sync_logos(verbose=not self.quiet)
40 changes: 40 additions & 0 deletions bedrock/products/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

# Generated by Django 3.2.21 on 2023-09-26 17:50

from django.db import migrations, models


# Initial schema migration for the products app: creates the ``Breach`` table.
# This file was generated by Django; the field list mirrors the fields declared
# on ``bedrock.products.models.Breach`` and should not be edited by hand.
class Migration(migrations.Migration):
    # First migration of the app, so it has no dependencies.
    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name="Breach",
            fields=[
                ("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
                # ``name`` is unique and is the key used by the sync code to look rows up.
                ("name", models.CharField(max_length=255, unique=True)),
                ("title", models.CharField(max_length=255)),
                ("domain", models.CharField(max_length=255)),
                # All three date columns are nullable.
                ("breach_date", models.DateField(null=True)),
                ("added_date", models.DateTimeField(null=True)),
                ("modified_date", models.DateTimeField(null=True)),
                ("pwn_count", models.PositiveIntegerField(default=0)),
                ("logo_path", models.CharField(max_length=255)),
                # JSON column defaulting to an empty list.
                ("data_classes", models.JSONField(default=list)),
                # Boolean flags, all defaulting to False.
                ("is_verified", models.BooleanField(default=False)),
                ("is_fabricated", models.BooleanField(default=False)),
                ("is_sensitive", models.BooleanField(default=False)),
                ("is_retired", models.BooleanField(default=False)),
                ("is_spam_list", models.BooleanField(default=False)),
                ("is_malware", models.BooleanField(default=False)),
            ],
            options={
                "verbose_name_plural": "Breaches",
            },
        ),
    ]
3 changes: 3 additions & 0 deletions bedrock/products/migrations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
172 changes: 172 additions & 0 deletions bedrock/products/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,175 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
from tempfile import TemporaryFile

from django.conf import settings
from django.db import models
from django.utils.dateparse import parse_date, parse_datetime

import requests
from google.cloud import storage


class BreachManager(models.Manager):
    """Manager that syncs ``Breach`` rows from the HIBP API and their logos to GCS."""

    BREACH_URL = "https://haveibeenpwned.com/api/v3/breaches"
    REQUEST_HEADERS = {"User-Agent": "mozilla-org"}
    # Seconds to wait on outbound HTTP calls; without a timeout ``requests``
    # can block forever on an unresponsive server.
    REQUEST_TIMEOUT = 30

    def sync_db(self):
        """Fetch the current breach list and mirror it into the database.

        Rows are matched on ``name``; a row is only saved when at least one
        synced field differs from the fetched data.

        Returns:
            tuple: ``(breaches_added, breaches_updated)`` counts.

        Raises:
            requests.HTTPError: if the breach API answers with an error status.
        """
        response = requests.get(self.BREACH_URL, headers=self.REQUEST_HEADERS, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        breaches_added = 0
        breaches_updated = 0

        for data in response.json():
            breach, created = self.get_or_create(name=data["Name"])

            # The date columns are nullable, so tolerate missing date strings
            # instead of handing ``None`` to the parsers.
            breach_date = data["BreachDate"]
            added_date = data["AddedDate"]
            modified_date = data["ModifiedDate"]

            # NOTE: ``logo_path`` is deliberately NOT part of the synced fields.
            # We aren't using the hibp logos because they are too large and
            # inconsistent; the column is owned by ``sync_logos``. Resetting it
            # here would wipe the stored logo URL on every run and count every
            # breach that has a logo as "updated".
            obj_data = {
                "title": data["Title"],
                "domain": data["Domain"],
                "breach_date": parse_date(breach_date) if breach_date else None,
                "added_date": parse_datetime(added_date) if added_date else None,
                "modified_date": parse_datetime(modified_date) if modified_date else None,
                "pwn_count": data["PwnCount"],
                "data_classes": data["DataClasses"],
                "is_verified": data["IsVerified"],
                "is_fabricated": data["IsFabricated"],
                "is_sensitive": data["IsSensitive"],
                "is_retired": data["IsRetired"],
                "is_spam_list": data["IsSpamList"],
                "is_malware": data["IsMalware"],
            }

            changed = False
            for key, value in obj_data.items():
                if getattr(breach, key, None) != value:
                    changed = True
                    setattr(breach, key, value)

            # A freshly created row always counts as added, even in the
            # unlikely case that every fetched value equals the field default.
            if created:
                breaches_added += 1
            elif changed:
                breaches_updated += 1

            if changed:
                breach.save()

        return breaches_added, breaches_updated

    def sync_logos(self, verbose=True):
        """Ensure every breach with a domain has a logo in GCS and a ``logo_path``.

        For each breach, reuse a logo already present in the bucket when there
        is one; otherwise fetch the favicon from the DuckDuckGo icon service,
        upload it to GCS and store its static URL on the row. Failures for a
        single breach are reported and skipped; failing to list the bucket
        aborts the whole run.

        Args:
            verbose: when true, print progress messages; when false, print nothing.
        """

        def log(msg):
            # Single gate for all progress output so quiet mode is truly quiet.
            if verbose:
                print(msg)

        log("Syncing breach logos...")

        GCS_DIR = "media/"
        GCS_PATH = "img/products/monitor/breach_logos/"

        def _urlize(path):
            # Convert a GCS path to the full static URL to the logo.
            return path.replace(GCS_DIR, settings.STATIC_URL, 1)

        # Get the set of all breach logo URLs currently in GCS. A set keeps the
        # per-breach membership checks below O(1).
        try:
            client = storage.Client()
            bucket = client.get_bucket(settings.GCS_MEDIA_BUCKET_NAME)
            blob_list = bucket.list_blobs(prefix=GCS_DIR + GCS_PATH)
            gcs_logos = {_urlize(blob.name) for blob in blob_list}
        except Exception as e:
            # Best-effort: without the bucket listing nothing below can work.
            log(f"Failed to get list of GCS logos: {e}. Aborting.")
            return

        for breach in self.all():
            if not breach.domain:
                log(f"Skipping {breach.name} because it has no domain.")
                continue

            # Check if the breach has a logo_path value and if it exists in GCS.
            if breach.logo_path and breach.logo_path in gcs_logos:
                log(f"Skipping {breach.name} because it already has an existing logo.")
                continue

            # NOTE: We are storing the full logo URL in the logo_path field since the db is per deployment environment.
            # This allows us to reference the logo images locally without needing to download them.
            logo_path = f"{GCS_DIR}{GCS_PATH}{breach.domain.lower()}.ico"
            logo_url = _urlize(logo_path)

            # Check if the logo exists in GCS. If so, no reason to re-fetch from DDG.
            if logo_url in gcs_logos:
                breach.logo_path = logo_url
                breach.save()
                log(f"Found existing logo for {breach.name} in GCS. Updating db.")
                continue

            # Fetch the logo from the ddg api. A network failure for one
            # domain must not abort the sync for all remaining breaches.
            try:
                resp = requests.get(
                    f"https://icons.duckduckgo.com/ip3/{breach.domain}.ico",
                    headers=self.REQUEST_HEADERS,
                    timeout=self.REQUEST_TIMEOUT,
                )
            except requests.RequestException as e:
                log(f"Failed to fetch logo for {breach.name} from ddg api: {e}")
                continue

            if resp.status_code != 200:
                log(f"Failed to fetch logo for {breach.name} from ddg api. Status code: {resp.status_code}.")
                continue

            # Save the logo to a temp file then upload to GCS.
            with TemporaryFile() as tf:
                tf.write(resp.content)
                tf.seek(0)
                try:
                    blob = bucket.blob(logo_path)
                    blob.upload_from_file(tf)
                except Exception as e:
                    log(f"Failed to upload logo for {breach.name} to GCS: {e}")
                    continue
                log(f"Uploaded logo for {breach.name} to GCS: {logo_path}")

            # Update the logo_path value in the db.
            breach.logo_path = logo_url
            breach.save()
            log(f"Saved logo for {breach.name} to {breach.logo_path}")

            # Remember the new logo by URL (the form the set holds) so later
            # breaches sharing this domain reuse it instead of re-uploading.
            gcs_logos.add(logo_url)


class Breach(models.Model):
    """A single data breach record, keyed by its unique ``name``."""

    # Breach names that are always categorised as data-aggregator breaches,
    # regardless of their other attributes.
    DATA_AGGREGATOR_NAMES = frozenset(
        ("Exactis", "Apollo", "YouveBeenScraped", "ElasticsearchSalesLeads", "Estonia", "MasterDeeds", "PDL")
    )

    name = models.CharField(max_length=255, unique=True)
    title = models.CharField(max_length=255)
    domain = models.CharField(max_length=255)
    breach_date = models.DateField(null=True)
    added_date = models.DateTimeField(null=True)
    modified_date = models.DateTimeField(null=True)
    pwn_count = models.PositiveIntegerField(default=0)
    # Note: The description is unused on the site and not included to reduce the size of the database.
    # description = models.TextField()
    logo_path = models.CharField(max_length=255)
    data_classes = models.JSONField(default=list)
    is_verified = models.BooleanField(default=False)
    is_fabricated = models.BooleanField(default=False)
    is_sensitive = models.BooleanField(default=False)
    is_retired = models.BooleanField(default=False)
    is_spam_list = models.BooleanField(default=False)
    is_malware = models.BooleanField(default=False)

    objects = BreachManager()

    class Meta:
        verbose_name_plural = "Breaches"

    def __str__(self):
        return self.name

    @property
    def category(self):
        """Return the category slug for this breach.

        Precedence: known aggregator names, then sensitive breaches, then
        breaches that have a domain; anything else is treated as a
        data-aggregator breach.
        """
        if self.name in self.DATA_AGGREGATOR_NAMES:
            return "data-aggregator-breach"
        if self.is_sensitive:
            return "sensitive-breach"
        if self.domain:
            return "website-breach"
        return "data-aggregator-breach"

    @property
    def is_delayed(self):
        """Whether ``breach_date`` and ``added_date`` are more than 90 days apart.

        Both columns are nullable; when either date is unknown the gap cannot
        be computed, so the breach is reported as not delayed instead of raising.
        """
        if self.added_date is None or self.breach_date is None:
            return False
        return abs((self.added_date.date() - self.breach_date).days) > 90
Empty file.
26 changes: 26 additions & 0 deletions bedrock/products/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from django.utils.dateparse import parse_date, parse_datetime

import pytest

from bedrock.products.models import Breach


@pytest.fixture
def breach(db):
    """Persist and return a sample ``Breach`` row named "Twitter"."""
    # Added and modified timestamps are the same instant in this sample.
    timestamp = parse_datetime("2022-08-01T01:23:45Z")
    fields = {
        "name": "Twitter",
        "title": "Twitter",
        "domain": "twitter.com",
        "breach_date": parse_date("2022-01-01"),
        "added_date": timestamp,
        "modified_date": timestamp,
        "pwn_count": 6682453,
        "logo_path": "/path/to/twitter.com.ico",
        "data_classes": ["Bios", "Email addresses", "Geographic locations", "Names", "Phone numbers", "Profile photos", "Usernames"],
        "is_verified": True,
        "is_fabricated": False,
        "is_sensitive": False,
        "is_retired": False,
        "is_spam_list": False,
        "is_malware": False,
    }
    return Breach.objects.create(**fields)
98 changes: 98 additions & 0 deletions bedrock/products/tests/test_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.


from unittest import mock

from django.utils.dateparse import parse_date, parse_datetime

import pytest

from bedrock.products.models import Breach


def _update(**kwargs):
    """Bulk-update the breach row(s) named "Twitter" with the given field values."""
    twitter_rows = Breach.objects.filter(name="Twitter")
    twitter_rows.update(**kwargs)


def test_is_delayed(breach):
    """The fixture's breach and added dates are far enough apart to be delayed."""
    delayed = breach.is_delayed
    assert delayed is True


def test_is_not_delayed(breach):
    """Moving the breach date close to the added date clears the delayed flag."""
    near_added_date = parse_date("2022-07-01")
    _update(breach_date=near_added_date)
    # Re-read the row: the bulk update bypasses the in-memory fixture instance.
    reloaded = Breach.objects.get(name="Twitter")
    assert reloaded.is_delayed is False


@pytest.mark.parametrize(
    "kwargs, expected",
    [
        (None, "website-breach"),
        ({"name": "Apollo"}, "data-aggregator-breach"),
        ({"is_sensitive": True}, "sensitive-breach"),
        ({"domain": ""}, "data-aggregator-breach"),
    ],
)
def test_category(breach, kwargs, expected):
    """``category`` reflects the name, sensitivity and domain of the stored row."""
    if kwargs:
        # Apply the per-case overrides directly in the database.
        _update(**kwargs)
    # Only one row exists, so an unfiltered get() returns the (updated) fixture row.
    stored = Breach.objects.get()
    assert stored.category == expected


# Sample breach record used as the mocked API payload in the sync tests below.
# It carries the keys read by ``Breach.objects.sync_db()`` plus a few extra
# keys that the sync ignores (marked inline).
BREACH_JSON = {
    "Name": "Twitter",
    "Title": "Twitter",
    "Domain": "twitter.com",
    "BreachDate": "2022-01-01",
    "AddedDate": "2022-08-01T01:23:45Z",
    "ModifiedDate": "2022-08-01T01:23:45Z",
    "PwnCount": 6682453,
    "Description": "Example description. We don't use this field.",
    "LogoPath": "/path/to/twitter.com.ico",
    "DataClasses": ["Bios", "Email addresses", "Geographic locations", "Names", "Phone numbers", "Profile photos", "Usernames"],
    "IsVerified": True,
    "IsFabricated": False,
    "IsSensitive": False,
    "IsRetired": False,
    "IsSpamList": False,
    "IsMalware": False,
    "IsSubscriptionFree": False,  # We don't use this field.
}


@mock.patch("bedrock.products.models.requests.get")
def test_sync_db(mock_requests, db):
    """Syncing into an empty table creates one row populated from the payload."""
    mock_requests.return_value.json.return_value = [BREACH_JSON]

    counts = Breach.objects.sync_db()
    added, updated = counts
    assert added == 1
    assert updated == 0
    assert Breach.objects.count() == 1

    breach = Breach.objects.get()

    # Fields compared by equality.
    expected_values = {
        "name": "Twitter",
        "title": "Twitter",
        "domain": "twitter.com",
        "breach_date": parse_date("2022-01-01"),
        "added_date": parse_datetime("2022-08-01T01:23:45Z"),
        "modified_date": parse_datetime("2022-08-01T01:23:45Z"),
        "pwn_count": 6682453,
        "logo_path": "",
        "data_classes": ["Bios", "Email addresses", "Geographic locations", "Names", "Phone numbers", "Profile photos", "Usernames"],
    }
    for field, value in expected_values.items():
        assert getattr(breach, field) == value

    # Boolean flags compared by identity, so a truthy non-bool would fail.
    expected_flags = {
        "is_verified": True,
        "is_fabricated": False,
        "is_sensitive": False,
        "is_retired": False,
        "is_spam_list": False,
        "is_malware": False,
    }
    for field, flag in expected_flags.items():
        assert getattr(breach, field) is flag


@mock.patch("bedrock.products.models.requests.get")
def test_sync_db__update(mock_requests, breach, db):
    """Syncing a payload that differs from the stored row updates it in place."""
    # Build a modified copy rather than mutating the shared module-level
    # payload, which would leak the changed count into other tests.
    payload = {**BREACH_JSON, "PwnCount": 9999999}
    mock_requests.return_value.json.return_value = [payload]

    added, updated = Breach.objects.sync_db()

    assert added == 0
    assert updated == 1
    assert Breach.objects.count() == 1
    # The new value must actually have been persisted, not just counted.
    assert Breach.objects.get().pwn_count == 9999999
4 changes: 4 additions & 0 deletions bedrock/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,10 @@ def lazy_langs():
if DEBUG:
STATICFILES_DIRS += (path("media"),)

# GCS bucket name for media. Configured on a per-deployment basis in envvars. Defaults to dev.
# NOTE: This shouldn't be needed locally unless you're testing GCS uploads.
GCS_MEDIA_BUCKET_NAME = config("GCS_MEDIA_BUCKET_NAME", default="bedrock-nonprod-dev-media")


def set_whitenoise_headers(headers, path, url):
if "/fonts/" in url:
Expand Down
Loading

0 comments on commit ec37e43

Please sign in to comment.