Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add breach data model and sync command #13782

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions bedrock/products/management/commands/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
27 changes: 27 additions & 0 deletions bedrock/products/management/commands/sync_breaches.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

from django.core.management.base import BaseCommand

from bedrock.products.models import Breach
from bedrock.utils.management.decorators import alert_sentry_on_exception


@alert_sentry_on_exception
class Command(BaseCommand):
    """Management command that syncs breach records, then syncs their logos."""

    help = "Sync breach data into the database and sync breach logos."

    # Class-level default so `output()` is safe to call even before `handle()`
    # has read the parsed options.
    quiet = False

    def add_arguments(self, parser):
        """Register the ``-q/--quiet`` flag on the command's argument parser."""
        parser.add_argument(
            "-q",
            "--quiet",
            action="store_true",
            dest="quiet",
            default=False,
            help="If no error occurs, swallow all output.",
        )

    def output(self, msg):
        """Print ``msg`` unless the command is running in quiet mode."""
        if not self.quiet:
            print(msg)

    def handle(self, *args, **options):
        """Run the database sync, report the counts, then run the logo sync."""
        self.quiet = options["quiet"]

        added, updated = Breach.objects.sync_db()
        self.output(f"Breaches added: {added}")
        self.output(f"Breaches updated: {updated}")

        # The logo sync does its own printing; pass the quiet preference through.
        Breach.objects.sync_logos(verbose=not self.quiet)
40 changes: 40 additions & 0 deletions bedrock/products/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.

# Generated by Django 3.2.21 on 2023-09-26 17:50

from django.db import migrations, models


class Migration(migrations.Migration):
    # Initial migration: creates the table backing the `Breach` model.
    initial = True

    # This migration does not depend on any other migration.
    dependencies = []

    operations = [
        migrations.CreateModel(
            name="Breach",
            fields=[
                ("id", models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name="ID")),
                # `name` is the only column with a uniqueness constraint.
                ("name", models.CharField(max_length=255, unique=True)),
                ("title", models.CharField(max_length=255)),
                ("domain", models.CharField(max_length=255)),
                # All three date columns are nullable.
                ("breach_date", models.DateField(null=True)),
                ("added_date", models.DateTimeField(null=True)),
                ("modified_date", models.DateTimeField(null=True)),
                ("pwn_count", models.PositiveIntegerField(default=0)),
                ("logo_path", models.CharField(max_length=255)),
                # JSON column that defaults to an empty list.
                ("data_classes", models.JSONField(default=list)),
                # Boolean flags, all defaulting to False.
                ("is_verified", models.BooleanField(default=False)),
                ("is_fabricated", models.BooleanField(default=False)),
                ("is_sensitive", models.BooleanField(default=False)),
                ("is_retired", models.BooleanField(default=False)),
                ("is_spam_list", models.BooleanField(default=False)),
                ("is_malware", models.BooleanField(default=False)),
            ],
            options={
                "verbose_name_plural": "Breaches",
            },
        ),
    ]
3 changes: 3 additions & 0 deletions bedrock/products/migrations/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
172 changes: 172 additions & 0 deletions bedrock/products/models.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,175 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
from tempfile import TemporaryFile

from django.conf import settings
from django.db import models
from django.utils.dateparse import parse_date, parse_datetime

import requests
from google.cloud import storage


class BreachManager(models.Manager):
    """Manager that syncs breach rows from the HIBP API and breach logos to GCS."""

    # Seconds to wait on any outbound HTTP request, so a stalled remote server
    # cannot hang the sync indefinitely (requests has no default timeout).
    REQUEST_TIMEOUT = 30

    def sync_db(self):
        """Fetch the breach list from the HIBP API and create/update local rows.

        Returns:
            A ``(breaches_added, breaches_updated)`` tuple of counts.

        Raises:
            requests.RequestException: if the request fails, times out, or the
                API answers with an error status.
        """
        BREACH_URL = "https://haveibeenpwned.com/api/v3/breaches"

        response = requests.get(BREACH_URL, headers={"User-Agent": "mozilla-org"}, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        breaches_added = 0
        breaches_updated = 0

        for data in response.json():
            breach, created = self.get_or_create(name=data["Name"])
            obj_data = {
                "title": data["Title"],
                "domain": data["Domain"],
                # Dates arrive as strings; parse them up front so they compare
                # equal to the values already stored on the model.
                "breach_date": parse_date(data["BreachDate"]) if data["BreachDate"] else None,
                "added_date": parse_datetime(data["AddedDate"]) if data["AddedDate"] else None,
                "modified_date": parse_datetime(data["ModifiedDate"]) if data["ModifiedDate"] else None,
                "pwn_count": data["PwnCount"],
                "logo_path": "",  # We aren't using the hibp logos because they are too large and inconsistent.
                "data_classes": data["DataClasses"],
                "is_verified": data["IsVerified"],
                "is_fabricated": data["IsFabricated"],
                "is_sensitive": data["IsSensitive"],
                "is_retired": data["IsRetired"],
                "is_spam_list": data["IsSpamList"],
                "is_malware": data["IsMalware"],
            }

            # Only touch (and later save) the row when something actually differs.
            changed = False
            for key, value in obj_data.items():
                if getattr(breach, key, None) != value:
                    changed = True
                    setattr(breach, key, value)

            if changed:
                breach.save()

            # A newly created row always counts as added, even if every synced
            # value happens to equal the field defaults.
            if created:
                breaches_added += 1
            elif changed:
                breaches_updated += 1

        return breaches_added, breaches_updated

    def sync_logos(self, verbose=True):
        """Ensure every breach with a domain has a logo in GCS and a matching ``logo_path``.

        Logos missing from the bucket are fetched from the DuckDuckGo icon
        service and uploaded. Failures for an individual breach are reported
        (when ``verbose``) and skipped so the remaining breaches still sync.

        Args:
            verbose: When true, print progress and failure messages.

        Returns:
            None. Returns early if the bucket listing cannot be fetched.
        """
        GCS_DIR = "media/"
        GCS_PATH = "img/products/monitor/breach_logos/"

        def _log(msg):
            # All output goes through here so `verbose=False` is fully silent.
            if verbose:
                print(msg)

        def _urlize(path):
            # Convert a GCS path to the full static URL to the logo.
            return path.replace(GCS_DIR, settings.STATIC_URL, 1)

        _log("Syncing breach logos...")

        # Get the set of all breach logo URLs currently in GCS.
        # Deliberately broad catch: any failure talking to GCS aborts the logo sync.
        try:
            client = storage.Client()
            bucket = client.get_bucket(settings.GCS_MEDIA_BUCKET_NAME)
            blob_list = bucket.list_blobs(prefix=GCS_DIR + GCS_PATH)
            gcs_logos = {_urlize(blob.name) for blob in blob_list}
        except Exception as e:
            _log(f"Failed to get list of GCS logos: {e}. Aborting.")
            return

        for breach in self.all():
            if not breach.domain:
                _log(f"Skipping {breach.name} because it has no domain.")
                continue

            # Check if the breach has a logo_path value and if it exists in GCS.
            if breach.logo_path and breach.logo_path in gcs_logos:
                _log(f"Skipping {breach.name} because it already has an existing logo.")
                continue

            # NOTE: We are storing the full logo URL in the logo_path field since the db is per deployment environment.
            # This allows us to reference the logo images locally without needing to download them.
            logo_path = f"{GCS_DIR}{GCS_PATH}{breach.domain.lower()}.ico"
            logo_url = _urlize(logo_path)

            # Check if the logo exists in GCS. If so, no reason to re-fetch from DDG.
            if logo_url in gcs_logos:
                breach.logo_path = logo_url
                breach.save()
                _log(f"Found existing logo for {breach.name} in GCS. Updating db.")
                continue

            # Fetch the logo from the ddg api. A network error for one domain
            # is treated like a non-200 response: report it and move on.
            try:
                resp = requests.get(
                    f"https://icons.duckduckgo.com/ip3/{breach.domain}.ico",
                    headers={"User-Agent": "mozilla-org"},
                    timeout=self.REQUEST_TIMEOUT,
                )
            except requests.RequestException as e:
                _log(f"Failed to fetch logo for {breach.name} from ddg api: {e}")
                continue

            if resp.status_code != 200:
                _log(f"Failed to fetch logo for {breach.name} from ddg api. Status code: {resp.status_code}.")
                continue

            # Save the logo to a temp file then upload to GCS.
            with TemporaryFile() as tf:
                tf.write(resp.content)
                tf.seek(0)
                try:
                    blob = bucket.blob(logo_path)
                    blob.upload_from_file(tf)
                except Exception as e:
                    _log(f"Failed to upload logo for {breach.name} to GCS: {e}")
                    continue
            _log(f"Uploaded logo for {breach.name} to GCS: {logo_path}")

            # Update the logo_path value in the db.
            breach.logo_path = logo_url
            breach.save()
            _log(f"Saved logo for {breach.name} to {breach.logo_path}")

            # Remember the URL (the same form used for the lookups above) so a
            # later breach sharing this domain reuses the upload.
            gcs_logos.add(logo_url)


class Breach(models.Model):
    """A single data breach record, populated by ``BreachManager.sync_db``."""

    # Breach names that are always categorised as data aggregators,
    # regardless of their other attributes.
    _DATA_AGGREGATOR_NAMES = ("Exactis", "Apollo", "YouveBeenScraped", "ElasticsearchSalesLeads", "Estonia", "MasterDeeds", "PDL")

    name = models.CharField(max_length=255, unique=True)
    title = models.CharField(max_length=255)
    domain = models.CharField(max_length=255)
    breach_date = models.DateField(null=True)
    added_date = models.DateTimeField(null=True)
    modified_date = models.DateTimeField(null=True)
    pwn_count = models.PositiveIntegerField(default=0)
    # Note: The description is unused on the site and not included to reduce the size of the database.
    # description = models.TextField()
    logo_path = models.CharField(max_length=255)
    data_classes = models.JSONField(default=list)
    is_verified = models.BooleanField(default=False)
    is_fabricated = models.BooleanField(default=False)
    is_sensitive = models.BooleanField(default=False)
    is_retired = models.BooleanField(default=False)
    is_spam_list = models.BooleanField(default=False)
    is_malware = models.BooleanField(default=False)

    objects = BreachManager()

    class Meta:
        verbose_name_plural = "Breaches"

    def __str__(self):
        return self.name

    @property
    def category(self):
        """Return the category slug for this breach.

        Checked in order: a known aggregator name, the sensitive flag, then
        whether a domain is present. Breaches with no domain fall back to the
        aggregator category.
        """
        if self.name in self._DATA_AGGREGATOR_NAMES:
            return "data-aggregator-breach"
        if self.is_sensitive:
            return "sensitive-breach"
        if self.domain != "":
            return "website-breach"
        return "data-aggregator-breach"

    @property
    def is_delayed(self):
        """Return whether `breach_date` and `added_date` are more than 90 days apart.

        Both columns are nullable; when either is missing the gap is unknown
        and the breach is reported as not delayed instead of raising.
        """
        if self.added_date is None or self.breach_date is None:
            return False
        return abs((self.added_date.date() - self.breach_date).days) > 90
Empty file.
26 changes: 26 additions & 0 deletions bedrock/products/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from django.utils.dateparse import parse_date, parse_datetime

import pytest

from bedrock.products.models import Breach


@pytest.fixture
def breach(db):
    """Create and return a saved, verified "Twitter" breach row for tests."""
    # Added and modified timestamps share the same value.
    timestamp = parse_datetime("2022-08-01T01:23:45Z")
    exposed_data = ["Bios", "Email addresses", "Geographic locations", "Names", "Phone numbers", "Profile photos", "Usernames"]

    field_values = {
        "name": "Twitter",
        "title": "Twitter",
        "domain": "twitter.com",
        "breach_date": parse_date("2022-01-01"),
        "added_date": timestamp,
        "modified_date": timestamp,
        "pwn_count": 6682453,
        "logo_path": "/path/to/twitter.com.ico",
        "data_classes": exposed_data,
        "is_verified": True,
        "is_fabricated": False,
        "is_sensitive": False,
        "is_retired": False,
        "is_spam_list": False,
        "is_malware": False,
    }
    return Breach.objects.create(**field_values)
98 changes: 98 additions & 0 deletions bedrock/products/tests/test_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.


from unittest import mock

from django.utils.dateparse import parse_date, parse_datetime

import pytest

from bedrock.products.models import Breach


def _update(**kwargs):
    """Apply ``kwargs`` as a queryset update to the breach row named "Twitter"."""
    twitter_rows = Breach.objects.filter(name="Twitter")
    twitter_rows.update(**kwargs)


def test_is_delayed(breach):
    """The fixture breach, as created, reports itself as delayed."""
    delayed = breach.is_delayed
    assert delayed is True


def test_is_not_delayed(breach):
    """Moving the breach date to 2022-07-01 makes the breach not delayed."""
    later_breach_date = parse_date("2022-07-01")
    _update(breach_date=later_breach_date)

    # Re-read the row: the queryset update does not refresh the fixture instance.
    reloaded = Breach.objects.get(name="Twitter")
    assert reloaded.is_delayed is False


@pytest.mark.parametrize(
    "kwargs, expected",
    [
        (None, "website-breach"),
        ({"name": "Apollo"}, "data-aggregator-breach"),
        ({"is_sensitive": True}, "sensitive-breach"),
        ({"domain": ""}, "data-aggregator-breach"),
    ],
)
def test_category(breach, kwargs, expected):
    """Each field override (or none at all) yields the expected category slug."""
    if kwargs:
        _update(**kwargs)

    # Only one row exists, so an unfiltered get() returns the updated breach.
    reloaded = Breach.objects.get()
    assert reloaded.category == expected


# Sample payload for a single breach. The sync tests below return it (wrapped in
# a list) from the mocked `requests.get(...).json()` call that `sync_db` reads.
BREACH_JSON = {
    "Name": "Twitter",
    "Title": "Twitter",
    "Domain": "twitter.com",
    "BreachDate": "2022-01-01",
    "AddedDate": "2022-08-01T01:23:45Z",
    "ModifiedDate": "2022-08-01T01:23:45Z",
    "PwnCount": 6682453,
    "Description": "Example description. We don't use this field.",
    "LogoPath": "/path/to/twitter.com.ico",
    "DataClasses": ["Bios", "Email addresses", "Geographic locations", "Names", "Phone numbers", "Profile photos", "Usernames"],
    "IsVerified": True,
    "IsFabricated": False,
    "IsSensitive": False,
    "IsRetired": False,
    "IsSpamList": False,
    "IsMalware": False,
    "IsSubscriptionFree": False,  # We don't use this field.
}


@mock.patch("bedrock.products.models.requests.get")
def test_sync_db(mock_requests, db):
    """Syncing into an empty table creates one row mirroring the mocked payload."""
    mock_requests.return_value.json.return_value = [BREACH_JSON]

    added, updated = Breach.objects.sync_db()
    assert added == 1
    assert updated == 0
    assert Breach.objects.count() == 1

    stored = Breach.objects.get()

    # Fields compared by equality.
    expected_values = {
        "name": "Twitter",
        "title": "Twitter",
        "domain": "twitter.com",
        "breach_date": parse_date("2022-01-01"),
        "added_date": parse_datetime("2022-08-01T01:23:45Z"),
        "modified_date": parse_datetime("2022-08-01T01:23:45Z"),
        "pwn_count": 6682453,
        "logo_path": "",
        "data_classes": ["Bios", "Email addresses", "Geographic locations", "Names", "Phone numbers", "Profile photos", "Usernames"],
    }
    for field_name, expected in expected_values.items():
        assert getattr(stored, field_name) == expected

    # Flags compared by identity so that only real booleans pass.
    expected_flags = {
        "is_verified": True,
        "is_fabricated": False,
        "is_sensitive": False,
        "is_retired": False,
        "is_spam_list": False,
        "is_malware": False,
    }
    for flag_name, expected in expected_flags.items():
        assert getattr(stored, flag_name) is expected


@mock.patch("bedrock.products.models.requests.get")
def test_sync_db__update(mock_requests, breach, db):
    """Syncing a changed payload for an existing breach updates it in place."""
    # Build a modified copy rather than mutating the shared module-level payload,
    # so the changed PwnCount cannot leak into other tests.
    updated_json = {**BREACH_JSON, "PwnCount": 9999999}
    mock_requests.return_value.json.return_value = [updated_json]

    added, updated = Breach.objects.sync_db()
    assert added == 0
    assert updated == 1
    assert Breach.objects.count() == 1
4 changes: 4 additions & 0 deletions bedrock/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,10 @@ def lazy_langs():
if DEBUG:
STATICFILES_DIRS += (path("media"),)

# GCS bucket name for media. Configured on a per-deployment basis in envvars. Defaults to dev.
# NOTE: This shouldn't be needed locally unless you're testing GCS uploads.
# The breach logo sync reads this setting to pick the bucket it lists and uploads to.
GCS_MEDIA_BUCKET_NAME = config("GCS_MEDIA_BUCKET_NAME", default="bedrock-nonprod-dev-media")


def set_whitenoise_headers(headers, path, url):
if "/fonts/" in url:
Expand Down
Loading