import argparse

from django.contrib.auth import get_user_model
from django.db import transaction

from django_tqdm import BaseCommand

from api.constants.moderation import DecisionAction
from api.models import (
    AudioDecision,
    AudioDecisionThrough,
    AudioReport,
    ImageDecision,
    ImageDecisionThrough,
    ImageReport,
)
from api.models.media import DMCA, MATURE_FILTERED, NO_ACTION, PENDING


class Command(BaseCommand):
    """
    Create moderation decisions for reports that were resolved without one.

    Every report of the chosen media type that is not pending and has no
    decision gets a new decision whose action is derived from the report's
    status and reason (see ``get_action``). The report is then linked to that
    decision, and a through-table row ties the decision to the reported media.
    """

    help = "Back-fill the moderation decision table for a given media type."
    # Number of reports converted per loop iteration (and per transaction).
    batch_size = 3

    @staticmethod
    def add_arguments(parser):
        """Register the ``--dry-run``, ``--media-type`` and ``--moderator`` options."""
        parser.add_argument(
            "--dry-run",
            help="Count reports to process, and don't do anything else.",
            # No ``type=`` here: ``BooleanOptionalAction`` takes no value, and
            # passing ``type`` to it is deprecated since Python 3.12.
            default=True,
            action=argparse.BooleanOptionalAction,
        )
        parser.add_argument(
            "--media-type",
            help="The media type to back-fill moderation decisions.",
            type=str,
            default="image",
            choices=["image", "audio"],
        )
        parser.add_argument(
            "--moderator",
            help="The username of the moderator to attribute the decisions to.",
            type=str,
            default="opener",
        )

    def handle(self, *args, **options):
        """
        Back-fill decisions in batches of ``batch_size`` reports.

        With ``dry_run`` set (the default) only the number of eligible reports
        is printed. Otherwise the moderator is looked up by username; if no
        such user exists an error is emitted and nothing is written.
        """
        dry = options["dry_run"]
        username = options["moderator"]
        media_type = options["media_type"]

        MediaReport = ImageReport
        MediaDecision = ImageDecision
        MediaDecisionThrough = ImageDecisionThrough
        if media_type == "audio":
            MediaReport = AudioReport
            MediaDecision = AudioDecision
            MediaDecisionThrough = AudioDecisionThrough

        non_pending_reports = MediaReport.objects.filter(decision=None).exclude(
            status=PENDING
        )
        count_to_process = non_pending_reports.count()

        if dry:
            self.info(
                f"{count_to_process} {media_type} reports to back-fill. "
                f"This is a dry run, exiting without making changes."
            )
            return

        if not count_to_process:
            self.info("No reports to process.")
            return

        # Ceiling division: a trailing partial batch is still one iteration,
        # so it must be counted in the progress-bar total.
        batch_count = -(-count_to_process // self.batch_size)
        t = self.tqdm(total=batch_count)
        User = get_user_model()
        try:
            moderator = User.objects.get(username=username)
        except User.DoesNotExist:
            t.error(f"User '{username}' not found.")
            return

        # Each pass re-evaluates the sliced queryset. Reports handled in the
        # previous pass now have a decision and no longer match the filter,
        # so the loop ends once every eligible report has been processed.
        while reports_chunk := list(non_pending_reports[: self.batch_size]):
            # The three writes of a batch succeed or fail together, so a
            # failure cannot leave decisions that no report points to.
            with transaction.atomic():
                decisions = MediaDecision.objects.bulk_create(
                    [
                        MediaDecision(
                            action=self.get_action(report),
                            moderator=moderator,
                            notes="__backfilled_from_report_status",
                        )
                        for report in reports_chunk
                    ]
                )
                for report, decision in zip(reports_chunk, decisions):
                    report.decision = decision
                MediaReport.objects.bulk_update(reports_chunk, ["decision"])
                MediaDecisionThrough.objects.bulk_create(
                    [
                        MediaDecisionThrough(
                            media_obj=report.media_obj, decision=decision
                        )
                        for report, decision in zip(reports_chunk, decisions)
                    ]
                )
            t.update(1)

        t.info(
            self.style.SUCCESS(
                f"Created {count_to_process} {media_type} moderation decisions from existing reports."
            )
        )

    @staticmethod
    def get_action(report):
        """
        Map a resolved report to the decision action it implies.

        ``MATURE_FILTERED`` becomes ``MARKED_SENSITIVE`` and ``NO_ACTION``
        becomes ``REJECTED_REPORTS``, regardless of the report's reason. Any
        other status is treated as de-indexed: a ``DMCA`` reason gives
        ``DEINDEXED_COPYRIGHT``, every other reason ``DEINDEXED_SENSITIVE``.
        """
        if report.status == MATURE_FILTERED:
            return DecisionAction.MARKED_SENSITIVE

        if report.status == NO_ACTION:
            return DecisionAction.REJECTED_REPORTS

        # Cases with status = DEINDEXED
        if report.reason == DMCA:
            return DecisionAction.DEINDEXED_COPYRIGHT

        return DecisionAction.DEINDEXED_SENSITIVE  # For reasons MATURE and OTHER
from io import StringIO

from django.core.management import call_command

import pytest

from api.constants.moderation import DecisionAction
from api.models import (
    DEINDEXED,
    DMCA,
    MATURE,
    MATURE_FILTERED,
    NO_ACTION,
    OTHER,
    AudioDecision,
    AudioDecisionThrough,
    ImageDecision,
    ImageDecisionThrough,
)
from test.factory.models.audio import AudioReportFactory
from test.factory.models.image import ImageReportFactory
from test.factory.models.oauth2 import UserFactory


def call_cmd(**options):
    """Run ``backfillmoderationdecision`` and return ``(stdout, stderr)`` as strings."""
    out = StringIO()
    err = StringIO()
    call_command(
        "backfillmoderationdecision",
        **options,
        stdout=out,
        stderr=err,
    )
    return out.getvalue(), err.getvalue()


def make_reports(media_type, reason: str, status: str, count: int = 1):
    """Create ``count`` reports of the given media type with the given reason and status."""
    if media_type == "audio":
        return AudioReportFactory.create_batch(count, status=status, reason=reason)
    else:
        return ImageReportFactory.create_batch(count, status=status, reason=reason)


@pytest.mark.parametrize(
    ("reason", "status", "expected_action"),
    (
        (MATURE, MATURE_FILTERED, DecisionAction.MARKED_SENSITIVE),
        (DMCA, MATURE_FILTERED, DecisionAction.MARKED_SENSITIVE),
        (OTHER, MATURE_FILTERED, DecisionAction.MARKED_SENSITIVE),
        (MATURE, NO_ACTION, DecisionAction.REJECTED_REPORTS),
        (DMCA, NO_ACTION, DecisionAction.REJECTED_REPORTS),
        (OTHER, NO_ACTION, DecisionAction.REJECTED_REPORTS),
        (MATURE, DEINDEXED, DecisionAction.DEINDEXED_SENSITIVE),
        (DMCA, DEINDEXED, DecisionAction.DEINDEXED_COPYRIGHT),
        (OTHER, DEINDEXED, DecisionAction.DEINDEXED_SENSITIVE),
    ),
)
@pytest.mark.parametrize("media_type", ("image", "audio"))
@pytest.mark.django_db
def test_create_moderation_decision_for_reports(
    media_type, reason, status, expected_action
):
    """A resolved report yields one decision with the action its status and reason imply."""
    username = "opener"
    UserFactory.create(username=username)

    report = make_reports(media_type=media_type, reason=reason, status=status)[0]

    out, _ = call_cmd(dry_run=False, media_type=media_type, moderator=username)

    MediaDecision = ImageDecision if media_type == "image" else AudioDecision
    MediaDecisionThrough = (
        ImageDecisionThrough if media_type == "image" else AudioDecisionThrough
    )
    assert MediaDecision.objects.count() == 1
    assert f"Created 1 {media_type} moderation decisions from existing reports." in out

    decision = MediaDecision.objects.first()
    assert decision.media_objs.count() == 1
    assert decision.action == expected_action
    assert decision.moderator.username == username

    decision_through = MediaDecisionThrough.objects.first()
    assert decision_through.media_obj == report.media_obj
    assert decision_through.decision == decision

    # The command also has to point the report at the decision it created;
    # reload the row because the command updated it in the database.
    report.refresh_from_db()
    assert report.decision == decision


@pytest.mark.django_db
def test_catch_user_exception():
    """An unknown moderator username is reported on stderr and nothing is written."""
    make_reports(media_type="image", reason=MATURE, status=MATURE_FILTERED)
    _, err = call_cmd(dry_run=False, moderator="nonexistent")

    assert "User 'nonexistent' not found." in err
    assert ImageDecision.objects.count() == 0