Skip to content

Commit

Permalink
Use pygfried in file format identification
Browse files Browse the repository at this point in the history
This commit simplifies file format identification. It removes all models and
data entries related to file identification from the fpr application.
Instead, it defaults to pygfried, a Python package that makes siegfried
available as a CPython extension.
  • Loading branch information
sevein committed Dec 28, 2021
1 parent acb374e commit 5238045
Show file tree
Hide file tree
Showing 13 changed files with 126 additions and 13,190 deletions.
1 change: 0 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ RUN set -ex \
pbzip2 \
pst-utils \
rsync \
siegfried \
sleuthkit \
tesseract-ocr \
tree \
Expand Down
11 changes: 2 additions & 9 deletions a3m/client/clientScripts/a3m_download_transfer.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
"""Download transfer object from storage."""
import json
import shutil
import sys
from contextlib import contextmanager
from pathlib import Path
from tempfile import TemporaryDirectory
from urllib.parse import urlparse

import pygfried
import requests
from django.conf import settings

Expand Down Expand Up @@ -42,14 +42,7 @@ def _create_tmpdir(suffix, purpose=None):


def _archived(path):
command = ["sf", "-json", path]
exit_code, stdout, stderr = executeOrRun("command", command, capture_output=True)
if exit_code != 0:
raise RetrievalError(
f"Extraction failed, Siegfried quit with exit code {exit_code}"
)
idresults = json.loads(stdout)
puid = idresults["files"][0]["matches"][0]["id"]
puid = pygfried.identify(str(path))
return puid in EXTRACTABLE_PUIDS


Expand Down
147 changes: 45 additions & 102 deletions a3m/client/clientScripts/identify_file_format.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,44 @@
import argparse
import logging
import uuid

import pygfried
from django.db import transaction

from a3m.databaseFunctions import getUTCDate
from a3m.databaseFunctions import insertIntoEvents
from a3m.executeOrRunSubProcess import executeOrRun
from a3m.fpr.models import FormatVersion
from a3m.fpr.models import IDCommand
from a3m.fpr.models import IDRule
from a3m.main.models import File
from a3m.main.models import FileFormatVersion
from a3m.main.models import FileID


def write_identification_event(file_uuid, command, format=None, success=True):
logger = logging.getLogger(__name__)

TOOL_DESCRIPTION = "pygfried/siegfried"
TOOL_VERSION = pygfried.version()


def write_file_format_version(file_obj, format_version_obj):
    """Record ``format_version_obj`` as the format version of ``file_obj``.

    Creates the ``FileFormatVersion`` row keyed on the file if none exists;
    otherwise overwrites the ``format_version`` of the existing row.

    :param file_obj: object used as the ``file_uuid`` lookup value.
    :param format_version_obj: value stored in ``format_version``.
    """
    # update_or_create performs the lookup and the create-or-overwrite in a
    # single call, replacing the manual get_or_create + conditional save().
    FileFormatVersion.objects.update_or_create(
        file_uuid=file_obj, defaults={"format_version": format_version_obj}
    )


def write_identification_event(file_uuid, puid=None, success=True):
event_detail_text = 'program="{}"; version="{}"'.format(
command.tool.description, command.tool.version
TOOL_DESCRIPTION, TOOL_VERSION
)
if success:
event_outcome_text = "Positive"
else:
event_outcome_text = "Not identified"

if not format:
format = "No Matching Format"
if not puid or puid == "UNKNOWN":
puid = "No Matching Format"

date = getUTCDate()

Expand All @@ -35,124 +49,54 @@ def write_identification_event(file_uuid, command, format=None, success=True):
eventDateTime=date,
eventDetail=event_detail_text,
eventOutcome=event_outcome_text,
eventOutcomeDetailNote=format,
eventOutcomeDetailNote=puid,
)


def write_file_id(file_uuid, format, output):
def write_file_id(file_id, format_version_obj):
"""
Write the identified format to the DB.
:param str file_uuid: UUID of the file identified
:param FormatVersion format: FormatVersion it was identified as
:param str output: Text that generated the match
"""
if format.pronom_id:
format_registry = "PRONOM"
key = format.pronom_id
else:
format_registry = "Archivematica Format Policy Registry"
key = output

# Sometimes, this is null instead of an empty string
version = format.version or ""

FileID.objects.create(
file_id=file_uuid,
format_name=format.format.description,
format_version=version,
format_registry_name=format_registry,
format_registry_key=key,
file_id=file_id,
format_name=format_version_obj.format.description,
format_version=format_version_obj.version or "",
format_registry_name="PRONOM",
format_registry_key=format_version_obj.pronom_id,
)


def _default_idcommand():
"""Retrieve the default ``fpr.IDCommand``.
We only expect to find one command enabled/active.
"""
return IDCommand.active.first()


def main(job, file_path, file_uuid, disable_reidentify):
command = _default_idcommand()
if command is None:
job.write_error("Unable to determine IDCommand.\n")
return 255

command_uuid = command.uuid
job.print_output("IDCommand:", command.description)
job.print_output("IDCommand UUID:", command.uuid)
job.print_output("IDTool:", command.tool.description)
job.print_output("IDTool UUID:", command.tool.uuid)
job.print_output(f"File: ({file_uuid}) {file_path}")

file_ = File.objects.get(uuid=file_uuid)

def identify_file_format(file_path, file_id, disable_reidentify):
# If reidentification is disabled and a format identification event exists for this file, exit
file_obj = File.objects.get(uuid=file_id)
if (
disable_reidentify
and file_.event_set.filter(event_type="format identification").exists()
and file_obj.event_set.filter(event_type="format identification").exists()
):
job.print_output(
logger.debug(
"This file has already been identified, and re-identification is disabled. Skipping."
)
return 0

exitcode, output, err = executeOrRun(
command.script_type,
command.script,
arguments=[file_path],
printing=False,
capture_output=True,
)
output = output.strip()
try:
puid = pygfried.identify(file_path)
except Exception as err:
logger.error("Error running pygfried: %s", err)
return 255

if exitcode != 0:
job.print_error(f"Error: IDCommand with UUID {command_uuid} exited non-zero.")
job.print_error(f"Error: {err}")
if not puid or puid == "UNKNOWN":
write_identification_event(file_id, success=False)
return 255

job.print_output("Command output:", output)
# PUIDs are the same regardless of tool, so PUID-producing tools don't have "rules" per se - we just
# go straight to the FormatVersion table to see if there's a matching PUID
try:
if command.config == "PUID":
version = FormatVersion.active.get(pronom_id=output)
else:
rule = IDRule.active.get(command_output=output, command=command)
version = rule.format
except IDRule.DoesNotExist:
job.print_error(
'Error: No FPR identification rule for tool output "{}" found'.format(
output
)
)
write_identification_event(file_uuid, command, success=False)
return 255
except IDRule.MultipleObjectsReturned:
job.print_error(
'Error: Multiple FPR identification rules for tool output "{}" found'.format(
output
)
)
write_identification_event(file_uuid, command, success=False)
return 255
format_version_obj = FormatVersion.active.get(pronom_id=puid)
except FormatVersion.DoesNotExist:
job.print_error(f"Error: No FPR format record found for PUID {output}")
write_identification_event(file_uuid, command, success=False)
write_identification_event(file_id, success=False)
return 255

(ffv, created) = FileFormatVersion.objects.get_or_create(
file_uuid=file_, defaults={"format_version": version}
)
if not created: # Update the version if it wasn't created new
ffv.format_version = version
ffv.save()
job.print_output(f"{file_path} identified as a {version.description}")

write_identification_event(file_uuid, command, format=version.pronom_id)
write_file_id(file_uuid=file_uuid, format=version, output=output)
write_file_format_version(file_obj, format_version_obj)
write_identification_event(file_id, puid=puid)
write_file_id(file_id, format_version_obj)

return 0

Expand All @@ -172,8 +116,7 @@ def call(jobs):
with job.JobContext():
args = parser.parse_args(job.args[1:])
job.set_status(
main(
job,
identify_file_format(
args.file_path,
args.file_uuid,
args.disable_reidentify,
Expand Down
Loading

0 comments on commit 5238045

Please sign in to comment.