Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use pygfried in file format identification #176

Merged
3 commits merged on Dec 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ RUN set -ex \
pbzip2 \
pst-utils \
rsync \
siegfried \
sleuthkit \
tesseract-ocr \
tree \
Expand Down
12 changes: 6 additions & 6 deletions a3m/assets/workflow.json
Original file line number Diff line number Diff line change
Expand Up @@ -691,8 +691,8 @@
"filter_file_end": null,
"filter_file_start": null,
"filter_subdir": "objects",
"stderr_file": "%SIPLogsDirectory%fileFormatIdentification.log",
"stdout_file": "%SIPLogsDirectory%fileFormatIdentification.log"
"stderr_file": null,
"stdout_file": null
},
"description": {
"en": "Identify file format",
Expand Down Expand Up @@ -823,8 +823,8 @@
"filter_file_end": null,
"filter_file_start": null,
"filter_subdir": "objects",
"stderr_file": "%SIPLogsDirectory%fileFormatIdentification.log",
"stdout_file": "%SIPLogsDirectory%fileFormatIdentification.log"
"stderr_file": null,
"stdout_file": null
},
"description": {
"en": "Identify file format",
Expand Down Expand Up @@ -2174,8 +2174,8 @@
"filter_file_end": null,
"filter_file_start": null,
"filter_subdir": "objects",
"stderr_file": "%SIPLogsDirectory%fileFormatIdentification.log",
"stdout_file": "%SIPLogsDirectory%fileFormatIdentification.log"
"stderr_file": null,
"stdout_file": null
},
"description": {
"en": "Identify file format",
Expand Down
2 changes: 1 addition & 1 deletion a3m/cli/client/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def _prepare_config(user_pairs):
"""
config = a3m_pb2.ProcessingConfig(
assign_uuids_to_directories=True,
examine_contents=True,
examine_contents=False,
generate_transfer_structure_report=True,
document_empty_directories=True,
extract_packages=True,
Expand Down
11 changes: 2 additions & 9 deletions a3m/client/clientScripts/a3m_download_transfer.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
"""Download transfer object from storage."""
import json
import shutil
import sys
from contextlib import contextmanager
from pathlib import Path
from tempfile import TemporaryDirectory
from urllib.parse import urlparse

import pygfried
import requests
from django.conf import settings

Expand Down Expand Up @@ -42,14 +42,7 @@ def _create_tmpdir(suffix, purpose=None):


def _archived(path):
    """Tell whether ``path`` is identified as an extractable package.

    The file is identified with pygfried and the resulting PUID is looked up
    in ``EXTRACTABLE_PUIDS``.

    :param path: Location of the object to inspect; converted with ``str()``
        before being handed to pygfried.
    :returns: ``True`` when the identified PUID is in ``EXTRACTABLE_PUIDS``,
        ``False`` otherwise.
    """
    # Identification goes through pygfried only; the previous ``sf -json``
    # subprocess result was overwritten by this lookup and never used.
    puid = pygfried.identify(str(path))
    return puid in EXTRACTABLE_PUIDS


Expand Down
147 changes: 45 additions & 102 deletions a3m/client/clientScripts/identify_file_format.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,44 @@
import argparse
import logging
import uuid

import pygfried
from django.db import transaction

from a3m.databaseFunctions import getUTCDate
from a3m.databaseFunctions import insertIntoEvents
from a3m.executeOrRunSubProcess import executeOrRun
from a3m.fpr.models import FormatVersion
from a3m.fpr.models import IDCommand
from a3m.fpr.models import IDRule
from a3m.main.models import File
from a3m.main.models import FileFormatVersion
from a3m.main.models import FileID


def write_identification_event(file_uuid, command, format=None, success=True):
logger = logging.getLogger(__name__)

TOOL_DESCRIPTION = "pygfried/siegfried"
TOOL_VERSION = pygfried.version()


def write_file_format_version(file_obj, format_version_obj):
    """Store ``format_version_obj`` as the format version of ``file_obj``.

    A ``FileFormatVersion`` row keyed on the file is fetched or created. A
    newly created row already carries the requested version through
    ``defaults``; an existing row is overwritten and saved.

    :param file_obj: Object used as the ``file_uuid`` lookup value.
    :param format_version_obj: Value to store in ``format_version``.
    """
    record, is_new = FileFormatVersion.objects.get_or_create(
        file_uuid=file_obj,
        defaults={"format_version": format_version_obj},
    )
    if is_new:
        # Nothing to update: the row was created with the requested version.
        return
    record.format_version = format_version_obj
    record.save()


def write_identification_event(file_uuid, puid=None, success=True):
event_detail_text = 'program="{}"; version="{}"'.format(
command.tool.description, command.tool.version
TOOL_DESCRIPTION, TOOL_VERSION
)
if success:
event_outcome_text = "Positive"
else:
event_outcome_text = "Not identified"

if not format:
format = "No Matching Format"
if not puid or puid == "UNKNOWN":
puid = "No Matching Format"

date = getUTCDate()

Expand All @@ -35,124 +49,54 @@ def write_identification_event(file_uuid, command, format=None, success=True):
eventDateTime=date,
eventDetail=event_detail_text,
eventOutcome=event_outcome_text,
eventOutcomeDetailNote=format,
eventOutcomeDetailNote=puid,
)


def write_file_id(file_uuid, format, output):
def write_file_id(file_id, format_version_obj):
"""
Write the identified format to the DB.

:param str file_uuid: UUID of the file identified
:param FormatVersion format: FormatVersion it was identified as
:param str output: Text that generated the match
"""
if format.pronom_id:
format_registry = "PRONOM"
key = format.pronom_id
else:
format_registry = "Archivematica Format Policy Registry"
key = output

# Sometimes, this is null instead of an empty string
version = format.version or ""

FileID.objects.create(
file_id=file_uuid,
format_name=format.format.description,
format_version=version,
format_registry_name=format_registry,
format_registry_key=key,
file_id=file_id,
format_name=format_version_obj.format.description,
format_version=format_version_obj.version or "",
format_registry_name="PRONOM",
format_registry_key=format_version_obj.pronom_id,
)


def _default_idcommand():
    """Return the default ``fpr.IDCommand``.

    Only one enabled/active command is expected to exist, so the first
    result of the ``active`` manager is used.
    """
    active_commands = IDCommand.active
    return active_commands.first()


def main(job, file_path, file_uuid, disable_reidentify):
command = _default_idcommand()
if command is None:
job.write_error("Unable to determine IDCommand.\n")
return 255

command_uuid = command.uuid
job.print_output("IDCommand:", command.description)
job.print_output("IDCommand UUID:", command.uuid)
job.print_output("IDTool:", command.tool.description)
job.print_output("IDTool UUID:", command.tool.uuid)
job.print_output(f"File: ({file_uuid}) {file_path}")

file_ = File.objects.get(uuid=file_uuid)

def identify_file_format(file_path, file_id, disable_reidentify):
# If reidentification is disabled and a format identification event exists for this file, exit
file_obj = File.objects.get(uuid=file_id)
if (
disable_reidentify
and file_.event_set.filter(event_type="format identification").exists()
and file_obj.event_set.filter(event_type="format identification").exists()
):
job.print_output(
logger.debug(
"This file has already been identified, and re-identification is disabled. Skipping."
)
return 0

exitcode, output, err = executeOrRun(
command.script_type,
command.script,
arguments=[file_path],
printing=False,
capture_output=True,
)
output = output.strip()
try:
puid = pygfried.identify(file_path)
except Exception as err:
logger.error("Error running pygfried: %s", err)
return 255

if exitcode != 0:
job.print_error(f"Error: IDCommand with UUID {command_uuid} exited non-zero.")
job.print_error(f"Error: {err}")
if not puid or puid == "UNKNOWN":
write_identification_event(file_id, success=False)
return 255

job.print_output("Command output:", output)
# PUIDs are the same regardless of tool, so PUID-producing tools don't have "rules" per se - we just
# go straight to the FormatVersion table to see if there's a matching PUID
try:
if command.config == "PUID":
version = FormatVersion.active.get(pronom_id=output)
else:
rule = IDRule.active.get(command_output=output, command=command)
version = rule.format
except IDRule.DoesNotExist:
job.print_error(
'Error: No FPR identification rule for tool output "{}" found'.format(
output
)
)
write_identification_event(file_uuid, command, success=False)
return 255
except IDRule.MultipleObjectsReturned:
job.print_error(
'Error: Multiple FPR identification rules for tool output "{}" found'.format(
output
)
)
write_identification_event(file_uuid, command, success=False)
return 255
format_version_obj = FormatVersion.active.get(pronom_id=puid)
except FormatVersion.DoesNotExist:
job.print_error(f"Error: No FPR format record found for PUID {output}")
write_identification_event(file_uuid, command, success=False)
write_identification_event(file_id, success=False)
return 255

(ffv, created) = FileFormatVersion.objects.get_or_create(
file_uuid=file_, defaults={"format_version": version}
)
if not created: # Update the version if it wasn't created new
ffv.format_version = version
ffv.save()
job.print_output(f"{file_path} identified as a {version.description}")

write_identification_event(file_uuid, command, format=version.pronom_id)
write_file_id(file_uuid=file_uuid, format=version, output=output)
write_file_format_version(file_obj, format_version_obj)
write_identification_event(file_id, puid=puid)
write_file_id(file_id, format_version_obj)

return 0

Expand All @@ -172,8 +116,7 @@ def call(jobs):
with job.JobContext():
args = parser.parse_args(job.args[1:])
job.set_status(
main(
job,
identify_file_format(
args.file_path,
args.file_uuid,
args.disable_reidentify,
Expand Down
Loading