Skip to content

Commit

Permalink
add an option to save empty ocr during warmup - avoiding incremental …
Browse files Browse the repository at this point in the history
…run to process same files
  • Loading branch information
WengerK committed Apr 26, 2024
1 parent f5a8864 commit 6ca5099
Show file tree
Hide file tree
Showing 2 changed files with 191 additions and 1 deletion.
12 changes: 11 additions & 1 deletion modules/entity_to_text_tika/src/Commands/OcrWarmupCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ public function __construct(Connection $connection, EntityTypeManagerInterface $
* The maximum file size in bytes a document can be to be processed.
* This is useful to avoid processing large files.
* [defaults: NULL].
* @option save-empty-ocr
* Save an empty OCR file when the file is not processable.
* Some files may not be processable by Tika (too large, corrupted, ...);
* enabling this option may avoid processing the same file over and over.
* [defaults: FALSE].
* @option stop-on-failure
* Stop processing on the first failure (e.g. Tika is down).
* [defaults: FALSE].
Expand Down Expand Up @@ -118,6 +123,7 @@ public function warmup(
],
'filesize-threshold' => NULL,
'stop-on-failure' => FALSE,
'save-empty-ocr' => FALSE,
'force' => FALSE,
'no-progress' => FALSE,
'dry-run' => FALSE,
Expand All @@ -126,6 +132,7 @@ public function warmup(
$fid = $options['fid'];
$filemime = (array) $options['filemime'];
$filesize_threshold = $options['filesize-threshold'];
$save_empty_ocr = (bool) $options['save-empty-ocr'];
$stop_on_failure = (bool) $options['stop-on-failure'];
$force = (bool) $options['force'];
$dry_run = (bool) $options['dry-run'];
Expand Down Expand Up @@ -188,7 +195,10 @@ public function warmup(
// When the OCR'ed file is not available, then run Tika over it
// and store it for the next run.
$body = $this->fileToText->fromFileToText($file, 'eng+fra');
$this->localFileStorage->save($file, $body, 'eng+fra');

if ($body !== '' || $save_empty_ocr) {
$this->localFileStorage->save($file, $body, 'eng+fra');
}
}

$progressbar_objects->advance();
Expand Down
180 changes: 180 additions & 0 deletions modules/entity_to_text_tika/tests/src/Unit/OcrWarmupCommandTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,7 @@ public function testWarmupDryrun(): void {
'application/pdf',
],
'filesize-threshold' => NULL,
'save-empty-ocr' => FALSE,
'stop-on-failure' => FALSE,
'force' => FALSE,
'no-progress' => FALSE,
Expand Down Expand Up @@ -334,6 +335,7 @@ public function testWarmupForce(): void {
'application/pdf',
],
'filesize-threshold' => NULL,
'save-empty-ocr' => FALSE,
'stop-on-failure' => FALSE,
'force' => TRUE,
'no-progress' => FALSE,
Expand Down Expand Up @@ -405,6 +407,184 @@ public function testWarmupFid(): void {
'application/pdf',
],
'filesize-threshold' => NULL,
'save-empty-ocr' => FALSE,
'stop-on-failure' => FALSE,
'force' => FALSE,
'no-progress' => FALSE,
'dry-run' => FALSE,
]);
}

/**
 * Checks that an empty OCR result is stored when save-empty-ocr is enabled.
 *
 * The query yields two file IDs. The local storage already returns text for
 * file 200, while it returns NULL for file 2039, whose conversion then
 * produces an empty string. With the option turned on, that empty string is
 * expected to be saved exactly once.
 *
 * @covers ::warmup
 */
public function testWarmupSaveEmptyOcr(): void {
  $languages = 'eng+fra';
  $mime_types = ['application/pdf'];

  // Entity query: executed once as the count query, then once more to fetch
  // the file IDs.
  $entity_query = $this->createMock(QueryInterface::class);
  $entity_query->expects(self::once())
    ->method('accessCheck')
    ->with(FALSE);
  $entity_query->expects(self::once())
    ->method('condition')
    ->with('filemime', $mime_types);
  $entity_query->expects(self::once())
    ->method('count')
    ->willReturnSelf();
  $entity_query->expects(self::once())
    ->method('range')
    ->with(0, 100);
  $entity_query->expects(self::exactly(2))
    ->method('execute')
    ->willReturnOnConsecutiveCalls(2, [200, 2039]);

  $this->fileStorage->expects(self::once())
    ->method('getQuery')
    ->willReturn($entity_query);

  // File 200: text is already available from the local storage.
  $stored_file = $this->createMock(File::class);
  $stored_file->expects(self::once())
    ->method('getFileUri')
    ->willReturn('public://file/test.txt');
  $stored_file->expects(self::once())
    ->method('id')
    ->willReturn(200);

  // File 2039: nothing stored yet, so it has to be converted.
  $missing_file = $this->createMock(File::class);
  $missing_file->expects(self::once())
    ->method('getFileUri')
    ->willReturn('public://file/foo.pdf');
  $missing_file->expects(self::once())
    ->method('id')
    ->willReturn(2039);

  $this->fileStorage->expects(self::exactly(2))
    ->method('load')
    ->withConsecutive([200], [2039])
    ->willReturnOnConsecutiveCalls($stored_file, $missing_file);

  $this->localFileStorage->expects(self::exactly(2))
    ->method('load')
    ->withConsecutive(
      [$stored_file, $languages],
      [$missing_file, $languages],
    )
    ->willReturnOnConsecutiveCalls('lorem ipsum', NULL);

  // Only the file without stored text goes through the conversion, which
  // comes back empty here.
  $this->fileToText->expects(self::once())
    ->method('fromFileToText')
    ->with($missing_file, $languages)
    ->willReturn('');

  // The empty body must still be written because save-empty-ocr is TRUE.
  $this->localFileStorage->expects(self::once())
    ->method('save')
    ->with($missing_file, '', $languages);

  $options = [
    'fid' => NULL,
    'filemime' => $mime_types,
    'filesize-threshold' => NULL,
    'save-empty-ocr' => TRUE,
    'stop-on-failure' => FALSE,
    'force' => FALSE,
    'no-progress' => FALSE,
    'dry-run' => FALSE,
  ];
  $this->warmupCommand->warmup($options);
}

/**
* @covers ::warmup
*/
public function testWarmupNoSaveEmptyOcr(): void {
$query = $this->createMock(QueryInterface::class);
$query->expects($this->once())
->method('accessCheck')
->with(FALSE);
$query->expects($this->once())
->method('condition')
->with('filemime', [
'application/pdf',
]);
$query->expects($this->once())
->method('count')
->willReturnSelf();
$query->expects($this->exactly(2))
->method('execute')
->willReturnOnConsecutiveCalls(
// The first call is the count query.
2,
// The second call is the actual query with files IDs.
[200, 2039],
);
$query->expects($this->once())
->method('range')
->with(0, 100);

$this->fileStorage->expects(self::once())
->method('getQuery')
->willReturn($query);

// Create a test file object.
$file200 = $this->createMock(File::class);
$file200->expects(self::once())
->method('getFileUri')
->willReturn('public://file/test.txt');
$file200->expects(self::once())
->method('id')
->willReturn(200);

// Create a test file object.
$file2039 = $this->createMock(File::class);
$file2039->expects(self::once())
->method('getFileUri')
->willReturn('public://file/foo.pdf');
$file2039->expects(self::once())
->method('id')
->willReturn(2039);

$this->fileStorage->expects($this->exactly(2))
->method('load')
->withConsecutive(
[200],
[2039],
)
->willReturnOnConsecutiveCalls($file200, $file2039);

$this->localFileStorage->expects($this->exactly(2))
->method('load')
->withConsecutive(
[$file200, 'eng+fra'],
[$file2039, 'eng+fra'],
)
->willReturnOnConsecutiveCalls('lorem ipsum', NULL);

$this->fileToText->expects($this->once())
->method('fromFileToText')
->with($file2039, 'eng+fra')
->willReturn('');

$this->localFileStorage->expects($this->never())
->method('save');

$this->warmupCommand->warmup([
'fid' => NULL,
'filemime' => [
'application/pdf',
],
'filesize-threshold' => NULL,
'save-empty-ocr' => FALSE,
'stop-on-failure' => FALSE,
'force' => FALSE,
'no-progress' => FALSE,
Expand Down

0 comments on commit 6ca5099

Please sign in to comment.