Skip to content

Commit

Permalink
rename PlainTextStorage to LocalFileStorage, add Interface for contract
Browse files Browse the repository at this point in the history
  • Loading branch information
WengerK committed Apr 13, 2024
1 parent 83da1aa commit 270a614
Show file tree
Hide file tree
Showing 11 changed files with 169 additions and 110 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,13 +89,13 @@ or for an advanced usage avoiding multiple calls to Tika:

```php
// Load the already OCR'ed file if possible to avoid unnecessary calls to Tika.
$body = \Drupal::service('entity_to_text_tika.storage.plain_text')->loadTextFromFile($file, 'eng+fra');
$body = \Drupal::service('entity_to_text_tika.storage.local_file')->load($file, 'eng+fra');

if (!$body) {
// When the OCR'ed file is not available, then run Tika over it and store it for the next run.
$body = \Drupal::service('entity_to_text_tika.extractor.file_to_text')->fromFileToText($file, 'eng+fra');
// Save the OCR'ed file for the next run.
\Drupal::service('entity_to_text_tika.storage.plain_text')->saveTextToFile($file, $body, 'eng+fra');
\Drupal::service('entity_to_text_tika.storage.local_file')->save($file, $body, 'eng+fra');
}
```

Expand Down
2 changes: 1 addition & 1 deletion modules/entity_to_text_tika/drush.services.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@ services:
arguments:
- '@entity_type.manager'
- '@entity_to_text_tika.extractor.file_to_text'
- '@entity_to_text_tika.storage.plain_text'
- '@entity_to_text_tika.storage.local_file'
tags:
- { name: drush.command }
10 changes: 5 additions & 5 deletions modules/entity_to_text_tika/entity_to_text_tika.install
Original file line number Diff line number Diff line change
Expand Up @@ -22,22 +22,22 @@ function entity_to_text_tika_requirements($phase = 'runtime') {
}

$requirements['entity_to_text_tika_private'] = [
'title' => t('Entity to Text (Tika): Private schema'),
'title' => t('Entity to Text (Tika): Local File Storage (OCR cache)'),
'description' => t('Entity to Text Tika expose a Local File Storage optimisation in order to store OCR of document in the private:// schema. The current private schema configuration can leverage it.'),
'value' => t('Private file system is set and writtable.'),
];

// Check if the private file stream wrapper is ready to use.
if (!$stream_wrapper_manager->isValidScheme('private')) {
$requirements['entity_to_text_tika_private']['value'] = 'Private file system is not set.';
$requirements['entity_to_text_tika_private']['description'] = t('Entity to Text Tika will store OCR of document in the private:// schema, there this one must be configured before installing.');
$requirements['entity_to_text_tika_private']['severity'] = REQUIREMENT_ERROR;
$requirements['entity_to_text_tika_private']['description'] = t('Entity to Text Tika expose a Local File Storage optimisation in order to store OCR of document in the private:// schema. The current private schema configuration cannot leverage it.');
$requirements['entity_to_text_tika_private']['severity'] = REQUIREMENT_INFO;
}

$private_path = $file_system->realpath('private://');
// Check if the resolved private directory exists and is writable.
if (!is_dir($private_path) || !is_writable($private_path)) {
$requirements['entity_to_text_tika_private']['value'] = 'Private file system is not writtable.';
$requirements['entity_to_text_tika_private']['description'] = t('The resolved private directory %directory% seems not writable.', ['%directory%' => $private_path]);
$requirements['entity_to_text_tika_private']['value'] = t('Entity to Text Tika expose a Local File Storage optimisation in order to store OCR of document in the private:// schema. The current private schema configuration cannot leverage it. The resolved private directory %directory% seems not writable.', ['%directory%' => $private_path]);
$requirements['entity_to_text_tika_private']['severity'] = REQUIREMENT_ERROR;
}

Expand Down
4 changes: 2 additions & 2 deletions modules/entity_to_text_tika/entity_to_text_tika.services.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ services:
- '@event_dispatcher'

# Storage
entity_to_text_tika.storage.plain_text:
class: Drupal\entity_to_text_tika\Storage\PlaintextStorage
entity_to_text_tika.storage.local_file:
class: Drupal\entity_to_text_tika\Storage\LocalFileStorage
arguments:
- '@file_system'
- '@logger.factory'
Expand Down
16 changes: 8 additions & 8 deletions modules/entity_to_text_tika/src/Commands/OcrWarmupCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

use Drupal\Core\Entity\EntityTypeManagerInterface;
use Drupal\entity_to_text_tika\Extractor\FileToText;
use Drupal\entity_to_text_tika\Storage\PlaintextStorage;
use Drupal\entity_to_text_tika\Storage\StorageInterface;
use Drush\Commands\DrushCommands;
use Symfony\Component\Console\Helper\ProgressBar;
use Symfony\Component\Console\Output\OutputInterface;
Expand Down Expand Up @@ -36,19 +36,19 @@ class OcrWarmupCommand extends DrushCommands {
protected $fileToText;

/**
* The Plain-text storage processor.
* The Plain-text storage cache processor.
*
* @var \Drupal\entity_to_text_tika\Storage\PlaintextStorage
* @var \Drupal\entity_to_text_tika\Storage\StorageInterface
*/
protected $plaintextStorage;
protected $localFileStorage;

/**
* Warmup OCR caches for Tika constructor.
*/
public function __construct(EntityTypeManagerInterface $entity_type_manager, FileToText $file_to_text, PlaintextStorage $plaintext_storage) {
public function __construct(EntityTypeManagerInterface $entity_type_manager, FileToText $file_to_text, StorageInterface $local_storage) {
$this->fileStorage = $entity_type_manager->getStorage('file');
$this->fileToText = $file_to_text;
$this->plaintextStorage = $plaintext_storage;
$this->localFileStorage = $local_storage;
}

/**
Expand Down Expand Up @@ -149,13 +149,13 @@ public function warmup(array $options = [
}

// Load the already OCR'ed file if possible.
$body = $this->plaintextStorage->loadTextFromFile($file, 'eng+fra');
$body = $this->localFileStorage->load($file, 'eng+fra');

if (!$body || $force) {
// When the OCR'ed file is not available, then run Tika over it
// and store it for the next run.
$body = $this->fileToText->fromFileToText($file, 'eng+fra');
$this->plaintextStorage->saveTextToFile($file, $body, 'eng+fra');
$this->localFileStorage->save($file, $body, 'eng+fra');
}

$progressbar_objects->advance();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
use Drupal\file\Entity\File;

/**
* Provide Capabilities to store a Text content to plain-text file.
* Provide Capabilities to store a Text content into local plain-text file.
*/
class PlaintextStorage {
class LocalFileStorage implements StorageInterface {

public const DESTINATION = 'private://entity-to-text/ocr';

Expand All @@ -37,7 +37,7 @@ class PlaintextStorage {
protected $streamWrapperManager;

/**
* Construct a new PlaintextStorage object.
* Construct a new LocalFileStorage object.
*/
public function __construct(FileSystemInterface $file_system, LoggerChannelFactoryInterface $logger_factory, StreamWrapperManagerInterface $stream_wrapper_manager) {
$this->fileSystem = $file_system;
Expand All @@ -46,17 +46,9 @@ public function __construct(FileSystemInterface $file_system, LoggerChannelFacto
}

/**
* Store a plain text value into a file.
*
* @param \Drupal\file\Entity\File $file
* The document.
* @param string $langcode
* The OCR langcode to be used.
*
* @return string|null
* The transformed file into a plain text value by Apache Tika.
* {@inheritdoc}
*/
public function loadTextFromFile(File $file, string $langcode = 'eng'): ?string {
public function load(File $file, string $langcode = 'eng'): ?string {
$fullpath = $this->getFullPath($file, $langcode);

if (!is_file($fullpath)) {
Expand All @@ -67,19 +59,11 @@ public function loadTextFromFile(File $file, string $langcode = 'eng'): ?string
}

/**
* Store a plain text value into a file.
*
* @param \Drupal\file\Entity\File $file
* The document to be saved.
* @param string $content
* The plain-text document to be stored.
* @param string $langcode
* The langcode.
*
* @return string
* The saved fullpath file.
* {@inheritdoc}
*/
public function saveTextToFile(File $file, string $content, string $langcode = 'eng'): string {
public function save(File $file, string $content, string $langcode = 'eng'): string {
$this->prepareDestination();

$fullpath = $this->getFullPath($file, $langcode);
file_put_contents($fullpath, $content);
return $fullpath;
Expand All @@ -97,8 +81,6 @@ public function saveTextToFile(File $file, string $content, string $langcode = '
* The given file unique fullpath.
*/
private function getFullPath(File $file, string $langcode = 'eng'): string {
$this->prepareDestination();

$uri = self::DESTINATION;
$filename = $file->id() . '-' . $file->getFilename() . '.' . $langcode . '.ocr.txt';

Expand Down
43 changes: 43 additions & 0 deletions modules/entity_to_text_tika/src/Storage/StorageInterface.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
<?php

namespace Drupal\entity_to_text_tika\Storage;

use Drupal\file\Entity\File;

/**
* Contract for OCR text storage backends.
*
* Every class that stores and retrieves OCR text values must implement
* this interface.
*/
interface StorageInterface {

/**
* Load an OCR text value from a storage backend (file, database ...).
*
* @param \Drupal\file\Entity\File $file
* The document whose OCR text value should be loaded.
* @param string $langcode
* The OCR language code the text value was stored under.
*
* @return string|null
* The OCR plain-text value for the given file, or NULL when none is
* available.
*/
public function load(File $file, string $langcode = 'eng'): ?string;

/**
* Store an OCR text value into a storage backend (file, database ...).
*
* @param \Drupal\file\Entity\File $file
* The document the OCR text value belongs to.
* @param string $content
* The plain-text OCR value to be stored.
* @param string $langcode
* The OCR language code the text value must be stored under.
*
* @return string
* The full path of the saved file.
*/
public function save(File $file, string $content, string $langcode = 'eng'): string;

}
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public function testStatusPageGood() {
$this->drupalGet('admin/reports/status');
$this->assertSession()->statusCodeEquals(200);

$this->assertSession()->pageTextContains('Entity to Text (Tika): Private schema');
$this->assertSession()->pageTextContains('Entity to Text (Tika): Local File Storage (OCR cache)');
$this->assertSession()->pageTextContains('Private file system is set and writtable.');
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,21 @@
namespace Drupal\Tests\entity_to_text_tika\Kernel;

use Drupal\Core\File\FileSystemInterface;
use Drupal\entity_to_text_tika\Storage\PlaintextStorage;
use Drupal\entity_to_text_tika\Storage\LocalFileStorage;
use Drupal\file\Entity\File;
use Drupal\KernelTests\Core\File\FileTestBase;

/**
* Tests the Local File Storage.
*
* @coversDefaultClass \Drupal\entity_to_text_tika\Storage\PlaintextStorage
* @coversDefaultClass \Drupal\entity_to_text_tika\Storage\LocalFileStorage
*
* @group entity_to_text
* @group entity_to_text_tika
*
* @internal
*/
final class PlaintextStorageTest extends FileTestBase {
final class LocalFileStorageTest extends FileTestBase {

/**
* {@inheritdoc}
Expand All @@ -34,9 +34,9 @@ final class PlaintextStorageTest extends FileTestBase {
/**
* The plain-text storage processor.
*
* @var \Drupal\entity_to_text_tika\Storage\PlaintextStorage
* @var \Drupal\entity_to_text_tika\Storage\LocalFileStorage
*/
protected PlaintextStorage $plaintextStorage;
protected LocalFileStorage $localFileStorage;

/**
* {@inheritdoc}
Expand All @@ -49,16 +49,16 @@ protected function setUp(): void {
$this->installSchema('file', ['file_usage']);

$this->fileSystem = $this->container->get('file_system');
$this->plaintextStorage = $this->container->get('entity_to_text_tika.storage.plain_text');
$this->localFileStorage = $this->container->get('entity_to_text_tika.storage.local_file');

$destination = PlaintextStorage::DESTINATION;
$destination = LocalFileStorage::DESTINATION;
$this->fileSystem->prepareDirectory($destination, FileSystemInterface::CREATE_DIRECTORY | FileSystemInterface::MODIFY_PERMISSIONS);
}

/**
* @covers ::loadTextFromFile
* @covers ::load
*/
public function testLoadTextFromFile(): void {
public function testloadPublic(): void {
// Create an OCR file for testing.
$file_uri = $this->createUri('390-foo.txt.en.ocr.txt', 'Ipsum excepteur id cupidatat commodo', 'private');
$this->fileSystem->move($file_uri, 'private://entity-to-text/ocr/390-foo.txt.en.ocr.txt', FileSystemInterface::EXISTS_REPLACE);
Expand All @@ -70,13 +70,49 @@ public function testLoadTextFromFile(): void {
]);
$file->set('fid', 390);

self::assertEquals('Ipsum excepteur id cupidatat commodo', $this->plaintextStorage->loadTextFromFile($file, 'en'));
self::assertEquals('Ipsum excepteur id cupidatat commodo', $this->localFileStorage->load($file, 'en'));
}

/**
* @covers ::loadTextFromFile
* @covers ::load
*/
public function testLoadTextFromFileWhenOcrFileNotExists(): void {
public function testloadPrivate(): void {
// Create an OCR file for testing.
$file_uri = $this->createUri('390-foo.txt.en.ocr.txt', 'Ipsum excepteur id cupidatat commodo', 'private');
$this->fileSystem->move($file_uri, 'private://entity-to-text/ocr/390-foo.txt.en.ocr.txt', FileSystemInterface::EXISTS_REPLACE);

// Create a file that corresponds to the previous OCR file.
$file = File::create([
'uri' => 'private://foo.txt',
'name' => 'foo',
]);
$file->set('fid', 390);

self::assertEquals('Ipsum excepteur id cupidatat commodo', $this->localFileStorage->load($file, 'en'));
}

/**
* @covers ::load
*/
public function testloadSubDirectory(): void {
// Create an OCR file for testing.
$file_uri = $this->createUri('420-foo.txt.en.ocr.txt', 'Ipsum excepteur id cupidatat commodo', 'private');
$this->fileSystem->move($file_uri, 'private://entity-to-text/ocr/420-foo.txt.en.ocr.txt', FileSystemInterface::EXISTS_REPLACE);

// Create a file that corresponds to the previous OCR file.
$file = File::create([
'uri' => 'public://documents/2024/foo.txt',
'name' => 'foo',
]);
$file->set('fid', 420);

self::assertEquals('Ipsum excepteur id cupidatat commodo', $this->localFileStorage->load($file, 'en'));
}

/**
* @covers ::load
*/
public function testloadWhenOcrFileNotExists(): void {
// Create a file that has not been processed yet and
// therefore does not have an associated OCR file.
$file = File::create([
Expand All @@ -86,30 +122,30 @@ public function testLoadTextFromFileWhenOcrFileNotExists(): void {
$file->set('fid', 380);

// When the OCR file does not exist, then nothing can be retrieved.
self::assertNull($this->plaintextStorage->loadTextFromFile($file, 'en'));
self::assertNull($this->localFileStorage->load($file, 'en'));
}

/**
* @covers ::saveTextToFile
* @covers ::save
*/
public function testSaveTextToFile(): void {
public function testSave(): void {
// Create a file for testing.
$file = File::create([
'uri' => $this->createUri('foo.txt', 'veniam consequat duis'),
'name' => 'foo',
]);
$file->set('fid', 399);

$file_path = $this->plaintextStorage->saveTextToFile($file, 'veniam consequat duis', 'en');
$file_path = $this->localFileStorage->save($file, 'veniam consequat duis', 'en');
self::assertStringEndsWith('private/entity-to-text/ocr/399-foo.txt.en.ocr.txt', $file_path);
self::assertFileExists($file_path);
self::assertEquals('veniam consequat duis', file_get_contents($file_path));
}

/**
* @covers ::saveTextToFile
* @covers ::save
*/
public function testSaveTextToFileWhenOcrFileAlreadyExists(): void {
public function testSaveWhenOcrFileAlreadyExists(): void {
// Create an OCR file for testing.
$file_ocr_uri = $this->createUri('400-foo.txt.en.ocr.txt', 'Ipsum excepteur id cupidatat commodo', 'private');
$this->fileSystem->move($file_ocr_uri, 'private://entity-to-text/ocr/400-foo.txt.en.ocr.txt', FileSystemInterface::EXISTS_REPLACE);
Expand All @@ -122,7 +158,7 @@ public function testSaveTextToFileWhenOcrFileAlreadyExists(): void {
$file->set('fid', 400);

// When the file already exists, it will be overridden.
$file_path = $this->plaintextStorage->saveTextToFile($file, 'veniam consequat duis', 'en');
$file_path = $this->localFileStorage->save($file, 'veniam consequat duis', 'en');
self::assertStringEndsWith('private/entity-to-text/ocr/400-foo.txt.en.ocr.txt', $file_path);
self::assertFileExists($file_path);
self::assertEquals('veniam consequat duis', file_get_contents($file_path));
Expand Down
Loading

0 comments on commit 270a614

Please sign in to comment.