Skip to content

Commit

Permalink
add a new layer of performance by allowing developers to cache OCR'ed…
Browse files Browse the repository at this point in the history
… files
  • Loading branch information
WengerK committed Mar 27, 2024
1 parent f2498ab commit ff0e565
Show file tree
Hide file tree
Showing 9 changed files with 699 additions and 3 deletions.
6 changes: 3 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- increase timeout to 20sec

### Fixed
- fix D10 deprecations: Creation of dynamic property is deprecated

### Added
- add event PRE_PROCESS_FILE to allow client or file alteration before Tika OCR
- add a new layer of performance by allowing developers to cache OCR'ed files

## [1.0.0] - 2023-01-27
### Added
- init module
Expand Down
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,20 @@ $file = $file_item->entity;
$body = \Drupal::service('entity_to_text_tika.extractor.file_to_text')->fromFileToText($file, 'eng+fra');
```

or for an advanced usage avoiding multiple calls to Tika:

```php
// Load the already OCR'ed file if possible to avoid unnecessary calls to Tika.
$body = \Drupal::service('entity_to_text_tika.storage.plain_text')->loadTextFromFile($file, 'eng+fra');

if (!$body) {
// When the OCR'ed file is not available, then run Tika over it and store it for the next run.
$body = \Drupal::service('entity_to_text_tika.extractor.file_to_text')->fromFileToText($file, 'eng+fra');
// Save the OCR'ed file for the next run.
\Drupal::service('entity_to_text_tika.storage.plain_text')->saveTextToFile($file, $body, 'eng+fra');
}
```

## Supporting organizations

This project is sponsored by [Antistatique](https://www.antistatique.net), a Swiss Web Agency.
Expand Down
45 changes: 45 additions & 0 deletions modules/entity_to_text_tika/entity_to_text_tika.install
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
<?php

/**
* @file
* Install, update and uninstall functions for the Entity to Text Tika sub-module.
*/

/**
 * Implements hook_requirements().
 *
 * Reports on the status page whether the private:// stream wrapper is
 * registered and whether its resolved directory exists and is writable.
 *
 * @param string $phase
 *   The phase in which requirements are checked. Only 'runtime' is reported;
 *   every other phase returns an empty array.
 *
 * @return array
 *   The requirements keyed by 'entity_to_text_tika_private', or an empty
 *   array outside of the runtime phase.
 */
function entity_to_text_tika_requirements($phase = 'runtime') {
  $requirements = [];

  // Nothing is reported outside of the status page, so bail out before
  // touching any service.
  if ($phase !== 'runtime') {
    return $requirements;
  }

  $stream_wrapper_manager = \Drupal::service('stream_wrapper_manager');
  $file_system = \Drupal::service('file_system');

  /* ************************************************************************ */
  // Check private file directory.
  /* ************************************************************************ */

  // Start from the successful state; the checks below only downgrade it.
  $requirements['entity_to_text_tika_private'] = [
    'title' => t('Entity to Text (Tika): Private schema'),
    'value' => t('Private file system is set and writtable.'),
  ];

  // Check if the private file stream wrapper is ready to use.
  if (!$stream_wrapper_manager->isValidScheme('private')) {
    $requirements['entity_to_text_tika_private']['value'] = t('Private file system is not set.');
    $requirements['entity_to_text_tika_private']['description'] = t('Entity to Text Tika will store OCR of document in the private:// schema, there this one must be configured before installing.');
    $requirements['entity_to_text_tika_private']['severity'] = REQUIREMENT_ERROR;

    // Without a registered scheme there is no directory to inspect. Returning
    // here keeps the "not set" message from being overwritten by the
    // writability check below.
    return $requirements;
  }

  // Check if the resolved private directory exists and can be written to.
  $private_path = $file_system->realpath('private://');
  if ($private_path === FALSE || !is_dir($private_path) || !is_writable($private_path)) {
    $requirements['entity_to_text_tika_private']['value'] = t('Private file system is not writtable.');
    $requirements['entity_to_text_tika_private']['description'] = t('The resolved private directory %directory% seems not writable.', ['%directory%' => $private_path]);
    $requirements['entity_to_text_tika_private']['severity'] = REQUIREMENT_ERROR;
  }

  return $requirements;
}
8 changes: 8 additions & 0 deletions modules/entity_to_text_tika/entity_to_text_tika.services.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,11 @@ services:
- '@file_system'
- '@logger.factory'
- '@event_dispatcher'

# Storage
entity_to_text_tika.storage.plain_text:
class: Drupal\entity_to_text_tika\Storage\PlaintextStorage
arguments:
- '@file_system'
- '@logger.factory'
- '@stream_wrapper_manager'
126 changes: 126 additions & 0 deletions modules/entity_to_text_tika/src/Storage/PlaintextStorage.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
<?php

namespace Drupal\entity_to_text_tika\Storage;

use Drupal\Core\File\FileSystemInterface;
use Drupal\Core\Logger\LoggerChannelFactoryInterface;
use Drupal\Core\StreamWrapper\StreamWrapperManager;
use Drupal\Core\StreamWrapper\StreamWrapperManagerInterface;
use Drupal\file\Entity\File;

/**
 * Provide Capabilities to store a Text content to plain-text file.
 *
 * Each stored file lives under self::DESTINATION and is named after the
 * source file ID, its filename and the OCR langcode, so the same document can
 * be kept once per language combination.
 */
class PlaintextStorage {

  /**
   * The directory URI where the plain-text files are written.
   */
  public const DESTINATION = 'private://entity-to-text/ocr';

  /**
   * The file system service.
   *
   * @var \Drupal\Core\File\FileSystemInterface
   */
  protected $fileSystem;

  /**
   * The logger service.
   *
   * @var \Drupal\Core\Logger\LoggerChannelInterface
   */
  protected $logger;

  /**
   * The stream wrapper manager.
   *
   * @var \Drupal\Core\StreamWrapper\StreamWrapperManagerInterface
   */
  protected $streamWrapperManager;

  /**
   * Construct a new PlaintextStorage object.
   *
   * @param \Drupal\Core\File\FileSystemInterface $file_system
   *   The file system service.
   * @param \Drupal\Core\Logger\LoggerChannelFactoryInterface $logger_factory
   *   The logger factory, used to get the 'entity_to_text_tika' channel.
   * @param \Drupal\Core\StreamWrapper\StreamWrapperManagerInterface $stream_wrapper_manager
   *   The stream wrapper manager.
   */
  public function __construct(FileSystemInterface $file_system, LoggerChannelFactoryInterface $logger_factory, StreamWrapperManagerInterface $stream_wrapper_manager) {
    $this->fileSystem = $file_system;
    $this->logger = $logger_factory->get('entity_to_text_tika');
    $this->streamWrapperManager = $stream_wrapper_manager;
  }

  /**
   * Load a previously stored plain text value from its file.
   *
   * @param \Drupal\file\Entity\File $file
   *   The document.
   * @param string $langcode
   *   The OCR langcode that was used when the text was stored.
   *
   * @return string|null
   *   The stored plain text value, or NULL when no stored file exists or when
   *   it cannot be read.
   *
   * @throws \RuntimeException
   *   When the destination cannot be resolved to a local path.
   */
  public function loadTextFromFile(File $file, string $langcode = 'eng'): ?string {
    $fullpath = $this->getFullPath($file, $langcode);

    if (!is_file($fullpath)) {
      return NULL;
    }

    $content = file_get_contents($fullpath);
    // A failed read yields FALSE; report it as "nothing stored" rather than
    // letting the return type silently coerce it to an empty string.
    if ($content === FALSE) {
      $this->logger->warning('Unable to read the stored plain-text file "@path".', ['@path' => $fullpath]);
      return NULL;
    }

    return $content;
  }

  /**
   * Store a plain text value into a file.
   *
   * A failed write is logged and does not interrupt the caller.
   *
   * @param \Drupal\file\Entity\File $file
   *   The document to be saved.
   * @param string $content
   *   The plain-text document to be stored.
   * @param string $langcode
   *   The langcode.
   *
   * @return string
   *   The fullpath of the file the content was written to.
   *
   * @throws \RuntimeException
   *   When the destination cannot be resolved to a local path.
   */
  public function saveTextToFile(File $file, string $content, string $langcode = 'eng'): string {
    $fullpath = $this->getFullPath($file, $langcode);

    if (file_put_contents($fullpath, $content) === FALSE) {
      $this->logger->error('Unable to write the plain-text file "@path".', ['@path' => $fullpath]);
    }

    return $fullpath;
  }

  /**
   * Get a normalized fullpath for a given file and langcode.
   *
   * @param \Drupal\file\Entity\File $file
   *   The document.
   * @param string $langcode
   *   The langcode.
   *
   * @return string
   *   The given file unique fullpath.
   *
   * @throws \RuntimeException
   *   When the destination scheme is not a valid stream wrapper or when the
   *   destination URI cannot be resolved to a local path.
   */
  private function getFullPath(File $file, string $langcode = 'eng'): string {
    $uri = self::DESTINATION;

    // Validate the scheme before touching the filesystem, so a missing
    // stream wrapper fails with an explicit message.
    $scheme = StreamWrapperManager::getScheme($uri);
    if (!$this->streamWrapperManager->isValidScheme($scheme)) {
      throw new \RuntimeException('The destination path is not a valid stream wrapper.');
    }

    $this->prepareDestination();

    $path = $this->fileSystem->realpath($uri);
    if (!$path) {
      throw new \RuntimeException(sprintf('The resolved realpath from uri "%s" is not a valid directory.', $uri));
    }

    // The file ID keeps the name unique; the langcode separates the OCR
    // results of the same document in different languages.
    $filename = $file->id() . '-' . $file->getFilename() . '.' . $langcode . '.ocr.txt';

    return $path . '/' . $filename;
  }

  /**
   * Ensure the destination directory is ready to use.
   *
   * A failure is only logged here; the caller decides whether the
   * destination is usable.
   */
  private function prepareDestination(): void {
    // prepareDirectory() takes its first argument by reference, hence the
    // local copy of the constant.
    $dest = self::DESTINATION;
    if (!$this->fileSystem->prepareDirectory($dest, FileSystemInterface::CREATE_DIRECTORY)) {
      $this->logger->warning('Unable to prepare the destination directory "@uri".', ['@uri' => $dest]);
    }
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
<?php

namespace Drupal\Tests\entity_to_text_tika\Functional;

use Drupal\Tests\system\Functional\Module\ModuleTestBase;

/**
 * Covers enabling and removing the sub-module through the admin UI.
 *
 * @group entity_to_text
 * @group entity_to_text_tika
 * @group entity_to_text_tika_functional
 */
class InstallUninstallTest extends ModuleTestBase {

  /**
   * {@inheritdoc}
   */
  protected $defaultTheme = 'starterkit_theme';

  /**
   * The sub-module can be enabled from the module listing form.
   */
  public function testInstall(): void {
    // The parent module has to be present beforehand.
    $this->container->get('module_installer')->install(['entity_to_text']);
    // The sub-module must start out disabled.
    $this->assertModules(['entity_to_text_tika'], FALSE);

    // Tick both modules on the listing form and submit it.
    $this->drupalGet('admin/modules');
    $this->submitForm([
      'modules[entity_to_text][enable]' => 'entity_to_text',
      'modules[entity_to_text_tika][enable]' => 'entity_to_text_tika',
    ], 'Install');

    $this->assertSession()->pageTextContains('Module Entity to Text - Tika has been enabled.');

    // The sub-module is now reported as enabled.
    $this->assertModules(['entity_to_text_tika'], TRUE);
  }

  /**
   * The sub-module can be removed from the uninstall form.
   */
  public function testUninstall(): void {
    // Install the parent module first, then the sub-module.
    foreach (['entity_to_text', 'entity_to_text_tika'] as $module) {
      $this->container->get('module_installer')->install([$module]);
    }

    // Select the entity_to_text_tika module on the uninstall form.
    $this->drupalGet('admin/modules/uninstall');
    $this->submitForm(['uninstall[entity_to_text_tika]' => TRUE], 'Uninstall');
    // Submit the confirmation step.
    $this->submitForm([], 'Uninstall');
    $this->assertSession()->responseContains('The selected modules have been uninstalled.');

    // The sub-module is no longer enabled.
    $this->assertModules(['entity_to_text_tika'], FALSE);
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
<?php

namespace Drupal\Tests\entity_to_text_tika\Functional;

use Drupal\Tests\BrowserTestBase;

/**
 * Checks what the module reports on the site status page.
 *
 * @group entity_to_text
 * @group entity_to_text_tika
 * @group entity_to_text_tika_functional
 */
class RequirementsTest extends BrowserTestBase {

  /**
   * {@inheritdoc}
   */
  protected $defaultTheme = 'starterkit_theme';

  /**
   * {@inheritdoc}
   */
  protected static $modules = ['entity_to_text_tika'];

  /**
   * A user holding the 'administer site configuration' permission.
   *
   * @var \Drupal\user\UserInterface
   */
  protected $adminUser;

  /**
   * {@inheritdoc}
   */
  protected function setUp(): void {
    parent::setUp();

    $permissions = ['administer site configuration'];
    $this->adminUser = $this->drupalCreateUser($permissions);
  }

  /**
   * The status page shows the successful private file system message.
   */
  public function testStatusPageGood() {
    $this->drupalLogin($this->adminUser);
    $this->drupalGet('admin/reports/status');

    $assert = $this->assertSession();
    $assert->statusCodeEquals(200);

    // Both the requirement title and its value must be displayed.
    $assert->pageTextContains('Entity to Text (Tika): Private schema');
    $assert->pageTextContains('Private file system is set and writtable.');
  }

}
Loading

0 comments on commit ff0e565

Please sign in to comment.