From 3af0d323d579d619f9e26727bc7e3d456f3c2f61 Mon Sep 17 00:00:00 2001 From: Bozana Bokan Date: Fri, 19 Jul 2024 15:30:18 +0200 Subject: [PATCH] pkp/pkp-lib#9911 Consider new URLs with language, and create canonical URLs in ConvertApacheAccessLogFile CLI tool --- classes/cliTool/traits/ConvertLogFile.php | 96 +++++++++++++++++++---- tools/convertApacheAccessLogFile.php | 79 ++++++++++++++++--- 2 files changed, 148 insertions(+), 27 deletions(-) diff --git a/classes/cliTool/traits/ConvertLogFile.php b/classes/cliTool/traits/ConvertLogFile.php index 747928227af..84a21fcc450 100644 --- a/classes/cliTool/traits/ConvertLogFile.php +++ b/classes/cliTool/traits/ConvertLogFile.php @@ -26,9 +26,11 @@ use APP\statistics\StatisticsHelper; use DateTime; use Exception; +use PKP\config\Config; use PKP\core\Core; use PKP\core\Registry; use PKP\db\DAORegistry; +use PKP\facades\Locale; use PKP\file\FileManager; use PKP\submission\Genre; @@ -169,7 +171,6 @@ public function convert(string $fileName): void } $newEntry['userAgent'] = $entryData['userAgent']; - $newEntry['canonicalUrl'] = $entryData['url']; [ 'workingAssocType' => $assocType, @@ -188,13 +189,21 @@ public function convert(string $fileName): void $context = $this->contextsByPath[$foundContextPath]; $newEntry['contextId'] = $context->getId(); - $this->setAssoc($assocType, $op, $args, $newEntry); + $this->setAssoc($assocType, $page, $op, $args, $newEntry); if (!array_key_exists('assocType', $newEntry)) { if (!$this->isApacheAccessLogFile()) { fwrite(STDERR, "The URL {$entryData['url']} in the line number {$lineNumber} was not considered." . PHP_EOL); } continue; } + + $canonicalUrl = $entryData['url']; // if this is not the apache log file i.e. it is the internal log file, the URLs are already canonical + if ($this->isApacheAccessLogFile()) { + $canonicalUrl = $this->getCanonicalUrl($foundContextPath, $newEntry['canonicalUrlPage'], $newEntry['canonicalUrlOp'], $newEntry['canonicalUrlArgs'] ?? 
null); + // unset elements that are temporarily used and should not be logged + unset($newEntry['canonicalUrlPage'], $newEntry['canonicalUrlOp'], $newEntry['canonicalUrlArgs']); + } + $newEntry['canonicalUrl'] = $canonicalUrl; } else { continue; } @@ -408,12 +417,14 @@ protected function getExpectedPageAndOp(): array break; case 'omp': // Before 3.4 OMP did not have chapter assoc type i.e. chapter landing page - // so no need to consider it here + // consider it here however, in order to allow current apache access log file conversion $pageAndOp = $pageAndOp + [ Application::ASSOC_TYPE_SUBMISSION_FILE => [ 'catalog/download'], Application::ASSOC_TYPE_MONOGRAPH => [ 'catalog/book'], + Application::ASSOC_TYPE_CHAPTER => [ + 'catalog/book'], Application::ASSOC_TYPE_SERIES => [ 'catalog/series'] ]; @@ -478,8 +489,8 @@ protected static function getContextPaths(string $urlInfo, bool $isPathInfo): ar */ protected static function getPage(string $urlInfo, bool $isPathInfo): string { - $page = self::getUrlComponents($urlInfo, $isPathInfo, 0, 'page'); - return Core::cleanFileVar(is_null($page) ? '' : $page); + $page = self::getUrlComponents($urlInfo, $isPathInfo, self::getOffset($urlInfo, $isPathInfo, 0), 'page'); + return Core::cleanFileVar($page ?? ''); } /** @@ -489,8 +500,8 @@ protected static function getPage(string $urlInfo, bool $isPathInfo): string */ protected static function getOp(string $urlInfo, bool $isPathInfo): string { - $operation = self::getUrlComponents($urlInfo, $isPathInfo, 1, 'op'); - return Core::cleanFileVar(empty($operation) ? 
'index' : $operation); + $operation = self::getUrlComponents($urlInfo, $isPathInfo, self::getOffset($urlInfo, $isPathInfo, 1), 'op'); + return Core::cleanFileVar($operation ?: 'index'); } /** @@ -501,14 +512,32 @@ protected static function getOp(string $urlInfo, bool $isPathInfo): string */ protected static function getArgs(string $urlInfo, bool $isPathInfo): array { - return self::getUrlComponents($urlInfo, $isPathInfo, 2, 'path'); + return self::getUrlComponents($urlInfo, $isPathInfo, self::getOffset($urlInfo, $isPathInfo, 2), 'path'); + } + + /** + * Get offset. Add 1 extra if localization present in URL + */ + private static function getOffset(string $urlInfo, bool $isPathInfo, int $varOffset): int + { + return $varOffset + (int) !!self::getLocalization($urlInfo, $isPathInfo); + } + + /** + * Get localization path present into the passed + * url information. + */ + public static function getLocalization(string $urlInfo, bool $isPathInfo): string + { + $locale = self::getUrlComponents($urlInfo, $isPathInfo, 0); + return Locale::isLocaleValid($locale) ? $locale : ''; } /** * Get url components (page, operation and args) * based on the passed offset. 
*/ - protected static function getUrlComponents(string $urlInfo, bool $isPathInfo, int $offset, string $varName = ''): mixed + protected static function getUrlComponents(string $urlInfo, bool $isPathInfo, int $offset, string $varName = ''): array|string|null { $component = null; @@ -517,7 +546,6 @@ protected static function getUrlComponents(string $urlInfo, bool $isPathInfo, in $isArrayComponent = true; } if ($isPathInfo) { - $application = Application::get(); $contextDepth = 1; // Was $application->getContextDepth(); $vars = explode('/', trim($urlInfo, '/')); @@ -544,22 +572,56 @@ protected static function getUrlComponents(string $urlInfo, bool $isPathInfo, in return $component; } + /** + * Construct the URL from context path, page, op, and params + */ + protected function getCanonicalUrl(string $contextPath, string $canonicalUrlPage, string $canonicalUrlOp, ?array $canonicalUrlArgs = null): string + { + $canonicalUrl = Application::get()->getDispatcher()->url( + Application::get()->getRequest(), + Application::ROUTE_PAGE, + $contextPath, + $canonicalUrlPage, + $canonicalUrlOp, + $canonicalUrlArgs, + urlLocaleForPage: '' + ); + + // Make sure we log the server name and not aliases. + $configBaseUrl = Config::getVar('general', 'base_url'); + $requestBaseUrl = Application::get()->getRequest()->getBaseUrl(); + if ($requestBaseUrl !== $configBaseUrl) { + // Make sure it's not an url override (no alias on that case). + if (!in_array($requestBaseUrl, Config::getContextBaseUrls(), true) && + $requestBaseUrl !== Config::getVar('general', 'base_url[index]')) { + // Alias found, replace it by base_url from config file. + // Make sure we use the correct base url override value for the context, if any. + $baseUrlReplacement = Config::getVar('general', 'base_url[' . $contextPath . 
']'); + if (!$baseUrlReplacement) { + $baseUrlReplacement = $configBaseUrl; + } + $canonicalUrl = str_replace($requestBaseUrl, $baseUrlReplacement, $canonicalUrl); + } + } + return $canonicalUrl; + } + /** * Set assoc type and IDs from the passed page, operation and arguments. */ - protected function setAssoc(int $assocType, string $op, array $args, array &$newEntry): void + protected function setAssoc(int $assocType, string $page, string $op, array $args, array &$newEntry): void { $application = Application::get(); $applicationName = $application->getName(); switch ($applicationName) { case 'ojs2': - $this->setOJSAssoc($assocType, $args, $newEntry); + $this->setOJSAssoc($assocType, $page, $op, $args, $newEntry); break; case 'omp': - $this->setOMPAssoc($assocType, $args, $newEntry); + $this->setOMPAssoc($assocType, $page, $op, $args, $newEntry); break; case 'ops': - $this->setOPSAssoc($assocType, $args, $newEntry); + $this->setOPSAssoc($assocType, $page, $op, $args, $newEntry); break; default: throw new Exception('Unrecognized application name!'); @@ -570,7 +632,7 @@ protected function setAssoc(int $assocType, string $op, array $args, array &$new * Set assoc type and IDs from the passed page, operation and * arguments specific to OJS. */ - protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): void + protected function setOJSAssoc(int $assocType, string $page, string $op, array $args, array &$newEntry): void { switch ($assocType) { case Application::getContextAssocType(): @@ -813,7 +875,7 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v * Set assoc type and IDs from the passed page, operation and * arguments specific to OMP. 
*/ - protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): void + protected function setOMPAssoc(int $assocType, string $page, string $op, array $args, array &$newEntry): void { switch ($assocType) { case Application::getContextAssocType(): @@ -966,7 +1028,7 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v * Set assoc type and IDs from the passed page, operation and * arguments specific to OPS. */ - protected function setOPSAssoc(int $assocType, array $args, array &$newEntry): void + protected function setOPSAssoc(int $assocType, string $page, string $op, array $args, array &$newEntry): void { switch ($assocType) { case Application::getContextAssocType(): diff --git a/tools/convertApacheAccessLogFile.php b/tools/convertApacheAccessLogFile.php index e5eadf46f3c..83982ad9547 100644 --- a/tools/convertApacheAccessLogFile.php +++ b/tools/convertApacheAccessLogFile.php @@ -120,7 +120,7 @@ public function __construct(array $argv = []) } // This tool needs egrep path configured. - if (file_exists(self::EGREP_PATH)) { + if (!file_exists(self::EGREP_PATH)) { fwrite(STDERR, 'Error: This tool needs egrep program. Please define the constatn EGREP_PATH in this script, enter there the path to egrep command on your machine.' . PHP_EOL); exit(9); } @@ -156,7 +156,7 @@ public function isApacheAccessLogFile(): bool */ public function usage() { - echo "\nConvert the passed apache access log file into the new usage stats log file format. + echo "\nConvert the apache access log file into the new usage stats log file format. 
This will copy the apache access file to the usageStats/tmp/ folder in the files directory, filter entries related to this installation, split the file by day, rename the result file(s) into apache_usage_events_YYYYMMDD.log, convert them into the new JSON format, and @@ -304,6 +304,8 @@ public function splitFileByDay(string $filePath): array } // Get all days between the first and the last date, including the last date + $firstDate->setTime(0, 0, 0); + $lastDate->setTime(0, 0, 1); $period = new DatePeriod( $firstDate, new DateInterval('P1D'), @@ -411,19 +413,19 @@ protected function getExpectedPageAndOp(): array /** * Set assoc type and IDs from the passed page, operation and arguments. */ - protected function setAssoc(int $assocType, string $op, array $args, array &$newEntry): void + protected function setAssoc(int $assocType, string $page, string $op, array $args, array &$newEntry): void { $application = Application::get(); $applicationName = $application->getName(); switch ($applicationName) { case 'ojs2': - $this->setOJSAssoc($assocType, $args, $newEntry); + $this->setOJSAssoc($assocType, $page, $op, $args, $newEntry); break; case 'omp': - $this->setOMPAssoc($assocType, $args, $newEntry); + $this->setOMPAssoc($assocType, $page, $op, $args, $newEntry); break; case 'ops': - $this->setOPSAssoc($assocType, $args, $newEntry); + $this->setOPSAssoc($assocType, $page, $op, $args, $newEntry); break; default: throw new Exception('Unrecognized application name!'); @@ -434,12 +436,14 @@ protected function setAssoc(int $assocType, string $op, array $args, array &$new * Set assoc type and IDs from the passed page, operation and * arguments specific to OJS. 
*/ - protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): void + protected function setOJSAssoc(int $assocType, string $page, string $op, array $args, array &$newEntry): void { switch ($assocType) { case Application::getContextAssocType(): // $newEntry['contextId'] has already been set $newEntry['assocType'] = $assocType; + $newEntry['canonicalUrlPage'] = Application::SITE_CONTEXT_PATH; + $newEntry['canonicalUrlOp'] = ''; break; case Application::ASSOC_TYPE_SUBMISSION: @@ -454,6 +458,7 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v break; } $submissionId = $submission->getId(); + $newEntry['canonicalUrlArgs'] = [$submissionId]; // If it is an older submission version, the arguments must be: // $submissionId/version/$publicationId. @@ -470,6 +475,7 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v fwrite(STDERR, "Publication (submission version) with the ID {$publicationId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL); break; } + array_push($newEntry['canonicalUrlArgs'], 'version', $publicationId); } elseif (count($args) == 2) { // Consider usage stats log files from releases 2.x: // The URL article/view/{$articleId}/{$galleyId} was used for assoc type galley. 
@@ -523,10 +529,16 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v $newEntry['representationId'] = $representationId; $newEntry['submissionFileId'] = $submissionFileId; $newEntry['fileType'] = $fileType; + + $newEntry['canonicalUrlPage'] = 'article'; + $newEntry['canonicalUrlOp'] = 'download'; + array_push($newEntry['canonicalUrlArgs'], $representationId, $submissionFileId); break; } $newEntry['submissionId'] = $submissionId; $newEntry['assocType'] = $assocType; + $newEntry['canonicalUrlPage'] = 'article'; + $newEntry['canonicalUrlOp'] = 'view'; break; case Application::ASSOC_TYPE_SUBMISSION_FILE: @@ -545,6 +557,7 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v break; } $submissionId = $submission->getId(); + $newEntry['canonicalUrlArgs'] = [$submissionId]; // If it is an older submission version, the arguments must be: // $submissionId/version/$publicationId/$representationId/$submissionFileId. @@ -564,6 +577,7 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v if (isset($args[4])) { $submissionFileId = (int) $args[4]; } + array_push($newEntry['canonicalUrlArgs'], 'version', $publicationId); } else { $representationUrlPath = $args[1]; if (isset($args[2])) { @@ -654,6 +668,10 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v $newEntry['representationId'] = $representationId; $newEntry['submissionFileId'] = $submissionFileId; $newEntry['fileType'] = StatisticsHelper::getDocumentType($submissionFile->getData('mimetype')); + + $newEntry['canonicalUrlPage'] = 'article'; + $newEntry['canonicalUrlOp'] = 'download'; + array_push($newEntry['canonicalUrlArgs'], $representationId, $submissionFileId); break; case Application::ASSOC_TYPE_SUBMISSION_FILE_COUNTER_OTHER: @@ -673,6 +691,7 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v break; } $submissionId = $submission->getId(); + 
$newEntry['canonicalUrlArgs'] = [$submissionId]; $galley = $submissionFile = null; $publications = $submission->getData('publications'); @@ -707,6 +726,10 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v $newEntry['representationId'] = $galley->getId(); $newEntry['submissionFileId'] = $submissionFile->getId(); $newEntry['fileType'] = StatisticsHelper::getDocumentType($submissionFile->getData('mimetype')); + + $newEntry['canonicalUrlPage'] = 'article'; + $newEntry['canonicalUrlOp'] = 'download'; + $newEntry['canonicalUrlArgs'] = [$submissionId, $galley->getId(), $submissionFile->getId()]; } else { fwrite(STDERR, 'Supp file could not be found.' . PHP_EOL); } @@ -732,6 +755,10 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v $issueId = $issue->getId(); $newEntry['issueId'] = $issueId; $newEntry['assocType'] = $assocType; + + $newEntry['canonicalUrlPage'] = 'issue'; + $newEntry['canonicalUrlOp'] = 'view'; + $newEntry['canonicalUrlArgs'] = [$issue->getId()]; break; case Application::ASSOC_TYPE_ISSUE_GALLEY: @@ -750,7 +777,7 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v break; } $issueId = $issue->getId(); - $issueGalleyDao = DAORegistry::getDAO('IssueGalleyDAO'); + $issueGalleyDao = DAORegistry::getDAO('IssueGalleyDAO'); /** @var IssueGalleyDAO $issueGalleyDao */ $issueGalley = $issueGalleyDao->getByBestId($args[1], $issueId); if (!$issueGalley) { fwrite(STDERR, "Issue galley with the URL path or ID {$args[1]} does not exist in the issue with the ID {$issueId}." . 
PHP_EOL); @@ -759,6 +786,9 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v $newEntry['issueId'] = $issueId; $newEntry['issueGalleyId'] = $issueGalley->getId(); $newEntry['assocType'] = $assocType; + $newEntry['canonicalUrlPage'] = 'issue'; + $newEntry['canonicalUrlOp'] = 'download'; + $newEntry['canonicalUrlArgs'] = [$issue->getId(), $issueGalley->getId()]; break; } } @@ -767,12 +797,14 @@ protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): v * Set assoc type and IDs from the passed page, operation and * arguments specific to OMP. */ - protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): void + protected function setOMPAssoc(int $assocType, string $page, string $op, array $args, array &$newEntry): void { switch ($assocType) { case Application::getContextAssocType(): // $newEntry['contextId'] has already been set $newEntry['assocType'] = $assocType; + $newEntry['canonicalUrlPage'] = $page; + $newEntry['canonicalUrlOp'] = $page == 'catalog' ? 'index' : ''; break; case Application::ASSOC_TYPE_SUBMISSION: @@ -787,6 +819,7 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v break; } $submissionId = $submission->getId(); + $newEntry['canonicalUrlArgs'] = [$submissionId]; // If it is an older submission version, the arguments must be: // $submissionId/version/$publicationId. @@ -801,6 +834,7 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v fwrite(STDERR, "Publication (submission version) with the ID {$publicationId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL); break; } + array_push($newEntry['canonicalUrlArgs'], 'version', $publicationId); } // Is it a chapter landing page @@ -827,11 +861,14 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v fwrite(STDERR, "Chapter with the ID {$chapterId} does not exist." . 
PHP_EOL); break; } + array_push($newEntry['canonicalUrlArgs'], 'chapter', $chapterId); } $newEntry['submissionId'] = $submissionId; $newEntry['assocType'] = isset($chapter) ? Application::ASSOC_TYPE_CHAPTER : $assocType; - $newEntry['chpaterId'] = isset($chapter) ? $chapter->getId() : null; + $newEntry['chapterId'] = isset($chapter) ? $chapter->getId() : null; + $newEntry['canonicalUrlPage'] = 'catalog'; + $newEntry['canonicalUrlOp'] = 'book'; break; case Application::ASSOC_TYPE_SUBMISSION_FILE: @@ -854,6 +891,7 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v break; } $submissionId = $submission->getId(); + $newEntry['canonicalUrlArgs'] = [$submissionId]; // If it is an older submission version, the arguments must be: // $submissionId/version/$publicationId/$representationId/$submissionFileId. @@ -866,6 +904,7 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v $publicationId = (int) $args[2]; $representationUrlPath = $args[3]; $submissionFileId = (int) $args[4]; + array_push($newEntry['canonicalUrlArgs'], 'version', $publicationId); } else { $representationUrlPath = $args[1]; $submissionFileId = (int) $args[2]; @@ -952,6 +991,10 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v $newEntry['submissionFileId'] = $submissionFileId; $newEntry['fileType'] = StatisticsHelper::getDocumentType($submissionFile->getData('mimetype')); $newEntry['chapterId'] = $submissionFile->getData('chapterId'); + + $newEntry['canonicalUrlPage'] = 'catalog'; + $newEntry['canonicalUrlOp'] = 'download'; + array_push($newEntry['canonicalUrlArgs'], $representationId, $submissionFileId); break; case Application::ASSOC_TYPE_SERIES: @@ -967,6 +1010,10 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v } $newEntry['seriesId'] = $series->getId(); $newEntry['assocType'] = $assocType; + + $newEntry['canonicalUrlPage'] = 'catalog'; + $newEntry['canonicalUrlOp'] = 'series'; + $newEntry['canonicalUrlArgs'] = [$seriesPath]; break; } } @@ -975,12 
+1022,14 @@ protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): v * Set assoc type and IDs from the passed page, operation and * arguments specific to OPS. */ - protected function setOPSAssoc(int $assocType, array $args, array &$newEntry): void + protected function setOPSAssoc(int $assocType, string $page, string $op, array $args, array &$newEntry): void { switch ($assocType) { case Application::getContextAssocType(): // $newEntry['contextId'] has already been set $newEntry['assocType'] = $assocType; + $newEntry['canonicalUrlPage'] = Application::SITE_CONTEXT_PATH; + $newEntry['canonicalUrlOp'] = ''; break; case Application::ASSOC_TYPE_SUBMISSION: @@ -995,6 +1044,7 @@ protected function setOPSAssoc(int $assocType, array $args, array &$newEntry): v break; } $submissionId = $submission->getId(); + $newEntry['canonicalUrlArgs'] = [$submissionId]; // If it is an older submission version, the arguments must be: // $submissionId/version/$publicationId. @@ -1008,9 +1058,12 @@ protected function setOPSAssoc(int $assocType, array $args, array &$newEntry): v fwrite(STDERR, "Publication (submission version) with the ID {$publicationId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL); break; } + array_push($newEntry['canonicalUrlArgs'], 'version', $publicationId); } $newEntry['submissionId'] = $submissionId; $newEntry['assocType'] = $assocType; + $newEntry['canonicalUrlPage'] = 'preprint'; + $newEntry['canonicalUrlOp'] = 'view'; break; case Application::ASSOC_TYPE_SUBMISSION_FILE: @@ -1033,6 +1086,7 @@ protected function setOPSAssoc(int $assocType, array $args, array &$newEntry): v break; } $submissionId = $submission->getId(); + $newEntry['canonicalUrlArgs'] = [$submissionId]; // If it is an older submission version, the arguments must be: // $submissionId/version/$publicationId/$representationId/$submissionFileId. 
@@ -1045,6 +1099,7 @@ protected function setOPSAssoc(int $assocType, array $args, array &$newEntry): v $publicationId = (int) $args[2]; $representationUrlPath = $args[3]; $submissionFileId = (int) $args[4]; + array_push($newEntry['canonicalUrlArgs'], 'version', $publicationId); } else { $representationUrlPath = $args[1]; $submissionFileId = (int) $args[2]; @@ -1126,6 +1181,10 @@ protected function setOPSAssoc(int $assocType, array $args, array &$newEntry): v $newEntry['representationId'] = $representationId; $newEntry['submissionFileId'] = $submissionFileId; $newEntry['fileType'] = StatisticsHelper::getDocumentType($submissionFile->getData('mimetype')); + + $newEntry['canonicalUrlPage'] = 'preprint'; + $newEntry['canonicalUrlOp'] = 'download'; + array_push($newEntry['canonicalUrlArgs'], $representationId, $submissionFileId); break; } }