From a20902c1bf6e0b04aa6f4915566d02d697467ba8 Mon Sep 17 00:00:00 2001 From: Erik Golinelli Date: Fri, 26 Jan 2024 21:54:20 +0100 Subject: [PATCH] b8 files --- cf7-antispam.php | 4 +- core/b8/README.md | 9 + core/b8/README.md.license | 3 + core/b8/b8/b8.php | 391 ++++++++++++++++++++++++++++ core/b8/b8/degenerator/standard.php | 160 ++++++++++++ core/b8/b8/lexer/standard.php | 251 ++++++++++++++++++ core/b8/b8/storage/dba.php | 89 +++++++ core/b8/b8/storage/mysql.php | 95 +++++++ core/b8/b8/storage/sqlite.php | 92 +++++++ core/b8/b8/storage/storage_base.php | 300 +++++++++++++++++++++ package.json | 6 +- phpcs.xml.dist | 1 + readme.txt | 2 +- 13 files changed, 1397 insertions(+), 6 deletions(-) create mode 100644 core/b8/README.md create mode 100644 core/b8/README.md.license create mode 100644 core/b8/b8/b8.php create mode 100644 core/b8/b8/degenerator/standard.php create mode 100644 core/b8/b8/lexer/standard.php create mode 100644 core/b8/b8/storage/dba.php create mode 100644 core/b8/b8/storage/mysql.php create mode 100644 core/b8/b8/storage/sqlite.php create mode 100644 core/b8/b8/storage/storage_base.php diff --git a/cf7-antispam.php b/cf7-antispam.php index d6db6e6..65ebeba 100644 --- a/cf7-antispam.php +++ b/cf7-antispam.php @@ -5,7 +5,7 @@ * Author: Codekraft * Text Domain: cf7-antispam * Domain Path: /languages/ - * Version: 0.6.0 + * Version: 0.6.2 * * @package cf7-antispam */ @@ -18,7 +18,7 @@ /* CONSTANTS */ define( 'CF7ANTISPAM_NAME', 'cf7-antispam' ); -define( 'CF7ANTISPAM_VERSION', '0.6.1' ); +define( 'CF7ANTISPAM_VERSION', '0.6.2' ); define( 'CF7ANTISPAM_PLUGIN', __FILE__ ); diff --git a/core/b8/README.md b/core/b8/README.md new file mode 100644 index 0000000..d0f1aac --- /dev/null +++ b/core/b8/README.md @@ -0,0 +1,9 @@ +# b8 + +## A statistical spam filter implemented in PHP + +b8 is a statistical ("Bayesian") spam filter implemented in PHP. It is intended to keep your weblog or guestbook spam-free. 
The filter can be used anywhere in your PHP code and tells you whether a text is spam or not, using statistical text analysis. + +## Homepage + +The project's official homepage with further information is <https://nasauber.de/opensource/b8/>. diff --git a/core/b8/README.md.license b/core/b8/README.md.license new file mode 100644 index 0000000..3aa041c --- /dev/null +++ b/core/b8/README.md.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2020-2022 Tobias Leupold + +SPDX-License-Identifier: CC-BY-SA-4.0 diff --git a/core/b8/b8/b8.php b/core/b8/b8/b8.php new file mode 100644 index 0000000..c2f7f73 --- /dev/null +++ b/core/b8/b8/b8.php @@ -0,0 +1,391 @@ + +// SPDX-FileCopyrightText: 2006-2021 Tobias Leupold +// +// SPDX-License-Identifier: LGPL-3.0-or-later + +/** + * The b8 spam filter library + * + * @package b8 + */ + +namespace b8; + +spl_autoload_register( + function ($class) { + $parts = explode('\\', $class); + if (count($parts) > 2 && $parts[0] == 'b8') { + require_once __DIR__ . DIRECTORY_SEPARATOR . $parts[1] + . DIRECTORY_SEPARATOR . $parts[2] . 
'.php'; + } + } +); + +class b8 +{ + const DBVERSION = 3; + + const SPAM = 'spam'; + const HAM = 'ham'; + const LEARN = 'learn'; + const UNLEARN = 'unlearn'; + + const CLASSIFIER_TEXT_MISSING = 'CLASSIFIER_TEXT_MISSING'; + + const TRAINER_TEXT_MISSING = 'TRAINER_TEXT_MISSING'; + const TRAINER_CATEGORY_MISSING = 'TRAINER_CATEGORY_MISSING'; + const TRAINER_CATEGORY_FAIL = 'TRAINER_CATEGORY_FAIL'; + + const INTERNALS_TEXTS = 'b8*texts'; + const INTERNALS_DBVERSION = 'b8*dbversion'; + + const KEY_DB_VERSION = 'dbversion'; + const KEY_COUNT_HAM = 'count_ham'; + const KEY_COUNT_SPAM = 'count_spam'; + const KEY_TEXTS_HAM = 'texts_ham'; + const KEY_TEXTS_SPAM = 'texts_spam'; + + private $config = [ 'lexer' => 'standard', + 'degenerator' => 'standard', + 'storage' => 'dba', + 'use_relevant' => 15, + 'min_dev' => 0.2, + 'rob_s' => 0.3, + 'rob_x' => 0.5 ]; + + private $storage = null; + private $lexer = null; + private $degenerator = null; + private $token_data = null; + + /** + * Constructs b8 + * + * @access public + * @param array b8's configuration: [ 'lexer' => string, + 'degenerator' => string, + 'storage' => string, + 'use_relevant' => int, + 'min_dev' => float, + 'rob_s' => float, + 'rob_x' => float ] + * @param array The storage backend's config (depending on the backend used) + * @param array The lexer's config (depending on the lexer used) + * @param array The degenerator's config (depending on the degenerator used) + * @return void + */ + function __construct(array $config = [], + array $config_storage = [], + array $config_lexer = [], + array $config_degenerator = []) + { + // Validate config data + foreach ($config as $name => $value) { + switch ($name) { + case 'min_dev': + case 'rob_s': + case 'rob_x': + $this->config[$name] = (float) $value; + break; + case 'use_relevant': + $this->config[$name] = (int) $value; + break; + case 'lexer': + case 'degenerator': + case 'storage': + $this->config[$name] = (string) $value; + break; + default: + throw new 
\Exception(b8::class . ": Unknown configuration key: \"$name\""); + } + } + + // Setup the degenerator class + $class = '\\b8\\degenerator\\' . $this->config['degenerator']; + $this->degenerator = new $class($config_degenerator); + + // Setup the lexer class + $class = '\\b8\\lexer\\' . $this->config['lexer']; + $this->lexer = new $class($config_lexer); + + // Setup the storage backend + $class = '\\b8\\storage\\' . $this->config['storage']; + $this->storage = new $class($config_storage, $this->degenerator); + } + + /** + * Classifies a text + * + * @access public + * @param string|null The text to classify (null yields b8::CLASSIFIER_TEXT_MISSING) + * @return mixed float The rating between 0 (ham) and 1 (spam) or an error code + */ + public function classify(?string $text = null) + { + // Let's first see if the user called the function correctly + if ($text === null) { + return \b8\b8::CLASSIFIER_TEXT_MISSING; + } + + // Get the internal database variables, containing the number of ham and spam texts so the + // spam probability can be calculated in relation to them + $internals = $this->storage->get_internals(); + + // Calculate the spaminess of all tokens + + // Get all tokens we want to rate + $tokens = $this->lexer->get_tokens($text); + + // Check if the lexer failed (if so, $tokens will be a lexer error code, if not, $tokens + // will be an array) + if (! is_array($tokens)) { + return $tokens; + } + + // Fetch all available data for the token set from the database + $this->token_data = $this->storage->get(array_keys($tokens)); + + // Calculate the spaminess and importance for each token (or a degenerated form of it) + + $word_count = []; + $rating = []; + $importance = []; + + foreach ($tokens as $word => $count) { + $word_count[$word] = $count; + + // Although we call this function only here ... 
let's do the calculation stuff in a + // function to make this a bit less confusing ;-) + $rating[$word] = $this->get_probability($word, $internals); + $importance[$word] = abs(0.5 - $rating[$word]); + } + + // Order by importance + arsort($importance); + reset($importance); + + // Get the most interesting tokens (use all if we have less than the given number) + $relevant = []; + for ($i = 0; $i < $this->config['use_relevant']; $i++) { + if ($token = key($importance)) { + // Important tokens remain + + // If the token's rating is relevant enough, use it + if (abs(0.5 - $rating[$token]) > $this->config['min_dev']) { + // Tokens that appear more than once also count more than once + for ($x = 0, $l = $word_count[$token]; $x < $l; $x++) { + array_push($relevant, $rating[$token]); + } + } + } else { + // We have less words as we want to use, so we already use what we have and can + // break here + break; + } + + next($importance); + } + + // Calculate the spaminess of the text (thanks to Mr. Robinson ;-) + + // We set both haminess and spaminess to 1 for the first multiplying + $haminess = 1; + $spaminess = 1; + + // Consider all relevant ratings + foreach ($relevant as $value) { + $haminess *= (1.0 - $value); + $spaminess *= $value; + } + + // If no token was good for calculation, we really don't know how to rate this text, so + // we can return 0.5 without further calculations. + if ($haminess == 1 && $spaminess == 1) { + return 0.5; + } + + // Calculate the combined rating + + // Get the number of relevant ratings + $n = count($relevant); + + // The actual haminess and spaminess + $haminess = 1 - pow($haminess, (1 / $n)); + $spaminess = 1 - pow($spaminess, (1 / $n)); + + // Calculate the combined indicator + $probability = ($haminess - $spaminess) / ($haminess + $spaminess); + + // We want a value between 0 and 1, not between -1 and +1, so ... 
+ $probability = (1 + $probability) / 2; + + // Alea iacta est + return $probability; + } + + /** + * Calculate the spaminess of a single token also considering "degenerated" versions + * + * @access private + * @param string The word to rate + * @param array The "internals" array + * @return float The word's rating + */ + private function get_probability(string $word, array $internals) + { + // Let's see what we have! + if (isset($this->token_data['tokens'][$word])) { + // The token is in the database, so we can use it's data as-is and calculate the + // spaminess of this token directly + return $this->calculate_probability($this->token_data['tokens'][$word], $internals); + } + + // The token was not found, so do we at least have similar words? + if (isset($this->token_data['degenerates'][$word])) { + // We found similar words, so calculate the spaminess for each one and choose the most + // important one for the further calculation + + // The default rating is 0.5 simply saying nothing + $rating = 0.5; + + foreach ($this->token_data['degenerates'][$word] as $degenerate => $count) { + // Calculate the rating of the current degenerated token + $rating_tmp = $this->calculate_probability($count, $internals); + + // Is it more important than the rating of another degenerated version? + if(abs(0.5 - $rating_tmp) > abs(0.5 - $rating)) { + $rating = $rating_tmp; + } + } + + return $rating; + } else { + // The token is really unknown, so choose the default rating for completely unknown + // tokens. 
This strips down to the robX parameter so we can cheap out the freaky math + // ;-) + return $this->config['rob_x']; + } + } + + /** + * Do the actual spaminess calculation of a single token + * + * @access private + * @param array The token's data [ \b8\b8::KEY_COUNT_HAM => int, + \b8\b8::KEY_COUNT_SPAM => int ] + * @param array The "internals" array + * @return float The rating + */ + private function calculate_probability(array $data, array $internals) + { + // Calculate the basic probability as proposed by Mr. Graham + + // But: consider the number of ham and spam texts saved instead of the number of entries + // where the token appeared to calculate a relative spaminess because we count tokens + // appearing multiple times not just once but as often as they appear in the learned texts. + + $rel_ham = $data[\b8\b8::KEY_COUNT_HAM]; + $rel_spam = $data[\b8\b8::KEY_COUNT_SPAM]; + + if ($internals[\b8\b8::KEY_TEXTS_HAM] > 0) { + $rel_ham = $data[\b8\b8::KEY_COUNT_HAM] / $internals[\b8\b8::KEY_TEXTS_HAM]; + } + + if ($internals[\b8\b8::KEY_TEXTS_SPAM] > 0) { + $rel_spam = $data[\b8\b8::KEY_COUNT_SPAM] / $internals[\b8\b8::KEY_TEXTS_SPAM]; + } + + $rating = $rel_spam / ($rel_ham + $rel_spam); + + // Calculate the better probability proposed by Mr. 
Robinson + $all = $data[\b8\b8::KEY_COUNT_HAM] + $data[\b8\b8::KEY_COUNT_SPAM]; + return (($this->config['rob_s'] * $this->config['rob_x']) + ($all * $rating)) + / ($this->config['rob_s'] + $all); + } + + /** + * Check the validity of the category of a request + * + * @access private + * @param string The category + * @return bool true if the category is b8::HAM or b8::SPAM, otherwise false + */ + private function check_category(string $category) + { + return $category === \b8\b8::HAM || $category === \b8\b8::SPAM; + } + + /** + * Learn a reference text + * + * @access public + * @param string|null The text to learn (null yields b8::TRAINER_TEXT_MISSING) + * @param string|null Either b8::SPAM or b8::HAM (null yields b8::TRAINER_CATEGORY_MISSING) + * @return mixed void or an error code + */ + public function learn(?string $text = null, ?string $category = null) + { + // Let's first see if the user called the function correctly + if ($text === null) { + return \b8\b8::TRAINER_TEXT_MISSING; + } + if ($category === null) { + return \b8\b8::TRAINER_CATEGORY_MISSING; + } + + return $this->process_text($text, $category, \b8\b8::LEARN); + } + + /** + * Unlearn a reference text + * + * @access public + * @param string|null The text to unlearn (null yields b8::TRAINER_TEXT_MISSING) + * @param string|null Either b8::SPAM or b8::HAM (null yields b8::TRAINER_CATEGORY_MISSING) + * @return mixed void or an error code + */ + public function unlearn(?string $text = null, ?string $category = null) + { + // Let's first see if the user called the function correctly + if ($text === null) { + return \b8\b8::TRAINER_TEXT_MISSING; + } + if ($category === null) { + return \b8\b8::TRAINER_CATEGORY_MISSING; + } + + return $this->process_text($text, $category, \b8\b8::UNLEARN); + } + + /** + * Does the actual interaction with the storage backend for learning or unlearning texts + * + * @access private + * @param string The text to process + * @param string Either b8::SPAM or b8::HAM + * @param string Either b8::LEARN or b8::UNLEARN + * @return mixed void or an error code + */ + private function process_text(string $text, string $category, string $action) + { + // Look if the request is okay + if (! 
$this->check_category($category)) { + return \b8\b8::TRAINER_CATEGORY_FAIL; + } + + // Get all tokens from $text + $tokens = $this->lexer->get_tokens($text); + + // Check if the lexer failed (if so, $tokens will be a lexer error code, if not, $tokens + // will be an array) + if (! is_array($tokens)) { + return $tokens; + } + + // Pass the tokens and what to do with it to the storage backend + return $this->storage->process_text($tokens, $category, $action); + } + +} diff --git a/core/b8/b8/degenerator/standard.php b/core/b8/b8/degenerator/standard.php new file mode 100644 index 0000000..6920f30 --- /dev/null +++ b/core/b8/b8/degenerator/standard.php @@ -0,0 +1,160 @@ + +// +// SPDX-License-Identifier: LGPL-3.0-or-later + +/** + * A helper class to derive simplified tokens + * + * @package b8 + */ + +namespace b8\degenerator; + +class standard +{ + public $config = [ 'multibyte' => true, + 'encoding' => 'UTF-8' ]; + + public $degenerates = []; + + /** + * Constructs the degenerator. + * + * @access public + * @param array $config The configuration: [ 'multibyte' => bool, + 'encoding' => string ] + * @return void + */ + public function __construct(array $config) + { + // Validate config data + foreach ($config as $name => $value) { + switch($name) { + case 'multibyte': + $this->config[$name] = (bool) $value; + break; + case 'encoding': + $this->config[$name] = (string) $value; + break; + default: + throw new \Exception(standard::class . ": Unknown configuration key: " + . "\"$name\""); + } + } + } + + /** + * Generates a list of "degenerated" words for a list of words. 
+ * + * @access public + * @param array $words The words to degenerate + * @return array An array containing an array of degenerated tokens for each token + */ + public function degenerate(array $words) + { + $degenerates = []; + + foreach ($words as $word) { + $degenerates[$word] = $this->degenerate_word($word); + } + + return $degenerates; + } + + /** + * Remove duplicates from a list of degenerates of a word. + * + * @access private + * @param string $word The word + * @param array $list The list to process + * @return array The list without duplicates + */ + private function delete_duplicates(string $word, array $list) + { + $list_processed = []; + + // Check each upper/lower version + foreach ($list as $alt_word) { + if ($alt_word != $word) { + array_push($list_processed, $alt_word); + } + } + + return $list_processed; + } + + /** + * Builds a list of "degenerated" versions of a word. + * + * @access private + * @param string $word The word + * @return array An array of degenerated words + */ + private function degenerate_word(string $word) + { + // Check for any stored words so the process doesn't have to repeat + if (isset($this->degenerates[$word]) === true) { + return $this->degenerates[$word]; + } + + // Create different versions of upper and lower case + if ($this->config['multibyte'] === false) { + // The standard upper/lower versions + $lower = strtolower($word); + $upper = strtoupper($word); + $first = substr($upper, 0, 1) . substr($lower, 1, strlen($word)); + } elseif ($this->config['multibyte'] === true) { + // The multibyte upper/lower versions + $lower = mb_strtolower($word, $this->config['encoding']); + $upper = mb_strtoupper($word, $this->config['encoding']); + $first = mb_substr($upper, 0, 1, $this->config['encoding']) + . 
mb_substr($lower, 1, mb_strlen($word), $this->config['encoding']); + } + + // Add the versions + $upper_lower = []; + array_push($upper_lower, $lower); + array_push($upper_lower, $upper); + array_push($upper_lower, $first); + + // Delete duplicate upper/lower versions + $degenerate = $this->delete_duplicates($word, $upper_lower); + + // Append the original word + array_push($degenerate, $word); + + // Degenerate all versions + foreach ($degenerate as $alt_word) { + // Look for stuff like !!! and ??? + if (preg_match('/[!?]$/', $alt_word) > 0) { + // Add versions with different !s and ?s + if (preg_match('/[!?]{2,}$/', $alt_word) > 0) { + $tmp = preg_replace('/([!?])+$/', '$1', $alt_word); + array_push($degenerate, $tmp); + } + + $tmp = preg_replace('/([!?])+$/', '', $alt_word); + array_push($degenerate, $tmp); + } + + // Look for "..." at the end of the word + $alt_word_int = $alt_word; + while (preg_match('/[\.]$/', $alt_word_int) > 0) { + $alt_word_int = substr($alt_word_int, 0, strlen($alt_word_int) - 1); + array_push($degenerate, $alt_word_int); + } + } + + // Some degenerates are the same as the original word. 
These don't have to be fetched, so we + // create a new array with only new tokens + $degenerate = $this->delete_duplicates($word, $degenerate); + + // Store the list of degenerates for the token to prevent unnecessary re-processing + $this->degenerates[$word] = $degenerate; + + return $degenerate; + } + +} diff --git a/core/b8/b8/lexer/standard.php b/core/b8/b8/lexer/standard.php new file mode 100644 index 0000000..d996dad --- /dev/null +++ b/core/b8/b8/lexer/standard.php @@ -0,0 +1,251 @@ + +// SPDX-FileCopyrightText: 2006-2022 Tobias Leupold +// +// SPDX-License-Identifier: LGPL-3.0-or-later + +/** + * A helper class to disassemble a text to tokens + * + * @package b8 + */ + +namespace b8\lexer; + +class standard +{ + const LEXER_TEXT_NOT_STRING = 'LEXER_TEXT_NOT_STRING'; + const LEXER_TEXT_EMPTY = 'LEXER_TEXT_EMPTY'; + + const LEXER_NO_TOKENS = 'b8*no_tokens'; + + private $config = [ 'min_size' => 3, + 'max_size' => 30, + 'get_uris' => true, + 'get_html' => true, + 'get_bbcode' => false, + 'allow_numbers' => false ]; + + private $tokens = null; + private $processed_text = null; + + // The regular expressions we use to split the text to tokens + private $regexp = [ 'raw_split' => '/[\s,\.\/"\:;\|<>\-_\[\]{}\+=\)\(\*\&\^%]+/', + 'ip' => '/([A-Za-z0-9\_\-\.]+)/', + 'uris' => '/([A-Za-z0-9\_\-]*\.[A-Za-z0-9\_\-\.]+)/', + 'html' => '/(<.+?>)/', + 'bbcode' => '/(\[.+?\])/', + 'tagname' => '/(.+?)\s/', + 'numbers' => '/^[0-9]+$/' ]; + + /** + * Constructs the lexer. 
+ * + * @access public + * @param array $config The configuration: [ 'min_size' => int, + * 'max_size' => int, + * 'get_uris' => bool, + * 'get_html' => bool, + * 'get_bbcode' => bool, + * 'allow_numbers' => bool ] + * @return void + */ + function __construct(array $config) + { + // Validate config data + foreach ($config as $name=>$value) { + switch ($name) { + case 'min_size': + case 'max_size': + $this->config[$name] = (int) $value; + break; + case 'allow_numbers': + case 'get_uris': + case 'get_html': + case 'get_bbcode': + $this->config[$name] = (bool) $value; + break; + default: + throw new \Exception(standard::class . ": Unknown configuration key: " + . "\"$name\""); + } + } + } + + /** + * Splits a text to tokens. + * + * @access public + * @param string $text The text to disassemble + * @return mixed Returns a list of tokens or an error code + */ + public function get_tokens(string $text) + { + // Check if we actually have a string ... + if (is_string($text) === false) { + return self::LEXER_TEXT_NOT_STRING; + } + + // ... 
and if it's empty (compared strictly: empty() would also reject the valid text "0") + if ($text === '') { + return self::LEXER_TEXT_EMPTY; + } + + // Re-convert the text to the original characters coded in UTF-8, as they have been coded in + // html entities during the post process + $this->processed_text = html_entity_decode($text, ENT_QUOTES, 'UTF-8'); + + // Reset the token list + $this->tokens = array(); + + if ($this->config['get_uris'] === true) { + // Get URIs + $this->get_uris($this->processed_text); + } + + if ($this->config['get_html'] === true) { + // Get HTML + $this->get_markup($this->processed_text, $this->regexp['html']); + } + + if ($this->config['get_bbcode'] === true) { + // Get BBCode + $this->get_markup($this->processed_text, $this->regexp['bbcode']); + } + + // We always want to do a raw split of the (remaining) text, so: + $this->raw_split($this->processed_text); + + // Be sure not to return an empty array + if (count($this->tokens) == 0) { + $this->tokens[self::LEXER_NO_TOKENS] = 1; + } + + // Return a list of all found tokens + return $this->tokens; + } + + /** + * Validates a token. + * + * @access private + * @param string $token The token string + * @return bool Returns true if the token is valid, otherwise returns false. + */ + private function is_valid(string $token) + { + // Just to be sure that the token's name won't collide with b8's internal variables + if (substr($token, 0, 3) == 'b8*') { + return false; + } + + // Validate the size of the token + $len = strlen($token); + if ($len < $this->config['min_size'] || $len > $this->config['max_size']) { + return false; + } + + // We may want to exclude pure numbers + if ($this->config['allow_numbers'] === false + && preg_match($this->regexp['numbers'], $token) > 0) { + + return false; + } + + // Token is okay + return true; + } + + /** + * Checks the validity of a token and adds it to the token list if it's valid. 
+ * + * @access private + * @param string $token + * @param string|null $word_to_remove Word to remove from the processed string (null: remove nothing) + * @return void + */ + private function add_token(string $token, ?string $word_to_remove = null) + { + // Check the validity of the token + if (! $this->is_valid($token)) { + return; + } + + // Add it to the list or increase its counter + if (! isset($this->tokens[$token])) { + $this->tokens[$token] = 1; + } else { + $this->tokens[$token] += 1; + } + + // If requested, remove the word or its original version from the text + if ($word_to_remove !== null) { + $this->processed_text = str_replace($word_to_remove, '', $this->processed_text); + } + } + + /** + * Gets URIs. + * + * @access private + * @param string $text + * @return void + */ + private function get_uris(string $text) + { + // Find URIs + preg_match_all($this->regexp['uris'], $text, $raw_tokens); + foreach ($raw_tokens[1] as $word) { + // Remove a possible trailing dot + $word = rtrim($word, '.'); + // Try to add the found tokens to the list + $this->add_token($word, $word); + // Also process the parts of the found URIs + $this->raw_split($word); + } + } + + /** + * Gets HTML or BBCode markup, depending on the regexp used. + * + * @access private + * @param string $text + * @param string $regexp + * @return void + */ + private function get_markup(string $text, string $regexp) + { + // Search for the markup + preg_match_all($regexp, $text, $raw_tokens); + foreach ($raw_tokens[1] as $word) { + $actual_word = $word; + + // If the tag has parameters, just use the tag itself + if (strpos($word, ' ') !== false) { + preg_match($this->regexp['tagname'], $word, $match); + $actual_word = $match[1]; + $word = "$actual_word..." . substr($word, -1); + } + + // Try to add the found tokens to the list + $this->add_token($word, $actual_word); + } + } + + /** + * Does a raw split. 
+ * + * @access private + * @param string $text + * @return void + */ + private function raw_split(string $text) + { + foreach (preg_split($this->regexp['raw_split'], $text) as $word) { + // Check the word and add it to the token list if it's valid + $this->add_token($word); + } + } + +} diff --git a/core/b8/b8/storage/dba.php b/core/b8/b8/storage/dba.php new file mode 100644 index 0000000..3e8ab5b --- /dev/null +++ b/core/b8/b8/storage/dba.php @@ -0,0 +1,89 @@ + +// +// SPDX-License-Identifier: LGPL-3.0-or-later + +namespace b8\storage; + +/** + * A Berkeley DB (DBA) storage backend + * + * @package b8 + */ + +class dba extends storage_base +{ + + private $db = null; + + protected function setup_backend(array $config) + { + if (! isset($config['resource']) + || ! ($config['resource'] instanceof \Dba\Connection + || (is_resource($config['resource']) && get_resource_type($config['resource']) === 'dba'))) { + // Accepts a legacy "dba" resource as well as the \Dba\Connection object returned by dba_open() on PHP >= 8.4 + throw new \Exception(dba::class . ": No valid DBA resource passed"); + } + $this->db = $config['resource']; + } + + protected function fetch_token_data(array $tokens) + { + $data = []; + + foreach ($tokens as $token) { + // Try to fetch the raw data in the format "count_ham count_spam" + $count = dba_fetch($token, $this->db); + + if ($count !== false) { + // Split the data by space characters + $split_data = explode(' ', $count); + + // As an internal variable may have just one single value, we have to check for this + $count_ham = isset($split_data[0]) ? (int) $split_data[0] : null; + $count_spam = isset($split_data[1]) ? (int) $split_data[1] : null; + + // Append the parsed data + $data[$token] = [ \b8\b8::KEY_COUNT_HAM => $count_ham, + \b8\b8::KEY_COUNT_SPAM => $count_spam ]; + } + } + + return $data; + } + + private function assemble_count_value(array $count) + { + // Assemble the count data string + $count_value = $count[\b8\b8::KEY_COUNT_HAM] . ' ' . 
$count[\b8\b8::KEY_COUNT_SPAM]; + // Remove whitespace from data of the internal variables + return(rtrim($count_value)); + } + + protected function add_token(string $token, array $count) + { + return dba_insert($token, $this->assemble_count_value($count), $this->db); + } + + protected function update_token(string $token, array $count) + { + return dba_replace($token, $this->assemble_count_value($count), $this->db); + } + + protected function delete_token(string $token) + { + return dba_delete($token, $this->db); + } + + protected function start_transaction() + { + return; + } + + protected function finish_transaction() + { + return; + } + +} diff --git a/core/b8/b8/storage/mysql.php b/core/b8/b8/storage/mysql.php new file mode 100644 index 0000000..b7df2ee --- /dev/null +++ b/core/b8/b8/storage/mysql.php @@ -0,0 +1,95 @@ + +// SPDX-FileCopyrightText: 2006-2021 Tobias Leupold +// +// SPDX-License-Identifier: LGPL-3.0-or-later + +namespace b8\storage; + +/** + * A MySQL storage backend + * + * @package b8 + */ + +class mysql extends storage_base +{ + + private $mysql = null; + private $table = null; + + protected function setup_backend(array $config) + { + if (! isset($config['resource']) + || ! ($config['resource'] instanceof \mysqli)) { + // instanceof is safe for non-objects (get_class() throws a TypeError on PHP 8) and also accepts mysqli subclasses + throw new \Exception(mysql::class . ": No valid mysqli object passed"); + } + $this->mysql = $config['resource']; + + if (! isset($config['table'])) { + throw new \Exception(mysql::class . ": No b8 wordlist table name passed"); + } + $this->table = $config['table']; + } + + protected function fetch_token_data(array $tokens) + { + $data = []; + + $escaped = []; + foreach ($tokens as $token) { + $escaped[] = $this->mysql->real_escape_string($token); + } + $result = $this->mysql->query('SELECT token, count_ham, count_spam' + . ' FROM ' . $this->table + . ' WHERE token IN ' + . "('" . implode("','", $escaped) . 
"')"); + + while ($row = $result->fetch_row()) { + $data[$row[0]] = [ \b8\b8::KEY_COUNT_HAM => $row[1], + \b8\b8::KEY_COUNT_SPAM => $row[2] ]; + } + + $result->free_result(); + + return $data; + } + + protected function add_token(string $token, array $count) + { + $query = $this->mysql->prepare('INSERT INTO ' . $this->table + . '(token, count_ham, count_spam) VALUES(?, ?, ?)'); + $query->bind_param('sii', $token, $count[\b8\b8::KEY_COUNT_HAM], + $count[\b8\b8::KEY_COUNT_SPAM]); + $query->execute(); + } + + protected function update_token(string $token, array $count) + { + $query = $this->mysql->prepare('UPDATE ' . $this->table + . ' SET count_ham = ?, count_spam = ? WHERE token = ?'); + $query->bind_param('iis', $count[\b8\b8::KEY_COUNT_HAM], $count[\b8\b8::KEY_COUNT_SPAM], + $token); + $query->execute(); + } + + protected function delete_token(string $token) + { + $query = $this->mysql->prepare('DELETE FROM ' . $this->table . ' WHERE token = ?'); + $query->bind_param('s', $token); + $query->execute(); + } + + protected function start_transaction() + { + $this->mysql->begin_transaction(); + } + + protected function finish_transaction() + { + $this->mysql->commit(); + } + +} diff --git a/core/b8/b8/storage/sqlite.php b/core/b8/b8/storage/sqlite.php new file mode 100644 index 0000000..8b9b797 --- /dev/null +++ b/core/b8/b8/storage/sqlite.php @@ -0,0 +1,92 @@ + +// +// SPDX-License-Identifier: LGPL-3.0-or-later + +namespace b8\storage; +use PDO; + +/** + * An SQLite storage backend + * + * @package b8 + */ + +class sqlite extends storage_base +{ + + private $sqlite = null; + private $table = null; + + protected function setup_backend(array $config) + { + $this->sqlite = $config['resource']; + + if (! 
isset($config['table'])) { + $config['table'] = 'b8_wordlist'; + } + $this->table = $config['table']; + } + + protected function fetch_token_data(array $tokens) + { + $data = []; + + $escaped = []; + foreach ($tokens as $token) { + $escaped[] = $this->sqlite->quote($token); + } + + $result = $this->sqlite->query('SELECT token, count_ham, count_spam' + . ' FROM ' . $this->table + . ' WHERE token IN ' + . "(" . implode(",", $escaped) . ")"); + + while ($row = $result->fetch()) { + $data[$row[0]] = [ \b8\b8::KEY_COUNT_HAM => $row[1], + \b8\b8::KEY_COUNT_SPAM => $row[2] ]; + } + + return $data; + } + + protected function add_token(string $token, array $count) + { + $query = $this->sqlite->prepare('INSERT INTO ' . $this->table + . '(token, count_ham, count_spam) VALUES(?, ?, ?)'); + $query->bindParam(1, $token, PDO::PARAM_STR); + $query->bindParam(2, $count[\b8\b8::KEY_COUNT_HAM], PDO::PARAM_INT); + $query->bindParam(3, $count[\b8\b8::KEY_COUNT_SPAM], PDO::PARAM_INT); + + $query->execute(); + } + + protected function update_token(string $token, array $count) + { + $query = $this->sqlite->prepare('UPDATE ' . $this->table + . ' SET count_ham = ?, count_spam = ? WHERE token = ?'); + $query->bindParam(1, $count[\b8\b8::KEY_COUNT_HAM], PDO::PARAM_INT); + $query->bindParam(2, $count[\b8\b8::KEY_COUNT_SPAM], PDO::PARAM_INT); + $query->bindParam(3, $token, PDO::PARAM_STR); + $query->execute(); + } + + protected function delete_token(string $token) + { + $query = $this->sqlite->prepare('DELETE FROM ' . $this->table . 
' WHERE token = ?'); + $query->bindParam(1, $token, PDO::PARAM_STR); + $query->execute(); + } + + protected function start_transaction() + { + $this->sqlite->beginTransaction(); + } + + protected function finish_transaction() + { + $this->sqlite->commit(); + } + +} diff --git a/core/b8/b8/storage/storage_base.php b/core/b8/b8/storage/storage_base.php new file mode 100644 index 0000000..c7b3b63 --- /dev/null +++ b/core/b8/b8/storage/storage_base.php @@ -0,0 +1,300 @@ + +// +// SPDX-License-Identifier: LGPL-3.0-or-later + +/** + * Abstract base class for storage backends + * + * @package b8 + */ + +namespace b8\storage; +

abstract class storage_base
{
    /**
     * The degenerator object passed to the constructor. get() uses it to look up degenerated
     * versions of tokens that are not in the database.
     */
    protected $degenerator = null;

    /**
     * Sets up the backend
     *
     * @access protected
     * @param array $config The configuration for the respective backend
     */
    abstract protected function setup_backend(array $config);

    /**
     * Does the actual interaction with the database when fetching data
     *
     * @access protected
     * @param array $tokens List of token names to fetch
     * @return mixed Returns an array of the returned data in the format array(token => data)
                     or an empty array if there was no data.
     */
    abstract protected function fetch_token_data(array $tokens);

    /**
     * Stores a new token to the database
     *
     * @access protected
     * @param string $token The token's name
     * @param array $count The ham and spam counters [ \b8\b8::KEY_COUNT_HAM => int,
                                                       \b8\b8::KEY_COUNT_SPAM => int ]
     * @return bool true on success or false on failure
     */
    abstract protected function add_token(string $token, array $count);

    /**
     * Updates an existing token
     *
     * @access protected
     * @param string $token The token's name
     * @param array $count The ham and spam counters [ \b8\b8::KEY_COUNT_HAM => int,
                                                       \b8\b8::KEY_COUNT_SPAM => int ]
     * @return bool true on success or false on failure
     */
    abstract protected function update_token(string $token, array $count);

    /**
     * Removes a token from the database
     *
     * @access protected
     * @param string $token The token's name
     * @return bool true on success or false on failure
     */
    abstract protected function delete_token(string $token);

    /**
     * Starts a transaction (if the underlying database supports/needs this)
     *
     * @access protected
     * @return void
     */
    abstract protected function start_transaction();

    /**
     * Finishes a transaction (if the underlying database supports/needs this)
     *
     * @access protected
     * @return void
     */
    abstract protected function finish_transaction();

    /**
     * Passes the degenerator to the instance, calls the backend setup and checks that the
     * connected database has the expected version
     *
     * @access public
     * @param array $config The respective backend's configuration
     * @param object $degenerator The degenerator to use
     * @return void
     * @throws \Exception if the database does not report the version \b8\b8::DBVERSION
     */
    public function __construct(array $config, object $degenerator)
    {
        $this->degenerator = $degenerator;
        $this->setup_backend($config);

        // get_internals() reports null for a version it can't find, so a missing version is
        // rejected here as well as a wrong one
        $internals = $this->get_internals();
        if (! isset($internals[\b8\b8::KEY_DB_VERSION])
            || $internals[\b8\b8::KEY_DB_VERSION] !== \b8\b8::DBVERSION) {

            throw new \Exception(storage_base::class . ': The connected database is not a b8 v'
                                 . \b8\b8::DBVERSION . ' database.');
        }
    }

    /**
     * Get the database's internal variables.
     *
     * @access public
     * @return array Returns an array of all internals in the format
                     [ \b8\b8::KEY_TEXTS_HAM => int|null, \b8\b8::KEY_TEXTS_SPAM => int|null,
                       \b8\b8::KEY_DB_VERSION => int|null ]. A value is null if it is not stored.
     */
    public function get_internals()
    {
        $internals = $this->fetch_token_data([ \b8\b8::INTERNALS_TEXTS,
                                               \b8\b8::INTERNALS_DBVERSION ]);

        // The constructor calls this while it's not yet clear if we actually have a b8 database,
        // so each value defaults to null and is only set if it was really found
        $texts_ham = null;
        $texts_spam = null;
        $dbversion = null;

        if (isset($internals[\b8\b8::INTERNALS_TEXTS][\b8\b8::KEY_COUNT_HAM])) {
            $texts_ham = (int) $internals[\b8\b8::INTERNALS_TEXTS][\b8\b8::KEY_COUNT_HAM];
        }
        if (isset($internals[\b8\b8::INTERNALS_TEXTS][\b8\b8::KEY_COUNT_SPAM])) {
            $texts_spam = (int) $internals[\b8\b8::INTERNALS_TEXTS][\b8\b8::KEY_COUNT_SPAM];
        }
        // The database version is read from the "ham" counter of its internal entry
        if (isset($internals[\b8\b8::INTERNALS_DBVERSION][\b8\b8::KEY_COUNT_HAM])) {
            $dbversion = (int) $internals[\b8\b8::INTERNALS_DBVERSION][\b8\b8::KEY_COUNT_HAM];
        }

        return [ \b8\b8::KEY_TEXTS_HAM  => $texts_ham,
                 \b8\b8::KEY_TEXTS_SPAM => $texts_spam,
                 \b8\b8::KEY_DB_VERSION => $dbversion ];
    }

    /**
     * Get all data about a list of tokens from the database.
     *
     * @access public
     * @param array $tokens The tokens list
     * @return array Returns the found data in the format
                     [ 'tokens'      => [ token => count ],
                       'degenerates' => [ token => [ degenerate => count ] ] ].
     */
    public function get(array $tokens)
    {
        // First we see what we have in the database
        $token_data = $this->fetch_token_data($tokens);

        // Check if we have to degenerate some tokens
        $missing_tokens = [];
        foreach ($tokens as $token) {
            if (! isset($token_data[$token])) {
                $missing_tokens[] = $token;
            }
        }

        if (count($missing_tokens) > 0) {
            // Generate a list of degenerated tokens for the missing tokens ...
            $degenerates = $this->degenerator->degenerate($missing_tokens);

            $degenerates_list = [];
            foreach ($degenerates as $token_degenerates) {
                foreach ($token_degenerates as $degenerate) {
                    $degenerates_list[] = $degenerate;
                }
            }

            // ... and look them up, if there are any
            if (count($degenerates_list) > 0) {
                // We must not use array_merge() here: PHP stores array keys like "123" as
                // integers and array_merge() renumbers integer keys, so the data of purely
                // numeric tokens would no longer be found under the token's name.
                // array_replace() keeps all keys as they are.
                $token_data = array_replace($token_data,
                                            $this->fetch_token_data($degenerates_list));
            }
        }

        // Here, we have all available data in $token_data.

        $return_data_tokens = [];
        $return_data_degenerates = [];

        foreach ($tokens as $token) {
            if (isset($token_data[$token])) {
                // The token was found in the database
                $return_data_tokens[$token] = $token_data[$token];
                continue;
            }

            // The token was not found, so we look if we can return data for degenerated tokens.
            // If the degenerator has no entry for the token, there is simply nothing to return.
            foreach ($this->degenerator->degenerates[$token] ?? [] as $degenerate) {
                if (isset($token_data[$degenerate])) {
                    // A degenerated version of the token was found in the database
                    $return_data_degenerates[$token][$degenerate] = $token_data[$degenerate];
                }
            }
        }

        // Now, all token data directly found in the database is in $return_data_tokens and all
        // data for degenerated versions is in $return_data_degenerates, so
        return [ 'tokens'      => $return_data_tokens,
                 'degenerates' => $return_data_degenerates ];
    }

    /**
     * Stores or deletes a list of tokens from the given category.
     *
     * @access public
     * @param array $tokens The tokens list in the format [ token => count ]
     * @param string $category Either \b8\b8::HAM or \b8\b8::SPAM
     * @param string $action Either \b8\b8::LEARN or \b8\b8::UNLEARN
     * @return void
     */
    public function process_text(array $tokens, string $category, string $action)
    {
        // No matter what we do, we first have to check what data we have.

        // First get the internals, including the ham texts and spam texts counter
        $internals = $this->get_internals();
        // Then, fetch all data for all tokens we have
        $token_data = $this->fetch_token_data(array_keys($tokens));

        $this->start_transaction();

        // Process all tokens to learn/unlearn
        foreach ($tokens as $token => $count) {
            // Array keys that look like integers are integers in PHP, but the backend methods
            // are declared to take the token's name as a string
            $token = (string) $token;

            if (! isset($token_data[$token])) {
                // We don't have the token. If we unlearn a text, we can't delete it as we don't
                // have it anyway, so just do something if we learn a text
                if ($action === \b8\b8::LEARN) {
                    if ($category === \b8\b8::HAM) {
                        $this->add_token($token, [ \b8\b8::KEY_COUNT_HAM  => $count,
                                                   \b8\b8::KEY_COUNT_SPAM => 0 ]);
                    } elseif ($category === \b8\b8::SPAM) {
                        $this->add_token($token, [ \b8\b8::KEY_COUNT_HAM  => 0,
                                                   \b8\b8::KEY_COUNT_SPAM => $count ]);
                    }
                }
                continue;
            }

            // We already have this token, so update its data

            // Get the existing data. A backend may hand the counters back as strings, so we
            // make sure to calculate and compare with integers
            $count_ham = (int) $token_data[$token][\b8\b8::KEY_COUNT_HAM];
            $count_spam = (int) $token_data[$token][\b8\b8::KEY_COUNT_SPAM];

            // Increase or decrease the right counter
            if ($action === \b8\b8::LEARN) {
                if ($category === \b8\b8::HAM) {
                    $count_ham += $count;
                } elseif ($category === \b8\b8::SPAM) {
                    $count_spam += $count;
                }
            } elseif ($action === \b8\b8::UNLEARN) {
                if ($category === \b8\b8::HAM) {
                    $count_ham -= $count;
                } elseif ($category === \b8\b8::SPAM) {
                    $count_spam -= $count;
                }
            }

            // We don't want to have negative values
            if ($count_ham < 0) {
                $count_ham = 0;
            }
            if ($count_spam < 0) {
                $count_spam = 0;
            }

            // Now let's see if we have to update or delete the token. Both counters are >= 0
            // here, so the token is only dropped if both of them are zero
            if ($count_ham > 0 || $count_spam > 0) {
                $this->update_token($token, [ \b8\b8::KEY_COUNT_HAM  => $count_ham,
                                              \b8\b8::KEY_COUNT_SPAM => $count_spam ]);
            } else {
                $this->delete_token($token);
            }
        }

        // Now, all tokens have been processed, so let's update the right text counter
        if ($action === \b8\b8::LEARN) {
            if ($category === \b8\b8::HAM) {
                $internals[\b8\b8::KEY_TEXTS_HAM]++;
            } elseif ($category === \b8\b8::SPAM) {
                $internals[\b8\b8::KEY_TEXTS_SPAM]++;
            }
        } elseif ($action === \b8\b8::UNLEARN) {
            // The text counters must not become negative either
            if ($category === \b8\b8::HAM) {
                if ($internals[\b8\b8::KEY_TEXTS_HAM] > 0) {
                    $internals[\b8\b8::KEY_TEXTS_HAM]--;
                }
            } elseif ($category === \b8\b8::SPAM) {
                if ($internals[\b8\b8::KEY_TEXTS_SPAM] > 0) {
                    $internals[\b8\b8::KEY_TEXTS_SPAM]--;
                }
            }
        }

        $this->update_token(\b8\b8::INTERNALS_TEXTS,
                            [ \b8\b8::KEY_COUNT_HAM  => $internals[\b8\b8::KEY_TEXTS_HAM],
                              \b8\b8::KEY_COUNT_SPAM => $internals[\b8\b8::KEY_TEXTS_SPAM] ]);

        $this->finish_transaction();
    }

}
diff --git a/package.json b/package.json index cb27e86..1090b06 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "name": "cf7-antispam", "author": "Erik Golinelli", "license": "GPL-2.0-only", - "version": "0.6.1", + "version": "0.6.2", "description": "AntiSpam for Contact Form 7", "files": [ "admin/*", @@ -21,9 +21,9 @@ "scripts": { "start": "wp-scripts start", "build": "wp-scripts build", - "preplugin-zip": "wp-env run cli --env-cwd=wp-content/plugins/cf7-antispam \"composer install --no-dev && composer dump-autoload --optimize\"", + "preplugin----zip": "wp-env run cli --env-cwd=wp-content/plugins/cf7-antispam \"composer install --no-dev && composer dump-autoload --optimize\"", "plugin-zip": "wp-scripts plugin-zip", - "postplugin-zip": 
"wp-env run cli --env-cwd=wp-content/plugins/cf7-antispam \"composer install\"", + "postplugin----zip": "wp-env run cli --env-cwd=wp-content/plugins/cf7-antispam \"composer install\"", "lint:css": "wp-scripts lint-style ./src/**/*.scss --fix", "lint:js": "wp-scripts lint-js ./src/**/*.js --fix", "packages-update": "wp-scripts packages-update", diff --git a/phpcs.xml.dist b/phpcs.xml.dist index a4849c2..7cc2c13 100644 --- a/phpcs.xml.dist +++ b/phpcs.xml.dist @@ -59,6 +59,7 @@ /.github/* /.husky/* /assets/* + /core/b8/* /languages/* /node_modules/* /tests/* diff --git a/readme.txt b/readme.txt index c5a431b..f677c69 100644 --- a/readme.txt +++ b/readme.txt @@ -4,7 +4,7 @@ Tags: antispam, blacklist, honeypot, geoip, security, contact form 7 Requires at least: 5.4 Tested up to: 6.4.2 Requires PHP: 5.6 -Stable tag: 0.6.1 +Stable tag: 0.6.2 License: GPLv2 or later License URI: https://www.gnu.org/licenses/gpl-2.0.html