Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tashfeer Banned Words Fixes #2 #8

Merged
merged 4 commits into from
Dec 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ public static String wordToLetters(String word) {
* original word if no affix matches are found. {@link String}
*/
public static String removeArabicAffixes(String word) {
if (word.isEmpty()) {
return word;
}
if (Data.ARABIC_PREFIXES.contains(word.substring(0, 2))) {
// For: ALEF & LAM
word = word.substring(2);
Expand All @@ -155,6 +158,42 @@ public static String removeArabicAffixes(String word) {
return word.trim();
}

/**
 * Applies tashfeer encryption to the banned words of a text using the
 * default encryption level of 2.
 * Delegates to {@link #tashfeerBannedWords(String, int)}.
 *
 * @param text The input text whose banned words should be encrypted
 *             {@link String}.
 * @return The text with tashfeer (level 2) applied to its banned words
 *         {@link String}.
 */
public static String tashfeerBannedWords(String text) {
    final int defaultLevelOfTashfeer = 2;
    return tashfeerBannedWords(text, defaultLevelOfTashfeer);
}

/**
 * Applies tashfeer encryption only to the words of a text that are
 * classified as banned.
 * The text is split on single spaces; each piece is passed to
 * {@code checkIfBannedWord}, banned pieces are replaced by the result of
 * {@code tashfeerHandler}, and the pieces are re-joined with single spaces.
 * Leading and trailing whitespace of the rebuilt text is trimmed.
 *
 * @param text            The input text to be processed {@link String}.
 * @param levelOfTashfeer The encryption level forwarded to the tashfeer
 *                        handler for every banned word.
 * @return The rebuilt text with tashfeer applied to banned words
 *         {@link String}.
 * @throws NullPointerException if {@code text} is {@code null}.
 */
public static String tashfeerBannedWords(String text, int levelOfTashfeer) {
    Objects.requireNonNull(text, Data.TEXT_NULL_MESSAGE);

    StringBuilder rebuilt = new StringBuilder();
    String[] pieces = text.split(" ");
    for (String piece : pieces) {
        boolean banned = checkIfBannedWord(piece);
        if (!banned) {
            // non-banned words are copied through untouched
            rebuilt.append(piece);
        } else {
            rebuilt.append(tashfeerHandler(piece, levelOfTashfeer));
        }
        rebuilt.append(" ");
    }

    // drops the separator appended after the last piece
    return rebuilt.toString().trim();
}

private static String handleNoonIssue(String text) {
String arabicLetters = String.join("", Data.LETTERS_DICT.keySet()) + "ـ";
String regex = Data.NOON + "(" + "?=[^" + arabicLetters + "]" + ")|" + Data.NOON + "\\z";
Expand Down Expand Up @@ -259,4 +298,68 @@ private static char tashfeerCharacter(char character) {
char replacementCharacter = replacementCharList[randomIndex].charAt(0);
return replacementCharacter;
}

/**
 * Scores a string against every entry of {@code Data.BANNED_WORDS} and
 * reports the best match as a percentage.
 *
 * @param string The string to compare with the banned words.
 * @return The highest similarity score found, multiplied by 100; -100 when
 *         the banned-word list is empty (the running maximum starts at -1).
 */
private static double bannedSimilarityRatio(String string) {
    double best = -1;
    for (String candidate : Data.BANNED_WORDS) {
        best = Math.max(best, similarityScore(string, candidate));
    }
    // similarityScore yields a fraction; callers compare against a percentage
    return best * 100;
}

/**
 * Decides whether a word counts as banned: its affixes are removed and the
 * result is compared with the banned-word list; the word is banned when the
 * best similarity ratio reaches 70 percent.
 *
 * @param string The word to classify.
 * @return {@code true} if the best similarity ratio is at least 70,
 *         {@code false} otherwise.
 */
private static boolean checkIfBannedWord(String string) {
    double stdRatio = 70;
    // removeArabicAffixes reads the first two characters of any non-empty
    // word, so a one-character word would raise
    // StringIndexOutOfBoundsException. Such a word cannot carry a
    // two-letter prefix, so it is scored as-is instead.
    String candidate = string.length() < 2 ? string : removeArabicAffixes(string);
    return bannedSimilarityRatio(candidate) >= stdRatio;
}

/**
 * Computes a similarity score between two strings from their edit distance,
 * normalised by the length of the longer string.
 *
 * @param s1 The first string.
 * @param s2 The second string.
 * @return {@code (longerLength - editDistance) / longerLength}; 1.0 when
 *         both strings are empty.
 */
private static double similarityScore(String s1, String s2) {
    // the longer string supplies the normalising length; on a tie s1 is used
    boolean firstIsShorter = s1.length() < s2.length();
    String longer = firstIsShorter ? s2 : s1;
    String shorter = firstIsShorter ? s1 : s2;

    int maxLength = longer.length();
    if (maxLength == 0) {
        // two empty strings are treated as identical
        return 1.0;
    }

    int unchanged = maxLength - editDistance(longer, shorter);
    return (double) unchanged / (double) maxLength;
}

/**
 * Computes the case-insensitive Levenshtein edit distance between two
 * strings: the minimum number of single-character insertions, deletions
 * and substitutions needed to turn one into the other.
 *
 * @param s1 The first string.
 * @param s2 The second string.
 * @return The edit distance between the lower-cased strings.
 */
private static int editDistance(String s1, String s2) {
    // Locale.ROOT keeps case folding independent of the JVM default locale
    // (e.g. under a Turkish locale "I" would otherwise lower-case to "ı").
    String a = s1.toLowerCase(java.util.Locale.ROOT);
    String b = s2.toLowerCase(java.util.Locale.ROOT);

    // costs[j] holds the distance between the processed prefix of a and the
    // first j characters of b; one row is reused for the whole table.
    int[] costs = new int[b.length() + 1];
    for (int j = 0; j <= b.length(); j++) {
        costs[j] = j;
    }

    for (int i = 1; i <= a.length(); i++) {
        int diagonal = costs[0]; // value for (i - 1, j - 1)
        costs[0] = i;
        for (int j = 1; j <= b.length(); j++) {
            int above = costs[j]; // value for (i - 1, j)
            int current;
            if (a.charAt(i - 1) == b.charAt(j - 1)) {
                current = diagonal;
            } else {
                current = Math.min(Math.min(diagonal, costs[j - 1]), above) + 1;
            }
            diagonal = above;
            costs[j] = current;
        }
    }
    return costs[b.length()];
}
}
56 changes: 55 additions & 1 deletion src/main/java/io/github/seen_arabic/arabic_services/Data.java
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,61 @@ class Data {
'ئ');

// Fixed-size list of Arabic characters (Arrays.asList view).
// NOTE(review): presumably the letters that do not join to the following
// letter — confirm against the usages of this list.
static List<Character> ALONE_LETTERS = Arrays.asList('د', 'ذ', 'ر', 'ز', 'و', 'ا', 'أ', 'إ', 'ء', 'ؤ', 'آ');

/**
 * Words that the banned-word check compares input against: every entry is
 * scored against the candidate word by edit-distance similarity and the
 * best score decides whether the candidate is banned.
 *
 * NOTE(review): entries look like stems without leading affixes (for
 * example no definite article), presumably so they line up with the output
 * of removeArabicAffixes — confirm before adding new entries.
 */
static final String[] BANNED_WORDS = {
"فلسطين",
"عرب",
"هود",
"صهيون",
"سرائيل",
"دول",
"كيان",
"حتل",
"هتلر",
"خنازير",
"حي",
"شيخ",
"جراح",
"سقط",
"قدس",
"قصى",
"طبع",
"قتل",
"خان",
"كتائب",
"عز",
"دين",
"قسام",
"جهاد",
"جاهد",
"سلام",
"خوارزم",
"لوغاريزم",
"كتاب",
"بدون",
"نقط",
"مارك",
"لعن",
"حقير",
"موت",
"قاوم",
"زوربير",
"عاصم",
"حر",
"رهاب",
"قرد",
"دعم",
"غز",
"نقذ",
"نتهك",
"معايير",
"صل",
"بان",
"دون",
"جيش",
"عدو",
"حماس",
"كر"
};
static {
LETTERS_DICT = new HashMap<>();
LETTERS_DICT.put("ا", "ا");
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package io.github.seen_arabic.arabic_services;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import org.junit.Test;

Expand Down Expand Up @@ -241,4 +243,28 @@ private void itShouldRemoveHumSuffixFromAWord() {
assertEquals("طلاب", result);
}

@Test
public void tashfeerBannedWordsShouldPerformTashfeerEncryptionOnBannedWordsOnly() {
    String input = "جيش العدو يقتل الأطفال";
    String output = ArabicServices.tashfeerBannedWords(input);

    // the sentence as a whole must have been altered
    assertNotEquals(input, output);
    // the non-banned word survives verbatim
    assertTrue(output.contains("الأطفال"));
    // none of the banned words may still appear in their original form
    for (String bannedWord : new String[] { "جيش", "العدو", "يقتل" }) {
        assertFalse(output.contains(bannedWord));
    }
}

@Test
public void tashfeerBannedWordsShouldNotPerformTashfeerEncryptionOnNonBannedWords() {
    // a sentence without banned words must come back unchanged
    String input = "هذه جملة غير مشفرة";
    assertEquals(input, ArabicServices.tashfeerBannedWords(input));
}

@Test
public void tashfeerBannedWordsShouldHandleEmptyInput() {
    // empty input must yield empty output
    String output = ArabicServices.tashfeerBannedWords("");
    assertEquals("", output);
}
}