-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #130 from Samyssmile/feature/median-imputation
Create Median Imputation feature
- Loading branch information
Showing
5 changed files
with
132 additions
and
0 deletions.
There are no files selected for viewing
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
61 changes: 61 additions & 0 deletions
61
lib/src/main/java/de/edux/functions/imputation/MedianImputation.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
package de.edux.functions.imputation; | ||
|
||
import java.util.Arrays;
import java.util.regex.Pattern;
|
||
/** | ||
* Implements the {@code IImputationStrategy} interface to provide a median value imputation. This | ||
* strategy calculates the median of the non-missing numeric values in a column and substitutes the | ||
* missing values with this median. | ||
* | ||
* <p>It is important to note that this strategy is only applicable to columns with numeric data. | ||
* Attempting to use this strategy on categorical data will result in a {@code RuntimeException}. | ||
*/ | ||
public class MedianImputation implements IImputationStrategy { | ||
@Override | ||
public String[] performImputation(String[] datasetColumn) { | ||
checkIfColumnContainsCategoricalValues(datasetColumn); | ||
|
||
String[] updatedDatasetColumn = new String[datasetColumn.length]; | ||
double median = calculateMedian(datasetColumn); | ||
|
||
for (int index = 0; index < datasetColumn.length; index++) { | ||
if (datasetColumn[index].isBlank()) { | ||
updatedDatasetColumn[index] = String.valueOf(median); | ||
|
||
} else { | ||
updatedDatasetColumn[index] = datasetColumn[index]; | ||
} | ||
} | ||
|
||
return updatedDatasetColumn; | ||
} | ||
|
||
private void checkIfColumnContainsCategoricalValues(String[] datasetColumn) { | ||
for (String value : datasetColumn) { | ||
if (!isNumeric(value)) { | ||
throw new RuntimeException( | ||
"MEDIAN imputation strategy can not be used on categorical features. " | ||
+ "Use MODE imputation strategy or perform a list wise deletion on the features."); | ||
} | ||
} | ||
} | ||
|
||
private boolean isNumeric(String value) { | ||
return value.matches("-?\\d+(\\.\\d+)?") || value.isBlank(); | ||
} | ||
|
||
double calculateMedian(String[] datasetColumn) { | ||
double[] filteredDatasetColumnInNumbers = Arrays.stream(datasetColumn) | ||
.filter(value -> !value.isBlank()) | ||
.mapToDouble(Double::parseDouble) | ||
.sorted() | ||
.toArray(); | ||
if (filteredDatasetColumnInNumbers.length % 2 == 0) { | ||
Double upper = filteredDatasetColumnInNumbers[filteredDatasetColumnInNumbers.length / 2]; | ||
Double lower = | ||
filteredDatasetColumnInNumbers[(filteredDatasetColumnInNumbers.length / 2) - 1]; | ||
return (upper + lower) / 2.0; | ||
} | ||
return filteredDatasetColumnInNumbers[filteredDatasetColumnInNumbers.length / 2]; | ||
} | ||
} |
69 changes: 69 additions & 0 deletions
69
lib/src/test/java/de/edux/functions/imputation/MedianImputationTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
package de.edux.functions.imputation; | ||
|
||
import static org.junit.jupiter.api.Assertions.*; | ||
|
||
import java.util.Arrays; | ||
import java.util.Random; | ||
import org.junit.jupiter.api.Test; | ||
|
||
public class MedianImputationTest { | ||
@Test | ||
void performImputationWithCategoricalValuesShouldThrowRuntimeException() { | ||
String[] categoricalFeatures = {"A", "B", "C"}; | ||
assertThrows( | ||
RuntimeException.class, | ||
() -> new MedianImputation().performImputation(categoricalFeatures)); | ||
} | ||
|
||
@Test | ||
void performImputationWithNumericalValuesTest() { | ||
String[] numericalFeaturesWithMissingValues = {"1", "", "2", "3", "", "4"}; | ||
MedianImputation imputter = new MedianImputation(); | ||
String[] numericalFeaturesWithImputtedValues = | ||
imputter.performImputation(numericalFeaturesWithMissingValues); | ||
assertAll( | ||
() -> assertEquals("2.5", numericalFeaturesWithImputtedValues[1]), | ||
() -> assertEquals("2.5", numericalFeaturesWithImputtedValues[4])); | ||
} | ||
|
||
@Test | ||
public void testCalculateMedianWithLargeDataset() { | ||
String[] largeDataset = new String[1000000]; | ||
Random random = new Random(); | ||
for (int i = 0; i < largeDataset.length; i++) { | ||
if (random.nextDouble() < 0.05) { // 5% empty values | ||
largeDataset[i] = ""; | ||
} else { | ||
largeDataset[i] = String.valueOf(random.nextDouble() * 1000000); | ||
} | ||
} | ||
|
||
// Erwarteter Median | ||
double[] numericValues = | ||
Arrays.stream(largeDataset) | ||
.filter(s -> !s.isBlank()) | ||
.mapToDouble(Double::parseDouble) | ||
.sorted() | ||
.toArray(); | ||
double expectedMedian = | ||
numericValues.length % 2 == 0 | ||
? (numericValues[numericValues.length / 2] | ||
+ numericValues[numericValues.length / 2 - 1]) | ||
/ 2.0 | ||
: numericValues[numericValues.length / 2]; | ||
|
||
MedianImputation medianImputation = new MedianImputation(); | ||
|
||
long startTime = System.nanoTime(); | ||
double calculatedMedian = medianImputation.calculateMedian(largeDataset); | ||
long endTime = System.nanoTime(); | ||
|
||
System.out.println("Process time in seconds: " + (endTime - startTime) / 1e9); | ||
|
||
assertEquals( | ||
expectedMedian, | ||
calculatedMedian, | ||
0.001, | ||
"Calculated median should be equal to the expected median."); | ||
} | ||
} |