Skip to content

Commit

Permalink
Refine TextRebuilder.deduplicateAndSortTable():
Browse files Browse the repository at this point in the history
Also remove a sample when, after sorting, its date is the same as the previous sample's date. This usually just removes samples that the instrument exported twice, at different precisions, in two different export formats.

refs #59
  • Loading branch information
csjx committed Mar 22, 2022
1 parent 69918c7 commit 66e1ed2
Showing 1 changed file with 30 additions and 1 deletion.
31 changes: 30 additions & 1 deletion src/main/java/edu/hawaii/soest/pacioos/text/TextRebuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.time.Instant;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
Expand Down Expand Up @@ -242,10 +243,38 @@ protected void removeProcessedDirectory() {
*/
protected Table deduplicateAndSortTable(Table mergedTable, int sortColumnIndex) {
    // Cleans the merged sample table in three steps:
    //   1. drop rows that are exact full-row duplicates,
    //   2. sort the remaining rows on sortColumnIndex,
    //   3. drop any row whose timestamp (first datetime column, read as UTC)
    //      equals the timestamp of the row directly before it.
    // mergedTable itself is not modified; a new table is returned.
    // NOTE(review): step 3 only catches every same-date sample when
    // sortColumnIndex refers to the datetime column -- confirm against callers.
    log.info("Removing duplicate samples from the merged table.");

    // Remove exact full-row duplicates
    Table dedupedTable = mergedTable.dropDuplicateRows();
    log.info("Removed " + (mergedTable.rowCount() - dedupedTable.rowCount()) + " duplicate samples.");

    // Sort first: the by-date pass below compares each row with its
    // predecessor, so rows sharing a date must already be adjacent.
    log.info("Sorting the merged table.");
    Table sortedTable = dedupedTable.sortOn(sortColumnIndex);

    // Find the datetime columns; the first one is used for the by-date pass
    DateTimeColumn[] dateTimeColumns = sortedTable.dateTimeColumns();
    int rowCount = sortedTable.rowCount();

    // Nothing to compare when there is no datetime column or fewer than two rows
    if (dateTimeColumns.length == 0 || rowCount < 2) {
        log.info("Removed 0 samples with duplicate dates.");
        return sortedTable;
    }
    InstantColumn instantColumn = dateTimeColumns[0].asInstantColumn(ZoneId.of("UTC"));

    // Flag each row: true = keep, false = same date as the previous row
    BooleanColumn uniqueValues = BooleanColumn.create("isUnique", rowCount);
    uniqueValues.set(0, true); // The first row is always unique
    Instant previousDateTime = instantColumn.get(0);
    for (int row = 1; row < rowCount; row++) {
        Instant currentDateTime = instantColumn.get(row);
        // A null (missing) timestamp is never treated as a duplicate
        boolean isDuplicate = currentDateTime != null && currentDateTime.equals(previousDateTime);
        uniqueValues.set(row, !isDuplicate);
        previousDateTime = currentDateTime;
    }

    // Filter duplicates by-date out of the table. The flag column is applied
    // as a selection only, so it never has to be added to (or removed from) a table.
    Table dedupedUniqueDatesTable = sortedTable.where(uniqueValues.asSelection());
    log.info("Removed " + (rowCount - dedupedUniqueDatesTable.rowCount()) + " samples with duplicate dates.");

    return dedupedUniqueDatesTable;
}

/*
Expand Down

0 comments on commit 66e1ed2

Please sign in to comment.