From 67ace241f65153f13b84620a69314a4574b94143 Mon Sep 17 00:00:00 2001 From: Torben Eims Date: Mon, 15 May 2023 14:18:12 +0200 Subject: [PATCH 01/10] Revert changes from master up to v1.1 (37837a25e089a5b6356554b9861634dd4afd3abb) --- dfd/dfdAlgorithm/pom.xml | 28 ++++---- .../equivalence/PartitionEquivalences.java | 2 +- .../fdiscovery/approach/runner/DFDMiner.java | 49 +++++++------- .../src/fdiscovery/columns/AgreeSet.java | 2 +- .../fdiscovery/columns/ColumnCollection.java | 64 +++++++++---------- .../src/fdiscovery/columns/DifferenceSet.java | 5 +- .../src/fdiscovery/columns/Path.java | 9 +-- .../src/fdiscovery/columns/Seed.java | 13 ++-- .../equivalence/EquivalenceGroupHashSet.java | 5 +- .../EquivalenceGroupTIntHashSet.java | 5 +- .../equivalence/EquivalenceGroupTreeSet.java | 7 +- .../src/fdiscovery/fastfds/CoverOrder.java | 2 +- .../fdiscovery/fastfds/EquivalenceClass.java | 2 +- .../fastfds/EquivalenceClasses.java | 2 +- .../fastfds/MaximalEquivalenceClasses.java | 2 +- .../src/fdiscovery/fastfds/PartialOrder.java | 24 +++---- .../fdiscovery/fastfds/runner/FastFDs.java | 37 ++++++----- .../src/fdiscovery/general/Benchmarker.java | 6 +- .../src/fdiscovery/general/ColumnFiles.java | 2 +- .../general/FunctionalDependencies.java | 12 ++-- .../src/fdiscovery/general/Miner.java | 2 +- .../partitions/JoinedPartitions.java | 2 +- .../MemoryManagedJoinedPartitions.java | 1 - .../src/fdiscovery/partitions/Partition.java | 8 ++- .../partitions/PartitionStatistics.java | 4 +- .../src/fdiscovery/partitions/ProbeTable.java | 4 +- .../partitions/StrippedPartition.java | 5 +- .../src/fdiscovery/pruning/Holes.java | 3 +- .../src/fdiscovery/pruning/Observations.java | 20 +++--- .../src/fdiscovery/pruning/PruneHashSet.java | 3 +- .../src/fdiscovery/pruning/PruneTable.java | 10 +-- .../fdiscovery/tane/AprioriGeneration.java | 7 +- .../src/fdiscovery/tane/runner/Tane.java | 58 ++++++++--------- dfd/dfdMetanome/pom.xml | 33 ++++++---- .../dfd/{ => dfdMetanome}/DFDMetanome.java | 36 +++-------- dfd/pom.xml | 9 +-- 36 files changed, 243 insertions(+), 240 deletions(-) rename dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/{ => dfdMetanome}/DFDMetanome.java (81%) diff --git a/dfd/dfdAlgorithm/pom.xml b/dfd/dfdAlgorithm/pom.xml index 128a457..975b05d 100755 --- a/dfd/dfdAlgorithm/pom.xml +++ b/dfd/dfdAlgorithm/pom.xml @@ -2,17 +2,17 @@ xmlns="http://maven.apache.org/POM/4.0.0" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - - DFDAlgorithm + de.metanome.algorithms.dfd + dfdAlgorithm jar - DFDAlgorithm + dfdAlgorithm - de.metanome.algorithms.dfd - DFDModules - 1.2-SNAPSHOT - ../pom.xml + de.metanome.algorithms + algorithms + ${metanome.version} + ../../pom.xml @@ -20,10 +20,10 @@ maven-compiler-plugin - ${maven-compiler-plugin.version} + 3.1 - 1.8 - 1.8 + 1.7 + 1.7 @@ -38,6 +38,7 @@ com.google.guava guava + 15.0 net.sf.trove4j @@ -54,6 +55,11 @@ javatuples 1.2 + + org.apache.lucene + lucene-core + 3.0.3 + log4j log4j @@ -80,4 +86,4 @@ 1.2 - + \ No newline at end of file diff --git a/dfd/dfdAlgorithm/src/fdiscovery/approach/equivalence/PartitionEquivalences.java b/dfd/dfdAlgorithm/src/fdiscovery/approach/equivalence/PartitionEquivalences.java index 0cf3007..29336e5 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/approach/equivalence/PartitionEquivalences.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/approach/equivalence/PartitionEquivalences.java @@ -38,7 +38,7 @@ public void addPartition(EquivalenceManagedPartition partition) { if (!this.observedPartitions.contains(partition.getIndices()) && !this.containsSimilarPartition(partition)) { this.observedPartitions.add(partition.getIndices()); long hashNumber = partition.getHashNumber(); - System.out.println(String.format("Partition[%s]\t%d\tSize: %d", partition.getIndices(), Long.valueOf(hashNumber), Integer.valueOf(partition.size()))); + System.out.println(String.format("Partition[%s]\t%d\tSize: %d", partition.getIndices(), hashNumber, partition.size())); partitionHashes.putIfAbsent(hashNumber, new TIntObjectHashMap>()); partitionHashes.get(hashNumber).putIfAbsent(partition.size(), new THashSet()); THashSet partitionGroup = partitionHashes.get(hashNumber).get(partition.size()); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java b/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java index cd29e27..8f52dfb 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java @@ -65,7 +65,7 @@ public static void main(String[] args) { DFDMiner dfdRunner = new DFDMiner(inputFileProcessor); dfdRunner.run(); - System.out.println(String.format("Number of dependencies:\t%d", Integer.valueOf(dfdRunner.minimalDependencies.getCount()))); + System.out.println(String.format("Number of dependencies:\t%d", dfdRunner.minimalDependencies.getCount())); long timeFindFDs = System.currentTimeMillis(); System.out.println("Total time:\t" + (timeFindFDs - timeStart) / 1000 + "s"); System.out.println(dfdRunner.getDependencies()); @@ -96,10 +96,10 @@ public static void main2(String[] args) { resultFile = cli.getOptionValue("result"); } if (cli.hasOption("columns")) { - numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue(); + numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")); } if (cli.hasOption("rows")) { - numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue(); + numberOfRows = Integer.valueOf(cli.getOptionValue("rows")); } ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows); long timeStart = System.currentTimeMillis(); @@ -116,24 +116,24 @@ public static void main2(String[] args) { private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { - String timeString = (time != -1) ? String.format("%.1f", Double.valueOf((double) (time) / 1000)) : "-1"; + String timeString = (time != -1) ? String.format("%.1f", (double) (time) / 1000) : "-1"; StringBuilder outputBuilder = new StringBuilder(); if (!inputFileName.isEmpty()) { outputBuilder.append(String.format("%s\t", inputFileName)); } - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns))); + outputBuilder.append(String.format("%d\t", this.numberOfRows)); + outputBuilder.append(String.format("%d\t", this.numberOfColumns)); outputBuilder.append(String.format("%s\t", timeString)); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount()))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.joinedPartitions.getCount()))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.joinedPartitions.getTotalCount()))); - outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory()))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCount())); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(2))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(3))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(4))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(5))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(6))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeGreaterThan(5))); + outputBuilder.append(String.format("%d\t", this.joinedPartitions.getCount())); + outputBuilder.append(String.format("%d\t", this.joinedPartitions.getTotalCount())); + outputBuilder.append(String.format("%d\n", Runtime.getRuntime().totalMemory())); outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false))); try { @@ -227,7 +227,7 @@ public void run() throws OutOfMemoryError { currentRHS.set(currentRHSIndex); // generate seeds - for (int partitionIndex : columnOrder.getOrderHighDistinctCount(currentRHS.complementCopy())) { + for (Integer partitionIndex : columnOrder.getOrderHighDistinctCount(currentRHS.complementCopy())) { if (partitionIndex != currentRHSIndex) { FileBasedPartition lhsPartition = this.fileBasedPartitions.get(partitionIndex); this.seeds.push(new Seed(lhsPartition.getIndices())); @@ -290,7 +290,7 @@ else if (observationOfLHS == Observation.MAXIMAL_NON_DEPENDENCY) { // this.joinedPartitions.getCount())); } - private Observation checkDependencyAndStoreIt(Seed seed, int currentRHSIndex) { + private Observation checkDependencyAndStoreIt(Seed seed, Integer currentRHSIndex) { if (nonDependencies.isRepresented(seed.getIndices())) { // System.out.println("Skip because of nonDependency"); Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex); @@ -356,11 +356,12 @@ private Observation checkDependencyAndStoreIt(Seed seed, int currentRHSIndex) { this.observations.put(seed.getIndices(), observationOfLHS); this.dependencies.add(seed.getIndices()); return observationOfLHS; + } else { + Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex); + this.observations.put(seed.getIndices(), observationOfLHS); + this.nonDependencies.add(seed.getIndices()); + return observationOfLHS; } - Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex); - this.observations.put(seed.getIndices(), observationOfLHS); - this.nonDependencies.add(seed.getIndices()); - return observationOfLHS; } private Stack nextSeeds(int currentRHSIndex) { @@ -404,7 +405,7 @@ private Stack nextSeeds(int currentRHSIndex) { ColumnCollection complement = maximalNonDependency.setCopy(currentRHSIndex).complement(); if (deps.isEmpty()) { ColumnCollection emptyColumnIndices = new ColumnCollection(numberOfColumns); - for (int complementColumnIndex : complement.getSetBits()) { + for (Integer complementColumnIndex : complement.getSetBits()) { deps.add(emptyColumnIndices.setCopy(complementColumnIndex)); } } else { @@ -477,7 +478,7 @@ private Seed randomTake() { return null; } - private Seed randomWalkStep(Seed currentSeed, int currentRHSIndex) { + private Seed randomWalkStep(Seed currentSeed, Integer currentRHSIndex) { Observation observationOfSeed = this.observations.get(currentSeed.getIndices()); if (observationOfSeed == Observation.CANDIDATE_MINIMAL_DEPENDENCY) { diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/AgreeSet.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/AgreeSet.java index bd141f8..d26334d 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/AgreeSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/AgreeSet.java @@ -10,7 +10,7 @@ public class AgreeSet extends ColumnCollection { private static final long serialVersionUID = -5335032949377336772L; - public AgreeSet(Set set1, Set set2, int numberOfColumns) { + public AgreeSet(Set set1, Set set2, long numberOfColumns) { super(numberOfColumns); Set intersected = Sets.intersection(set1, set2); for (Point columnToIdentifier : intersected) { diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java index 4a74fd5..5f88340 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java @@ -1,32 +1,33 @@ package fdiscovery.columns; -import java.util.BitSet; +import org.apache.lucene.util.OpenBitSet; -public class ColumnCollection extends BitSet implements Comparable { +public class ColumnCollection extends OpenBitSet implements Comparable { private static final long serialVersionUID = -5256272139963505719L; private int formatStringWidth; - protected int numberOfColumns; + protected long numberOfColumns; protected int[] setBits; - public ColumnCollection(int numberOfColumns ) { + public ColumnCollection(long numberOfColumns ) { this.numberOfColumns = numberOfColumns; this.formatStringWidth = (int)Math.ceil(Math.log10(this.numberOfColumns)); } public int[] getSetBits() { - int[] setBits = new int[this.cardinality()]; + int[] setBits = new int[(int) this.cardinality()]; - int bitIndex = 0; + long bitIndex = 0; int currentArrayIndex = 0; while (bitIndex < this.numberOfColumns) { - int currentNextSetBit = this.nextSetBit(bitIndex); + long currentNextSetBit = this.nextSetBit(bitIndex); if (currentNextSetBit != -1) { - setBits[currentArrayIndex++] = currentNextSetBit; + setBits[currentArrayIndex++] = (int) currentNextSetBit; bitIndex = currentNextSetBit + 1; } else { bitIndex = this.numberOfColumns; + } } @@ -37,7 +38,7 @@ public boolean isAtomic() { return this.cardinality() == 1; } - public ColumnCollection addColumn(int columnIndex) { + public ColumnCollection addColumn(long columnIndex) { ColumnCollection copy = (ColumnCollection) this.clone(); copy.set(columnIndex); @@ -74,7 +75,7 @@ public ColumnCollection andNotCopy(ColumnCollection other) { public ColumnCollection removeCopy(ColumnCollection other) { ColumnCollection copy = (ColumnCollection)this.clone(); - copy.andNot(other); + copy.remove(other); return copy; } @@ -113,11 +114,11 @@ public ColumnCollection complement() { } public boolean isSubsetOf(ColumnCollection other) { - return this.unionCount(other) == other.cardinality(); + return ColumnCollection.unionCount(this, other) == other.cardinality(); } public boolean isSupersetOf(ColumnCollection other) { - return this.unionCount(other) == this.cardinality(); + return ColumnCollection.unionCount(this, other) == this.cardinality(); } @@ -125,7 +126,7 @@ public boolean isProperSubsetOf(ColumnCollection other) { long cardinality = this.cardinality(); long otherCardinality = other.cardinality(); if (cardinality != otherCardinality) { - if (this.unionCount(other) == otherCardinality) { + if (ColumnCollection.unionCount(this, other) == otherCardinality) { return true; } } @@ -137,35 +138,30 @@ public boolean isProperSupersetOf(ColumnCollection other) { long cardinality = this.cardinality(); long otherCardinality = other.cardinality(); if (cardinality != otherCardinality) { - if (this.unionCount(other) == cardinality) { + if (ColumnCollection.unionCount(this, other) == cardinality) { return true; } } return false; } - - public int unionCount(ColumnCollection other) { - ColumnCollection union = (ColumnCollection) this.clone(); - union.and(other); - return union.cardinality(); - } public boolean isSubsetOrSupersetOf(ColumnCollection other) { return isSubsetOf(other) || isSupersetOf(other); } - public int getNumberOfColumns() { + public long getNumberOfColumns() { return this.numberOfColumns; } public long getMostRightBit() { - int bitIndex = 0; + long bitIndex = 0; while (bitIndex < this.numberOfColumns) { - int currentNextSetBit = this.nextSetBit(bitIndex); + long currentNextSetBit = this.nextSetBit(bitIndex); if (currentNextSetBit != -1) { bitIndex = currentNextSetBit + 1; } else { return bitIndex - 1; + } } return bitIndex; @@ -178,8 +174,15 @@ public ColumnCollection removeColumnCopy(int columnIndex) { return copy; } + public ColumnCollection removeColumnCopy(long columnIndex) { + ColumnCollection copy = (ColumnCollection) this.clone(); + copy.clear(columnIndex); + + return copy; + } + @Override - public int compareTo(BitSet other) { + public int compareTo(OpenBitSet other) { ColumnCollection copy = (ColumnCollection) this.clone(); copy.xor(other); int lowestBit = copy.nextSetBit(0); @@ -195,8 +198,8 @@ public int compareTo(BitSet other) { public String toString() { StringBuilder outputBuilder = new StringBuilder(); if (this.cardinality() > 0) { - for (int columnIndex : this.getSetBits()) { - outputBuilder.append(String.format("%0" + formatStringWidth + "d,", Integer.valueOf(columnIndex))); + for (Integer columnIndex : this.getSetBits()) { + outputBuilder.append(String.format("%0" + formatStringWidth + "d,", columnIndex)); } } else { @@ -206,13 +209,4 @@ public String toString() { return outputBuilder.toString(); } - public void remove(ColumnCollection other) { - this.andNot(other); - } - - public static int intersectionCount(ColumnCollection set1, ColumnCollection set2) { - ColumnCollection intersection = (ColumnCollection) set1.clone(); - intersection.and(set2); - return intersection.cardinality(); - } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java index 9b5cda4..608327a 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java @@ -5,10 +5,13 @@ public class DifferenceSet extends ColumnCollection { private static final long serialVersionUID = -5174627424398542681L; + private long numberOfColumns; + public DifferenceSet(AgreeSet agreeSet) { super(agreeSet.getNumberOfColumns()); + this.numberOfColumns = agreeSet.getNumberOfColumns(); - this.or(agreeSet); + this.bits = agreeSet.getBits().clone(); this.flip(0, this.numberOfColumns); } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java index 06b713b..5cc2c71 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java @@ -7,7 +7,7 @@ public class Path extends ColumnCollection { private static final long serialVersionUID = -6451347203736964695L; - public Path(int numberOfColumns) { + public Path(long numberOfColumns) { super(numberOfColumns); } @@ -16,9 +16,10 @@ public ArrayList getMaximalSubsets() { if (this.isEmpty()) { return new ArrayList<>(); - } - for (int columnIndex : this.getSetBits()) { - maximalSubsetPaths.add((Path)this.removeColumnCopy(columnIndex)); + } else { + for (Integer columnIndex : this.getSetBits()) { + maximalSubsetPaths.add((Path)this.removeColumnCopy(columnIndex)); + } } return maximalSubsetPaths; diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java index 9fd3d05..9be0918 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java @@ -21,10 +21,12 @@ public int compareTo(Seed o) { if (this.distinctiveness != o.distinctiveness) { if (o.distinctiveness - this.distinctiveness < 0) { return -1; + } else { + return 1; } - return 1; + } else { + return this.indices.compareTo(o.indices); } - return this.indices.compareTo(o.indices); } @Override @@ -37,9 +39,10 @@ public boolean equals(Object o) { } if (!(o instanceof Seed)) { return false; + } else { + Seed otherSeed = (Seed) o; + return this.distinctiveness == otherSeed.distinctiveness && this.indices.compareTo(otherSeed.indices) == 0; } - Seed otherSeed = (Seed) o; - return this.distinctiveness == otherSeed.distinctiveness && this.indices.compareTo(otherSeed.indices) == 0; } public ColumnCollection getBaseIndices() { @@ -56,7 +59,7 @@ public int getAdditionalColumnIndex() { public String toString() { StringBuilder outputBuilder = new StringBuilder(); - outputBuilder.append(String.format("Seed: [%s]\t%f", this.indices, Double.valueOf(this.distinctiveness))); + outputBuilder.append(String.format("Seed: [%s]\t%f", this.indices, this.distinctiveness)); return outputBuilder.toString(); } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java index 5e3c2c0..77819d8 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java @@ -21,8 +21,9 @@ public EquivalenceGroupHashSet(int identifier) { public int compareTo(EquivalenceGroupHashSet o) { if (this.size() != o.size()) { return this.size() - o.size(); + } else { + return this.identifier - o.identifier; } - return this.identifier - o.identifier; } @Override @@ -45,6 +46,6 @@ public void add(int value) { this.identifier = value; } - super.add(Integer.valueOf(value)); + super.add(value); } } \ No newline at end of file diff --git a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java index ce33016..2dc6cce 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java @@ -42,7 +42,8 @@ public boolean add(int value) { public int compareTo(EquivalenceGroupTIntHashSet o) { if (this.size() != o.size()) { return this.size() - o.size(); - } - return this.identifier - o.identifier; + } else { + return this.identifier - o.identifier; + } } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTreeSet.java b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTreeSet.java index 687ac00..8a31ac2 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTreeSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTreeSet.java @@ -22,13 +22,14 @@ public EquivalenceGroupTreeSet(int identifier) { public int compareTo(EquivalenceGroupTreeSet o) { if (this.size() != o.size()) { return this.size() - o.size(); + } else { + return this.first() - o.first(); } - return this.first().intValue() - o.first().intValue(); } @Override public int getIdentifier() { - return this.first().intValue(); + return this.first(); } @Override @@ -46,6 +47,6 @@ public void add(int value) { this.identifier = value; } - super.add(Integer.valueOf(value)); + super.add(value); } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/CoverOrder.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/CoverOrder.java index ad00123..5316fcd 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/CoverOrder.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/CoverOrder.java @@ -43,7 +43,7 @@ public int compareTo(CoverOrder o) { public String toString() { StringBuilder outputBuilder = new StringBuilder(); - outputBuilder.append(String.format("[%s:%d]", Character.valueOf((char)(this.columnIndex + 65)), Integer.valueOf(this.appearances))); + outputBuilder.append(String.format("[%s:%d]", (char)(this.columnIndex+65), this.appearances)); return outputBuilder.toString(); } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClass.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClass.java index fa1d52a..81799ad 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClass.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClass.java @@ -13,7 +13,7 @@ public String toString() { StringBuilder outputBuilder = new StringBuilder(); for (Iterator it = this.iterator(); it.hasNext(); ) { Point identifier = it.next(); - outputBuilder.append(String.format("(%s,%d),", Character.valueOf((char)(identifier.x + 65)), Integer.valueOf(identifier.y))); + outputBuilder.append(String.format("(%s,%d),", (char)(identifier.x+65), identifier.y)); } return outputBuilder.toString(); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClasses.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClasses.java index 9859557..8182bde 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClasses.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClasses.java @@ -34,7 +34,7 @@ public String toString() { for (TIntObjectIterator it = this.iterator(); it.hasNext(); ) { it.advance(); - outputBuilder.append(String.format("ec(%d(\t", Integer.valueOf(it.key()))); + outputBuilder.append(String.format("ec(%d(\t", it.key())); outputBuilder.append(String.format("{%s}\n", it.value().toString())); } outputBuilder.append("EquivalenceClasses\n"); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/MaximalEquivalenceClasses.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/MaximalEquivalenceClasses.java index 36e44f4..769f8f2 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/MaximalEquivalenceClasses.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/MaximalEquivalenceClasses.java @@ -59,7 +59,7 @@ public MaximalEquivalenceClasses(StrippedPartitions strippedPartitions) throws O } } - for (int groupSize : equivalenceGroupsBySize.keys()) { + for (Integer groupSize : equivalenceGroupsBySize.keys()) { for (TEquivalence sizeGroup : equivalenceGroupsBySize.get(groupSize)) { maximumGroupSize = Math.max(groupSize, maximumGroupSize); this.add(sizeGroup); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java index 05ead74..11fec1d 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java @@ -17,20 +17,20 @@ public PartialOrder(DifferenceSets differenceSets) { for (DifferenceSet differenceSet : differenceSets) { // increase the cover count for set columns - int bitIndex = 0; + long bitIndex = 0; while (bitIndex < differenceSet.getNumberOfColumns()) { - int currentNextSetBit = differenceSet.nextSetBit(bitIndex); + long currentNextSetBit = differenceSet.nextSetBit(bitIndex); if (currentNextSetBit != -1) { bitIndex = currentNextSetBit + 1; - orderMap.putIfAbsent(currentNextSetBit, 0); - orderMap.increment(currentNextSetBit); + orderMap.putIfAbsent((int) currentNextSetBit, 0); + orderMap.increment((int) currentNextSetBit); } else { bitIndex = differenceSet.getNumberOfColumns(); } } } - for (int index : orderMap.keys()) { + for (Integer index : orderMap.keys()) { this.add(new CoverOrder(index, orderMap.get(index))); } @@ -38,25 +38,25 @@ public PartialOrder(DifferenceSets differenceSets) { } - public PartialOrder(DifferenceSets differenceSets, int columnIndexToSkip) { + public PartialOrder(DifferenceSets differenceSets, long columnIndexToSkip) { TIntIntHashMap orderMap = new TIntIntHashMap(); for (DifferenceSet differenceSet : differenceSets) { // increase the cover count for set columns - int bitIndex = columnIndexToSkip; + long bitIndex = columnIndexToSkip; while (bitIndex < differenceSet.getNumberOfColumns()) { - int currentNextSetBit = differenceSet.nextSetBit(bitIndex); + long currentNextSetBit = differenceSet.nextSetBit(bitIndex); if (currentNextSetBit != -1) { bitIndex = currentNextSetBit + 1; - orderMap.putIfAbsent(currentNextSetBit, 0); - orderMap.increment(currentNextSetBit); + orderMap.putIfAbsent((int) currentNextSetBit, 0); + orderMap.increment((int) currentNextSetBit); } else { bitIndex = differenceSet.getNumberOfColumns(); } } } - for (int index : orderMap.keys()) { + for (Integer index : orderMap.keys()) { this.add(new CoverOrder(index, orderMap.get(index))); } @@ -67,7 +67,7 @@ public PartialOrder(DifferenceSets differenceSets, int columnIndexToSkip) { public ArrayList getOrderedColumns() { ArrayList orderedColumns = new ArrayList<>(); for (CoverOrder order : this) { - orderedColumns.add(Integer.valueOf(order.getColumnIndex())); + orderedColumns.add(order.getColumnIndex()); } return orderedColumns; diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java index e8e89c9..884c216 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java @@ -32,7 +32,6 @@ public class FastFDs extends Miner { private FunctionalDependencies minimalDependencies; private DifferenceSets differenceSets; - @SuppressWarnings("unused") public static void main2(String[] args) { createColumDirectory(); createResultDirectory(); @@ -51,7 +50,7 @@ public static void main2(String[] args) { FastFDs fastFDRunner = new FastFDs(inputFileProcessor); fastFDRunner.run(); - System.out.println(String.format("Dependencies: %d.", Integer.valueOf(fastFDRunner.minimalDependencies.getCount()))); + System.out.println(String.format("Dependencies: %d.", fastFDRunner.minimalDependencies.getCount())); long timeFindFDs = System.currentTimeMillis(); System.out.println("Total time:\t" + (timeFindFDs - timeStart)/1000 + "s"); System.out.println(fastFDRunner.getDependencies()); @@ -81,10 +80,10 @@ public static void main(String[] args) { resultFile = cli.getOptionValue("result"); } if (cli.hasOption("columns")) { - numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue(); + numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")); } if (cli.hasOption("rows")) { - numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue(); + numberOfRows = Integer.valueOf(cli.getOptionValue("rows")); } ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows); long timeStart = System.currentTimeMillis(); @@ -100,25 +99,25 @@ public static void main(String[] args) { } private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { - String timeString = (time != -1)? String.format("%.1f", Double.valueOf((double)(time) / 1000)) : "-1"; + String timeString = (time != -1)? String.format("%.1f", (double)(time)/1000) : "-1"; StringBuilder outputBuilder = new StringBuilder(); if (!inputFileName.isEmpty()) { outputBuilder.append(String.format("%s\t", inputFileName)); } - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns))); + outputBuilder.append(String.format("%d\t", this.numberOfRows)); + outputBuilder.append(String.format("%d\t", this.numberOfColumns)); outputBuilder.append(String.format("%s\t", timeString)); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount()))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(0))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(0))); - outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory()))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCount())); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(2))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(3))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(4))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(5))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(6))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeGreaterThan(5))); + outputBuilder.append(String.format("%d\t", 0)); + outputBuilder.append(String.format("%d\t", 0)); + outputBuilder.append(String.format("%d\n", Runtime.getRuntime().totalMemory())); outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false))); try { @@ -175,7 +174,7 @@ public void run() throws OutOfMemoryError { if (orig.isEmpty()) { ColumnCollection lhs = new ColumnCollection(this.numberOfColumns); - for (int lhsIndex : lhs.setCopy(rhsIndex).complement().getSetBits()) { + for (Integer lhsIndex : lhs.setCopy(rhsIndex).complement().getSetBits()) { this.minimalDependencies.addRHSColumn(lhs.setCopy(lhsIndex), rhsIndex); } } @@ -203,7 +202,7 @@ public void findCovers(int columnIndex, DifferenceSets orig, DifferenceSets unco } // RECURSIVE CASE - for (int remainingColumn : currentOrder.getOrderedColumns()) { + for (Integer remainingColumn : currentOrder.getOrderedColumns()) { DifferenceSets nextDifferenceSets = uncovered.removeCovered(remainingColumn); PartialOrder nextOrder = new PartialOrder(nextDifferenceSets, remainingColumn); Path nextPath = (Path) currentPath.addColumn(remainingColumn); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java b/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java index 43d0172..89d0178 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java @@ -100,7 +100,7 @@ public static void main(String[] args) { } if (cmdLine.contains("timeout")) { System.out.println(String.format("Timeout:%s", cmdLine.get("timeout"))); - timeout = Integer.valueOf(cmdLine.get("timeout")).intValue(); + timeout = Integer.valueOf(cmdLine.get("timeout")); } if (cmdLine.containsKey("all")) { System.out.println("Use all files."); @@ -182,7 +182,7 @@ public static void main(String[] args) { resultHandler.waitFor(timeout); long timeEnd = System.currentTimeMillis(); - System.out.println(String.format("Time:%.1f", Double.valueOf((double)(timeEnd - timeStart) / 1000))); + System.out.println(String.format("Time:%.1f", (double)(timeEnd - timeStart)/1000)); int exitCode = 0; if (resultHandler.hasResult()) { @@ -197,7 +197,7 @@ public static void main(String[] args) { executor.getWatchdog().destroyProcess(); } else { } - System.out.println(String.format("ExitCode %d", Integer.valueOf(exitCode))); + System.out.println(String.format("ExitCode %d", exitCode)); if (exitCode == Miner.STATUS_OK) { } else if (exitCode == Miner.STATUS_OOT || exitCode == Miner.STATUS_OOM) { diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java b/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java index 98af5aa..3659509 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java @@ -58,6 +58,6 @@ public boolean accept(File file) { } private final String getColumnFileName(final int columnIndex) { - return String.format(this.formatString, Integer.valueOf(columnIndex)); + return String.format(this.formatString, columnIndex); } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java b/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java index dbf1c4e..676454e 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java @@ -42,7 +42,7 @@ public void minimize(int rhsIndex) { if (lhsForRhsToDelete.contains(lhs)) { ColumnCollection rhs = this.get(lhs); this.put(lhs, rhs.removeColumnCopy(rhsIndex)); - System.out.println(String.format("Remove %s->%s", lhs, Character.valueOf((char) (rhsIndex + 65)))); + System.out.println(String.format("Remove %s->%s", lhs, (char)(rhsIndex + 65))); } } } @@ -85,7 +85,7 @@ public int getNumberOfNonAtomicDependencies() { return nonAtomicFDCount; } - public void addRHSColumn(ColumnCollection lhs, int rhsIndex) { + public void addRHSColumn(ColumnCollection lhs, Integer rhsIndex) { ColumnCollection rhs = null; if (!this.containsKey(lhs)) { rhs = new ColumnCollection(lhs.getNumberOfColumns()); @@ -140,11 +140,11 @@ public String toString() { StringBuilder outputBuilder = new StringBuilder(); for (ColumnCollection determining : this.keySet()) { - for (int dependentColumn : this.get(determining).getSetBits()) { - for (int determiningColumn : determining.getSetBits()) { - outputBuilder.append(String.format("c%04d\t", Integer.valueOf(determiningColumn))); + for (Integer dependentColumn : this.get(determining).getSetBits()) { + for (Integer determiningColumn : determining.getSetBits()) { + outputBuilder.append(String.format("c%04d\t", determiningColumn)); } - outputBuilder.append(String.format("->\tc%04d\n", Integer.valueOf(dependentColumn))); + outputBuilder.append(String.format("->\tc%04d\n", dependentColumn)); } } return outputBuilder.toString(); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/Miner.java b/dfd/dfdAlgorithm/src/fdiscovery/general/Miner.java index 1fd9214..c759691 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/general/Miner.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/general/Miner.java @@ -20,7 +20,7 @@ public static String humanReadableByteCount(long bytes, boolean si) { if (bytes < unit) return bytes + " B"; int exp = (int) (Math.log(bytes) / Math.log(unit)); String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp-1) + (si ? "" : "i"); - return String.format("%.1f %sB", Double.valueOf(bytes / Math.pow(unit, exp)), pre); + return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre); } protected static final void createColumDirectory() { diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/JoinedPartitions.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/JoinedPartitions.java index 2cb9218..1cbfff1 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/JoinedPartitions.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/JoinedPartitions.java @@ -37,7 +37,7 @@ public Partition getAtomicPartition(int columnIndex) { public ArrayList getBestMatchingPartitionsLazy(ColumnCollection path) { ArrayList bestMatchingPartitions = new ArrayList<>(); - for (int columnIndex : path.getSetBits()) { + for (Integer columnIndex : path.getSetBits()) { bestMatchingPartitions.add(this.getAtomicPartition(columnIndex)); } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/MemoryManagedJoinedPartitions.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/MemoryManagedJoinedPartitions.java index 4ce8f42..cca75ee 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/MemoryManagedJoinedPartitions.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/MemoryManagedJoinedPartitions.java @@ -63,7 +63,6 @@ public int getCount() { return cumulatedCount; } - @SuppressWarnings("unused") public Partition get(ColumnCollection key) { Partition result = this.get(key.cardinality()).get(key); if (USE_MEMORY_MANAGEMENT && result != null) { diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java index ecb4d79..fae9c2e 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java @@ -61,8 +61,9 @@ private void resetProbeTable() { public int compareTo(Partition o) { if (this.getDistinctiveness() == o.getDistinctiveness()) { return this.indices.compareTo(o.indices); + } else { + return Double.valueOf(this.getDistinctiveness()).compareTo(o.getDistinctiveness()); } - return Double.valueOf(this.getDistinctiveness()).compareTo(Double.valueOf(o.getDistinctiveness())); } public int getNumberOfRows() { @@ -134,12 +135,13 @@ public boolean equals(Partition other) { resetProbeTable(); if (numberOfValues == 0) { return true; + } else { + return false; } - return false; } public String printIndices() { - StringBuilder outputBuilder = new StringBuilder(this.indices.size()); + StringBuilder outputBuilder = new StringBuilder((int)this.indices.size()); for (int i=0; i < this.indices.size(); i++) { if (this.indices.get(i)) { diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java index a3462fe..98278ec 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java @@ -25,12 +25,12 @@ public String getStatistics() { for (TLongObjectIterator>> statsByLevelIt = statsAndCountsByLevel.iterator(); statsByLevelIt.hasNext(); ) { statsByLevelIt.advance(); long levelCardinality = statsByLevelIt.key(); - statisticsBuilder.append(String.format("%d attributes {\n", Long.valueOf(levelCardinality))); + statisticsBuilder.append(String.format("%d attributes {\n", levelCardinality)); for (TIntObjectIterator> countByLevelIt = statsByLevelIt.value().iterator(); countByLevelIt.hasNext(); ) { countByLevelIt.advance(); int usageCount = countByLevelIt.key(); int numberOfElements = countByLevelIt.value().size(); - statisticsBuilder.append(String.format("\t%d elements used %d times\n", Integer.valueOf(numberOfElements), Integer.valueOf(usageCount))); + statisticsBuilder.append(String.format("\t%d elements used %d times\n", numberOfElements, usageCount)); } statisticsBuilder.append("}\n"); } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/ProbeTable.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/ProbeTable.java index bbcb762..c570849 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/ProbeTable.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/ProbeTable.java @@ -23,8 +23,8 @@ public ProbeTable(Partition partition) { public String toString() { StringBuilder outputBuilder = new StringBuilder(); outputBuilder.append("ProbeTable:\n"); - for (int key : this.keys()) { - outputBuilder.append(String.format("%d\t->\t%d\n", Integer.valueOf(key), Integer.valueOf(this.get(key)))); + for (Integer key : this.keys()) { + outputBuilder.append(String.format("%d\t->\t%d\n", key, this.get(key))); } return outputBuilder.toString(); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java index 8c527a5..d337678 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java @@ -19,7 +19,6 @@ public StrippedPartition() { } - @SuppressWarnings("unused") public StrippedPartition(StrippedPartition base, StrippedPartition additional) { } @@ -36,12 +35,12 @@ public StrippedPartition(String[] columnContent) { valueToIndex.put(value, rowIndex); TEquivalence equivalenceGroup = new EquivalenceGroupTIntHashSet(); equivalenceGroup.add(rowIndex); - helpMap.put(Integer.valueOf(rowIndex), equivalenceGroup); + helpMap.put(rowIndex, equivalenceGroup); } // otherwise find the right equivalence class and add the current element index else { int equivalenceGroupIndex = valueToIndex.get(value); - TEquivalence equivalenceClass = helpMap.get(Integer.valueOf(equivalenceGroupIndex)); + TEquivalence equivalenceClass = helpMap.get(equivalenceGroupIndex); equivalenceClass.add(rowIndex); } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/Holes.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/Holes.java index 3ed16c1..d17c648 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/Holes.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/Holes.java @@ -17,8 +17,9 @@ public class Holes extends TreeSet { public int compare(ColumnCollection o1, ColumnCollection o2) { if (o1.isProperSupersetOf(o2)) { return 0; + } else { + return o1.compareTo(o2); } - return o1.compareTo(o2); } }; diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java index d354e37..da3b43c 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java @@ -11,7 +11,7 @@ public class Observations extends HashMap { private static final long serialVersionUID = 2932117192054503664L; public ColumnCollection getUncheckedMaximalSubset(ColumnCollection lhs) { - for (int columnIndex : lhs.getSetBits()) { + for (Integer columnIndex : lhs.getSetBits()) { ColumnCollection subsetIndices = lhs.removeColumnCopy(columnIndex); if (!this.containsKey(subsetIndices)) { return subsetIndices; @@ -24,7 +24,7 @@ public THashSet getUncheckedMaximalSubsets(ColumnCollection lh THashSet uncheckedMaximalSubsets = new THashSet<>(); // if (lhs.cardinality() > 2) { - for (int columnIndex : order.getOrderHighDistinctCount(lhs)) { + for (Integer columnIndex : order.getOrderHighDistinctCount(lhs)) { ColumnCollection subsetIndices = lhs.removeColumnCopy(columnIndex); if (!this.containsKey(subsetIndices)) { uncheckedMaximalSubsets.add(subsetIndices); @@ -39,7 +39,7 @@ public THashSet getUncheckedOrCandidateMaximalSubsets(ColumnCo // we only want to check subsets with at least 2 columns if (lhs.cardinality() > 2) { - for (int columnIndex : order.getOrderHighDistinctCount(lhs)) { + for (Integer columnIndex : order.getOrderHighDistinctCount(lhs)) { ColumnCollection subsetIndices = lhs.removeColumnCopy(columnIndex); if (!this.containsKey(subsetIndices) || this.get(subsetIndices) == Observation.CANDIDATE_MINIMAL_DEPENDENCY) { uncheckedMaximalSubsets.add(subsetIndices); @@ -54,7 +54,7 @@ public THashSet getMaximalSubsets(ColumnCollection lhs, Column // we only want to check subsets with at least 2 columns if (lhs.cardinality() > 2) { - for (int columnIndex : order.getOrderHighDistinctCount(lhs)) { + for (Integer columnIndex : order.getOrderHighDistinctCount(lhs)) { ColumnCollection subsetIndices = lhs.removeColumnCopy(columnIndex); uncheckedMaximalSubsets.add(subsetIndices); } @@ -63,7 +63,7 @@ public THashSet getMaximalSubsets(ColumnCollection lhs, Column } public ColumnCollection getUncheckedMinimalSuperset(ColumnCollection lhs, int rhsIndex) { - for (int columnIndex : lhs.setCopy(rhsIndex).complement().getSetBits()) { + for (Integer columnIndex : lhs.setCopy(rhsIndex).complement().getSetBits()) { ColumnCollection supersetIndices = lhs.setCopy(columnIndex); if (!this.containsKey(supersetIndices)) { return supersetIndices; @@ -75,7 +75,7 @@ public ColumnCollection getUncheckedMinimalSuperset(ColumnCollection lhs, int rh public THashSet getUncheckedOrCandidateMinimalSupersets(ColumnCollection lhs, int rhsIndex, ColumnOrder order) { THashSet uncheckedMinimalSupersets = new THashSet<>(); - for (int columnIndex : order.getOrderLowDistinctCount(lhs.setCopy(rhsIndex).complement())) { + for (Integer columnIndex : order.getOrderLowDistinctCount(lhs.setCopy(rhsIndex).complement())) { ColumnCollection supersetIndices = lhs.setCopy(columnIndex); if (!this.containsKey(supersetIndices) || this.get(supersetIndices) == Observation.CANDIDATE_MAXIMAL_NON_DEPENDENCY) { uncheckedMinimalSupersets.add(supersetIndices); @@ -87,7 +87,7 @@ public THashSet getUncheckedOrCandidateMinimalSupersets(Column public THashSet getUncheckedMinimalSupersets(ColumnCollection lhs, int rhsIndex, ColumnOrder order) { THashSet uncheckedMinimalSupersets = new THashSet<>(); - for (int columnIndex : order.getOrderLowDistinctCount(lhs.setCopy(rhsIndex).complement())) { + for (Integer columnIndex : order.getOrderLowDistinctCount(lhs.setCopy(rhsIndex).complement())) { ColumnCollection supersetIndices = lhs.setCopy(columnIndex); if (!this.containsKey(supersetIndices)) { uncheckedMinimalSupersets.add(supersetIndices); @@ -99,7 +99,7 @@ public THashSet getUncheckedMinimalSupersets(ColumnCollection public THashSet getMinimalSupersets(ColumnCollection lhs, int rhsIndex, ColumnOrder order) { THashSet uncheckedMinimalSupersets = new THashSet<>(); - for (int columnIndex : order.getOrderLowDistinctCount(lhs.setCopy(rhsIndex).complement())) { + for (Integer columnIndex : order.getOrderLowDistinctCount(lhs.setCopy(rhsIndex).complement())) { ColumnCollection supersetIndices = lhs.setCopy(columnIndex); uncheckedMinimalSupersets.add(supersetIndices); } @@ -109,7 +109,7 @@ public THashSet getMinimalSupersets(ColumnCollection lhs, int public Observation updateDependencyType(ColumnCollection lhs) { if (lhs.cardinality() > 1) { boolean foundUncheckedSubset = false; - for (int columnIndex : lhs.getSetBits()) { + for (Integer columnIndex : lhs.getSetBits()) { Observation observationOfSubset = this.get(lhs.removeColumnCopy(columnIndex)); if (observationOfSubset == null) { foundUncheckedSubset = true; @@ -126,7 +126,7 @@ public Observation updateDependencyType(ColumnCollection lhs) { public Observation updateNonDependencyType(ColumnCollection lhs, int rhsIndex) { boolean foundUncheckedSuperset = false; - for (int columnIndex : lhs.setCopy(rhsIndex).complementCopy().getSetBits()) { + for (Integer columnIndex : lhs.setCopy(rhsIndex).complementCopy().getSetBits()) { Observation observationOfSuperset = this.get(lhs.setCopy(columnIndex)); if (observationOfSuperset == null) { foundUncheckedSuperset = true; diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java index 1a11432..26eaf79 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java @@ -22,6 +22,7 @@ public static ColumnCollection getNotPrunedKey(Dependencies dependencies, NonDep for (ColumnCollection candidate : candidates) { if (!dependencies.isRepresented(candidate) && !nonDependencies.isRepresented(candidate)) { return candidate; + } else { } } return null; @@ -46,7 +47,7 @@ public void rebalance() { @Override public void rebalanceGroup(ColumnCollection groupKey) { HashSet depsOfGroup = this.get(groupKey); - for (int columnIndex : groupKey.complementCopy().getSetBits()) { + for (Integer columnIndex : groupKey.complementCopy().getSetBits()) { ColumnCollection newKey = groupKey.setCopy(columnIndex); HashSet newGroup = new HashSet(); this.put(newKey, newGroup); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java index cf045e2..987c5c4 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java @@ -6,7 +6,7 @@ import fdiscovery.columns.ColumnCollection; // from rhs to lhs -public abstract class PruneTable extends HashMap>> { +public abstract class PruneTable extends HashMap>> { private static final long serialVersionUID = 4470955427882698208L; @@ -23,14 +23,14 @@ public int getCount(ColumnCollection RHS) { public void addValue(ColumnCollection RHS, ColumnCollection LHS) { if (!this.containsKey(RHS)) { - this.put(RHS, new HashMap>()); + this.put(RHS, new HashMap>()); } - if (!this.get(RHS).containsKey(Integer.valueOf(LHS.cardinality()))) { - this.get(RHS).put(Integer.valueOf(LHS.cardinality()), new ArrayList()); + if (!this.get(RHS).containsKey(LHS.cardinality())) { + this.get(RHS).put(LHS.cardinality(), new ArrayList()); } // System.out.println(this.get(RHS)); // System.out.println(String.format("Column:\t%s\t%d", LHS, LHS.cardinality())); - ArrayList dependencies = this.get(RHS).get(Integer.valueOf(LHS.cardinality())); + ArrayList dependencies = this.get(RHS).get(LHS.cardinality()); // System.out.println(dependencies); dependencies.add(LHS); } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/tane/AprioriGeneration.java b/dfd/dfdAlgorithm/src/fdiscovery/tane/AprioriGeneration.java index 7d74b08..50e4c20 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/tane/AprioriGeneration.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/tane/AprioriGeneration.java @@ -100,9 +100,10 @@ public static boolean haveCommonPrefixBlock(ColumnCollection x, ColumnCollection // System.out.println("true"); // System.out.println("---------------------------"); return true; + } else { +// System.out.println("false"); +// System.out.println("---------------------------"); + return false; } -// System.out.println("false"); -// System.out.println("---------------------------"); - return false; } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java b/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java index 0909653..fd8cc1f 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java @@ -44,7 +44,6 @@ public FunctionalDependencies getDependencies() { return this.minimalDependencies; } - @SuppressWarnings("unused") public static void main2(String[] args) { createColumDirectory(); createResultDirectory(); @@ -64,7 +63,7 @@ public static void main2(String[] args) { Tane taneRunner = new Tane(inputFileProcessor); taneRunner.run(); - System.out.println(String.format("Number of dependencies:\t%d", Integer.valueOf(taneRunner.minimalDependencies.getCount())));; + System.out.println(String.format("Number of dependencies:\t%d", taneRunner.minimalDependencies.getCount()));; long timeFindFDs = System.currentTimeMillis(); System.out.println("Total time:\t" + (timeFindFDs - timeStart)/1000 + "s"); System.out.println(taneRunner.getDependencies()); @@ -74,6 +73,7 @@ public static void main2(String[] args) { } catch (IOException e) { System.out.println("The input reader could not be reset."); } + } public static void main(String[] args) { @@ -95,10 +95,10 @@ public static void main(String[] args) { resultFile = cli.getOptionValue("result"); } if (cli.hasOption("columns")) { - numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue(); + numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")); } if (cli.hasOption("rows")) { - numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue(); + numberOfRows = Integer.valueOf(cli.getOptionValue("rows")); } ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows); long timeStart = System.currentTimeMillis(); @@ -114,25 +114,25 @@ public static void main(String[] args) { } private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { - String timeString = (time != -1)? String.format("%.1f", Double.valueOf((double)(time) / 1000)) : "-1"; + String timeString = (time != -1)? String.format("%.1f", (double)(time)/1000) : "-1"; StringBuilder outputBuilder = new StringBuilder(); if (!inputFileName.isEmpty()) { outputBuilder.append(String.format("%s\t", inputFileName)); } - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns))); + outputBuilder.append(String.format("%d\t", this.numberOfRows)); + outputBuilder.append(String.format("%d\t", this.numberOfColumns)); outputBuilder.append(String.format("%s\t", timeString)); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount()))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.strippedPartitions.size()))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.strippedPartitions.size()))); - outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory()))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCount())); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(2))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(3))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(4))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(5))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(6))); + outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeGreaterThan(5))); + outputBuilder.append(String.format("%d\t", this.strippedPartitions.size())); + outputBuilder.append(String.format("%d\t", this.strippedPartitions.size())); + outputBuilder.append(String.format("%d\n", Runtime.getRuntime().totalMemory())); outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false))); try { @@ -218,7 +218,7 @@ private CollectionSet generateNextLevel(CollectionSet currentLevel) { } for (ColumnCollection x : currentLevel) { - for (int a : x.andCopy(cPlus.get(x)).getSetBits()) { - boolean isDependency = isValidDependency(x.clearCopy(a), Integer.valueOf(a)); + for (Integer a : x.andCopy(cPlus.get(x)).getSetBits()) { + boolean isDependency = isValidDependency(x.clearCopy(a), a); if (isDependency) { minimalDependencies.addRHSColumn(x.clearCopy(a), a); cPlus.get(x).clear(a); - for (int B : rSet.removeCopy(x).getSetBits()) { + for (Integer B : rSet.removeCopy(x).getSetBits()) { cPlus.get(x).clear(B); } } @@ -268,7 +268,7 @@ private ColumnCollection addCPlusOfX(ColumnCollection x) { } else { cPlusOfX = (ColumnCollection) cPlusOfX.clone(); } - for (int a : x.getSetBits()) { + for (Integer a : x.getSetBits()) { ColumnCollection nextCPlusOfX = cPlus.get(x.clearCopy(a)); if (nextCPlusOfX == null) { @@ -302,7 +302,7 @@ private void prune(CollectionSet currentLevel) { boolean isSuperKey = isSuperKey(x); if (isSuperKey) { - for (int a : cPlus.get(x).removeCopy(x).getSetBits()) { + for (Integer a : cPlus.get(x).removeCopy(x).getSetBits()) { ColumnCollection firstCPlusCandidatesKey = x.setCopy(a).clearCopy(x.nextSetBit(0)); ColumnCollection firstCPlusCandidates = cPlus.get(firstCPlusCandidatesKey); if (firstCPlusCandidates == null) { @@ -310,7 +310,7 @@ private void prune(CollectionSet currentLevel) { } else { firstCPlusCandidates = (ColumnCollection) firstCPlusCandidates.clone(); } - for (int b : x.getSetBits()) { + for (Integer b : x.getSetBits()) { ColumnCollection nextCPlusCandidates = cPlus.get(x.setCopy(a).clearCopy(b)); if (nextCPlusCandidates == null) { @@ -376,7 +376,7 @@ private boolean isValidDependency(ColumnCollection LHS, Integer RHS) { return false; } - return (this.error(strippedPartitions.get(LHS), strippedPartitions.get(LHS.setCopy(RHS.intValue()))) == 0); + return (this.error(strippedPartitions.get(LHS), strippedPartitions.get(LHS.setCopy(RHS))) == 0); } public StrippedPartition strippedProduct(StrippedPartition yPartition, StrippedPartition zPartition) { @@ -397,7 +397,7 @@ public StrippedPartition strippedProduct(StrippedPartition yPartition, StrippedP T[tValue] = i; } - S.put(Integer.valueOf(i), new EquivalenceGroupTIntHashSet()); + S.put(i, new EquivalenceGroupTIntHashSet()); i++; } @@ -405,17 +405,17 @@ public StrippedPartition strippedProduct(StrippedPartition yPartition, StrippedP for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { int tValue = tIt.next(); if (T[tValue] != -1) { - TEquivalence sOld = S.get(Integer.valueOf(T[tValue])); + TEquivalence sOld = S.get(T[tValue]); sOld.add(tValue); } } for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { int tValue = tIt.next(); - TEquivalence s = S.get(Integer.valueOf(T[tValue])); + TEquivalence s = S.get(T[tValue]); if (s != null && s.size() > 1) { xPartition.add(s); } - S.put(Integer.valueOf(T[tValue]), new EquivalenceGroupTIntHashSet()); + S.put(T[tValue], new EquivalenceGroupTIntHashSet()); } } i = 1; diff --git a/dfd/dfdMetanome/pom.xml b/dfd/dfdMetanome/pom.xml index 4ab019d..9c50517 100644 --- a/dfd/dfdMetanome/pom.xml +++ b/dfd/dfdMetanome/pom.xml @@ -4,16 +4,17 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - DFD + de.metanome.algorithms.dfd + dfdMetanome jar - DFD + dfdMetanome - de.metanome.algorithms.dfd - DFDModules - 1.2-SNAPSHOT - ../pom.xml + de.metanome.algorithms + algorithms + ${metanome.version} + ../../pom.xml @@ -29,10 +30,10 @@ org.apache.maven.plugins maven-compiler-plugin - ${maven-compiler-plugin.version} + 3.1 - 1.8 - 1.8 + 1.7 + 1.7 true true -Xlint:all @@ -41,12 +42,12 @@ org.apache.maven.plugins maven-assembly-plugin - ${maven-assembly-plugin.version} + 2.4 - de.metanome.algorithms.dfd.DFDMetanome + de.metanome.algorithms.dfd.dfdMetanome.DFDMetanome @@ -72,12 +73,16 @@ de.metanome algorithm_integration + ${metanome.version} + compile de.metanome.algorithms.dfd - DFDAlgorithm - 1.2-SNAPSHOT + dfdAlgorithm + ${metanome.version} + compile + - + \ No newline at end of file diff --git a/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/DFDMetanome.java b/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java similarity index 81% rename from dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/DFDMetanome.java rename to dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java index 1c8fd82..37d364c 100644 --- a/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/DFDMetanome.java +++ b/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java @@ -1,9 +1,4 @@ -package de.metanome.algorithms.dfd; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; +package de.metanome.algorithms.dfd.dfdMetanome; import de.metanome.algorithm_integration.AlgorithmConfigurationException; import de.metanome.algorithm_integration.AlgorithmExecutionException; @@ -14,9 +9,13 @@ import de.metanome.algorithm_integration.configuration.ConfigurationRequirement; import de.metanome.algorithm_integration.configuration.ConfigurationRequirementFileInput; import de.metanome.algorithm_integration.input.FileInputGenerator; -import de.metanome.algorithm_integration.input.RelationalInput; import de.metanome.algorithm_integration.result_receiver.FunctionalDependencyResultReceiver; import de.metanome.algorithm_integration.results.FunctionalDependency; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; + import fdiscovery.approach.runner.DFDMiner; import fdiscovery.columns.ColumnCollection; import fdiscovery.general.FunctionalDependencies; @@ -71,26 +70,21 @@ public void execute() throws AlgorithmExecutionException { DFDMiner dfdMiner = new DFDMiner(inputFileProcessor); dfdMiner.run(); FunctionalDependencies fds = dfdMiner.getDependencies(); - - RelationalInput input = fileInput.generateNewCopy(); - String relationName = input.relationName(); - List columnNames = input.columnNames(); - for (ColumnCollection determining : fds.keySet()) { - for (int dependentColumn : fds.get(determining).getSetBits()) { + for (Integer dependentColumn : fds.get(determining).getSetBits()) { ColumnIdentifier[] determiningColumns = new ColumnIdentifier[determining.getSetBits().length]; int i = 0; - for (int determiningColumn : determining.getSetBits()) { + for (Integer determiningColumn : determining.getSetBits()) { determiningColumns[i] = - new ColumnIdentifier(relationName, columnNames.get(determiningColumn)); + new ColumnIdentifier(this.identifier, "Column " + determiningColumn); i++; } FunctionalDependency fd = new FunctionalDependency( new ColumnCombination(determiningColumns), - new ColumnIdentifier(relationName, columnNames.get(dependentColumn))); + new ColumnIdentifier(this.identifier, "Column " + dependentColumn)); this.resultReceiver.receiveResult(fd); } } @@ -98,14 +92,4 @@ public void execute() throws AlgorithmExecutionException { } } - @Override - public String getAuthors() { - return "Patrick Schulze"; - } - - @Override - public String getDescription() { - return "Random Walk-based FD discovery"; - } - } diff --git a/dfd/pom.xml b/dfd/pom.xml index e92f1a9..39dce46 100644 --- a/dfd/pom.xml +++ b/dfd/pom.xml @@ -5,10 +5,10 @@ 4.0.0 de.metanome.algorithms.dfd - DFDModules + dfd pom - DFDModules + dfd dfdAlgorithm @@ -16,14 +16,15 @@ + 0.0.2-SNAPSHOT UTF-8 de.metanome.algorithms algorithms - 1.2-SNAPSHOT + ${metanome.version} ../pom.xml - + \ No newline at end of file From df63ae8a5876766aee54bad6b5ab1f6dcc87aac7 Mon Sep 17 00:00:00 2001 From: Torben Eims Date: Mon, 15 May 2023 15:59:04 +0200 Subject: [PATCH 02/10] Integer types changes int vs Integer --- .../fdiscovery/approach/runner/DFDMiner.java | 1068 ++++++++--------- .../fdiscovery/columns/ColumnCollection.java | 424 +++---- .../src/fdiscovery/columns/Path.java | 53 +- .../equivalence/EquivalenceGroupHashSet.java | 100 +- .../equivalence/EquivalenceGroupTreeSet.java | 7 +- .../fdiscovery/fastfds/EquivalenceClass.java | 2 +- .../fastfds/EquivalenceClasses.java | 2 +- .../fastfds/MaximalEquivalenceClasses.java | 2 +- .../src/fdiscovery/fastfds/PartialOrder.java | 150 +-- .../fdiscovery/fastfds/runner/FastFDs.java | 436 +++---- .../src/fdiscovery/general/Benchmarker.java | 438 +++---- .../src/fdiscovery/general/ColumnFiles.java | 2 +- .../partitions/JoinedPartitions.java | 2 +- .../src/fdiscovery/partitions/Partition.java | 346 +++--- .../partitions/PartitionStatistics.java | 80 +- .../src/fdiscovery/partitions/ProbeTable.java | 4 +- .../partitions/StrippedPartition.java | 154 +-- .../src/fdiscovery/pruning/Observations.java | 20 +- .../src/fdiscovery/pruning/PruneHashSet.java | 142 +-- .../src/fdiscovery/pruning/PruneTable.java | 74 +- .../src/fdiscovery/tane/runner/Tane.java | 862 ++++++------- .../dfd/dfdMetanome/DFDMetanome.java | 4 +- 22 files changed, 2185 insertions(+), 2187 deletions(-) diff --git a/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java b/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java index 8f52dfb..c666d1b 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java @@ -1,535 +1,535 @@ -package fdiscovery.approach.runner; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Stack; - -import org.apache.commons.cli.CommandLine; - - -import fdiscovery.approach.ColumnOrder; -import fdiscovery.columns.ColumnCollection; -import fdiscovery.general.CLIParserMiner; -import fdiscovery.general.ColumnFiles; -import fdiscovery.general.FunctionalDependencies; -import fdiscovery.general.Miner; -import fdiscovery.partitions.ComposedPartition; -import fdiscovery.partitions.FileBasedPartition; -import fdiscovery.partitions.FileBasedPartitions; -import fdiscovery.partitions.MemoryManagedJoinedPartitions; -import fdiscovery.partitions.Partition; -import fdiscovery.preprocessing.SVFileProcessor; -import fdiscovery.pruning.Dependencies; -import fdiscovery.pruning.NonDependencies; -import fdiscovery.pruning.Observation; -import fdiscovery.pruning.Observations; -import fdiscovery.pruning.Seed; -import gnu.trove.map.hash.TLongObjectHashMap; -import gnu.trove.set.hash.THashSet; - -public class DFDMiner extends Miner implements Runnable { - - private int numberOfColumns; - private int numberOfRows; - private ColumnOrder columnOrder; - private Stack trace; - private Stack seeds; - private Observations observations; - private FunctionalDependencies minimalDependencies; - private FunctionalDependencies maximalNonDependencies; - private FileBasedPartitions fileBasedPartitions; - private Dependencies dependencies; - private NonDependencies nonDependencies; - private MemoryManagedJoinedPartitions joinedPartitions; - - public static void main(String[] args) { - createColumDirectory(); - - File source = new File(DFDMiner.input); - SVFileProcessor inputFileProcessor = null; - try { - long timeStart = System.currentTimeMillis(); - - inputFileProcessor = new SVFileProcessor(source); - inputFileProcessor.init(); - System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter()); - System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns()); - System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows()); - inputFileProcessor.createColumnFiles(); - DFDMiner dfdRunner = new DFDMiner(inputFileProcessor); - - dfdRunner.run(); - System.out.println(String.format("Number of dependencies:\t%d", dfdRunner.minimalDependencies.getCount())); - long timeFindFDs = System.currentTimeMillis(); - System.out.println("Total time:\t" + (timeFindFDs - timeStart) / 1000 + "s"); - System.out.println(dfdRunner.getDependencies()); - - } catch (FileNotFoundException e) { - System.out.println("The input file could not be found."); - } catch (IOException e) { - System.out.println("The input reader could not be reset."); - } - } - - public static void main2(String[] args) { - CLIParserMiner parser = new CLIParserMiner(); - CommandLine cli = parser.parse(args); - String inputFilename = new String(); - String columnFileDirectory = new String(); - String resultFile = new String(); - int numberOfColumns = 0; - int numberOfRows = 0; - - if (cli.hasOption("file")) { - inputFilename = cli.getOptionValue("file"); - } - if (cli.hasOption("input")) { - columnFileDirectory = cli.getOptionValue("input"); - } - if (cli.hasOption("result")) { - resultFile = cli.getOptionValue("result"); - } - if (cli.hasOption("columns")) { - numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")); - } - if (cli.hasOption("rows")) { - numberOfRows = Integer.valueOf(cli.getOptionValue("rows")); - } - ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows); - long timeStart = System.currentTimeMillis(); - DFDMiner runner = new DFDMiner(columnFiles, numberOfRows); - try { - runner.run(); - long timeEnd = System.currentTimeMillis(); - runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename); - } catch (OutOfMemoryError e) { - System.exit(Miner.STATUS_OOM); - } - System.exit(0); - } - - private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { - - String timeString = (time != -1) ? String.format("%.1f", (double) (time) / 1000) : "-1"; - StringBuilder outputBuilder = new StringBuilder(); - if (!inputFileName.isEmpty()) { - outputBuilder.append(String.format("%s\t", inputFileName)); - } - outputBuilder.append(String.format("%d\t", this.numberOfRows)); - outputBuilder.append(String.format("%d\t", this.numberOfColumns)); - outputBuilder.append(String.format("%s\t", timeString)); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCount())); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(2))); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(3))); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(4))); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(5))); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(6))); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeGreaterThan(5))); - outputBuilder.append(String.format("%d\t", this.joinedPartitions.getCount())); - outputBuilder.append(String.format("%d\t", this.joinedPartitions.getTotalCount())); - outputBuilder.append(String.format("%d\n", Runtime.getRuntime().totalMemory())); - outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false))); - - try { - BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true)); - resultFileWriter.write(outputBuilder.toString()); - System.out.print(outputBuilder.toString()); - resultFileWriter.close(); - } catch (IOException e) { - System.out.println("Couldn't write output."); - } - } - - public DFDMiner(SVFileProcessor table) throws OutOfMemoryError { - this.observations = new Observations(); - this.numberOfColumns = table.getNumberOfColumns(); - this.numberOfRows = table.getNumberOfRows(); - this.trace = new Stack<>(); - this.seeds = new Stack<>(); - this.minimalDependencies = new FunctionalDependencies(); - this.maximalNonDependencies = new FunctionalDependencies(); - this.dependencies = new Dependencies(this.numberOfColumns); - this.nonDependencies = new NonDependencies(this.numberOfColumns); - this.joinedPartitions = new MemoryManagedJoinedPartitions(this.numberOfColumns); - this.fileBasedPartitions = new FileBasedPartitions(table); - this.columnOrder = new ColumnOrder(fileBasedPartitions); - for (int columnIndex = 0; columnIndex < this.numberOfColumns; columnIndex++) { - ColumnCollection columnIdentifier = new ColumnCollection(this.numberOfColumns); - columnIdentifier.set(columnIndex); - this.joinedPartitions.addPartition(this.fileBasedPartitions.get(columnIndex)); - } - } - - public DFDMiner(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError { - this.observations = new Observations(); - this.numberOfColumns = columnFiles.getNumberOfColumns(); - this.numberOfRows = numberOfRows; - this.trace = new Stack<>(); - this.seeds = new Stack<>(); - this.minimalDependencies = new FunctionalDependencies(); - this.maximalNonDependencies = new FunctionalDependencies(); - this.dependencies = new Dependencies(this.numberOfColumns); - this.nonDependencies = new NonDependencies(this.numberOfColumns); - this.joinedPartitions = new MemoryManagedJoinedPartitions(this.numberOfColumns); - this.fileBasedPartitions = new FileBasedPartitions(columnFiles, numberOfRows); - columnFiles.clear(); - this.columnOrder = new ColumnOrder(fileBasedPartitions); - for (int columnIndex = 0; columnIndex < this.numberOfColumns; columnIndex++) { - ColumnCollection columnIdentifier = new ColumnCollection(this.numberOfColumns); - columnIdentifier.set(columnIndex); - this.joinedPartitions.addPartition(this.fileBasedPartitions.get(columnIndex)); - } - } - - public void run() throws OutOfMemoryError { - - ArrayList keys = new ArrayList<>(); - - // check each column for uniqueness - // if a column is unique it's a key for all other columns - // therefore uniquePartition -> schema - uniquePartition - for (FileBasedPartition fileBasedPartition : this.fileBasedPartitions) { - if (fileBasedPartition.isUnique()) { - ColumnCollection uniquePartitionIndices = fileBasedPartition.getIndices(); - ColumnCollection RHS = uniquePartitionIndices.complementCopy(); - this.minimalDependencies.put(uniquePartitionIndices, RHS); - // add unique columns to minimal uniques - keys.add(uniquePartitionIndices); - } - } - - // do this for all RHS - for (int currentRHSIndex = 0; currentRHSIndex < this.numberOfColumns; currentRHSIndex++) { - - this.dependencies = new Dependencies(numberOfColumns); - this.nonDependencies = new NonDependencies(numberOfColumns); - this.trace.clear(); - this.observations.clear(); - - for (int lhsIndex = 0; lhsIndex < this.numberOfColumns; lhsIndex++) { - if (lhsIndex != currentRHSIndex) { - ColumnCollection lhs = new ColumnCollection(numberOfColumns); - lhs.set(lhsIndex); - if (keys.contains(lhs)) { - this.dependencies.add(lhs); - this.observations.put(lhs, Observation.MINIMAL_DEPENDENCY); - } - } - } - - ColumnCollection currentRHS = new ColumnCollection(numberOfColumns); - currentRHS.set(currentRHSIndex); - - // generate seeds - for (Integer partitionIndex : columnOrder.getOrderHighDistinctCount(currentRHS.complementCopy())) { - if (partitionIndex != currentRHSIndex) { - FileBasedPartition lhsPartition = this.fileBasedPartitions.get(partitionIndex); - this.seeds.push(new Seed(lhsPartition.getIndices())); - } - } - - do { - while (!seeds.isEmpty()) { - Seed currentSeed = this.randomTake(); - do { - ColumnCollection lhsIndices = currentSeed.getIndices(); - Observation observationOfLHS = this.observations.get(currentSeed.getIndices()); - if (observationOfLHS == null) { - observationOfLHS = this.checkDependencyAndStoreIt(currentSeed, currentRHSIndex); - - // if we couldn't find any dependency that is a - // subset of the current valid LHS it is minimal - if (observationOfLHS == Observation.MINIMAL_DEPENDENCY) { - this.minimalDependencies.addRHSColumn(lhsIndices, currentRHSIndex); - } - // if we couldn't find any non-dependency that is - // superset of the current non-valid LHS it is - // maximal - else if (observationOfLHS == Observation.MAXIMAL_NON_DEPENDENCY) { - this.maximalNonDependencies.addRHSColumn(lhsIndices, currentRHSIndex); - } - currentSeed = randomWalkStep(currentSeed, currentRHSIndex); - } else { -// System.out.println(String.format("[2]Current [%s]%s\t[%s]", (char) (currentRHSIndex + 65), currentSeed, observationOfLHS)); - if (observationOfLHS.isCandidate()) { - if (observationOfLHS.isDependency()) { - Observation updatedDependencyType = this.observations.updateDependencyType(currentSeed.getIndices()); - // System.out.println(String.format("\tupdated:\t%s", - // updatedDependencyType)); - this.observations.put(lhsIndices, updatedDependencyType); - if (updatedDependencyType == Observation.MINIMAL_DEPENDENCY) { - // System.out.println("Add min dependency:\t" - // + currentSeed); - this.minimalDependencies.addRHSColumn(lhsIndices, currentRHSIndex); - } - } else { - Observation updatedNonDependencyType = this.observations.updateNonDependencyType(currentSeed.getIndices(), currentRHSIndex); - this.observations.put(lhsIndices, updatedNonDependencyType); - // System.out.println(String.format("\tupdated:\t%s", - // updatedNonDependencyType)); - if (updatedNonDependencyType == Observation.MAXIMAL_NON_DEPENDENCY) { - this.maximalNonDependencies.addRHSColumn(lhsIndices, currentRHSIndex); - } - } - } - currentSeed = randomWalkStep(currentSeed, currentRHSIndex); - } - - } while (currentSeed != null); - } - seeds = this.nextSeeds(currentRHSIndex); - } while (!seeds.isEmpty()); - } - // System.out.println(String.format("Number partitions:\t%d", - // this.joinedPartitions.getCount())); - } - - private Observation checkDependencyAndStoreIt(Seed seed, Integer currentRHSIndex) { - if (nonDependencies.isRepresented(seed.getIndices())) { - // System.out.println("Skip because of nonDependency"); - Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex); - this.observations.put(seed.getIndices(), observationOfLHS); - this.nonDependencies.add(seed.getIndices()); - return observationOfLHS; - } else if (dependencies.isRepresented(seed.getIndices())) { - // System.out.println("Skip because of dependency"); - Observation observationOfLHS = this.observations.updateDependencyType(seed.getIndices()); - this.observations.put(seed.getIndices(), observationOfLHS); - this.dependencies.add(seed.getIndices()); - return observationOfLHS; - } - - FileBasedPartition currentRHSPartition = this.fileBasedPartitions.get(currentRHSIndex); - Partition currentLHSPartition = null; - Partition currentLHSJoinedRHSPartition = null; - - if (seed.isAtomic()) { - currentLHSPartition = this.joinedPartitions.get(seed.getIndices()); - currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition); - } else { - - // if we went upwards in the lattice we can build the currentLHS - // partition directly from the previous partition - if (seed.getAdditionalColumnIndex() != -1) { - int additionalColumn = seed.getAdditionalColumnIndex(); - Partition previousLHSPartition = joinedPartitions.get(seed.getBaseIndices()); - if (previousLHSPartition == null) { - ArrayList partitionsToJoin = joinedPartitions.getBestMatchingPartitions(seed.getBaseIndices()); - previousLHSPartition = ComposedPartition.buildPartition(partitionsToJoin); - } - FileBasedPartition additionalColumnPartition = this.fileBasedPartitions.get(additionalColumn); - currentLHSPartition = this.joinedPartitions.get(previousLHSPartition.getIndices().setCopy(additionalColumn)); - if (currentLHSPartition == null) { - currentLHSPartition = new ComposedPartition(previousLHSPartition, additionalColumnPartition); - this.joinedPartitions.addPartition(currentLHSPartition); - } - currentLHSJoinedRHSPartition = this.joinedPartitions.get(currentLHSPartition.getIndices().setCopy(currentRHSIndex)); - if (currentLHSJoinedRHSPartition == null) { - currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition); - this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition); - } - } else { - currentLHSPartition = this.joinedPartitions.get(seed.getIndices()); - if (currentLHSPartition == null) { - ArrayList partitionsToJoin = joinedPartitions.getBestMatchingPartitions(seed.getIndices()); - currentLHSPartition = ComposedPartition.buildPartition(partitionsToJoin); - this.joinedPartitions.addPartition(currentLHSPartition); - } - currentLHSJoinedRHSPartition = this.joinedPartitions.get(currentLHSPartition.getIndices().setCopy(currentRHSIndex)); - if (currentLHSJoinedRHSPartition == null) { - currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition); - this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition); - } - } -// this.joinedPartitions.addPartition(currentLHSPartition); -// this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition); - } - - if (Partition.representsFD(currentLHSPartition, currentLHSJoinedRHSPartition)) { - Observation observationOfLHS = this.observations.updateDependencyType(seed.getIndices()); - this.observations.put(seed.getIndices(), observationOfLHS); - this.dependencies.add(seed.getIndices()); - return observationOfLHS; - } else { - Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex); - this.observations.put(seed.getIndices(), observationOfLHS); - this.nonDependencies.add(seed.getIndices()); - return observationOfLHS; - } - } - - private Stack nextSeeds(int currentRHSIndex) { -// System.out.println("Find holes"); - THashSet deps = new THashSet<>(); - ArrayList currentMaximalNonDependencies = maximalNonDependencies.getLHSForRHS(currentRHSIndex); - HashSet currentMinimalDependencies = new HashSet<>(minimalDependencies.getLHSForRHS(currentRHSIndex)); - ArrayList newDeps = new ArrayList<>(numberOfColumns * deps.size()); -// Holes holes = new Holes(); - -// int i = 0; -// for (ColumnCollection maximalNonDependency : currentMaximalNonDependencies) { -// ColumnCollection complement = maximalNonDependency.setCopy(currentRHSIndex).complement(); -// if (deps.isEmpty()) { -// ColumnCollection emptyColumnIndices = new ColumnCollection(numberOfColumns); -// for (Integer complementColumnIndex : complement.getSetBits()) { -// deps.add(emptyColumnIndices.setCopy(complementColumnIndex)); -// } -// } else { -// for (ColumnCollection dep : deps) { -// int[] setBits = complement.getSetBits(); -// for (int setBit = 0; setBit < setBits.length; setBit++) { -// holes.add(dep.setCopy(setBits[setBit])); -//// System.out.println("Dep:\t" + dep.setCopy(setBits[setBit])); -// } -// } -// // minimize newDeps -// System.out.println(i++ + "\t" + currentMaximalNonDependencies.size()); -// System.out.println("total deps:\t" + deps.size()); -// System.out.println("before minimizing:\t" + holes.size()); -//// ArrayList minimizedNewDeps = minimizeSeeds(newDeps); -// holes.minimize(); -// System.out.println("after minimizing:\t" + holes.size()); -// deps.clear(); -// deps.addAll(holes); -// holes.clear(); -// } -// } - - for (ColumnCollection maximalNonDependency : currentMaximalNonDependencies) { - ColumnCollection complement = maximalNonDependency.setCopy(currentRHSIndex).complement(); - if (deps.isEmpty()) { - ColumnCollection emptyColumnIndices = new ColumnCollection(numberOfColumns); - for (Integer complementColumnIndex : complement.getSetBits()) { - deps.add(emptyColumnIndices.setCopy(complementColumnIndex)); - } - } else { - for (ColumnCollection dep : deps) { - int[] setBits = complement.getSetBits(); - for (int setBit = 0; setBit < setBits.length; setBit++) { - newDeps.add(dep.setCopy(setBits[setBit])); - } - } - // minimize newDeps - ArrayList minimizedNewDeps = minimizeSeeds(newDeps); - deps.clear(); - deps.addAll(minimizedNewDeps); - newDeps.clear(); - } - } - - // return only elements that aren't already covered by the minimal - // dependencies - Stack remainingSeeds = new Stack<>(); - deps.removeAll(currentMinimalDependencies); - for (ColumnCollection remainingSeed : deps) { - remainingSeeds.push(new Seed(remainingSeed)); - } - - return remainingSeeds; - } - - private ArrayList minimizeSeeds(ArrayList seeds) { - long maxCardinality = 0; - TLongObjectHashMap> seedsBySize = new TLongObjectHashMap<>(numberOfColumns); - for (ColumnCollection seed : seeds) { - long cardinalityOfSeed = seed.cardinality(); - maxCardinality = Math.max(maxCardinality, cardinalityOfSeed); - seedsBySize.putIfAbsent(cardinalityOfSeed, new ArrayList(seeds.size()/numberOfColumns)); - seedsBySize.get(cardinalityOfSeed).add(seed); - } - - for (long lowerBound = 1; lowerBound < maxCardinality; lowerBound++) { - ArrayList lowerBoundSeeds = seedsBySize.get(lowerBound); - if (lowerBoundSeeds != null) { - for (long upperBound = maxCardinality; upperBound > lowerBound; upperBound--) { - ArrayList upperBoundSeeds = seedsBySize.get(upperBound); - if (upperBoundSeeds != null) { - for (Iterator lowerIt = lowerBoundSeeds.iterator(); lowerIt.hasNext();) { - ColumnCollection lowerSeed = lowerIt.next(); - for (Iterator upperIt = upperBoundSeeds.iterator(); upperIt.hasNext();) { - if (lowerSeed.isSubsetOf(upperIt.next())) { - upperIt.remove(); - } - } - } - } - } - } - } - ArrayList minimizedSeeds = new ArrayList<>(); - for (ArrayList seedList : seedsBySize.valueCollection()) { - for (ColumnCollection seed : seedList) { - minimizedSeeds.add(seed); - } - } - return minimizedSeeds; - } - - private Seed randomTake() { - if (!this.seeds.isEmpty()) { - return this.seeds.pop(); - } - return null; - } - - private Seed randomWalkStep(Seed currentSeed, Integer currentRHSIndex) { - Observation observationOfSeed = this.observations.get(currentSeed.getIndices()); - - if (observationOfSeed == Observation.CANDIDATE_MINIMAL_DEPENDENCY) { - THashSet uncheckedSubsets = this.observations.getUncheckedMaximalSubsets(currentSeed.getIndices(), columnOrder); - THashSet prunedNonDependencySubsets = nonDependencies.getPrunedSupersets(uncheckedSubsets); - for (ColumnCollection prunedNonDependencySubset : prunedNonDependencySubsets) { - observations.put(prunedNonDependencySubset, Observation.NON_DEPENDENCY); - } - uncheckedSubsets.removeAll(prunedNonDependencySubsets); - if (uncheckedSubsets.isEmpty() && prunedNonDependencySubsets.isEmpty()) { - observations.put(currentSeed.getIndices(), Observation.MINIMAL_DEPENDENCY); - minimalDependencies.addRHSColumn(currentSeed.getIndices(), currentRHSIndex); - } else if (!uncheckedSubsets.isEmpty()) { - ColumnCollection notRepresentedUncheckedSubset = uncheckedSubsets.iterator().next(); - if (notRepresentedUncheckedSubset != null) { - trace.push(currentSeed); - return new Seed(notRepresentedUncheckedSubset); - } - } - } else if (observationOfSeed == Observation.CANDIDATE_MAXIMAL_NON_DEPENDENCY) { - THashSet uncheckedSupersets = this.observations.getUncheckedMinimalSupersets(currentSeed.getIndices(), currentRHSIndex, columnOrder); - THashSet prunedNonDependencySupersets = nonDependencies.getPrunedSupersets(uncheckedSupersets); - THashSet prunedDependencySupersets = dependencies.getPrunedSubsets(uncheckedSupersets); - for (ColumnCollection prunedNonDependencySuperset : prunedNonDependencySupersets) { - observations.put(prunedNonDependencySuperset, Observation.NON_DEPENDENCY); - } - for (ColumnCollection prunedDependencySuperset : prunedDependencySupersets) { - observations.put(prunedDependencySuperset, Observation.DEPENDENCY); - } - uncheckedSupersets.removeAll(prunedDependencySupersets); - uncheckedSupersets.removeAll(prunedNonDependencySupersets); - if (uncheckedSupersets.isEmpty() && prunedNonDependencySupersets.isEmpty()) { - observations.put(currentSeed.getIndices(), Observation.MAXIMAL_NON_DEPENDENCY); - maximalNonDependencies.addRHSColumn(currentSeed.getIndices(), currentRHSIndex); - } else if (!uncheckedSupersets.isEmpty()) { - ColumnCollection notRepresentedUncheckedSuperset = uncheckedSupersets.iterator().next(); - if (notRepresentedUncheckedSuperset != null) { - trace.push(currentSeed); - int additionalColumn = notRepresentedUncheckedSuperset.removeCopy(currentSeed.getIndices()).nextSetBit(0); - return new Seed(notRepresentedUncheckedSuperset, additionalColumn); - } - } - } - if (!this.trace.isEmpty()) { - Seed nextSeed = this.trace.pop(); - return nextSeed; - } - return null; - } - - public FunctionalDependencies getDependencies() { - return this.minimalDependencies; - } +package fdiscovery.approach.runner; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Stack; + +import org.apache.commons.cli.CommandLine; + + +import fdiscovery.approach.ColumnOrder; +import fdiscovery.columns.ColumnCollection; +import fdiscovery.general.CLIParserMiner; +import fdiscovery.general.ColumnFiles; +import fdiscovery.general.FunctionalDependencies; +import fdiscovery.general.Miner; +import fdiscovery.partitions.ComposedPartition; +import fdiscovery.partitions.FileBasedPartition; +import fdiscovery.partitions.FileBasedPartitions; +import fdiscovery.partitions.MemoryManagedJoinedPartitions; +import fdiscovery.partitions.Partition; +import fdiscovery.preprocessing.SVFileProcessor; +import fdiscovery.pruning.Dependencies; +import fdiscovery.pruning.NonDependencies; +import fdiscovery.pruning.Observation; +import fdiscovery.pruning.Observations; +import fdiscovery.pruning.Seed; +import gnu.trove.map.hash.TLongObjectHashMap; +import gnu.trove.set.hash.THashSet; + +public class DFDMiner extends Miner implements Runnable { + + private int numberOfColumns; + private int numberOfRows; + private ColumnOrder columnOrder; + private Stack trace; + private Stack seeds; + private Observations observations; + private FunctionalDependencies minimalDependencies; + private FunctionalDependencies maximalNonDependencies; + private FileBasedPartitions fileBasedPartitions; + private Dependencies dependencies; + private NonDependencies nonDependencies; + private MemoryManagedJoinedPartitions joinedPartitions; + + public static void main(String[] args) { + createColumDirectory(); + + File source = new File(DFDMiner.input); + SVFileProcessor inputFileProcessor = null; + try { + long timeStart = System.currentTimeMillis(); + + inputFileProcessor = new SVFileProcessor(source); + inputFileProcessor.init(); + System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter()); + System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns()); + System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows()); + inputFileProcessor.createColumnFiles(); + DFDMiner dfdRunner = new DFDMiner(inputFileProcessor); + + dfdRunner.run(); + System.out.println(String.format("Number of dependencies:\t%d", Integer.valueOf(dfdRunner.minimalDependencies.getCount()))); + long timeFindFDs = System.currentTimeMillis(); + System.out.println("Total time:\t" + (timeFindFDs - timeStart) / 1000 + "s"); + System.out.println(dfdRunner.getDependencies()); + + } catch (FileNotFoundException e) { + System.out.println("The input file could not be found."); + } catch (IOException e) { + System.out.println("The input reader could not be reset."); + } + } + + public static void main2(String[] args) { + CLIParserMiner parser = new CLIParserMiner(); + CommandLine cli = parser.parse(args); + String inputFilename = new String(); + String columnFileDirectory = new String(); + String resultFile = new String(); + int numberOfColumns = 0; + int numberOfRows = 0; + + if (cli.hasOption("file")) { + inputFilename = cli.getOptionValue("file"); + } + if (cli.hasOption("input")) { + columnFileDirectory = cli.getOptionValue("input"); + } + if (cli.hasOption("result")) { + resultFile = cli.getOptionValue("result"); + } + if (cli.hasOption("columns")) { + numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue(); + } + if (cli.hasOption("rows")) { + numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue(); + } + ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows); + long timeStart = System.currentTimeMillis(); + DFDMiner runner = new DFDMiner(columnFiles, numberOfRows); + try { + runner.run(); + long timeEnd = System.currentTimeMillis(); + runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename); + } catch (OutOfMemoryError e) { + System.exit(Miner.STATUS_OOM); + } + System.exit(0); + } + + private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { + + String timeString = (time != -1) ? String.format("%.1f", (double) (time) / 1000) : "-1"; + StringBuilder outputBuilder = new StringBuilder(); + if (!inputFileName.isEmpty()) { + outputBuilder.append(String.format("%s\t", inputFileName)); + } + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns))); + outputBuilder.append(String.format("%s\t", timeString)); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount()))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.joinedPartitions.getCount()))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.joinedPartitions.getTotalCount()))); + outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory()))); + outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false))); + + try { + BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true)); + resultFileWriter.write(outputBuilder.toString()); + System.out.print(outputBuilder.toString()); + resultFileWriter.close(); + } catch (IOException e) { + System.out.println("Couldn't write output."); + } + } + + public DFDMiner(SVFileProcessor table) throws OutOfMemoryError { + this.observations = new Observations(); + this.numberOfColumns = table.getNumberOfColumns(); + this.numberOfRows = table.getNumberOfRows(); + this.trace = new Stack<>(); + this.seeds = new Stack<>(); + this.minimalDependencies = new FunctionalDependencies(); + this.maximalNonDependencies = new FunctionalDependencies(); + this.dependencies = new Dependencies(this.numberOfColumns); + this.nonDependencies = new NonDependencies(this.numberOfColumns); + this.joinedPartitions = new MemoryManagedJoinedPartitions(this.numberOfColumns); + this.fileBasedPartitions = new FileBasedPartitions(table); + this.columnOrder = new ColumnOrder(fileBasedPartitions); + for (int columnIndex = 0; columnIndex < this.numberOfColumns; columnIndex++) { + ColumnCollection columnIdentifier = new ColumnCollection(this.numberOfColumns); + columnIdentifier.set(columnIndex); + this.joinedPartitions.addPartition(this.fileBasedPartitions.get(columnIndex)); + } + } + + public DFDMiner(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError { + this.observations = new Observations(); + this.numberOfColumns = columnFiles.getNumberOfColumns(); + this.numberOfRows = numberOfRows; + this.trace = new Stack<>(); + this.seeds = new Stack<>(); + this.minimalDependencies = new FunctionalDependencies(); + this.maximalNonDependencies = new FunctionalDependencies(); + this.dependencies = new Dependencies(this.numberOfColumns); + this.nonDependencies = new NonDependencies(this.numberOfColumns); + this.joinedPartitions = new MemoryManagedJoinedPartitions(this.numberOfColumns); + this.fileBasedPartitions = new FileBasedPartitions(columnFiles, numberOfRows); + columnFiles.clear(); + this.columnOrder = new ColumnOrder(fileBasedPartitions); + for (int columnIndex = 0; columnIndex < this.numberOfColumns; columnIndex++) { + ColumnCollection columnIdentifier = new ColumnCollection(this.numberOfColumns); + columnIdentifier.set(columnIndex); + this.joinedPartitions.addPartition(this.fileBasedPartitions.get(columnIndex)); + } + } + + public void run() throws OutOfMemoryError { + + ArrayList keys = new ArrayList<>(); + + // check each column for uniqueness + // if a column is unique it's a key for all other columns + // therefore uniquePartition -> schema - uniquePartition + for (FileBasedPartition fileBasedPartition : this.fileBasedPartitions) { + if (fileBasedPartition.isUnique()) { + ColumnCollection uniquePartitionIndices = fileBasedPartition.getIndices(); + ColumnCollection RHS = uniquePartitionIndices.complementCopy(); + this.minimalDependencies.put(uniquePartitionIndices, RHS); + // add unique columns to minimal uniques + keys.add(uniquePartitionIndices); + } + } + + // do this for all RHS + for (int currentRHSIndex = 0; currentRHSIndex < this.numberOfColumns; currentRHSIndex++) { + + this.dependencies = new Dependencies(numberOfColumns); + this.nonDependencies = new NonDependencies(numberOfColumns); + this.trace.clear(); + this.observations.clear(); + + for (int lhsIndex = 0; lhsIndex < this.numberOfColumns; lhsIndex++) { + if (lhsIndex != currentRHSIndex) { + ColumnCollection lhs = new ColumnCollection(numberOfColumns); + lhs.set(lhsIndex); + if (keys.contains(lhs)) { + this.dependencies.add(lhs); + this.observations.put(lhs, Observation.MINIMAL_DEPENDENCY); + } + } + } + + ColumnCollection currentRHS = new ColumnCollection(numberOfColumns); + currentRHS.set(currentRHSIndex); + + // generate seeds + for (int partitionIndex : columnOrder.getOrderHighDistinctCount(currentRHS.complementCopy())) { + if (partitionIndex != currentRHSIndex) { + FileBasedPartition lhsPartition = this.fileBasedPartitions.get(partitionIndex); + this.seeds.push(new Seed(lhsPartition.getIndices())); + } + } + + do { + while (!seeds.isEmpty()) { + Seed currentSeed = this.randomTake(); + do { + ColumnCollection lhsIndices = currentSeed.getIndices(); + Observation observationOfLHS = this.observations.get(currentSeed.getIndices()); + if (observationOfLHS == null) { + observationOfLHS = this.checkDependencyAndStoreIt(currentSeed, currentRHSIndex); + + // if we couldn't find any dependency that is a + // subset of the current valid LHS it is minimal + if (observationOfLHS == Observation.MINIMAL_DEPENDENCY) { + this.minimalDependencies.addRHSColumn(lhsIndices, currentRHSIndex); + } + // if we couldn't find any non-dependency that is + // superset of the current non-valid LHS it is + // maximal + else if (observationOfLHS == Observation.MAXIMAL_NON_DEPENDENCY) { + this.maximalNonDependencies.addRHSColumn(lhsIndices, currentRHSIndex); + } + currentSeed = randomWalkStep(currentSeed, currentRHSIndex); + } else { +// System.out.println(String.format("[2]Current [%s]%s\t[%s]", (char) (currentRHSIndex + 65), currentSeed, observationOfLHS)); + if (observationOfLHS.isCandidate()) { + if (observationOfLHS.isDependency()) { + Observation updatedDependencyType = this.observations.updateDependencyType(currentSeed.getIndices()); + // System.out.println(String.format("\tupdated:\t%s", + // updatedDependencyType)); + this.observations.put(lhsIndices, updatedDependencyType); + if (updatedDependencyType == Observation.MINIMAL_DEPENDENCY) { + // System.out.println("Add min dependency:\t" + // + currentSeed); + this.minimalDependencies.addRHSColumn(lhsIndices, currentRHSIndex); + } + } else { + Observation updatedNonDependencyType = this.observations.updateNonDependencyType(currentSeed.getIndices(), currentRHSIndex); + this.observations.put(lhsIndices, updatedNonDependencyType); + // System.out.println(String.format("\tupdated:\t%s", + // updatedNonDependencyType)); + if (updatedNonDependencyType == Observation.MAXIMAL_NON_DEPENDENCY) { + this.maximalNonDependencies.addRHSColumn(lhsIndices, currentRHSIndex); + } + } + } + currentSeed = randomWalkStep(currentSeed, currentRHSIndex); + } + + } while (currentSeed != null); + } + seeds = this.nextSeeds(currentRHSIndex); + } while (!seeds.isEmpty()); + } + // System.out.println(String.format("Number partitions:\t%d", + // this.joinedPartitions.getCount())); + } + + private Observation checkDependencyAndStoreIt(Seed seed, int currentRHSIndex) { + if (nonDependencies.isRepresented(seed.getIndices())) { + // System.out.println("Skip because of nonDependency"); + Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex); + this.observations.put(seed.getIndices(), observationOfLHS); + this.nonDependencies.add(seed.getIndices()); + return observationOfLHS; + } else if (dependencies.isRepresented(seed.getIndices())) { + // System.out.println("Skip because of dependency"); + Observation observationOfLHS = this.observations.updateDependencyType(seed.getIndices()); + this.observations.put(seed.getIndices(), observationOfLHS); + this.dependencies.add(seed.getIndices()); + return observationOfLHS; + } + + FileBasedPartition currentRHSPartition = this.fileBasedPartitions.get(currentRHSIndex); + Partition currentLHSPartition = null; + Partition currentLHSJoinedRHSPartition = null; + + if (seed.isAtomic()) { + currentLHSPartition = this.joinedPartitions.get(seed.getIndices()); + currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition); + } else { + + // if we went upwards in the lattice we can build the currentLHS + // partition directly from the previous partition + if (seed.getAdditionalColumnIndex() != -1) { + int additionalColumn = seed.getAdditionalColumnIndex(); + Partition previousLHSPartition = joinedPartitions.get(seed.getBaseIndices()); + if (previousLHSPartition == null) { + ArrayList partitionsToJoin = joinedPartitions.getBestMatchingPartitions(seed.getBaseIndices()); + previousLHSPartition = ComposedPartition.buildPartition(partitionsToJoin); + } + FileBasedPartition additionalColumnPartition = this.fileBasedPartitions.get(additionalColumn); + currentLHSPartition = this.joinedPartitions.get(previousLHSPartition.getIndices().setCopy(additionalColumn)); + if (currentLHSPartition == null) { + currentLHSPartition = new ComposedPartition(previousLHSPartition, additionalColumnPartition); + this.joinedPartitions.addPartition(currentLHSPartition); + } + currentLHSJoinedRHSPartition = this.joinedPartitions.get(currentLHSPartition.getIndices().setCopy(currentRHSIndex)); + if (currentLHSJoinedRHSPartition == null) { + currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition); + this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition); + } + } else { + currentLHSPartition = this.joinedPartitions.get(seed.getIndices()); + if (currentLHSPartition == null) { + ArrayList partitionsToJoin = joinedPartitions.getBestMatchingPartitions(seed.getIndices()); + currentLHSPartition = ComposedPartition.buildPartition(partitionsToJoin); + this.joinedPartitions.addPartition(currentLHSPartition); + } + currentLHSJoinedRHSPartition = this.joinedPartitions.get(currentLHSPartition.getIndices().setCopy(currentRHSIndex)); + if (currentLHSJoinedRHSPartition == null) { + currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition); + this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition); + } + } +// this.joinedPartitions.addPartition(currentLHSPartition); +// this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition); + } + + if (Partition.representsFD(currentLHSPartition, currentLHSJoinedRHSPartition)) { + Observation observationOfLHS = this.observations.updateDependencyType(seed.getIndices()); + this.observations.put(seed.getIndices(), observationOfLHS); + this.dependencies.add(seed.getIndices()); + return observationOfLHS; + } else { + Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex); + this.observations.put(seed.getIndices(), observationOfLHS); + this.nonDependencies.add(seed.getIndices()); + return observationOfLHS; + } + } + + private Stack nextSeeds(int currentRHSIndex) { +// System.out.println("Find holes"); + THashSet deps = new THashSet<>(); + ArrayList currentMaximalNonDependencies = maximalNonDependencies.getLHSForRHS(currentRHSIndex); + HashSet currentMinimalDependencies = new HashSet<>(minimalDependencies.getLHSForRHS(currentRHSIndex)); + ArrayList newDeps = new ArrayList<>(numberOfColumns * deps.size()); +// Holes holes = new Holes(); + +// int i = 0; +// for (ColumnCollection maximalNonDependency : currentMaximalNonDependencies) { +// ColumnCollection complement = maximalNonDependency.setCopy(currentRHSIndex).complement(); +// if (deps.isEmpty()) { +// ColumnCollection emptyColumnIndices = new ColumnCollection(numberOfColumns); +// for (Integer complementColumnIndex : complement.getSetBits()) { +// deps.add(emptyColumnIndices.setCopy(complementColumnIndex)); +// } +// } else { +// for (ColumnCollection dep : deps) { +// int[] setBits = complement.getSetBits(); +// for (int setBit = 0; setBit < setBits.length; setBit++) { +// holes.add(dep.setCopy(setBits[setBit])); +//// System.out.println("Dep:\t" + dep.setCopy(setBits[setBit])); +// } +// } +// // minimize newDeps +// System.out.println(i++ + "\t" + currentMaximalNonDependencies.size()); +// System.out.println("total deps:\t" + deps.size()); +// System.out.println("before minimizing:\t" + holes.size()); +//// ArrayList minimizedNewDeps = minimizeSeeds(newDeps); +// holes.minimize(); +// System.out.println("after minimizing:\t" + holes.size()); +// deps.clear(); +// deps.addAll(holes); +// holes.clear(); +// } +// } + + for (ColumnCollection maximalNonDependency : currentMaximalNonDependencies) { + ColumnCollection complement = maximalNonDependency.setCopy(currentRHSIndex).complement(); + if (deps.isEmpty()) { + ColumnCollection emptyColumnIndices = new ColumnCollection(numberOfColumns); + for (int complementColumnIndex : complement.getSetBits()) { + deps.add(emptyColumnIndices.setCopy(complementColumnIndex)); + } + } else { + for (ColumnCollection dep : deps) { + int[] setBits = complement.getSetBits(); + for (int setBit = 0; setBit < setBits.length; setBit++) { + newDeps.add(dep.setCopy(setBits[setBit])); + } + } + // minimize newDeps + ArrayList minimizedNewDeps = minimizeSeeds(newDeps); + deps.clear(); + deps.addAll(minimizedNewDeps); + newDeps.clear(); + } + } + + // return only elements that aren't already covered by the minimal + // dependencies + Stack remainingSeeds = new Stack<>(); + deps.removeAll(currentMinimalDependencies); + for (ColumnCollection remainingSeed : deps) { + remainingSeeds.push(new Seed(remainingSeed)); + } + + return remainingSeeds; + } + + private ArrayList minimizeSeeds(ArrayList seeds) { + long maxCardinality = 0; + TLongObjectHashMap> seedsBySize = new TLongObjectHashMap<>(numberOfColumns); + for (ColumnCollection seed : seeds) { + long cardinalityOfSeed = seed.cardinality(); + maxCardinality = Math.max(maxCardinality, cardinalityOfSeed); + seedsBySize.putIfAbsent(cardinalityOfSeed, new ArrayList(seeds.size()/numberOfColumns)); + seedsBySize.get(cardinalityOfSeed).add(seed); + } + + for (long lowerBound = 1; lowerBound < maxCardinality; lowerBound++) { + ArrayList lowerBoundSeeds = seedsBySize.get(lowerBound); + if (lowerBoundSeeds != null) { + for (long upperBound = maxCardinality; upperBound > lowerBound; upperBound--) { + ArrayList upperBoundSeeds = seedsBySize.get(upperBound); + if (upperBoundSeeds != null) { + for (Iterator lowerIt = lowerBoundSeeds.iterator(); lowerIt.hasNext();) { + ColumnCollection lowerSeed = lowerIt.next(); + for (Iterator upperIt = upperBoundSeeds.iterator(); upperIt.hasNext();) { + if (lowerSeed.isSubsetOf(upperIt.next())) { + upperIt.remove(); + } + } + } + } + } + } + } + ArrayList minimizedSeeds = new ArrayList<>(); + for (ArrayList seedList : seedsBySize.valueCollection()) { + for (ColumnCollection seed : seedList) { + minimizedSeeds.add(seed); + } + } + return minimizedSeeds; + } + + private Seed randomTake() { + if (!this.seeds.isEmpty()) { + return this.seeds.pop(); + } + return null; + } + + private Seed randomWalkStep(Seed currentSeed, int currentRHSIndex) { + Observation observationOfSeed = this.observations.get(currentSeed.getIndices()); + + if (observationOfSeed == Observation.CANDIDATE_MINIMAL_DEPENDENCY) { + THashSet uncheckedSubsets = this.observations.getUncheckedMaximalSubsets(currentSeed.getIndices(), columnOrder); + THashSet prunedNonDependencySubsets = nonDependencies.getPrunedSupersets(uncheckedSubsets); + for (ColumnCollection prunedNonDependencySubset : prunedNonDependencySubsets) { + observations.put(prunedNonDependencySubset, Observation.NON_DEPENDENCY); + } + uncheckedSubsets.removeAll(prunedNonDependencySubsets); + if (uncheckedSubsets.isEmpty() && prunedNonDependencySubsets.isEmpty()) { + observations.put(currentSeed.getIndices(), Observation.MINIMAL_DEPENDENCY); + minimalDependencies.addRHSColumn(currentSeed.getIndices(), currentRHSIndex); + } else if (!uncheckedSubsets.isEmpty()) { + ColumnCollection notRepresentedUncheckedSubset = uncheckedSubsets.iterator().next(); + if (notRepresentedUncheckedSubset != null) { + trace.push(currentSeed); + return new Seed(notRepresentedUncheckedSubset); + } + } + } else if (observationOfSeed == Observation.CANDIDATE_MAXIMAL_NON_DEPENDENCY) { + THashSet uncheckedSupersets = this.observations.getUncheckedMinimalSupersets(currentSeed.getIndices(), currentRHSIndex, columnOrder); + THashSet prunedNonDependencySupersets = nonDependencies.getPrunedSupersets(uncheckedSupersets); + THashSet prunedDependencySupersets = dependencies.getPrunedSubsets(uncheckedSupersets); + for (ColumnCollection prunedNonDependencySuperset : prunedNonDependencySupersets) { + observations.put(prunedNonDependencySuperset, Observation.NON_DEPENDENCY); + } + for (ColumnCollection prunedDependencySuperset : prunedDependencySupersets) { + observations.put(prunedDependencySuperset, Observation.DEPENDENCY); + } + uncheckedSupersets.removeAll(prunedDependencySupersets); + uncheckedSupersets.removeAll(prunedNonDependencySupersets); + if (uncheckedSupersets.isEmpty() && prunedNonDependencySupersets.isEmpty()) { + observations.put(currentSeed.getIndices(), Observation.MAXIMAL_NON_DEPENDENCY); + maximalNonDependencies.addRHSColumn(currentSeed.getIndices(), currentRHSIndex); + } else if (!uncheckedSupersets.isEmpty()) { + ColumnCollection notRepresentedUncheckedSuperset = uncheckedSupersets.iterator().next(); + if (notRepresentedUncheckedSuperset != null) { + trace.push(currentSeed); + int additionalColumn = notRepresentedUncheckedSuperset.removeCopy(currentSeed.getIndices()).nextSetBit(0); + return new Seed(notRepresentedUncheckedSuperset, additionalColumn); + } + } + } + if (!this.trace.isEmpty()) { + Seed nextSeed = this.trace.pop(); + return nextSeed; + } + return null; + } + + public FunctionalDependencies getDependencies() { + return this.minimalDependencies; + } } \ No newline at end of file diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java index 5f88340..7921f77 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java @@ -1,212 +1,212 @@ -package fdiscovery.columns; - -import org.apache.lucene.util.OpenBitSet; - -public class ColumnCollection extends OpenBitSet implements Comparable { - - private static final long serialVersionUID = -5256272139963505719L; - - private int formatStringWidth; - protected long numberOfColumns; - protected int[] setBits; - - public ColumnCollection(long numberOfColumns ) { - this.numberOfColumns = numberOfColumns; - this.formatStringWidth = (int)Math.ceil(Math.log10(this.numberOfColumns)); - } - - public int[] getSetBits() { - int[] setBits = new int[(int) this.cardinality()]; - - long bitIndex = 0; - int currentArrayIndex = 0; - while (bitIndex < this.numberOfColumns) { - long currentNextSetBit = this.nextSetBit(bitIndex); - if (currentNextSetBit != -1) { - setBits[currentArrayIndex++] = (int) currentNextSetBit; - bitIndex = currentNextSetBit + 1; - } else { - bitIndex = this.numberOfColumns; - - } - } - - return setBits; - } - - public boolean isAtomic() { - return this.cardinality() == 1; - } - - public ColumnCollection addColumn(long columnIndex) { - ColumnCollection copy = (ColumnCollection) this.clone(); - copy.set(columnIndex); - - return copy; - } - - public ColumnCollection andCopy(ColumnCollection other) { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.and(other); - - return copy; - } - - public ColumnCollection clearCopy(int startBit) { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.clear(startBit); - - return copy; - } - - public ColumnCollection clearAllCopy() { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.clear(0, this.numberOfColumns); - - return copy; - } - - public ColumnCollection andNotCopy(ColumnCollection other) { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.andNot(other); - - return copy; - } - - public ColumnCollection removeCopy(ColumnCollection other) { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.remove(other); - - return copy; - } - - public ColumnCollection orCopy(ColumnCollection other) { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.or(other); - - return copy; - } - - public ColumnCollection setCopy(int index) { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.set(index); - - return copy; - } - - public ColumnCollection xorCopy(ColumnCollection other) { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.xor(other); - - return copy; - } - - public ColumnCollection complementCopy() { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.flip(0, this.numberOfColumns); - - return copy; - } - - public ColumnCollection complement() { - this.flip(0, this.numberOfColumns); - return this; - } - - public boolean isSubsetOf(ColumnCollection other) { - return ColumnCollection.unionCount(this, other) == other.cardinality(); - } - - public boolean isSupersetOf(ColumnCollection other) { - return ColumnCollection.unionCount(this, other) == this.cardinality(); - - } - - public boolean isProperSubsetOf(ColumnCollection other) { - long cardinality = this.cardinality(); - long otherCardinality = other.cardinality(); - if (cardinality != otherCardinality) { - if (ColumnCollection.unionCount(this, other) == otherCardinality) { - return true; - } - } - return false; - } - - - public boolean isProperSupersetOf(ColumnCollection other) { - long cardinality = this.cardinality(); - long otherCardinality = other.cardinality(); - if (cardinality != otherCardinality) { - if (ColumnCollection.unionCount(this, other) == cardinality) { - return true; - } - } - return false; - } - - public boolean isSubsetOrSupersetOf(ColumnCollection other) { - return isSubsetOf(other) || isSupersetOf(other); - } - - public long getNumberOfColumns() { - return this.numberOfColumns; - } - - public long getMostRightBit() { - long bitIndex = 0; - while (bitIndex < this.numberOfColumns) { - long currentNextSetBit = this.nextSetBit(bitIndex); - if (currentNextSetBit != -1) { - bitIndex = currentNextSetBit + 1; - } else { - return bitIndex - 1; - - } - } - return bitIndex; - } - - public ColumnCollection removeColumnCopy(int columnIndex) { - ColumnCollection copy = (ColumnCollection) this.clone(); - copy.clear(columnIndex); - - return copy; - } - - public ColumnCollection removeColumnCopy(long columnIndex) { - ColumnCollection copy = (ColumnCollection) this.clone(); - copy.clear(columnIndex); - - return copy; - } - - @Override - public int compareTo(OpenBitSet other) { - ColumnCollection copy = (ColumnCollection) this.clone(); - copy.xor(other); - int lowestBit = copy.nextSetBit(0); - if (lowestBit == -1) { - return 0; - } else if (this.get(lowestBit)) { - return -1; - } else { - return 1; - } - } - - public String toString() { - StringBuilder outputBuilder = new StringBuilder(); - if (this.cardinality() > 0) { - for (Integer columnIndex : this.getSetBits()) { - outputBuilder.append(String.format("%0" + formatStringWidth + "d,", columnIndex)); - - } - } else { - outputBuilder.append("emptyset"); - } - - return outputBuilder.toString(); - } - -} +package fdiscovery.columns; + +import org.apache.lucene.util.OpenBitSet; + +public class ColumnCollection extends OpenBitSet implements Comparable { + + private static final long serialVersionUID = -5256272139963505719L; + + private int formatStringWidth; + protected long numberOfColumns; + protected int[] setBits; + + public ColumnCollection(long numberOfColumns ) { + this.numberOfColumns = numberOfColumns; + this.formatStringWidth = (int)Math.ceil(Math.log10(this.numberOfColumns)); + } + + public int[] getSetBits() { + int[] setBits = new int[(int) this.cardinality()]; + + long bitIndex = 0; + int currentArrayIndex = 0; + while (bitIndex < this.numberOfColumns) { + long currentNextSetBit = this.nextSetBit(bitIndex); + if (currentNextSetBit != -1) { + setBits[currentArrayIndex++] = (int) currentNextSetBit; + bitIndex = currentNextSetBit + 1; + } else { + bitIndex = this.numberOfColumns; + + } + } + + return setBits; + } + + public boolean isAtomic() { + return this.cardinality() == 1; + } + + public ColumnCollection addColumn(long columnIndex) { + ColumnCollection copy = (ColumnCollection) this.clone(); + copy.set(columnIndex); + + return copy; + } + + public ColumnCollection andCopy(ColumnCollection other) { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.and(other); + + return copy; + } + + public ColumnCollection clearCopy(int startBit) { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.clear(startBit); + + return copy; + } + + public ColumnCollection clearAllCopy() { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.clear(0, this.numberOfColumns); + + return copy; + } + + public ColumnCollection andNotCopy(ColumnCollection other) { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.andNot(other); + + return copy; + } + + public ColumnCollection removeCopy(ColumnCollection other) { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.remove(other); + + return copy; + } + + public ColumnCollection orCopy(ColumnCollection other) { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.or(other); + + return copy; + } + + public ColumnCollection setCopy(int index) { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.set(index); + + return copy; + } + + public ColumnCollection xorCopy(ColumnCollection other) { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.xor(other); + + return copy; + } + + public ColumnCollection complementCopy() { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.flip(0, this.numberOfColumns); + + return copy; + } + + public ColumnCollection complement() { + this.flip(0, this.numberOfColumns); + return this; + } + + public boolean isSubsetOf(ColumnCollection other) { + return ColumnCollection.unionCount(this, other) == other.cardinality(); + } + + public boolean isSupersetOf(ColumnCollection other) { + return ColumnCollection.unionCount(this, other) == this.cardinality(); + + } + + public boolean isProperSubsetOf(ColumnCollection other) { + long cardinality = this.cardinality(); + long otherCardinality = other.cardinality(); + if (cardinality != otherCardinality) { + if (ColumnCollection.unionCount(this, other) == otherCardinality) { + return true; + } + } + return false; + } + + + public boolean isProperSupersetOf(ColumnCollection other) { + long cardinality = this.cardinality(); + long otherCardinality = other.cardinality(); + if (cardinality != otherCardinality) { + if (ColumnCollection.unionCount(this, other) == cardinality) { + return true; + } + } + return false; + } + + public boolean isSubsetOrSupersetOf(ColumnCollection other) { + return isSubsetOf(other) || isSupersetOf(other); + } + + public long getNumberOfColumns() { + return this.numberOfColumns; + } + + public long getMostRightBit() { + long bitIndex = 0; + while (bitIndex < this.numberOfColumns) { + long currentNextSetBit = this.nextSetBit(bitIndex); + if (currentNextSetBit != -1) { + bitIndex = currentNextSetBit + 1; + } else { + return bitIndex - 1; + + } + } + return bitIndex; + } + + public ColumnCollection removeColumnCopy(int columnIndex) { + ColumnCollection copy = (ColumnCollection) this.clone(); + copy.clear(columnIndex); + + return copy; + } + + public ColumnCollection removeColumnCopy(long columnIndex) { + ColumnCollection copy = (ColumnCollection) this.clone(); + copy.clear(columnIndex); + + return copy; + } + + @Override + public int compareTo(OpenBitSet other) { + ColumnCollection copy = (ColumnCollection) this.clone(); + copy.xor(other); + int lowestBit = copy.nextSetBit(0); + if (lowestBit == -1) { + return 0; + } else if (this.get(lowestBit)) { + return -1; + } else { + return 1; + } + } + + public String toString() { + StringBuilder outputBuilder = new StringBuilder(); + if (this.cardinality() > 0) { + for (int columnIndex : this.getSetBits()) { + outputBuilder.append(String.format("%0" + formatStringWidth + "d,", Integer.valueOf(columnIndex))); + + } + } else { + outputBuilder.append("emptyset"); + } + + return outputBuilder.toString(); + } + +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java index 5cc2c71..da4c6cc 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java @@ -1,27 +1,26 @@ -package fdiscovery.columns; - -import java.util.ArrayList; - - -public class Path extends ColumnCollection { - - private static final long serialVersionUID = -6451347203736964695L; - - public Path(long numberOfColumns) { - super(numberOfColumns); - } - - public ArrayList getMaximalSubsets() { - ArrayList maximalSubsetPaths = new ArrayList<>(); - - if (this.isEmpty()) { - return new ArrayList<>(); - } else { - for (Integer columnIndex : this.getSetBits()) { - maximalSubsetPaths.add((Path)this.removeColumnCopy(columnIndex)); - } - } - - return maximalSubsetPaths; - } -} +package fdiscovery.columns; + +import java.util.ArrayList; + + +public class Path extends ColumnCollection { + + private static final long serialVersionUID = -6451347203736964695L; + + public Path(long numberOfColumns) { + super(numberOfColumns); + } + + public ArrayList getMaximalSubsets() { + ArrayList maximalSubsetPaths = new ArrayList<>(); + + if (this.isEmpty()) { + return new ArrayList<>(); + } + for (int columnIndex : this.getSetBits()) { + maximalSubsetPaths.add((Path)this.removeColumnCopy(columnIndex)); + } + + return maximalSubsetPaths; + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java index 77819d8..bd5d28d 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java @@ -1,51 +1,51 @@ -package fdiscovery.equivalence; - -import java.util.HashSet; -import java.util.Set; - -public class EquivalenceGroupHashSet extends HashSet implements Comparable, Equivalence { - - private static final long serialVersionUID = 8411462245069900864L; - - private int identifier; - - public EquivalenceGroupHashSet() { - this.identifier = Equivalence.unassignedIdentifier; - } - - public EquivalenceGroupHashSet(int identifier) { - this.identifier = identifier; - } - - @Override - public int compareTo(EquivalenceGroupHashSet o) { - if (this.size() != o.size()) { - return this.size() - o.size(); - } else { - return this.identifier - o.identifier; - } - } - - @Override - public int getIdentifier() { - return this.identifier; - } - - @Override - public > boolean isProperSubset(T other) { - if (this.size() >= other.size()) { - return false; - } - - return other.containsAll(this); - } - - @Override - public void add(int value) { - if (this.identifier == Equivalence.unassignedIdentifier) { - this.identifier = value; - } - - super.add(value); - } +package fdiscovery.equivalence; + +import java.util.HashSet; +import java.util.Set; + +public class EquivalenceGroupHashSet extends HashSet implements Comparable, Equivalence { + + private static final long serialVersionUID = 8411462245069900864L; + + private int identifier; + + public EquivalenceGroupHashSet() { + this.identifier = Equivalence.unassignedIdentifier; + } + + public EquivalenceGroupHashSet(int identifier) { + this.identifier = identifier; + } + + @Override + public int compareTo(EquivalenceGroupHashSet o) { + if (this.size() != o.size()) { + return this.size() - o.size(); + } else { + return this.identifier - o.identifier; + } + } + + @Override + public int getIdentifier() { + return this.identifier; + } + + @Override + public > boolean isProperSubset(T other) { + if (this.size() >= other.size()) { + return false; + } + + return other.containsAll(this); + } + + @Override + public void add(int value) { + if (this.identifier == Equivalence.unassignedIdentifier) { + this.identifier = value; + } + + super.add(Integer.valueOf(value)); + } } \ No newline at end of file diff --git a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTreeSet.java b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTreeSet.java index 8a31ac2..687ac00 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTreeSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTreeSet.java @@ -22,14 +22,13 @@ public EquivalenceGroupTreeSet(int identifier) { public int compareTo(EquivalenceGroupTreeSet o) { if (this.size() != o.size()) { return this.size() - o.size(); - } else { - return this.first() - o.first(); } + return this.first().intValue() - o.first().intValue(); } @Override public int getIdentifier() { - return this.first(); + return this.first().intValue(); } @Override @@ -47,6 +46,6 @@ public void add(int value) { this.identifier = value; } - super.add(value); + super.add(Integer.valueOf(value)); } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClass.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClass.java index 81799ad..fa1d52a 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClass.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClass.java @@ -13,7 +13,7 @@ public String toString() { StringBuilder outputBuilder = new StringBuilder(); for (Iterator it = this.iterator(); it.hasNext(); ) { Point identifier = it.next(); - outputBuilder.append(String.format("(%s,%d),", (char)(identifier.x+65), identifier.y)); + outputBuilder.append(String.format("(%s,%d),", Character.valueOf((char)(identifier.x + 65)), Integer.valueOf(identifier.y))); } return outputBuilder.toString(); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClasses.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClasses.java index 8182bde..9859557 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClasses.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/EquivalenceClasses.java @@ -34,7 +34,7 @@ public String toString() { for (TIntObjectIterator it = this.iterator(); it.hasNext(); ) { it.advance(); - outputBuilder.append(String.format("ec(%d(\t", it.key())); + outputBuilder.append(String.format("ec(%d(\t", Integer.valueOf(it.key()))); outputBuilder.append(String.format("{%s}\n", it.value().toString())); } outputBuilder.append("EquivalenceClasses\n"); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/MaximalEquivalenceClasses.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/MaximalEquivalenceClasses.java index 769f8f2..36e44f4 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/MaximalEquivalenceClasses.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/MaximalEquivalenceClasses.java @@ -59,7 +59,7 @@ public MaximalEquivalenceClasses(StrippedPartitions strippedPartitions) throws O } } - for (Integer groupSize : equivalenceGroupsBySize.keys()) { + for (int groupSize : equivalenceGroupsBySize.keys()) { for (TEquivalence sizeGroup : equivalenceGroupsBySize.get(groupSize)) { maximumGroupSize = Math.max(groupSize, maximumGroupSize); this.add(sizeGroup); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java index 11fec1d..a2b5851 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java @@ -1,75 +1,75 @@ -package fdiscovery.fastfds; - -import gnu.trove.map.hash.TIntIntHashMap; - -import java.util.ArrayList; -import java.util.Collections; - -import fdiscovery.columns.DifferenceSet; -import fdiscovery.columns.DifferenceSets; - -public class PartialOrder extends ArrayList { - - private static final long serialVersionUID = -4312148937513750522L; - - public PartialOrder(DifferenceSets differenceSets) { - TIntIntHashMap orderMap = new TIntIntHashMap(); - - for (DifferenceSet differenceSet : differenceSets) { - // increase the cover count for set columns - long bitIndex = 0; - while (bitIndex < differenceSet.getNumberOfColumns()) { - long currentNextSetBit = differenceSet.nextSetBit(bitIndex); - if (currentNextSetBit != -1) { - bitIndex = currentNextSetBit + 1; - orderMap.putIfAbsent((int) currentNextSetBit, 0); - orderMap.increment((int) currentNextSetBit); - } else { - bitIndex = differenceSet.getNumberOfColumns(); - } - } - } - - for (Integer index : orderMap.keys()) { - this.add(new CoverOrder(index, orderMap.get(index))); - } - - Collections.sort(this, Collections.reverseOrder()); - - } - - public PartialOrder(DifferenceSets differenceSets, long columnIndexToSkip) { - TIntIntHashMap orderMap = new TIntIntHashMap(); - - for (DifferenceSet differenceSet : differenceSets) { - // increase the cover count for set columns - long bitIndex = columnIndexToSkip; - while (bitIndex < differenceSet.getNumberOfColumns()) { - long currentNextSetBit = differenceSet.nextSetBit(bitIndex); - if (currentNextSetBit != -1) { - bitIndex = currentNextSetBit + 1; - orderMap.putIfAbsent((int) currentNextSetBit, 0); - orderMap.increment((int) currentNextSetBit); - } else { - bitIndex = differenceSet.getNumberOfColumns(); - } - } - } - - for (Integer index : orderMap.keys()) { - this.add(new CoverOrder(index, orderMap.get(index))); - } - - Collections.sort(this, Collections.reverseOrder()); - - } - - public ArrayList getOrderedColumns() { - ArrayList orderedColumns = new ArrayList<>(); - for (CoverOrder order : this) { - orderedColumns.add(order.getColumnIndex()); - } - - return orderedColumns; - } -} +package fdiscovery.fastfds; + +import gnu.trove.map.hash.TIntIntHashMap; + +import java.util.ArrayList; +import java.util.Collections; + +import fdiscovery.columns.DifferenceSet; +import fdiscovery.columns.DifferenceSets; + +public class PartialOrder extends ArrayList { + + private static final long serialVersionUID = -4312148937513750522L; + + public PartialOrder(DifferenceSets differenceSets) { + TIntIntHashMap orderMap = new TIntIntHashMap(); + + for (DifferenceSet differenceSet : differenceSets) { + // increase the cover count for set columns + long bitIndex = 0; + while (bitIndex < differenceSet.getNumberOfColumns()) { + long currentNextSetBit = differenceSet.nextSetBit(bitIndex); + if (currentNextSetBit != -1) { + bitIndex = currentNextSetBit + 1; + orderMap.putIfAbsent((int) currentNextSetBit, 0); + orderMap.increment((int) currentNextSetBit); + } else { + bitIndex = differenceSet.getNumberOfColumns(); + } + } + } + + for (int index : orderMap.keys()) { + this.add(new CoverOrder(index, orderMap.get(index))); + } + + Collections.sort(this, Collections.reverseOrder()); + + } + + public PartialOrder(DifferenceSets differenceSets, long columnIndexToSkip) { + TIntIntHashMap orderMap = new TIntIntHashMap(); + + for (DifferenceSet differenceSet : differenceSets) { + // increase the cover count for set columns + long bitIndex = columnIndexToSkip; + while (bitIndex < differenceSet.getNumberOfColumns()) { + long currentNextSetBit = differenceSet.nextSetBit(bitIndex); + if (currentNextSetBit != -1) { + bitIndex = currentNextSetBit + 1; + orderMap.putIfAbsent((int) currentNextSetBit, 0); + orderMap.increment((int) currentNextSetBit); + } else { + bitIndex = differenceSet.getNumberOfColumns(); + } + } + } + + for (int index : orderMap.keys()) { + this.add(new CoverOrder(index, orderMap.get(index))); + } + + Collections.sort(this, Collections.reverseOrder()); + + } + + public ArrayList getOrderedColumns() { + ArrayList orderedColumns = new ArrayList<>(); + for (CoverOrder order : this) { + orderedColumns.add(Integer.valueOf(order.getColumnIndex())); + } + + return orderedColumns; + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java index 884c216..9876cab 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java @@ -1,218 +1,218 @@ -package fdiscovery.fastfds.runner; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileWriter; -import java.io.IOException; - -import org.apache.commons.cli.CommandLine; - -import fdiscovery.columns.AgreeSets; -import fdiscovery.columns.ColumnCollection; -import fdiscovery.columns.DifferenceSets; -import fdiscovery.columns.Path; - -import com.rits.cloning.Cloner; - -import fdiscovery.partitions.StrippedPartitions; -import fdiscovery.preprocessing.SVFileProcessor; -import fdiscovery.fastfds.EquivalenceClasses; -import fdiscovery.fastfds.MaximalEquivalenceClasses; -import fdiscovery.fastfds.PartialOrder; -import fdiscovery.general.CLIParserMiner; -import fdiscovery.general.ColumnFiles; -import fdiscovery.general.FunctionalDependencies; -import fdiscovery.general.Miner; - -public class FastFDs extends Miner { - - private int numberOfColumns; - private int numberOfRows; - private FunctionalDependencies minimalDependencies; - private DifferenceSets differenceSets; - - public static void main2(String[] args) { - createColumDirectory(); - createResultDirectory(); - - File source = new File(Miner.input); - SVFileProcessor inputFileProcessor = null; - try { - long timeStart = System.currentTimeMillis(); - - inputFileProcessor = new SVFileProcessor(source); - inputFileProcessor.init(); - System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter()); - System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns()); - System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows()); - inputFileProcessor.createColumnFiles(); - FastFDs fastFDRunner = new FastFDs(inputFileProcessor); - - fastFDRunner.run(); - System.out.println(String.format("Dependencies: %d.", fastFDRunner.minimalDependencies.getCount())); - long timeFindFDs = System.currentTimeMillis(); - System.out.println("Total time:\t" + (timeFindFDs - timeStart)/1000 + "s"); - System.out.println(fastFDRunner.getDependencies()); - } catch (FileNotFoundException e) { - System.out.println("The input file could not be found."); - } catch (IOException e) { - System.out.println("The input reader could not be reset."); - } - } - - public static void main(String[] args) { - CLIParserMiner parser = new CLIParserMiner(); - CommandLine cli = parser.parse(args); - String inputFilename = new String(); - String columnFileDirectory = new String(); - String resultFile = new String(); - int numberOfColumns = 0; - int numberOfRows = 0; - - if (cli.hasOption("file")) { - inputFilename = cli.getOptionValue("file"); - } - if (cli.hasOption("input")) { - columnFileDirectory = cli.getOptionValue("input"); - } - if (cli.hasOption("result")) { - resultFile = cli.getOptionValue("result"); - } - if (cli.hasOption("columns")) { - numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")); - } - if (cli.hasOption("rows")) { - numberOfRows = Integer.valueOf(cli.getOptionValue("rows")); - } - ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows); - long timeStart = System.currentTimeMillis(); - try { - FastFDs runner = new FastFDs(columnFiles, numberOfRows); - runner.run(); - long timeEnd = System.currentTimeMillis(); - runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename); - } catch(OutOfMemoryError e) { - System.exit(Miner.STATUS_OOM); - } - System.exit(0); - } - - private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { - String timeString = (time != -1)? String.format("%.1f", (double)(time)/1000) : "-1"; - - StringBuilder outputBuilder = new StringBuilder(); - if (!inputFileName.isEmpty()) { - outputBuilder.append(String.format("%s\t", inputFileName)); - } - outputBuilder.append(String.format("%d\t", this.numberOfRows)); - outputBuilder.append(String.format("%d\t", this.numberOfColumns)); - outputBuilder.append(String.format("%s\t", timeString)); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCount())); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(2))); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(3))); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(4))); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(5))); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(6))); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeGreaterThan(5))); - outputBuilder.append(String.format("%d\t", 0)); - outputBuilder.append(String.format("%d\t", 0)); - outputBuilder.append(String.format("%d\n", Runtime.getRuntime().totalMemory())); - outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false))); - - try { - BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true)); - resultFileWriter.write(outputBuilder.toString()); - System.out.print(outputBuilder.toString()); - resultFileWriter.close(); - } catch (IOException e) { - System.out.println("Couldn't write output."); - } - } - - public FastFDs(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError { - this.minimalDependencies = new FunctionalDependencies(); - this.numberOfColumns = columnFiles.getNumberOfColumns(); - this.numberOfRows = numberOfRows; - - StrippedPartitions strippedPartitions = new StrippedPartitions(columnFiles); - EquivalenceClasses equivalenceClasses = new EquivalenceClasses(strippedPartitions); - MaximalEquivalenceClasses maximalEquivalenceClasses = new MaximalEquivalenceClasses(strippedPartitions); - strippedPartitions.clear(); - AgreeSets agreeSets = new AgreeSets(maximalEquivalenceClasses, equivalenceClasses, this.numberOfColumns, this.numberOfRows); - maximalEquivalenceClasses.clear(); - equivalenceClasses.clear(); - this.differenceSets = new DifferenceSets(agreeSets); - agreeSets.clear(); - } - - public FastFDs(SVFileProcessor table) throws OutOfMemoryError { - this.minimalDependencies = new FunctionalDependencies(); - this.numberOfColumns = table.getNumberOfColumns(); - this.numberOfRows = table.getNumberOfRows(); - - ColumnFiles columnFiles = table.getColumnFiles(); - StrippedPartitions strippedPartitions = new StrippedPartitions(columnFiles); - EquivalenceClasses equivalenceClasses = new EquivalenceClasses(strippedPartitions); - MaximalEquivalenceClasses maximalEquivalenceClasses = new MaximalEquivalenceClasses(strippedPartitions); - strippedPartitions.clear(); - AgreeSets agreeSets = new AgreeSets(maximalEquivalenceClasses, equivalenceClasses, this.numberOfColumns, this.numberOfRows); - maximalEquivalenceClasses.clear(); - equivalenceClasses.clear(); - this.differenceSets = new DifferenceSets(agreeSets); - agreeSets.clear(); - } - - public void run() throws OutOfMemoryError { - int numberOfColumns = this.numberOfColumns; - - DifferenceSets[] differenceSetsModulo = this.differenceSets.allModulo(this.numberOfColumns); - for (int rhsIndex = 0; rhsIndex < numberOfColumns; rhsIndex++) { - DifferenceSets orig = differenceSetsModulo[rhsIndex]; - Cloner cloner = new Cloner(); - DifferenceSets uncovered = cloner.deepClone(orig); - if (orig.isEmpty()) { - ColumnCollection lhs = new ColumnCollection(this.numberOfColumns); - - for (Integer lhsIndex : lhs.setCopy(rhsIndex).complement().getSetBits()) { - this.minimalDependencies.addRHSColumn(lhs.setCopy(lhsIndex), rhsIndex); - } - } - else if (!orig.containsEmptySet()) { - PartialOrder currentOrder = new PartialOrder(orig); - Path path = new Path(numberOfColumns); - findCovers(rhsIndex, orig, uncovered, path, currentOrder); - } - } - } - - public void findCovers(int columnIndex, DifferenceSets orig, DifferenceSets uncovered, Path currentPath, PartialOrder currentOrder) { - // no dependencies here - if (currentOrder.isEmpty() && !uncovered.isEmpty()) { - return; - } - - if (uncovered.isEmpty()) { - if (!orig.maximumSubsetCoversDifferenceSet(currentPath)) { - this.minimalDependencies.addRHSColumn(currentPath, columnIndex); - } else { - // dependency not minimal - return; - } - } - - // RECURSIVE CASE - for (Integer remainingColumn : currentOrder.getOrderedColumns()) { - DifferenceSets nextDifferenceSets = uncovered.removeCovered(remainingColumn); - PartialOrder nextOrder = new PartialOrder(nextDifferenceSets, remainingColumn); - Path nextPath = (Path) currentPath.addColumn(remainingColumn); - - nextPath.addColumn(remainingColumn); - findCovers(columnIndex, orig, nextDifferenceSets, nextPath, nextOrder); - } - } - - public FunctionalDependencies getDependencies() { - return this.minimalDependencies; - } -} +package fdiscovery.fastfds.runner; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.IOException; + +import org.apache.commons.cli.CommandLine; + +import fdiscovery.columns.AgreeSets; +import fdiscovery.columns.ColumnCollection; +import fdiscovery.columns.DifferenceSets; +import fdiscovery.columns.Path; + +import com.rits.cloning.Cloner; + +import fdiscovery.partitions.StrippedPartitions; +import fdiscovery.preprocessing.SVFileProcessor; +import fdiscovery.fastfds.EquivalenceClasses; +import fdiscovery.fastfds.MaximalEquivalenceClasses; +import fdiscovery.fastfds.PartialOrder; +import fdiscovery.general.CLIParserMiner; +import fdiscovery.general.ColumnFiles; +import fdiscovery.general.FunctionalDependencies; +import fdiscovery.general.Miner; + +public class FastFDs extends Miner { + + private int numberOfColumns; + private int numberOfRows; + private FunctionalDependencies minimalDependencies; + private DifferenceSets differenceSets; + + public static void main2(String[] args) { + createColumDirectory(); + createResultDirectory(); + + File source = new File(Miner.input); + SVFileProcessor inputFileProcessor = null; + try { + long timeStart = System.currentTimeMillis(); + + inputFileProcessor = new SVFileProcessor(source); + inputFileProcessor.init(); + System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter()); + System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns()); + System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows()); + inputFileProcessor.createColumnFiles(); + FastFDs fastFDRunner = new FastFDs(inputFileProcessor); + + fastFDRunner.run(); + System.out.println(String.format("Dependencies: %d.", Integer.valueOf(fastFDRunner.minimalDependencies.getCount()))); + long timeFindFDs = System.currentTimeMillis(); + System.out.println("Total time:\t" + (timeFindFDs - timeStart)/1000 + "s"); + System.out.println(fastFDRunner.getDependencies()); + } catch (FileNotFoundException e) { + System.out.println("The input file could not be found."); + } catch (IOException e) { + System.out.println("The input reader could not be reset."); + } + } + + public static void main(String[] args) { + CLIParserMiner parser = new CLIParserMiner(); + CommandLine cli = parser.parse(args); + String inputFilename = new String(); + String columnFileDirectory = new String(); + String resultFile = new String(); + int numberOfColumns = 0; + int numberOfRows = 0; + + if (cli.hasOption("file")) { + inputFilename = cli.getOptionValue("file"); + } + if (cli.hasOption("input")) { + columnFileDirectory = cli.getOptionValue("input"); + } + if (cli.hasOption("result")) { + resultFile = cli.getOptionValue("result"); + } + if (cli.hasOption("columns")) { + numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue(); + } + if (cli.hasOption("rows")) { + numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue(); + } + ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows); + long timeStart = System.currentTimeMillis(); + try { + FastFDs runner = new FastFDs(columnFiles, numberOfRows); + runner.run(); + long timeEnd = System.currentTimeMillis(); + runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename); + } catch(OutOfMemoryError e) { + System.exit(Miner.STATUS_OOM); + } + System.exit(0); + } + + private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { + String timeString = (time != -1)? String.format("%.1f", (double)(time)/1000) : "-1"; + + StringBuilder outputBuilder = new StringBuilder(); + if (!inputFileName.isEmpty()) { + outputBuilder.append(String.format("%s\t", inputFileName)); + } + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns))); + outputBuilder.append(String.format("%s\t", timeString)); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount()))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(0))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(0))); + outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory()))); + outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false))); + + try { + BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true)); + resultFileWriter.write(outputBuilder.toString()); + System.out.print(outputBuilder.toString()); + resultFileWriter.close(); + } catch (IOException e) { + System.out.println("Couldn't write output."); + } + } + + public FastFDs(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError { + this.minimalDependencies = new FunctionalDependencies(); + this.numberOfColumns = columnFiles.getNumberOfColumns(); + this.numberOfRows = numberOfRows; + + StrippedPartitions strippedPartitions = new StrippedPartitions(columnFiles); + EquivalenceClasses equivalenceClasses = new EquivalenceClasses(strippedPartitions); + MaximalEquivalenceClasses maximalEquivalenceClasses = new MaximalEquivalenceClasses(strippedPartitions); + strippedPartitions.clear(); + AgreeSets agreeSets = new AgreeSets(maximalEquivalenceClasses, equivalenceClasses, this.numberOfColumns, this.numberOfRows); + maximalEquivalenceClasses.clear(); + equivalenceClasses.clear(); + this.differenceSets = new DifferenceSets(agreeSets); + agreeSets.clear(); + } + + public FastFDs(SVFileProcessor table) throws OutOfMemoryError { + this.minimalDependencies = new FunctionalDependencies(); + this.numberOfColumns = table.getNumberOfColumns(); + this.numberOfRows = table.getNumberOfRows(); + + ColumnFiles columnFiles = table.getColumnFiles(); + StrippedPartitions strippedPartitions = new StrippedPartitions(columnFiles); + EquivalenceClasses equivalenceClasses = new EquivalenceClasses(strippedPartitions); + MaximalEquivalenceClasses maximalEquivalenceClasses = new MaximalEquivalenceClasses(strippedPartitions); + strippedPartitions.clear(); + AgreeSets agreeSets = new AgreeSets(maximalEquivalenceClasses, equivalenceClasses, this.numberOfColumns, this.numberOfRows); + maximalEquivalenceClasses.clear(); + equivalenceClasses.clear(); + this.differenceSets = new DifferenceSets(agreeSets); + agreeSets.clear(); + } + + public void run() throws OutOfMemoryError { + int numberOfColumns = this.numberOfColumns; + + DifferenceSets[] differenceSetsModulo = this.differenceSets.allModulo(this.numberOfColumns); + for (int rhsIndex = 0; rhsIndex < numberOfColumns; rhsIndex++) { + DifferenceSets orig = differenceSetsModulo[rhsIndex]; + Cloner cloner = new Cloner(); + DifferenceSets uncovered = cloner.deepClone(orig); + if (orig.isEmpty()) { + ColumnCollection lhs = new ColumnCollection(this.numberOfColumns); + + for (int lhsIndex : lhs.setCopy(rhsIndex).complement().getSetBits()) { + this.minimalDependencies.addRHSColumn(lhs.setCopy(lhsIndex), rhsIndex); + } + } + else if (!orig.containsEmptySet()) { + PartialOrder currentOrder = new PartialOrder(orig); + Path path = new Path(numberOfColumns); + findCovers(rhsIndex, orig, uncovered, path, currentOrder); + } + } + } + + public void findCovers(int columnIndex, DifferenceSets orig, DifferenceSets uncovered, Path currentPath, PartialOrder currentOrder) { + // no dependencies here + if (currentOrder.isEmpty() && !uncovered.isEmpty()) { + return; + } + + if (uncovered.isEmpty()) { + if (!orig.maximumSubsetCoversDifferenceSet(currentPath)) { + this.minimalDependencies.addRHSColumn(currentPath, columnIndex); + } else { + // dependency not minimal + return; + } + } + + // RECURSIVE CASE + for (int remainingColumn : currentOrder.getOrderedColumns()) { + DifferenceSets nextDifferenceSets = uncovered.removeCovered(remainingColumn); + PartialOrder nextOrder = new PartialOrder(nextDifferenceSets, remainingColumn); + Path nextPath = (Path) currentPath.addColumn(remainingColumn); + + nextPath.addColumn(remainingColumn); + findCovers(columnIndex, orig, nextDifferenceSets, nextPath, nextOrder); + } + } + + public FunctionalDependencies getDependencies() { + return this.minimalDependencies; + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java b/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java index 89d0178..8152d42 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java @@ -1,219 +1,219 @@ -package fdiscovery.general; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileWriter; -import java.io.FilenameFilter; -import java.io.IOException; -import java.util.Arrays; - -import org.apache.commons.exec.CommandLine; -import org.apache.commons.exec.DefaultExecuteResultHandler; -import org.apache.commons.exec.DefaultExecutor; -import org.apache.commons.exec.ExecuteWatchdog; -import org.apache.commons.exec.PumpStreamHandler; - -import fdiscovery.preprocessing.SVFileProcessor; -import gnu.trove.map.hash.THashMap; - -public class Benchmarker { - - protected static File[] getBenchmarkFilesWithPattern(File benchmarkDirectory) { - File[] benchmarkFiles = benchmarkDirectory.listFiles(new FilenameFilter() { - - @Override - public boolean accept(File dir, String name) { - return name.matches(Miner.BENCHMARK_FILE_REGEX); - } - }); - return benchmarkFiles; - } - - protected static final String getResultFileName(String inputDirectory, String miner) { - String[] splitInputDirectory = inputDirectory.split("\\" + File.separator); - if (splitInputDirectory.length >= 2) { - String staticComponent = splitInputDirectory[splitInputDirectory.length-1]; - String source = splitInputDirectory[splitInputDirectory.length-2]; - return String.format("%s%s-%s-%s.dat", Miner.RESULT_FILE_PATH, miner, staticComponent, source); - } - return new String(); - } - - protected static final void writeErrorCode(File resultFile, int exitCode) { - try { - BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(resultFile, true)); - if (exitCode == Miner.STATUS_OOT) { - resultFileWriter.write("#OOT"); - } else if (exitCode == Miner.STATUS_OOM) { - resultFileWriter.write("#OOM"); - } - resultFileWriter.close(); - } catch (IOException e) { - System.out.println("Couldn't write meta data."); - } - } - - protected static final void writeMetaData(File resultFile, THashMap cmdLine) { - StringBuilder metaDataLineBuilder = new StringBuilder(); - for (String optionKey : cmdLine.keySet()) { - if (cmdLine.get(optionKey) != null) { - metaDataLineBuilder.append(String.format("# %s :\t%s\n", optionKey, cmdLine.get(optionKey))); - System.out.print(String.format("# %s :\t%s\n", optionKey, cmdLine.get(optionKey))); - } else { - metaDataLineBuilder.append(String.format("# %s :\t%s\n", optionKey, "true")); - System.out.print(String.format("# %s :\t%s\n", optionKey, "true")); - } - } - metaDataLineBuilder.append("#Filename\t#Rows\t#Columns\tTime\t#Deps\t#<2Deps\t#<3Deps\t#<4Deps\t#<5Deps\t#<6Deps\t#>5Deps\t#Partitions\n"); - System.out.println("#Filename\t#Rows\t#Columns\tTime\t#Deps\t#<2Deps\t#<3Deps\t#<4Deps\t#<5Deps\t#<6Deps\t#>5Deps\t#Partitions\n"); - try { - BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(resultFile)); - resultFileWriter.write(metaDataLineBuilder.toString()); - resultFileWriter.close(); - } catch (IOException e) { - System.out.println("Couldn't write meta data."); - } - } - - public static void main(String[] args) { - CLIParserBenchmarker parser = new CLIParserBenchmarker(); - THashMap cmdLine = parser.parse(args); - String inputDirectoryName = new String(); - String miner = new String(); - char delimiter = '\t'; - String xmx = new String(); - int timeout = -1; - boolean allFiles = false; - - if (cmdLine.contains("input")) { - inputDirectoryName = cmdLine.get("input"); - } - if (cmdLine.contains("miner")) { - miner = cmdLine.get("miner"); - } - if (cmdLine.contains("delimiter")) { - delimiter = (cmdLine.get("delimiter")).charAt(0); - } - if (cmdLine.contains("xmx")) { - xmx = cmdLine.get("xmx"); - } - if (cmdLine.contains("timeout")) { - System.out.println(String.format("Timeout:%s", cmdLine.get("timeout"))); - timeout = Integer.valueOf(cmdLine.get("timeout")); - } - if (cmdLine.containsKey("all")) { - System.out.println("Use all files."); - allFiles = true; - } - File executable = null; - if (miner.equals("tane")) { - executable = new File("tane.jar"); - } else if (miner.equals("fastfds")) { - executable = new File("fastfds.jar"); - } else if (miner.equals("dfd")) { - executable = new File("dfd.jar"); - } - else { - System.out.println(String.format("No valid miner:\t%s", miner)); - System.exit(1); - } - - File inputDirectory = new File(inputDirectoryName); - if (!inputDirectory.exists()) { - System.out.println("Input directory doesn't exist."); - System.exit(1); - } - - File[] benchmarkFiles = new File[0]; - if (allFiles) { - benchmarkFiles = inputDirectory.listFiles(); - } else { - benchmarkFiles = getBenchmarkFilesWithPattern(inputDirectory); - } - Arrays.sort(benchmarkFiles); - - if (benchmarkFiles.length != 0) { - Miner.createColumDirectory(); - Miner.createResultDirectory(); - String resultFilename = getResultFileName(inputDirectory.getAbsolutePath(), miner); - File resultFile = new File(resultFilename); - writeMetaData(resultFile, cmdLine); - boolean errors = false; - for (File benchmarkFile : benchmarkFiles) { - if (!errors) { - try { - // create columns files and collect meta data - SVFileProcessor fileProcessor = new SVFileProcessor(benchmarkFile); - fileProcessor.init(delimiter); - fileProcessor.createColumnFiles(); - - // build command line with parameters - CommandLine processCmdLine = new CommandLine("java"); - processCmdLine.addArgument("-d64"); - processCmdLine.addArgument("-XX:GCTimeLimit=90"); - processCmdLine.addArgument("-XX:GCHeapFreeLimit=10"); - processCmdLine.addArgument("-XX:+UseSerialGC"); - processCmdLine.addArgument(String.format("-Xmx%s", xmx)); - processCmdLine.addArgument("-jar"); - processCmdLine.addArgument(executable.getName()); - processCmdLine.addArgument("-file"); - processCmdLine.addArgument(String.valueOf(benchmarkFile.getName())); - processCmdLine.addArgument("-columns"); - processCmdLine.addArgument(String.valueOf(fileProcessor.getNumberOfColumns())); - processCmdLine.addArgument("-rows"); - processCmdLine.addArgument(String.valueOf(fileProcessor.getNumberOfRows())); - processCmdLine.addArgument("-result"); - processCmdLine.addArgument(resultFile.getAbsolutePath()); - processCmdLine.addArgument("-input"); - processCmdLine.addArgument(fileProcessor.getColumnDirectoryName()); - - // build process with watchdog - DefaultExecutor executor = new DefaultExecutor(); - ExecuteWatchdog watchdog = new ExecuteWatchdog(timeout); - executor.setWatchdog(watchdog); - - // handle results - DefaultExecuteResultHandler resultHandler = new DefaultExecuteResultHandler(); - PumpStreamHandler streamHandler = new PumpStreamHandler(); - executor.setStreamHandler(streamHandler); - long timeStart = System.currentTimeMillis(); - executor.execute(processCmdLine, resultHandler); - resultHandler.waitFor(timeout); - - long timeEnd = System.currentTimeMillis(); - System.out.println(String.format("Time:%.1f", (double)(timeEnd - timeStart)/1000)); - - int exitCode = 0; - if (resultHandler.hasResult()) { - exitCode = resultHandler.getExitValue(); - } else { - exitCode = Miner.STATUS_OOT; - executor.getWatchdog().destroyProcess(); - } - - if (watchdog.killedProcess()) { - exitCode = Miner.STATUS_OOT; - executor.getWatchdog().destroyProcess(); - } else { - } - System.out.println(String.format("ExitCode %d", exitCode)); - if (exitCode == Miner.STATUS_OK) { - - } else if (exitCode == Miner.STATUS_OOT || exitCode == Miner.STATUS_OOM) { - writeErrorCode(resultFile, exitCode); - errors = true; - } - } catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - } - } - - } -} +package fdiscovery.general; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.FilenameFilter; +import java.io.IOException; +import java.util.Arrays; + +import org.apache.commons.exec.CommandLine; +import org.apache.commons.exec.DefaultExecuteResultHandler; +import org.apache.commons.exec.DefaultExecutor; +import org.apache.commons.exec.ExecuteWatchdog; +import org.apache.commons.exec.PumpStreamHandler; + +import fdiscovery.preprocessing.SVFileProcessor; +import gnu.trove.map.hash.THashMap; + +public class Benchmarker { + + protected static File[] getBenchmarkFilesWithPattern(File benchmarkDirectory) { + File[] benchmarkFiles = benchmarkDirectory.listFiles(new FilenameFilter() { + + @Override + public boolean accept(File dir, String name) { + return name.matches(Miner.BENCHMARK_FILE_REGEX); + } + }); + return benchmarkFiles; + } + + protected static final String getResultFileName(String inputDirectory, String miner) { + String[] splitInputDirectory = inputDirectory.split("\\" + File.separator); + if (splitInputDirectory.length >= 2) { + String staticComponent = splitInputDirectory[splitInputDirectory.length-1]; + String source = splitInputDirectory[splitInputDirectory.length-2]; + return String.format("%s%s-%s-%s.dat", Miner.RESULT_FILE_PATH, miner, staticComponent, source); + } + return new String(); + } + + protected static final void writeErrorCode(File resultFile, int exitCode) { + try { + BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(resultFile, true)); + if (exitCode == Miner.STATUS_OOT) { + resultFileWriter.write("#OOT"); + } else if (exitCode == Miner.STATUS_OOM) { + resultFileWriter.write("#OOM"); + } + resultFileWriter.close(); + } catch (IOException e) { + System.out.println("Couldn't write meta data."); + } + } + + protected static final void writeMetaData(File resultFile, THashMap cmdLine) { + StringBuilder metaDataLineBuilder = new StringBuilder(); + for (String optionKey : cmdLine.keySet()) { + if (cmdLine.get(optionKey) != null) { + metaDataLineBuilder.append(String.format("# %s :\t%s\n", optionKey, cmdLine.get(optionKey))); + System.out.print(String.format("# %s :\t%s\n", optionKey, cmdLine.get(optionKey))); + } else { + metaDataLineBuilder.append(String.format("# %s :\t%s\n", optionKey, "true")); + System.out.print(String.format("# %s :\t%s\n", optionKey, "true")); + } + } + metaDataLineBuilder.append("#Filename\t#Rows\t#Columns\tTime\t#Deps\t#<2Deps\t#<3Deps\t#<4Deps\t#<5Deps\t#<6Deps\t#>5Deps\t#Partitions\n"); + System.out.println("#Filename\t#Rows\t#Columns\tTime\t#Deps\t#<2Deps\t#<3Deps\t#<4Deps\t#<5Deps\t#<6Deps\t#>5Deps\t#Partitions\n"); + try { + BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(resultFile)); + resultFileWriter.write(metaDataLineBuilder.toString()); + resultFileWriter.close(); + } catch (IOException e) { + System.out.println("Couldn't write meta data."); + } + } + + public static void main(String[] args) { + CLIParserBenchmarker parser = new CLIParserBenchmarker(); + THashMap cmdLine = parser.parse(args); + String inputDirectoryName = new String(); + String miner = new String(); + char delimiter = '\t'; + String xmx = new String(); + int timeout = -1; + boolean allFiles = false; + + if (cmdLine.contains("input")) { + inputDirectoryName = cmdLine.get("input"); + } + if (cmdLine.contains("miner")) { + miner = cmdLine.get("miner"); + } + if (cmdLine.contains("delimiter")) { + delimiter = (cmdLine.get("delimiter")).charAt(0); + } + if (cmdLine.contains("xmx")) { + xmx = cmdLine.get("xmx"); + } + if (cmdLine.contains("timeout")) { + System.out.println(String.format("Timeout:%s", cmdLine.get("timeout"))); + timeout = Integer.valueOf(cmdLine.get("timeout")).intValue(); + } + if (cmdLine.containsKey("all")) { + System.out.println("Use all files."); + allFiles = true; + } + File executable = null; + if (miner.equals("tane")) { + executable = new File("tane.jar"); + } else if (miner.equals("fastfds")) { + executable = new File("fastfds.jar"); + } else if (miner.equals("dfd")) { + executable = new File("dfd.jar"); + } + else { + System.out.println(String.format("No valid miner:\t%s", miner)); + System.exit(1); + } + + File inputDirectory = new File(inputDirectoryName); + if (!inputDirectory.exists()) { + System.out.println("Input directory doesn't exist."); + System.exit(1); + } + + File[] benchmarkFiles = new File[0]; + if (allFiles) { + benchmarkFiles = inputDirectory.listFiles(); + } else { + benchmarkFiles = getBenchmarkFilesWithPattern(inputDirectory); + } + Arrays.sort(benchmarkFiles); + + if (benchmarkFiles.length != 0) { + Miner.createColumDirectory(); + Miner.createResultDirectory(); + String resultFilename = getResultFileName(inputDirectory.getAbsolutePath(), miner); + File resultFile = new File(resultFilename); + writeMetaData(resultFile, cmdLine); + boolean errors = false; + for (File benchmarkFile : benchmarkFiles) { + if (!errors) { + try { + // create columns files and collect meta data + SVFileProcessor fileProcessor = new SVFileProcessor(benchmarkFile); + fileProcessor.init(delimiter); + fileProcessor.createColumnFiles(); + + // build command line with parameters + CommandLine processCmdLine = new CommandLine("java"); + processCmdLine.addArgument("-d64"); + processCmdLine.addArgument("-XX:GCTimeLimit=90"); + processCmdLine.addArgument("-XX:GCHeapFreeLimit=10"); + processCmdLine.addArgument("-XX:+UseSerialGC"); + processCmdLine.addArgument(String.format("-Xmx%s", xmx)); + processCmdLine.addArgument("-jar"); + processCmdLine.addArgument(executable.getName()); + processCmdLine.addArgument("-file"); + processCmdLine.addArgument(String.valueOf(benchmarkFile.getName())); + processCmdLine.addArgument("-columns"); + processCmdLine.addArgument(String.valueOf(fileProcessor.getNumberOfColumns())); + processCmdLine.addArgument("-rows"); + processCmdLine.addArgument(String.valueOf(fileProcessor.getNumberOfRows())); + processCmdLine.addArgument("-result"); + processCmdLine.addArgument(resultFile.getAbsolutePath()); + processCmdLine.addArgument("-input"); + processCmdLine.addArgument(fileProcessor.getColumnDirectoryName()); + + // build process with watchdog + DefaultExecutor executor = new DefaultExecutor(); + ExecuteWatchdog watchdog = new ExecuteWatchdog(timeout); + executor.setWatchdog(watchdog); + + // handle results + DefaultExecuteResultHandler resultHandler = new DefaultExecuteResultHandler(); + PumpStreamHandler streamHandler = new PumpStreamHandler(); + executor.setStreamHandler(streamHandler); + long timeStart = System.currentTimeMillis(); + executor.execute(processCmdLine, resultHandler); + resultHandler.waitFor(timeout); + + long timeEnd = System.currentTimeMillis(); + System.out.println(String.format("Time:%.1f", (double)(timeEnd - timeStart)/1000)); + + int exitCode = 0; + if (resultHandler.hasResult()) { + exitCode = resultHandler.getExitValue(); + } else { + exitCode = Miner.STATUS_OOT; + executor.getWatchdog().destroyProcess(); + } + + if (watchdog.killedProcess()) { + exitCode = Miner.STATUS_OOT; + executor.getWatchdog().destroyProcess(); + } else { + } + System.out.println(String.format("ExitCode %d", Integer.valueOf(exitCode))); + if (exitCode == Miner.STATUS_OK) { + + } else if (exitCode == Miner.STATUS_OOT || exitCode == Miner.STATUS_OOM) { + writeErrorCode(resultFile, exitCode); + errors = true; + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + } + + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java b/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java index 3659509..326adc3 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java @@ -58,6 +58,6 @@ public boolean accept(File file) { } private final String getColumnFileName(final int columnIndex) { - return String.format(this.formatString, columnIndex); + return String.format(this.formatString, Integer.valueOf(columnIndex)); } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/JoinedPartitions.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/JoinedPartitions.java index 1cbfff1..2cb9218 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/JoinedPartitions.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/JoinedPartitions.java @@ -37,7 +37,7 @@ public Partition getAtomicPartition(int columnIndex) { public ArrayList getBestMatchingPartitionsLazy(ColumnCollection path) { ArrayList bestMatchingPartitions = new ArrayList<>(); - for (Integer columnIndex : path.getSetBits()) { + for (int columnIndex : path.getSetBits()) { bestMatchingPartitions.add(this.getAtomicPartition(columnIndex)); } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java index fae9c2e..d2ba11a 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java @@ -1,173 +1,173 @@ -package fdiscovery.partitions; - -import java.util.TreeSet; - -import fdiscovery.columns.ColumnCollection; -import fdiscovery.equivalence.TEquivalence; -import gnu.trove.iterator.TIntIterator; - -public abstract class Partition extends TreeSet implements Comparable { - - private static final long serialVersionUID = 174046028525977844L; - - protected static int[] probeTable; - protected ColumnCollection indices; - protected int numberOfRows; - protected double error; - protected double distinctiveness; -// protected long hashNumber; - - public Partition(int columnIndex, int numberOfColumns, int numberOfRows) { - this.indices = new ColumnCollection(numberOfColumns); - this.indices.set(columnIndex); - this.numberOfRows = numberOfRows; - this.error = -1; - this.distinctiveness = -1; - if (Partition.probeTable == null || Partition.probeTable.length != numberOfRows) { - Partition.probeTable = new int[numberOfRows+1]; - for (int i = 0; i < Partition.probeTable.length; i++) { - Partition.probeTable[i] = -1; - } - } - } - - public void init(int numberOfRows) { - if (Partition.probeTable.length != numberOfRows) { - Partition.probeTable = new int[numberOfRows+1]; - } - } - - public Partition(Partition base, Partition additional) { - this.indices = base.indices.orCopy(additional.indices); - this.error = -1; - this.numberOfRows = base.numberOfRows; - this.distinctiveness = -1; - if (Partition.probeTable == null) { - Partition.probeTable = new int[numberOfRows+1]; - for (int i = 0; i < Partition.probeTable.length; i++) { - Partition.probeTable[i] = -1; - } - } - - } - - private void resetProbeTable() { - for (int i = 0; i < Partition.probeTable.length; i++) { - Partition.probeTable[i] = -1; - } - } - - @Override - public int compareTo(Partition o) { - if (this.getDistinctiveness() == o.getDistinctiveness()) { - return this.indices.compareTo(o.indices); - } else { - return Double.valueOf(this.getDistinctiveness()).compareTo(o.getDistinctiveness()); - } - } - - public int getNumberOfRows() { - return this.numberOfRows; - } - - public ColumnCollection getIndices() { - return this.indices; - } - - protected double getDistinctiveness() { - if (this.distinctiveness == -1) { - double distinctiveness = (double)(this.numberOfRows - this.size())/this.numberOfRows; - this.distinctiveness = distinctiveness; - } - return this.distinctiveness; - } - - public static double estimateDistinctiveness(Partition a, Partition b) { - return a.getDistinctiveness() + b.getDistinctiveness() - a.getDistinctiveness() * b.getDistinctiveness(); - } - - protected double getError() { - if (this.error == -1) { - int cumulatedEqClassSizes = 0; - for (TEquivalence equivalenceGroup : this) { - cumulatedEqClassSizes += equivalenceGroup.size(); - } - double error = (double)(cumulatedEqClassSizes - this.size())/this.numberOfRows; - this.error = error; - } - return this.error; - } - - public static boolean representsFD(Partition base, Partition baseMergedWithRHS) { - if (base.getError() == baseMergedWithRHS.getError()) { - return true; - } - return false; - } - - public boolean isUnique() { - return this.size() == 0; - } - - public boolean equals(Partition other) { - int numberOfValues = 0; - int groupIndex = 0; - for (TEquivalence equivalenceGroup : this) { - for (TIntIterator equivalenceGroupIt = equivalenceGroup.iterator(); equivalenceGroupIt.hasNext(); ) { - Partition.probeTable[equivalenceGroupIt.next()] = groupIndex; - numberOfValues++; - } - groupIndex++; - } - for (TEquivalence equivalenceGroup : other) { - groupIndex = -2; - for (TIntIterator equivalenceGroupIt = equivalenceGroup.iterator(); equivalenceGroupIt.hasNext(); ) { - int currentGroupIndex = Partition.probeTable[equivalenceGroupIt.next()]; - if (groupIndex == -2 || currentGroupIndex == groupIndex) { - groupIndex = currentGroupIndex; - } else { - resetProbeTable(); - return false; - } - numberOfValues--; - } - } - resetProbeTable(); - if (numberOfValues == 0) { - return true; - } else { - return false; - } - } - - public String printIndices() { - StringBuilder outputBuilder = new StringBuilder((int)this.indices.size()); - - for (int i=0; i < this.indices.size(); i++) { - if (this.indices.get(i)) { - outputBuilder.append("1"); - } else { - outputBuilder.append("0"); - } - } - return outputBuilder.toString(); - } - - @Override - public String toString() { - StringBuilder outputBuilder = new StringBuilder(); - outputBuilder.append(String.format("[%s]{", this.indices)); - - for(TEquivalence equivalenceGroup : this) { - outputBuilder.append("{"); - for (TIntIterator valueIt=equivalenceGroup.iterator(); valueIt.hasNext(); ) { - outputBuilder.append(valueIt.next()); - outputBuilder.append(","); - } - outputBuilder.append("}"); - } - outputBuilder.append("}"); - - return outputBuilder.toString(); - } -} +package fdiscovery.partitions; + +import java.util.TreeSet; + +import fdiscovery.columns.ColumnCollection; +import fdiscovery.equivalence.TEquivalence; +import gnu.trove.iterator.TIntIterator; + +public abstract class Partition extends TreeSet implements Comparable { + + private static final long serialVersionUID = 174046028525977844L; + + protected static int[] probeTable; + protected ColumnCollection indices; + protected int numberOfRows; + protected double error; + protected double distinctiveness; +// protected long hashNumber; + + public Partition(int columnIndex, int numberOfColumns, int numberOfRows) { + this.indices = new ColumnCollection(numberOfColumns); + this.indices.set(columnIndex); + this.numberOfRows = numberOfRows; + this.error = -1; + this.distinctiveness = -1; + if (Partition.probeTable == null || Partition.probeTable.length != numberOfRows) { + Partition.probeTable = new int[numberOfRows+1]; + for (int i = 0; i < Partition.probeTable.length; i++) { + Partition.probeTable[i] = -1; + } + } + } + + public void init(int numberOfRows) { + if (Partition.probeTable.length != numberOfRows) { + Partition.probeTable = new int[numberOfRows+1]; + } + } + + public Partition(Partition base, Partition additional) { + this.indices = base.indices.orCopy(additional.indices); + this.error = -1; + this.numberOfRows = base.numberOfRows; + this.distinctiveness = -1; + if (Partition.probeTable == null) { + Partition.probeTable = new int[numberOfRows+1]; + for (int i = 0; i < Partition.probeTable.length; i++) { + Partition.probeTable[i] = -1; + } + } + + } + + private void resetProbeTable() { + for (int i = 0; i < Partition.probeTable.length; i++) { + Partition.probeTable[i] = -1; + } + } + + @Override + public int compareTo(Partition o) { + if (this.getDistinctiveness() == o.getDistinctiveness()) { + return this.indices.compareTo(o.indices); + } else { + return Double.valueOf(this.getDistinctiveness()).compareTo(o.getDistinctiveness()); + } + } + + public int getNumberOfRows() { + return this.numberOfRows; + } + + public ColumnCollection getIndices() { + return this.indices; + } + + protected double getDistinctiveness() { + if (this.distinctiveness == -1) { + double distinctiveness = (double)(this.numberOfRows - this.size())/this.numberOfRows; + this.distinctiveness = distinctiveness; + } + return this.distinctiveness; + } + + public static double estimateDistinctiveness(Partition a, Partition b) { + return a.getDistinctiveness() + b.getDistinctiveness() - a.getDistinctiveness() * b.getDistinctiveness(); + } + + protected double getError() { + if (this.error == -1) { + int cumulatedEqClassSizes = 0; + for (TEquivalence equivalenceGroup : this) { + cumulatedEqClassSizes += equivalenceGroup.size(); + } + double error = (double)(cumulatedEqClassSizes - this.size())/this.numberOfRows; + this.error = error; + } + return this.error; + } + + public static boolean representsFD(Partition base, Partition baseMergedWithRHS) { + if (base.getError() == baseMergedWithRHS.getError()) { + return true; + } + return false; + } + + public boolean isUnique() { + return this.size() == 0; + } + + public boolean equals(Partition other) { + int numberOfValues = 0; + int groupIndex = 0; + for (TEquivalence equivalenceGroup : this) { + for (TIntIterator equivalenceGroupIt = equivalenceGroup.iterator(); equivalenceGroupIt.hasNext(); ) { + Partition.probeTable[equivalenceGroupIt.next()] = groupIndex; + numberOfValues++; + } + groupIndex++; + } + for (TEquivalence equivalenceGroup : other) { + groupIndex = -2; + for (TIntIterator equivalenceGroupIt = equivalenceGroup.iterator(); equivalenceGroupIt.hasNext(); ) { + int currentGroupIndex = Partition.probeTable[equivalenceGroupIt.next()]; + if (groupIndex == -2 || currentGroupIndex == groupIndex) { + groupIndex = currentGroupIndex; + } else { + resetProbeTable(); + return false; + } + numberOfValues--; + } + } + resetProbeTable(); + if (numberOfValues == 0) { + return true; + } else { + return false; + } + } + + public String printIndices() { + StringBuilder outputBuilder = new StringBuilder(this.indices.size()); + + for (int i=0; i < this.indices.size(); i++) { + if (this.indices.get(i)) { + outputBuilder.append("1"); + } else { + outputBuilder.append("0"); + } + } + return outputBuilder.toString(); + } + + @Override + public String toString() { + StringBuilder outputBuilder = new StringBuilder(); + outputBuilder.append(String.format("[%s]{", this.indices)); + + for(TEquivalence equivalenceGroup : this) { + outputBuilder.append("{"); + for (TIntIterator valueIt=equivalenceGroup.iterator(); valueIt.hasNext(); ) { + outputBuilder.append(valueIt.next()); + outputBuilder.append(","); + } + outputBuilder.append("}"); + } + outputBuilder.append("}"); + + return outputBuilder.toString(); + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java index 98278ec..a38e4f5 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java @@ -1,40 +1,40 @@ -package fdiscovery.partitions; - -import java.util.ArrayList; - -import fdiscovery.columns.ColumnCollection; -import gnu.trove.iterator.TIntObjectIterator; -import gnu.trove.iterator.TLongObjectIterator; -import gnu.trove.map.hash.TIntObjectHashMap; -import gnu.trove.map.hash.TLongObjectHashMap; -import gnu.trove.map.hash.TObjectIntHashMap; - -public class PartitionStatistics extends TObjectIntHashMap { - - public String getStatistics() { - TLongObjectHashMap>> statsAndCountsByLevel = new TLongObjectHashMap<>(); - for (ColumnCollection partitionKey : this.keySet()) { - long keyCardinality = partitionKey.cardinality(); - int usageCount = this.get(partitionKey); - statsAndCountsByLevel.putIfAbsent(keyCardinality, new TIntObjectHashMap>()); - statsAndCountsByLevel.get(keyCardinality).putIfAbsent(usageCount, new ArrayList()); - statsAndCountsByLevel.get(keyCardinality).get(usageCount).add(partitionKey); - } - StringBuilder statisticsBuilder = new StringBuilder(); - statisticsBuilder.append("Statistics:\n"); - for (TLongObjectIterator>> statsByLevelIt = statsAndCountsByLevel.iterator(); statsByLevelIt.hasNext(); ) { - statsByLevelIt.advance(); - long levelCardinality = statsByLevelIt.key(); - statisticsBuilder.append(String.format("%d attributes {\n", levelCardinality)); - for (TIntObjectIterator> countByLevelIt = statsByLevelIt.value().iterator(); countByLevelIt.hasNext(); ) { - countByLevelIt.advance(); - int usageCount = countByLevelIt.key(); - int numberOfElements = countByLevelIt.value().size(); - statisticsBuilder.append(String.format("\t%d elements used %d times\n", numberOfElements, usageCount)); - } - statisticsBuilder.append("}\n"); - } - - return statisticsBuilder.toString(); - } -} +package fdiscovery.partitions; + +import java.util.ArrayList; + +import fdiscovery.columns.ColumnCollection; +import gnu.trove.iterator.TIntObjectIterator; +import gnu.trove.iterator.TLongObjectIterator; +import gnu.trove.map.hash.TIntObjectHashMap; +import gnu.trove.map.hash.TLongObjectHashMap; +import gnu.trove.map.hash.TObjectIntHashMap; + +public class PartitionStatistics extends TObjectIntHashMap { + + public String getStatistics() { + TLongObjectHashMap>> statsAndCountsByLevel = new TLongObjectHashMap<>(); + for (ColumnCollection partitionKey : this.keySet()) { + long keyCardinality = partitionKey.cardinality(); + int usageCount = this.get(partitionKey); + statsAndCountsByLevel.putIfAbsent(keyCardinality, new TIntObjectHashMap>()); + statsAndCountsByLevel.get(keyCardinality).putIfAbsent(usageCount, new ArrayList()); + statsAndCountsByLevel.get(keyCardinality).get(usageCount).add(partitionKey); + } + StringBuilder statisticsBuilder = new StringBuilder(); + statisticsBuilder.append("Statistics:\n"); + for (TLongObjectIterator>> statsByLevelIt = statsAndCountsByLevel.iterator(); statsByLevelIt.hasNext(); ) { + statsByLevelIt.advance(); + long levelCardinality = statsByLevelIt.key(); + statisticsBuilder.append(String.format("%d attributes {\n", levelCardinality)); + for (TIntObjectIterator> countByLevelIt = statsByLevelIt.value().iterator(); countByLevelIt.hasNext(); ) { + countByLevelIt.advance(); + int usageCount = countByLevelIt.key(); + int numberOfElements = countByLevelIt.value().size(); + statisticsBuilder.append(String.format("\t%d elements used %d times\n", Integer.valueOf(numberOfElements), Integer.valueOf(usageCount))); + } + statisticsBuilder.append("}\n"); + } + + return statisticsBuilder.toString(); + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/ProbeTable.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/ProbeTable.java index c570849..bbcb762 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/ProbeTable.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/ProbeTable.java @@ -23,8 +23,8 @@ public ProbeTable(Partition partition) { public String toString() { StringBuilder outputBuilder = new StringBuilder(); outputBuilder.append("ProbeTable:\n"); - for (Integer key : this.keys()) { - outputBuilder.append(String.format("%d\t->\t%d\n", key, this.get(key))); + for (int key : this.keys()) { + outputBuilder.append(String.format("%d\t->\t%d\n", Integer.valueOf(key), Integer.valueOf(this.get(key)))); } return outputBuilder.toString(); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java index d337678..096ed14 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java @@ -1,77 +1,77 @@ -package fdiscovery.partitions; - -import fdiscovery.equivalence.EquivalenceGroupTIntHashSet; -import fdiscovery.equivalence.TEquivalence; -import gnu.trove.iterator.TIntIterator; -import gnu.trove.map.hash.TObjectIntHashMap; - -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.TreeSet; - -public class StrippedPartition extends TreeSet { - - private static final long serialVersionUID = -10500424753490842L; - - // constructor for TANEs strippedProduct - public StrippedPartition() { - - } - - public StrippedPartition(StrippedPartition base, StrippedPartition additional) { - - } - - public StrippedPartition(String[] columnContent) { - TObjectIntHashMap valueToIndex = new TObjectIntHashMap<>(); - LinkedHashMap helpMap = new LinkedHashMap<>(); - - for (int rowIndex = 0; rowIndex < columnContent.length; rowIndex++) { - String value = columnContent[rowIndex]; - // if the value wasn't there yet, the row index becomes the representative - // for that equivalence class - if (!valueToIndex.containsKey(value)) { - valueToIndex.put(value, rowIndex); - TEquivalence equivalenceGroup = new EquivalenceGroupTIntHashSet(); - equivalenceGroup.add(rowIndex); - helpMap.put(rowIndex, equivalenceGroup); - } - // otherwise find the right equivalence class and add the current element index - else { - int equivalenceGroupIndex = valueToIndex.get(value); - TEquivalence equivalenceClass = helpMap.get(equivalenceGroupIndex); - equivalenceClass.add(rowIndex); - } - } - // remove equivalence classes with only one element - for(Iterator> it=helpMap.entrySet().iterator(); it.hasNext();) { - Map.Entry entry = it.next(); - if (entry.getValue().size() <= 1) { - it.remove(); - } - } - - // sort the stripped partition by equivalence group sizes - this.addAll(helpMap.values()); - } - - @Override - public String toString() { - StringBuilder outputBuilder = new StringBuilder(); - outputBuilder.append("{"); - - for(TEquivalence entry : this) { - outputBuilder.append("{"); - for (TIntIterator valueIt=entry.iterator(); valueIt.hasNext(); ) { -// for (TIntIteratorInteger value : entry) { - outputBuilder.append(valueIt.next()); - outputBuilder.append(","); - } - outputBuilder.append("}"); - } - outputBuilder.append("}"); - - return outputBuilder.toString(); - } -} +package fdiscovery.partitions; + +import fdiscovery.equivalence.EquivalenceGroupTIntHashSet; +import fdiscovery.equivalence.TEquivalence; +import gnu.trove.iterator.TIntIterator; +import gnu.trove.map.hash.TObjectIntHashMap; + +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.TreeSet; + +public class StrippedPartition extends TreeSet { + + private static final long serialVersionUID = -10500424753490842L; + + // constructor for TANEs strippedProduct + public StrippedPartition() { + + } + + public StrippedPartition(StrippedPartition base, StrippedPartition additional) { + + } + + public StrippedPartition(String[] columnContent) { + TObjectIntHashMap valueToIndex = new TObjectIntHashMap<>(); + LinkedHashMap helpMap = new LinkedHashMap<>(); + + for (int rowIndex = 0; rowIndex < columnContent.length; rowIndex++) { + String value = columnContent[rowIndex]; + // if the value wasn't there yet, the row index becomes the representative + // for that equivalence class + if (!valueToIndex.containsKey(value)) { + valueToIndex.put(value, rowIndex); + TEquivalence equivalenceGroup = new EquivalenceGroupTIntHashSet(); + equivalenceGroup.add(rowIndex); + helpMap.put(Integer.valueOf(rowIndex), equivalenceGroup); + } + // otherwise find the right equivalence class and add the current element index + else { + int equivalenceGroupIndex = valueToIndex.get(value); + TEquivalence equivalenceClass = helpMap.get(Integer.valueOf(equivalenceGroupIndex)); + equivalenceClass.add(rowIndex); + } + } + // remove equivalence classes with only one element + for(Iterator> it=helpMap.entrySet().iterator(); it.hasNext();) { + Map.Entry entry = it.next(); + if (entry.getValue().size() <= 1) { + it.remove(); + } + } + + // sort the stripped partition by equivalence group sizes + this.addAll(helpMap.values()); + } + + @Override + public String toString() { + StringBuilder outputBuilder = new StringBuilder(); + outputBuilder.append("{"); + + for(TEquivalence entry : this) { + outputBuilder.append("{"); + for (TIntIterator valueIt=entry.iterator(); valueIt.hasNext(); ) { +// for (TIntIteratorInteger value : entry) { + outputBuilder.append(valueIt.next()); + outputBuilder.append(","); + } + outputBuilder.append("}"); + } + outputBuilder.append("}"); + + return outputBuilder.toString(); + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java index da3b43c..205b4d0 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java @@ -11,7 +11,7 @@ public class Observations extends HashMap { private static final long serialVersionUID = 2932117192054503664L; public ColumnCollection getUncheckedMaximalSubset(ColumnCollection lhs) { - for (Integer columnIndex : lhs.getSetBits()) { + for (int columnIndex : lhs.getSetBits()) { ColumnCollection subsetIndices = lhs.removeColumnCopy(columnIndex); if (!this.containsKey(subsetIndices)) { return subsetIndices; @@ -24,7 +24,7 @@ public THashSet getUncheckedMaximalSubsets(ColumnCollection lh THashSet uncheckedMaximalSubsets = new THashSet<>(); // if (lhs.cardinality() > 2) { - for (Integer columnIndex : order.getOrderHighDistinctCount(lhs)) { + for (int columnIndex : order.getOrderHighDistinctCount(lhs)) { ColumnCollection subsetIndices = lhs.removeColumnCopy(columnIndex); if (!this.containsKey(subsetIndices)) { uncheckedMaximalSubsets.add(subsetIndices); @@ -39,7 +39,7 @@ public THashSet getUncheckedOrCandidateMaximalSubsets(ColumnCo // we only want to check subsets with at least 2 columns if (lhs.cardinality() > 2) { - for (Integer columnIndex : order.getOrderHighDistinctCount(lhs)) { + for (int columnIndex : order.getOrderHighDistinctCount(lhs)) { ColumnCollection subsetIndices = lhs.removeColumnCopy(columnIndex); if (!this.containsKey(subsetIndices) || this.get(subsetIndices) == Observation.CANDIDATE_MINIMAL_DEPENDENCY) { uncheckedMaximalSubsets.add(subsetIndices); @@ -54,7 +54,7 @@ public THashSet getMaximalSubsets(ColumnCollection lhs, Column // we only want to check subsets with at least 2 columns if (lhs.cardinality() > 2) { - for (Integer columnIndex : order.getOrderHighDistinctCount(lhs)) { + for (int columnIndex : order.getOrderHighDistinctCount(lhs)) { ColumnCollection subsetIndices = lhs.removeColumnCopy(columnIndex); uncheckedMaximalSubsets.add(subsetIndices); } @@ -63,7 +63,7 @@ public THashSet getMaximalSubsets(ColumnCollection lhs, Column } public ColumnCollection getUncheckedMinimalSuperset(ColumnCollection lhs, int rhsIndex) { - for (Integer columnIndex : lhs.setCopy(rhsIndex).complement().getSetBits()) { + for (int columnIndex : lhs.setCopy(rhsIndex).complement().getSetBits()) { ColumnCollection supersetIndices = lhs.setCopy(columnIndex); if (!this.containsKey(supersetIndices)) { return supersetIndices; @@ -75,7 +75,7 @@ public ColumnCollection getUncheckedMinimalSuperset(ColumnCollection lhs, int rh public THashSet getUncheckedOrCandidateMinimalSupersets(ColumnCollection lhs, int rhsIndex, ColumnOrder order) { THashSet uncheckedMinimalSupersets = new THashSet<>(); - for (Integer columnIndex : order.getOrderLowDistinctCount(lhs.setCopy(rhsIndex).complement())) { + for (int columnIndex : order.getOrderLowDistinctCount(lhs.setCopy(rhsIndex).complement())) { ColumnCollection supersetIndices = lhs.setCopy(columnIndex); if (!this.containsKey(supersetIndices) || this.get(supersetIndices) == Observation.CANDIDATE_MAXIMAL_NON_DEPENDENCY) { uncheckedMinimalSupersets.add(supersetIndices); @@ -87,7 +87,7 @@ public THashSet getUncheckedOrCandidateMinimalSupersets(Column public THashSet getUncheckedMinimalSupersets(ColumnCollection lhs, int rhsIndex, ColumnOrder order) { THashSet uncheckedMinimalSupersets = new THashSet<>(); - for (Integer columnIndex : order.getOrderLowDistinctCount(lhs.setCopy(rhsIndex).complement())) { + for (int columnIndex : order.getOrderLowDistinctCount(lhs.setCopy(rhsIndex).complement())) { ColumnCollection supersetIndices = lhs.setCopy(columnIndex); if (!this.containsKey(supersetIndices)) { uncheckedMinimalSupersets.add(supersetIndices); @@ -99,7 +99,7 @@ public THashSet getUncheckedMinimalSupersets(ColumnCollection public THashSet getMinimalSupersets(ColumnCollection lhs, int rhsIndex, ColumnOrder order) { THashSet uncheckedMinimalSupersets = new THashSet<>(); - for (Integer columnIndex : order.getOrderLowDistinctCount(lhs.setCopy(rhsIndex).complement())) { + for (int columnIndex : order.getOrderLowDistinctCount(lhs.setCopy(rhsIndex).complement())) { ColumnCollection supersetIndices = lhs.setCopy(columnIndex); uncheckedMinimalSupersets.add(supersetIndices); } @@ -109,7 +109,7 @@ public THashSet getMinimalSupersets(ColumnCollection lhs, int public Observation updateDependencyType(ColumnCollection lhs) { if (lhs.cardinality() > 1) { boolean foundUncheckedSubset = false; - for (Integer columnIndex : lhs.getSetBits()) { + for (int columnIndex : lhs.getSetBits()) { Observation observationOfSubset = this.get(lhs.removeColumnCopy(columnIndex)); if (observationOfSubset == null) { foundUncheckedSubset = true; @@ -126,7 +126,7 @@ public Observation updateDependencyType(ColumnCollection lhs) { public Observation updateNonDependencyType(ColumnCollection lhs, int rhsIndex) { boolean foundUncheckedSuperset = false; - for (Integer columnIndex : lhs.setCopy(rhsIndex).complementCopy().getSetBits()) { + for (int columnIndex : lhs.setCopy(rhsIndex).complementCopy().getSetBits()) { Observation observationOfSuperset = this.get(lhs.setCopy(columnIndex)); if (observationOfSuperset == null) { foundUncheckedSuperset = true; diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java index 26eaf79..0031c37 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java @@ -1,71 +1,71 @@ -package fdiscovery.pruning; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; - -import fdiscovery.columns.ColumnCollection; - -public class PruneHashSet extends HashMap> implements PruneInterface { - - private static final long serialVersionUID = 8012444410589325434L; - - public PruneHashSet(int numberOfColumns) { - super(numberOfColumns); - ColumnCollection key = new ColumnCollection(numberOfColumns); - for (int columnIndex = 0; columnIndex < numberOfColumns; columnIndex++) { - this.put(key.setCopy(columnIndex), new HashSet()); - } - } - - public static ColumnCollection getNotPrunedKey(Dependencies dependencies, NonDependencies nonDependencies, ArrayList candidates) { - for (ColumnCollection candidate : candidates) { - if (!dependencies.isRepresented(candidate) && !nonDependencies.isRepresented(candidate)) { - return candidate; - } else { - } - } - return null; - } - - @Override - public void rebalance() { - boolean rebalancedGroup = false; - - do { - rebalancedGroup = false; - ArrayList groupKeys = new ArrayList<>(this.keySet()); - for (ColumnCollection key : groupKeys) { - if (this.get(key).size() > SPLIT_THRESHOLD) { - rebalanceGroup(key); - rebalancedGroup = true; - } - } - } while (rebalancedGroup); - } - - @Override - public void rebalanceGroup(ColumnCollection groupKey) { - HashSet depsOfGroup = this.get(groupKey); - for (Integer columnIndex : groupKey.complementCopy().getSetBits()) { - ColumnCollection newKey = groupKey.setCopy(columnIndex); - HashSet newGroup = new HashSet(); - this.put(newKey, newGroup); - - for (ColumnCollection depOfGroup : depsOfGroup) { - // when splitting a group it cannot contain the key itself - // because otherwise the group cannot contain any other - // element since it would be a superset of the key and be pruned - // OR - // when splitting a group it cannot contain the key itself - // because otherwise all supersets of the key would have - // been pruned and it wouldn't need to be split - if (newKey.isSubsetOf(depOfGroup)) { - newGroup.add(depOfGroup); - } - } - } - // remove the old group - this.remove(groupKey); - } -} +package fdiscovery.pruning; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; + +import fdiscovery.columns.ColumnCollection; + +public class PruneHashSet extends HashMap> implements PruneInterface { + + private static final long serialVersionUID = 8012444410589325434L; + + public PruneHashSet(int numberOfColumns) { + super(numberOfColumns); + ColumnCollection key = new ColumnCollection(numberOfColumns); + for (int columnIndex = 0; columnIndex < numberOfColumns; columnIndex++) { + this.put(key.setCopy(columnIndex), new HashSet()); + } + } + + public static ColumnCollection getNotPrunedKey(Dependencies dependencies, NonDependencies nonDependencies, ArrayList candidates) { + for (ColumnCollection candidate : candidates) { + if (!dependencies.isRepresented(candidate) && !nonDependencies.isRepresented(candidate)) { + return candidate; + } else { + } + } + return null; + } + + @Override + public void rebalance() { + boolean rebalancedGroup = false; + + do { + rebalancedGroup = false; + ArrayList groupKeys = new ArrayList<>(this.keySet()); + for (ColumnCollection key : groupKeys) { + if (this.get(key).size() > SPLIT_THRESHOLD) { + rebalanceGroup(key); + rebalancedGroup = true; + } + } + } while (rebalancedGroup); + } + + @Override + public void rebalanceGroup(ColumnCollection groupKey) { + HashSet depsOfGroup = this.get(groupKey); + for (int columnIndex : groupKey.complementCopy().getSetBits()) { + ColumnCollection newKey = groupKey.setCopy(columnIndex); + HashSet newGroup = new HashSet(); + this.put(newKey, newGroup); + + for (ColumnCollection depOfGroup : depsOfGroup) { + // when splitting a group it cannot contain the key itself + // because otherwise the group cannot contain any other + // element since it would be a superset of the key and be pruned + // OR + // when splitting a group it cannot contain the key itself + // because otherwise all supersets of the key would have + // been pruned and it wouldn't need to be split + if (newKey.isSubsetOf(depOfGroup)) { + newGroup.add(depOfGroup); + } + } + } + // remove the old group + this.remove(groupKey); + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java index 987c5c4..565048a 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java @@ -1,37 +1,37 @@ -package fdiscovery.pruning; - -import java.util.ArrayList; -import java.util.HashMap; - -import fdiscovery.columns.ColumnCollection; - -// from rhs to lhs -public abstract class PruneTable extends HashMap>> { - - private static final long serialVersionUID = 4470955427882698208L; - - public int getCount(ColumnCollection RHS) { - int count = 0; - if (this.containsKey(RHS)) { - for (ArrayList collection : this.get(RHS).values()) { - count += collection.size(); - } - } - return count; - } - - - public void addValue(ColumnCollection RHS, ColumnCollection LHS) { - if (!this.containsKey(RHS)) { - this.put(RHS, new HashMap>()); - } - if (!this.get(RHS).containsKey(LHS.cardinality())) { - this.get(RHS).put(LHS.cardinality(), new ArrayList()); - } -// System.out.println(this.get(RHS)); -// System.out.println(String.format("Column:\t%s\t%d", LHS, LHS.cardinality())); - ArrayList dependencies = this.get(RHS).get(LHS.cardinality()); -// System.out.println(dependencies); - dependencies.add(LHS); - } -} +package fdiscovery.pruning; + +import java.util.ArrayList; +import java.util.HashMap; + +import fdiscovery.columns.ColumnCollection; + +// from rhs to lhs +public abstract class PruneTable extends HashMap>> { + + private static final long serialVersionUID = 4470955427882698208L; + + public int getCount(ColumnCollection RHS) { + int count = 0; + if (this.containsKey(RHS)) { + for (ArrayList collection : this.get(RHS).values()) { + count += collection.size(); + } + } + return count; + } + + + public void addValue(ColumnCollection RHS, ColumnCollection LHS) { + if (!this.containsKey(RHS)) { + this.put(RHS, new HashMap>()); + } + if (!this.get(RHS).containsKey(Integer.valueOf(LHS.cardinality()))) { + this.get(RHS).put(Integer.valueOf(LHS.cardinality()), new ArrayList()); + } +// System.out.println(this.get(RHS)); +// System.out.println(String.format("Column:\t%s\t%d", LHS, LHS.cardinality())); + ArrayList dependencies = this.get(RHS).get(Integer.valueOf(LHS.cardinality())); +// System.out.println(dependencies); + dependencies.add(LHS); + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java b/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java index fd8cc1f..b8ff35a 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java @@ -1,431 +1,431 @@ -package fdiscovery.tane.runner; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; - -import org.apache.commons.cli.CommandLine; - -import fdiscovery.columns.ColumnCollection; - -import com.rits.cloning.Cloner; - -import fdiscovery.equivalence.EquivalenceGroupTIntHashSet; -import fdiscovery.equivalence.TEquivalence; -import fdiscovery.partitions.StrippedPartition; -import fdiscovery.partitions.StrippedPartitions; -import fdiscovery.preprocessing.SVFileProcessor; -import fdiscovery.tane.AprioriGeneration; -import fdiscovery.general.CLIParserMiner; -import fdiscovery.general.CollectionSet; -import fdiscovery.general.ColumnFiles; -import fdiscovery.general.FunctionalDependencies; -import fdiscovery.general.Miner; -import gnu.trove.iterator.TIntIterator; -import gnu.trove.map.hash.THashMap; - -public class Tane extends Miner { - - private int numberOfColumns; - private int numberOfRows; - private int[] T, Te; - private FunctionalDependencies minimalDependencies; - private StrippedPartitions strippedPartitions; - private HashMap cPlus; - private ArrayList> levels; - private ColumnCollection rSet; - - public FunctionalDependencies getDependencies() { - return this.minimalDependencies; - } - - public static void main2(String[] args) { - createColumDirectory(); - createResultDirectory(); - - File source = new File(Miner.input); - SVFileProcessor inputFileProcessor = null; - try { - long timeStart = System.currentTimeMillis(); - - inputFileProcessor = new SVFileProcessor(source); - inputFileProcessor.init(); - System.out.println("TANE"); - System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter()); - System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns()); - System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows()); - inputFileProcessor.createColumnFiles(); - Tane taneRunner = new Tane(inputFileProcessor); - taneRunner.run(); - - System.out.println(String.format("Number of dependencies:\t%d", taneRunner.minimalDependencies.getCount()));; - long timeFindFDs = System.currentTimeMillis(); - System.out.println("Total time:\t" + (timeFindFDs - timeStart)/1000 + "s"); - System.out.println(taneRunner.getDependencies()); - - } catch (FileNotFoundException e) { - System.out.println("The input file could not be found."); - } catch (IOException e) { - System.out.println("The input reader could not be reset."); - } - - } - - public static void main(String[] args) { - CLIParserMiner parser = new CLIParserMiner(); - CommandLine cli = parser.parse(args); - String inputFilename = new String(); - String columnFileDirectory = new String(); - String resultFile = new String(); - int numberOfColumns = 0; - int numberOfRows = 0; - - if (cli.hasOption("file")) { - inputFilename = cli.getOptionValue("file"); - } - if (cli.hasOption("input")) { - columnFileDirectory = cli.getOptionValue("input"); - } - if (cli.hasOption("result")) { - resultFile = cli.getOptionValue("result"); - } - if (cli.hasOption("columns")) { - numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")); - } - if (cli.hasOption("rows")) { - numberOfRows = Integer.valueOf(cli.getOptionValue("rows")); - } - ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows); - long timeStart = System.currentTimeMillis(); - try { - Tane runner = new Tane(columnFiles, numberOfRows); - runner.run(); - long timeEnd = System.currentTimeMillis(); - runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename); - } catch(OutOfMemoryError e) { - System.exit(Miner.STATUS_OOM); - } - System.exit(0); - } - - private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { - String timeString = (time != -1)? String.format("%.1f", (double)(time)/1000) : "-1"; - - StringBuilder outputBuilder = new StringBuilder(); - if (!inputFileName.isEmpty()) { - outputBuilder.append(String.format("%s\t", inputFileName)); - } - outputBuilder.append(String.format("%d\t", this.numberOfRows)); - outputBuilder.append(String.format("%d\t", this.numberOfColumns)); - outputBuilder.append(String.format("%s\t", timeString)); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCount())); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(2))); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(3))); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(4))); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(5))); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeLesserThan(6))); - outputBuilder.append(String.format("%d\t", this.minimalDependencies.getCountForSizeGreaterThan(5))); - outputBuilder.append(String.format("%d\t", this.strippedPartitions.size())); - outputBuilder.append(String.format("%d\t", this.strippedPartitions.size())); - outputBuilder.append(String.format("%d\n", Runtime.getRuntime().totalMemory())); - outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false))); - - try { - BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true)); - resultFileWriter.write(outputBuilder.toString()); - System.out.print(outputBuilder.toString()); - resultFileWriter.close(); - } catch (IOException e) { - System.out.println("Couldn't write output."); - } - } - - public Tane(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError { - this.numberOfColumns = columnFiles.getNumberOfColumns(); - this.numberOfRows = numberOfRows; - this.minimalDependencies = new FunctionalDependencies(); - this.strippedPartitions = new StrippedPartitions(columnFiles); - columnFiles.clear(); - } - - - public Tane(SVFileProcessor table) throws OutOfMemoryError { - this.numberOfColumns = table.getNumberOfColumns(); - this.numberOfRows = table.getNumberOfRows(); - this.minimalDependencies = new FunctionalDependencies(); - this.strippedPartitions = new StrippedPartitions(table.getColumnFiles()); - } - - public THashMap run() throws OutOfMemoryError { - - levels = new ArrayList<>(); - cPlus = new HashMap<>(); - - // Level 0 is the empty set - levels.add(new CollectionSet()); - // Level 1 initialization - levels.add(new CollectionSet()); - - ColumnCollection emptyLHSSet = new ColumnCollection(this.numberOfColumns); - rSet = new ColumnCollection(this.numberOfColumns); - - cPlus.put(emptyLHSSet, rSet); - - this.T = new int[this.numberOfRows + 1]; - this.Te = new int[this.numberOfRows + 1]; - // initialize T to all -1, because it is specified to be all "NULL" - // (!=0) in TANE - for (int i = 0; i < T.length; i++) { - T[i] = -1; - } - - // Initialization - for (int i = 0; i < this.numberOfColumns; i++) { - // set all bits in R - rSet.set(i); - // build atomic attribute-sets - ColumnCollection subset = new ColumnCollection(this.numberOfColumns); - subset.set(i); - // add to first level - levels.get(1).add(subset); - } - - // main algorithm - int level = 1; - while (!levels.get(level).isEmpty()) { -// System.out.println("Level:\t" + level); - this.computeDependencies(levels.get(level)); - this.prune(levels.get(level)); - levels.add(this.generateNextLevel(levels.get(level))); - levels.get(level).clear(); - level++; - } - return minimalDependencies; - } - - private CollectionSet generateNextLevel(CollectionSet currentLevel) { - CollectionSet nextLevel = new CollectionSet<>(); - - Cloner cloner = new Cloner(); - AprioriGeneration prefixBlockGenerator = new AprioriGeneration<>(cloner.deepClone(currentLevel)); - for (CollectionSet k : prefixBlockGenerator.prefixBlocks()) { - for (ColumnCollection y : k) { - for (ColumnCollection z : k.tailSet(y)) { - ColumnCollection x = y.orCopy(z); - boolean xInNextLevel = true; - for (Integer a : x.getSetBits()) { - x.clear(a); - if (!currentLevel.contains(x)) { - xInNextLevel = false; - break; - } - x.set(a); - } - if (xInNextLevel) { - nextLevel.add(x); - strippedPartitions.put(x, strippedProduct(strippedPartitions.get(y), strippedPartitions.get(z))); - } - } - } - } - - return nextLevel; - } - - private void computeDependencies(CollectionSet currentLevel) { - for (ColumnCollection x : currentLevel) { - addCPlusOfX(x); - } - - for (ColumnCollection x : currentLevel) { - for (Integer a : x.andCopy(cPlus.get(x)).getSetBits()) { - boolean isDependency = isValidDependency(x.clearCopy(a), a); - - if (isDependency) { - minimalDependencies.addRHSColumn(x.clearCopy(a), a); - cPlus.get(x).clear(a); - - for (Integer B : rSet.removeCopy(x).getSetBits()) { - cPlus.get(x).clear(B); - } - } - } - - } - } - - private ColumnCollection addCPlusOfX(ColumnCollection x) { - ColumnCollection cPlusOfX = cPlus.get(x.clearCopy(x.nextSetBit(0))); - - // if cPlusOfX was not in the list it has to be computed recursively - if (cPlusOfX == null) { - cPlusOfX = (ColumnCollection) addCPlusOfX(x.clearCopy(x.nextSetBit(0))).clone(); - } else { - cPlusOfX = (ColumnCollection) cPlusOfX.clone(); - } - for (Integer a : x.getSetBits()) { - ColumnCollection nextCPlusOfX = cPlus.get(x.clearCopy(a)); - - if (nextCPlusOfX == null) { - nextCPlusOfX = (ColumnCollection) addCPlusOfX(x.clearCopy(a)).clone(); - } else { - nextCPlusOfX = (ColumnCollection) nextCPlusOfX.clone(); - } - - cPlusOfX.and(nextCPlusOfX); - } - cPlus.put(x, cPlusOfX); - - return cPlusOfX; - } - - private void prune(CollectionSet currentLevel) { - Iterator currentLevelIterator = currentLevel.iterator(); - - while (currentLevelIterator.hasNext()) { - ColumnCollection x = currentLevelIterator.next(); - - ColumnCollection cPlusOfX = cPlus.get(x); - if (cPlusOfX == null) { - cPlusOfX = addCPlusOfX(x); - } - - if (cPlusOfX.isEmpty()) { - currentLevelIterator.remove(); - continue; - } - - boolean isSuperKey = isSuperKey(x); - if (isSuperKey) { - for (Integer a : cPlus.get(x).removeCopy(x).getSetBits()) { - ColumnCollection firstCPlusCandidatesKey = x.setCopy(a).clearCopy(x.nextSetBit(0)); - ColumnCollection firstCPlusCandidates = cPlus.get(firstCPlusCandidatesKey); - if (firstCPlusCandidates == null) { - firstCPlusCandidates = (ColumnCollection) addCPlusOfX(firstCPlusCandidatesKey).clone(); - } else { - firstCPlusCandidates = (ColumnCollection) firstCPlusCandidates.clone(); - } - for (Integer b : x.getSetBits()) { - - ColumnCollection nextCPlusCandidates = cPlus.get(x.setCopy(a).clearCopy(b)); - if (nextCPlusCandidates == null) { - nextCPlusCandidates = (ColumnCollection) addCPlusOfX(x.setCopy(a).clearCopy(b)).clone(); - } else { - nextCPlusCandidates = (ColumnCollection) nextCPlusCandidates.clone(); - } - - firstCPlusCandidates.and(nextCPlusCandidates); - } - if (firstCPlusCandidates.get(a)) { - minimalDependencies.addRHSColumn(x, a); - } - } - currentLevelIterator.remove(); - } - } - } - - protected boolean isSuperKey(ColumnCollection LHS) { - StrippedPartition partitionOfX = strippedPartitions.get(LHS); - - int sumOfSizesOfEquivalenceClasses = 0; - int numberOfEquivalenceClasses = 0; - - for (TEquivalence equivalenceGroup : partitionOfX) { - sumOfSizesOfEquivalenceClasses += equivalenceGroup.size(); - numberOfEquivalenceClasses++; - } - - // equation (1) in the paper - boolean result = (((sumOfSizesOfEquivalenceClasses - numberOfEquivalenceClasses) / (double) this.numberOfColumns) == 0); - - return result; - } - - private double error(StrippedPartition xPartition, StrippedPartition xUnionAPartition) { - int e = 0; - - for (TEquivalence equivalenceGroup : xUnionAPartition) { - Te[equivalenceGroup.getIdentifier()] = equivalenceGroup.size(); - } - for (TEquivalence equivalenceGroup : xPartition) { - int m = 1; - - for (TIntIterator tIt=equivalenceGroup.iterator(); tIt.hasNext(); ) { -// for (Integer t : equivalenceGroup) { - m = Math.max(m, Te[tIt.next()]); - } - e = e + equivalenceGroup.size() - m; - - } - for (TEquivalence equivalenceGroup : xUnionAPartition) { - Te[equivalenceGroup.getIdentifier()] = 0; - } - - return (double)e / this.numberOfRows; - } - - - private boolean isValidDependency(ColumnCollection LHS, Integer RHS) { - if (LHS.isEmpty()) { - return false; - } - - return (this.error(strippedPartitions.get(LHS), strippedPartitions.get(LHS.setCopy(RHS))) == 0); - } - - public StrippedPartition strippedProduct(StrippedPartition yPartition, StrippedPartition zPartition) { - StrippedPartition xPartition = new StrippedPartition(); - HashMap S = new HashMap<>(); - - if (yPartition.size() > zPartition.size()) { - StrippedPartition swap = zPartition; - zPartition = yPartition; - yPartition = swap; - } - - // build some kind of probe table - int i = 1; - for (TEquivalence cI : yPartition) { - for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { - int tValue = tIt.next(); - T[tValue] = i; - - } - S.put(i, new EquivalenceGroupTIntHashSet()); - i++; - } - - for (TEquivalence cI : zPartition) { - for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { - int tValue = tIt.next(); - if (T[tValue] != -1) { - TEquivalence sOld = S.get(T[tValue]); - sOld.add(tValue); - } - } - for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { - int tValue = tIt.next(); - TEquivalence s = S.get(T[tValue]); - if (s != null && s.size() > 1) { - xPartition.add(s); - } - S.put(T[tValue], new EquivalenceGroupTIntHashSet()); - } - } - i = 1; - for (TEquivalence cI : yPartition) { - for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { - int tValue = tIt.next(); - T[tValue] = -1; - } - } - - return xPartition; - } -} +package fdiscovery.tane.runner; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; + +import org.apache.commons.cli.CommandLine; + +import fdiscovery.columns.ColumnCollection; + +import com.rits.cloning.Cloner; + +import fdiscovery.equivalence.EquivalenceGroupTIntHashSet; +import fdiscovery.equivalence.TEquivalence; +import fdiscovery.partitions.StrippedPartition; +import fdiscovery.partitions.StrippedPartitions; +import fdiscovery.preprocessing.SVFileProcessor; +import fdiscovery.tane.AprioriGeneration; +import fdiscovery.general.CLIParserMiner; +import fdiscovery.general.CollectionSet; +import fdiscovery.general.ColumnFiles; +import fdiscovery.general.FunctionalDependencies; +import fdiscovery.general.Miner; +import gnu.trove.iterator.TIntIterator; +import gnu.trove.map.hash.THashMap; + +public class Tane extends Miner { + + private int numberOfColumns; + private int numberOfRows; + private int[] T, Te; + private FunctionalDependencies minimalDependencies; + private StrippedPartitions strippedPartitions; + private HashMap cPlus; + private ArrayList> levels; + private ColumnCollection rSet; + + public FunctionalDependencies getDependencies() { + return this.minimalDependencies; + } + + public static void main2(String[] args) { + createColumDirectory(); + createResultDirectory(); + + File source = new File(Miner.input); + SVFileProcessor inputFileProcessor = null; + try { + long timeStart = System.currentTimeMillis(); + + inputFileProcessor = new SVFileProcessor(source); + inputFileProcessor.init(); + System.out.println("TANE"); + System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter()); + System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns()); + System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows()); + inputFileProcessor.createColumnFiles(); + Tane taneRunner = new Tane(inputFileProcessor); + taneRunner.run(); + + System.out.println(String.format("Number of dependencies:\t%d", Integer.valueOf(taneRunner.minimalDependencies.getCount())));; + long timeFindFDs = System.currentTimeMillis(); + System.out.println("Total time:\t" + (timeFindFDs - timeStart)/1000 + "s"); + System.out.println(taneRunner.getDependencies()); + + } catch (FileNotFoundException e) { + System.out.println("The input file could not be found."); + } catch (IOException e) { + System.out.println("The input reader could not be reset."); + } + + } + + public static void main(String[] args) { + CLIParserMiner parser = new CLIParserMiner(); + CommandLine cli = parser.parse(args); + String inputFilename = new String(); + String columnFileDirectory = new String(); + String resultFile = new String(); + int numberOfColumns = 0; + int numberOfRows = 0; + + if (cli.hasOption("file")) { + inputFilename = cli.getOptionValue("file"); + } + if (cli.hasOption("input")) { + columnFileDirectory = cli.getOptionValue("input"); + } + if (cli.hasOption("result")) { + resultFile = cli.getOptionValue("result"); + } + if (cli.hasOption("columns")) { + numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue(); + } + if (cli.hasOption("rows")) { + numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue(); + } + ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows); + long timeStart = System.currentTimeMillis(); + try { + Tane runner = new Tane(columnFiles, numberOfRows); + runner.run(); + long timeEnd = System.currentTimeMillis(); + runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename); + } catch(OutOfMemoryError e) { + System.exit(Miner.STATUS_OOM); + } + System.exit(0); + } + + private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { + String timeString = (time != -1)? String.format("%.1f", (double)(time)/1000) : "-1"; + + StringBuilder outputBuilder = new StringBuilder(); + if (!inputFileName.isEmpty()) { + outputBuilder.append(String.format("%s\t", inputFileName)); + } + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns))); + outputBuilder.append(String.format("%s\t", timeString)); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount()))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.strippedPartitions.size()))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.strippedPartitions.size()))); + outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory()))); + outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false))); + + try { + BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true)); + resultFileWriter.write(outputBuilder.toString()); + System.out.print(outputBuilder.toString()); + resultFileWriter.close(); + } catch (IOException e) { + System.out.println("Couldn't write output."); + } + } + + public Tane(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError { + this.numberOfColumns = columnFiles.getNumberOfColumns(); + this.numberOfRows = numberOfRows; + this.minimalDependencies = new FunctionalDependencies(); + this.strippedPartitions = new StrippedPartitions(columnFiles); + columnFiles.clear(); + } + + + public Tane(SVFileProcessor table) throws OutOfMemoryError { + this.numberOfColumns = table.getNumberOfColumns(); + this.numberOfRows = table.getNumberOfRows(); + this.minimalDependencies = new FunctionalDependencies(); + this.strippedPartitions = new StrippedPartitions(table.getColumnFiles()); + } + + public THashMap run() throws OutOfMemoryError { + + levels = new ArrayList<>(); + cPlus = new HashMap<>(); + + // Level 0 is the empty set + levels.add(new CollectionSet()); + // Level 1 initialization + levels.add(new CollectionSet()); + + ColumnCollection emptyLHSSet = new ColumnCollection(this.numberOfColumns); + rSet = new ColumnCollection(this.numberOfColumns); + + cPlus.put(emptyLHSSet, rSet); + + this.T = new int[this.numberOfRows + 1]; + this.Te = new int[this.numberOfRows + 1]; + // initialize T to all -1, because it is specified to be all "NULL" + // (!=0) in TANE + for (int i = 0; i < T.length; i++) { + T[i] = -1; + } + + // Initialization + for (int i = 0; i < this.numberOfColumns; i++) { + // set all bits in R + rSet.set(i); + // build atomic attribute-sets + ColumnCollection subset = new ColumnCollection(this.numberOfColumns); + subset.set(i); + // add to first level + levels.get(1).add(subset); + } + + // main algorithm + int level = 1; + while (!levels.get(level).isEmpty()) { +// System.out.println("Level:\t" + level); + this.computeDependencies(levels.get(level)); + this.prune(levels.get(level)); + levels.add(this.generateNextLevel(levels.get(level))); + levels.get(level).clear(); + level++; + } + return minimalDependencies; + } + + private CollectionSet generateNextLevel(CollectionSet currentLevel) { + CollectionSet nextLevel = new CollectionSet<>(); + + Cloner cloner = new Cloner(); + AprioriGeneration prefixBlockGenerator = new AprioriGeneration<>(cloner.deepClone(currentLevel)); + for (CollectionSet k : prefixBlockGenerator.prefixBlocks()) { + for (ColumnCollection y : k) { + for (ColumnCollection z : k.tailSet(y)) { + ColumnCollection x = y.orCopy(z); + boolean xInNextLevel = true; + for (int a : x.getSetBits()) { + x.clear(a); + if (!currentLevel.contains(x)) { + xInNextLevel = false; + break; + } + x.set(a); + } + if (xInNextLevel) { + nextLevel.add(x); + strippedPartitions.put(x, strippedProduct(strippedPartitions.get(y), strippedPartitions.get(z))); + } + } + } + } + + return nextLevel; + } + + private void computeDependencies(CollectionSet currentLevel) { + for (ColumnCollection x : currentLevel) { + addCPlusOfX(x); + } + + for (ColumnCollection x : currentLevel) { + for (int a : x.andCopy(cPlus.get(x)).getSetBits()) { + boolean isDependency = isValidDependency(x.clearCopy(a), Integer.valueOf(a)); + + if (isDependency) { + minimalDependencies.addRHSColumn(x.clearCopy(a), a); + cPlus.get(x).clear(a); + + for (int B : rSet.removeCopy(x).getSetBits()) { + cPlus.get(x).clear(B); + } + } + } + + } + } + + private ColumnCollection addCPlusOfX(ColumnCollection x) { + ColumnCollection cPlusOfX = cPlus.get(x.clearCopy(x.nextSetBit(0))); + + // if cPlusOfX was not in the list it has to be computed recursively + if (cPlusOfX == null) { + cPlusOfX = (ColumnCollection) addCPlusOfX(x.clearCopy(x.nextSetBit(0))).clone(); + } else { + cPlusOfX = (ColumnCollection) cPlusOfX.clone(); + } + for (int a : x.getSetBits()) { + ColumnCollection nextCPlusOfX = cPlus.get(x.clearCopy(a)); + + if (nextCPlusOfX == null) { + nextCPlusOfX = (ColumnCollection) addCPlusOfX(x.clearCopy(a)).clone(); + } else { + nextCPlusOfX = (ColumnCollection) nextCPlusOfX.clone(); + } + + cPlusOfX.and(nextCPlusOfX); + } + cPlus.put(x, cPlusOfX); + + return cPlusOfX; + } + + private void prune(CollectionSet currentLevel) { + Iterator currentLevelIterator = currentLevel.iterator(); + + while (currentLevelIterator.hasNext()) { + ColumnCollection x = currentLevelIterator.next(); + + ColumnCollection cPlusOfX = cPlus.get(x); + if (cPlusOfX == null) { + cPlusOfX = addCPlusOfX(x); + } + + if (cPlusOfX.isEmpty()) { + currentLevelIterator.remove(); + continue; + } + + boolean isSuperKey = isSuperKey(x); + if (isSuperKey) { + for (int a : cPlus.get(x).removeCopy(x).getSetBits()) { + ColumnCollection firstCPlusCandidatesKey = x.setCopy(a).clearCopy(x.nextSetBit(0)); + ColumnCollection firstCPlusCandidates = cPlus.get(firstCPlusCandidatesKey); + if (firstCPlusCandidates == null) { + firstCPlusCandidates = (ColumnCollection) addCPlusOfX(firstCPlusCandidatesKey).clone(); + } else { + firstCPlusCandidates = (ColumnCollection) firstCPlusCandidates.clone(); + } + for (int b : x.getSetBits()) { + + ColumnCollection nextCPlusCandidates = cPlus.get(x.setCopy(a).clearCopy(b)); + if (nextCPlusCandidates == null) { + nextCPlusCandidates = (ColumnCollection) addCPlusOfX(x.setCopy(a).clearCopy(b)).clone(); + } else { + nextCPlusCandidates = (ColumnCollection) nextCPlusCandidates.clone(); + } + + firstCPlusCandidates.and(nextCPlusCandidates); + } + if (firstCPlusCandidates.get(a)) { + minimalDependencies.addRHSColumn(x, a); + } + } + currentLevelIterator.remove(); + } + } + } + + protected boolean isSuperKey(ColumnCollection LHS) { + StrippedPartition partitionOfX = strippedPartitions.get(LHS); + + int sumOfSizesOfEquivalenceClasses = 0; + int numberOfEquivalenceClasses = 0; + + for (TEquivalence equivalenceGroup : partitionOfX) { + sumOfSizesOfEquivalenceClasses += equivalenceGroup.size(); + numberOfEquivalenceClasses++; + } + + // equation (1) in the paper + boolean result = (((sumOfSizesOfEquivalenceClasses - numberOfEquivalenceClasses) / (double) this.numberOfColumns) == 0); + + return result; + } + + private double error(StrippedPartition xPartition, StrippedPartition xUnionAPartition) { + int e = 0; + + for (TEquivalence equivalenceGroup : xUnionAPartition) { + Te[equivalenceGroup.getIdentifier()] = equivalenceGroup.size(); + } + for (TEquivalence equivalenceGroup : xPartition) { + int m = 1; + + for (TIntIterator tIt=equivalenceGroup.iterator(); tIt.hasNext(); ) { +// for (Integer t : equivalenceGroup) { + m = Math.max(m, Te[tIt.next()]); + } + e = e + equivalenceGroup.size() - m; + + } + for (TEquivalence equivalenceGroup : xUnionAPartition) { + Te[equivalenceGroup.getIdentifier()] = 0; + } + + return (double)e / this.numberOfRows; + } + + + private boolean isValidDependency(ColumnCollection LHS, Integer RHS) { + if (LHS.isEmpty()) { + return false; + } + + return (this.error(strippedPartitions.get(LHS), strippedPartitions.get(LHS.setCopy(RHS.intValue()))) == 0); + } + + public StrippedPartition strippedProduct(StrippedPartition yPartition, StrippedPartition zPartition) { + StrippedPartition xPartition = new StrippedPartition(); + HashMap S = new HashMap<>(); + + if (yPartition.size() > zPartition.size()) { + StrippedPartition swap = zPartition; + zPartition = yPartition; + yPartition = swap; + } + + // build some kind of probe table + int i = 1; + for (TEquivalence cI : yPartition) { + for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { + int tValue = tIt.next(); + T[tValue] = i; + + } + S.put(Integer.valueOf(i), new EquivalenceGroupTIntHashSet()); + i++; + } + + for (TEquivalence cI : zPartition) { + for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { + int tValue = tIt.next(); + if (T[tValue] != -1) { + TEquivalence sOld = S.get(Integer.valueOf(T[tValue])); + sOld.add(tValue); + } + } + for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { + int tValue = tIt.next(); + TEquivalence s = S.get(Integer.valueOf(T[tValue])); + if (s != null && s.size() > 1) { + xPartition.add(s); + } + S.put(Integer.valueOf(T[tValue]), new EquivalenceGroupTIntHashSet()); + } + } + i = 1; + for (TEquivalence cI : yPartition) { + for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { + int tValue = tIt.next(); + T[tValue] = -1; + } + } + + return xPartition; + } +} diff --git a/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java b/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java index 37d364c..0fc176c 100644 --- a/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java +++ b/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java @@ -71,12 +71,12 @@ public void execute() throws AlgorithmExecutionException { dfdMiner.run(); FunctionalDependencies fds = dfdMiner.getDependencies(); for (ColumnCollection determining : fds.keySet()) { - for (Integer dependentColumn : fds.get(determining).getSetBits()) { + for (int dependentColumn : fds.get(determining).getSetBits()) { ColumnIdentifier[] determiningColumns = new ColumnIdentifier[determining.getSetBits().length]; int i = 0; - for (Integer determiningColumn : determining.getSetBits()) { + for (int determiningColumn : determining.getSetBits()) { determiningColumns[i] = new ColumnIdentifier(this.identifier, "Column " + determiningColumn); i++; From e8929d38024353af405791348ea110e8543ed20c Mon Sep 17 00:00:00 2001 From: Torben Eims Date: Mon, 15 May 2023 16:02:13 +0200 Subject: [PATCH 03/10] Double, Long, Character types changes --- .../equivalence/PartitionEquivalences.java | 2 +- .../fdiscovery/approach/runner/DFDMiner.java | 2 +- .../src/fdiscovery/columns/Seed.java | 130 +++++++++--------- .../src/fdiscovery/fastfds/CoverOrder.java | 2 +- .../fdiscovery/fastfds/runner/FastFDs.java | 2 +- .../src/fdiscovery/general/Benchmarker.java | 2 +- .../general/FunctionalDependencies.java | 2 +- .../src/fdiscovery/general/Miner.java | 2 +- .../partitions/PartitionStatistics.java | 2 +- .../src/fdiscovery/tane/runner/Tane.java | 2 +- 10 files changed, 74 insertions(+), 74 deletions(-) diff --git a/dfd/dfdAlgorithm/src/fdiscovery/approach/equivalence/PartitionEquivalences.java b/dfd/dfdAlgorithm/src/fdiscovery/approach/equivalence/PartitionEquivalences.java index 29336e5..0cf3007 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/approach/equivalence/PartitionEquivalences.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/approach/equivalence/PartitionEquivalences.java @@ -38,7 +38,7 @@ public void addPartition(EquivalenceManagedPartition partition) { if (!this.observedPartitions.contains(partition.getIndices()) && !this.containsSimilarPartition(partition)) { this.observedPartitions.add(partition.getIndices()); long hashNumber = partition.getHashNumber(); - System.out.println(String.format("Partition[%s]\t%d\tSize: %d", partition.getIndices(), hashNumber, partition.size())); + System.out.println(String.format("Partition[%s]\t%d\tSize: %d", partition.getIndices(), Long.valueOf(hashNumber), Integer.valueOf(partition.size()))); partitionHashes.putIfAbsent(hashNumber, new TIntObjectHashMap>()); partitionHashes.get(hashNumber).putIfAbsent(partition.size(), new THashSet()); THashSet partitionGroup = partitionHashes.get(hashNumber).get(partition.size()); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java b/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java index c666d1b..ef0026f 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java @@ -116,7 +116,7 @@ public static void main2(String[] args) { private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { - String timeString = (time != -1) ? String.format("%.1f", (double) (time) / 1000) : "-1"; + String timeString = (time != -1) ? String.format("%.1f", Double.valueOf((double) (time) / 1000)) : "-1"; StringBuilder outputBuilder = new StringBuilder(); if (!inputFileName.isEmpty()) { outputBuilder.append(String.format("%s\t", inputFileName)); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java index 9be0918..5c48534 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java @@ -1,65 +1,65 @@ -package fdiscovery.columns; - -import fdiscovery.partitions.FileBasedPartition; -import fdiscovery.partitions.Partition; - -public class Seed implements Comparable { - - private ColumnCollection indices; - private int additionalColumnIndex; - private double distinctiveness; - - public Seed(Partition a, FileBasedPartition b) { - this.indices = a.getIndices().orCopy(b.getIndices()); - this.additionalColumnIndex = b.getIndex(); - this.distinctiveness = Partition.estimateDistinctiveness(a, b); - } - - // inverse order - @Override - public int compareTo(Seed o) { - if (this.distinctiveness != o.distinctiveness) { - if (o.distinctiveness - this.distinctiveness < 0) { - return -1; - } else { - return 1; - } - } else { - return this.indices.compareTo(o.indices); - } - } - - @Override - public boolean equals(Object o) { - if (o == null) { - return false; - } - if (o == this) { - return true; - } - if (!(o instanceof Seed)) { - return false; - } else { - Seed otherSeed = (Seed) o; - return this.distinctiveness == otherSeed.distinctiveness && this.indices.compareTo(otherSeed.indices) == 0; - } - } - - public ColumnCollection getBaseIndices() { - return this.indices.removeColumnCopy(additionalColumnIndex); - } - - public ColumnCollection getIndices() { - return this.indices; - } - - public int getAdditionalColumnIndex() { - return this.additionalColumnIndex; - } - - public String toString() { - StringBuilder outputBuilder = new StringBuilder(); - outputBuilder.append(String.format("Seed: [%s]\t%f", this.indices, this.distinctiveness)); - return outputBuilder.toString(); - } -} +package fdiscovery.columns; + +import fdiscovery.partitions.FileBasedPartition; +import fdiscovery.partitions.Partition; + +public class Seed implements Comparable { + + private ColumnCollection indices; + private int additionalColumnIndex; + private double distinctiveness; + + public Seed(Partition a, FileBasedPartition b) { + this.indices = a.getIndices().orCopy(b.getIndices()); + this.additionalColumnIndex = b.getIndex(); + this.distinctiveness = Partition.estimateDistinctiveness(a, b); + } + + // inverse order + @Override + public int compareTo(Seed o) { + if (this.distinctiveness != o.distinctiveness) { + if (o.distinctiveness - this.distinctiveness < 0) { + return -1; + } else { + return 1; + } + } else { + return this.indices.compareTo(o.indices); + } + } + + @Override + public boolean equals(Object o) { + if (o == null) { + return false; + } + if (o == this) { + return true; + } + if (!(o instanceof Seed)) { + return false; + } else { + Seed otherSeed = (Seed) o; + return this.distinctiveness == otherSeed.distinctiveness && this.indices.compareTo(otherSeed.indices) == 0; + } + } + + public ColumnCollection getBaseIndices() { + return this.indices.removeColumnCopy(additionalColumnIndex); + } + + public ColumnCollection getIndices() { + return this.indices; + } + + public int getAdditionalColumnIndex() { + return this.additionalColumnIndex; + } + + public String toString() { + StringBuilder outputBuilder = new StringBuilder(); + outputBuilder.append(String.format("Seed: [%s]\t%f", this.indices, Double.valueOf(this.distinctiveness))); + return outputBuilder.toString(); + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/CoverOrder.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/CoverOrder.java index 5316fcd..ad00123 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/CoverOrder.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/CoverOrder.java @@ -43,7 +43,7 @@ public int compareTo(CoverOrder o) { public String toString() { StringBuilder outputBuilder = new StringBuilder(); - outputBuilder.append(String.format("[%s:%d]", (char)(this.columnIndex+65), this.appearances)); + outputBuilder.append(String.format("[%s:%d]", Character.valueOf((char)(this.columnIndex + 65)), Integer.valueOf(this.appearances))); return outputBuilder.toString(); } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java index 9876cab..5063dda 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java @@ -99,7 +99,7 @@ public static void main(String[] args) { } private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { - String timeString = (time != -1)? String.format("%.1f", (double)(time)/1000) : "-1"; + String timeString = (time != -1)? String.format("%.1f", Double.valueOf((double)(time) / 1000)) : "-1"; StringBuilder outputBuilder = new StringBuilder(); if (!inputFileName.isEmpty()) { diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java b/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java index 8152d42..afc934c 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java @@ -182,7 +182,7 @@ public static void main(String[] args) { resultHandler.waitFor(timeout); long timeEnd = System.currentTimeMillis(); - System.out.println(String.format("Time:%.1f", (double)(timeEnd - timeStart)/1000)); + System.out.println(String.format("Time:%.1f", Double.valueOf((double)(timeEnd - timeStart) / 1000))); int exitCode = 0; if (resultHandler.hasResult()) { diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java b/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java index 676454e..3333de7 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java @@ -42,7 +42,7 @@ public void minimize(int rhsIndex) { if (lhsForRhsToDelete.contains(lhs)) { ColumnCollection rhs = this.get(lhs); this.put(lhs, rhs.removeColumnCopy(rhsIndex)); - System.out.println(String.format("Remove %s->%s", lhs, (char)(rhsIndex + 65))); + System.out.println(String.format("Remove %s->%s", lhs, Character.valueOf((char) (rhsIndex + 65)))); } } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/Miner.java b/dfd/dfdAlgorithm/src/fdiscovery/general/Miner.java index c759691..1fd9214 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/general/Miner.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/general/Miner.java @@ -20,7 +20,7 @@ public static String humanReadableByteCount(long bytes, boolean si) { if (bytes < unit) return bytes + " B"; int exp = (int) (Math.log(bytes) / Math.log(unit)); String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp-1) + (si ? "" : "i"); - return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre); + return String.format("%.1f %sB", Double.valueOf(bytes / Math.pow(unit, exp)), pre); } protected static final void createColumDirectory() { diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java index a38e4f5..93bb615 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java @@ -25,7 +25,7 @@ public String getStatistics() { for (TLongObjectIterator>> statsByLevelIt = statsAndCountsByLevel.iterator(); statsByLevelIt.hasNext(); ) { statsByLevelIt.advance(); long levelCardinality = statsByLevelIt.key(); - statisticsBuilder.append(String.format("%d attributes {\n", levelCardinality)); + statisticsBuilder.append(String.format("%d attributes {\n", Long.valueOf(levelCardinality))); for (TIntObjectIterator> countByLevelIt = statsByLevelIt.value().iterator(); countByLevelIt.hasNext(); ) { countByLevelIt.advance(); int usageCount = countByLevelIt.key(); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java b/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java index b8ff35a..8557d52 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java @@ -114,7 +114,7 @@ public static void main(String[] args) { } private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { - String timeString = (time != -1)? String.format("%.1f", (double)(time)/1000) : "-1"; + String timeString = (time != -1)? String.format("%.1f", Double.valueOf((double)(time) / 1000)) : "-1"; StringBuilder outputBuilder = new StringBuilder(); if (!inputFileName.isEmpty()) { From 8b9499c03d0e365bb537812ff03d6921144a1399 Mon Sep 17 00:00:00 2001 From: Torben Eims Date: Mon, 15 May 2023 16:05:38 +0200 Subject: [PATCH 04/10] Nonfunctional refactoring Including removing else after return, removing empty else and removing empty lines --- .../src/fdiscovery/approach/runner/DFDMiner.java | 9 ++++----- dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java | 11 ++++------- .../equivalence/EquivalenceGroupHashSet.java | 3 +-- .../equivalence/EquivalenceGroupTIntHashSet.java | 5 ++--- .../src/fdiscovery/partitions/Partition.java | 6 ++---- dfd/dfdAlgorithm/src/fdiscovery/pruning/Holes.java | 3 +-- .../src/fdiscovery/pruning/PruneHashSet.java | 1 - .../src/fdiscovery/tane/AprioriGeneration.java | 7 +++---- dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java | 1 - 9 files changed, 17 insertions(+), 29 deletions(-) diff --git a/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java b/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java index ef0026f..62fedb3 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java @@ -356,12 +356,11 @@ private Observation checkDependencyAndStoreIt(Seed seed, int currentRHSIndex) { this.observations.put(seed.getIndices(), observationOfLHS); this.dependencies.add(seed.getIndices()); return observationOfLHS; - } else { - Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex); - this.observations.put(seed.getIndices(), observationOfLHS); - this.nonDependencies.add(seed.getIndices()); - return observationOfLHS; } + Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex); + this.observations.put(seed.getIndices(), observationOfLHS); + this.nonDependencies.add(seed.getIndices()); + return observationOfLHS; } private Stack nextSeeds(int currentRHSIndex) { diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java index 5c48534..9fee75a 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java @@ -21,12 +21,10 @@ public int compareTo(Seed o) { if (this.distinctiveness != o.distinctiveness) { if (o.distinctiveness - this.distinctiveness < 0) { return -1; - } else { - return 1; } - } else { - return this.indices.compareTo(o.indices); + return 1; } + return this.indices.compareTo(o.indices); } @Override @@ -39,10 +37,9 @@ public boolean equals(Object o) { } if (!(o instanceof Seed)) { return false; - } else { - Seed otherSeed = (Seed) o; - return this.distinctiveness == otherSeed.distinctiveness && this.indices.compareTo(otherSeed.indices) == 0; } + Seed otherSeed = (Seed) o; + return this.distinctiveness == otherSeed.distinctiveness && this.indices.compareTo(otherSeed.indices) == 0; } public ColumnCollection getBaseIndices() { diff --git a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java index bd5d28d..2cb0e39 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java @@ -21,9 +21,8 @@ public EquivalenceGroupHashSet(int identifier) { public int compareTo(EquivalenceGroupHashSet o) { if (this.size() != o.size()) { return this.size() - o.size(); - } else { - return this.identifier - o.identifier; } + return this.identifier - o.identifier; } @Override diff --git a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java index 2dc6cce..f89ce2d 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java @@ -42,8 +42,7 @@ public boolean add(int value) { public int compareTo(EquivalenceGroupTIntHashSet o) { if (this.size() != o.size()) { return this.size() - o.size(); - } else { - return this.identifier - o.identifier; - } + } + return this.identifier - o.identifier; } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java index d2ba11a..3282ba9 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java @@ -61,9 +61,8 @@ private void resetProbeTable() { public int compareTo(Partition o) { if (this.getDistinctiveness() == o.getDistinctiveness()) { return this.indices.compareTo(o.indices); - } else { - return Double.valueOf(this.getDistinctiveness()).compareTo(o.getDistinctiveness()); } + return Double.valueOf(this.getDistinctiveness()).compareTo(Double.valueOf(o.getDistinctiveness())); } public int getNumberOfRows() { @@ -135,9 +134,8 @@ public boolean equals(Partition other) { resetProbeTable(); if (numberOfValues == 0) { return true; - } else { - return false; } + return false; } public String printIndices() { diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/Holes.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/Holes.java index d17c648..3ed16c1 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/Holes.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/Holes.java @@ -17,9 +17,8 @@ public class Holes extends TreeSet { public int compare(ColumnCollection o1, ColumnCollection o2) { if (o1.isProperSupersetOf(o2)) { return 0; - } else { - return o1.compareTo(o2); } + return o1.compareTo(o2); } }; diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java index 0031c37..25f43d0 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java @@ -22,7 +22,6 @@ public static ColumnCollection getNotPrunedKey(Dependencies dependencies, NonDep for (ColumnCollection candidate : candidates) { if (!dependencies.isRepresented(candidate) && !nonDependencies.isRepresented(candidate)) { return candidate; - } else { } } return null; diff --git a/dfd/dfdAlgorithm/src/fdiscovery/tane/AprioriGeneration.java b/dfd/dfdAlgorithm/src/fdiscovery/tane/AprioriGeneration.java index 50e4c20..7d74b08 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/tane/AprioriGeneration.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/tane/AprioriGeneration.java @@ -100,10 +100,9 @@ public static boolean haveCommonPrefixBlock(ColumnCollection x, ColumnCollection // System.out.println("true"); // System.out.println("---------------------------"); return true; - } else { -// System.out.println("false"); -// System.out.println("---------------------------"); - return false; } +// System.out.println("false"); +// System.out.println("---------------------------"); + return false; } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java b/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java index 8557d52..7403038 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java @@ -73,7 +73,6 @@ public static void main2(String[] args) { } catch (IOException e) { System.out.println("The input reader could not be reset."); } - } public static void main(String[] args) { From 265c9c667c307ef2dcd7403d1f66c748415172da Mon Sep 17 00:00:00 2001 From: Torben Eims Date: Mon, 15 May 2023 16:06:14 +0200 Subject: [PATCH 05/10] Warning suppression --- dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java | 1 + .../src/fdiscovery/partitions/MemoryManagedJoinedPartitions.java | 1 + .../src/fdiscovery/partitions/StrippedPartition.java | 1 + dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java | 1 + 4 files changed, 4 insertions(+) diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java index 5063dda..26dd8c2 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java @@ -32,6 +32,7 @@ public class FastFDs extends Miner { private FunctionalDependencies minimalDependencies; private DifferenceSets differenceSets; + @SuppressWarnings("unused") public static void main2(String[] args) { createColumDirectory(); createResultDirectory(); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/MemoryManagedJoinedPartitions.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/MemoryManagedJoinedPartitions.java index cca75ee..4ce8f42 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/MemoryManagedJoinedPartitions.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/MemoryManagedJoinedPartitions.java @@ -63,6 +63,7 @@ public int getCount() { return cumulatedCount; } + @SuppressWarnings("unused") public Partition get(ColumnCollection key) { Partition result = this.get(key.cardinality()).get(key); if (USE_MEMORY_MANAGEMENT && result != null) { diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java index 096ed14..f812e92 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java @@ -19,6 +19,7 @@ public StrippedPartition() { } + @SuppressWarnings("unused") public StrippedPartition(StrippedPartition base, StrippedPartition additional) { } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java b/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java index 7403038..5740f8b 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java @@ -44,6 +44,7 @@ public FunctionalDependencies getDependencies() { return this.minimalDependencies; } + @SuppressWarnings("unused") public static void main2(String[] args) { createColumDirectory(); createResultDirectory(); From 95dcb31023f015c62e0fffa8f26d3e19fe790b34 Mon Sep 17 00:00:00 2001 From: Torben Eims Date: Mon, 15 May 2023 16:24:56 +0200 Subject: [PATCH 06/10] Refactor ColumnCollection to inherit from java.util.BitSet, fixing previous refactoring of unionCount() (semantic error) - numberOfColumns is now int (was long) - unionCount() used and to calculate the cardinality, this calculates the cardinality of the intersection => changed to or --- .../src/fdiscovery/columns/AgreeSet.java | 2 +- .../fdiscovery/columns/ColumnCollection.java | 62 ++++++++++--------- .../src/fdiscovery/columns/DifferenceSet.java | 34 +++++----- .../src/fdiscovery/columns/Path.java | 2 +- .../src/fdiscovery/fastfds/PartialOrder.java | 18 +++--- .../src/fdiscovery/pruning/PruneTable.java | 4 +- 6 files changed, 64 insertions(+), 58 deletions(-) diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/AgreeSet.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/AgreeSet.java index d26334d..bd141f8 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/AgreeSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/AgreeSet.java @@ -10,7 +10,7 @@ public class AgreeSet extends ColumnCollection { private static final long serialVersionUID = -5335032949377336772L; - public AgreeSet(Set set1, Set set2, long numberOfColumns) { + public AgreeSet(Set set1, Set set2, int numberOfColumns) { super(numberOfColumns); Set intersected = Sets.intersection(set1, set2); for (Point columnToIdentifier : intersected) { diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java index 7921f77..722cd8e 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java @@ -1,33 +1,32 @@ package fdiscovery.columns; -import org.apache.lucene.util.OpenBitSet; +import java.util.BitSet; -public class ColumnCollection extends OpenBitSet implements Comparable { +public class ColumnCollection extends BitSet implements Comparable { private static final long serialVersionUID = -5256272139963505719L; private int formatStringWidth; - protected long numberOfColumns; + protected int numberOfColumns; protected int[] setBits; - public ColumnCollection(long numberOfColumns ) { + public ColumnCollection(int numberOfColumns ) { this.numberOfColumns = numberOfColumns; this.formatStringWidth = (int)Math.ceil(Math.log10(this.numberOfColumns)); } public int[] getSetBits() { - int[] setBits = new int[(int) this.cardinality()]; + int[] setBits = new int[this.cardinality()]; - long bitIndex = 0; + int bitIndex = 0; int currentArrayIndex = 0; while (bitIndex < this.numberOfColumns) { - long currentNextSetBit = this.nextSetBit(bitIndex); + int currentNextSetBit = this.nextSetBit(bitIndex); if (currentNextSetBit != -1) { - setBits[currentArrayIndex++] = (int) currentNextSetBit; + setBits[currentArrayIndex++] = currentNextSetBit; bitIndex = currentNextSetBit + 1; } else { bitIndex = this.numberOfColumns; - } } @@ -38,7 +37,7 @@ public boolean isAtomic() { return this.cardinality() == 1; } - public ColumnCollection addColumn(long columnIndex) { + public ColumnCollection addColumn(int columnIndex) { ColumnCollection copy = (ColumnCollection) this.clone(); copy.set(columnIndex); @@ -75,7 +74,7 @@ public ColumnCollection andNotCopy(ColumnCollection other) { public ColumnCollection removeCopy(ColumnCollection other) { ColumnCollection copy = (ColumnCollection)this.clone(); - copy.remove(other); + copy.andNot(other); return copy; } @@ -114,11 +113,11 @@ public ColumnCollection complement() { } public boolean isSubsetOf(ColumnCollection other) { - return ColumnCollection.unionCount(this, other) == other.cardinality(); + return this.unionCount(other) == other.cardinality(); } public boolean isSupersetOf(ColumnCollection other) { - return ColumnCollection.unionCount(this, other) == this.cardinality(); + return this.unionCount(other) == this.cardinality(); } @@ -126,7 +125,7 @@ public boolean isProperSubsetOf(ColumnCollection other) { long cardinality = this.cardinality(); long otherCardinality = other.cardinality(); if (cardinality != otherCardinality) { - if (ColumnCollection.unionCount(this, other) == otherCardinality) { + if (this.unionCount(other) == otherCardinality) { return true; } } @@ -138,30 +137,35 @@ public boolean isProperSupersetOf(ColumnCollection other) { long cardinality = this.cardinality(); long otherCardinality = other.cardinality(); if (cardinality != otherCardinality) { - if (ColumnCollection.unionCount(this, other) == cardinality) { + if (this.unionCount(other) == cardinality) { return true; } } return false; } + public int unionCount(ColumnCollection other) { + ColumnCollection union = (ColumnCollection) this.clone(); + union.or(other); + return union.cardinality(); + } + public boolean isSubsetOrSupersetOf(ColumnCollection other) { return isSubsetOf(other) || isSupersetOf(other); } - public long getNumberOfColumns() { + public int getNumberOfColumns() { return this.numberOfColumns; } public long getMostRightBit() { - long bitIndex = 0; + int bitIndex = 0; while (bitIndex < this.numberOfColumns) { - long currentNextSetBit = this.nextSetBit(bitIndex); + int currentNextSetBit = this.nextSetBit(bitIndex); if (currentNextSetBit != -1) { bitIndex = currentNextSetBit + 1; } else { return bitIndex - 1; - } } return bitIndex; @@ -173,16 +177,9 @@ public ColumnCollection removeColumnCopy(int columnIndex) { return copy; } - - public ColumnCollection removeColumnCopy(long columnIndex) { - ColumnCollection copy = (ColumnCollection) this.clone(); - copy.clear(columnIndex); - - return copy; - } - + @Override - public int compareTo(OpenBitSet other) { + public int compareTo(BitSet other) { ColumnCollection copy = (ColumnCollection) this.clone(); copy.xor(other); int lowestBit = copy.nextSetBit(0); @@ -209,4 +206,13 @@ public String toString() { return outputBuilder.toString(); } + public void remove(ColumnCollection other) { + this.andNot(other); + } + + public static int intersectionCount(ColumnCollection set1, ColumnCollection set2) { + ColumnCollection intersection = (ColumnCollection) set1.clone(); + intersection.and(set2); + return intersection.cardinality(); + } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java index 608327a..c2bfe60 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java @@ -1,17 +1,17 @@ -package fdiscovery.columns; - - -public class DifferenceSet extends ColumnCollection { - - private static final long serialVersionUID = -5174627424398542681L; - - private long numberOfColumns; - - public DifferenceSet(AgreeSet agreeSet) { - super(agreeSet.getNumberOfColumns()); - this.numberOfColumns = agreeSet.getNumberOfColumns(); - - this.bits = agreeSet.getBits().clone(); - this.flip(0, this.numberOfColumns); - } -} +package fdiscovery.columns; + + +public class DifferenceSet extends ColumnCollection { + + private static final long serialVersionUID = -5174627424398542681L; + + private long numberOfColumns; + + public DifferenceSet(AgreeSet agreeSet) { + super(agreeSet.getNumberOfColumns()); + this.numberOfColumns = agreeSet.getNumberOfColumns(); + + this.or(agreeSet); + this.flip(0, this.numberOfColumns); + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java index da4c6cc..7136090 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java @@ -7,7 +7,7 @@ public class Path extends ColumnCollection { private static final long serialVersionUID = -6451347203736964695L; - public Path(long numberOfColumns) { + public Path(int numberOfColumns) { super(numberOfColumns); } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java index a2b5851..35cc7c9 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java @@ -17,13 +17,13 @@ public PartialOrder(DifferenceSets differenceSets) { for (DifferenceSet differenceSet : differenceSets) { // increase the cover count for set columns - long bitIndex = 0; + int bitIndex = 0; while (bitIndex < differenceSet.getNumberOfColumns()) { - long currentNextSetBit = differenceSet.nextSetBit(bitIndex); + int currentNextSetBit = differenceSet.nextSetBit(bitIndex); if (currentNextSetBit != -1) { bitIndex = currentNextSetBit + 1; - orderMap.putIfAbsent((int) currentNextSetBit, 0); - orderMap.increment((int) currentNextSetBit); + orderMap.putIfAbsent(currentNextSetBit, 0); + orderMap.increment(currentNextSetBit); } else { bitIndex = differenceSet.getNumberOfColumns(); } @@ -38,18 +38,18 @@ public PartialOrder(DifferenceSets differenceSets) { } - public PartialOrder(DifferenceSets differenceSets, long columnIndexToSkip) { + public PartialOrder(DifferenceSets differenceSets, int columnIndexToSkip) { TIntIntHashMap orderMap = new TIntIntHashMap(); for (DifferenceSet differenceSet : differenceSets) { // increase the cover count for set columns - long bitIndex = columnIndexToSkip; + int bitIndex = columnIndexToSkip; while (bitIndex < differenceSet.getNumberOfColumns()) { - long currentNextSetBit = differenceSet.nextSetBit(bitIndex); + int currentNextSetBit = differenceSet.nextSetBit(bitIndex); if (currentNextSetBit != -1) { bitIndex = currentNextSetBit + 1; - orderMap.putIfAbsent((int) currentNextSetBit, 0); - orderMap.increment((int) currentNextSetBit); + orderMap.putIfAbsent(currentNextSetBit, 0); + orderMap.increment(currentNextSetBit); } else { bitIndex = differenceSet.getNumberOfColumns(); } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java index 565048a..f6fe76a 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java @@ -6,7 +6,7 @@ import fdiscovery.columns.ColumnCollection; // from rhs to lhs -public abstract class PruneTable extends HashMap>> { +public abstract class PruneTable extends HashMap>> { private static final long serialVersionUID = 4470955427882698208L; @@ -23,7 +23,7 @@ public int getCount(ColumnCollection RHS) { public void addValue(ColumnCollection RHS, ColumnCollection LHS) { if (!this.containsKey(RHS)) { - this.put(RHS, new HashMap>()); + this.put(RHS, new HashMap>()); } if (!this.get(RHS).containsKey(Integer.valueOf(LHS.cardinality()))) { this.get(RHS).put(Integer.valueOf(LHS.cardinality()), new ArrayList()); From 8e4aa9da2e11218a13b1ce3a3a0e632d263c8037 Mon Sep 17 00:00:00 2001 From: Torben Eims Date: Mon, 15 May 2023 16:25:53 +0200 Subject: [PATCH 07/10] Refactoring ColumnCollection: Merged superfluous setting of numberOfColumns from master --- dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java index c2bfe60..44a6445 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java @@ -4,12 +4,9 @@ public class DifferenceSet extends ColumnCollection { private static final long serialVersionUID = -5174627424398542681L; - - private long numberOfColumns; - + public DifferenceSet(AgreeSet agreeSet) { super(agreeSet.getNumberOfColumns()); - this.numberOfColumns = agreeSet.getNumberOfColumns(); this.or(agreeSet); this.flip(0, this.numberOfColumns); From 6868b86150889e85bbb9faa32e78763edda20705 Mon Sep 17 00:00:00 2001 From: Torben Eims Date: Mon, 15 May 2023 16:29:12 +0200 Subject: [PATCH 08/10] Maven updates: DFDMetanome is now in package de.metanome.algorithms.dfd (was de.metanome.algorithms.dfd.dfdMetanome), renaming --- dfd/dfdAlgorithm/pom.xml | 28 +++++++--------- dfd/dfdMetanome/pom.xml | 33 ++++++++----------- .../dfd/dfdMetanome/DFDMetanome.java | 12 +++---- dfd/pom.xml | 9 +++-- 4 files changed, 35 insertions(+), 47 deletions(-) diff --git a/dfd/dfdAlgorithm/pom.xml b/dfd/dfdAlgorithm/pom.xml index 975b05d..ec91c77 100755 --- a/dfd/dfdAlgorithm/pom.xml +++ b/dfd/dfdAlgorithm/pom.xml @@ -2,17 +2,17 @@ xmlns="http://maven.apache.org/POM/4.0.0" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - de.metanome.algorithms.dfd - dfdAlgorithm + + DFDAlgorithm jar - dfdAlgorithm + DFDAlgorithm - de.metanome.algorithms - algorithms - ${metanome.version} - ../../pom.xml + de.metanome.algorithms.dfd + DFDModules + 1.2-SNAPSHOT + ../pom.xml @@ -20,10 +20,10 @@ maven-compiler-plugin - 3.1 + ${maven-compiler-plugin.version} - 1.7 - 1.7 + 1.8 + 1.8 @@ -38,7 +38,6 @@ com.google.guava guava - 15.0 net.sf.trove4j @@ -55,11 +54,6 @@ javatuples 1.2 - - org.apache.lucene - lucene-core - 3.0.3 - log4j log4j @@ -86,4 +80,4 @@ 1.2 - \ No newline at end of file + diff --git a/dfd/dfdMetanome/pom.xml b/dfd/dfdMetanome/pom.xml index 9c50517..4ab019d 100644 --- a/dfd/dfdMetanome/pom.xml +++ b/dfd/dfdMetanome/pom.xml @@ -4,17 +4,16 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - de.metanome.algorithms.dfd - dfdMetanome + DFD jar - dfdMetanome + DFD - de.metanome.algorithms - algorithms - ${metanome.version} - ../../pom.xml + de.metanome.algorithms.dfd + DFDModules + 1.2-SNAPSHOT + ../pom.xml @@ -30,10 +29,10 @@ org.apache.maven.plugins maven-compiler-plugin - 3.1 + ${maven-compiler-plugin.version} - 1.7 - 1.7 + 1.8 + 1.8 true true -Xlint:all @@ -42,12 +41,12 @@ org.apache.maven.plugins maven-assembly-plugin - 2.4 + ${maven-assembly-plugin.version} - de.metanome.algorithms.dfd.dfdMetanome.DFDMetanome + de.metanome.algorithms.dfd.DFDMetanome @@ -73,16 +72,12 @@ de.metanome algorithm_integration - ${metanome.version} - compile de.metanome.algorithms.dfd - dfdAlgorithm - ${metanome.version} - compile + DFDAlgorithm + 1.2-SNAPSHOT - - \ No newline at end of file + diff --git a/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java b/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java index 0fc176c..f41face 100644 --- a/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java +++ b/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java @@ -1,4 +1,9 @@ -package de.metanome.algorithms.dfd.dfdMetanome; +package de.metanome.algorithms.dfd; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import de.metanome.algorithm_integration.AlgorithmConfigurationException; import de.metanome.algorithm_integration.AlgorithmExecutionException; @@ -11,11 +16,6 @@ import de.metanome.algorithm_integration.input.FileInputGenerator; import de.metanome.algorithm_integration.result_receiver.FunctionalDependencyResultReceiver; import de.metanome.algorithm_integration.results.FunctionalDependency; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; - import fdiscovery.approach.runner.DFDMiner; import fdiscovery.columns.ColumnCollection; import fdiscovery.general.FunctionalDependencies; diff --git a/dfd/pom.xml b/dfd/pom.xml index 39dce46..e92f1a9 100644 --- a/dfd/pom.xml +++ b/dfd/pom.xml @@ -5,10 +5,10 @@ 4.0.0 de.metanome.algorithms.dfd - dfd + DFDModules pom - dfd + DFDModules dfdAlgorithm @@ -16,15 +16,14 @@ - 0.0.2-SNAPSHOT UTF-8 de.metanome.algorithms algorithms - ${metanome.version} + 1.2-SNAPSHOT ../pom.xml - \ No newline at end of file + From 2d1724e40525508959cae2ab984087948788ca41 Mon Sep 17 00:00:00 2001 From: Torben Eims Date: Mon, 15 May 2023 16:30:34 +0200 Subject: [PATCH 09/10] Added (unused) RelationalInput variables --- .../de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java b/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java index f41face..e3a960b 100644 --- a/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java +++ b/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java @@ -14,6 +14,7 @@ import de.metanome.algorithm_integration.configuration.ConfigurationRequirement; import de.metanome.algorithm_integration.configuration.ConfigurationRequirementFileInput; import de.metanome.algorithm_integration.input.FileInputGenerator; +import de.metanome.algorithm_integration.input.RelationalInput; import de.metanome.algorithm_integration.result_receiver.FunctionalDependencyResultReceiver; import de.metanome.algorithm_integration.results.FunctionalDependency; import fdiscovery.approach.runner.DFDMiner; @@ -70,6 +71,11 @@ public void execute() throws AlgorithmExecutionException { DFDMiner dfdMiner = new DFDMiner(inputFileProcessor); dfdMiner.run(); FunctionalDependencies fds = dfdMiner.getDependencies(); + + RelationalInput input = fileInput.generateNewCopy(); + String relationName = input.relationName(); + List columnNames = input.columnNames(); + for (ColumnCollection determining : fds.keySet()) { for (int dependentColumn : fds.get(determining).getSetBits()) { ColumnIdentifier[] From 9aaf911611b98c07c737b70d62f40825ed332196 Mon Sep 17 00:00:00 2001 From: Torben Eims Date: Mon, 15 May 2023 17:12:25 +0200 Subject: [PATCH 10/10] Changes from master I missed earlier --- .../fdiscovery/general/FunctionalDependencies.java | 2 +- .../algorithms/dfd/dfdMetanome/DFDMetanome.java | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java b/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java index 3333de7..7868f32 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java @@ -85,7 +85,7 @@ public int getNumberOfNonAtomicDependencies() { return nonAtomicFDCount; } - public void addRHSColumn(ColumnCollection lhs, Integer rhsIndex) { + public void addRHSColumn(ColumnCollection lhs, int rhsIndex) { ColumnCollection rhs = null; if (!this.containsKey(lhs)) { rhs = new ColumnCollection(lhs.getNumberOfColumns()); diff --git a/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java b/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java index e3a960b..e631ab7 100644 --- a/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java +++ b/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java @@ -84,13 +84,13 @@ public void execute() throws AlgorithmExecutionException { int i = 0; for (int determiningColumn : determining.getSetBits()) { determiningColumns[i] = - new ColumnIdentifier(this.identifier, "Column " + determiningColumn); + new ColumnIdentifier(relationName, columnNames.get(determiningColumn)); i++; } FunctionalDependency fd = new FunctionalDependency( new ColumnCombination(determiningColumns), - new ColumnIdentifier(this.identifier, "Column " + dependentColumn)); + new ColumnIdentifier(relationName, columnNames.get(dependentColumn))); this.resultReceiver.receiveResult(fd); } } @@ -98,4 +98,14 @@ public void execute() throws AlgorithmExecutionException { } } + @Override + public String getAuthors() { + return "Patrick Schulze"; + } + + @Override + public String getDescription() { + return "Random Walk-based FD discovery"; + } + }