diff --git a/dfd/dfdAlgorithm/pom.xml b/dfd/dfdAlgorithm/pom.xml
index 128a457..ec91c77 100755
--- a/dfd/dfdAlgorithm/pom.xml
+++ b/dfd/dfdAlgorithm/pom.xml
@@ -2,7 +2,7 @@
xmlns="http://maven.apache.org/POM/4.0.0"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4.0.0
-
+
DFDAlgorithm
jar
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java b/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java
index cd29e27..62fedb3 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java
@@ -1,534 +1,534 @@
-package fdiscovery.approach.runner;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Stack;
-
-import org.apache.commons.cli.CommandLine;
-
-
-import fdiscovery.approach.ColumnOrder;
-import fdiscovery.columns.ColumnCollection;
-import fdiscovery.general.CLIParserMiner;
-import fdiscovery.general.ColumnFiles;
-import fdiscovery.general.FunctionalDependencies;
-import fdiscovery.general.Miner;
-import fdiscovery.partitions.ComposedPartition;
-import fdiscovery.partitions.FileBasedPartition;
-import fdiscovery.partitions.FileBasedPartitions;
-import fdiscovery.partitions.MemoryManagedJoinedPartitions;
-import fdiscovery.partitions.Partition;
-import fdiscovery.preprocessing.SVFileProcessor;
-import fdiscovery.pruning.Dependencies;
-import fdiscovery.pruning.NonDependencies;
-import fdiscovery.pruning.Observation;
-import fdiscovery.pruning.Observations;
-import fdiscovery.pruning.Seed;
-import gnu.trove.map.hash.TLongObjectHashMap;
-import gnu.trove.set.hash.THashSet;
-
-public class DFDMiner extends Miner implements Runnable {
-
- private int numberOfColumns;
- private int numberOfRows;
- private ColumnOrder columnOrder;
- private Stack trace;
- private Stack seeds;
- private Observations observations;
- private FunctionalDependencies minimalDependencies;
- private FunctionalDependencies maximalNonDependencies;
- private FileBasedPartitions fileBasedPartitions;
- private Dependencies dependencies;
- private NonDependencies nonDependencies;
- private MemoryManagedJoinedPartitions joinedPartitions;
-
- public static void main(String[] args) {
- createColumDirectory();
-
- File source = new File(DFDMiner.input);
- SVFileProcessor inputFileProcessor = null;
- try {
- long timeStart = System.currentTimeMillis();
-
- inputFileProcessor = new SVFileProcessor(source);
- inputFileProcessor.init();
- System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter());
- System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns());
- System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows());
- inputFileProcessor.createColumnFiles();
- DFDMiner dfdRunner = new DFDMiner(inputFileProcessor);
-
- dfdRunner.run();
- System.out.println(String.format("Number of dependencies:\t%d", Integer.valueOf(dfdRunner.minimalDependencies.getCount())));
- long timeFindFDs = System.currentTimeMillis();
- System.out.println("Total time:\t" + (timeFindFDs - timeStart) / 1000 + "s");
- System.out.println(dfdRunner.getDependencies());
-
- } catch (FileNotFoundException e) {
- System.out.println("The input file could not be found.");
- } catch (IOException e) {
- System.out.println("The input reader could not be reset.");
- }
- }
-
- public static void main2(String[] args) {
- CLIParserMiner parser = new CLIParserMiner();
- CommandLine cli = parser.parse(args);
- String inputFilename = new String();
- String columnFileDirectory = new String();
- String resultFile = new String();
- int numberOfColumns = 0;
- int numberOfRows = 0;
-
- if (cli.hasOption("file")) {
- inputFilename = cli.getOptionValue("file");
- }
- if (cli.hasOption("input")) {
- columnFileDirectory = cli.getOptionValue("input");
- }
- if (cli.hasOption("result")) {
- resultFile = cli.getOptionValue("result");
- }
- if (cli.hasOption("columns")) {
- numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue();
- }
- if (cli.hasOption("rows")) {
- numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue();
- }
- ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows);
- long timeStart = System.currentTimeMillis();
- DFDMiner runner = new DFDMiner(columnFiles, numberOfRows);
- try {
- runner.run();
- long timeEnd = System.currentTimeMillis();
- runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename);
- } catch (OutOfMemoryError e) {
- System.exit(Miner.STATUS_OOM);
- }
- System.exit(0);
- }
-
- private void writeOutputSuccessful(String outputFile, long time, String inputFileName) {
-
- String timeString = (time != -1) ? String.format("%.1f", Double.valueOf((double) (time) / 1000)) : "-1";
- StringBuilder outputBuilder = new StringBuilder();
- if (!inputFileName.isEmpty()) {
- outputBuilder.append(String.format("%s\t", inputFileName));
- }
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows)));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns)));
- outputBuilder.append(String.format("%s\t", timeString));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount())));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.joinedPartitions.getCount())));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.joinedPartitions.getTotalCount())));
- outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory())));
- outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false)));
-
- try {
- BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true));
- resultFileWriter.write(outputBuilder.toString());
- System.out.print(outputBuilder.toString());
- resultFileWriter.close();
- } catch (IOException e) {
- System.out.println("Couldn't write output.");
- }
- }
-
- public DFDMiner(SVFileProcessor table) throws OutOfMemoryError {
- this.observations = new Observations();
- this.numberOfColumns = table.getNumberOfColumns();
- this.numberOfRows = table.getNumberOfRows();
- this.trace = new Stack<>();
- this.seeds = new Stack<>();
- this.minimalDependencies = new FunctionalDependencies();
- this.maximalNonDependencies = new FunctionalDependencies();
- this.dependencies = new Dependencies(this.numberOfColumns);
- this.nonDependencies = new NonDependencies(this.numberOfColumns);
- this.joinedPartitions = new MemoryManagedJoinedPartitions(this.numberOfColumns);
- this.fileBasedPartitions = new FileBasedPartitions(table);
- this.columnOrder = new ColumnOrder(fileBasedPartitions);
- for (int columnIndex = 0; columnIndex < this.numberOfColumns; columnIndex++) {
- ColumnCollection columnIdentifier = new ColumnCollection(this.numberOfColumns);
- columnIdentifier.set(columnIndex);
- this.joinedPartitions.addPartition(this.fileBasedPartitions.get(columnIndex));
- }
- }
-
- public DFDMiner(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError {
- this.observations = new Observations();
- this.numberOfColumns = columnFiles.getNumberOfColumns();
- this.numberOfRows = numberOfRows;
- this.trace = new Stack<>();
- this.seeds = new Stack<>();
- this.minimalDependencies = new FunctionalDependencies();
- this.maximalNonDependencies = new FunctionalDependencies();
- this.dependencies = new Dependencies(this.numberOfColumns);
- this.nonDependencies = new NonDependencies(this.numberOfColumns);
- this.joinedPartitions = new MemoryManagedJoinedPartitions(this.numberOfColumns);
- this.fileBasedPartitions = new FileBasedPartitions(columnFiles, numberOfRows);
- columnFiles.clear();
- this.columnOrder = new ColumnOrder(fileBasedPartitions);
- for (int columnIndex = 0; columnIndex < this.numberOfColumns; columnIndex++) {
- ColumnCollection columnIdentifier = new ColumnCollection(this.numberOfColumns);
- columnIdentifier.set(columnIndex);
- this.joinedPartitions.addPartition(this.fileBasedPartitions.get(columnIndex));
- }
- }
-
- public void run() throws OutOfMemoryError {
-
- ArrayList keys = new ArrayList<>();
-
- // check each column for uniqueness
- // if a column is unique it's a key for all other columns
- // therefore uniquePartition -> schema - uniquePartition
- for (FileBasedPartition fileBasedPartition : this.fileBasedPartitions) {
- if (fileBasedPartition.isUnique()) {
- ColumnCollection uniquePartitionIndices = fileBasedPartition.getIndices();
- ColumnCollection RHS = uniquePartitionIndices.complementCopy();
- this.minimalDependencies.put(uniquePartitionIndices, RHS);
- // add unique columns to minimal uniques
- keys.add(uniquePartitionIndices);
- }
- }
-
- // do this for all RHS
- for (int currentRHSIndex = 0; currentRHSIndex < this.numberOfColumns; currentRHSIndex++) {
-
- this.dependencies = new Dependencies(numberOfColumns);
- this.nonDependencies = new NonDependencies(numberOfColumns);
- this.trace.clear();
- this.observations.clear();
-
- for (int lhsIndex = 0; lhsIndex < this.numberOfColumns; lhsIndex++) {
- if (lhsIndex != currentRHSIndex) {
- ColumnCollection lhs = new ColumnCollection(numberOfColumns);
- lhs.set(lhsIndex);
- if (keys.contains(lhs)) {
- this.dependencies.add(lhs);
- this.observations.put(lhs, Observation.MINIMAL_DEPENDENCY);
- }
- }
- }
-
- ColumnCollection currentRHS = new ColumnCollection(numberOfColumns);
- currentRHS.set(currentRHSIndex);
-
- // generate seeds
- for (int partitionIndex : columnOrder.getOrderHighDistinctCount(currentRHS.complementCopy())) {
- if (partitionIndex != currentRHSIndex) {
- FileBasedPartition lhsPartition = this.fileBasedPartitions.get(partitionIndex);
- this.seeds.push(new Seed(lhsPartition.getIndices()));
- }
- }
-
- do {
- while (!seeds.isEmpty()) {
- Seed currentSeed = this.randomTake();
- do {
- ColumnCollection lhsIndices = currentSeed.getIndices();
- Observation observationOfLHS = this.observations.get(currentSeed.getIndices());
- if (observationOfLHS == null) {
- observationOfLHS = this.checkDependencyAndStoreIt(currentSeed, currentRHSIndex);
-
- // if we couldn't find any dependency that is a
- // subset of the current valid LHS it is minimal
- if (observationOfLHS == Observation.MINIMAL_DEPENDENCY) {
- this.minimalDependencies.addRHSColumn(lhsIndices, currentRHSIndex);
- }
- // if we couldn't find any non-dependency that is
- // superset of the current non-valid LHS it is
- // maximal
- else if (observationOfLHS == Observation.MAXIMAL_NON_DEPENDENCY) {
- this.maximalNonDependencies.addRHSColumn(lhsIndices, currentRHSIndex);
- }
- currentSeed = randomWalkStep(currentSeed, currentRHSIndex);
- } else {
-// System.out.println(String.format("[2]Current [%s]%s\t[%s]", (char) (currentRHSIndex + 65), currentSeed, observationOfLHS));
- if (observationOfLHS.isCandidate()) {
- if (observationOfLHS.isDependency()) {
- Observation updatedDependencyType = this.observations.updateDependencyType(currentSeed.getIndices());
- // System.out.println(String.format("\tupdated:\t%s",
- // updatedDependencyType));
- this.observations.put(lhsIndices, updatedDependencyType);
- if (updatedDependencyType == Observation.MINIMAL_DEPENDENCY) {
- // System.out.println("Add min dependency:\t"
- // + currentSeed);
- this.minimalDependencies.addRHSColumn(lhsIndices, currentRHSIndex);
- }
- } else {
- Observation updatedNonDependencyType = this.observations.updateNonDependencyType(currentSeed.getIndices(), currentRHSIndex);
- this.observations.put(lhsIndices, updatedNonDependencyType);
- // System.out.println(String.format("\tupdated:\t%s",
- // updatedNonDependencyType));
- if (updatedNonDependencyType == Observation.MAXIMAL_NON_DEPENDENCY) {
- this.maximalNonDependencies.addRHSColumn(lhsIndices, currentRHSIndex);
- }
- }
- }
- currentSeed = randomWalkStep(currentSeed, currentRHSIndex);
- }
-
- } while (currentSeed != null);
- }
- seeds = this.nextSeeds(currentRHSIndex);
- } while (!seeds.isEmpty());
- }
- // System.out.println(String.format("Number partitions:\t%d",
- // this.joinedPartitions.getCount()));
- }
-
- private Observation checkDependencyAndStoreIt(Seed seed, int currentRHSIndex) {
- if (nonDependencies.isRepresented(seed.getIndices())) {
- // System.out.println("Skip because of nonDependency");
- Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex);
- this.observations.put(seed.getIndices(), observationOfLHS);
- this.nonDependencies.add(seed.getIndices());
- return observationOfLHS;
- } else if (dependencies.isRepresented(seed.getIndices())) {
- // System.out.println("Skip because of dependency");
- Observation observationOfLHS = this.observations.updateDependencyType(seed.getIndices());
- this.observations.put(seed.getIndices(), observationOfLHS);
- this.dependencies.add(seed.getIndices());
- return observationOfLHS;
- }
-
- FileBasedPartition currentRHSPartition = this.fileBasedPartitions.get(currentRHSIndex);
- Partition currentLHSPartition = null;
- Partition currentLHSJoinedRHSPartition = null;
-
- if (seed.isAtomic()) {
- currentLHSPartition = this.joinedPartitions.get(seed.getIndices());
- currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition);
- } else {
-
- // if we went upwards in the lattice we can build the currentLHS
- // partition directly from the previous partition
- if (seed.getAdditionalColumnIndex() != -1) {
- int additionalColumn = seed.getAdditionalColumnIndex();
- Partition previousLHSPartition = joinedPartitions.get(seed.getBaseIndices());
- if (previousLHSPartition == null) {
- ArrayList partitionsToJoin = joinedPartitions.getBestMatchingPartitions(seed.getBaseIndices());
- previousLHSPartition = ComposedPartition.buildPartition(partitionsToJoin);
- }
- FileBasedPartition additionalColumnPartition = this.fileBasedPartitions.get(additionalColumn);
- currentLHSPartition = this.joinedPartitions.get(previousLHSPartition.getIndices().setCopy(additionalColumn));
- if (currentLHSPartition == null) {
- currentLHSPartition = new ComposedPartition(previousLHSPartition, additionalColumnPartition);
- this.joinedPartitions.addPartition(currentLHSPartition);
- }
- currentLHSJoinedRHSPartition = this.joinedPartitions.get(currentLHSPartition.getIndices().setCopy(currentRHSIndex));
- if (currentLHSJoinedRHSPartition == null) {
- currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition);
- this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition);
- }
- } else {
- currentLHSPartition = this.joinedPartitions.get(seed.getIndices());
- if (currentLHSPartition == null) {
- ArrayList partitionsToJoin = joinedPartitions.getBestMatchingPartitions(seed.getIndices());
- currentLHSPartition = ComposedPartition.buildPartition(partitionsToJoin);
- this.joinedPartitions.addPartition(currentLHSPartition);
- }
- currentLHSJoinedRHSPartition = this.joinedPartitions.get(currentLHSPartition.getIndices().setCopy(currentRHSIndex));
- if (currentLHSJoinedRHSPartition == null) {
- currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition);
- this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition);
- }
- }
-// this.joinedPartitions.addPartition(currentLHSPartition);
-// this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition);
- }
-
- if (Partition.representsFD(currentLHSPartition, currentLHSJoinedRHSPartition)) {
- Observation observationOfLHS = this.observations.updateDependencyType(seed.getIndices());
- this.observations.put(seed.getIndices(), observationOfLHS);
- this.dependencies.add(seed.getIndices());
- return observationOfLHS;
- }
- Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex);
- this.observations.put(seed.getIndices(), observationOfLHS);
- this.nonDependencies.add(seed.getIndices());
- return observationOfLHS;
- }
-
- private Stack nextSeeds(int currentRHSIndex) {
-// System.out.println("Find holes");
- THashSet deps = new THashSet<>();
- ArrayList currentMaximalNonDependencies = maximalNonDependencies.getLHSForRHS(currentRHSIndex);
- HashSet currentMinimalDependencies = new HashSet<>(minimalDependencies.getLHSForRHS(currentRHSIndex));
- ArrayList newDeps = new ArrayList<>(numberOfColumns * deps.size());
-// Holes holes = new Holes();
-
-// int i = 0;
-// for (ColumnCollection maximalNonDependency : currentMaximalNonDependencies) {
-// ColumnCollection complement = maximalNonDependency.setCopy(currentRHSIndex).complement();
-// if (deps.isEmpty()) {
-// ColumnCollection emptyColumnIndices = new ColumnCollection(numberOfColumns);
-// for (Integer complementColumnIndex : complement.getSetBits()) {
-// deps.add(emptyColumnIndices.setCopy(complementColumnIndex));
-// }
-// } else {
-// for (ColumnCollection dep : deps) {
-// int[] setBits = complement.getSetBits();
-// for (int setBit = 0; setBit < setBits.length; setBit++) {
-// holes.add(dep.setCopy(setBits[setBit]));
-//// System.out.println("Dep:\t" + dep.setCopy(setBits[setBit]));
-// }
-// }
-// // minimize newDeps
-// System.out.println(i++ + "\t" + currentMaximalNonDependencies.size());
-// System.out.println("total deps:\t" + deps.size());
-// System.out.println("before minimizing:\t" + holes.size());
-//// ArrayList minimizedNewDeps = minimizeSeeds(newDeps);
-// holes.minimize();
-// System.out.println("after minimizing:\t" + holes.size());
-// deps.clear();
-// deps.addAll(holes);
-// holes.clear();
-// }
-// }
-
- for (ColumnCollection maximalNonDependency : currentMaximalNonDependencies) {
- ColumnCollection complement = maximalNonDependency.setCopy(currentRHSIndex).complement();
- if (deps.isEmpty()) {
- ColumnCollection emptyColumnIndices = new ColumnCollection(numberOfColumns);
- for (int complementColumnIndex : complement.getSetBits()) {
- deps.add(emptyColumnIndices.setCopy(complementColumnIndex));
- }
- } else {
- for (ColumnCollection dep : deps) {
- int[] setBits = complement.getSetBits();
- for (int setBit = 0; setBit < setBits.length; setBit++) {
- newDeps.add(dep.setCopy(setBits[setBit]));
- }
- }
- // minimize newDeps
- ArrayList minimizedNewDeps = minimizeSeeds(newDeps);
- deps.clear();
- deps.addAll(minimizedNewDeps);
- newDeps.clear();
- }
- }
-
- // return only elements that aren't already covered by the minimal
- // dependencies
- Stack remainingSeeds = new Stack<>();
- deps.removeAll(currentMinimalDependencies);
- for (ColumnCollection remainingSeed : deps) {
- remainingSeeds.push(new Seed(remainingSeed));
- }
-
- return remainingSeeds;
- }
-
- private ArrayList minimizeSeeds(ArrayList seeds) {
- long maxCardinality = 0;
- TLongObjectHashMap> seedsBySize = new TLongObjectHashMap<>(numberOfColumns);
- for (ColumnCollection seed : seeds) {
- long cardinalityOfSeed = seed.cardinality();
- maxCardinality = Math.max(maxCardinality, cardinalityOfSeed);
- seedsBySize.putIfAbsent(cardinalityOfSeed, new ArrayList(seeds.size()/numberOfColumns));
- seedsBySize.get(cardinalityOfSeed).add(seed);
- }
-
- for (long lowerBound = 1; lowerBound < maxCardinality; lowerBound++) {
- ArrayList lowerBoundSeeds = seedsBySize.get(lowerBound);
- if (lowerBoundSeeds != null) {
- for (long upperBound = maxCardinality; upperBound > lowerBound; upperBound--) {
- ArrayList upperBoundSeeds = seedsBySize.get(upperBound);
- if (upperBoundSeeds != null) {
- for (Iterator lowerIt = lowerBoundSeeds.iterator(); lowerIt.hasNext();) {
- ColumnCollection lowerSeed = lowerIt.next();
- for (Iterator upperIt = upperBoundSeeds.iterator(); upperIt.hasNext();) {
- if (lowerSeed.isSubsetOf(upperIt.next())) {
- upperIt.remove();
- }
- }
- }
- }
- }
- }
- }
- ArrayList minimizedSeeds = new ArrayList<>();
- for (ArrayList seedList : seedsBySize.valueCollection()) {
- for (ColumnCollection seed : seedList) {
- minimizedSeeds.add(seed);
- }
- }
- return minimizedSeeds;
- }
-
- private Seed randomTake() {
- if (!this.seeds.isEmpty()) {
- return this.seeds.pop();
- }
- return null;
- }
-
- private Seed randomWalkStep(Seed currentSeed, int currentRHSIndex) {
- Observation observationOfSeed = this.observations.get(currentSeed.getIndices());
-
- if (observationOfSeed == Observation.CANDIDATE_MINIMAL_DEPENDENCY) {
- THashSet uncheckedSubsets = this.observations.getUncheckedMaximalSubsets(currentSeed.getIndices(), columnOrder);
- THashSet prunedNonDependencySubsets = nonDependencies.getPrunedSupersets(uncheckedSubsets);
- for (ColumnCollection prunedNonDependencySubset : prunedNonDependencySubsets) {
- observations.put(prunedNonDependencySubset, Observation.NON_DEPENDENCY);
- }
- uncheckedSubsets.removeAll(prunedNonDependencySubsets);
- if (uncheckedSubsets.isEmpty() && prunedNonDependencySubsets.isEmpty()) {
- observations.put(currentSeed.getIndices(), Observation.MINIMAL_DEPENDENCY);
- minimalDependencies.addRHSColumn(currentSeed.getIndices(), currentRHSIndex);
- } else if (!uncheckedSubsets.isEmpty()) {
- ColumnCollection notRepresentedUncheckedSubset = uncheckedSubsets.iterator().next();
- if (notRepresentedUncheckedSubset != null) {
- trace.push(currentSeed);
- return new Seed(notRepresentedUncheckedSubset);
- }
- }
- } else if (observationOfSeed == Observation.CANDIDATE_MAXIMAL_NON_DEPENDENCY) {
- THashSet uncheckedSupersets = this.observations.getUncheckedMinimalSupersets(currentSeed.getIndices(), currentRHSIndex, columnOrder);
- THashSet prunedNonDependencySupersets = nonDependencies.getPrunedSupersets(uncheckedSupersets);
- THashSet prunedDependencySupersets = dependencies.getPrunedSubsets(uncheckedSupersets);
- for (ColumnCollection prunedNonDependencySuperset : prunedNonDependencySupersets) {
- observations.put(prunedNonDependencySuperset, Observation.NON_DEPENDENCY);
- }
- for (ColumnCollection prunedDependencySuperset : prunedDependencySupersets) {
- observations.put(prunedDependencySuperset, Observation.DEPENDENCY);
- }
- uncheckedSupersets.removeAll(prunedDependencySupersets);
- uncheckedSupersets.removeAll(prunedNonDependencySupersets);
- if (uncheckedSupersets.isEmpty() && prunedNonDependencySupersets.isEmpty()) {
- observations.put(currentSeed.getIndices(), Observation.MAXIMAL_NON_DEPENDENCY);
- maximalNonDependencies.addRHSColumn(currentSeed.getIndices(), currentRHSIndex);
- } else if (!uncheckedSupersets.isEmpty()) {
- ColumnCollection notRepresentedUncheckedSuperset = uncheckedSupersets.iterator().next();
- if (notRepresentedUncheckedSuperset != null) {
- trace.push(currentSeed);
- int additionalColumn = notRepresentedUncheckedSuperset.removeCopy(currentSeed.getIndices()).nextSetBit(0);
- return new Seed(notRepresentedUncheckedSuperset, additionalColumn);
- }
- }
- }
- if (!this.trace.isEmpty()) {
- Seed nextSeed = this.trace.pop();
- return nextSeed;
- }
- return null;
- }
-
- public FunctionalDependencies getDependencies() {
- return this.minimalDependencies;
- }
+package fdiscovery.approach.runner;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Stack;
+
+import org.apache.commons.cli.CommandLine;
+
+
+import fdiscovery.approach.ColumnOrder;
+import fdiscovery.columns.ColumnCollection;
+import fdiscovery.general.CLIParserMiner;
+import fdiscovery.general.ColumnFiles;
+import fdiscovery.general.FunctionalDependencies;
+import fdiscovery.general.Miner;
+import fdiscovery.partitions.ComposedPartition;
+import fdiscovery.partitions.FileBasedPartition;
+import fdiscovery.partitions.FileBasedPartitions;
+import fdiscovery.partitions.MemoryManagedJoinedPartitions;
+import fdiscovery.partitions.Partition;
+import fdiscovery.preprocessing.SVFileProcessor;
+import fdiscovery.pruning.Dependencies;
+import fdiscovery.pruning.NonDependencies;
+import fdiscovery.pruning.Observation;
+import fdiscovery.pruning.Observations;
+import fdiscovery.pruning.Seed;
+import gnu.trove.map.hash.TLongObjectHashMap;
+import gnu.trove.set.hash.THashSet;
+
+public class DFDMiner extends Miner implements Runnable {
+
+ private int numberOfColumns;
+ private int numberOfRows;
+ private ColumnOrder columnOrder;
+ private Stack trace;
+ private Stack seeds;
+ private Observations observations;
+ private FunctionalDependencies minimalDependencies;
+ private FunctionalDependencies maximalNonDependencies;
+ private FileBasedPartitions fileBasedPartitions;
+ private Dependencies dependencies;
+ private NonDependencies nonDependencies;
+ private MemoryManagedJoinedPartitions joinedPartitions;
+
+ public static void main(String[] args) {
+ createColumDirectory();
+
+ File source = new File(DFDMiner.input);
+ SVFileProcessor inputFileProcessor = null;
+ try {
+ long timeStart = System.currentTimeMillis();
+
+ inputFileProcessor = new SVFileProcessor(source);
+ inputFileProcessor.init();
+ System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter());
+ System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns());
+ System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows());
+ inputFileProcessor.createColumnFiles();
+ DFDMiner dfdRunner = new DFDMiner(inputFileProcessor);
+
+ dfdRunner.run();
+ System.out.println(String.format("Number of dependencies:\t%d", Integer.valueOf(dfdRunner.minimalDependencies.getCount())));
+ long timeFindFDs = System.currentTimeMillis();
+ System.out.println("Total time:\t" + (timeFindFDs - timeStart) / 1000 + "s");
+ System.out.println(dfdRunner.getDependencies());
+
+ } catch (FileNotFoundException e) {
+ System.out.println("The input file could not be found.");
+ } catch (IOException e) {
+ System.out.println("The input reader could not be reset.");
+ }
+ }
+
+ public static void main2(String[] args) {
+ CLIParserMiner parser = new CLIParserMiner();
+ CommandLine cli = parser.parse(args);
+ String inputFilename = new String();
+ String columnFileDirectory = new String();
+ String resultFile = new String();
+ int numberOfColumns = 0;
+ int numberOfRows = 0;
+
+ if (cli.hasOption("file")) {
+ inputFilename = cli.getOptionValue("file");
+ }
+ if (cli.hasOption("input")) {
+ columnFileDirectory = cli.getOptionValue("input");
+ }
+ if (cli.hasOption("result")) {
+ resultFile = cli.getOptionValue("result");
+ }
+ if (cli.hasOption("columns")) {
+ numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue();
+ }
+ if (cli.hasOption("rows")) {
+ numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue();
+ }
+ ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows);
+ long timeStart = System.currentTimeMillis();
+ DFDMiner runner = new DFDMiner(columnFiles, numberOfRows);
+ try {
+ runner.run();
+ long timeEnd = System.currentTimeMillis();
+ runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename);
+ } catch (OutOfMemoryError e) {
+ System.exit(Miner.STATUS_OOM);
+ }
+ System.exit(0);
+ }
+
+ private void writeOutputSuccessful(String outputFile, long time, String inputFileName) {
+
+ String timeString = (time != -1) ? String.format("%.1f", Double.valueOf((double) (time) / 1000)) : "-1";
+ StringBuilder outputBuilder = new StringBuilder();
+ if (!inputFileName.isEmpty()) {
+ outputBuilder.append(String.format("%s\t", inputFileName));
+ }
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows)));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns)));
+ outputBuilder.append(String.format("%s\t", timeString));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount())));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2))));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3))));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4))));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5))));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6))));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5))));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.joinedPartitions.getCount())));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.joinedPartitions.getTotalCount())));
+ outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory())));
+ outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false)));
+
+ try {
+ BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true));
+ resultFileWriter.write(outputBuilder.toString());
+ System.out.print(outputBuilder.toString());
+ resultFileWriter.close();
+ } catch (IOException e) {
+ System.out.println("Couldn't write output.");
+ }
+ }
+
+ public DFDMiner(SVFileProcessor table) throws OutOfMemoryError {
+ this.observations = new Observations();
+ this.numberOfColumns = table.getNumberOfColumns();
+ this.numberOfRows = table.getNumberOfRows();
+ this.trace = new Stack<>();
+ this.seeds = new Stack<>();
+ this.minimalDependencies = new FunctionalDependencies();
+ this.maximalNonDependencies = new FunctionalDependencies();
+ this.dependencies = new Dependencies(this.numberOfColumns);
+ this.nonDependencies = new NonDependencies(this.numberOfColumns);
+ this.joinedPartitions = new MemoryManagedJoinedPartitions(this.numberOfColumns);
+ this.fileBasedPartitions = new FileBasedPartitions(table);
+ this.columnOrder = new ColumnOrder(fileBasedPartitions);
+ for (int columnIndex = 0; columnIndex < this.numberOfColumns; columnIndex++) {
+ ColumnCollection columnIdentifier = new ColumnCollection(this.numberOfColumns);
+ columnIdentifier.set(columnIndex);
+ this.joinedPartitions.addPartition(this.fileBasedPartitions.get(columnIndex));
+ }
+ }
+
+ public DFDMiner(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError {
+ this.observations = new Observations();
+ this.numberOfColumns = columnFiles.getNumberOfColumns();
+ this.numberOfRows = numberOfRows;
+ this.trace = new Stack<>();
+ this.seeds = new Stack<>();
+ this.minimalDependencies = new FunctionalDependencies();
+ this.maximalNonDependencies = new FunctionalDependencies();
+ this.dependencies = new Dependencies(this.numberOfColumns);
+ this.nonDependencies = new NonDependencies(this.numberOfColumns);
+ this.joinedPartitions = new MemoryManagedJoinedPartitions(this.numberOfColumns);
+ this.fileBasedPartitions = new FileBasedPartitions(columnFiles, numberOfRows);
+ columnFiles.clear();
+ this.columnOrder = new ColumnOrder(fileBasedPartitions);
+ for (int columnIndex = 0; columnIndex < this.numberOfColumns; columnIndex++) {
+ ColumnCollection columnIdentifier = new ColumnCollection(this.numberOfColumns);
+ columnIdentifier.set(columnIndex);
+ this.joinedPartitions.addPartition(this.fileBasedPartitions.get(columnIndex));
+ }
+ }
+
+ public void run() throws OutOfMemoryError {
+
+ ArrayList<ColumnCollection> keys = new ArrayList<>();
+
+ // check each column for uniqueness
+ // if a column is unique it's a key for all other columns
+ // therefore uniquePartition -> schema - uniquePartition
+ for (FileBasedPartition fileBasedPartition : this.fileBasedPartitions) {
+ if (fileBasedPartition.isUnique()) {
+ ColumnCollection uniquePartitionIndices = fileBasedPartition.getIndices();
+ ColumnCollection RHS = uniquePartitionIndices.complementCopy();
+ this.minimalDependencies.put(uniquePartitionIndices, RHS);
+ // add unique columns to minimal uniques
+ keys.add(uniquePartitionIndices);
+ }
+ }
+
+ // do this for all RHS
+ for (int currentRHSIndex = 0; currentRHSIndex < this.numberOfColumns; currentRHSIndex++) {
+
+ this.dependencies = new Dependencies(numberOfColumns);
+ this.nonDependencies = new NonDependencies(numberOfColumns);
+ this.trace.clear();
+ this.observations.clear();
+
+ for (int lhsIndex = 0; lhsIndex < this.numberOfColumns; lhsIndex++) {
+ if (lhsIndex != currentRHSIndex) {
+ ColumnCollection lhs = new ColumnCollection(numberOfColumns);
+ lhs.set(lhsIndex);
+ if (keys.contains(lhs)) {
+ this.dependencies.add(lhs);
+ this.observations.put(lhs, Observation.MINIMAL_DEPENDENCY);
+ }
+ }
+ }
+
+ ColumnCollection currentRHS = new ColumnCollection(numberOfColumns);
+ currentRHS.set(currentRHSIndex);
+
+ // generate seeds
+ for (int partitionIndex : columnOrder.getOrderHighDistinctCount(currentRHS.complementCopy())) {
+ if (partitionIndex != currentRHSIndex) {
+ FileBasedPartition lhsPartition = this.fileBasedPartitions.get(partitionIndex);
+ this.seeds.push(new Seed(lhsPartition.getIndices()));
+ }
+ }
+
+ do {
+ while (!seeds.isEmpty()) {
+ Seed currentSeed = this.randomTake();
+ do {
+ ColumnCollection lhsIndices = currentSeed.getIndices();
+ Observation observationOfLHS = this.observations.get(currentSeed.getIndices());
+ if (observationOfLHS == null) {
+ observationOfLHS = this.checkDependencyAndStoreIt(currentSeed, currentRHSIndex);
+
+ // if we couldn't find any dependency that is a
+ // subset of the current valid LHS it is minimal
+ if (observationOfLHS == Observation.MINIMAL_DEPENDENCY) {
+ this.minimalDependencies.addRHSColumn(lhsIndices, currentRHSIndex);
+ }
+ // if we couldn't find any non-dependency that is
+ // superset of the current non-valid LHS it is
+ // maximal
+ else if (observationOfLHS == Observation.MAXIMAL_NON_DEPENDENCY) {
+ this.maximalNonDependencies.addRHSColumn(lhsIndices, currentRHSIndex);
+ }
+ currentSeed = randomWalkStep(currentSeed, currentRHSIndex);
+ } else {
+// System.out.println(String.format("[2]Current [%s]%s\t[%s]", (char) (currentRHSIndex + 65), currentSeed, observationOfLHS));
+ if (observationOfLHS.isCandidate()) {
+ if (observationOfLHS.isDependency()) {
+ Observation updatedDependencyType = this.observations.updateDependencyType(currentSeed.getIndices());
+ // System.out.println(String.format("\tupdated:\t%s",
+ // updatedDependencyType));
+ this.observations.put(lhsIndices, updatedDependencyType);
+ if (updatedDependencyType == Observation.MINIMAL_DEPENDENCY) {
+ // System.out.println("Add min dependency:\t"
+ // + currentSeed);
+ this.minimalDependencies.addRHSColumn(lhsIndices, currentRHSIndex);
+ }
+ } else {
+ Observation updatedNonDependencyType = this.observations.updateNonDependencyType(currentSeed.getIndices(), currentRHSIndex);
+ this.observations.put(lhsIndices, updatedNonDependencyType);
+ // System.out.println(String.format("\tupdated:\t%s",
+ // updatedNonDependencyType));
+ if (updatedNonDependencyType == Observation.MAXIMAL_NON_DEPENDENCY) {
+ this.maximalNonDependencies.addRHSColumn(lhsIndices, currentRHSIndex);
+ }
+ }
+ }
+ currentSeed = randomWalkStep(currentSeed, currentRHSIndex);
+ }
+
+ } while (currentSeed != null);
+ }
+ seeds = this.nextSeeds(currentRHSIndex);
+ } while (!seeds.isEmpty());
+ }
+ // System.out.println(String.format("Number partitions:\t%d",
+ // this.joinedPartitions.getCount()));
+ }
+
+ private Observation checkDependencyAndStoreIt(Seed seed, int currentRHSIndex) {
+ if (nonDependencies.isRepresented(seed.getIndices())) {
+ // System.out.println("Skip because of nonDependency");
+ Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex);
+ this.observations.put(seed.getIndices(), observationOfLHS);
+ this.nonDependencies.add(seed.getIndices());
+ return observationOfLHS;
+ } else if (dependencies.isRepresented(seed.getIndices())) {
+ // System.out.println("Skip because of dependency");
+ Observation observationOfLHS = this.observations.updateDependencyType(seed.getIndices());
+ this.observations.put(seed.getIndices(), observationOfLHS);
+ this.dependencies.add(seed.getIndices());
+ return observationOfLHS;
+ }
+
+ FileBasedPartition currentRHSPartition = this.fileBasedPartitions.get(currentRHSIndex);
+ Partition currentLHSPartition = null;
+ Partition currentLHSJoinedRHSPartition = null;
+
+ if (seed.isAtomic()) {
+ currentLHSPartition = this.joinedPartitions.get(seed.getIndices());
+ currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition);
+ } else {
+
+ // if we went upwards in the lattice we can build the currentLHS
+ // partition directly from the previous partition
+ if (seed.getAdditionalColumnIndex() != -1) {
+ int additionalColumn = seed.getAdditionalColumnIndex();
+ Partition previousLHSPartition = joinedPartitions.get(seed.getBaseIndices());
+ if (previousLHSPartition == null) {
+ ArrayList<Partition> partitionsToJoin = joinedPartitions.getBestMatchingPartitions(seed.getBaseIndices());
+ previousLHSPartition = ComposedPartition.buildPartition(partitionsToJoin);
+ }
+ FileBasedPartition additionalColumnPartition = this.fileBasedPartitions.get(additionalColumn);
+ currentLHSPartition = this.joinedPartitions.get(previousLHSPartition.getIndices().setCopy(additionalColumn));
+ if (currentLHSPartition == null) {
+ currentLHSPartition = new ComposedPartition(previousLHSPartition, additionalColumnPartition);
+ this.joinedPartitions.addPartition(currentLHSPartition);
+ }
+ currentLHSJoinedRHSPartition = this.joinedPartitions.get(currentLHSPartition.getIndices().setCopy(currentRHSIndex));
+ if (currentLHSJoinedRHSPartition == null) {
+ currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition);
+ this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition);
+ }
+ } else {
+ currentLHSPartition = this.joinedPartitions.get(seed.getIndices());
+ if (currentLHSPartition == null) {
+ ArrayList<Partition> partitionsToJoin = joinedPartitions.getBestMatchingPartitions(seed.getIndices());
+ currentLHSPartition = ComposedPartition.buildPartition(partitionsToJoin);
+ this.joinedPartitions.addPartition(currentLHSPartition);
+ }
+ currentLHSJoinedRHSPartition = this.joinedPartitions.get(currentLHSPartition.getIndices().setCopy(currentRHSIndex));
+ if (currentLHSJoinedRHSPartition == null) {
+ currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition);
+ this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition);
+ }
+ }
+// this.joinedPartitions.addPartition(currentLHSPartition);
+// this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition);
+ }
+
+ if (Partition.representsFD(currentLHSPartition, currentLHSJoinedRHSPartition)) {
+ Observation observationOfLHS = this.observations.updateDependencyType(seed.getIndices());
+ this.observations.put(seed.getIndices(), observationOfLHS);
+ this.dependencies.add(seed.getIndices());
+ return observationOfLHS;
+ }
+ Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex);
+ this.observations.put(seed.getIndices(), observationOfLHS);
+ this.nonDependencies.add(seed.getIndices());
+ return observationOfLHS;
+ }
+
+ private Stack<Seed> nextSeeds(int currentRHSIndex) {
+// System.out.println("Find holes");
+ THashSet<ColumnCollection> deps = new THashSet<>();
+ ArrayList<ColumnCollection> currentMaximalNonDependencies = maximalNonDependencies.getLHSForRHS(currentRHSIndex);
+ HashSet<ColumnCollection> currentMinimalDependencies = new HashSet<>(minimalDependencies.getLHSForRHS(currentRHSIndex));
+ ArrayList<ColumnCollection> newDeps = new ArrayList<>(numberOfColumns * deps.size());
+// Holes holes = new Holes();
+
+// int i = 0;
+// for (ColumnCollection maximalNonDependency : currentMaximalNonDependencies) {
+// ColumnCollection complement = maximalNonDependency.setCopy(currentRHSIndex).complement();
+// if (deps.isEmpty()) {
+// ColumnCollection emptyColumnIndices = new ColumnCollection(numberOfColumns);
+// for (Integer complementColumnIndex : complement.getSetBits()) {
+// deps.add(emptyColumnIndices.setCopy(complementColumnIndex));
+// }
+// } else {
+// for (ColumnCollection dep : deps) {
+// int[] setBits = complement.getSetBits();
+// for (int setBit = 0; setBit < setBits.length; setBit++) {
+// holes.add(dep.setCopy(setBits[setBit]));
+//// System.out.println("Dep:\t" + dep.setCopy(setBits[setBit]));
+// }
+// }
+// // minimize newDeps
+// System.out.println(i++ + "\t" + currentMaximalNonDependencies.size());
+// System.out.println("total deps:\t" + deps.size());
+// System.out.println("before minimizing:\t" + holes.size());
+//// ArrayList minimizedNewDeps = minimizeSeeds(newDeps);
+// holes.minimize();
+// System.out.println("after minimizing:\t" + holes.size());
+// deps.clear();
+// deps.addAll(holes);
+// holes.clear();
+// }
+// }
+
+ for (ColumnCollection maximalNonDependency : currentMaximalNonDependencies) {
+ ColumnCollection complement = maximalNonDependency.setCopy(currentRHSIndex).complement();
+ if (deps.isEmpty()) {
+ ColumnCollection emptyColumnIndices = new ColumnCollection(numberOfColumns);
+ for (int complementColumnIndex : complement.getSetBits()) {
+ deps.add(emptyColumnIndices.setCopy(complementColumnIndex));
+ }
+ } else {
+ for (ColumnCollection dep : deps) {
+ int[] setBits = complement.getSetBits();
+ for (int setBit = 0; setBit < setBits.length; setBit++) {
+ newDeps.add(dep.setCopy(setBits[setBit]));
+ }
+ }
+ // minimize newDeps
+ ArrayList<ColumnCollection> minimizedNewDeps = minimizeSeeds(newDeps);
+ deps.clear();
+ deps.addAll(minimizedNewDeps);
+ newDeps.clear();
+ }
+ }
+
+ // return only elements that aren't already covered by the minimal
+ // dependencies
+ Stack<Seed> remainingSeeds = new Stack<>();
+ deps.removeAll(currentMinimalDependencies);
+ for (ColumnCollection remainingSeed : deps) {
+ remainingSeeds.push(new Seed(remainingSeed));
+ }
+
+ return remainingSeeds;
+ }
+
+ private ArrayList<ColumnCollection> minimizeSeeds(ArrayList<ColumnCollection> seeds) {
+ long maxCardinality = 0;
+ TLongObjectHashMap<ArrayList<ColumnCollection>> seedsBySize = new TLongObjectHashMap<>(numberOfColumns);
+ for (ColumnCollection seed : seeds) {
+ long cardinalityOfSeed = seed.cardinality();
+ maxCardinality = Math.max(maxCardinality, cardinalityOfSeed);
+ seedsBySize.putIfAbsent(cardinalityOfSeed, new ArrayList<ColumnCollection>(seeds.size()/numberOfColumns));
+ seedsBySize.get(cardinalityOfSeed).add(seed);
+ }
+
+ for (long lowerBound = 1; lowerBound < maxCardinality; lowerBound++) {
+ ArrayList<ColumnCollection> lowerBoundSeeds = seedsBySize.get(lowerBound);
+ if (lowerBoundSeeds != null) {
+ for (long upperBound = maxCardinality; upperBound > lowerBound; upperBound--) {
+ ArrayList<ColumnCollection> upperBoundSeeds = seedsBySize.get(upperBound);
+ if (upperBoundSeeds != null) {
+ for (Iterator<ColumnCollection> lowerIt = lowerBoundSeeds.iterator(); lowerIt.hasNext();) {
+ ColumnCollection lowerSeed = lowerIt.next();
+ for (Iterator<ColumnCollection> upperIt = upperBoundSeeds.iterator(); upperIt.hasNext();) {
+ if (lowerSeed.isSubsetOf(upperIt.next())) {
+ upperIt.remove();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ ArrayList<ColumnCollection> minimizedSeeds = new ArrayList<>();
+ for (ArrayList<ColumnCollection> seedList : seedsBySize.valueCollection()) {
+ for (ColumnCollection seed : seedList) {
+ minimizedSeeds.add(seed);
+ }
+ }
+ return minimizedSeeds;
+ }
+
+ private Seed randomTake() {
+ if (!this.seeds.isEmpty()) {
+ return this.seeds.pop();
+ }
+ return null;
+ }
+
+ private Seed randomWalkStep(Seed currentSeed, int currentRHSIndex) {
+ Observation observationOfSeed = this.observations.get(currentSeed.getIndices());
+
+ if (observationOfSeed == Observation.CANDIDATE_MINIMAL_DEPENDENCY) {
+ THashSet<ColumnCollection> uncheckedSubsets = this.observations.getUncheckedMaximalSubsets(currentSeed.getIndices(), columnOrder);
+ THashSet<ColumnCollection> prunedNonDependencySubsets = nonDependencies.getPrunedSupersets(uncheckedSubsets);
+ for (ColumnCollection prunedNonDependencySubset : prunedNonDependencySubsets) {
+ observations.put(prunedNonDependencySubset, Observation.NON_DEPENDENCY);
+ }
+ uncheckedSubsets.removeAll(prunedNonDependencySubsets);
+ if (uncheckedSubsets.isEmpty() && prunedNonDependencySubsets.isEmpty()) {
+ observations.put(currentSeed.getIndices(), Observation.MINIMAL_DEPENDENCY);
+ minimalDependencies.addRHSColumn(currentSeed.getIndices(), currentRHSIndex);
+ } else if (!uncheckedSubsets.isEmpty()) {
+ ColumnCollection notRepresentedUncheckedSubset = uncheckedSubsets.iterator().next();
+ if (notRepresentedUncheckedSubset != null) {
+ trace.push(currentSeed);
+ return new Seed(notRepresentedUncheckedSubset);
+ }
+ }
+ } else if (observationOfSeed == Observation.CANDIDATE_MAXIMAL_NON_DEPENDENCY) {
+ THashSet<ColumnCollection> uncheckedSupersets = this.observations.getUncheckedMinimalSupersets(currentSeed.getIndices(), currentRHSIndex, columnOrder);
+ THashSet<ColumnCollection> prunedNonDependencySupersets = nonDependencies.getPrunedSupersets(uncheckedSupersets);
+ THashSet<ColumnCollection> prunedDependencySupersets = dependencies.getPrunedSubsets(uncheckedSupersets);
+ for (ColumnCollection prunedNonDependencySuperset : prunedNonDependencySupersets) {
+ observations.put(prunedNonDependencySuperset, Observation.NON_DEPENDENCY);
+ }
+ for (ColumnCollection prunedDependencySuperset : prunedDependencySupersets) {
+ observations.put(prunedDependencySuperset, Observation.DEPENDENCY);
+ }
+ uncheckedSupersets.removeAll(prunedDependencySupersets);
+ uncheckedSupersets.removeAll(prunedNonDependencySupersets);
+ if (uncheckedSupersets.isEmpty() && prunedNonDependencySupersets.isEmpty()) {
+ observations.put(currentSeed.getIndices(), Observation.MAXIMAL_NON_DEPENDENCY);
+ maximalNonDependencies.addRHSColumn(currentSeed.getIndices(), currentRHSIndex);
+ } else if (!uncheckedSupersets.isEmpty()) {
+ ColumnCollection notRepresentedUncheckedSuperset = uncheckedSupersets.iterator().next();
+ if (notRepresentedUncheckedSuperset != null) {
+ trace.push(currentSeed);
+ int additionalColumn = notRepresentedUncheckedSuperset.removeCopy(currentSeed.getIndices()).nextSetBit(0);
+ return new Seed(notRepresentedUncheckedSuperset, additionalColumn);
+ }
+ }
+ }
+ if (!this.trace.isEmpty()) {
+ Seed nextSeed = this.trace.pop();
+ return nextSeed;
+ }
+ return null;
+ }
+
+ public FunctionalDependencies getDependencies() {
+ return this.minimalDependencies;
+ }
}
\ No newline at end of file
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java
index 4a74fd5..722cd8e 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java
@@ -1,218 +1,218 @@
-package fdiscovery.columns;
-
-import java.util.BitSet;
-
-public class ColumnCollection extends BitSet implements Comparable<BitSet> {
-
- private static final long serialVersionUID = -5256272139963505719L;
-
- private int formatStringWidth;
- protected int numberOfColumns;
- protected int[] setBits;
-
- public ColumnCollection(int numberOfColumns ) {
- this.numberOfColumns = numberOfColumns;
- this.formatStringWidth = (int)Math.ceil(Math.log10(this.numberOfColumns));
- }
-
- public int[] getSetBits() {
- int[] setBits = new int[this.cardinality()];
-
- int bitIndex = 0;
- int currentArrayIndex = 0;
- while (bitIndex < this.numberOfColumns) {
- int currentNextSetBit = this.nextSetBit(bitIndex);
- if (currentNextSetBit != -1) {
- setBits[currentArrayIndex++] = currentNextSetBit;
- bitIndex = currentNextSetBit + 1;
- } else {
- bitIndex = this.numberOfColumns;
- }
- }
-
- return setBits;
- }
-
- public boolean isAtomic() {
- return this.cardinality() == 1;
- }
-
- public ColumnCollection addColumn(int columnIndex) {
- ColumnCollection copy = (ColumnCollection) this.clone();
- copy.set(columnIndex);
-
- return copy;
- }
-
- public ColumnCollection andCopy(ColumnCollection other) {
- ColumnCollection copy = (ColumnCollection)this.clone();
- copy.and(other);
-
- return copy;
- }
-
- public ColumnCollection clearCopy(int startBit) {
- ColumnCollection copy = (ColumnCollection)this.clone();
- copy.clear(startBit);
-
- return copy;
- }
-
- public ColumnCollection clearAllCopy() {
- ColumnCollection copy = (ColumnCollection)this.clone();
- copy.clear(0, this.numberOfColumns);
-
- return copy;
- }
-
- public ColumnCollection andNotCopy(ColumnCollection other) {
- ColumnCollection copy = (ColumnCollection)this.clone();
- copy.andNot(other);
-
- return copy;
- }
-
- public ColumnCollection removeCopy(ColumnCollection other) {
- ColumnCollection copy = (ColumnCollection)this.clone();
- copy.andNot(other);
-
- return copy;
- }
-
- public ColumnCollection orCopy(ColumnCollection other) {
- ColumnCollection copy = (ColumnCollection)this.clone();
- copy.or(other);
-
- return copy;
- }
-
- public ColumnCollection setCopy(int index) {
- ColumnCollection copy = (ColumnCollection)this.clone();
- copy.set(index);
-
- return copy;
- }
-
- public ColumnCollection xorCopy(ColumnCollection other) {
- ColumnCollection copy = (ColumnCollection)this.clone();
- copy.xor(other);
-
- return copy;
- }
-
- public ColumnCollection complementCopy() {
- ColumnCollection copy = (ColumnCollection)this.clone();
- copy.flip(0, this.numberOfColumns);
-
- return copy;
- }
-
- public ColumnCollection complement() {
- this.flip(0, this.numberOfColumns);
- return this;
- }
-
- public boolean isSubsetOf(ColumnCollection other) {
- return this.unionCount(other) == other.cardinality();
- }
-
- public boolean isSupersetOf(ColumnCollection other) {
- return this.unionCount(other) == this.cardinality();
-
- }
-
- public boolean isProperSubsetOf(ColumnCollection other) {
- long cardinality = this.cardinality();
- long otherCardinality = other.cardinality();
- if (cardinality != otherCardinality) {
- if (this.unionCount(other) == otherCardinality) {
- return true;
- }
- }
- return false;
- }
-
-
- public boolean isProperSupersetOf(ColumnCollection other) {
- long cardinality = this.cardinality();
- long otherCardinality = other.cardinality();
- if (cardinality != otherCardinality) {
- if (this.unionCount(other) == cardinality) {
- return true;
- }
- }
- return false;
- }
-
- public int unionCount(ColumnCollection other) {
- ColumnCollection union = (ColumnCollection) this.clone();
- union.and(other);
- return union.cardinality();
- }
-
- public boolean isSubsetOrSupersetOf(ColumnCollection other) {
- return isSubsetOf(other) || isSupersetOf(other);
- }
-
- public int getNumberOfColumns() {
- return this.numberOfColumns;
- }
-
- public long getMostRightBit() {
- int bitIndex = 0;
- while (bitIndex < this.numberOfColumns) {
- int currentNextSetBit = this.nextSetBit(bitIndex);
- if (currentNextSetBit != -1) {
- bitIndex = currentNextSetBit + 1;
- } else {
- return bitIndex - 1;
- }
- }
- return bitIndex;
- }
-
- public ColumnCollection removeColumnCopy(int columnIndex) {
- ColumnCollection copy = (ColumnCollection) this.clone();
- copy.clear(columnIndex);
-
- return copy;
- }
-
- @Override
- public int compareTo(BitSet other) {
- ColumnCollection copy = (ColumnCollection) this.clone();
- copy.xor(other);
- int lowestBit = copy.nextSetBit(0);
- if (lowestBit == -1) {
- return 0;
- } else if (this.get(lowestBit)) {
- return -1;
- } else {
- return 1;
- }
- }
-
- public String toString() {
- StringBuilder outputBuilder = new StringBuilder();
- if (this.cardinality() > 0) {
- for (int columnIndex : this.getSetBits()) {
- outputBuilder.append(String.format("%0" + formatStringWidth + "d,", Integer.valueOf(columnIndex)));
-
- }
- } else {
- outputBuilder.append("emptyset");
- }
-
- return outputBuilder.toString();
- }
-
- public void remove(ColumnCollection other) {
- this.andNot(other);
- }
-
- public static int intersectionCount(ColumnCollection set1, ColumnCollection set2) {
- ColumnCollection intersection = (ColumnCollection) set1.clone();
- intersection.and(set2);
- return intersection.cardinality();
- }
-}
+package fdiscovery.columns;
+
+import java.util.BitSet;
+
+public class ColumnCollection extends BitSet implements Comparable<BitSet> {
+
+ private static final long serialVersionUID = -5256272139963505719L;
+
+ private int formatStringWidth;
+ protected int numberOfColumns;
+ protected int[] setBits;
+
+ public ColumnCollection(int numberOfColumns ) {
+ this.numberOfColumns = numberOfColumns;
+ this.formatStringWidth = (int)Math.ceil(Math.log10(this.numberOfColumns));
+ }
+
+ public int[] getSetBits() {
+ int[] setBits = new int[this.cardinality()];
+
+ int bitIndex = 0;
+ int currentArrayIndex = 0;
+ while (bitIndex < this.numberOfColumns) {
+ int currentNextSetBit = this.nextSetBit(bitIndex);
+ if (currentNextSetBit != -1) {
+ setBits[currentArrayIndex++] = currentNextSetBit;
+ bitIndex = currentNextSetBit + 1;
+ } else {
+ bitIndex = this.numberOfColumns;
+ }
+ }
+
+ return setBits;
+ }
+
+ public boolean isAtomic() {
+ return this.cardinality() == 1;
+ }
+
+ public ColumnCollection addColumn(int columnIndex) {
+ ColumnCollection copy = (ColumnCollection) this.clone();
+ copy.set(columnIndex);
+
+ return copy;
+ }
+
+ public ColumnCollection andCopy(ColumnCollection other) {
+ ColumnCollection copy = (ColumnCollection)this.clone();
+ copy.and(other);
+
+ return copy;
+ }
+
+ public ColumnCollection clearCopy(int startBit) {
+ ColumnCollection copy = (ColumnCollection)this.clone();
+ copy.clear(startBit);
+
+ return copy;
+ }
+
+ public ColumnCollection clearAllCopy() {
+ ColumnCollection copy = (ColumnCollection)this.clone();
+ copy.clear(0, this.numberOfColumns);
+
+ return copy;
+ }
+
+ public ColumnCollection andNotCopy(ColumnCollection other) {
+ ColumnCollection copy = (ColumnCollection)this.clone();
+ copy.andNot(other);
+
+ return copy;
+ }
+
+ public ColumnCollection removeCopy(ColumnCollection other) {
+ ColumnCollection copy = (ColumnCollection)this.clone();
+ copy.andNot(other);
+
+ return copy;
+ }
+
+ public ColumnCollection orCopy(ColumnCollection other) {
+ ColumnCollection copy = (ColumnCollection)this.clone();
+ copy.or(other);
+
+ return copy;
+ }
+
+ public ColumnCollection setCopy(int index) {
+ ColumnCollection copy = (ColumnCollection)this.clone();
+ copy.set(index);
+
+ return copy;
+ }
+
+ public ColumnCollection xorCopy(ColumnCollection other) {
+ ColumnCollection copy = (ColumnCollection)this.clone();
+ copy.xor(other);
+
+ return copy;
+ }
+
+ public ColumnCollection complementCopy() {
+ ColumnCollection copy = (ColumnCollection)this.clone();
+ copy.flip(0, this.numberOfColumns);
+
+ return copy;
+ }
+
+ public ColumnCollection complement() {
+ this.flip(0, this.numberOfColumns);
+ return this;
+ }
+
+ public boolean isSubsetOf(ColumnCollection other) {
+ return this.unionCount(other) == other.cardinality();
+ }
+
+ public boolean isSupersetOf(ColumnCollection other) {
+ return this.unionCount(other) == this.cardinality();
+
+ }
+
+ public boolean isProperSubsetOf(ColumnCollection other) {
+ long cardinality = this.cardinality();
+ long otherCardinality = other.cardinality();
+ if (cardinality != otherCardinality) {
+ if (this.unionCount(other) == otherCardinality) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+
+ public boolean isProperSupersetOf(ColumnCollection other) {
+ long cardinality = this.cardinality();
+ long otherCardinality = other.cardinality();
+ if (cardinality != otherCardinality) {
+ if (this.unionCount(other) == cardinality) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public int unionCount(ColumnCollection other) {
+ ColumnCollection union = (ColumnCollection) this.clone();
+ union.or(other);
+ return union.cardinality();
+ }
+
+ public boolean isSubsetOrSupersetOf(ColumnCollection other) {
+ return isSubsetOf(other) || isSupersetOf(other);
+ }
+
+ public int getNumberOfColumns() {
+ return this.numberOfColumns;
+ }
+
+ public long getMostRightBit() {
+ int bitIndex = 0;
+ while (bitIndex < this.numberOfColumns) {
+ int currentNextSetBit = this.nextSetBit(bitIndex);
+ if (currentNextSetBit != -1) {
+ bitIndex = currentNextSetBit + 1;
+ } else {
+ return bitIndex - 1;
+ }
+ }
+ return bitIndex;
+ }
+
+ public ColumnCollection removeColumnCopy(int columnIndex) {
+ ColumnCollection copy = (ColumnCollection) this.clone();
+ copy.clear(columnIndex);
+
+ return copy;
+ }
+
+ @Override
+ public int compareTo(BitSet other) {
+ ColumnCollection copy = (ColumnCollection) this.clone();
+ copy.xor(other);
+ int lowestBit = copy.nextSetBit(0);
+ if (lowestBit == -1) {
+ return 0;
+ } else if (this.get(lowestBit)) {
+ return -1;
+ } else {
+ return 1;
+ }
+ }
+
+ public String toString() {
+ StringBuilder outputBuilder = new StringBuilder();
+ if (this.cardinality() > 0) {
+ for (int columnIndex : this.getSetBits()) {
+ outputBuilder.append(String.format("%0" + formatStringWidth + "d,", Integer.valueOf(columnIndex)));
+
+ }
+ } else {
+ outputBuilder.append("emptyset");
+ }
+
+ return outputBuilder.toString();
+ }
+
+ public void remove(ColumnCollection other) {
+ this.andNot(other);
+ }
+
+ public static int intersectionCount(ColumnCollection set1, ColumnCollection set2) {
+ ColumnCollection intersection = (ColumnCollection) set1.clone();
+ intersection.and(set2);
+ return intersection.cardinality();
+ }
+}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java
index 9b5cda4..44a6445 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java
@@ -1,14 +1,14 @@
-package fdiscovery.columns;
-
-
-public class DifferenceSet extends ColumnCollection {
-
- private static final long serialVersionUID = -5174627424398542681L;
-
- public DifferenceSet(AgreeSet agreeSet) {
- super(agreeSet.getNumberOfColumns());
-
- this.or(agreeSet);
- this.flip(0, this.numberOfColumns);
- }
-}
+package fdiscovery.columns;
+
+
+public class DifferenceSet extends ColumnCollection {
+
+ private static final long serialVersionUID = -5174627424398542681L;
+
+ public DifferenceSet(AgreeSet agreeSet) {
+ super(agreeSet.getNumberOfColumns());
+
+ this.or(agreeSet);
+ this.flip(0, this.numberOfColumns);
+ }
+}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java
index 06b713b..7136090 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java
@@ -1,26 +1,26 @@
-package fdiscovery.columns;
-
-import java.util.ArrayList;
-
-
-public class Path extends ColumnCollection {
-
- private static final long serialVersionUID = -6451347203736964695L;
-
- public Path(int numberOfColumns) {
- super(numberOfColumns);
- }
-
- public ArrayList<Path> getMaximalSubsets() {
- ArrayList<Path> maximalSubsetPaths = new ArrayList<>();
-
- if (this.isEmpty()) {
- return new ArrayList<>();
- }
- for (int columnIndex : this.getSetBits()) {
- maximalSubsetPaths.add((Path)this.removeColumnCopy(columnIndex));
- }
-
- return maximalSubsetPaths;
- }
-}
+package fdiscovery.columns;
+
+import java.util.ArrayList;
+
+
+public class Path extends ColumnCollection {
+
+ private static final long serialVersionUID = -6451347203736964695L;
+
+ public Path(int numberOfColumns) {
+ super(numberOfColumns);
+ }
+
+ public ArrayList<Path> getMaximalSubsets() {
+ ArrayList<Path> maximalSubsetPaths = new ArrayList<>();
+
+ if (this.isEmpty()) {
+ return new ArrayList<>();
+ }
+ for (int columnIndex : this.getSetBits()) {
+ maximalSubsetPaths.add((Path)this.removeColumnCopy(columnIndex));
+ }
+
+ return maximalSubsetPaths;
+ }
+}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java
index 9fd3d05..9fee75a 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java
@@ -1,62 +1,62 @@
-package fdiscovery.columns;
-
-import fdiscovery.partitions.FileBasedPartition;
-import fdiscovery.partitions.Partition;
-
-public class Seed implements Comparable<Seed> {
-
- private ColumnCollection indices;
- private int additionalColumnIndex;
- private double distinctiveness;
-
- public Seed(Partition a, FileBasedPartition b) {
- this.indices = a.getIndices().orCopy(b.getIndices());
- this.additionalColumnIndex = b.getIndex();
- this.distinctiveness = Partition.estimateDistinctiveness(a, b);
- }
-
- // inverse order
- @Override
- public int compareTo(Seed o) {
- if (this.distinctiveness != o.distinctiveness) {
- if (o.distinctiveness - this.distinctiveness < 0) {
- return -1;
- }
- return 1;
- }
- return this.indices.compareTo(o.indices);
- }
-
- @Override
- public boolean equals(Object o) {
- if (o == null) {
- return false;
- }
- if (o == this) {
- return true;
- }
- if (!(o instanceof Seed)) {
- return false;
- }
- Seed otherSeed = (Seed) o;
- return this.distinctiveness == otherSeed.distinctiveness && this.indices.compareTo(otherSeed.indices) == 0;
- }
-
- public ColumnCollection getBaseIndices() {
- return this.indices.removeColumnCopy(additionalColumnIndex);
- }
-
- public ColumnCollection getIndices() {
- return this.indices;
- }
-
- public int getAdditionalColumnIndex() {
- return this.additionalColumnIndex;
- }
-
- public String toString() {
- StringBuilder outputBuilder = new StringBuilder();
- outputBuilder.append(String.format("Seed: [%s]\t%f", this.indices, Double.valueOf(this.distinctiveness)));
- return outputBuilder.toString();
- }
-}
+package fdiscovery.columns;
+
+import fdiscovery.partitions.FileBasedPartition;
+import fdiscovery.partitions.Partition;
+
+public class Seed implements Comparable<Seed> {
+
+ private ColumnCollection indices;
+ private int additionalColumnIndex;
+ private double distinctiveness;
+
+ public Seed(Partition a, FileBasedPartition b) {
+ this.indices = a.getIndices().orCopy(b.getIndices());
+ this.additionalColumnIndex = b.getIndex();
+ this.distinctiveness = Partition.estimateDistinctiveness(a, b);
+ }
+
+ // inverse order
+ @Override
+ public int compareTo(Seed o) {
+ if (this.distinctiveness != o.distinctiveness) {
+ if (o.distinctiveness - this.distinctiveness < 0) {
+ return -1;
+ }
+ return 1;
+ }
+ return this.indices.compareTo(o.indices);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (o == null) {
+ return false;
+ }
+ if (o == this) {
+ return true;
+ }
+ if (!(o instanceof Seed)) {
+ return false;
+ }
+ Seed otherSeed = (Seed) o;
+ return this.distinctiveness == otherSeed.distinctiveness && this.indices.compareTo(otherSeed.indices) == 0;
+ }
+
+ public ColumnCollection getBaseIndices() {
+ return this.indices.removeColumnCopy(additionalColumnIndex);
+ }
+
+ public ColumnCollection getIndices() {
+ return this.indices;
+ }
+
+ public int getAdditionalColumnIndex() {
+ return this.additionalColumnIndex;
+ }
+
+ public String toString() {
+ StringBuilder outputBuilder = new StringBuilder();
+ outputBuilder.append(String.format("Seed: [%s]\t%f", this.indices, Double.valueOf(this.distinctiveness)));
+ return outputBuilder.toString();
+ }
+}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java
index 5e3c2c0..2cb0e39 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java
@@ -1,50 +1,50 @@
-package fdiscovery.equivalence;
-
-import java.util.HashSet;
-import java.util.Set;
-
-public class EquivalenceGroupHashSet extends HashSet<Integer> implements Comparable<EquivalenceGroupHashSet>, Equivalence {
-
- private static final long serialVersionUID = 8411462245069900864L;
-
- private int identifier;
-
- public EquivalenceGroupHashSet() {
- this.identifier = Equivalence.unassignedIdentifier;
- }
-
- public EquivalenceGroupHashSet(int identifier) {
- this.identifier = identifier;
- }
-
- @Override
- public int compareTo(EquivalenceGroupHashSet o) {
- if (this.size() != o.size()) {
- return this.size() - o.size();
- }
- return this.identifier - o.identifier;
- }
-
- @Override
- public int getIdentifier() {
- return this.identifier;
- }
-
- @Override
-	public <T extends Set<Integer>> boolean isProperSubset(T other) {
- if (this.size() >= other.size()) {
- return false;
- }
-
- return other.containsAll(this);
- }
-
- @Override
- public void add(int value) {
- if (this.identifier == Equivalence.unassignedIdentifier) {
- this.identifier = value;
- }
-
- super.add(Integer.valueOf(value));
- }
+package fdiscovery.equivalence;
+
+import java.util.HashSet;
+import java.util.Set;
+
+public class EquivalenceGroupHashSet extends HashSet<Integer> implements Comparable<EquivalenceGroupHashSet>, Equivalence {
+
+ private static final long serialVersionUID = 8411462245069900864L;
+
+ private int identifier;
+
+ public EquivalenceGroupHashSet() {
+ this.identifier = Equivalence.unassignedIdentifier;
+ }
+
+ public EquivalenceGroupHashSet(int identifier) {
+ this.identifier = identifier;
+ }
+
+ @Override
+ public int compareTo(EquivalenceGroupHashSet o) {
+ if (this.size() != o.size()) {
+ return this.size() - o.size();
+ }
+ return this.identifier - o.identifier;
+ }
+
+ @Override
+ public int getIdentifier() {
+ return this.identifier;
+ }
+
+ @Override
+	public <T extends Set<Integer>> boolean isProperSubset(T other) {
+ if (this.size() >= other.size()) {
+ return false;
+ }
+
+ return other.containsAll(this);
+ }
+
+ @Override
+ public void add(int value) {
+ if (this.identifier == Equivalence.unassignedIdentifier) {
+ this.identifier = value;
+ }
+
+ super.add(Integer.valueOf(value));
+ }
}
\ No newline at end of file
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java
index ce33016..f89ce2d 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java
@@ -43,6 +43,6 @@ public int compareTo(EquivalenceGroupTIntHashSet o) {
if (this.size() != o.size()) {
return this.size() - o.size();
}
- return this.identifier - o.identifier;
+ return this.identifier - o.identifier;
}
}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java
index 05ead74..35cc7c9 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java
@@ -1,75 +1,75 @@
-package fdiscovery.fastfds;
-
-import gnu.trove.map.hash.TIntIntHashMap;
-
-import java.util.ArrayList;
-import java.util.Collections;
-
-import fdiscovery.columns.DifferenceSet;
-import fdiscovery.columns.DifferenceSets;
-
-public class PartialOrder extends ArrayList<CoverOrder> {
-
- private static final long serialVersionUID = -4312148937513750522L;
-
- public PartialOrder(DifferenceSets differenceSets) {
- TIntIntHashMap orderMap = new TIntIntHashMap();
-
- for (DifferenceSet differenceSet : differenceSets) {
- // increase the cover count for set columns
- int bitIndex = 0;
- while (bitIndex < differenceSet.getNumberOfColumns()) {
- int currentNextSetBit = differenceSet.nextSetBit(bitIndex);
- if (currentNextSetBit != -1) {
- bitIndex = currentNextSetBit + 1;
- orderMap.putIfAbsent(currentNextSetBit, 0);
- orderMap.increment(currentNextSetBit);
- } else {
- bitIndex = differenceSet.getNumberOfColumns();
- }
- }
- }
-
- for (int index : orderMap.keys()) {
- this.add(new CoverOrder(index, orderMap.get(index)));
- }
-
- Collections.sort(this, Collections.reverseOrder());
-
- }
-
- public PartialOrder(DifferenceSets differenceSets, int columnIndexToSkip) {
- TIntIntHashMap orderMap = new TIntIntHashMap();
-
- for (DifferenceSet differenceSet : differenceSets) {
- // increase the cover count for set columns
- int bitIndex = columnIndexToSkip;
- while (bitIndex < differenceSet.getNumberOfColumns()) {
- int currentNextSetBit = differenceSet.nextSetBit(bitIndex);
- if (currentNextSetBit != -1) {
- bitIndex = currentNextSetBit + 1;
- orderMap.putIfAbsent(currentNextSetBit, 0);
- orderMap.increment(currentNextSetBit);
- } else {
- bitIndex = differenceSet.getNumberOfColumns();
- }
- }
- }
-
- for (int index : orderMap.keys()) {
- this.add(new CoverOrder(index, orderMap.get(index)));
- }
-
- Collections.sort(this, Collections.reverseOrder());
-
- }
-
-	public ArrayList<Integer> getOrderedColumns() {
-		ArrayList<Integer> orderedColumns = new ArrayList<>();
- for (CoverOrder order : this) {
- orderedColumns.add(Integer.valueOf(order.getColumnIndex()));
- }
-
- return orderedColumns;
- }
-}
+package fdiscovery.fastfds;
+
+import gnu.trove.map.hash.TIntIntHashMap;
+
+import java.util.ArrayList;
+import java.util.Collections;
+
+import fdiscovery.columns.DifferenceSet;
+import fdiscovery.columns.DifferenceSets;
+
+public class PartialOrder extends ArrayList<CoverOrder> {
+
+ private static final long serialVersionUID = -4312148937513750522L;
+
+ public PartialOrder(DifferenceSets differenceSets) {
+ TIntIntHashMap orderMap = new TIntIntHashMap();
+
+ for (DifferenceSet differenceSet : differenceSets) {
+ // increase the cover count for set columns
+ int bitIndex = 0;
+ while (bitIndex < differenceSet.getNumberOfColumns()) {
+ int currentNextSetBit = differenceSet.nextSetBit(bitIndex);
+ if (currentNextSetBit != -1) {
+ bitIndex = currentNextSetBit + 1;
+ orderMap.putIfAbsent(currentNextSetBit, 0);
+ orderMap.increment(currentNextSetBit);
+ } else {
+ bitIndex = differenceSet.getNumberOfColumns();
+ }
+ }
+ }
+
+ for (int index : orderMap.keys()) {
+ this.add(new CoverOrder(index, orderMap.get(index)));
+ }
+
+ Collections.sort(this, Collections.reverseOrder());
+
+ }
+
+ public PartialOrder(DifferenceSets differenceSets, int columnIndexToSkip) {
+ TIntIntHashMap orderMap = new TIntIntHashMap();
+
+ for (DifferenceSet differenceSet : differenceSets) {
+ // increase the cover count for set columns
+ int bitIndex = columnIndexToSkip;
+ while (bitIndex < differenceSet.getNumberOfColumns()) {
+ int currentNextSetBit = differenceSet.nextSetBit(bitIndex);
+ if (currentNextSetBit != -1) {
+ bitIndex = currentNextSetBit + 1;
+ orderMap.putIfAbsent(currentNextSetBit, 0);
+ orderMap.increment(currentNextSetBit);
+ } else {
+ bitIndex = differenceSet.getNumberOfColumns();
+ }
+ }
+ }
+
+ for (int index : orderMap.keys()) {
+ this.add(new CoverOrder(index, orderMap.get(index)));
+ }
+
+ Collections.sort(this, Collections.reverseOrder());
+
+ }
+
+	public ArrayList<Integer> getOrderedColumns() {
+		ArrayList<Integer> orderedColumns = new ArrayList<>();
+ for (CoverOrder order : this) {
+ orderedColumns.add(Integer.valueOf(order.getColumnIndex()));
+ }
+
+ return orderedColumns;
+ }
+}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java
index e8e89c9..26dd8c2 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java
@@ -1,219 +1,219 @@
-package fdiscovery.fastfds.runner;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileWriter;
-import java.io.IOException;
-
-import org.apache.commons.cli.CommandLine;
-
-import fdiscovery.columns.AgreeSets;
-import fdiscovery.columns.ColumnCollection;
-import fdiscovery.columns.DifferenceSets;
-import fdiscovery.columns.Path;
-
-import com.rits.cloning.Cloner;
-
-import fdiscovery.partitions.StrippedPartitions;
-import fdiscovery.preprocessing.SVFileProcessor;
-import fdiscovery.fastfds.EquivalenceClasses;
-import fdiscovery.fastfds.MaximalEquivalenceClasses;
-import fdiscovery.fastfds.PartialOrder;
-import fdiscovery.general.CLIParserMiner;
-import fdiscovery.general.ColumnFiles;
-import fdiscovery.general.FunctionalDependencies;
-import fdiscovery.general.Miner;
-
-public class FastFDs extends Miner {
-
- private int numberOfColumns;
- private int numberOfRows;
- private FunctionalDependencies minimalDependencies;
- private DifferenceSets differenceSets;
-
- @SuppressWarnings("unused")
- public static void main2(String[] args) {
- createColumDirectory();
- createResultDirectory();
-
- File source = new File(Miner.input);
- SVFileProcessor inputFileProcessor = null;
- try {
- long timeStart = System.currentTimeMillis();
-
- inputFileProcessor = new SVFileProcessor(source);
- inputFileProcessor.init();
- System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter());
- System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns());
- System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows());
- inputFileProcessor.createColumnFiles();
- FastFDs fastFDRunner = new FastFDs(inputFileProcessor);
-
- fastFDRunner.run();
- System.out.println(String.format("Dependencies: %d.", Integer.valueOf(fastFDRunner.minimalDependencies.getCount())));
- long timeFindFDs = System.currentTimeMillis();
- System.out.println("Total time:\t" + (timeFindFDs - timeStart)/1000 + "s");
- System.out.println(fastFDRunner.getDependencies());
- } catch (FileNotFoundException e) {
- System.out.println("The input file could not be found.");
- } catch (IOException e) {
- System.out.println("The input reader could not be reset.");
- }
- }
-
- public static void main(String[] args) {
- CLIParserMiner parser = new CLIParserMiner();
- CommandLine cli = parser.parse(args);
- String inputFilename = new String();
- String columnFileDirectory = new String();
- String resultFile = new String();
- int numberOfColumns = 0;
- int numberOfRows = 0;
-
- if (cli.hasOption("file")) {
- inputFilename = cli.getOptionValue("file");
- }
- if (cli.hasOption("input")) {
- columnFileDirectory = cli.getOptionValue("input");
- }
- if (cli.hasOption("result")) {
- resultFile = cli.getOptionValue("result");
- }
- if (cli.hasOption("columns")) {
- numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue();
- }
- if (cli.hasOption("rows")) {
- numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue();
- }
- ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows);
- long timeStart = System.currentTimeMillis();
- try {
- FastFDs runner = new FastFDs(columnFiles, numberOfRows);
- runner.run();
- long timeEnd = System.currentTimeMillis();
- runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename);
- } catch(OutOfMemoryError e) {
- System.exit(Miner.STATUS_OOM);
- }
- System.exit(0);
- }
-
- private void writeOutputSuccessful(String outputFile, long time, String inputFileName) {
- String timeString = (time != -1)? String.format("%.1f", Double.valueOf((double)(time) / 1000)) : "-1";
-
- StringBuilder outputBuilder = new StringBuilder();
- if (!inputFileName.isEmpty()) {
- outputBuilder.append(String.format("%s\t", inputFileName));
- }
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows)));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns)));
- outputBuilder.append(String.format("%s\t", timeString));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount())));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(0)));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(0)));
- outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory())));
- outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false)));
-
- try {
- BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true));
- resultFileWriter.write(outputBuilder.toString());
- System.out.print(outputBuilder.toString());
- resultFileWriter.close();
- } catch (IOException e) {
- System.out.println("Couldn't write output.");
- }
- }
-
- public FastFDs(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError {
- this.minimalDependencies = new FunctionalDependencies();
- this.numberOfColumns = columnFiles.getNumberOfColumns();
- this.numberOfRows = numberOfRows;
-
- StrippedPartitions strippedPartitions = new StrippedPartitions(columnFiles);
- EquivalenceClasses equivalenceClasses = new EquivalenceClasses(strippedPartitions);
- MaximalEquivalenceClasses maximalEquivalenceClasses = new MaximalEquivalenceClasses(strippedPartitions);
- strippedPartitions.clear();
- AgreeSets agreeSets = new AgreeSets(maximalEquivalenceClasses, equivalenceClasses, this.numberOfColumns, this.numberOfRows);
- maximalEquivalenceClasses.clear();
- equivalenceClasses.clear();
- this.differenceSets = new DifferenceSets(agreeSets);
- agreeSets.clear();
- }
-
- public FastFDs(SVFileProcessor table) throws OutOfMemoryError {
- this.minimalDependencies = new FunctionalDependencies();
- this.numberOfColumns = table.getNumberOfColumns();
- this.numberOfRows = table.getNumberOfRows();
-
- ColumnFiles columnFiles = table.getColumnFiles();
- StrippedPartitions strippedPartitions = new StrippedPartitions(columnFiles);
- EquivalenceClasses equivalenceClasses = new EquivalenceClasses(strippedPartitions);
- MaximalEquivalenceClasses maximalEquivalenceClasses = new MaximalEquivalenceClasses(strippedPartitions);
- strippedPartitions.clear();
- AgreeSets agreeSets = new AgreeSets(maximalEquivalenceClasses, equivalenceClasses, this.numberOfColumns, this.numberOfRows);
- maximalEquivalenceClasses.clear();
- equivalenceClasses.clear();
- this.differenceSets = new DifferenceSets(agreeSets);
- agreeSets.clear();
- }
-
- public void run() throws OutOfMemoryError {
- int numberOfColumns = this.numberOfColumns;
-
- DifferenceSets[] differenceSetsModulo = this.differenceSets.allModulo(this.numberOfColumns);
- for (int rhsIndex = 0; rhsIndex < numberOfColumns; rhsIndex++) {
- DifferenceSets orig = differenceSetsModulo[rhsIndex];
- Cloner cloner = new Cloner();
- DifferenceSets uncovered = cloner.deepClone(orig);
- if (orig.isEmpty()) {
- ColumnCollection lhs = new ColumnCollection(this.numberOfColumns);
-
- for (int lhsIndex : lhs.setCopy(rhsIndex).complement().getSetBits()) {
- this.minimalDependencies.addRHSColumn(lhs.setCopy(lhsIndex), rhsIndex);
- }
- }
- else if (!orig.containsEmptySet()) {
- PartialOrder currentOrder = new PartialOrder(orig);
- Path path = new Path(numberOfColumns);
- findCovers(rhsIndex, orig, uncovered, path, currentOrder);
- }
- }
- }
-
- public void findCovers(int columnIndex, DifferenceSets orig, DifferenceSets uncovered, Path currentPath, PartialOrder currentOrder) {
- // no dependencies here
- if (currentOrder.isEmpty() && !uncovered.isEmpty()) {
- return;
- }
-
- if (uncovered.isEmpty()) {
- if (!orig.maximumSubsetCoversDifferenceSet(currentPath)) {
- this.minimalDependencies.addRHSColumn(currentPath, columnIndex);
- } else {
- // dependency not minimal
- return;
- }
- }
-
- // RECURSIVE CASE
- for (int remainingColumn : currentOrder.getOrderedColumns()) {
- DifferenceSets nextDifferenceSets = uncovered.removeCovered(remainingColumn);
- PartialOrder nextOrder = new PartialOrder(nextDifferenceSets, remainingColumn);
- Path nextPath = (Path) currentPath.addColumn(remainingColumn);
-
- nextPath.addColumn(remainingColumn);
- findCovers(columnIndex, orig, nextDifferenceSets, nextPath, nextOrder);
- }
- }
-
- public FunctionalDependencies getDependencies() {
- return this.minimalDependencies;
- }
-}
+package fdiscovery.fastfds.runner;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileWriter;
+import java.io.IOException;
+
+import org.apache.commons.cli.CommandLine;
+
+import fdiscovery.columns.AgreeSets;
+import fdiscovery.columns.ColumnCollection;
+import fdiscovery.columns.DifferenceSets;
+import fdiscovery.columns.Path;
+
+import com.rits.cloning.Cloner;
+
+import fdiscovery.partitions.StrippedPartitions;
+import fdiscovery.preprocessing.SVFileProcessor;
+import fdiscovery.fastfds.EquivalenceClasses;
+import fdiscovery.fastfds.MaximalEquivalenceClasses;
+import fdiscovery.fastfds.PartialOrder;
+import fdiscovery.general.CLIParserMiner;
+import fdiscovery.general.ColumnFiles;
+import fdiscovery.general.FunctionalDependencies;
+import fdiscovery.general.Miner;
+
+public class FastFDs extends Miner {
+
+ private int numberOfColumns;
+ private int numberOfRows;
+ private FunctionalDependencies minimalDependencies;
+ private DifferenceSets differenceSets;
+
+ @SuppressWarnings("unused")
+ public static void main2(String[] args) {
+ createColumDirectory();
+ createResultDirectory();
+
+ File source = new File(Miner.input);
+ SVFileProcessor inputFileProcessor = null;
+ try {
+ long timeStart = System.currentTimeMillis();
+
+ inputFileProcessor = new SVFileProcessor(source);
+ inputFileProcessor.init();
+ System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter());
+ System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns());
+ System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows());
+ inputFileProcessor.createColumnFiles();
+ FastFDs fastFDRunner = new FastFDs(inputFileProcessor);
+
+ fastFDRunner.run();
+ System.out.println(String.format("Dependencies: %d.", Integer.valueOf(fastFDRunner.minimalDependencies.getCount())));
+ long timeFindFDs = System.currentTimeMillis();
+ System.out.println("Total time:\t" + (timeFindFDs - timeStart)/1000 + "s");
+ System.out.println(fastFDRunner.getDependencies());
+ } catch (FileNotFoundException e) {
+ System.out.println("The input file could not be found.");
+ } catch (IOException e) {
+ System.out.println("The input reader could not be reset.");
+ }
+ }
+
+ public static void main(String[] args) {
+ CLIParserMiner parser = new CLIParserMiner();
+ CommandLine cli = parser.parse(args);
+ String inputFilename = new String();
+ String columnFileDirectory = new String();
+ String resultFile = new String();
+ int numberOfColumns = 0;
+ int numberOfRows = 0;
+
+ if (cli.hasOption("file")) {
+ inputFilename = cli.getOptionValue("file");
+ }
+ if (cli.hasOption("input")) {
+ columnFileDirectory = cli.getOptionValue("input");
+ }
+ if (cli.hasOption("result")) {
+ resultFile = cli.getOptionValue("result");
+ }
+ if (cli.hasOption("columns")) {
+ numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue();
+ }
+ if (cli.hasOption("rows")) {
+ numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue();
+ }
+ ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows);
+ long timeStart = System.currentTimeMillis();
+ try {
+ FastFDs runner = new FastFDs(columnFiles, numberOfRows);
+ runner.run();
+ long timeEnd = System.currentTimeMillis();
+ runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename);
+ } catch(OutOfMemoryError e) {
+ System.exit(Miner.STATUS_OOM);
+ }
+ System.exit(0);
+ }
+
+ private void writeOutputSuccessful(String outputFile, long time, String inputFileName) {
+ String timeString = (time != -1)? String.format("%.1f", Double.valueOf((double)(time) / 1000)) : "-1";
+
+ StringBuilder outputBuilder = new StringBuilder();
+ if (!inputFileName.isEmpty()) {
+ outputBuilder.append(String.format("%s\t", inputFileName));
+ }
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows)));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns)));
+ outputBuilder.append(String.format("%s\t", timeString));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount())));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2))));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3))));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4))));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5))));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6))));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5))));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(0)));
+ outputBuilder.append(String.format("%d\t", Integer.valueOf(0)));
+ outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory())));
+ outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false)));
+
+ try {
+ BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true));
+ resultFileWriter.write(outputBuilder.toString());
+ System.out.print(outputBuilder.toString());
+ resultFileWriter.close();
+ } catch (IOException e) {
+ System.out.println("Couldn't write output.");
+ }
+ }
+
+ public FastFDs(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError {
+ this.minimalDependencies = new FunctionalDependencies();
+ this.numberOfColumns = columnFiles.getNumberOfColumns();
+ this.numberOfRows = numberOfRows;
+
+ StrippedPartitions strippedPartitions = new StrippedPartitions(columnFiles);
+ EquivalenceClasses equivalenceClasses = new EquivalenceClasses(strippedPartitions);
+ MaximalEquivalenceClasses maximalEquivalenceClasses = new MaximalEquivalenceClasses(strippedPartitions);
+ strippedPartitions.clear();
+ AgreeSets agreeSets = new AgreeSets(maximalEquivalenceClasses, equivalenceClasses, this.numberOfColumns, this.numberOfRows);
+ maximalEquivalenceClasses.clear();
+ equivalenceClasses.clear();
+ this.differenceSets = new DifferenceSets(agreeSets);
+ agreeSets.clear();
+ }
+
+ public FastFDs(SVFileProcessor table) throws OutOfMemoryError {
+ this.minimalDependencies = new FunctionalDependencies();
+ this.numberOfColumns = table.getNumberOfColumns();
+ this.numberOfRows = table.getNumberOfRows();
+
+ ColumnFiles columnFiles = table.getColumnFiles();
+ StrippedPartitions strippedPartitions = new StrippedPartitions(columnFiles);
+ EquivalenceClasses equivalenceClasses = new EquivalenceClasses(strippedPartitions);
+ MaximalEquivalenceClasses maximalEquivalenceClasses = new MaximalEquivalenceClasses(strippedPartitions);
+ strippedPartitions.clear();
+ AgreeSets agreeSets = new AgreeSets(maximalEquivalenceClasses, equivalenceClasses, this.numberOfColumns, this.numberOfRows);
+ maximalEquivalenceClasses.clear();
+ equivalenceClasses.clear();
+ this.differenceSets = new DifferenceSets(agreeSets);
+ agreeSets.clear();
+ }
+
+ public void run() throws OutOfMemoryError {
+ int numberOfColumns = this.numberOfColumns;
+
+ DifferenceSets[] differenceSetsModulo = this.differenceSets.allModulo(this.numberOfColumns);
+ for (int rhsIndex = 0; rhsIndex < numberOfColumns; rhsIndex++) {
+ DifferenceSets orig = differenceSetsModulo[rhsIndex];
+ Cloner cloner = new Cloner();
+ DifferenceSets uncovered = cloner.deepClone(orig);
+ if (orig.isEmpty()) {
+ ColumnCollection lhs = new ColumnCollection(this.numberOfColumns);
+
+ for (int lhsIndex : lhs.setCopy(rhsIndex).complement().getSetBits()) {
+ this.minimalDependencies.addRHSColumn(lhs.setCopy(lhsIndex), rhsIndex);
+ }
+ }
+ else if (!orig.containsEmptySet()) {
+ PartialOrder currentOrder = new PartialOrder(orig);
+ Path path = new Path(numberOfColumns);
+ findCovers(rhsIndex, orig, uncovered, path, currentOrder);
+ }
+ }
+ }
+
+ public void findCovers(int columnIndex, DifferenceSets orig, DifferenceSets uncovered, Path currentPath, PartialOrder currentOrder) {
+ // no dependencies here
+ if (currentOrder.isEmpty() && !uncovered.isEmpty()) {
+ return;
+ }
+
+ if (uncovered.isEmpty()) {
+ if (!orig.maximumSubsetCoversDifferenceSet(currentPath)) {
+ this.minimalDependencies.addRHSColumn(currentPath, columnIndex);
+ } else {
+ // dependency not minimal
+ return;
+ }
+ }
+
+ // RECURSIVE CASE
+ for (int remainingColumn : currentOrder.getOrderedColumns()) {
+ DifferenceSets nextDifferenceSets = uncovered.removeCovered(remainingColumn);
+ PartialOrder nextOrder = new PartialOrder(nextDifferenceSets, remainingColumn);
+ Path nextPath = (Path) currentPath.addColumn(remainingColumn);
+
+ nextPath.addColumn(remainingColumn);
+ findCovers(columnIndex, orig, nextDifferenceSets, nextPath, nextOrder);
+ }
+ }
+
+ public FunctionalDependencies getDependencies() {
+ return this.minimalDependencies;
+ }
+}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java b/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java
index 43d0172..afc934c 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java
@@ -1,219 +1,219 @@
-package fdiscovery.general;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileWriter;
-import java.io.FilenameFilter;
-import java.io.IOException;
-import java.util.Arrays;
-
-import org.apache.commons.exec.CommandLine;
-import org.apache.commons.exec.DefaultExecuteResultHandler;
-import org.apache.commons.exec.DefaultExecutor;
-import org.apache.commons.exec.ExecuteWatchdog;
-import org.apache.commons.exec.PumpStreamHandler;
-
-import fdiscovery.preprocessing.SVFileProcessor;
-import gnu.trove.map.hash.THashMap;
-
-public class Benchmarker {
-
- protected static File[] getBenchmarkFilesWithPattern(File benchmarkDirectory) {
- File[] benchmarkFiles = benchmarkDirectory.listFiles(new FilenameFilter() {
-
- @Override
- public boolean accept(File dir, String name) {
- return name.matches(Miner.BENCHMARK_FILE_REGEX);
- }
- });
- return benchmarkFiles;
- }
-
- protected static final String getResultFileName(String inputDirectory, String miner) {
- String[] splitInputDirectory = inputDirectory.split("\\" + File.separator);
- if (splitInputDirectory.length >= 2) {
- String staticComponent = splitInputDirectory[splitInputDirectory.length-1];
- String source = splitInputDirectory[splitInputDirectory.length-2];
- return String.format("%s%s-%s-%s.dat", Miner.RESULT_FILE_PATH, miner, staticComponent, source);
- }
- return new String();
- }
-
- protected static final void writeErrorCode(File resultFile, int exitCode) {
- try {
- BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(resultFile, true));
- if (exitCode == Miner.STATUS_OOT) {
- resultFileWriter.write("#OOT");
- } else if (exitCode == Miner.STATUS_OOM) {
- resultFileWriter.write("#OOM");
- }
- resultFileWriter.close();
- } catch (IOException e) {
- System.out.println("Couldn't write meta data.");
- }
- }
-
- protected static final void writeMetaData(File resultFile, THashMap cmdLine) {
- StringBuilder metaDataLineBuilder = new StringBuilder();
- for (String optionKey : cmdLine.keySet()) {
- if (cmdLine.get(optionKey) != null) {
- metaDataLineBuilder.append(String.format("# %s :\t%s\n", optionKey, cmdLine.get(optionKey)));
- System.out.print(String.format("# %s :\t%s\n", optionKey, cmdLine.get(optionKey)));
- } else {
- metaDataLineBuilder.append(String.format("# %s :\t%s\n", optionKey, "true"));
- System.out.print(String.format("# %s :\t%s\n", optionKey, "true"));
- }
- }
- metaDataLineBuilder.append("#Filename\t#Rows\t#Columns\tTime\t#Deps\t#<2Deps\t#<3Deps\t#<4Deps\t#<5Deps\t#<6Deps\t#>5Deps\t#Partitions\n");
- System.out.println("#Filename\t#Rows\t#Columns\tTime\t#Deps\t#<2Deps\t#<3Deps\t#<4Deps\t#<5Deps\t#<6Deps\t#>5Deps\t#Partitions\n");
- try {
- BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(resultFile));
- resultFileWriter.write(metaDataLineBuilder.toString());
- resultFileWriter.close();
- } catch (IOException e) {
- System.out.println("Couldn't write meta data.");
- }
- }
-
- public static void main(String[] args) {
- CLIParserBenchmarker parser = new CLIParserBenchmarker();
- THashMap cmdLine = parser.parse(args);
- String inputDirectoryName = new String();
- String miner = new String();
- char delimiter = '\t';
- String xmx = new String();
- int timeout = -1;
- boolean allFiles = false;
-
- if (cmdLine.contains("input")) {
- inputDirectoryName = cmdLine.get("input");
- }
- if (cmdLine.contains("miner")) {
- miner = cmdLine.get("miner");
- }
- if (cmdLine.contains("delimiter")) {
- delimiter = (cmdLine.get("delimiter")).charAt(0);
- }
- if (cmdLine.contains("xmx")) {
- xmx = cmdLine.get("xmx");
- }
- if (cmdLine.contains("timeout")) {
- System.out.println(String.format("Timeout:%s", cmdLine.get("timeout")));
- timeout = Integer.valueOf(cmdLine.get("timeout")).intValue();
- }
- if (cmdLine.containsKey("all")) {
- System.out.println("Use all files.");
- allFiles = true;
- }
- File executable = null;
- if (miner.equals("tane")) {
- executable = new File("tane.jar");
- } else if (miner.equals("fastfds")) {
- executable = new File("fastfds.jar");
- } else if (miner.equals("dfd")) {
- executable = new File("dfd.jar");
- }
- else {
- System.out.println(String.format("No valid miner:\t%s", miner));
- System.exit(1);
- }
-
- File inputDirectory = new File(inputDirectoryName);
- if (!inputDirectory.exists()) {
- System.out.println("Input directory doesn't exist.");
- System.exit(1);
- }
-
- File[] benchmarkFiles = new File[0];
- if (allFiles) {
- benchmarkFiles = inputDirectory.listFiles();
- } else {
- benchmarkFiles = getBenchmarkFilesWithPattern(inputDirectory);
- }
- Arrays.sort(benchmarkFiles);
-
- if (benchmarkFiles.length != 0) {
- Miner.createColumDirectory();
- Miner.createResultDirectory();
- String resultFilename = getResultFileName(inputDirectory.getAbsolutePath(), miner);
- File resultFile = new File(resultFilename);
- writeMetaData(resultFile, cmdLine);
- boolean errors = false;
- for (File benchmarkFile : benchmarkFiles) {
- if (!errors) {
- try {
- // create columns files and collect meta data
- SVFileProcessor fileProcessor = new SVFileProcessor(benchmarkFile);
- fileProcessor.init(delimiter);
- fileProcessor.createColumnFiles();
-
- // build command line with parameters
- CommandLine processCmdLine = new CommandLine("java");
- processCmdLine.addArgument("-d64");
- processCmdLine.addArgument("-XX:GCTimeLimit=90");
- processCmdLine.addArgument("-XX:GCHeapFreeLimit=10");
- processCmdLine.addArgument("-XX:+UseSerialGC");
- processCmdLine.addArgument(String.format("-Xmx%s", xmx));
- processCmdLine.addArgument("-jar");
- processCmdLine.addArgument(executable.getName());
- processCmdLine.addArgument("-file");
- processCmdLine.addArgument(String.valueOf(benchmarkFile.getName()));
- processCmdLine.addArgument("-columns");
- processCmdLine.addArgument(String.valueOf(fileProcessor.getNumberOfColumns()));
- processCmdLine.addArgument("-rows");
- processCmdLine.addArgument(String.valueOf(fileProcessor.getNumberOfRows()));
- processCmdLine.addArgument("-result");
- processCmdLine.addArgument(resultFile.getAbsolutePath());
- processCmdLine.addArgument("-input");
- processCmdLine.addArgument(fileProcessor.getColumnDirectoryName());
-
- // build process with watchdog
- DefaultExecutor executor = new DefaultExecutor();
- ExecuteWatchdog watchdog = new ExecuteWatchdog(timeout);
- executor.setWatchdog(watchdog);
-
- // handle results
- DefaultExecuteResultHandler resultHandler = new DefaultExecuteResultHandler();
- PumpStreamHandler streamHandler = new PumpStreamHandler();
- executor.setStreamHandler(streamHandler);
- long timeStart = System.currentTimeMillis();
- executor.execute(processCmdLine, resultHandler);
- resultHandler.waitFor(timeout);
-
- long timeEnd = System.currentTimeMillis();
- System.out.println(String.format("Time:%.1f", Double.valueOf((double)(timeEnd - timeStart) / 1000)));
-
- int exitCode = 0;
- if (resultHandler.hasResult()) {
- exitCode = resultHandler.getExitValue();
- } else {
- exitCode = Miner.STATUS_OOT;
- executor.getWatchdog().destroyProcess();
- }
-
- if (watchdog.killedProcess()) {
- exitCode = Miner.STATUS_OOT;
- executor.getWatchdog().destroyProcess();
- } else {
- }
- System.out.println(String.format("ExitCode %d", Integer.valueOf(exitCode)));
- if (exitCode == Miner.STATUS_OK) {
-
- } else if (exitCode == Miner.STATUS_OOT || exitCode == Miner.STATUS_OOM) {
- writeErrorCode(resultFile, exitCode);
- errors = true;
- }
- } catch (FileNotFoundException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } catch (InterruptedException e) {
- e.printStackTrace();
- }
- }
- }
- }
-
- }
-}
+package fdiscovery.general;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileWriter;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.commons.exec.CommandLine;
+import org.apache.commons.exec.DefaultExecuteResultHandler;
+import org.apache.commons.exec.DefaultExecutor;
+import org.apache.commons.exec.ExecuteWatchdog;
+import org.apache.commons.exec.PumpStreamHandler;
+
+import fdiscovery.preprocessing.SVFileProcessor;
+import gnu.trove.map.hash.THashMap;
+
+/**
+ * Stand-alone benchmark driver: for every benchmark file in an input
+ * directory it pre-processes the file into per-column files, then launches
+ * the selected miner jar (tane / fastfds / dfd) in a fresh JVM guarded by a
+ * timeout watchdog, and writes timings, meta data and error codes ("#OOT" /
+ * "#OOM") to a result file.
+ */
+public class Benchmarker {
+
+	/**
+	 * Lists the files in {@code benchmarkDirectory} whose names match
+	 * {@code Miner.BENCHMARK_FILE_REGEX}.
+	 * NOTE(review): returns null if the directory cannot be listed
+	 * (File.listFiles contract) — callers assume non-null; confirm.
+	 */
+	protected static File[] getBenchmarkFilesWithPattern(File benchmarkDirectory) {
+		File[] benchmarkFiles = benchmarkDirectory.listFiles(new FilenameFilter() {
+
+			@Override
+			public boolean accept(File dir, String name) {
+				return name.matches(Miner.BENCHMARK_FILE_REGEX);
+			}
+		});
+		return benchmarkFiles;
+	}
+
+	/**
+	 * Derives the result file name from the last two components of the input
+	 * directory path; returns the empty string when the path has fewer than
+	 * two components.
+	 */
+	protected static final String getResultFileName(String inputDirectory, String miner) {
+		// Escape the platform separator so split() treats it literally (e.g. "\" on Windows).
+		String[] splitInputDirectory = inputDirectory.split("\\" + File.separator);
+		if (splitInputDirectory.length >= 2) {
+			String staticComponent = splitInputDirectory[splitInputDirectory.length-1];
+			String source = splitInputDirectory[splitInputDirectory.length-2];
+			return String.format("%s%s-%s-%s.dat", Miner.RESULT_FILE_PATH, miner, staticComponent, source);
+		}
+		return new String();
+	}
+
+	/**
+	 * Appends "#OOT" (out of time) or "#OOM" (out of memory) to the result
+	 * file, depending on the child process exit code.
+	 */
+	protected static final void writeErrorCode(File resultFile, int exitCode) {
+		try {
+			// Open in append mode so earlier results are preserved.
+			BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(resultFile, true));
+			if (exitCode == Miner.STATUS_OOT) {
+				resultFileWriter.write("#OOT");
+			} else if (exitCode == Miner.STATUS_OOM) {
+				resultFileWriter.write("#OOM");
+			}
+			resultFileWriter.close();
+		} catch (IOException e) {
+			System.out.println("Couldn't write meta data.");
+		}
+	}
+
+	/**
+	 * Writes the parsed command-line options plus the result-table header to
+	 * the result file (overwriting it) and echoes the same text to stdout.
+	 * NOTE(review): {@code cmdLine} is a raw THashMap here — presumably
+	 * THashMap&lt;String, String&gt;; the type parameters look stripped in this
+	 * capture, confirm against CLIParserBenchmarker.
+	 */
+	protected static final void writeMetaData(File resultFile, THashMap cmdLine) {
+		StringBuilder metaDataLineBuilder = new StringBuilder();
+		for (String optionKey : cmdLine.keySet()) {
+			// A null value marks a flag-style option; record it as "true".
+			if (cmdLine.get(optionKey) != null) {
+				metaDataLineBuilder.append(String.format("# %s :\t%s\n", optionKey, cmdLine.get(optionKey)));
+				System.out.print(String.format("# %s :\t%s\n", optionKey, cmdLine.get(optionKey)));
+			} else {
+				metaDataLineBuilder.append(String.format("# %s :\t%s\n", optionKey, "true"));
+				System.out.print(String.format("# %s :\t%s\n", optionKey, "true"));
+			}
+		}
+		metaDataLineBuilder.append("#Filename\t#Rows\t#Columns\tTime\t#Deps\t#<2Deps\t#<3Deps\t#<4Deps\t#<5Deps\t#<6Deps\t#>5Deps\t#Partitions\n");
+		System.out.println("#Filename\t#Rows\t#Columns\tTime\t#Deps\t#<2Deps\t#<3Deps\t#<4Deps\t#<5Deps\t#<6Deps\t#>5Deps\t#Partitions\n");
+		try {
+			BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(resultFile));
+			resultFileWriter.write(metaDataLineBuilder.toString());
+			resultFileWriter.close();
+		} catch (IOException e) {
+			System.out.println("Couldn't write meta data.");
+		}
+	}
+
+	/**
+	 * Entry point. Recognized options: input (directory), miner
+	 * (tane|fastfds|dfd), delimiter, xmx (child JVM heap), timeout (ms),
+	 * all (flag: benchmark every file, not just pattern matches).
+	 */
+	public static void main(String[] args) {
+		CLIParserBenchmarker parser = new CLIParserBenchmarker();
+		THashMap cmdLine = parser.parse(args);
+		String inputDirectoryName = new String();
+		String miner = new String();
+		char delimiter = '\t';
+		String xmx = new String();
+		// -1 when no timeout option is given.
+		int timeout = -1;
+		boolean allFiles = false;
+
+		// NOTE(review): most lookups use contains(...) but "all" uses
+		// containsKey(...) — presumably equivalent on this trove map; confirm.
+		if (cmdLine.contains("input")) {
+			inputDirectoryName = cmdLine.get("input");
+		}
+		if (cmdLine.contains("miner")) {
+			miner = cmdLine.get("miner");
+		}
+		if (cmdLine.contains("delimiter")) {
+			delimiter = (cmdLine.get("delimiter")).charAt(0);
+		}
+		if (cmdLine.contains("xmx")) {
+			xmx = cmdLine.get("xmx");
+		}
+		if (cmdLine.contains("timeout")) {
+			System.out.println(String.format("Timeout:%s", cmdLine.get("timeout")));
+			timeout = Integer.valueOf(cmdLine.get("timeout")).intValue();
+		}
+		if (cmdLine.containsKey("all")) {
+			System.out.println("Use all files.");
+			allFiles = true;
+		}
+		// Map the miner name to the jar that will be launched; abort on unknown names.
+		File executable = null;
+		if (miner.equals("tane")) {
+			executable = new File("tane.jar");
+		} else if (miner.equals("fastfds")) {
+			executable = new File("fastfds.jar");
+		} else if (miner.equals("dfd")) {
+			executable = new File("dfd.jar");
+		}
+		else {
+			System.out.println(String.format("No valid miner:\t%s", miner));
+			System.exit(1);
+		}
+
+		File inputDirectory = new File(inputDirectoryName);
+		if (!inputDirectory.exists()) {
+			System.out.println("Input directory doesn't exist.");
+			System.exit(1);
+		}
+
+		File[] benchmarkFiles = new File[0];
+		if (allFiles) {
+			benchmarkFiles = inputDirectory.listFiles();
+		} else {
+			benchmarkFiles = getBenchmarkFilesWithPattern(inputDirectory);
+		}
+		// Deterministic processing order.
+		Arrays.sort(benchmarkFiles);
+
+		if (benchmarkFiles.length != 0) {
+			Miner.createColumDirectory();
+			Miner.createResultDirectory();
+			String resultFilename = getResultFileName(inputDirectory.getAbsolutePath(), miner);
+			File resultFile = new File(resultFilename);
+			writeMetaData(resultFile, cmdLine);
+			// Once one run times out or OOMs, the remaining (sorted, larger)
+			// inputs are skipped.
+			boolean errors = false;
+			for (File benchmarkFile : benchmarkFiles) {
+				if (!errors) {
+					try {
+						// create columns files and collect meta data
+						SVFileProcessor fileProcessor = new SVFileProcessor(benchmarkFile);
+						fileProcessor.init(delimiter);
+						fileProcessor.createColumnFiles();
+
+						// build command line with parameters
+						CommandLine processCmdLine = new CommandLine("java");
+						processCmdLine.addArgument("-d64");
+						processCmdLine.addArgument("-XX:GCTimeLimit=90");
+						processCmdLine.addArgument("-XX:GCHeapFreeLimit=10");
+						processCmdLine.addArgument("-XX:+UseSerialGC");
+						processCmdLine.addArgument(String.format("-Xmx%s", xmx));
+						processCmdLine.addArgument("-jar");
+						processCmdLine.addArgument(executable.getName());
+						processCmdLine.addArgument("-file");
+						processCmdLine.addArgument(String.valueOf(benchmarkFile.getName()));
+						processCmdLine.addArgument("-columns");
+						processCmdLine.addArgument(String.valueOf(fileProcessor.getNumberOfColumns()));
+						processCmdLine.addArgument("-rows");
+						processCmdLine.addArgument(String.valueOf(fileProcessor.getNumberOfRows()));
+						processCmdLine.addArgument("-result");
+						processCmdLine.addArgument(resultFile.getAbsolutePath());
+						processCmdLine.addArgument("-input");
+						processCmdLine.addArgument(fileProcessor.getColumnDirectoryName());
+
+						// build process with watchdog
+						// NOTE(review): when no timeout option is given this is
+						// ExecuteWatchdog(-1); confirm that is intended rather
+						// than ExecuteWatchdog.INFINITE_TIMEOUT.
+						DefaultExecutor executor = new DefaultExecutor();
+						ExecuteWatchdog watchdog = new ExecuteWatchdog(timeout);
+						executor.setWatchdog(watchdog);
+
+						// handle results
+						DefaultExecuteResultHandler resultHandler = new DefaultExecuteResultHandler();
+						PumpStreamHandler streamHandler = new PumpStreamHandler();
+						executor.setStreamHandler(streamHandler);
+						long timeStart = System.currentTimeMillis();
+						// Asynchronous execution; waitFor blocks up to the timeout.
+						executor.execute(processCmdLine, resultHandler);
+						resultHandler.waitFor(timeout);
+
+						long timeEnd = System.currentTimeMillis();
+						System.out.println(String.format("Time:%.1f", Double.valueOf((double)(timeEnd - timeStart) / 1000)));
+
+						int exitCode = 0;
+						if (resultHandler.hasResult()) {
+							exitCode = resultHandler.getExitValue();
+						} else {
+							// Still running after waitFor -> treat as out-of-time and kill it.
+							exitCode = Miner.STATUS_OOT;
+							executor.getWatchdog().destroyProcess();
+						}
+
+						if (watchdog.killedProcess()) {
+							exitCode = Miner.STATUS_OOT;
+							executor.getWatchdog().destroyProcess();
+						} else {
+							// not killed by the watchdog: keep the exit code from above
+						}
+						System.out.println(String.format("ExitCode %d", Integer.valueOf(exitCode)));
+						if (exitCode == Miner.STATUS_OK) {
+							// success: the child already appended its results
+						} else if (exitCode == Miner.STATUS_OOT || exitCode == Miner.STATUS_OOM) {
+							writeErrorCode(resultFile, exitCode);
+							errors = true;
+						}
+					} catch (FileNotFoundException e) {
+						e.printStackTrace();
+					} catch (IOException e) {
+						e.printStackTrace();
+					} catch (InterruptedException e) {
+						e.printStackTrace();
+					}
+				}
+			}
+		}
+
+	}
+}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java b/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java
index 98af5aa..326adc3 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java
@@ -58,6 +58,6 @@ public boolean accept(File file) {
}
private final String getColumnFileName(final int columnIndex) {
- return String.format(this.formatString, Integer.valueOf(columnIndex));
+ return String.format(this.formatString, Integer.valueOf(columnIndex));
}
}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java b/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java
index dbf1c4e..7868f32 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java
@@ -140,11 +140,11 @@ public String toString() {
StringBuilder outputBuilder = new StringBuilder();
for (ColumnCollection determining : this.keySet()) {
- for (int dependentColumn : this.get(determining).getSetBits()) {
- for (int determiningColumn : determining.getSetBits()) {
- outputBuilder.append(String.format("c%04d\t", Integer.valueOf(determiningColumn)));
+ for (Integer dependentColumn : this.get(determining).getSetBits()) {
+ for (Integer determiningColumn : determining.getSetBits()) {
+ outputBuilder.append(String.format("c%04d\t", determiningColumn));
}
- outputBuilder.append(String.format("->\tc%04d\n", Integer.valueOf(dependentColumn)));
+ outputBuilder.append(String.format("->\tc%04d\n", dependentColumn));
}
}
return outputBuilder.toString();
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java
index ecb4d79..3282ba9 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java
@@ -1,171 +1,171 @@
-package fdiscovery.partitions;
-
-import java.util.TreeSet;
-
-import fdiscovery.columns.ColumnCollection;
-import fdiscovery.equivalence.TEquivalence;
-import gnu.trove.iterator.TIntIterator;
-
-public abstract class Partition extends TreeSet implements Comparable {
-
- private static final long serialVersionUID = 174046028525977844L;
-
- protected static int[] probeTable;
- protected ColumnCollection indices;
- protected int numberOfRows;
- protected double error;
- protected double distinctiveness;
-// protected long hashNumber;
-
- public Partition(int columnIndex, int numberOfColumns, int numberOfRows) {
- this.indices = new ColumnCollection(numberOfColumns);
- this.indices.set(columnIndex);
- this.numberOfRows = numberOfRows;
- this.error = -1;
- this.distinctiveness = -1;
- if (Partition.probeTable == null || Partition.probeTable.length != numberOfRows) {
- Partition.probeTable = new int[numberOfRows+1];
- for (int i = 0; i < Partition.probeTable.length; i++) {
- Partition.probeTable[i] = -1;
- }
- }
- }
-
- public void init(int numberOfRows) {
- if (Partition.probeTable.length != numberOfRows) {
- Partition.probeTable = new int[numberOfRows+1];
- }
- }
-
- public Partition(Partition base, Partition additional) {
- this.indices = base.indices.orCopy(additional.indices);
- this.error = -1;
- this.numberOfRows = base.numberOfRows;
- this.distinctiveness = -1;
- if (Partition.probeTable == null) {
- Partition.probeTable = new int[numberOfRows+1];
- for (int i = 0; i < Partition.probeTable.length; i++) {
- Partition.probeTable[i] = -1;
- }
- }
-
- }
-
- private void resetProbeTable() {
- for (int i = 0; i < Partition.probeTable.length; i++) {
- Partition.probeTable[i] = -1;
- }
- }
-
- @Override
- public int compareTo(Partition o) {
- if (this.getDistinctiveness() == o.getDistinctiveness()) {
- return this.indices.compareTo(o.indices);
- }
- return Double.valueOf(this.getDistinctiveness()).compareTo(Double.valueOf(o.getDistinctiveness()));
- }
-
- public int getNumberOfRows() {
- return this.numberOfRows;
- }
-
- public ColumnCollection getIndices() {
- return this.indices;
- }
-
- protected double getDistinctiveness() {
- if (this.distinctiveness == -1) {
- double distinctiveness = (double)(this.numberOfRows - this.size())/this.numberOfRows;
- this.distinctiveness = distinctiveness;
- }
- return this.distinctiveness;
- }
-
- public static double estimateDistinctiveness(Partition a, Partition b) {
- return a.getDistinctiveness() + b.getDistinctiveness() - a.getDistinctiveness() * b.getDistinctiveness();
- }
-
- protected double getError() {
- if (this.error == -1) {
- int cumulatedEqClassSizes = 0;
- for (TEquivalence equivalenceGroup : this) {
- cumulatedEqClassSizes += equivalenceGroup.size();
- }
- double error = (double)(cumulatedEqClassSizes - this.size())/this.numberOfRows;
- this.error = error;
- }
- return this.error;
- }
-
- public static boolean representsFD(Partition base, Partition baseMergedWithRHS) {
- if (base.getError() == baseMergedWithRHS.getError()) {
- return true;
- }
- return false;
- }
-
- public boolean isUnique() {
- return this.size() == 0;
- }
-
- public boolean equals(Partition other) {
- int numberOfValues = 0;
- int groupIndex = 0;
- for (TEquivalence equivalenceGroup : this) {
- for (TIntIterator equivalenceGroupIt = equivalenceGroup.iterator(); equivalenceGroupIt.hasNext(); ) {
- Partition.probeTable[equivalenceGroupIt.next()] = groupIndex;
- numberOfValues++;
- }
- groupIndex++;
- }
- for (TEquivalence equivalenceGroup : other) {
- groupIndex = -2;
- for (TIntIterator equivalenceGroupIt = equivalenceGroup.iterator(); equivalenceGroupIt.hasNext(); ) {
- int currentGroupIndex = Partition.probeTable[equivalenceGroupIt.next()];
- if (groupIndex == -2 || currentGroupIndex == groupIndex) {
- groupIndex = currentGroupIndex;
- } else {
- resetProbeTable();
- return false;
- }
- numberOfValues--;
- }
- }
- resetProbeTable();
- if (numberOfValues == 0) {
- return true;
- }
- return false;
- }
-
- public String printIndices() {
- StringBuilder outputBuilder = new StringBuilder(this.indices.size());
-
- for (int i=0; i < this.indices.size(); i++) {
- if (this.indices.get(i)) {
- outputBuilder.append("1");
- } else {
- outputBuilder.append("0");
- }
- }
- return outputBuilder.toString();
- }
-
- @Override
- public String toString() {
- StringBuilder outputBuilder = new StringBuilder();
- outputBuilder.append(String.format("[%s]{", this.indices));
-
- for(TEquivalence equivalenceGroup : this) {
- outputBuilder.append("{");
- for (TIntIterator valueIt=equivalenceGroup.iterator(); valueIt.hasNext(); ) {
- outputBuilder.append(valueIt.next());
- outputBuilder.append(",");
- }
- outputBuilder.append("}");
- }
- outputBuilder.append("}");
-
- return outputBuilder.toString();
- }
-}
+package fdiscovery.partitions;
+
+import java.util.TreeSet;
+
+import fdiscovery.columns.ColumnCollection;
+import fdiscovery.equivalence.TEquivalence;
+import gnu.trove.iterator.TIntIterator;
+
+/**
+ * A partition of the rows of a relation: a sorted set of equivalence groups
+ * (rows that agree on the columns in {@code indices}). Ordering is by
+ * distinctiveness, then by the column index set.
+ *
+ * NOTE(review): declared with raw {@code TreeSet} / {@code Comparable} —
+ * presumably {@code TreeSet<TEquivalence>} and {@code Comparable<Partition>};
+ * the type parameters look stripped in this capture, confirm against the
+ * original sources.
+ */
+public abstract class Partition extends TreeSet implements Comparable {
+
+	private static final long serialVersionUID = 174046028525977844L;
+
+	// Shared static scratch table mapping row index -> equivalence-group id,
+	// reused by equals(); being static mutable state, it is not safe for
+	// concurrent use across threads.
+	protected static int[] probeTable;
+	// The set of column indices this partition is built over.
+	protected ColumnCollection indices;
+	protected int numberOfRows;
+	// Cached error measure; -1 means "not yet computed" (see getError()).
+	protected double error;
+	// Cached distinctiveness; -1 means "not yet computed" (see getDistinctiveness()).
+	protected double distinctiveness;
+// protected long hashNumber;
+
+	/**
+	 * Creates a single-column partition skeleton and (re)initializes the
+	 * shared probe table when its size does not match the row count.
+	 */
+	public Partition(int columnIndex, int numberOfColumns, int numberOfRows) {
+		this.indices = new ColumnCollection(numberOfColumns);
+		this.indices.set(columnIndex);
+		this.numberOfRows = numberOfRows;
+		this.error = -1;
+		this.distinctiveness = -1;
+		if (Partition.probeTable == null || Partition.probeTable.length != numberOfRows) {
+			// +1 so the largest row index is addressable; -1 marks "unassigned".
+			Partition.probeTable = new int[numberOfRows+1];
+			for (int i = 0; i < Partition.probeTable.length; i++) {
+				Partition.probeTable[i] = -1;
+			}
+		}
+	}
+
+	/**
+	 * Resizes the shared probe table for a new row count.
+	 * NOTE(review): unlike the constructors, this does not fill the new table
+	 * with -1 (Java zero-initializes it) — confirm callers reset it before use.
+	 */
+	public void init(int numberOfRows) {
+		if (Partition.probeTable.length != numberOfRows) {
+			Partition.probeTable = new int[numberOfRows+1];
+		}
+	}
+
+	/**
+	 * Creates the skeleton of a joined partition: column indices are the
+	 * union of both inputs; caches are reset to "not computed".
+	 */
+	public Partition(Partition base, Partition additional) {
+		this.indices = base.indices.orCopy(additional.indices);
+		this.error = -1;
+		this.numberOfRows = base.numberOfRows;
+		this.distinctiveness = -1;
+		// NOTE(review): only checks for null, not for a length mismatch as the
+		// other constructor does — confirm this asymmetry is intended.
+		if (Partition.probeTable == null) {
+			Partition.probeTable = new int[numberOfRows+1];
+			for (int i = 0; i < Partition.probeTable.length; i++) {
+				Partition.probeTable[i] = -1;
+			}
+		}
+
+	}
+
+	// Restores the shared probe table to the all--1 "unassigned" state.
+	private void resetProbeTable() {
+		for (int i = 0; i < Partition.probeTable.length; i++) {
+			Partition.probeTable[i] = -1;
+		}
+	}
+
+	/**
+	 * Orders partitions by distinctiveness, breaking ties by the column
+	 * index set. Note the exact double comparison on the first criterion.
+	 */
+	@Override
+	public int compareTo(Partition o) {
+		if (this.getDistinctiveness() == o.getDistinctiveness()) {
+			return this.indices.compareTo(o.indices);
+		}
+		return Double.valueOf(this.getDistinctiveness()).compareTo(Double.valueOf(o.getDistinctiveness()));
+	}
+
+	public int getNumberOfRows() {
+		return this.numberOfRows;
+	}
+
+	public ColumnCollection getIndices() {
+		return this.indices;
+	}
+
+	/**
+	 * Lazily computes (rows - #groups) / rows; cached after the first call
+	 * (-1 is the "not computed" sentinel).
+	 */
+	protected double getDistinctiveness() {
+		if (this.distinctiveness == -1) {
+			double distinctiveness = (double)(this.numberOfRows - this.size())/this.numberOfRows;
+			this.distinctiveness = distinctiveness;
+		}
+		return this.distinctiveness;
+	}
+
+	/** Probabilistic estimate for the distinctiveness of the join of a and b. */
+	public static double estimateDistinctiveness(Partition a, Partition b) {
+		return a.getDistinctiveness() + b.getDistinctiveness() - a.getDistinctiveness() * b.getDistinctiveness();
+	}
+
+	/**
+	 * Lazily computes the partition error e = (sum of group sizes - #groups)
+	 * / rows; cached after the first call.
+	 */
+	protected double getError() {
+		if (this.error == -1) {
+			int cumulatedEqClassSizes = 0;
+			for (TEquivalence equivalenceGroup : this) {
+				cumulatedEqClassSizes += equivalenceGroup.size();
+			}
+			double error = (double)(cumulatedEqClassSizes - this.size())/this.numberOfRows;
+			this.error = error;
+		}
+		return this.error;
+	}
+
+	/**
+	 * True iff merging the RHS into the base partition does not change the
+	 * error, i.e. base -> RHS holds (exact double comparison).
+	 */
+	public static boolean representsFD(Partition base, Partition baseMergedWithRHS) {
+		if (base.getError() == baseMergedWithRHS.getError()) {
+			return true;
+		}
+		return false;
+	}
+
+	/** A stripped partition with no groups of size > 1 identifies rows uniquely. */
+	public boolean isUnique() {
+		return this.size() == 0;
+	}
+
+	/**
+	 * Structural equality of two (stripped) partitions via the shared probe
+	 * table: marks this partition's rows with group ids, then checks that
+	 * every group of {@code other} lies inside a single group and that both
+	 * cover the same number of row entries.
+	 *
+	 * NOTE(review): this OVERLOADS rather than overrides
+	 * {@code Object.equals(Object)}, and there is no matching hashCode() —
+	 * collection lookups will not use it; confirm that is intended.
+	 */
+	public boolean equals(Partition other) {
+		int numberOfValues = 0;
+		int groupIndex = 0;
+		for (TEquivalence equivalenceGroup : this) {
+			for (TIntIterator equivalenceGroupIt = equivalenceGroup.iterator(); equivalenceGroupIt.hasNext(); ) {
+				Partition.probeTable[equivalenceGroupIt.next()] = groupIndex;
+				numberOfValues++;
+			}
+			groupIndex++;
+		}
+		for (TEquivalence equivalenceGroup : other) {
+			// -2 = "no group seen yet for this equivalence class".
+			groupIndex = -2;
+			for (TIntIterator equivalenceGroupIt = equivalenceGroup.iterator(); equivalenceGroupIt.hasNext(); ) {
+				int currentGroupIndex = Partition.probeTable[equivalenceGroupIt.next()];
+				if (groupIndex == -2 || currentGroupIndex == groupIndex) {
+					groupIndex = currentGroupIndex;
+				} else {
+					// Row falls into a different group than its classmates -> not equal.
+					resetProbeTable();
+					return false;
+				}
+				numberOfValues--;
+			}
+		}
+		resetProbeTable();
+		// Equal only if both partitions covered exactly the same row entries.
+		if (numberOfValues == 0) {
+			return true;
+		}
+		return false;
+	}
+
+	/** Renders the column index set as a 0/1 string, one digit per column. */
+	public String printIndices() {
+		StringBuilder outputBuilder = new StringBuilder(this.indices.size());
+
+		for (int i=0; i < this.indices.size(); i++) {
+			if (this.indices.get(i)) {
+				outputBuilder.append("1");
+			} else {
+				outputBuilder.append("0");
+			}
+		}
+		return outputBuilder.toString();
+	}
+
+	/** Debug rendering: "[indices]{{row,row,...}{...}}". */
+	@Override
+	public String toString() {
+		StringBuilder outputBuilder = new StringBuilder();
+		outputBuilder.append(String.format("[%s]{", this.indices));
+
+		for(TEquivalence equivalenceGroup : this) {
+			outputBuilder.append("{");
+			for (TIntIterator valueIt=equivalenceGroup.iterator(); valueIt.hasNext(); ) {
+				outputBuilder.append(valueIt.next());
+				outputBuilder.append(",");
+			}
+			outputBuilder.append("}");
+		}
+		outputBuilder.append("}");
+
+		return outputBuilder.toString();
+	}
+}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java
index a3462fe..93bb615 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java
@@ -1,40 +1,40 @@
-package fdiscovery.partitions;
-
-import java.util.ArrayList;
-
-import fdiscovery.columns.ColumnCollection;
-import gnu.trove.iterator.TIntObjectIterator;
-import gnu.trove.iterator.TLongObjectIterator;
-import gnu.trove.map.hash.TIntObjectHashMap;
-import gnu.trove.map.hash.TLongObjectHashMap;
-import gnu.trove.map.hash.TObjectIntHashMap;
-
-public class PartitionStatistics extends TObjectIntHashMap {
-
- public String getStatistics() {
- TLongObjectHashMap>> statsAndCountsByLevel = new TLongObjectHashMap<>();
- for (ColumnCollection partitionKey : this.keySet()) {
- long keyCardinality = partitionKey.cardinality();
- int usageCount = this.get(partitionKey);
- statsAndCountsByLevel.putIfAbsent(keyCardinality, new TIntObjectHashMap>());
- statsAndCountsByLevel.get(keyCardinality).putIfAbsent(usageCount, new ArrayList());
- statsAndCountsByLevel.get(keyCardinality).get(usageCount).add(partitionKey);
- }
- StringBuilder statisticsBuilder = new StringBuilder();
- statisticsBuilder.append("Statistics:\n");
- for (TLongObjectIterator>> statsByLevelIt = statsAndCountsByLevel.iterator(); statsByLevelIt.hasNext(); ) {
- statsByLevelIt.advance();
- long levelCardinality = statsByLevelIt.key();
- statisticsBuilder.append(String.format("%d attributes {\n", Long.valueOf(levelCardinality)));
- for (TIntObjectIterator> countByLevelIt = statsByLevelIt.value().iterator(); countByLevelIt.hasNext(); ) {
- countByLevelIt.advance();
- int usageCount = countByLevelIt.key();
- int numberOfElements = countByLevelIt.value().size();
- statisticsBuilder.append(String.format("\t%d elements used %d times\n", Integer.valueOf(numberOfElements), Integer.valueOf(usageCount)));
- }
- statisticsBuilder.append("}\n");
- }
-
- return statisticsBuilder.toString();
- }
-}
+package fdiscovery.partitions;
+
+import java.util.ArrayList;
+
+import fdiscovery.columns.ColumnCollection;
+import gnu.trove.iterator.TIntObjectIterator;
+import gnu.trove.iterator.TLongObjectIterator;
+import gnu.trove.map.hash.TIntObjectHashMap;
+import gnu.trove.map.hash.TLongObjectHashMap;
+import gnu.trove.map.hash.TObjectIntHashMap;
+
+/**
+ * Tracks how often each partition (keyed by its column index set) was used
+ * and renders usage statistics grouped by key cardinality.
+ *
+ * The generic type parameters below were reconstructed from usage; the
+ * captured diff had them stripped (raw {@code TObjectIntHashMap},
+ * {@code TLongObjectHashMap>>}, ...).
+ */
+public class PartitionStatistics extends TObjectIntHashMap<ColumnCollection> {
+
+	/**
+	 * Builds a human-readable summary: for each LHS size (number of set
+	 * attributes) it reports how many partition keys were used how often.
+	 */
+	public String getStatistics() {
+		// level (key cardinality) -> usage count -> partition keys with that count
+		TLongObjectHashMap<TIntObjectHashMap<ArrayList<ColumnCollection>>> statsAndCountsByLevel = new TLongObjectHashMap<>();
+		for (ColumnCollection partitionKey : this.keySet()) {
+			long keyCardinality = partitionKey.cardinality();
+			int usageCount = this.get(partitionKey);
+			statsAndCountsByLevel.putIfAbsent(keyCardinality, new TIntObjectHashMap<ArrayList<ColumnCollection>>());
+			statsAndCountsByLevel.get(keyCardinality).putIfAbsent(usageCount, new ArrayList<ColumnCollection>());
+			statsAndCountsByLevel.get(keyCardinality).get(usageCount).add(partitionKey);
+		}
+		StringBuilder statisticsBuilder = new StringBuilder();
+		statisticsBuilder.append("Statistics:\n");
+		for (TLongObjectIterator<TIntObjectHashMap<ArrayList<ColumnCollection>>> statsByLevelIt = statsAndCountsByLevel.iterator(); statsByLevelIt.hasNext(); ) {
+			statsByLevelIt.advance();
+			long levelCardinality = statsByLevelIt.key();
+			statisticsBuilder.append(String.format("%d attributes {\n", Long.valueOf(levelCardinality)));
+			for (TIntObjectIterator<ArrayList<ColumnCollection>> countByLevelIt = statsByLevelIt.value().iterator(); countByLevelIt.hasNext(); ) {
+				countByLevelIt.advance();
+				int usageCount = countByLevelIt.key();
+				int numberOfElements = countByLevelIt.value().size();
+				statisticsBuilder.append(String.format("\t%d elements used %d times\n", Integer.valueOf(numberOfElements), Integer.valueOf(usageCount)));
+			}
+			statisticsBuilder.append("}\n");
+		}
+
+		return statisticsBuilder.toString();
+	}
+}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java
index 8c527a5..f812e92 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java
@@ -1,78 +1,78 @@
-package fdiscovery.partitions;
-
-import fdiscovery.equivalence.EquivalenceGroupTIntHashSet;
-import fdiscovery.equivalence.TEquivalence;
-import gnu.trove.iterator.TIntIterator;
-import gnu.trove.map.hash.TObjectIntHashMap;
-
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.Map;
-import java.util.TreeSet;
-
-public class StrippedPartition extends TreeSet<TEquivalence> {
-
- private static final long serialVersionUID = -10500424753490842L;
-
- // constructor for TANEs strippedProduct
- public StrippedPartition() {
-
- }
-
- @SuppressWarnings("unused")
- public StrippedPartition(StrippedPartition base, StrippedPartition additional) {
-
- }
-
- public StrippedPartition(String[] columnContent) {
-		TObjectIntHashMap<String> valueToIndex = new TObjectIntHashMap<>();
-		LinkedHashMap<Integer, TEquivalence> helpMap = new LinkedHashMap<>();
-
- for (int rowIndex = 0; rowIndex < columnContent.length; rowIndex++) {
- String value = columnContent[rowIndex];
- // if the value wasn't there yet, the row index becomes the representative
- // for that equivalence class
- if (!valueToIndex.containsKey(value)) {
- valueToIndex.put(value, rowIndex);
- TEquivalence equivalenceGroup = new EquivalenceGroupTIntHashSet();
- equivalenceGroup.add(rowIndex);
- helpMap.put(Integer.valueOf(rowIndex), equivalenceGroup);
- }
- // otherwise find the right equivalence class and add the current element index
- else {
- int equivalenceGroupIndex = valueToIndex.get(value);
- TEquivalence equivalenceClass = helpMap.get(Integer.valueOf(equivalenceGroupIndex));
- equivalenceClass.add(rowIndex);
- }
- }
- // remove equivalence classes with only one element
-		for(Iterator<Map.Entry<Integer, TEquivalence>> it=helpMap.entrySet().iterator(); it.hasNext();) {
-			Map.Entry<Integer, TEquivalence> entry = it.next();
- if (entry.getValue().size() <= 1) {
- it.remove();
- }
- }
-
- // sort the stripped partition by equivalence group sizes
- this.addAll(helpMap.values());
- }
-
- @Override
- public String toString() {
- StringBuilder outputBuilder = new StringBuilder();
- outputBuilder.append("{");
-
- for(TEquivalence entry : this) {
- outputBuilder.append("{");
- for (TIntIterator valueIt=entry.iterator(); valueIt.hasNext(); ) {
-// for (TIntIteratorInteger value : entry) {
- outputBuilder.append(valueIt.next());
- outputBuilder.append(",");
- }
- outputBuilder.append("}");
- }
- outputBuilder.append("}");
-
- return outputBuilder.toString();
- }
-}
+package fdiscovery.partitions;
+
+import fdiscovery.equivalence.EquivalenceGroupTIntHashSet;
+import fdiscovery.equivalence.TEquivalence;
+import gnu.trove.iterator.TIntIterator;
+import gnu.trove.map.hash.TObjectIntHashMap;
+
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.Map;
+import java.util.TreeSet;
+
+public class StrippedPartition extends TreeSet<TEquivalence> {
+
+ private static final long serialVersionUID = -10500424753490842L;
+
+ // constructor for TANEs strippedProduct
+ public StrippedPartition() {
+
+ }
+
+ @SuppressWarnings("unused")
+ public StrippedPartition(StrippedPartition base, StrippedPartition additional) {
+
+ }
+
+ public StrippedPartition(String[] columnContent) {
+		TObjectIntHashMap<String> valueToIndex = new TObjectIntHashMap<>();
+		LinkedHashMap<Integer, TEquivalence> helpMap = new LinkedHashMap<>();
+
+ for (int rowIndex = 0; rowIndex < columnContent.length; rowIndex++) {
+ String value = columnContent[rowIndex];
+ // if the value wasn't there yet, the row index becomes the representative
+ // for that equivalence class
+ if (!valueToIndex.containsKey(value)) {
+ valueToIndex.put(value, rowIndex);
+ TEquivalence equivalenceGroup = new EquivalenceGroupTIntHashSet();
+ equivalenceGroup.add(rowIndex);
+ helpMap.put(Integer.valueOf(rowIndex), equivalenceGroup);
+ }
+ // otherwise find the right equivalence class and add the current element index
+ else {
+ int equivalenceGroupIndex = valueToIndex.get(value);
+ TEquivalence equivalenceClass = helpMap.get(Integer.valueOf(equivalenceGroupIndex));
+ equivalenceClass.add(rowIndex);
+ }
+ }
+ // remove equivalence classes with only one element
+		for(Iterator<Map.Entry<Integer, TEquivalence>> it=helpMap.entrySet().iterator(); it.hasNext();) {
+			Map.Entry<Integer, TEquivalence> entry = it.next();
+ if (entry.getValue().size() <= 1) {
+ it.remove();
+ }
+ }
+
+ // sort the stripped partition by equivalence group sizes
+ this.addAll(helpMap.values());
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder outputBuilder = new StringBuilder();
+ outputBuilder.append("{");
+
+ for(TEquivalence entry : this) {
+ outputBuilder.append("{");
+ for (TIntIterator valueIt=entry.iterator(); valueIt.hasNext(); ) {
+// for (TIntIteratorInteger value : entry) {
+ outputBuilder.append(valueIt.next());
+ outputBuilder.append(",");
+ }
+ outputBuilder.append("}");
+ }
+ outputBuilder.append("}");
+
+ return outputBuilder.toString();
+ }
+}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java
index d354e37..205b4d0 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java
@@ -24,7 +24,7 @@ public THashSet<ColumnCollection> getUncheckedMaximalSubsets(ColumnCollection lh
 		THashSet<ColumnCollection> uncheckedMaximalSubsets = new THashSet<>();
// if (lhs.cardinality() > 2) {
- for (int columnIndex : order.getOrderHighDistinctCount(lhs)) {
+ for (int columnIndex : order.getOrderHighDistinctCount(lhs)) {
ColumnCollection subsetIndices = lhs.removeColumnCopy(columnIndex);
if (!this.containsKey(subsetIndices)) {
uncheckedMaximalSubsets.add(subsetIndices);
@@ -39,7 +39,7 @@ public THashSet<ColumnCollection> getUncheckedOrCandidateMaximalSubsets(ColumnCo
// we only want to check subsets with at least 2 columns
if (lhs.cardinality() > 2) {
- for (int columnIndex : order.getOrderHighDistinctCount(lhs)) {
+ for (int columnIndex : order.getOrderHighDistinctCount(lhs)) {
ColumnCollection subsetIndices = lhs.removeColumnCopy(columnIndex);
if (!this.containsKey(subsetIndices) || this.get(subsetIndices) == Observation.CANDIDATE_MINIMAL_DEPENDENCY) {
uncheckedMaximalSubsets.add(subsetIndices);
@@ -54,7 +54,7 @@ public THashSet<ColumnCollection> getMaximalSubsets(ColumnCollection lhs, Column
// we only want to check subsets with at least 2 columns
if (lhs.cardinality() > 2) {
- for (int columnIndex : order.getOrderHighDistinctCount(lhs)) {
+ for (int columnIndex : order.getOrderHighDistinctCount(lhs)) {
ColumnCollection subsetIndices = lhs.removeColumnCopy(columnIndex);
uncheckedMaximalSubsets.add(subsetIndices);
}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java
index 1a11432..25f43d0 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java
@@ -1,70 +1,70 @@
-package fdiscovery.pruning;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-
-import fdiscovery.columns.ColumnCollection;
-
-public class PruneHashSet extends HashMap<ColumnCollection, HashSet<ColumnCollection>> implements PruneInterface {
-
- private static final long serialVersionUID = 8012444410589325434L;
-
- public PruneHashSet(int numberOfColumns) {
- super(numberOfColumns);
- ColumnCollection key = new ColumnCollection(numberOfColumns);
- for (int columnIndex = 0; columnIndex < numberOfColumns; columnIndex++) {
-			this.put(key.setCopy(columnIndex), new HashSet<ColumnCollection>());
- }
- }
-
-	public static ColumnCollection getNotPrunedKey(Dependencies dependencies, NonDependencies nonDependencies, ArrayList<ColumnCollection> candidates) {
- for (ColumnCollection candidate : candidates) {
- if (!dependencies.isRepresented(candidate) && !nonDependencies.isRepresented(candidate)) {
- return candidate;
- }
- }
- return null;
- }
-
- @Override
- public void rebalance() {
- boolean rebalancedGroup = false;
-
- do {
- rebalancedGroup = false;
-			ArrayList<ColumnCollection> groupKeys = new ArrayList<>(this.keySet());
- for (ColumnCollection key : groupKeys) {
- if (this.get(key).size() > SPLIT_THRESHOLD) {
- rebalanceGroup(key);
- rebalancedGroup = true;
- }
- }
- } while (rebalancedGroup);
- }
-
- @Override
- public void rebalanceGroup(ColumnCollection groupKey) {
-		HashSet<ColumnCollection> depsOfGroup = this.get(groupKey);
- for (int columnIndex : groupKey.complementCopy().getSetBits()) {
- ColumnCollection newKey = groupKey.setCopy(columnIndex);
-			HashSet<ColumnCollection> newGroup = new HashSet<ColumnCollection>();
- this.put(newKey, newGroup);
-
- for (ColumnCollection depOfGroup : depsOfGroup) {
- // when splitting a group it cannot contain the key itself
- // because otherwise the group cannot contain any other
- // element since it would be a superset of the key and be pruned
- // OR
- // when splitting a group it cannot contain the key itself
- // because otherwise all supersets of the key would have
- // been pruned and it wouldn't need to be split
- if (newKey.isSubsetOf(depOfGroup)) {
- newGroup.add(depOfGroup);
- }
- }
- }
- // remove the old group
- this.remove(groupKey);
- }
-}
+package fdiscovery.pruning;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import fdiscovery.columns.ColumnCollection;
+
+public class PruneHashSet extends HashMap<ColumnCollection, HashSet<ColumnCollection>> implements PruneInterface {
+
+ private static final long serialVersionUID = 8012444410589325434L;
+
+ public PruneHashSet(int numberOfColumns) {
+ super(numberOfColumns);
+ ColumnCollection key = new ColumnCollection(numberOfColumns);
+ for (int columnIndex = 0; columnIndex < numberOfColumns; columnIndex++) {
+			this.put(key.setCopy(columnIndex), new HashSet<ColumnCollection>());
+ }
+ }
+
+	public static ColumnCollection getNotPrunedKey(Dependencies dependencies, NonDependencies nonDependencies, ArrayList<ColumnCollection> candidates) {
+ for (ColumnCollection candidate : candidates) {
+ if (!dependencies.isRepresented(candidate) && !nonDependencies.isRepresented(candidate)) {
+ return candidate;
+ }
+ }
+ return null;
+ }
+
+ @Override
+ public void rebalance() {
+ boolean rebalancedGroup = false;
+
+ do {
+ rebalancedGroup = false;
+			ArrayList<ColumnCollection> groupKeys = new ArrayList<>(this.keySet());
+ for (ColumnCollection key : groupKeys) {
+ if (this.get(key).size() > SPLIT_THRESHOLD) {
+ rebalanceGroup(key);
+ rebalancedGroup = true;
+ }
+ }
+ } while (rebalancedGroup);
+ }
+
+ @Override
+ public void rebalanceGroup(ColumnCollection groupKey) {
+		HashSet<ColumnCollection> depsOfGroup = this.get(groupKey);
+ for (int columnIndex : groupKey.complementCopy().getSetBits()) {
+ ColumnCollection newKey = groupKey.setCopy(columnIndex);
+			HashSet<ColumnCollection> newGroup = new HashSet<ColumnCollection>();
+ this.put(newKey, newGroup);
+
+ for (ColumnCollection depOfGroup : depsOfGroup) {
+ // when splitting a group it cannot contain the key itself
+ // because otherwise the group cannot contain any other
+ // element since it would be a superset of the key and be pruned
+ // OR
+ // when splitting a group it cannot contain the key itself
+ // because otherwise all supersets of the key would have
+ // been pruned and it wouldn't need to be split
+ if (newKey.isSubsetOf(depOfGroup)) {
+ newGroup.add(depOfGroup);
+ }
+ }
+ }
+ // remove the old group
+ this.remove(groupKey);
+ }
+}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java
index cf045e2..f6fe76a 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java
@@ -1,37 +1,37 @@
-package fdiscovery.pruning;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-
-import fdiscovery.columns.ColumnCollection;
-
-// from rhs to lhs
-public abstract class PruneTable extends HashMap<ColumnCollection, HashMap<Integer, ArrayList<ColumnCollection>>> {
-
- private static final long serialVersionUID = 4470955427882698208L;
-
- public int getCount(ColumnCollection RHS) {
- int count = 0;
- if (this.containsKey(RHS)) {
-			for (ArrayList<ColumnCollection> collection : this.get(RHS).values()) {
- count += collection.size();
- }
- }
- return count;
- }
-
-
- public void addValue(ColumnCollection RHS, ColumnCollection LHS) {
- if (!this.containsKey(RHS)) {
-			this.put(RHS, new HashMap<Integer, ArrayList<ColumnCollection>>());
- }
- if (!this.get(RHS).containsKey(Integer.valueOf(LHS.cardinality()))) {
-			this.get(RHS).put(Integer.valueOf(LHS.cardinality()), new ArrayList<ColumnCollection>());
- }
-// System.out.println(this.get(RHS));
-// System.out.println(String.format("Column:\t%s\t%d", LHS, LHS.cardinality()));
-		ArrayList<ColumnCollection> dependencies = this.get(RHS).get(Integer.valueOf(LHS.cardinality()));
-// System.out.println(dependencies);
- dependencies.add(LHS);
- }
-}
+package fdiscovery.pruning;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+import fdiscovery.columns.ColumnCollection;
+
+// from rhs to lhs
+public abstract class PruneTable extends HashMap<ColumnCollection, HashMap<Integer, ArrayList<ColumnCollection>>> {
+
+ private static final long serialVersionUID = 4470955427882698208L;
+
+ public int getCount(ColumnCollection RHS) {
+ int count = 0;
+ if (this.containsKey(RHS)) {
+			for (ArrayList<ColumnCollection> collection : this.get(RHS).values()) {
+ count += collection.size();
+ }
+ }
+ return count;
+ }
+
+
+ public void addValue(ColumnCollection RHS, ColumnCollection LHS) {
+ if (!this.containsKey(RHS)) {
+			this.put(RHS, new HashMap<Integer, ArrayList<ColumnCollection>>());
+ }
+ if (!this.get(RHS).containsKey(Integer.valueOf(LHS.cardinality()))) {
+			this.get(RHS).put(Integer.valueOf(LHS.cardinality()), new ArrayList<ColumnCollection>());
+ }
+// System.out.println(this.get(RHS));
+// System.out.println(String.format("Column:\t%s\t%d", LHS, LHS.cardinality()));
+		ArrayList<ColumnCollection> dependencies = this.get(RHS).get(Integer.valueOf(LHS.cardinality()));
+// System.out.println(dependencies);
+ dependencies.add(LHS);
+ }
+}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java b/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java
index 0909653..5740f8b 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java
@@ -1,431 +1,431 @@
-package fdiscovery.tane.runner;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
-
-import org.apache.commons.cli.CommandLine;
-
-import fdiscovery.columns.ColumnCollection;
-
-import com.rits.cloning.Cloner;
-
-import fdiscovery.equivalence.EquivalenceGroupTIntHashSet;
-import fdiscovery.equivalence.TEquivalence;
-import fdiscovery.partitions.StrippedPartition;
-import fdiscovery.partitions.StrippedPartitions;
-import fdiscovery.preprocessing.SVFileProcessor;
-import fdiscovery.tane.AprioriGeneration;
-import fdiscovery.general.CLIParserMiner;
-import fdiscovery.general.CollectionSet;
-import fdiscovery.general.ColumnFiles;
-import fdiscovery.general.FunctionalDependencies;
-import fdiscovery.general.Miner;
-import gnu.trove.iterator.TIntIterator;
-import gnu.trove.map.hash.THashMap;
-
-public class Tane extends Miner {
-
- private int numberOfColumns;
- private int numberOfRows;
- private int[] T, Te;
- private FunctionalDependencies minimalDependencies;
- private StrippedPartitions strippedPartitions;
-	private HashMap<ColumnCollection, ColumnCollection> cPlus;
-	private ArrayList<CollectionSet<ColumnCollection>> levels;
- private ColumnCollection rSet;
-
- public FunctionalDependencies getDependencies() {
- return this.minimalDependencies;
- }
-
- @SuppressWarnings("unused")
- public static void main2(String[] args) {
- createColumDirectory();
- createResultDirectory();
-
- File source = new File(Miner.input);
- SVFileProcessor inputFileProcessor = null;
- try {
- long timeStart = System.currentTimeMillis();
-
- inputFileProcessor = new SVFileProcessor(source);
- inputFileProcessor.init();
- System.out.println("TANE");
- System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter());
- System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns());
- System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows());
- inputFileProcessor.createColumnFiles();
- Tane taneRunner = new Tane(inputFileProcessor);
- taneRunner.run();
-
- System.out.println(String.format("Number of dependencies:\t%d", Integer.valueOf(taneRunner.minimalDependencies.getCount())));;
- long timeFindFDs = System.currentTimeMillis();
- System.out.println("Total time:\t" + (timeFindFDs - timeStart)/1000 + "s");
- System.out.println(taneRunner.getDependencies());
-
- } catch (FileNotFoundException e) {
- System.out.println("The input file could not be found.");
- } catch (IOException e) {
- System.out.println("The input reader could not be reset.");
- }
- }
-
- public static void main(String[] args) {
- CLIParserMiner parser = new CLIParserMiner();
- CommandLine cli = parser.parse(args);
- String inputFilename = new String();
- String columnFileDirectory = new String();
- String resultFile = new String();
- int numberOfColumns = 0;
- int numberOfRows = 0;
-
- if (cli.hasOption("file")) {
- inputFilename = cli.getOptionValue("file");
- }
- if (cli.hasOption("input")) {
- columnFileDirectory = cli.getOptionValue("input");
- }
- if (cli.hasOption("result")) {
- resultFile = cli.getOptionValue("result");
- }
- if (cli.hasOption("columns")) {
- numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue();
- }
- if (cli.hasOption("rows")) {
- numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue();
- }
- ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows);
- long timeStart = System.currentTimeMillis();
- try {
- Tane runner = new Tane(columnFiles, numberOfRows);
- runner.run();
- long timeEnd = System.currentTimeMillis();
- runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename);
- } catch(OutOfMemoryError e) {
- System.exit(Miner.STATUS_OOM);
- }
- System.exit(0);
- }
-
- private void writeOutputSuccessful(String outputFile, long time, String inputFileName) {
- String timeString = (time != -1)? String.format("%.1f", Double.valueOf((double)(time) / 1000)) : "-1";
-
- StringBuilder outputBuilder = new StringBuilder();
- if (!inputFileName.isEmpty()) {
- outputBuilder.append(String.format("%s\t", inputFileName));
- }
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows)));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns)));
- outputBuilder.append(String.format("%s\t", timeString));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount())));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5))));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.strippedPartitions.size())));
- outputBuilder.append(String.format("%d\t", Integer.valueOf(this.strippedPartitions.size())));
- outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory())));
- outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false)));
-
- try {
- BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true));
- resultFileWriter.write(outputBuilder.toString());
- System.out.print(outputBuilder.toString());
- resultFileWriter.close();
- } catch (IOException e) {
- System.out.println("Couldn't write output.");
- }
- }
-
- public Tane(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError {
- this.numberOfColumns = columnFiles.getNumberOfColumns();
- this.numberOfRows = numberOfRows;
- this.minimalDependencies = new FunctionalDependencies();
- this.strippedPartitions = new StrippedPartitions(columnFiles);
- columnFiles.clear();
- }
-
-
- public Tane(SVFileProcessor table) throws OutOfMemoryError {
- this.numberOfColumns = table.getNumberOfColumns();
- this.numberOfRows = table.getNumberOfRows();
- this.minimalDependencies = new FunctionalDependencies();
- this.strippedPartitions = new StrippedPartitions(table.getColumnFiles());
- }
-
-	public THashMap<ColumnCollection, ColumnCollection> run() throws OutOfMemoryError {
-
- levels = new ArrayList<>();
- cPlus = new HashMap<>();
-
- // Level 0 is the empty set
-		levels.add(new CollectionSet<ColumnCollection>());
- // Level 1 initialization
-		levels.add(new CollectionSet<ColumnCollection>());
-
- ColumnCollection emptyLHSSet = new ColumnCollection(this.numberOfColumns);
- rSet = new ColumnCollection(this.numberOfColumns);
-
- cPlus.put(emptyLHSSet, rSet);
-
- this.T = new int[this.numberOfRows + 1];
- this.Te = new int[this.numberOfRows + 1];
- // initialize T to all -1, because it is specified to be all "NULL"
- // (!=0) in TANE
- for (int i = 0; i < T.length; i++) {
- T[i] = -1;
- }
-
- // Initialization
- for (int i = 0; i < this.numberOfColumns; i++) {
- // set all bits in R
- rSet.set(i);
- // build atomic attribute-sets
- ColumnCollection subset = new ColumnCollection(this.numberOfColumns);
- subset.set(i);
- // add to first level
- levels.get(1).add(subset);
- }
-
- // main algorithm
- int level = 1;
- while (!levels.get(level).isEmpty()) {
-// System.out.println("Level:\t" + level);
- this.computeDependencies(levels.get(level));
- this.prune(levels.get(level));
- levels.add(this.generateNextLevel(levels.get(level)));
- levels.get(level).clear();
- level++;
- }
- return minimalDependencies;
- }
-
-	private CollectionSet<ColumnCollection> generateNextLevel(CollectionSet<ColumnCollection> currentLevel) {
-		CollectionSet<ColumnCollection> nextLevel = new CollectionSet<>();
-
-		Cloner cloner = new Cloner();
-		AprioriGeneration<ColumnCollection> prefixBlockGenerator = new AprioriGeneration<>(cloner.deepClone(currentLevel));
-		for (CollectionSet<ColumnCollection> k : prefixBlockGenerator.prefixBlocks()) {
- for (ColumnCollection y : k) {
- for (ColumnCollection z : k.tailSet(y)) {
- ColumnCollection x = y.orCopy(z);
- boolean xInNextLevel = true;
- for (int a : x.getSetBits()) {
- x.clear(a);
- if (!currentLevel.contains(x)) {
- xInNextLevel = false;
- break;
- }
- x.set(a);
- }
- if (xInNextLevel) {
- nextLevel.add(x);
- strippedPartitions.put(x, strippedProduct(strippedPartitions.get(y), strippedPartitions.get(z)));
- }
- }
- }
- }
-
- return nextLevel;
- }
-
-	private void computeDependencies(CollectionSet<ColumnCollection> currentLevel) {
- for (ColumnCollection x : currentLevel) {
- addCPlusOfX(x);
- }
-
- for (ColumnCollection x : currentLevel) {
- for (int a : x.andCopy(cPlus.get(x)).getSetBits()) {
- boolean isDependency = isValidDependency(x.clearCopy(a), Integer.valueOf(a));
-
- if (isDependency) {
- minimalDependencies.addRHSColumn(x.clearCopy(a), a);
- cPlus.get(x).clear(a);
-
- for (int B : rSet.removeCopy(x).getSetBits()) {
- cPlus.get(x).clear(B);
- }
- }
- }
-
- }
- }
-
- private ColumnCollection addCPlusOfX(ColumnCollection x) {
- ColumnCollection cPlusOfX = cPlus.get(x.clearCopy(x.nextSetBit(0)));
-
- // if cPlusOfX was not in the list it has to be computed recursively
- if (cPlusOfX == null) {
- cPlusOfX = (ColumnCollection) addCPlusOfX(x.clearCopy(x.nextSetBit(0))).clone();
- } else {
- cPlusOfX = (ColumnCollection) cPlusOfX.clone();
- }
- for (int a : x.getSetBits()) {
- ColumnCollection nextCPlusOfX = cPlus.get(x.clearCopy(a));
-
- if (nextCPlusOfX == null) {
- nextCPlusOfX = (ColumnCollection) addCPlusOfX(x.clearCopy(a)).clone();
- } else {
- nextCPlusOfX = (ColumnCollection) nextCPlusOfX.clone();
- }
-
- cPlusOfX.and(nextCPlusOfX);
- }
- cPlus.put(x, cPlusOfX);
-
- return cPlusOfX;
- }
-
-	private void prune(CollectionSet<ColumnCollection> currentLevel) {
-		Iterator<ColumnCollection> currentLevelIterator = currentLevel.iterator();
-
- while (currentLevelIterator.hasNext()) {
- ColumnCollection x = currentLevelIterator.next();
-
- ColumnCollection cPlusOfX = cPlus.get(x);
- if (cPlusOfX == null) {
- cPlusOfX = addCPlusOfX(x);
- }
-
- if (cPlusOfX.isEmpty()) {
- currentLevelIterator.remove();
- continue;
- }
-
- boolean isSuperKey = isSuperKey(x);
- if (isSuperKey) {
- for (int a : cPlus.get(x).removeCopy(x).getSetBits()) {
- ColumnCollection firstCPlusCandidatesKey = x.setCopy(a).clearCopy(x.nextSetBit(0));
- ColumnCollection firstCPlusCandidates = cPlus.get(firstCPlusCandidatesKey);
- if (firstCPlusCandidates == null) {
- firstCPlusCandidates = (ColumnCollection) addCPlusOfX(firstCPlusCandidatesKey).clone();
- } else {
- firstCPlusCandidates = (ColumnCollection) firstCPlusCandidates.clone();
- }
- for (int b : x.getSetBits()) {
-
- ColumnCollection nextCPlusCandidates = cPlus.get(x.setCopy(a).clearCopy(b));
- if (nextCPlusCandidates == null) {
- nextCPlusCandidates = (ColumnCollection) addCPlusOfX(x.setCopy(a).clearCopy(b)).clone();
- } else {
- nextCPlusCandidates = (ColumnCollection) nextCPlusCandidates.clone();
- }
-
- firstCPlusCandidates.and(nextCPlusCandidates);
- }
- if (firstCPlusCandidates.get(a)) {
- minimalDependencies.addRHSColumn(x, a);
- }
- }
- currentLevelIterator.remove();
- }
- }
- }
-
- protected boolean isSuperKey(ColumnCollection LHS) {
- StrippedPartition partitionOfX = strippedPartitions.get(LHS);
-
- int sumOfSizesOfEquivalenceClasses = 0;
- int numberOfEquivalenceClasses = 0;
-
- for (TEquivalence equivalenceGroup : partitionOfX) {
- sumOfSizesOfEquivalenceClasses += equivalenceGroup.size();
- numberOfEquivalenceClasses++;
- }
-
- // equation (1) in the paper
- boolean result = (((sumOfSizesOfEquivalenceClasses - numberOfEquivalenceClasses) / (double) this.numberOfColumns) == 0);
-
- return result;
- }
-
- private double error(StrippedPartition xPartition, StrippedPartition xUnionAPartition) {
- int e = 0;
-
- for (TEquivalence equivalenceGroup : xUnionAPartition) {
- Te[equivalenceGroup.getIdentifier()] = equivalenceGroup.size();
- }
- for (TEquivalence equivalenceGroup : xPartition) {
- int m = 1;
-
- for (TIntIterator tIt=equivalenceGroup.iterator(); tIt.hasNext(); ) {
-// for (Integer t : equivalenceGroup) {
- m = Math.max(m, Te[tIt.next()]);
- }
- e = e + equivalenceGroup.size() - m;
-
- }
- for (TEquivalence equivalenceGroup : xUnionAPartition) {
- Te[equivalenceGroup.getIdentifier()] = 0;
- }
-
- return (double)e / this.numberOfRows;
- }
-
-
- private boolean isValidDependency(ColumnCollection LHS, Integer RHS) {
- if (LHS.isEmpty()) {
- return false;
- }
-
- return (this.error(strippedPartitions.get(LHS), strippedPartitions.get(LHS.setCopy(RHS.intValue()))) == 0);
- }
-
- public StrippedPartition strippedProduct(StrippedPartition yPartition, StrippedPartition zPartition) {
- StrippedPartition xPartition = new StrippedPartition();
-		HashMap<Integer, TEquivalence> S = new HashMap<>();
-
- if (yPartition.size() > zPartition.size()) {
- StrippedPartition swap = zPartition;
- zPartition = yPartition;
- yPartition = swap;
- }
-
- // build some kind of probe table
- int i = 1;
- for (TEquivalence cI : yPartition) {
- for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) {
- int tValue = tIt.next();
- T[tValue] = i;
-
- }
- S.put(Integer.valueOf(i), new EquivalenceGroupTIntHashSet());
- i++;
- }
-
- for (TEquivalence cI : zPartition) {
- for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) {
- int tValue = tIt.next();
- if (T[tValue] != -1) {
- TEquivalence sOld = S.get(Integer.valueOf(T[tValue]));
- sOld.add(tValue);
- }
- }
- for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) {
- int tValue = tIt.next();
- TEquivalence s = S.get(Integer.valueOf(T[tValue]));
- if (s != null && s.size() > 1) {
- xPartition.add(s);
- }
- S.put(Integer.valueOf(T[tValue]), new EquivalenceGroupTIntHashSet());
- }
- }
- i = 1;
- for (TEquivalence cI : yPartition) {
- for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) {
- int tValue = tIt.next();
- T[tValue] = -1;
- }
- }
-
- return xPartition;
- }
-}
+package fdiscovery.tane.runner;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+
+import org.apache.commons.cli.CommandLine;
+
+import fdiscovery.columns.ColumnCollection;
+
+import com.rits.cloning.Cloner;
+
+import fdiscovery.equivalence.EquivalenceGroupTIntHashSet;
+import fdiscovery.equivalence.TEquivalence;
+import fdiscovery.partitions.StrippedPartition;
+import fdiscovery.partitions.StrippedPartitions;
+import fdiscovery.preprocessing.SVFileProcessor;
+import fdiscovery.tane.AprioriGeneration;
+import fdiscovery.general.CLIParserMiner;
+import fdiscovery.general.CollectionSet;
+import fdiscovery.general.ColumnFiles;
+import fdiscovery.general.FunctionalDependencies;
+import fdiscovery.general.Miner;
+import gnu.trove.iterator.TIntIterator;
+import gnu.trove.map.hash.THashMap;
+
+public class Tane extends Miner {
+
+ private int numberOfColumns;
+ private int numberOfRows;
+ private int[] T, Te;
+ private FunctionalDependencies minimalDependencies;
+ private StrippedPartitions strippedPartitions;
+	private HashMap<ColumnCollection, ColumnCollection> cPlus;
+	private ArrayList<CollectionSet<ColumnCollection>> levels;
+ private ColumnCollection rSet;
+
+ public FunctionalDependencies getDependencies() {
+ return this.minimalDependencies;
+ }
+
+ @SuppressWarnings("unused")
+ public static void main2(String[] args) {
+ createColumDirectory();
+ createResultDirectory();
+
+ File source = new File(Miner.input);
+ SVFileProcessor inputFileProcessor = null;
+ try {
+ long timeStart = System.currentTimeMillis();
+
+ inputFileProcessor = new SVFileProcessor(source);
+ inputFileProcessor.init();
+ System.out.println("TANE");
+ System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter());
+ System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns());
+ System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows());
+ inputFileProcessor.createColumnFiles();
+ Tane taneRunner = new Tane(inputFileProcessor);
+ taneRunner.run();
+
+ System.out.println(String.format("Number of dependencies:\t%d", Integer.valueOf(taneRunner.minimalDependencies.getCount())));;
+ long timeFindFDs = System.currentTimeMillis();
+ System.out.println("Total time:\t" + (timeFindFDs - timeStart)/1000 + "s");
+ System.out.println(taneRunner.getDependencies());
+
+ } catch (FileNotFoundException e) {
+ System.out.println("The input file could not be found.");
+ } catch (IOException e) {
+ System.out.println("The input reader could not be reset.");
+ }
+ }
+
+ public static void main(String[] args) {
+ CLIParserMiner parser = new CLIParserMiner();
+ CommandLine cli = parser.parse(args);
+ String inputFilename = new String();
+ String columnFileDirectory = new String();
+ String resultFile = new String();
+ int numberOfColumns = 0;
+ int numberOfRows = 0;
+
+ if (cli.hasOption("file")) {
+ inputFilename = cli.getOptionValue("file");
+ }
+ if (cli.hasOption("input")) {
+ columnFileDirectory = cli.getOptionValue("input");
+ }
+ if (cli.hasOption("result")) {
+ resultFile = cli.getOptionValue("result");
+ }
+ if (cli.hasOption("columns")) {
+ numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue();
+ }
+ if (cli.hasOption("rows")) {
+ numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue();
+ }
+ ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows);
+ long timeStart = System.currentTimeMillis();
+ try {
+ Tane runner = new Tane(columnFiles, numberOfRows);
+ runner.run();
+ long timeEnd = System.currentTimeMillis();
+ runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename);
+ } catch(OutOfMemoryError e) {
+ System.exit(Miner.STATUS_OOM);
+ }
+ System.exit(0);
+ }
+
+	/**
+	 * Appends one tab-separated statistics line (row/column counts, runtime,
+	 * dependency counts bucketed by LHS size, partition count, memory) to the
+	 * result file and echoes it to stdout.
+	 *
+	 * @param outputFile    path of the result file (opened in append mode)
+	 * @param time          elapsed milliseconds, or -1 when no timing is available
+	 * @param inputFileName name of the analyzed input; prepended when non-empty
+	 */
+	private void writeOutputSuccessful(String outputFile, long time, String inputFileName) {
+		// -1 signals "no timing"; otherwise report seconds with one decimal place
+		String timeString = (time != -1) ? String.format("%.1f", Double.valueOf((double) (time) / 1000)) : "-1";
+
+		StringBuilder outputBuilder = new StringBuilder();
+		if (!inputFileName.isEmpty()) {
+			outputBuilder.append(String.format("%s\t", inputFileName));
+		}
+		outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows)));
+		outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns)));
+		outputBuilder.append(String.format("%s\t", timeString));
+		outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount())));
+		outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2))));
+		outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3))));
+		outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4))));
+		outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5))));
+		outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6))));
+		outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5))));
+		// NOTE(review): the partition count is emitted twice in a row; kept to
+		// preserve the original output format — confirm whether the second
+		// column was meant to report a different statistic.
+		outputBuilder.append(String.format("%d\t", Integer.valueOf(this.strippedPartitions.size())));
+		outputBuilder.append(String.format("%d\t", Integer.valueOf(this.strippedPartitions.size())));
+		outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory())));
+		outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false)));
+
+		// try-with-resources guarantees the writer is closed even when the
+		// write itself fails (the original leaked it on IOException)
+		try (BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true))) {
+			resultFileWriter.write(outputBuilder.toString());
+			System.out.print(outputBuilder.toString());
+		} catch (IOException e) {
+			System.out.println("Couldn't write output.");
+		}
+	}
+
+	/**
+	 * Builds a TANE runner from pre-computed column files.
+	 *
+	 * @param columnFiles  per-column value files; cleared after the stripped
+	 *                     partitions have been derived from them
+	 * @param numberOfRows number of rows in the input relation
+	 * @throws OutOfMemoryError when partition construction exhausts the heap
+	 */
+	public Tane(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError {
+		this.minimalDependencies = new FunctionalDependencies();
+		this.numberOfRows = numberOfRows;
+		this.numberOfColumns = columnFiles.getNumberOfColumns();
+		this.strippedPartitions = new StrippedPartitions(columnFiles);
+		// the raw column data is no longer needed once the partitions exist
+		columnFiles.clear();
+	}
+
+
+	/**
+	 * Builds a TANE runner directly from a processed separated-values file.
+	 *
+	 * @param table processed input table providing dimensions and column files
+	 * @throws OutOfMemoryError when partition construction exhausts the heap
+	 */
+	public Tane(SVFileProcessor table) throws OutOfMemoryError {
+		this.minimalDependencies = new FunctionalDependencies();
+		this.numberOfRows = table.getNumberOfRows();
+		this.numberOfColumns = table.getNumberOfColumns();
+		this.strippedPartitions = new StrippedPartitions(table.getColumnFiles());
+	}
+
+	/**
+	 * Runs the level-wise TANE search and returns the minimal functional
+	 * dependencies found (also kept in {@code this.minimalDependencies}).
+	 *
+	 * Level k holds all candidate attribute sets of size k; each iteration
+	 * computes dependencies on the current level, prunes it, and generates
+	 * the next level until no candidates remain.
+	 *
+	 * @return the map of minimal dependencies discovered
+	 * @throws OutOfMemoryError when partition products exhaust the heap
+	 */
+	public THashMap run() throws OutOfMemoryError {
+
+		levels = new ArrayList<>();
+		cPlus = new HashMap<>();
+
+		// Level 0 is the empty set
+		levels.add(new CollectionSet());
+		// Level 1 initialization
+		levels.add(new CollectionSet());
+
+		ColumnCollection emptyLHSSet = new ColumnCollection(this.numberOfColumns);
+		rSet = new ColumnCollection(this.numberOfColumns);
+
+		// C+ of the empty set is R (all attributes)
+		cPlus.put(emptyLHSSet, rSet);
+
+		// T and Te are scratch arrays indexed by tuple id, shared by
+		// strippedProduct() and error(); slot 0 is unused (ids start at 1)
+		this.T = new int[this.numberOfRows + 1];
+		this.Te = new int[this.numberOfRows + 1];
+		// initialize T to all -1, because it is specified to be all "NULL"
+		// (!=0) in TANE
+		for (int i = 0; i < T.length; i++) {
+			T[i] = -1;
+		}
+
+		// Initialization
+		for (int i = 0; i < this.numberOfColumns; i++) {
+			// set all bits in R
+			rSet.set(i);
+			// build atomic attribute-sets
+			ColumnCollection subset = new ColumnCollection(this.numberOfColumns);
+			subset.set(i);
+			// add to first level
+			levels.get(1).add(subset);
+		}
+
+		// main algorithm: process levels bottom-up until one comes up empty
+		int level = 1;
+		while (!levels.get(level).isEmpty()) {
+//			System.out.println("Level:\t" + level);
+			this.computeDependencies(levels.get(level));
+			this.prune(levels.get(level));
+			levels.add(this.generateNextLevel(levels.get(level)));
+			// free the processed level; only its successor is needed from here on
+			levels.get(level).clear();
+			level++;
+		}
+		return minimalDependencies;
+	}
+
+	/**
+	 * Apriori-style candidate generation: combines pairs from the same prefix
+	 * block of the current level into size-(k+1) candidates, keeping a
+	 * candidate only if all of its size-k subsets are present in the current
+	 * level. For each accepted candidate the stripped partition is computed
+	 * as the product of its two generators' partitions.
+	 *
+	 * @param currentLevel the level-k candidate sets (deep-cloned before
+	 *                     prefix-block grouping so the original is untouched)
+	 * @return the level-(k+1) candidate sets
+	 */
+	private CollectionSet generateNextLevel(CollectionSet currentLevel) {
+		CollectionSet nextLevel = new CollectionSet<>();
+
+		Cloner cloner = new Cloner();
+		AprioriGeneration prefixBlockGenerator = new AprioriGeneration<>(cloner.deepClone(currentLevel));
+		for (CollectionSet k : prefixBlockGenerator.prefixBlocks()) {
+			for (ColumnCollection y : k) {
+				for (ColumnCollection z : k.tailSet(y)) {
+					// candidate x = y ∪ z
+					ColumnCollection x = y.orCopy(z);
+					boolean xInNextLevel = true;
+					// subset check: temporarily clear each bit of x in place
+					// and look the size-k subset up; the bit is restored on
+					// success. On failure x stays mutated but is discarded.
+					for (int a : x.getSetBits()) {
+						x.clear(a);
+						if (!currentLevel.contains(x)) {
+							xInNextLevel = false;
+							break;
+						}
+						x.set(a);
+					}
+					if (xInNextLevel) {
+						nextLevel.add(x);
+						// partition of x is the product of its generators' partitions
+						strippedPartitions.put(x, strippedProduct(strippedPartitions.get(y), strippedPartitions.get(z)));
+					}
+				}
+			}
+		}
+
+		return nextLevel;
+	}
+
+	/**
+	 * TANE's compute_dependencies step: for every candidate X in the level,
+	 * tests each A in X ∩ C+(X) for the dependency X\{A} -> A. Valid
+	 * dependencies are recorded as minimal and C+(X) is shrunk accordingly
+	 * (remove A, then remove every attribute outside X) so that supersets of
+	 * X cannot rediscover non-minimal variants.
+	 *
+	 * @param currentLevel the candidate sets of the current level
+	 */
+	private void computeDependencies(CollectionSet currentLevel) {
+		// make sure C+ is materialized for every candidate before testing
+		for (ColumnCollection x : currentLevel) {
+			addCPlusOfX(x);
+		}
+
+		for (ColumnCollection x : currentLevel) {
+			// only attributes of X that are still in C+(X) can yield minimal FDs
+			for (int a : x.andCopy(cPlus.get(x)).getSetBits()) {
+				boolean isDependency = isValidDependency(x.clearCopy(a), Integer.valueOf(a));
+
+				if (isDependency) {
+					minimalDependencies.addRHSColumn(x.clearCopy(a), a);
+					cPlus.get(x).clear(a);
+
+					// remove all B in R \ X from C+(X) (TANE pruning rule)
+					for (int B : rSet.removeCopy(x).getSetBits()) {
+						cPlus.get(x).clear(B);
+					}
+				}
+			}
+
+		}
+	}
+
+	/**
+	 * Computes C+(X) as the intersection of C+(X\{a}) over all a in X,
+	 * memoizing the result in {@code cPlus}. Missing subset entries are
+	 * computed recursively; clones are taken before intersecting so cached
+	 * subset values are never mutated.
+	 *
+	 * @param x the attribute set whose C+ is needed
+	 * @return the (cached) C+(X) collection
+	 */
+	private ColumnCollection addCPlusOfX(ColumnCollection x) {
+		// seed with C+(X \ {first attribute of X})
+		ColumnCollection cPlusOfX = cPlus.get(x.clearCopy(x.nextSetBit(0)));
+
+		// if cPlusOfX was not in the list it has to be computed recursively
+		if (cPlusOfX == null) {
+			cPlusOfX = (ColumnCollection) addCPlusOfX(x.clearCopy(x.nextSetBit(0))).clone();
+		} else {
+			// clone so the intersection below does not corrupt the cache
+			cPlusOfX = (ColumnCollection) cPlusOfX.clone();
+		}
+		for (int a : x.getSetBits()) {
+			ColumnCollection nextCPlusOfX = cPlus.get(x.clearCopy(a));
+
+			if (nextCPlusOfX == null) {
+				nextCPlusOfX = (ColumnCollection) addCPlusOfX(x.clearCopy(a)).clone();
+			} else {
+				nextCPlusOfX = (ColumnCollection) nextCPlusOfX.clone();
+			}
+
+			// C+(X) = ∩_{a ∈ X} C+(X \ {a})
+			cPlusOfX.and(nextCPlusOfX);
+		}
+		cPlus.put(x, cPlusOfX);
+
+		return cPlusOfX;
+	}
+
+	/**
+	 * TANE's prune step. Removes a candidate X from the level when
+	 * (a) C+(X) is empty — no dependency with X on the left side can still be
+	 * minimal — or (b) X is a superkey, in which case any remaining minimal
+	 * dependencies X -> a with a in C+(X)\X are emitted first (key pruning
+	 * rule: a must survive the intersection of the C+ sets of X∪{a} minus
+	 * each b in X).
+	 *
+	 * @param currentLevel the candidate sets of the current level; pruned in
+	 *                     place via the iterator
+	 */
+	private void prune(CollectionSet currentLevel) {
+		Iterator currentLevelIterator = currentLevel.iterator();
+
+		while (currentLevelIterator.hasNext()) {
+			ColumnCollection x = currentLevelIterator.next();
+
+			ColumnCollection cPlusOfX = cPlus.get(x);
+			if (cPlusOfX == null) {
+				cPlusOfX = addCPlusOfX(x);
+			}
+
+			// rule (a): empty C+ — X can never extend to a minimal FD
+			if (cPlusOfX.isEmpty()) {
+				currentLevelIterator.remove();
+				continue;
+			}
+
+			// rule (b): superkeys are removed after harvesting their FDs
+			boolean isSuperKey = isSuperKey(x);
+			if (isSuperKey) {
+				for (int a : cPlus.get(x).removeCopy(x).getSetBits()) {
+					// intersect C+((X ∪ {a}) \ {b}) over all b in X,
+					// starting from b = first attribute of X
+					ColumnCollection firstCPlusCandidatesKey = x.setCopy(a).clearCopy(x.nextSetBit(0));
+					ColumnCollection firstCPlusCandidates = cPlus.get(firstCPlusCandidatesKey);
+					if (firstCPlusCandidates == null) {
+						firstCPlusCandidates = (ColumnCollection) addCPlusOfX(firstCPlusCandidatesKey).clone();
+					} else {
+						// clone before and()-ing to keep the cached set intact
+						firstCPlusCandidates = (ColumnCollection) firstCPlusCandidates.clone();
+					}
+					for (int b : x.getSetBits()) {
+
+						ColumnCollection nextCPlusCandidates = cPlus.get(x.setCopy(a).clearCopy(b));
+						if (nextCPlusCandidates == null) {
+							nextCPlusCandidates = (ColumnCollection) addCPlusOfX(x.setCopy(a).clearCopy(b)).clone();
+						} else {
+							nextCPlusCandidates = (ColumnCollection) nextCPlusCandidates.clone();
+						}
+
+						firstCPlusCandidates.and(nextCPlusCandidates);
+					}
+					// a survived every intersection => X -> a is minimal
+					if (firstCPlusCandidates.get(a)) {
+						minimalDependencies.addRHSColumn(x, a);
+					}
+				}
+				currentLevelIterator.remove();
+			}
+		}
+	}
+
+	/**
+	 * Checks whether LHS is a superkey. A stripped partition omits singleton
+	 * equivalence classes, so LHS determines every tuple uniquely exactly
+	 * when each remaining class contributes nothing beyond itself, i.e. when
+	 * the summed class sizes equal the class count (equation (1) in the
+	 * TANE paper with e(X) == 0).
+	 *
+	 * @param LHS the candidate attribute set
+	 * @return true when LHS is a superkey of the relation
+	 */
+	protected boolean isSuperKey(ColumnCollection LHS) {
+		StrippedPartition partitionOfX = strippedPartitions.get(LHS);
+
+		int sumOfSizesOfEquivalenceClasses = 0;
+		int numberOfEquivalenceClasses = 0;
+
+		for (TEquivalence equivalenceGroup : partitionOfX) {
+			sumOfSizesOfEquivalenceClasses += equivalenceGroup.size();
+			numberOfEquivalenceClasses++;
+		}
+
+		// The original divided the difference by numberOfColumns before
+		// comparing to 0; the quotient is 0 iff the difference is 0, so the
+		// direct integer comparison is equivalent and clearer.
+		return sumOfSizesOfEquivalenceClasses == numberOfEquivalenceClasses;
+	}
+
+	/**
+	 * Computes the TANE error measure e(X -> A) from the stripped partitions
+	 * of X and X ∪ {A}: the minimum fraction of tuples to remove so the
+	 * dependency holds. Uses the shared {@code Te} scratch array (indexed by
+	 * equivalence-class identifier) which is filled, read, and zeroed again
+	 * within this call — the three loops must stay in this order.
+	 *
+	 * @param xPartition       stripped partition of X
+	 * @param xUnionAPartition stripped partition of X ∪ {A}
+	 * @return e(X -> A); 0 means the dependency holds exactly
+	 */
+	private double error(StrippedPartition xPartition, StrippedPartition xUnionAPartition) {
+		int e = 0;
+
+		// record the size of each class of X∪{A} under its identifier
+		for (TEquivalence equivalenceGroup : xUnionAPartition) {
+			Te[equivalenceGroup.getIdentifier()] = equivalenceGroup.size();
+		}
+		// for each class c of X: add |c| minus the largest refined sub-class
+		for (TEquivalence equivalenceGroup : xPartition) {
+			int m = 1;
+
+			for (TIntIterator tIt=equivalenceGroup.iterator(); tIt.hasNext(); ) {
+//				for (Integer t : equivalenceGroup) {
+				m = Math.max(m, Te[tIt.next()]);
+			}
+			e = e + equivalenceGroup.size() - m;
+
+		}
+		// zero Te again so the scratch array is clean for the next call
+		for (TEquivalence equivalenceGroup : xUnionAPartition) {
+			Te[equivalenceGroup.getIdentifier()] = 0;
+		}
+
+		return (double)e / this.numberOfRows;
+	}
+
+
+ private boolean isValidDependency(ColumnCollection LHS, Integer RHS) {
+ if (LHS.isEmpty()) {
+ return false;
+ }
+
+ return (this.error(strippedPartitions.get(LHS), strippedPartitions.get(LHS.setCopy(RHS.intValue()))) == 0);
+ }
+
+	/**
+	 * Computes the product of two stripped partitions (the partition of the
+	 * union of their attribute sets) using the probe-table algorithm from
+	 * the TANE paper. The shared scratch array {@code T} maps tuple ids to
+	 * the index of their Y-class; it is restored to all -1 before returning.
+	 * Only result classes of size > 1 are kept (stripped representation).
+	 *
+	 * @param yPartition stripped partition of Y
+	 * @param zPartition stripped partition of Z
+	 * @return stripped partition of Y ∪ Z
+	 */
+	public StrippedPartition strippedProduct(StrippedPartition yPartition, StrippedPartition zPartition) {
+		StrippedPartition xPartition = new StrippedPartition();
+		HashMap S = new HashMap<>();
+
+		// iterate over the smaller partition when probing: swap so that
+		// yPartition is the smaller (or equal) one
+		if (yPartition.size() > zPartition.size()) {
+			StrippedPartition swap = zPartition;
+			zPartition = yPartition;
+			yPartition = swap;
+		}
+
+		// build some kind of probe table: tag every tuple with the index of
+		// its Y-class and create an empty accumulator group per class
+		int i = 1;
+		for (TEquivalence cI : yPartition) {
+			for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) {
+				int tValue = tIt.next();
+				T[tValue] = i;
+
+			}
+			S.put(Integer.valueOf(i), new EquivalenceGroupTIntHashSet());
+			i++;
+		}
+
+		for (TEquivalence cI : zPartition) {
+			// collect tuples of this Z-class into their Y-class buckets
+			for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) {
+				int tValue = tIt.next();
+				if (T[tValue] != -1) {
+					TEquivalence sOld = S.get(Integer.valueOf(T[tValue]));
+					sOld.add(tValue);
+				}
+			}
+			// emit buckets with more than one tuple, then reset them
+			for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) {
+				int tValue = tIt.next();
+				TEquivalence s = S.get(Integer.valueOf(T[tValue]));
+				if (s != null && s.size() > 1) {
+					xPartition.add(s);
+				}
+				S.put(Integer.valueOf(T[tValue]), new EquivalenceGroupTIntHashSet());
+			}
+		}
+		// restore the scratch array to all -1 for the next call
+		// (the original also reset i here, which nothing read — removed)
+		for (TEquivalence cI : yPartition) {
+			for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) {
+				int tValue = tIt.next();
+				T[tValue] = -1;
+			}
+		}
+
+		return xPartition;
+	}
+}
diff --git a/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/DFDMetanome.java b/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java
similarity index 99%
rename from dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/DFDMetanome.java
rename to dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java
index 1c8fd82..e631ab7 100644
--- a/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/DFDMetanome.java
+++ b/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java
@@ -71,11 +71,11 @@ public void execute() throws AlgorithmExecutionException {
DFDMiner dfdMiner = new DFDMiner(inputFileProcessor);
dfdMiner.run();
FunctionalDependencies fds = dfdMiner.getDependencies();
-
+
RelationalInput input = fileInput.generateNewCopy();
String relationName = input.relationName();
List columnNames = input.columnNames();
-
+
for (ColumnCollection determining : fds.keySet()) {
for (int dependentColumn : fds.get(determining).getSetBits()) {
ColumnIdentifier[]