diff --git a/dfd/dfdAlgorithm/pom.xml b/dfd/dfdAlgorithm/pom.xml
index 128a457..ec91c77 100755
--- a/dfd/dfdAlgorithm/pom.xml
+++ b/dfd/dfdAlgorithm/pom.xml
@@ -2,7 +2,7 @@
 	xmlns="http://maven.apache.org/POM/4.0.0"
 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 	4.0.0
-	
+	
 	DFDAlgorithm
 	jar
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java b/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java
index cd29e27..62fedb3 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/approach/runner/DFDMiner.java
@@ -1,534 +1,534 @@
-package fdiscovery.approach.runner;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Stack;
-
-import org.apache.commons.cli.CommandLine;
-
-
-import fdiscovery.approach.ColumnOrder;
-import fdiscovery.columns.ColumnCollection;
-import fdiscovery.general.CLIParserMiner;
-import fdiscovery.general.ColumnFiles;
-import fdiscovery.general.FunctionalDependencies;
-import fdiscovery.general.Miner;
-import fdiscovery.partitions.ComposedPartition;
-import fdiscovery.partitions.FileBasedPartition;
-import fdiscovery.partitions.FileBasedPartitions;
-import fdiscovery.partitions.MemoryManagedJoinedPartitions;
-import fdiscovery.partitions.Partition;
-import fdiscovery.preprocessing.SVFileProcessor;
-import fdiscovery.pruning.Dependencies;
-import fdiscovery.pruning.NonDependencies;
-import fdiscovery.pruning.Observation;
-import fdiscovery.pruning.Observations;
-import fdiscovery.pruning.Seed;
-import gnu.trove.map.hash.TLongObjectHashMap;
-import gnu.trove.set.hash.THashSet;
-
-public class DFDMiner extends Miner implements Runnable {
-
-	private int numberOfColumns;
-	private int numberOfRows;
-	private ColumnOrder columnOrder;
-	private Stack<Seed> trace;
-	private Stack<Seed> seeds;
-	private Observations observations;
-	private FunctionalDependencies minimalDependencies;
-	private FunctionalDependencies maximalNonDependencies;
-	private FileBasedPartitions fileBasedPartitions;
-	private Dependencies dependencies;
-	private NonDependencies nonDependencies;
-	private MemoryManagedJoinedPartitions joinedPartitions;
-
-	public static void main(String[] args) {
-		createColumDirectory();
-
-		File source = new File(DFDMiner.input);
-		SVFileProcessor inputFileProcessor = null;
-		try {
-			long timeStart = System.currentTimeMillis();
-
-			inputFileProcessor = new SVFileProcessor(source);
-			inputFileProcessor.init();
-			System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter());
-			System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns());
-			System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows());
-			inputFileProcessor.createColumnFiles();
-			DFDMiner dfdRunner = new DFDMiner(inputFileProcessor);
-
-			dfdRunner.run();
-			System.out.println(String.format("Number of dependencies:\t%d", Integer.valueOf(dfdRunner.minimalDependencies.getCount())));
-			long timeFindFDs = System.currentTimeMillis();
-			System.out.println("Total time:\t" + (timeFindFDs - timeStart) / 1000 + "s");
-			System.out.println(dfdRunner.getDependencies());
-
-		} catch (FileNotFoundException e) {
-			System.out.println("The input file could not be found.");
-		} catch (IOException e) {
-			System.out.println("The input reader could not be reset.");
-		}
-	}
-
-	public 
static void main2(String[] args) { - CLIParserMiner parser = new CLIParserMiner(); - CommandLine cli = parser.parse(args); - String inputFilename = new String(); - String columnFileDirectory = new String(); - String resultFile = new String(); - int numberOfColumns = 0; - int numberOfRows = 0; - - if (cli.hasOption("file")) { - inputFilename = cli.getOptionValue("file"); - } - if (cli.hasOption("input")) { - columnFileDirectory = cli.getOptionValue("input"); - } - if (cli.hasOption("result")) { - resultFile = cli.getOptionValue("result"); - } - if (cli.hasOption("columns")) { - numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue(); - } - if (cli.hasOption("rows")) { - numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue(); - } - ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows); - long timeStart = System.currentTimeMillis(); - DFDMiner runner = new DFDMiner(columnFiles, numberOfRows); - try { - runner.run(); - long timeEnd = System.currentTimeMillis(); - runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename); - } catch (OutOfMemoryError e) { - System.exit(Miner.STATUS_OOM); - } - System.exit(0); - } - - private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { - - String timeString = (time != -1) ? String.format("%.1f", Double.valueOf((double) (time) / 1000)) : "-1"; - StringBuilder outputBuilder = new StringBuilder(); - if (!inputFileName.isEmpty()) { - outputBuilder.append(String.format("%s\t", inputFileName)); - } - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns))); - outputBuilder.append(String.format("%s\t", timeString)); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount()))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.joinedPartitions.getCount()))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.joinedPartitions.getTotalCount()))); - outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory()))); - outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false))); - - try { - BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true)); - resultFileWriter.write(outputBuilder.toString()); - System.out.print(outputBuilder.toString()); - resultFileWriter.close(); - } catch (IOException e) { - System.out.println("Couldn't write output."); - } - } - - public DFDMiner(SVFileProcessor table) throws OutOfMemoryError { - this.observations = new Observations(); - this.numberOfColumns = table.getNumberOfColumns(); - 
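// Context for the CLI handling in main2 above: CLIParserMiner wraps Apache
// Commons CLI, and main2 reads the "file", "input", "result", "columns" and
// "rows" options from the parsed CommandLine. A minimal, self-contained sketch
// of equivalent parsing follows; the Options setup here is an assumption for
// illustration, not the project's actual CLIParserMiner code.
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

final class CliParsingSketch {
	public static void main(String[] args) throws ParseException {
		Options options = new Options();
		options.addOption("file", true, "original input file name");
		options.addOption("input", true, "directory containing the column files");
		options.addOption("result", true, "file the result row is appended to");
		options.addOption("columns", true, "number of columns");
		options.addOption("rows", true, "number of rows");
		CommandLine cli = new DefaultParser().parse(options, args);
		// same defaulting behaviour as main2: missing numeric options stay 0
		int numberOfColumns = cli.hasOption("columns") ? Integer.parseInt(cli.getOptionValue("columns")) : 0;
		int numberOfRows = cli.hasOption("rows") ? Integer.parseInt(cli.getOptionValue("rows")) : 0;
		System.out.println(numberOfColumns + " columns, " + numberOfRows + " rows");
	}
}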
this.numberOfRows = table.getNumberOfRows(); - this.trace = new Stack<>(); - this.seeds = new Stack<>(); - this.minimalDependencies = new FunctionalDependencies(); - this.maximalNonDependencies = new FunctionalDependencies(); - this.dependencies = new Dependencies(this.numberOfColumns); - this.nonDependencies = new NonDependencies(this.numberOfColumns); - this.joinedPartitions = new MemoryManagedJoinedPartitions(this.numberOfColumns); - this.fileBasedPartitions = new FileBasedPartitions(table); - this.columnOrder = new ColumnOrder(fileBasedPartitions); - for (int columnIndex = 0; columnIndex < this.numberOfColumns; columnIndex++) { - ColumnCollection columnIdentifier = new ColumnCollection(this.numberOfColumns); - columnIdentifier.set(columnIndex); - this.joinedPartitions.addPartition(this.fileBasedPartitions.get(columnIndex)); - } - } - - public DFDMiner(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError { - this.observations = new Observations(); - this.numberOfColumns = columnFiles.getNumberOfColumns(); - this.numberOfRows = numberOfRows; - this.trace = new Stack<>(); - this.seeds = new Stack<>(); - this.minimalDependencies = new FunctionalDependencies(); - this.maximalNonDependencies = new FunctionalDependencies(); - this.dependencies = new Dependencies(this.numberOfColumns); - this.nonDependencies = new NonDependencies(this.numberOfColumns); - this.joinedPartitions = new MemoryManagedJoinedPartitions(this.numberOfColumns); - this.fileBasedPartitions = new FileBasedPartitions(columnFiles, numberOfRows); - columnFiles.clear(); - this.columnOrder = new ColumnOrder(fileBasedPartitions); - for (int columnIndex = 0; columnIndex < this.numberOfColumns; columnIndex++) { - ColumnCollection columnIdentifier = new ColumnCollection(this.numberOfColumns); - columnIdentifier.set(columnIndex); - this.joinedPartitions.addPartition(this.fileBasedPartitions.get(columnIndex)); - } - } - - public void run() throws OutOfMemoryError { - - ArrayList keys = new ArrayList<>(); - - // check each column for uniqueness - // if a column is unique it's a key for all other columns - // therefore uniquePartition -> schema - uniquePartition - for (FileBasedPartition fileBasedPartition : this.fileBasedPartitions) { - if (fileBasedPartition.isUnique()) { - ColumnCollection uniquePartitionIndices = fileBasedPartition.getIndices(); - ColumnCollection RHS = uniquePartitionIndices.complementCopy(); - this.minimalDependencies.put(uniquePartitionIndices, RHS); - // add unique columns to minimal uniques - keys.add(uniquePartitionIndices); - } - } - - // do this for all RHS - for (int currentRHSIndex = 0; currentRHSIndex < this.numberOfColumns; currentRHSIndex++) { - - this.dependencies = new Dependencies(numberOfColumns); - this.nonDependencies = new NonDependencies(numberOfColumns); - this.trace.clear(); - this.observations.clear(); - - for (int lhsIndex = 0; lhsIndex < this.numberOfColumns; lhsIndex++) { - if (lhsIndex != currentRHSIndex) { - ColumnCollection lhs = new ColumnCollection(numberOfColumns); - lhs.set(lhsIndex); - if (keys.contains(lhs)) { - this.dependencies.add(lhs); - this.observations.put(lhs, Observation.MINIMAL_DEPENDENCY); - } - } - } - - ColumnCollection currentRHS = new ColumnCollection(numberOfColumns); - currentRHS.set(currentRHSIndex); - - // generate seeds - for (int partitionIndex : columnOrder.getOrderHighDistinctCount(currentRHS.complementCopy())) { - if (partitionIndex != currentRHSIndex) { - FileBasedPartition lhsPartition = this.fileBasedPartitions.get(partitionIndex); - 
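// The surrounding "generate seeds" loop turns every column except the current
// RHS into a single-column LHS seed, ordered by distinct-value count so that
// highly distinctive columns are explored first. A minimal sketch of that idea
// over java.util types; distinctCount is an illustrative stand-in for the
// statistics behind ColumnOrder, not the project's API.
import java.util.ArrayDeque;
import java.util.BitSet;
import java.util.Comparator;
import java.util.Deque;
import java.util.stream.IntStream;

final class SeedingSketch {
	static Deque<BitSet> initialSeeds(int numberOfColumns, int rhsIndex, int[] distinctCount) {
		Deque<BitSet> seeds = new ArrayDeque<>();
		IntStream.range(0, numberOfColumns)
				.filter(column -> column != rhsIndex)
				.boxed()
				// highest distinct count first, mirroring getOrderHighDistinctCount
				.sorted(Comparator.comparingInt((Integer column) -> distinctCount[column]).reversed())
				.forEach(column -> {
					BitSet lhs = new BitSet(numberOfColumns);
					lhs.set(column);
					seeds.push(lhs); // note: stack pop order is the reverse of push order
				});
		return seeds;
	}
}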
this.seeds.push(new Seed(lhsPartition.getIndices())); - } - } - - do { - while (!seeds.isEmpty()) { - Seed currentSeed = this.randomTake(); - do { - ColumnCollection lhsIndices = currentSeed.getIndices(); - Observation observationOfLHS = this.observations.get(currentSeed.getIndices()); - if (observationOfLHS == null) { - observationOfLHS = this.checkDependencyAndStoreIt(currentSeed, currentRHSIndex); - - // if we couldn't find any dependency that is a - // subset of the current valid LHS it is minimal - if (observationOfLHS == Observation.MINIMAL_DEPENDENCY) { - this.minimalDependencies.addRHSColumn(lhsIndices, currentRHSIndex); - } - // if we couldn't find any non-dependency that is - // superset of the current non-valid LHS it is - // maximal - else if (observationOfLHS == Observation.MAXIMAL_NON_DEPENDENCY) { - this.maximalNonDependencies.addRHSColumn(lhsIndices, currentRHSIndex); - } - currentSeed = randomWalkStep(currentSeed, currentRHSIndex); - } else { -// System.out.println(String.format("[2]Current [%s]%s\t[%s]", (char) (currentRHSIndex + 65), currentSeed, observationOfLHS)); - if (observationOfLHS.isCandidate()) { - if (observationOfLHS.isDependency()) { - Observation updatedDependencyType = this.observations.updateDependencyType(currentSeed.getIndices()); - // System.out.println(String.format("\tupdated:\t%s", - // updatedDependencyType)); - this.observations.put(lhsIndices, updatedDependencyType); - if (updatedDependencyType == Observation.MINIMAL_DEPENDENCY) { - // System.out.println("Add min dependency:\t" - // + currentSeed); - this.minimalDependencies.addRHSColumn(lhsIndices, currentRHSIndex); - } - } else { - Observation updatedNonDependencyType = this.observations.updateNonDependencyType(currentSeed.getIndices(), currentRHSIndex); - this.observations.put(lhsIndices, updatedNonDependencyType); - // System.out.println(String.format("\tupdated:\t%s", - // updatedNonDependencyType)); - if (updatedNonDependencyType == Observation.MAXIMAL_NON_DEPENDENCY) { - this.maximalNonDependencies.addRHSColumn(lhsIndices, currentRHSIndex); - } - } - } - currentSeed = randomWalkStep(currentSeed, currentRHSIndex); - } - - } while (currentSeed != null); - } - seeds = this.nextSeeds(currentRHSIndex); - } while (!seeds.isEmpty()); - } - // System.out.println(String.format("Number partitions:\t%d", - // this.joinedPartitions.getCount())); - } - - private Observation checkDependencyAndStoreIt(Seed seed, int currentRHSIndex) { - if (nonDependencies.isRepresented(seed.getIndices())) { - // System.out.println("Skip because of nonDependency"); - Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex); - this.observations.put(seed.getIndices(), observationOfLHS); - this.nonDependencies.add(seed.getIndices()); - return observationOfLHS; - } else if (dependencies.isRepresented(seed.getIndices())) { - // System.out.println("Skip because of dependency"); - Observation observationOfLHS = this.observations.updateDependencyType(seed.getIndices()); - this.observations.put(seed.getIndices(), observationOfLHS); - this.dependencies.add(seed.getIndices()); - return observationOfLHS; - } - - FileBasedPartition currentRHSPartition = this.fileBasedPartitions.get(currentRHSIndex); - Partition currentLHSPartition = null; - Partition currentLHSJoinedRHSPartition = null; - - if (seed.isAtomic()) { - currentLHSPartition = this.joinedPartitions.get(seed.getIndices()); - currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition); - } 
else { - - // if we went upwards in the lattice we can build the currentLHS - // partition directly from the previous partition - if (seed.getAdditionalColumnIndex() != -1) { - int additionalColumn = seed.getAdditionalColumnIndex(); - Partition previousLHSPartition = joinedPartitions.get(seed.getBaseIndices()); - if (previousLHSPartition == null) { - ArrayList partitionsToJoin = joinedPartitions.getBestMatchingPartitions(seed.getBaseIndices()); - previousLHSPartition = ComposedPartition.buildPartition(partitionsToJoin); - } - FileBasedPartition additionalColumnPartition = this.fileBasedPartitions.get(additionalColumn); - currentLHSPartition = this.joinedPartitions.get(previousLHSPartition.getIndices().setCopy(additionalColumn)); - if (currentLHSPartition == null) { - currentLHSPartition = new ComposedPartition(previousLHSPartition, additionalColumnPartition); - this.joinedPartitions.addPartition(currentLHSPartition); - } - currentLHSJoinedRHSPartition = this.joinedPartitions.get(currentLHSPartition.getIndices().setCopy(currentRHSIndex)); - if (currentLHSJoinedRHSPartition == null) { - currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition); - this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition); - } - } else { - currentLHSPartition = this.joinedPartitions.get(seed.getIndices()); - if (currentLHSPartition == null) { - ArrayList partitionsToJoin = joinedPartitions.getBestMatchingPartitions(seed.getIndices()); - currentLHSPartition = ComposedPartition.buildPartition(partitionsToJoin); - this.joinedPartitions.addPartition(currentLHSPartition); - } - currentLHSJoinedRHSPartition = this.joinedPartitions.get(currentLHSPartition.getIndices().setCopy(currentRHSIndex)); - if (currentLHSJoinedRHSPartition == null) { - currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition); - this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition); - } - } -// this.joinedPartitions.addPartition(currentLHSPartition); -// this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition); - } - - if (Partition.representsFD(currentLHSPartition, currentLHSJoinedRHSPartition)) { - Observation observationOfLHS = this.observations.updateDependencyType(seed.getIndices()); - this.observations.put(seed.getIndices(), observationOfLHS); - this.dependencies.add(seed.getIndices()); - return observationOfLHS; - } - Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex); - this.observations.put(seed.getIndices(), observationOfLHS); - this.nonDependencies.add(seed.getIndices()); - return observationOfLHS; - } - - private Stack nextSeeds(int currentRHSIndex) { -// System.out.println("Find holes"); - THashSet deps = new THashSet<>(); - ArrayList currentMaximalNonDependencies = maximalNonDependencies.getLHSForRHS(currentRHSIndex); - HashSet currentMinimalDependencies = new HashSet<>(minimalDependencies.getLHSForRHS(currentRHSIndex)); - ArrayList newDeps = new ArrayList<>(numberOfColumns * deps.size()); -// Holes holes = new Holes(); - -// int i = 0; -// for (ColumnCollection maximalNonDependency : currentMaximalNonDependencies) { -// ColumnCollection complement = maximalNonDependency.setCopy(currentRHSIndex).complement(); -// if (deps.isEmpty()) { -// ColumnCollection emptyColumnIndices = new ColumnCollection(numberOfColumns); -// for (Integer complementColumnIndex : complement.getSetBits()) { -// deps.add(emptyColumnIndices.setCopy(complementColumnIndex)); -// } -// } else { -// for 
(ColumnCollection dep : deps) { -// int[] setBits = complement.getSetBits(); -// for (int setBit = 0; setBit < setBits.length; setBit++) { -// holes.add(dep.setCopy(setBits[setBit])); -//// System.out.println("Dep:\t" + dep.setCopy(setBits[setBit])); -// } -// } -// // minimize newDeps -// System.out.println(i++ + "\t" + currentMaximalNonDependencies.size()); -// System.out.println("total deps:\t" + deps.size()); -// System.out.println("before minimizing:\t" + holes.size()); -//// ArrayList minimizedNewDeps = minimizeSeeds(newDeps); -// holes.minimize(); -// System.out.println("after minimizing:\t" + holes.size()); -// deps.clear(); -// deps.addAll(holes); -// holes.clear(); -// } -// } - - for (ColumnCollection maximalNonDependency : currentMaximalNonDependencies) { - ColumnCollection complement = maximalNonDependency.setCopy(currentRHSIndex).complement(); - if (deps.isEmpty()) { - ColumnCollection emptyColumnIndices = new ColumnCollection(numberOfColumns); - for (int complementColumnIndex : complement.getSetBits()) { - deps.add(emptyColumnIndices.setCopy(complementColumnIndex)); - } - } else { - for (ColumnCollection dep : deps) { - int[] setBits = complement.getSetBits(); - for (int setBit = 0; setBit < setBits.length; setBit++) { - newDeps.add(dep.setCopy(setBits[setBit])); - } - } - // minimize newDeps - ArrayList minimizedNewDeps = minimizeSeeds(newDeps); - deps.clear(); - deps.addAll(minimizedNewDeps); - newDeps.clear(); - } - } - - // return only elements that aren't already covered by the minimal - // dependencies - Stack remainingSeeds = new Stack<>(); - deps.removeAll(currentMinimalDependencies); - for (ColumnCollection remainingSeed : deps) { - remainingSeeds.push(new Seed(remainingSeed)); - } - - return remainingSeeds; - } - - private ArrayList minimizeSeeds(ArrayList seeds) { - long maxCardinality = 0; - TLongObjectHashMap> seedsBySize = new TLongObjectHashMap<>(numberOfColumns); - for (ColumnCollection seed : seeds) { - long cardinalityOfSeed = seed.cardinality(); - maxCardinality = Math.max(maxCardinality, cardinalityOfSeed); - seedsBySize.putIfAbsent(cardinalityOfSeed, new ArrayList(seeds.size()/numberOfColumns)); - seedsBySize.get(cardinalityOfSeed).add(seed); - } - - for (long lowerBound = 1; lowerBound < maxCardinality; lowerBound++) { - ArrayList lowerBoundSeeds = seedsBySize.get(lowerBound); - if (lowerBoundSeeds != null) { - for (long upperBound = maxCardinality; upperBound > lowerBound; upperBound--) { - ArrayList upperBoundSeeds = seedsBySize.get(upperBound); - if (upperBoundSeeds != null) { - for (Iterator lowerIt = lowerBoundSeeds.iterator(); lowerIt.hasNext();) { - ColumnCollection lowerSeed = lowerIt.next(); - for (Iterator upperIt = upperBoundSeeds.iterator(); upperIt.hasNext();) { - if (lowerSeed.isSubsetOf(upperIt.next())) { - upperIt.remove(); - } - } - } - } - } - } - } - ArrayList minimizedSeeds = new ArrayList<>(); - for (ArrayList seedList : seedsBySize.valueCollection()) { - for (ColumnCollection seed : seedList) { - minimizedSeeds.add(seed); - } - } - return minimizedSeeds; - } - - private Seed randomTake() { - if (!this.seeds.isEmpty()) { - return this.seeds.pop(); - } - return null; - } - - private Seed randomWalkStep(Seed currentSeed, int currentRHSIndex) { - Observation observationOfSeed = this.observations.get(currentSeed.getIndices()); - - if (observationOfSeed == Observation.CANDIDATE_MINIMAL_DEPENDENCY) { - THashSet uncheckedSubsets = this.observations.getUncheckedMaximalSubsets(currentSeed.getIndices(), columnOrder); - THashSet 
prunedNonDependencySubsets = nonDependencies.getPrunedSupersets(uncheckedSubsets); - for (ColumnCollection prunedNonDependencySubset : prunedNonDependencySubsets) { - observations.put(prunedNonDependencySubset, Observation.NON_DEPENDENCY); - } - uncheckedSubsets.removeAll(prunedNonDependencySubsets); - if (uncheckedSubsets.isEmpty() && prunedNonDependencySubsets.isEmpty()) { - observations.put(currentSeed.getIndices(), Observation.MINIMAL_DEPENDENCY); - minimalDependencies.addRHSColumn(currentSeed.getIndices(), currentRHSIndex); - } else if (!uncheckedSubsets.isEmpty()) { - ColumnCollection notRepresentedUncheckedSubset = uncheckedSubsets.iterator().next(); - if (notRepresentedUncheckedSubset != null) { - trace.push(currentSeed); - return new Seed(notRepresentedUncheckedSubset); - } - } - } else if (observationOfSeed == Observation.CANDIDATE_MAXIMAL_NON_DEPENDENCY) { - THashSet uncheckedSupersets = this.observations.getUncheckedMinimalSupersets(currentSeed.getIndices(), currentRHSIndex, columnOrder); - THashSet prunedNonDependencySupersets = nonDependencies.getPrunedSupersets(uncheckedSupersets); - THashSet prunedDependencySupersets = dependencies.getPrunedSubsets(uncheckedSupersets); - for (ColumnCollection prunedNonDependencySuperset : prunedNonDependencySupersets) { - observations.put(prunedNonDependencySuperset, Observation.NON_DEPENDENCY); - } - for (ColumnCollection prunedDependencySuperset : prunedDependencySupersets) { - observations.put(prunedDependencySuperset, Observation.DEPENDENCY); - } - uncheckedSupersets.removeAll(prunedDependencySupersets); - uncheckedSupersets.removeAll(prunedNonDependencySupersets); - if (uncheckedSupersets.isEmpty() && prunedNonDependencySupersets.isEmpty()) { - observations.put(currentSeed.getIndices(), Observation.MAXIMAL_NON_DEPENDENCY); - maximalNonDependencies.addRHSColumn(currentSeed.getIndices(), currentRHSIndex); - } else if (!uncheckedSupersets.isEmpty()) { - ColumnCollection notRepresentedUncheckedSuperset = uncheckedSupersets.iterator().next(); - if (notRepresentedUncheckedSuperset != null) { - trace.push(currentSeed); - int additionalColumn = notRepresentedUncheckedSuperset.removeCopy(currentSeed.getIndices()).nextSetBit(0); - return new Seed(notRepresentedUncheckedSuperset, additionalColumn); - } - } - } - if (!this.trace.isEmpty()) { - Seed nextSeed = this.trace.pop(); - return nextSeed; - } - return null; - } - - public FunctionalDependencies getDependencies() { - return this.minimalDependencies; - } +package fdiscovery.approach.runner; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Stack; + +import org.apache.commons.cli.CommandLine; + + +import fdiscovery.approach.ColumnOrder; +import fdiscovery.columns.ColumnCollection; +import fdiscovery.general.CLIParserMiner; +import fdiscovery.general.ColumnFiles; +import fdiscovery.general.FunctionalDependencies; +import fdiscovery.general.Miner; +import fdiscovery.partitions.ComposedPartition; +import fdiscovery.partitions.FileBasedPartition; +import fdiscovery.partitions.FileBasedPartitions; +import fdiscovery.partitions.MemoryManagedJoinedPartitions; +import fdiscovery.partitions.Partition; +import fdiscovery.preprocessing.SVFileProcessor; +import fdiscovery.pruning.Dependencies; +import fdiscovery.pruning.NonDependencies; +import fdiscovery.pruning.Observation; +import 
fdiscovery.pruning.Observations; +import fdiscovery.pruning.Seed; +import gnu.trove.map.hash.TLongObjectHashMap; +import gnu.trove.set.hash.THashSet; + +public class DFDMiner extends Miner implements Runnable { + + private int numberOfColumns; + private int numberOfRows; + private ColumnOrder columnOrder; + private Stack trace; + private Stack seeds; + private Observations observations; + private FunctionalDependencies minimalDependencies; + private FunctionalDependencies maximalNonDependencies; + private FileBasedPartitions fileBasedPartitions; + private Dependencies dependencies; + private NonDependencies nonDependencies; + private MemoryManagedJoinedPartitions joinedPartitions; + + public static void main(String[] args) { + createColumDirectory(); + + File source = new File(DFDMiner.input); + SVFileProcessor inputFileProcessor = null; + try { + long timeStart = System.currentTimeMillis(); + + inputFileProcessor = new SVFileProcessor(source); + inputFileProcessor.init(); + System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter()); + System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns()); + System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows()); + inputFileProcessor.createColumnFiles(); + DFDMiner dfdRunner = new DFDMiner(inputFileProcessor); + + dfdRunner.run(); + System.out.println(String.format("Number of dependencies:\t%d", Integer.valueOf(dfdRunner.minimalDependencies.getCount()))); + long timeFindFDs = System.currentTimeMillis(); + System.out.println("Total time:\t" + (timeFindFDs - timeStart) / 1000 + "s"); + System.out.println(dfdRunner.getDependencies()); + + } catch (FileNotFoundException e) { + System.out.println("The input file could not be found."); + } catch (IOException e) { + System.out.println("The input reader could not be reset."); + } + } + + public static void main2(String[] args) { + CLIParserMiner parser = new CLIParserMiner(); + CommandLine cli = parser.parse(args); + String inputFilename = new String(); + String columnFileDirectory = new String(); + String resultFile = new String(); + int numberOfColumns = 0; + int numberOfRows = 0; + + if (cli.hasOption("file")) { + inputFilename = cli.getOptionValue("file"); + } + if (cli.hasOption("input")) { + columnFileDirectory = cli.getOptionValue("input"); + } + if (cli.hasOption("result")) { + resultFile = cli.getOptionValue("result"); + } + if (cli.hasOption("columns")) { + numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue(); + } + if (cli.hasOption("rows")) { + numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue(); + } + ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows); + long timeStart = System.currentTimeMillis(); + DFDMiner runner = new DFDMiner(columnFiles, numberOfRows); + try { + runner.run(); + long timeEnd = System.currentTimeMillis(); + runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename); + } catch (OutOfMemoryError e) { + System.exit(Miner.STATUS_OOM); + } + System.exit(0); + } + + private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { + + String timeString = (time != -1) ? 
String.format("%.1f", Double.valueOf((double) (time) / 1000)) : "-1"; + StringBuilder outputBuilder = new StringBuilder(); + if (!inputFileName.isEmpty()) { + outputBuilder.append(String.format("%s\t", inputFileName)); + } + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns))); + outputBuilder.append(String.format("%s\t", timeString)); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount()))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.joinedPartitions.getCount()))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.joinedPartitions.getTotalCount()))); + outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory()))); + outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false))); + + try { + BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true)); + resultFileWriter.write(outputBuilder.toString()); + System.out.print(outputBuilder.toString()); + resultFileWriter.close(); + } catch (IOException e) { + System.out.println("Couldn't write output."); + } + } + + public DFDMiner(SVFileProcessor table) throws OutOfMemoryError { + this.observations = new Observations(); + this.numberOfColumns = table.getNumberOfColumns(); + this.numberOfRows = table.getNumberOfRows(); + this.trace = new Stack<>(); + this.seeds = new Stack<>(); + this.minimalDependencies = new FunctionalDependencies(); + this.maximalNonDependencies = new FunctionalDependencies(); + this.dependencies = new Dependencies(this.numberOfColumns); + this.nonDependencies = new NonDependencies(this.numberOfColumns); + this.joinedPartitions = new MemoryManagedJoinedPartitions(this.numberOfColumns); + this.fileBasedPartitions = new FileBasedPartitions(table); + this.columnOrder = new ColumnOrder(fileBasedPartitions); + for (int columnIndex = 0; columnIndex < this.numberOfColumns; columnIndex++) { + ColumnCollection columnIdentifier = new ColumnCollection(this.numberOfColumns); + columnIdentifier.set(columnIndex); + this.joinedPartitions.addPartition(this.fileBasedPartitions.get(columnIndex)); + } + } + + public DFDMiner(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError { + this.observations = new Observations(); + this.numberOfColumns = columnFiles.getNumberOfColumns(); + this.numberOfRows = numberOfRows; + this.trace = new Stack<>(); + this.seeds = new Stack<>(); + this.minimalDependencies = new FunctionalDependencies(); + this.maximalNonDependencies = new FunctionalDependencies(); + this.dependencies = new Dependencies(this.numberOfColumns); + this.nonDependencies = new 
NonDependencies(this.numberOfColumns); + this.joinedPartitions = new MemoryManagedJoinedPartitions(this.numberOfColumns); + this.fileBasedPartitions = new FileBasedPartitions(columnFiles, numberOfRows); + columnFiles.clear(); + this.columnOrder = new ColumnOrder(fileBasedPartitions); + for (int columnIndex = 0; columnIndex < this.numberOfColumns; columnIndex++) { + ColumnCollection columnIdentifier = new ColumnCollection(this.numberOfColumns); + columnIdentifier.set(columnIndex); + this.joinedPartitions.addPartition(this.fileBasedPartitions.get(columnIndex)); + } + } + + public void run() throws OutOfMemoryError { + + ArrayList keys = new ArrayList<>(); + + // check each column for uniqueness + // if a column is unique it's a key for all other columns + // therefore uniquePartition -> schema - uniquePartition + for (FileBasedPartition fileBasedPartition : this.fileBasedPartitions) { + if (fileBasedPartition.isUnique()) { + ColumnCollection uniquePartitionIndices = fileBasedPartition.getIndices(); + ColumnCollection RHS = uniquePartitionIndices.complementCopy(); + this.minimalDependencies.put(uniquePartitionIndices, RHS); + // add unique columns to minimal uniques + keys.add(uniquePartitionIndices); + } + } + + // do this for all RHS + for (int currentRHSIndex = 0; currentRHSIndex < this.numberOfColumns; currentRHSIndex++) { + + this.dependencies = new Dependencies(numberOfColumns); + this.nonDependencies = new NonDependencies(numberOfColumns); + this.trace.clear(); + this.observations.clear(); + + for (int lhsIndex = 0; lhsIndex < this.numberOfColumns; lhsIndex++) { + if (lhsIndex != currentRHSIndex) { + ColumnCollection lhs = new ColumnCollection(numberOfColumns); + lhs.set(lhsIndex); + if (keys.contains(lhs)) { + this.dependencies.add(lhs); + this.observations.put(lhs, Observation.MINIMAL_DEPENDENCY); + } + } + } + + ColumnCollection currentRHS = new ColumnCollection(numberOfColumns); + currentRHS.set(currentRHSIndex); + + // generate seeds + for (int partitionIndex : columnOrder.getOrderHighDistinctCount(currentRHS.complementCopy())) { + if (partitionIndex != currentRHSIndex) { + FileBasedPartition lhsPartition = this.fileBasedPartitions.get(partitionIndex); + this.seeds.push(new Seed(lhsPartition.getIndices())); + } + } + + do { + while (!seeds.isEmpty()) { + Seed currentSeed = this.randomTake(); + do { + ColumnCollection lhsIndices = currentSeed.getIndices(); + Observation observationOfLHS = this.observations.get(currentSeed.getIndices()); + if (observationOfLHS == null) { + observationOfLHS = this.checkDependencyAndStoreIt(currentSeed, currentRHSIndex); + + // if we couldn't find any dependency that is a + // subset of the current valid LHS it is minimal + if (observationOfLHS == Observation.MINIMAL_DEPENDENCY) { + this.minimalDependencies.addRHSColumn(lhsIndices, currentRHSIndex); + } + // if we couldn't find any non-dependency that is + // superset of the current non-valid LHS it is + // maximal + else if (observationOfLHS == Observation.MAXIMAL_NON_DEPENDENCY) { + this.maximalNonDependencies.addRHSColumn(lhsIndices, currentRHSIndex); + } + currentSeed = randomWalkStep(currentSeed, currentRHSIndex); + } else { +// System.out.println(String.format("[2]Current [%s]%s\t[%s]", (char) (currentRHSIndex + 65), currentSeed, observationOfLHS)); + if (observationOfLHS.isCandidate()) { + if (observationOfLHS.isDependency()) { + Observation updatedDependencyType = this.observations.updateDependencyType(currentSeed.getIndices()); + // System.out.println(String.format("\tupdated:\t%s", + // 
updatedDependencyType)); + this.observations.put(lhsIndices, updatedDependencyType); + if (updatedDependencyType == Observation.MINIMAL_DEPENDENCY) { + // System.out.println("Add min dependency:\t" + // + currentSeed); + this.minimalDependencies.addRHSColumn(lhsIndices, currentRHSIndex); + } + } else { + Observation updatedNonDependencyType = this.observations.updateNonDependencyType(currentSeed.getIndices(), currentRHSIndex); + this.observations.put(lhsIndices, updatedNonDependencyType); + // System.out.println(String.format("\tupdated:\t%s", + // updatedNonDependencyType)); + if (updatedNonDependencyType == Observation.MAXIMAL_NON_DEPENDENCY) { + this.maximalNonDependencies.addRHSColumn(lhsIndices, currentRHSIndex); + } + } + } + currentSeed = randomWalkStep(currentSeed, currentRHSIndex); + } + + } while (currentSeed != null); + } + seeds = this.nextSeeds(currentRHSIndex); + } while (!seeds.isEmpty()); + } + // System.out.println(String.format("Number partitions:\t%d", + // this.joinedPartitions.getCount())); + } + + private Observation checkDependencyAndStoreIt(Seed seed, int currentRHSIndex) { + if (nonDependencies.isRepresented(seed.getIndices())) { + // System.out.println("Skip because of nonDependency"); + Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex); + this.observations.put(seed.getIndices(), observationOfLHS); + this.nonDependencies.add(seed.getIndices()); + return observationOfLHS; + } else if (dependencies.isRepresented(seed.getIndices())) { + // System.out.println("Skip because of dependency"); + Observation observationOfLHS = this.observations.updateDependencyType(seed.getIndices()); + this.observations.put(seed.getIndices(), observationOfLHS); + this.dependencies.add(seed.getIndices()); + return observationOfLHS; + } + + FileBasedPartition currentRHSPartition = this.fileBasedPartitions.get(currentRHSIndex); + Partition currentLHSPartition = null; + Partition currentLHSJoinedRHSPartition = null; + + if (seed.isAtomic()) { + currentLHSPartition = this.joinedPartitions.get(seed.getIndices()); + currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition); + } else { + + // if we went upwards in the lattice we can build the currentLHS + // partition directly from the previous partition + if (seed.getAdditionalColumnIndex() != -1) { + int additionalColumn = seed.getAdditionalColumnIndex(); + Partition previousLHSPartition = joinedPartitions.get(seed.getBaseIndices()); + if (previousLHSPartition == null) { + ArrayList partitionsToJoin = joinedPartitions.getBestMatchingPartitions(seed.getBaseIndices()); + previousLHSPartition = ComposedPartition.buildPartition(partitionsToJoin); + } + FileBasedPartition additionalColumnPartition = this.fileBasedPartitions.get(additionalColumn); + currentLHSPartition = this.joinedPartitions.get(previousLHSPartition.getIndices().setCopy(additionalColumn)); + if (currentLHSPartition == null) { + currentLHSPartition = new ComposedPartition(previousLHSPartition, additionalColumnPartition); + this.joinedPartitions.addPartition(currentLHSPartition); + } + currentLHSJoinedRHSPartition = this.joinedPartitions.get(currentLHSPartition.getIndices().setCopy(currentRHSIndex)); + if (currentLHSJoinedRHSPartition == null) { + currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition); + this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition); + } + } else { + currentLHSPartition = 
this.joinedPartitions.get(seed.getIndices()); + if (currentLHSPartition == null) { + ArrayList partitionsToJoin = joinedPartitions.getBestMatchingPartitions(seed.getIndices()); + currentLHSPartition = ComposedPartition.buildPartition(partitionsToJoin); + this.joinedPartitions.addPartition(currentLHSPartition); + } + currentLHSJoinedRHSPartition = this.joinedPartitions.get(currentLHSPartition.getIndices().setCopy(currentRHSIndex)); + if (currentLHSJoinedRHSPartition == null) { + currentLHSJoinedRHSPartition = new ComposedPartition(currentLHSPartition, currentRHSPartition); + this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition); + } + } +// this.joinedPartitions.addPartition(currentLHSPartition); +// this.joinedPartitions.addPartition(currentLHSJoinedRHSPartition); + } + + if (Partition.representsFD(currentLHSPartition, currentLHSJoinedRHSPartition)) { + Observation observationOfLHS = this.observations.updateDependencyType(seed.getIndices()); + this.observations.put(seed.getIndices(), observationOfLHS); + this.dependencies.add(seed.getIndices()); + return observationOfLHS; + } + Observation observationOfLHS = this.observations.updateNonDependencyType(seed.getIndices(), currentRHSIndex); + this.observations.put(seed.getIndices(), observationOfLHS); + this.nonDependencies.add(seed.getIndices()); + return observationOfLHS; + } + + private Stack nextSeeds(int currentRHSIndex) { +// System.out.println("Find holes"); + THashSet deps = new THashSet<>(); + ArrayList currentMaximalNonDependencies = maximalNonDependencies.getLHSForRHS(currentRHSIndex); + HashSet currentMinimalDependencies = new HashSet<>(minimalDependencies.getLHSForRHS(currentRHSIndex)); + ArrayList newDeps = new ArrayList<>(numberOfColumns * deps.size()); +// Holes holes = new Holes(); + +// int i = 0; +// for (ColumnCollection maximalNonDependency : currentMaximalNonDependencies) { +// ColumnCollection complement = maximalNonDependency.setCopy(currentRHSIndex).complement(); +// if (deps.isEmpty()) { +// ColumnCollection emptyColumnIndices = new ColumnCollection(numberOfColumns); +// for (Integer complementColumnIndex : complement.getSetBits()) { +// deps.add(emptyColumnIndices.setCopy(complementColumnIndex)); +// } +// } else { +// for (ColumnCollection dep : deps) { +// int[] setBits = complement.getSetBits(); +// for (int setBit = 0; setBit < setBits.length; setBit++) { +// holes.add(dep.setCopy(setBits[setBit])); +//// System.out.println("Dep:\t" + dep.setCopy(setBits[setBit])); +// } +// } +// // minimize newDeps +// System.out.println(i++ + "\t" + currentMaximalNonDependencies.size()); +// System.out.println("total deps:\t" + deps.size()); +// System.out.println("before minimizing:\t" + holes.size()); +//// ArrayList minimizedNewDeps = minimizeSeeds(newDeps); +// holes.minimize(); +// System.out.println("after minimizing:\t" + holes.size()); +// deps.clear(); +// deps.addAll(holes); +// holes.clear(); +// } +// } + + for (ColumnCollection maximalNonDependency : currentMaximalNonDependencies) { + ColumnCollection complement = maximalNonDependency.setCopy(currentRHSIndex).complement(); + if (deps.isEmpty()) { + ColumnCollection emptyColumnIndices = new ColumnCollection(numberOfColumns); + for (int complementColumnIndex : complement.getSetBits()) { + deps.add(emptyColumnIndices.setCopy(complementColumnIndex)); + } + } else { + for (ColumnCollection dep : deps) { + int[] setBits = complement.getSetBits(); + for (int setBit = 0; setBit < setBits.length; setBit++) { + newDeps.add(dep.setCopy(setBits[setBit])); + } + } + 
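// minimizeSeeds (called just below and defined later in this file) keeps only
// the set-minimal candidates: any candidate LHS that strictly contains another
// candidate is redundant as a seed. A compact sketch of that minimization over
// plain java.util.BitSet values; names here are illustrative, not project API.
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;

final class MinimizeSketch {
	// a is a subset of b exactly when a has no bit outside b
	static boolean isSubsetOf(BitSet a, BitSet b) {
		BitSet difference = (BitSet) a.clone();
		difference.andNot(b);
		return difference.isEmpty();
	}

	static List<BitSet> minimize(List<BitSet> candidates) {
		List<BitSet> minimal = new ArrayList<>();
		for (BitSet candidate : candidates) {
			boolean covered = false;
			for (BitSet kept : minimal) {
				if (isSubsetOf(kept, candidate)) { // an existing smaller set covers it
					covered = true;
					break;
				}
			}
			if (!covered) {
				minimal.removeIf(kept -> isSubsetOf(candidate, kept)); // drop dominated sets
				minimal.add(candidate);
			}
		}
		return minimal;
	}
}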
// minimize newDeps + ArrayList minimizedNewDeps = minimizeSeeds(newDeps); + deps.clear(); + deps.addAll(minimizedNewDeps); + newDeps.clear(); + } + } + + // return only elements that aren't already covered by the minimal + // dependencies + Stack remainingSeeds = new Stack<>(); + deps.removeAll(currentMinimalDependencies); + for (ColumnCollection remainingSeed : deps) { + remainingSeeds.push(new Seed(remainingSeed)); + } + + return remainingSeeds; + } + + private ArrayList minimizeSeeds(ArrayList seeds) { + long maxCardinality = 0; + TLongObjectHashMap> seedsBySize = new TLongObjectHashMap<>(numberOfColumns); + for (ColumnCollection seed : seeds) { + long cardinalityOfSeed = seed.cardinality(); + maxCardinality = Math.max(maxCardinality, cardinalityOfSeed); + seedsBySize.putIfAbsent(cardinalityOfSeed, new ArrayList(seeds.size()/numberOfColumns)); + seedsBySize.get(cardinalityOfSeed).add(seed); + } + + for (long lowerBound = 1; lowerBound < maxCardinality; lowerBound++) { + ArrayList lowerBoundSeeds = seedsBySize.get(lowerBound); + if (lowerBoundSeeds != null) { + for (long upperBound = maxCardinality; upperBound > lowerBound; upperBound--) { + ArrayList upperBoundSeeds = seedsBySize.get(upperBound); + if (upperBoundSeeds != null) { + for (Iterator lowerIt = lowerBoundSeeds.iterator(); lowerIt.hasNext();) { + ColumnCollection lowerSeed = lowerIt.next(); + for (Iterator upperIt = upperBoundSeeds.iterator(); upperIt.hasNext();) { + if (lowerSeed.isSubsetOf(upperIt.next())) { + upperIt.remove(); + } + } + } + } + } + } + } + ArrayList minimizedSeeds = new ArrayList<>(); + for (ArrayList seedList : seedsBySize.valueCollection()) { + for (ColumnCollection seed : seedList) { + minimizedSeeds.add(seed); + } + } + return minimizedSeeds; + } + + private Seed randomTake() { + if (!this.seeds.isEmpty()) { + return this.seeds.pop(); + } + return null; + } + + private Seed randomWalkStep(Seed currentSeed, int currentRHSIndex) { + Observation observationOfSeed = this.observations.get(currentSeed.getIndices()); + + if (observationOfSeed == Observation.CANDIDATE_MINIMAL_DEPENDENCY) { + THashSet uncheckedSubsets = this.observations.getUncheckedMaximalSubsets(currentSeed.getIndices(), columnOrder); + THashSet prunedNonDependencySubsets = nonDependencies.getPrunedSupersets(uncheckedSubsets); + for (ColumnCollection prunedNonDependencySubset : prunedNonDependencySubsets) { + observations.put(prunedNonDependencySubset, Observation.NON_DEPENDENCY); + } + uncheckedSubsets.removeAll(prunedNonDependencySubsets); + if (uncheckedSubsets.isEmpty() && prunedNonDependencySubsets.isEmpty()) { + observations.put(currentSeed.getIndices(), Observation.MINIMAL_DEPENDENCY); + minimalDependencies.addRHSColumn(currentSeed.getIndices(), currentRHSIndex); + } else if (!uncheckedSubsets.isEmpty()) { + ColumnCollection notRepresentedUncheckedSubset = uncheckedSubsets.iterator().next(); + if (notRepresentedUncheckedSubset != null) { + trace.push(currentSeed); + return new Seed(notRepresentedUncheckedSubset); + } + } + } else if (observationOfSeed == Observation.CANDIDATE_MAXIMAL_NON_DEPENDENCY) { + THashSet uncheckedSupersets = this.observations.getUncheckedMinimalSupersets(currentSeed.getIndices(), currentRHSIndex, columnOrder); + THashSet prunedNonDependencySupersets = nonDependencies.getPrunedSupersets(uncheckedSupersets); + THashSet prunedDependencySupersets = dependencies.getPrunedSubsets(uncheckedSupersets); + for (ColumnCollection prunedNonDependencySuperset : prunedNonDependencySupersets) { + 
observations.put(prunedNonDependencySuperset, Observation.NON_DEPENDENCY); + } + for (ColumnCollection prunedDependencySuperset : prunedDependencySupersets) { + observations.put(prunedDependencySuperset, Observation.DEPENDENCY); + } + uncheckedSupersets.removeAll(prunedDependencySupersets); + uncheckedSupersets.removeAll(prunedNonDependencySupersets); + if (uncheckedSupersets.isEmpty() && prunedNonDependencySupersets.isEmpty()) { + observations.put(currentSeed.getIndices(), Observation.MAXIMAL_NON_DEPENDENCY); + maximalNonDependencies.addRHSColumn(currentSeed.getIndices(), currentRHSIndex); + } else if (!uncheckedSupersets.isEmpty()) { + ColumnCollection notRepresentedUncheckedSuperset = uncheckedSupersets.iterator().next(); + if (notRepresentedUncheckedSuperset != null) { + trace.push(currentSeed); + int additionalColumn = notRepresentedUncheckedSuperset.removeCopy(currentSeed.getIndices()).nextSetBit(0); + return new Seed(notRepresentedUncheckedSuperset, additionalColumn); + } + } + } + if (!this.trace.isEmpty()) { + Seed nextSeed = this.trace.pop(); + return nextSeed; + } + return null; + } + + public FunctionalDependencies getDependencies() { + return this.minimalDependencies; + } } \ No newline at end of file diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java index 4a74fd5..722cd8e 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/ColumnCollection.java @@ -1,218 +1,218 @@ -package fdiscovery.columns; - -import java.util.BitSet; - -public class ColumnCollection extends BitSet implements Comparable { - - private static final long serialVersionUID = -5256272139963505719L; - - private int formatStringWidth; - protected int numberOfColumns; - protected int[] setBits; - - public ColumnCollection(int numberOfColumns ) { - this.numberOfColumns = numberOfColumns; - this.formatStringWidth = (int)Math.ceil(Math.log10(this.numberOfColumns)); - } - - public int[] getSetBits() { - int[] setBits = new int[this.cardinality()]; - - int bitIndex = 0; - int currentArrayIndex = 0; - while (bitIndex < this.numberOfColumns) { - int currentNextSetBit = this.nextSetBit(bitIndex); - if (currentNextSetBit != -1) { - setBits[currentArrayIndex++] = currentNextSetBit; - bitIndex = currentNextSetBit + 1; - } else { - bitIndex = this.numberOfColumns; - } - } - - return setBits; - } - - public boolean isAtomic() { - return this.cardinality() == 1; - } - - public ColumnCollection addColumn(int columnIndex) { - ColumnCollection copy = (ColumnCollection) this.clone(); - copy.set(columnIndex); - - return copy; - } - - public ColumnCollection andCopy(ColumnCollection other) { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.and(other); - - return copy; - } - - public ColumnCollection clearCopy(int startBit) { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.clear(startBit); - - return copy; - } - - public ColumnCollection clearAllCopy() { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.clear(0, this.numberOfColumns); - - return copy; - } - - public ColumnCollection andNotCopy(ColumnCollection other) { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.andNot(other); - - return copy; - } - - public ColumnCollection removeCopy(ColumnCollection other) { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.andNot(other); - - return copy; - } - - public ColumnCollection orCopy(ColumnCollection 
other) { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.or(other); - - return copy; - } - - public ColumnCollection setCopy(int index) { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.set(index); - - return copy; - } - - public ColumnCollection xorCopy(ColumnCollection other) { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.xor(other); - - return copy; - } - - public ColumnCollection complementCopy() { - ColumnCollection copy = (ColumnCollection)this.clone(); - copy.flip(0, this.numberOfColumns); - - return copy; - } - - public ColumnCollection complement() { - this.flip(0, this.numberOfColumns); - return this; - } - - public boolean isSubsetOf(ColumnCollection other) { - return this.unionCount(other) == other.cardinality(); - } - - public boolean isSupersetOf(ColumnCollection other) { - return this.unionCount(other) == this.cardinality(); - - } - - public boolean isProperSubsetOf(ColumnCollection other) { - long cardinality = this.cardinality(); - long otherCardinality = other.cardinality(); - if (cardinality != otherCardinality) { - if (this.unionCount(other) == otherCardinality) { - return true; - } - } - return false; - } - - - public boolean isProperSupersetOf(ColumnCollection other) { - long cardinality = this.cardinality(); - long otherCardinality = other.cardinality(); - if (cardinality != otherCardinality) { - if (this.unionCount(other) == cardinality) { - return true; - } - } - return false; - } - - public int unionCount(ColumnCollection other) { - ColumnCollection union = (ColumnCollection) this.clone(); - union.and(other); - return union.cardinality(); - } - - public boolean isSubsetOrSupersetOf(ColumnCollection other) { - return isSubsetOf(other) || isSupersetOf(other); - } - - public int getNumberOfColumns() { - return this.numberOfColumns; - } - - public long getMostRightBit() { - int bitIndex = 0; - while (bitIndex < this.numberOfColumns) { - int currentNextSetBit = this.nextSetBit(bitIndex); - if (currentNextSetBit != -1) { - bitIndex = currentNextSetBit + 1; - } else { - return bitIndex - 1; - } - } - return bitIndex; - } - - public ColumnCollection removeColumnCopy(int columnIndex) { - ColumnCollection copy = (ColumnCollection) this.clone(); - copy.clear(columnIndex); - - return copy; - } - - @Override - public int compareTo(BitSet other) { - ColumnCollection copy = (ColumnCollection) this.clone(); - copy.xor(other); - int lowestBit = copy.nextSetBit(0); - if (lowestBit == -1) { - return 0; - } else if (this.get(lowestBit)) { - return -1; - } else { - return 1; - } - } - - public String toString() { - StringBuilder outputBuilder = new StringBuilder(); - if (this.cardinality() > 0) { - for (int columnIndex : this.getSetBits()) { - outputBuilder.append(String.format("%0" + formatStringWidth + "d,", Integer.valueOf(columnIndex))); - - } - } else { - outputBuilder.append("emptyset"); - } - - return outputBuilder.toString(); - } - - public void remove(ColumnCollection other) { - this.andNot(other); - } - - public static int intersectionCount(ColumnCollection set1, ColumnCollection set2) { - ColumnCollection intersection = (ColumnCollection) set1.clone(); - intersection.and(set2); - return intersection.cardinality(); - } -} +package fdiscovery.columns; + +import java.util.BitSet; + +public class ColumnCollection extends BitSet implements Comparable { + + private static final long serialVersionUID = -5256272139963505719L; + + private int formatStringWidth; + protected int numberOfColumns; + protected int[] setBits; + + 
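// The one behavioural change in this file's diff is in unionCount below: the
// removed version called union.and(other), which actually computes an
// intersection count and makes isSubsetOf/isSupersetOf report wrong results;
// the new version calls union.or(other), a true union. A quick illustration of
// the subset test built on union cardinality, using plain BitSets:
import java.util.BitSet;

final class UnionCountDemo {
	static int unionCount(BitSet a, BitSet b) {
		BitSet union = (BitSet) a.clone();
		union.or(b); // as in the new version; and() would yield the intersection size
		return union.cardinality();
	}

	public static void main(String[] args) {
		BitSet small = new BitSet(); small.set(0); small.set(1);               // {0,1}
		BitSet large = new BitSet(); large.set(0); large.set(1); large.set(2); // {0,1,2}
		// a is a subset of b exactly when |a union b| == |b|
		System.out.println(unionCount(small, large) == large.cardinality()); // true
		System.out.println(unionCount(large, small) == small.cardinality()); // false
	}
}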
public ColumnCollection(int numberOfColumns ) { + this.numberOfColumns = numberOfColumns; + this.formatStringWidth = (int)Math.ceil(Math.log10(this.numberOfColumns)); + } + + public int[] getSetBits() { + int[] setBits = new int[this.cardinality()]; + + int bitIndex = 0; + int currentArrayIndex = 0; + while (bitIndex < this.numberOfColumns) { + int currentNextSetBit = this.nextSetBit(bitIndex); + if (currentNextSetBit != -1) { + setBits[currentArrayIndex++] = currentNextSetBit; + bitIndex = currentNextSetBit + 1; + } else { + bitIndex = this.numberOfColumns; + } + } + + return setBits; + } + + public boolean isAtomic() { + return this.cardinality() == 1; + } + + public ColumnCollection addColumn(int columnIndex) { + ColumnCollection copy = (ColumnCollection) this.clone(); + copy.set(columnIndex); + + return copy; + } + + public ColumnCollection andCopy(ColumnCollection other) { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.and(other); + + return copy; + } + + public ColumnCollection clearCopy(int startBit) { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.clear(startBit); + + return copy; + } + + public ColumnCollection clearAllCopy() { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.clear(0, this.numberOfColumns); + + return copy; + } + + public ColumnCollection andNotCopy(ColumnCollection other) { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.andNot(other); + + return copy; + } + + public ColumnCollection removeCopy(ColumnCollection other) { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.andNot(other); + + return copy; + } + + public ColumnCollection orCopy(ColumnCollection other) { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.or(other); + + return copy; + } + + public ColumnCollection setCopy(int index) { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.set(index); + + return copy; + } + + public ColumnCollection xorCopy(ColumnCollection other) { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.xor(other); + + return copy; + } + + public ColumnCollection complementCopy() { + ColumnCollection copy = (ColumnCollection)this.clone(); + copy.flip(0, this.numberOfColumns); + + return copy; + } + + public ColumnCollection complement() { + this.flip(0, this.numberOfColumns); + return this; + } + + public boolean isSubsetOf(ColumnCollection other) { + return this.unionCount(other) == other.cardinality(); + } + + public boolean isSupersetOf(ColumnCollection other) { + return this.unionCount(other) == this.cardinality(); + + } + + public boolean isProperSubsetOf(ColumnCollection other) { + long cardinality = this.cardinality(); + long otherCardinality = other.cardinality(); + if (cardinality != otherCardinality) { + if (this.unionCount(other) == otherCardinality) { + return true; + } + } + return false; + } + + + public boolean isProperSupersetOf(ColumnCollection other) { + long cardinality = this.cardinality(); + long otherCardinality = other.cardinality(); + if (cardinality != otherCardinality) { + if (this.unionCount(other) == cardinality) { + return true; + } + } + return false; + } + + public int unionCount(ColumnCollection other) { + ColumnCollection union = (ColumnCollection) this.clone(); + union.or(other); + return union.cardinality(); + } + + public boolean isSubsetOrSupersetOf(ColumnCollection other) { + return isSubsetOf(other) || isSupersetOf(other); + } + + public int getNumberOfColumns() { + return this.numberOfColumns; + } + + public long 
getMostRightBit() { + int bitIndex = 0; + while (bitIndex < this.numberOfColumns) { + int currentNextSetBit = this.nextSetBit(bitIndex); + if (currentNextSetBit != -1) { + bitIndex = currentNextSetBit + 1; + } else { + return bitIndex - 1; + } + } + return bitIndex; + } + + public ColumnCollection removeColumnCopy(int columnIndex) { + ColumnCollection copy = (ColumnCollection) this.clone(); + copy.clear(columnIndex); + + return copy; + } + + @Override + public int compareTo(BitSet other) { + ColumnCollection copy = (ColumnCollection) this.clone(); + copy.xor(other); + int lowestBit = copy.nextSetBit(0); + if (lowestBit == -1) { + return 0; + } else if (this.get(lowestBit)) { + return -1; + } else { + return 1; + } + } + + public String toString() { + StringBuilder outputBuilder = new StringBuilder(); + if (this.cardinality() > 0) { + for (int columnIndex : this.getSetBits()) { + outputBuilder.append(String.format("%0" + formatStringWidth + "d,", Integer.valueOf(columnIndex))); + + } + } else { + outputBuilder.append("emptyset"); + } + + return outputBuilder.toString(); + } + + public void remove(ColumnCollection other) { + this.andNot(other); + } + + public static int intersectionCount(ColumnCollection set1, ColumnCollection set2) { + ColumnCollection intersection = (ColumnCollection) set1.clone(); + intersection.and(set2); + return intersection.cardinality(); + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java index 9b5cda4..44a6445 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/DifferenceSet.java @@ -1,14 +1,14 @@ -package fdiscovery.columns; - - -public class DifferenceSet extends ColumnCollection { - - private static final long serialVersionUID = -5174627424398542681L; - - public DifferenceSet(AgreeSet agreeSet) { - super(agreeSet.getNumberOfColumns()); - - this.or(agreeSet); - this.flip(0, this.numberOfColumns); - } -} +package fdiscovery.columns; + + +public class DifferenceSet extends ColumnCollection { + + private static final long serialVersionUID = -5174627424398542681L; + + public DifferenceSet(AgreeSet agreeSet) { + super(agreeSet.getNumberOfColumns()); + + this.or(agreeSet); + this.flip(0, this.numberOfColumns); + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java index 06b713b..7136090 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/Path.java @@ -1,26 +1,26 @@ -package fdiscovery.columns; - -import java.util.ArrayList; - - -public class Path extends ColumnCollection { - - private static final long serialVersionUID = -6451347203736964695L; - - public Path(int numberOfColumns) { - super(numberOfColumns); - } - - public ArrayList getMaximalSubsets() { - ArrayList maximalSubsetPaths = new ArrayList<>(); - - if (this.isEmpty()) { - return new ArrayList<>(); - } - for (int columnIndex : this.getSetBits()) { - maximalSubsetPaths.add((Path)this.removeColumnCopy(columnIndex)); - } - - return maximalSubsetPaths; - } -} +package fdiscovery.columns; + +import java.util.ArrayList; + + +public class Path extends ColumnCollection { + + private static final long serialVersionUID = -6451347203736964695L; + + public Path(int numberOfColumns) { + super(numberOfColumns); + } + + public ArrayList getMaximalSubsets() { + ArrayList maximalSubsetPaths = new ArrayList<>(); + + if (this.isEmpty()) { + return new 
ArrayList<>(); + } + for (int columnIndex : this.getSetBits()) { + maximalSubsetPaths.add((Path)this.removeColumnCopy(columnIndex)); + } + + return maximalSubsetPaths; + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java b/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java index 9fd3d05..9fee75a 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/columns/Seed.java @@ -1,62 +1,62 @@ -package fdiscovery.columns; - -import fdiscovery.partitions.FileBasedPartition; -import fdiscovery.partitions.Partition; - -public class Seed implements Comparable { - - private ColumnCollection indices; - private int additionalColumnIndex; - private double distinctiveness; - - public Seed(Partition a, FileBasedPartition b) { - this.indices = a.getIndices().orCopy(b.getIndices()); - this.additionalColumnIndex = b.getIndex(); - this.distinctiveness = Partition.estimateDistinctiveness(a, b); - } - - // inverse order - @Override - public int compareTo(Seed o) { - if (this.distinctiveness != o.distinctiveness) { - if (o.distinctiveness - this.distinctiveness < 0) { - return -1; - } - return 1; - } - return this.indices.compareTo(o.indices); - } - - @Override - public boolean equals(Object o) { - if (o == null) { - return false; - } - if (o == this) { - return true; - } - if (!(o instanceof Seed)) { - return false; - } - Seed otherSeed = (Seed) o; - return this.distinctiveness == otherSeed.distinctiveness && this.indices.compareTo(otherSeed.indices) == 0; - } - - public ColumnCollection getBaseIndices() { - return this.indices.removeColumnCopy(additionalColumnIndex); - } - - public ColumnCollection getIndices() { - return this.indices; - } - - public int getAdditionalColumnIndex() { - return this.additionalColumnIndex; - } - - public String toString() { - StringBuilder outputBuilder = new StringBuilder(); - outputBuilder.append(String.format("Seed: [%s]\t%f", this.indices, Double.valueOf(this.distinctiveness))); - return outputBuilder.toString(); - } -} +package fdiscovery.columns; + +import fdiscovery.partitions.FileBasedPartition; +import fdiscovery.partitions.Partition; + +public class Seed implements Comparable { + + private ColumnCollection indices; + private int additionalColumnIndex; + private double distinctiveness; + + public Seed(Partition a, FileBasedPartition b) { + this.indices = a.getIndices().orCopy(b.getIndices()); + this.additionalColumnIndex = b.getIndex(); + this.distinctiveness = Partition.estimateDistinctiveness(a, b); + } + + // inverse order + @Override + public int compareTo(Seed o) { + if (this.distinctiveness != o.distinctiveness) { + if (o.distinctiveness - this.distinctiveness < 0) { + return -1; + } + return 1; + } + return this.indices.compareTo(o.indices); + } + + @Override + public boolean equals(Object o) { + if (o == null) { + return false; + } + if (o == this) { + return true; + } + if (!(o instanceof Seed)) { + return false; + } + Seed otherSeed = (Seed) o; + return this.distinctiveness == otherSeed.distinctiveness && this.indices.compareTo(otherSeed.indices) == 0; + } + + public ColumnCollection getBaseIndices() { + return this.indices.removeColumnCopy(additionalColumnIndex); + } + + public ColumnCollection getIndices() { + return this.indices; + } + + public int getAdditionalColumnIndex() { + return this.additionalColumnIndex; + } + + public String toString() { + StringBuilder outputBuilder = new StringBuilder(); + outputBuilder.append(String.format("Seed: [%s]\t%f", this.indices, 
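+ // Distinctiveness is printed alongside the indices; together with the
+ // "inverse order" compareTo above, a sorted collection of seeds surfaces
+ // the highest-distinctiveness candidate first, presumably so the miner can
+ // extend the most promising column combination before the rest.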
Double.valueOf(this.distinctiveness))); + return outputBuilder.toString(); + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java index 5e3c2c0..2cb0e39 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupHashSet.java @@ -1,50 +1,50 @@ -package fdiscovery.equivalence; - -import java.util.HashSet; -import java.util.Set; - -public class EquivalenceGroupHashSet extends HashSet implements Comparable, Equivalence { - - private static final long serialVersionUID = 8411462245069900864L; - - private int identifier; - - public EquivalenceGroupHashSet() { - this.identifier = Equivalence.unassignedIdentifier; - } - - public EquivalenceGroupHashSet(int identifier) { - this.identifier = identifier; - } - - @Override - public int compareTo(EquivalenceGroupHashSet o) { - if (this.size() != o.size()) { - return this.size() - o.size(); - } - return this.identifier - o.identifier; - } - - @Override - public int getIdentifier() { - return this.identifier; - } - - @Override - public > boolean isProperSubset(T other) { - if (this.size() >= other.size()) { - return false; - } - - return other.containsAll(this); - } - - @Override - public void add(int value) { - if (this.identifier == Equivalence.unassignedIdentifier) { - this.identifier = value; - } - - super.add(Integer.valueOf(value)); - } +package fdiscovery.equivalence; + +import java.util.HashSet; +import java.util.Set; + +public class EquivalenceGroupHashSet extends HashSet implements Comparable, Equivalence { + + private static final long serialVersionUID = 8411462245069900864L; + + private int identifier; + + public EquivalenceGroupHashSet() { + this.identifier = Equivalence.unassignedIdentifier; + } + + public EquivalenceGroupHashSet(int identifier) { + this.identifier = identifier; + } + + @Override + public int compareTo(EquivalenceGroupHashSet o) { + if (this.size() != o.size()) { + return this.size() - o.size(); + } + return this.identifier - o.identifier; + } + + @Override + public int getIdentifier() { + return this.identifier; + } + + @Override + public > boolean isProperSubset(T other) { + if (this.size() >= other.size()) { + return false; + } + + return other.containsAll(this); + } + + @Override + public void add(int value) { + if (this.identifier == Equivalence.unassignedIdentifier) { + this.identifier = value; + } + + super.add(Integer.valueOf(value)); + } } \ No newline at end of file diff --git a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java index ce33016..f89ce2d 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/equivalence/EquivalenceGroupTIntHashSet.java @@ -43,6 +43,6 @@ public int compareTo(EquivalenceGroupTIntHashSet o) { if (this.size() != o.size()) { return this.size() - o.size(); } - return this.identifier - o.identifier; + return this.identifier - o.identifier; } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java index 05ead74..35cc7c9 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/PartialOrder.java @@ -1,75 +1,75 @@ -package fdiscovery.fastfds; - -import 
gnu.trove.map.hash.TIntIntHashMap; - -import java.util.ArrayList; -import java.util.Collections; - -import fdiscovery.columns.DifferenceSet; -import fdiscovery.columns.DifferenceSets; - -public class PartialOrder extends ArrayList { - - private static final long serialVersionUID = -4312148937513750522L; - - public PartialOrder(DifferenceSets differenceSets) { - TIntIntHashMap orderMap = new TIntIntHashMap(); - - for (DifferenceSet differenceSet : differenceSets) { - // increase the cover count for set columns - int bitIndex = 0; - while (bitIndex < differenceSet.getNumberOfColumns()) { - int currentNextSetBit = differenceSet.nextSetBit(bitIndex); - if (currentNextSetBit != -1) { - bitIndex = currentNextSetBit + 1; - orderMap.putIfAbsent(currentNextSetBit, 0); - orderMap.increment(currentNextSetBit); - } else { - bitIndex = differenceSet.getNumberOfColumns(); - } - } - } - - for (int index : orderMap.keys()) { - this.add(new CoverOrder(index, orderMap.get(index))); - } - - Collections.sort(this, Collections.reverseOrder()); - - } - - public PartialOrder(DifferenceSets differenceSets, int columnIndexToSkip) { - TIntIntHashMap orderMap = new TIntIntHashMap(); - - for (DifferenceSet differenceSet : differenceSets) { - // increase the cover count for set columns - int bitIndex = columnIndexToSkip; - while (bitIndex < differenceSet.getNumberOfColumns()) { - int currentNextSetBit = differenceSet.nextSetBit(bitIndex); - if (currentNextSetBit != -1) { - bitIndex = currentNextSetBit + 1; - orderMap.putIfAbsent(currentNextSetBit, 0); - orderMap.increment(currentNextSetBit); - } else { - bitIndex = differenceSet.getNumberOfColumns(); - } - } - } - - for (int index : orderMap.keys()) { - this.add(new CoverOrder(index, orderMap.get(index))); - } - - Collections.sort(this, Collections.reverseOrder()); - - } - - public ArrayList getOrderedColumns() { - ArrayList orderedColumns = new ArrayList<>(); - for (CoverOrder order : this) { - orderedColumns.add(Integer.valueOf(order.getColumnIndex())); - } - - return orderedColumns; - } -} +package fdiscovery.fastfds; + +import gnu.trove.map.hash.TIntIntHashMap; + +import java.util.ArrayList; +import java.util.Collections; + +import fdiscovery.columns.DifferenceSet; +import fdiscovery.columns.DifferenceSets; + +public class PartialOrder extends ArrayList { + + private static final long serialVersionUID = -4312148937513750522L; + + public PartialOrder(DifferenceSets differenceSets) { + TIntIntHashMap orderMap = new TIntIntHashMap(); + + for (DifferenceSet differenceSet : differenceSets) { + // increase the cover count for set columns + int bitIndex = 0; + while (bitIndex < differenceSet.getNumberOfColumns()) { + int currentNextSetBit = differenceSet.nextSetBit(bitIndex); + if (currentNextSetBit != -1) { + bitIndex = currentNextSetBit + 1; + orderMap.putIfAbsent(currentNextSetBit, 0); + orderMap.increment(currentNextSetBit); + } else { + bitIndex = differenceSet.getNumberOfColumns(); + } + } + } + + for (int index : orderMap.keys()) { + this.add(new CoverOrder(index, orderMap.get(index))); + } + + Collections.sort(this, Collections.reverseOrder()); + + } + + public PartialOrder(DifferenceSets differenceSets, int columnIndexToSkip) { + TIntIntHashMap orderMap = new TIntIntHashMap(); + + for (DifferenceSet differenceSet : differenceSets) { + // increase the cover count for set columns + int bitIndex = columnIndexToSkip; + while (bitIndex < differenceSet.getNumberOfColumns()) { + int currentNextSetBit = differenceSet.nextSetBit(bitIndex); + if (currentNextSetBit != -1) 
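+ // Same cover counting as the first constructor, but the scan starts at
+ // columnIndexToSkip, so columns before it never enter the order. Each
+ // column's count is the number of difference sets it occurs in; the
+ // reverse sort below puts the highest-cover columns first for the greedy
+ // cover search in FastFDs.findCovers.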
{ + bitIndex = currentNextSetBit + 1; + orderMap.putIfAbsent(currentNextSetBit, 0); + orderMap.increment(currentNextSetBit); + } else { + bitIndex = differenceSet.getNumberOfColumns(); + } + } + } + + for (int index : orderMap.keys()) { + this.add(new CoverOrder(index, orderMap.get(index))); + } + + Collections.sort(this, Collections.reverseOrder()); + + } + + public ArrayList getOrderedColumns() { + ArrayList orderedColumns = new ArrayList<>(); + for (CoverOrder order : this) { + orderedColumns.add(Integer.valueOf(order.getColumnIndex())); + } + + return orderedColumns; + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java index e8e89c9..26dd8c2 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/fastfds/runner/FastFDs.java @@ -1,219 +1,219 @@ -package fdiscovery.fastfds.runner; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileWriter; -import java.io.IOException; - -import org.apache.commons.cli.CommandLine; - -import fdiscovery.columns.AgreeSets; -import fdiscovery.columns.ColumnCollection; -import fdiscovery.columns.DifferenceSets; -import fdiscovery.columns.Path; - -import com.rits.cloning.Cloner; - -import fdiscovery.partitions.StrippedPartitions; -import fdiscovery.preprocessing.SVFileProcessor; -import fdiscovery.fastfds.EquivalenceClasses; -import fdiscovery.fastfds.MaximalEquivalenceClasses; -import fdiscovery.fastfds.PartialOrder; -import fdiscovery.general.CLIParserMiner; -import fdiscovery.general.ColumnFiles; -import fdiscovery.general.FunctionalDependencies; -import fdiscovery.general.Miner; - -public class FastFDs extends Miner { - - private int numberOfColumns; - private int numberOfRows; - private FunctionalDependencies minimalDependencies; - private DifferenceSets differenceSets; - - @SuppressWarnings("unused") - public static void main2(String[] args) { - createColumDirectory(); - createResultDirectory(); - - File source = new File(Miner.input); - SVFileProcessor inputFileProcessor = null; - try { - long timeStart = System.currentTimeMillis(); - - inputFileProcessor = new SVFileProcessor(source); - inputFileProcessor.init(); - System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter()); - System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns()); - System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows()); - inputFileProcessor.createColumnFiles(); - FastFDs fastFDRunner = new FastFDs(inputFileProcessor); - - fastFDRunner.run(); - System.out.println(String.format("Dependencies: %d.", Integer.valueOf(fastFDRunner.minimalDependencies.getCount()))); - long timeFindFDs = System.currentTimeMillis(); - System.out.println("Total time:\t" + (timeFindFDs - timeStart)/1000 + "s"); - System.out.println(fastFDRunner.getDependencies()); - } catch (FileNotFoundException e) { - System.out.println("The input file could not be found."); - } catch (IOException e) { - System.out.println("The input reader could not be reset."); - } - } - - public static void main(String[] args) { - CLIParserMiner parser = new CLIParserMiner(); - CommandLine cli = parser.parse(args); - String inputFilename = new String(); - String columnFileDirectory = new String(); - String resultFile = new String(); - int numberOfColumns = 0; - int numberOfRows = 0; - - if (cli.hasOption("file")) { - inputFilename = cli.getOptionValue("file"); - } - if (cli.hasOption("input")) 
{ - columnFileDirectory = cli.getOptionValue("input"); - } - if (cli.hasOption("result")) { - resultFile = cli.getOptionValue("result"); - } - if (cli.hasOption("columns")) { - numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue(); - } - if (cli.hasOption("rows")) { - numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue(); - } - ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows); - long timeStart = System.currentTimeMillis(); - try { - FastFDs runner = new FastFDs(columnFiles, numberOfRows); - runner.run(); - long timeEnd = System.currentTimeMillis(); - runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename); - } catch(OutOfMemoryError e) { - System.exit(Miner.STATUS_OOM); - } - System.exit(0); - } - - private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { - String timeString = (time != -1)? String.format("%.1f", Double.valueOf((double)(time) / 1000)) : "-1"; - - StringBuilder outputBuilder = new StringBuilder(); - if (!inputFileName.isEmpty()) { - outputBuilder.append(String.format("%s\t", inputFileName)); - } - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns))); - outputBuilder.append(String.format("%s\t", timeString)); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount()))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(0))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(0))); - outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory()))); - outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false))); - - try { - BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true)); - resultFileWriter.write(outputBuilder.toString()); - System.out.print(outputBuilder.toString()); - resultFileWriter.close(); - } catch (IOException e) { - System.out.println("Couldn't write output."); - } - } - - public FastFDs(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError { - this.minimalDependencies = new FunctionalDependencies(); - this.numberOfColumns = columnFiles.getNumberOfColumns(); - this.numberOfRows = numberOfRows; - - StrippedPartitions strippedPartitions = new StrippedPartitions(columnFiles); - EquivalenceClasses equivalenceClasses = new EquivalenceClasses(strippedPartitions); - MaximalEquivalenceClasses maximalEquivalenceClasses = new MaximalEquivalenceClasses(strippedPartitions); - strippedPartitions.clear(); - AgreeSets agreeSets = new AgreeSets(maximalEquivalenceClasses, equivalenceClasses, this.numberOfColumns, 
this.numberOfRows); - maximalEquivalenceClasses.clear(); - equivalenceClasses.clear(); - this.differenceSets = new DifferenceSets(agreeSets); - agreeSets.clear(); - } - - public FastFDs(SVFileProcessor table) throws OutOfMemoryError { - this.minimalDependencies = new FunctionalDependencies(); - this.numberOfColumns = table.getNumberOfColumns(); - this.numberOfRows = table.getNumberOfRows(); - - ColumnFiles columnFiles = table.getColumnFiles(); - StrippedPartitions strippedPartitions = new StrippedPartitions(columnFiles); - EquivalenceClasses equivalenceClasses = new EquivalenceClasses(strippedPartitions); - MaximalEquivalenceClasses maximalEquivalenceClasses = new MaximalEquivalenceClasses(strippedPartitions); - strippedPartitions.clear(); - AgreeSets agreeSets = new AgreeSets(maximalEquivalenceClasses, equivalenceClasses, this.numberOfColumns, this.numberOfRows); - maximalEquivalenceClasses.clear(); - equivalenceClasses.clear(); - this.differenceSets = new DifferenceSets(agreeSets); - agreeSets.clear(); - } - - public void run() throws OutOfMemoryError { - int numberOfColumns = this.numberOfColumns; - - DifferenceSets[] differenceSetsModulo = this.differenceSets.allModulo(this.numberOfColumns); - for (int rhsIndex = 0; rhsIndex < numberOfColumns; rhsIndex++) { - DifferenceSets orig = differenceSetsModulo[rhsIndex]; - Cloner cloner = new Cloner(); - DifferenceSets uncovered = cloner.deepClone(orig); - if (orig.isEmpty()) { - ColumnCollection lhs = new ColumnCollection(this.numberOfColumns); - - for (int lhsIndex : lhs.setCopy(rhsIndex).complement().getSetBits()) { - this.minimalDependencies.addRHSColumn(lhs.setCopy(lhsIndex), rhsIndex); - } - } - else if (!orig.containsEmptySet()) { - PartialOrder currentOrder = new PartialOrder(orig); - Path path = new Path(numberOfColumns); - findCovers(rhsIndex, orig, uncovered, path, currentOrder); - } - } - } - - public void findCovers(int columnIndex, DifferenceSets orig, DifferenceSets uncovered, Path currentPath, PartialOrder currentOrder) { - // no dependencies here - if (currentOrder.isEmpty() && !uncovered.isEmpty()) { - return; - } - - if (uncovered.isEmpty()) { - if (!orig.maximumSubsetCoversDifferenceSet(currentPath)) { - this.minimalDependencies.addRHSColumn(currentPath, columnIndex); - } else { - // dependency not minimal - return; - } - } - - // RECURSIVE CASE - for (int remainingColumn : currentOrder.getOrderedColumns()) { - DifferenceSets nextDifferenceSets = uncovered.removeCovered(remainingColumn); - PartialOrder nextOrder = new PartialOrder(nextDifferenceSets, remainingColumn); - Path nextPath = (Path) currentPath.addColumn(remainingColumn); - - nextPath.addColumn(remainingColumn); - findCovers(columnIndex, orig, nextDifferenceSets, nextPath, nextOrder); - } - } - - public FunctionalDependencies getDependencies() { - return this.minimalDependencies; - } -} +package fdiscovery.fastfds.runner; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.IOException; + +import org.apache.commons.cli.CommandLine; + +import fdiscovery.columns.AgreeSets; +import fdiscovery.columns.ColumnCollection; +import fdiscovery.columns.DifferenceSets; +import fdiscovery.columns.Path; + +import com.rits.cloning.Cloner; + +import fdiscovery.partitions.StrippedPartitions; +import fdiscovery.preprocessing.SVFileProcessor; +import fdiscovery.fastfds.EquivalenceClasses; +import fdiscovery.fastfds.MaximalEquivalenceClasses; +import fdiscovery.fastfds.PartialOrder; +import 
fdiscovery.general.CLIParserMiner; +import fdiscovery.general.ColumnFiles; +import fdiscovery.general.FunctionalDependencies; +import fdiscovery.general.Miner; + +public class FastFDs extends Miner { + + private int numberOfColumns; + private int numberOfRows; + private FunctionalDependencies minimalDependencies; + private DifferenceSets differenceSets; + + @SuppressWarnings("unused") + public static void main2(String[] args) { + createColumDirectory(); + createResultDirectory(); + + File source = new File(Miner.input); + SVFileProcessor inputFileProcessor = null; + try { + long timeStart = System.currentTimeMillis(); + + inputFileProcessor = new SVFileProcessor(source); + inputFileProcessor.init(); + System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter()); + System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns()); + System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows()); + inputFileProcessor.createColumnFiles(); + FastFDs fastFDRunner = new FastFDs(inputFileProcessor); + + fastFDRunner.run(); + System.out.println(String.format("Dependencies: %d.", Integer.valueOf(fastFDRunner.minimalDependencies.getCount()))); + long timeFindFDs = System.currentTimeMillis(); + System.out.println("Total time:\t" + (timeFindFDs - timeStart)/1000 + "s"); + System.out.println(fastFDRunner.getDependencies()); + } catch (FileNotFoundException e) { + System.out.println("The input file could not be found."); + } catch (IOException e) { + System.out.println("The input reader could not be reset."); + } + } + + public static void main(String[] args) { + CLIParserMiner parser = new CLIParserMiner(); + CommandLine cli = parser.parse(args); + String inputFilename = new String(); + String columnFileDirectory = new String(); + String resultFile = new String(); + int numberOfColumns = 0; + int numberOfRows = 0; + + if (cli.hasOption("file")) { + inputFilename = cli.getOptionValue("file"); + } + if (cli.hasOption("input")) { + columnFileDirectory = cli.getOptionValue("input"); + } + if (cli.hasOption("result")) { + resultFile = cli.getOptionValue("result"); + } + if (cli.hasOption("columns")) { + numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue(); + } + if (cli.hasOption("rows")) { + numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue(); + } + ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows); + long timeStart = System.currentTimeMillis(); + try { + FastFDs runner = new FastFDs(columnFiles, numberOfRows); + runner.run(); + long timeEnd = System.currentTimeMillis(); + runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename); + } catch(OutOfMemoryError e) { + System.exit(Miner.STATUS_OOM); + } + System.exit(0); + } + + private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { + String timeString = (time != -1)? 
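+ // -1 marks "no measurement"; real timings are rendered as seconds with
+ // one decimal place.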
String.format("%.1f", Double.valueOf((double)(time) / 1000)) : "-1"; + + StringBuilder outputBuilder = new StringBuilder(); + if (!inputFileName.isEmpty()) { + outputBuilder.append(String.format("%s\t", inputFileName)); + } + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns))); + outputBuilder.append(String.format("%s\t", timeString)); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount()))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(0))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(0))); + outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory()))); + outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false))); + + try { + BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true)); + resultFileWriter.write(outputBuilder.toString()); + System.out.print(outputBuilder.toString()); + resultFileWriter.close(); + } catch (IOException e) { + System.out.println("Couldn't write output."); + } + } + + public FastFDs(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError { + this.minimalDependencies = new FunctionalDependencies(); + this.numberOfColumns = columnFiles.getNumberOfColumns(); + this.numberOfRows = numberOfRows; + + StrippedPartitions strippedPartitions = new StrippedPartitions(columnFiles); + EquivalenceClasses equivalenceClasses = new EquivalenceClasses(strippedPartitions); + MaximalEquivalenceClasses maximalEquivalenceClasses = new MaximalEquivalenceClasses(strippedPartitions); + strippedPartitions.clear(); + AgreeSets agreeSets = new AgreeSets(maximalEquivalenceClasses, equivalenceClasses, this.numberOfColumns, this.numberOfRows); + maximalEquivalenceClasses.clear(); + equivalenceClasses.clear(); + this.differenceSets = new DifferenceSets(agreeSets); + agreeSets.clear(); + } + + public FastFDs(SVFileProcessor table) throws OutOfMemoryError { + this.minimalDependencies = new FunctionalDependencies(); + this.numberOfColumns = table.getNumberOfColumns(); + this.numberOfRows = table.getNumberOfRows(); + + ColumnFiles columnFiles = table.getColumnFiles(); + StrippedPartitions strippedPartitions = new StrippedPartitions(columnFiles); + EquivalenceClasses equivalenceClasses = new EquivalenceClasses(strippedPartitions); + MaximalEquivalenceClasses maximalEquivalenceClasses = new MaximalEquivalenceClasses(strippedPartitions); + strippedPartitions.clear(); + AgreeSets agreeSets = new AgreeSets(maximalEquivalenceClasses, equivalenceClasses, this.numberOfColumns, this.numberOfRows); + maximalEquivalenceClasses.clear(); + equivalenceClasses.clear(); + 
this.differenceSets = new DifferenceSets(agreeSets); + agreeSets.clear(); + } + + public void run() throws OutOfMemoryError { + int numberOfColumns = this.numberOfColumns; + + DifferenceSets[] differenceSetsModulo = this.differenceSets.allModulo(this.numberOfColumns); + for (int rhsIndex = 0; rhsIndex < numberOfColumns; rhsIndex++) { + DifferenceSets orig = differenceSetsModulo[rhsIndex]; + Cloner cloner = new Cloner(); + DifferenceSets uncovered = cloner.deepClone(orig); + if (orig.isEmpty()) { + ColumnCollection lhs = new ColumnCollection(this.numberOfColumns); + + for (int lhsIndex : lhs.setCopy(rhsIndex).complement().getSetBits()) { + this.minimalDependencies.addRHSColumn(lhs.setCopy(lhsIndex), rhsIndex); + } + } + else if (!orig.containsEmptySet()) { + PartialOrder currentOrder = new PartialOrder(orig); + Path path = new Path(numberOfColumns); + findCovers(rhsIndex, orig, uncovered, path, currentOrder); + } + } + } + + public void findCovers(int columnIndex, DifferenceSets orig, DifferenceSets uncovered, Path currentPath, PartialOrder currentOrder) { + // no dependencies here + if (currentOrder.isEmpty() && !uncovered.isEmpty()) { + return; + } + + if (uncovered.isEmpty()) { + if (!orig.maximumSubsetCoversDifferenceSet(currentPath)) { + this.minimalDependencies.addRHSColumn(currentPath, columnIndex); + } else { + // dependency not minimal + return; + } + } + + // RECURSIVE CASE + for (int remainingColumn : currentOrder.getOrderedColumns()) { + DifferenceSets nextDifferenceSets = uncovered.removeCovered(remainingColumn); + PartialOrder nextOrder = new PartialOrder(nextDifferenceSets, remainingColumn); + Path nextPath = (Path) currentPath.addColumn(remainingColumn); + + nextPath.addColumn(remainingColumn); + findCovers(columnIndex, orig, nextDifferenceSets, nextPath, nextOrder); + } + } + + public FunctionalDependencies getDependencies() { + return this.minimalDependencies; + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java b/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java index 43d0172..afc934c 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/general/Benchmarker.java @@ -1,219 +1,219 @@ -package fdiscovery.general; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileWriter; -import java.io.FilenameFilter; -import java.io.IOException; -import java.util.Arrays; - -import org.apache.commons.exec.CommandLine; -import org.apache.commons.exec.DefaultExecuteResultHandler; -import org.apache.commons.exec.DefaultExecutor; -import org.apache.commons.exec.ExecuteWatchdog; -import org.apache.commons.exec.PumpStreamHandler; - -import fdiscovery.preprocessing.SVFileProcessor; -import gnu.trove.map.hash.THashMap; - -public class Benchmarker { - - protected static File[] getBenchmarkFilesWithPattern(File benchmarkDirectory) { - File[] benchmarkFiles = benchmarkDirectory.listFiles(new FilenameFilter() { - - @Override - public boolean accept(File dir, String name) { - return name.matches(Miner.BENCHMARK_FILE_REGEX); - } - }); - return benchmarkFiles; - } - - protected static final String getResultFileName(String inputDirectory, String miner) { - String[] splitInputDirectory = inputDirectory.split("\\" + File.separator); - if (splitInputDirectory.length >= 2) { - String staticComponent = splitInputDirectory[splitInputDirectory.length-1]; - String source = splitInputDirectory[splitInputDirectory.length-2]; - return String.format("%s%s-%s-%s.dat", 
Miner.RESULT_FILE_PATH, miner, staticComponent, source); - } - return new String(); - } - - protected static final void writeErrorCode(File resultFile, int exitCode) { - try { - BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(resultFile, true)); - if (exitCode == Miner.STATUS_OOT) { - resultFileWriter.write("#OOT"); - } else if (exitCode == Miner.STATUS_OOM) { - resultFileWriter.write("#OOM"); - } - resultFileWriter.close(); - } catch (IOException e) { - System.out.println("Couldn't write meta data."); - } - } - - protected static final void writeMetaData(File resultFile, THashMap cmdLine) { - StringBuilder metaDataLineBuilder = new StringBuilder(); - for (String optionKey : cmdLine.keySet()) { - if (cmdLine.get(optionKey) != null) { - metaDataLineBuilder.append(String.format("# %s :\t%s\n", optionKey, cmdLine.get(optionKey))); - System.out.print(String.format("# %s :\t%s\n", optionKey, cmdLine.get(optionKey))); - } else { - metaDataLineBuilder.append(String.format("# %s :\t%s\n", optionKey, "true")); - System.out.print(String.format("# %s :\t%s\n", optionKey, "true")); - } - } - metaDataLineBuilder.append("#Filename\t#Rows\t#Columns\tTime\t#Deps\t#<2Deps\t#<3Deps\t#<4Deps\t#<5Deps\t#<6Deps\t#>5Deps\t#Partitions\n"); - System.out.println("#Filename\t#Rows\t#Columns\tTime\t#Deps\t#<2Deps\t#<3Deps\t#<4Deps\t#<5Deps\t#<6Deps\t#>5Deps\t#Partitions\n"); - try { - BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(resultFile)); - resultFileWriter.write(metaDataLineBuilder.toString()); - resultFileWriter.close(); - } catch (IOException e) { - System.out.println("Couldn't write meta data."); - } - } - - public static void main(String[] args) { - CLIParserBenchmarker parser = new CLIParserBenchmarker(); - THashMap cmdLine = parser.parse(args); - String inputDirectoryName = new String(); - String miner = new String(); - char delimiter = '\t'; - String xmx = new String(); - int timeout = -1; - boolean allFiles = false; - - if (cmdLine.contains("input")) { - inputDirectoryName = cmdLine.get("input"); - } - if (cmdLine.contains("miner")) { - miner = cmdLine.get("miner"); - } - if (cmdLine.contains("delimiter")) { - delimiter = (cmdLine.get("delimiter")).charAt(0); - } - if (cmdLine.contains("xmx")) { - xmx = cmdLine.get("xmx"); - } - if (cmdLine.contains("timeout")) { - System.out.println(String.format("Timeout:%s", cmdLine.get("timeout"))); - timeout = Integer.valueOf(cmdLine.get("timeout")).intValue(); - } - if (cmdLine.containsKey("all")) { - System.out.println("Use all files."); - allFiles = true; - } - File executable = null; - if (miner.equals("tane")) { - executable = new File("tane.jar"); - } else if (miner.equals("fastfds")) { - executable = new File("fastfds.jar"); - } else if (miner.equals("dfd")) { - executable = new File("dfd.jar"); - } - else { - System.out.println(String.format("No valid miner:\t%s", miner)); - System.exit(1); - } - - File inputDirectory = new File(inputDirectoryName); - if (!inputDirectory.exists()) { - System.out.println("Input directory doesn't exist."); - System.exit(1); - } - - File[] benchmarkFiles = new File[0]; - if (allFiles) { - benchmarkFiles = inputDirectory.listFiles(); - } else { - benchmarkFiles = getBenchmarkFilesWithPattern(inputDirectory); - } - Arrays.sort(benchmarkFiles); - - if (benchmarkFiles.length != 0) { - Miner.createColumDirectory(); - Miner.createResultDirectory(); - String resultFilename = getResultFileName(inputDirectory.getAbsolutePath(), miner); - File resultFile = new File(resultFilename); - 
writeMetaData(resultFile, cmdLine); - boolean errors = false; - for (File benchmarkFile : benchmarkFiles) { - if (!errors) { - try { - // create columns files and collect meta data - SVFileProcessor fileProcessor = new SVFileProcessor(benchmarkFile); - fileProcessor.init(delimiter); - fileProcessor.createColumnFiles(); - - // build command line with parameters - CommandLine processCmdLine = new CommandLine("java"); - processCmdLine.addArgument("-d64"); - processCmdLine.addArgument("-XX:GCTimeLimit=90"); - processCmdLine.addArgument("-XX:GCHeapFreeLimit=10"); - processCmdLine.addArgument("-XX:+UseSerialGC"); - processCmdLine.addArgument(String.format("-Xmx%s", xmx)); - processCmdLine.addArgument("-jar"); - processCmdLine.addArgument(executable.getName()); - processCmdLine.addArgument("-file"); - processCmdLine.addArgument(String.valueOf(benchmarkFile.getName())); - processCmdLine.addArgument("-columns"); - processCmdLine.addArgument(String.valueOf(fileProcessor.getNumberOfColumns())); - processCmdLine.addArgument("-rows"); - processCmdLine.addArgument(String.valueOf(fileProcessor.getNumberOfRows())); - processCmdLine.addArgument("-result"); - processCmdLine.addArgument(resultFile.getAbsolutePath()); - processCmdLine.addArgument("-input"); - processCmdLine.addArgument(fileProcessor.getColumnDirectoryName()); - - // build process with watchdog - DefaultExecutor executor = new DefaultExecutor(); - ExecuteWatchdog watchdog = new ExecuteWatchdog(timeout); - executor.setWatchdog(watchdog); - - // handle results - DefaultExecuteResultHandler resultHandler = new DefaultExecuteResultHandler(); - PumpStreamHandler streamHandler = new PumpStreamHandler(); - executor.setStreamHandler(streamHandler); - long timeStart = System.currentTimeMillis(); - executor.execute(processCmdLine, resultHandler); - resultHandler.waitFor(timeout); - - long timeEnd = System.currentTimeMillis(); - System.out.println(String.format("Time:%.1f", Double.valueOf((double)(timeEnd - timeStart) / 1000))); - - int exitCode = 0; - if (resultHandler.hasResult()) { - exitCode = resultHandler.getExitValue(); - } else { - exitCode = Miner.STATUS_OOT; - executor.getWatchdog().destroyProcess(); - } - - if (watchdog.killedProcess()) { - exitCode = Miner.STATUS_OOT; - executor.getWatchdog().destroyProcess(); - } else { - } - System.out.println(String.format("ExitCode %d", Integer.valueOf(exitCode))); - if (exitCode == Miner.STATUS_OK) { - - } else if (exitCode == Miner.STATUS_OOT || exitCode == Miner.STATUS_OOM) { - writeErrorCode(resultFile, exitCode); - errors = true; - } - } catch (FileNotFoundException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - } - } - - } -} +package fdiscovery.general; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.FilenameFilter; +import java.io.IOException; +import java.util.Arrays; + +import org.apache.commons.exec.CommandLine; +import org.apache.commons.exec.DefaultExecuteResultHandler; +import org.apache.commons.exec.DefaultExecutor; +import org.apache.commons.exec.ExecuteWatchdog; +import org.apache.commons.exec.PumpStreamHandler; + +import fdiscovery.preprocessing.SVFileProcessor; +import gnu.trove.map.hash.THashMap; + +public class Benchmarker { + + protected static File[] getBenchmarkFilesWithPattern(File benchmarkDirectory) { + File[] benchmarkFiles = benchmarkDirectory.listFiles(new FilenameFilter() { + + 
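+ // Anonymous FilenameFilter: keep only files whose names match
+ // Miner.BENCHMARK_FILE_REGEX.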
@Override + public boolean accept(File dir, String name) { + return name.matches(Miner.BENCHMARK_FILE_REGEX); + } + }); + return benchmarkFiles; + } + + protected static final String getResultFileName(String inputDirectory, String miner) { + String[] splitInputDirectory = inputDirectory.split("\\" + File.separator); + if (splitInputDirectory.length >= 2) { + String staticComponent = splitInputDirectory[splitInputDirectory.length-1]; + String source = splitInputDirectory[splitInputDirectory.length-2]; + return String.format("%s%s-%s-%s.dat", Miner.RESULT_FILE_PATH, miner, staticComponent, source); + } + return new String(); + } + + protected static final void writeErrorCode(File resultFile, int exitCode) { + try { + BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(resultFile, true)); + if (exitCode == Miner.STATUS_OOT) { + resultFileWriter.write("#OOT"); + } else if (exitCode == Miner.STATUS_OOM) { + resultFileWriter.write("#OOM"); + } + resultFileWriter.close(); + } catch (IOException e) { + System.out.println("Couldn't write meta data."); + } + } + + protected static final void writeMetaData(File resultFile, THashMap cmdLine) { + StringBuilder metaDataLineBuilder = new StringBuilder(); + for (String optionKey : cmdLine.keySet()) { + if (cmdLine.get(optionKey) != null) { + metaDataLineBuilder.append(String.format("# %s :\t%s\n", optionKey, cmdLine.get(optionKey))); + System.out.print(String.format("# %s :\t%s\n", optionKey, cmdLine.get(optionKey))); + } else { + metaDataLineBuilder.append(String.format("# %s :\t%s\n", optionKey, "true")); + System.out.print(String.format("# %s :\t%s\n", optionKey, "true")); + } + } + metaDataLineBuilder.append("#Filename\t#Rows\t#Columns\tTime\t#Deps\t#<2Deps\t#<3Deps\t#<4Deps\t#<5Deps\t#<6Deps\t#>5Deps\t#Partitions\n"); + System.out.println("#Filename\t#Rows\t#Columns\tTime\t#Deps\t#<2Deps\t#<3Deps\t#<4Deps\t#<5Deps\t#<6Deps\t#>5Deps\t#Partitions\n"); + try { + BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(resultFile)); + resultFileWriter.write(metaDataLineBuilder.toString()); + resultFileWriter.close(); + } catch (IOException e) { + System.out.println("Couldn't write meta data."); + } + } + + public static void main(String[] args) { + CLIParserBenchmarker parser = new CLIParserBenchmarker(); + THashMap cmdLine = parser.parse(args); + String inputDirectoryName = new String(); + String miner = new String(); + char delimiter = '\t'; + String xmx = new String(); + int timeout = -1; + boolean allFiles = false; + + if (cmdLine.contains("input")) { + inputDirectoryName = cmdLine.get("input"); + } + if (cmdLine.contains("miner")) { + miner = cmdLine.get("miner"); + } + if (cmdLine.contains("delimiter")) { + delimiter = (cmdLine.get("delimiter")).charAt(0); + } + if (cmdLine.contains("xmx")) { + xmx = cmdLine.get("xmx"); + } + if (cmdLine.contains("timeout")) { + System.out.println(String.format("Timeout:%s", cmdLine.get("timeout"))); + timeout = Integer.valueOf(cmdLine.get("timeout")).intValue(); + } + if (cmdLine.containsKey("all")) { + System.out.println("Use all files."); + allFiles = true; + } + File executable = null; + if (miner.equals("tane")) { + executable = new File("tane.jar"); + } else if (miner.equals("fastfds")) { + executable = new File("fastfds.jar"); + } else if (miner.equals("dfd")) { + executable = new File("dfd.jar"); + } + else { + System.out.println(String.format("No valid miner:\t%s", miner)); + System.exit(1); + } + + File inputDirectory = new File(inputDirectoryName); + if 
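+ // Fail fast on a missing input directory before any per-file work starts.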
(!inputDirectory.exists()) { + System.out.println("Input directory doesn't exist."); + System.exit(1); + } + + File[] benchmarkFiles = new File[0]; + if (allFiles) { + benchmarkFiles = inputDirectory.listFiles(); + } else { + benchmarkFiles = getBenchmarkFilesWithPattern(inputDirectory); + } + Arrays.sort(benchmarkFiles); + + if (benchmarkFiles.length != 0) { + Miner.createColumDirectory(); + Miner.createResultDirectory(); + String resultFilename = getResultFileName(inputDirectory.getAbsolutePath(), miner); + File resultFile = new File(resultFilename); + writeMetaData(resultFile, cmdLine); + boolean errors = false; + for (File benchmarkFile : benchmarkFiles) { + if (!errors) { + try { + // create columns files and collect meta data + SVFileProcessor fileProcessor = new SVFileProcessor(benchmarkFile); + fileProcessor.init(delimiter); + fileProcessor.createColumnFiles(); + + // build command line with parameters + CommandLine processCmdLine = new CommandLine("java"); + processCmdLine.addArgument("-d64"); + processCmdLine.addArgument("-XX:GCTimeLimit=90"); + processCmdLine.addArgument("-XX:GCHeapFreeLimit=10"); + processCmdLine.addArgument("-XX:+UseSerialGC"); + processCmdLine.addArgument(String.format("-Xmx%s", xmx)); + processCmdLine.addArgument("-jar"); + processCmdLine.addArgument(executable.getName()); + processCmdLine.addArgument("-file"); + processCmdLine.addArgument(String.valueOf(benchmarkFile.getName())); + processCmdLine.addArgument("-columns"); + processCmdLine.addArgument(String.valueOf(fileProcessor.getNumberOfColumns())); + processCmdLine.addArgument("-rows"); + processCmdLine.addArgument(String.valueOf(fileProcessor.getNumberOfRows())); + processCmdLine.addArgument("-result"); + processCmdLine.addArgument(resultFile.getAbsolutePath()); + processCmdLine.addArgument("-input"); + processCmdLine.addArgument(fileProcessor.getColumnDirectoryName()); + + // build process with watchdog + DefaultExecutor executor = new DefaultExecutor(); + ExecuteWatchdog watchdog = new ExecuteWatchdog(timeout); + executor.setWatchdog(watchdog); + + // handle results + DefaultExecuteResultHandler resultHandler = new DefaultExecuteResultHandler(); + PumpStreamHandler streamHandler = new PumpStreamHandler(); + executor.setStreamHandler(streamHandler); + long timeStart = System.currentTimeMillis(); + executor.execute(processCmdLine, resultHandler); + resultHandler.waitFor(timeout); + + long timeEnd = System.currentTimeMillis(); + System.out.println(String.format("Time:%.1f", Double.valueOf((double)(timeEnd - timeStart) / 1000))); + + int exitCode = 0; + if (resultHandler.hasResult()) { + exitCode = resultHandler.getExitValue(); + } else { + exitCode = Miner.STATUS_OOT; + executor.getWatchdog().destroyProcess(); + } + + if (watchdog.killedProcess()) { + exitCode = Miner.STATUS_OOT; + executor.getWatchdog().destroyProcess(); + } else { + } + System.out.println(String.format("ExitCode %d", Integer.valueOf(exitCode))); + if (exitCode == Miner.STATUS_OK) { + + } else if (exitCode == Miner.STATUS_OOT || exitCode == Miner.STATUS_OOM) { + writeErrorCode(resultFile, exitCode); + errors = true; + } + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + } + + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java b/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java index 98af5aa..326adc3 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java +++ 
b/dfd/dfdAlgorithm/src/fdiscovery/general/ColumnFiles.java @@ -58,6 +58,6 @@ public boolean accept(File file) { } private final String getColumnFileName(final int columnIndex) { - return String.format(this.formatString, Integer.valueOf(columnIndex)); + return String.format(this.formatString, Integer.valueOf(columnIndex)); } } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java b/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java index dbf1c4e..7868f32 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/general/FunctionalDependencies.java @@ -140,11 +140,11 @@ public String toString() { StringBuilder outputBuilder = new StringBuilder(); for (ColumnCollection determining : this.keySet()) { - for (int dependentColumn : this.get(determining).getSetBits()) { - for (int determiningColumn : determining.getSetBits()) { - outputBuilder.append(String.format("c%04d\t", Integer.valueOf(determiningColumn))); + for (Integer dependentColumn : this.get(determining).getSetBits()) { + for (Integer determiningColumn : determining.getSetBits()) { + outputBuilder.append(String.format("c%04d\t", determiningColumn)); } - outputBuilder.append(String.format("->\tc%04d\n", Integer.valueOf(dependentColumn))); + outputBuilder.append(String.format("->\tc%04d\n", dependentColumn)); } } return outputBuilder.toString(); diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java index ecb4d79..3282ba9 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/Partition.java @@ -1,171 +1,171 @@ -package fdiscovery.partitions; - -import java.util.TreeSet; - -import fdiscovery.columns.ColumnCollection; -import fdiscovery.equivalence.TEquivalence; -import gnu.trove.iterator.TIntIterator; - -public abstract class Partition extends TreeSet implements Comparable { - - private static final long serialVersionUID = 174046028525977844L; - - protected static int[] probeTable; - protected ColumnCollection indices; - protected int numberOfRows; - protected double error; - protected double distinctiveness; -// protected long hashNumber; - - public Partition(int columnIndex, int numberOfColumns, int numberOfRows) { - this.indices = new ColumnCollection(numberOfColumns); - this.indices.set(columnIndex); - this.numberOfRows = numberOfRows; - this.error = -1; - this.distinctiveness = -1; - if (Partition.probeTable == null || Partition.probeTable.length != numberOfRows) { - Partition.probeTable = new int[numberOfRows+1]; - for (int i = 0; i < Partition.probeTable.length; i++) { - Partition.probeTable[i] = -1; - } - } - } - - public void init(int numberOfRows) { - if (Partition.probeTable.length != numberOfRows) { - Partition.probeTable = new int[numberOfRows+1]; - } - } - - public Partition(Partition base, Partition additional) { - this.indices = base.indices.orCopy(additional.indices); - this.error = -1; - this.numberOfRows = base.numberOfRows; - this.distinctiveness = -1; - if (Partition.probeTable == null) { - Partition.probeTable = new int[numberOfRows+1]; - for (int i = 0; i < Partition.probeTable.length; i++) { - Partition.probeTable[i] = -1; - } - } - - } - - private void resetProbeTable() { - for (int i = 0; i < Partition.probeTable.length; i++) { - Partition.probeTable[i] = -1; - } - } - - @Override - public int compareTo(Partition o) { - if (this.getDistinctiveness() == 
o.getDistinctiveness()) { - return this.indices.compareTo(o.indices); - } - return Double.valueOf(this.getDistinctiveness()).compareTo(Double.valueOf(o.getDistinctiveness())); - } - - public int getNumberOfRows() { - return this.numberOfRows; - } - - public ColumnCollection getIndices() { - return this.indices; - } - - protected double getDistinctiveness() { - if (this.distinctiveness == -1) { - double distinctiveness = (double)(this.numberOfRows - this.size())/this.numberOfRows; - this.distinctiveness = distinctiveness; - } - return this.distinctiveness; - } - - public static double estimateDistinctiveness(Partition a, Partition b) { - return a.getDistinctiveness() + b.getDistinctiveness() - a.getDistinctiveness() * b.getDistinctiveness(); - } - - protected double getError() { - if (this.error == -1) { - int cumulatedEqClassSizes = 0; - for (TEquivalence equivalenceGroup : this) { - cumulatedEqClassSizes += equivalenceGroup.size(); - } - double error = (double)(cumulatedEqClassSizes - this.size())/this.numberOfRows; - this.error = error; - } - return this.error; - } - - public static boolean representsFD(Partition base, Partition baseMergedWithRHS) { - if (base.getError() == baseMergedWithRHS.getError()) { - return true; - } - return false; - } - - public boolean isUnique() { - return this.size() == 0; - } - - public boolean equals(Partition other) { - int numberOfValues = 0; - int groupIndex = 0; - for (TEquivalence equivalenceGroup : this) { - for (TIntIterator equivalenceGroupIt = equivalenceGroup.iterator(); equivalenceGroupIt.hasNext(); ) { - Partition.probeTable[equivalenceGroupIt.next()] = groupIndex; - numberOfValues++; - } - groupIndex++; - } - for (TEquivalence equivalenceGroup : other) { - groupIndex = -2; - for (TIntIterator equivalenceGroupIt = equivalenceGroup.iterator(); equivalenceGroupIt.hasNext(); ) { - int currentGroupIndex = Partition.probeTable[equivalenceGroupIt.next()]; - if (groupIndex == -2 || currentGroupIndex == groupIndex) { - groupIndex = currentGroupIndex; - } else { - resetProbeTable(); - return false; - } - numberOfValues--; - } - } - resetProbeTable(); - if (numberOfValues == 0) { - return true; - } - return false; - } - - public String printIndices() { - StringBuilder outputBuilder = new StringBuilder(this.indices.size()); - - for (int i=0; i < this.indices.size(); i++) { - if (this.indices.get(i)) { - outputBuilder.append("1"); - } else { - outputBuilder.append("0"); - } - } - return outputBuilder.toString(); - } - - @Override - public String toString() { - StringBuilder outputBuilder = new StringBuilder(); - outputBuilder.append(String.format("[%s]{", this.indices)); - - for(TEquivalence equivalenceGroup : this) { - outputBuilder.append("{"); - for (TIntIterator valueIt=equivalenceGroup.iterator(); valueIt.hasNext(); ) { - outputBuilder.append(valueIt.next()); - outputBuilder.append(","); - } - outputBuilder.append("}"); - } - outputBuilder.append("}"); - - return outputBuilder.toString(); - } -} +package fdiscovery.partitions; + +import java.util.TreeSet; + +import fdiscovery.columns.ColumnCollection; +import fdiscovery.equivalence.TEquivalence; +import gnu.trove.iterator.TIntIterator; + +public abstract class Partition extends TreeSet implements Comparable { + + private static final long serialVersionUID = 174046028525977844L; + + protected static int[] probeTable; + protected ColumnCollection indices; + protected int numberOfRows; + protected double error; + protected double distinctiveness; +// protected long hashNumber; + + public Partition(int 
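+ // Single-column partition. The shared probe table (numberOfRows + 1 slots,
+ // initialised to -1) is (re)allocated when missing or sized for a different
+ // row count; equals() below relies on it being reset to -1 between uses.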
columnIndex, int numberOfColumns, int numberOfRows) { + this.indices = new ColumnCollection(numberOfColumns); + this.indices.set(columnIndex); + this.numberOfRows = numberOfRows; + this.error = -1; + this.distinctiveness = -1; + if (Partition.probeTable == null || Partition.probeTable.length != numberOfRows) { + Partition.probeTable = new int[numberOfRows+1]; + for (int i = 0; i < Partition.probeTable.length; i++) { + Partition.probeTable[i] = -1; + } + } + } + + public void init(int numberOfRows) { + if (Partition.probeTable.length != numberOfRows) { + Partition.probeTable = new int[numberOfRows+1]; + } + } + + public Partition(Partition base, Partition additional) { + this.indices = base.indices.orCopy(additional.indices); + this.error = -1; + this.numberOfRows = base.numberOfRows; + this.distinctiveness = -1; + if (Partition.probeTable == null) { + Partition.probeTable = new int[numberOfRows+1]; + for (int i = 0; i < Partition.probeTable.length; i++) { + Partition.probeTable[i] = -1; + } + } + + } + + private void resetProbeTable() { + for (int i = 0; i < Partition.probeTable.length; i++) { + Partition.probeTable[i] = -1; + } + } + + @Override + public int compareTo(Partition o) { + if (this.getDistinctiveness() == o.getDistinctiveness()) { + return this.indices.compareTo(o.indices); + } + return Double.valueOf(this.getDistinctiveness()).compareTo(Double.valueOf(o.getDistinctiveness())); + } + + public int getNumberOfRows() { + return this.numberOfRows; + } + + public ColumnCollection getIndices() { + return this.indices; + } + + protected double getDistinctiveness() { + if (this.distinctiveness == -1) { + double distinctiveness = (double)(this.numberOfRows - this.size())/this.numberOfRows; + this.distinctiveness = distinctiveness; + } + return this.distinctiveness; + } + + public static double estimateDistinctiveness(Partition a, Partition b) { + return a.getDistinctiveness() + b.getDistinctiveness() - a.getDistinctiveness() * b.getDistinctiveness(); + } + + protected double getError() { + if (this.error == -1) { + int cumulatedEqClassSizes = 0; + for (TEquivalence equivalenceGroup : this) { + cumulatedEqClassSizes += equivalenceGroup.size(); + } + double error = (double)(cumulatedEqClassSizes - this.size())/this.numberOfRows; + this.error = error; + } + return this.error; + } + + public static boolean representsFD(Partition base, Partition baseMergedWithRHS) { + if (base.getError() == baseMergedWithRHS.getError()) { + return true; + } + return false; + } + + public boolean isUnique() { + return this.size() == 0; + } + + public boolean equals(Partition other) { + int numberOfValues = 0; + int groupIndex = 0; + for (TEquivalence equivalenceGroup : this) { + for (TIntIterator equivalenceGroupIt = equivalenceGroup.iterator(); equivalenceGroupIt.hasNext(); ) { + Partition.probeTable[equivalenceGroupIt.next()] = groupIndex; + numberOfValues++; + } + groupIndex++; + } + for (TEquivalence equivalenceGroup : other) { + groupIndex = -2; + for (TIntIterator equivalenceGroupIt = equivalenceGroup.iterator(); equivalenceGroupIt.hasNext(); ) { + int currentGroupIndex = Partition.probeTable[equivalenceGroupIt.next()]; + if (groupIndex == -2 || currentGroupIndex == groupIndex) { + groupIndex = currentGroupIndex; + } else { + resetProbeTable(); + return false; + } + numberOfValues--; + } + } + resetProbeTable(); + if (numberOfValues == 0) { + return true; + } + return false; + } + + public String printIndices() { + StringBuilder outputBuilder = new StringBuilder(this.indices.size()); + + for (int i=0; 
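+ // Renders the index set as a 0/1 mask. Note that BitSet.size() reports the
+ // capacity of the backing words, so the mask may run longer than the actual
+ // column count.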
i < this.indices.size(); i++) { + if (this.indices.get(i)) { + outputBuilder.append("1"); + } else { + outputBuilder.append("0"); + } + } + return outputBuilder.toString(); + } + + @Override + public String toString() { + StringBuilder outputBuilder = new StringBuilder(); + outputBuilder.append(String.format("[%s]{", this.indices)); + + for(TEquivalence equivalenceGroup : this) { + outputBuilder.append("{"); + for (TIntIterator valueIt=equivalenceGroup.iterator(); valueIt.hasNext(); ) { + outputBuilder.append(valueIt.next()); + outputBuilder.append(","); + } + outputBuilder.append("}"); + } + outputBuilder.append("}"); + + return outputBuilder.toString(); + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java index a3462fe..93bb615 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/PartitionStatistics.java @@ -1,40 +1,40 @@ -package fdiscovery.partitions; - -import java.util.ArrayList; - -import fdiscovery.columns.ColumnCollection; -import gnu.trove.iterator.TIntObjectIterator; -import gnu.trove.iterator.TLongObjectIterator; -import gnu.trove.map.hash.TIntObjectHashMap; -import gnu.trove.map.hash.TLongObjectHashMap; -import gnu.trove.map.hash.TObjectIntHashMap; - -public class PartitionStatistics extends TObjectIntHashMap { - - public String getStatistics() { - TLongObjectHashMap>> statsAndCountsByLevel = new TLongObjectHashMap<>(); - for (ColumnCollection partitionKey : this.keySet()) { - long keyCardinality = partitionKey.cardinality(); - int usageCount = this.get(partitionKey); - statsAndCountsByLevel.putIfAbsent(keyCardinality, new TIntObjectHashMap>()); - statsAndCountsByLevel.get(keyCardinality).putIfAbsent(usageCount, new ArrayList()); - statsAndCountsByLevel.get(keyCardinality).get(usageCount).add(partitionKey); - } - StringBuilder statisticsBuilder = new StringBuilder(); - statisticsBuilder.append("Statistics:\n"); - for (TLongObjectIterator>> statsByLevelIt = statsAndCountsByLevel.iterator(); statsByLevelIt.hasNext(); ) { - statsByLevelIt.advance(); - long levelCardinality = statsByLevelIt.key(); - statisticsBuilder.append(String.format("%d attributes {\n", Long.valueOf(levelCardinality))); - for (TIntObjectIterator> countByLevelIt = statsByLevelIt.value().iterator(); countByLevelIt.hasNext(); ) { - countByLevelIt.advance(); - int usageCount = countByLevelIt.key(); - int numberOfElements = countByLevelIt.value().size(); - statisticsBuilder.append(String.format("\t%d elements used %d times\n", Integer.valueOf(numberOfElements), Integer.valueOf(usageCount))); - } - statisticsBuilder.append("}\n"); - } - - return statisticsBuilder.toString(); - } -} +package fdiscovery.partitions; + +import java.util.ArrayList; + +import fdiscovery.columns.ColumnCollection; +import gnu.trove.iterator.TIntObjectIterator; +import gnu.trove.iterator.TLongObjectIterator; +import gnu.trove.map.hash.TIntObjectHashMap; +import gnu.trove.map.hash.TLongObjectHashMap; +import gnu.trove.map.hash.TObjectIntHashMap; + +public class PartitionStatistics extends TObjectIntHashMap { + + public String getStatistics() { + TLongObjectHashMap>> statsAndCountsByLevel = new TLongObjectHashMap<>(); + for (ColumnCollection partitionKey : this.keySet()) { + long keyCardinality = partitionKey.cardinality(); + int usageCount = this.get(partitionKey); + statsAndCountsByLevel.putIfAbsent(keyCardinality, new TIntObjectHashMap>()); + 
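+ // Two-level grouping: key cardinality (lattice level) -> usage count ->
+ // list of partition keys; putIfAbsent makes sure each bucket is created
+ // exactly once before it is filled.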
statsAndCountsByLevel.get(keyCardinality).putIfAbsent(usageCount, new ArrayList()); + statsAndCountsByLevel.get(keyCardinality).get(usageCount).add(partitionKey); + } + StringBuilder statisticsBuilder = new StringBuilder(); + statisticsBuilder.append("Statistics:\n"); + for (TLongObjectIterator>> statsByLevelIt = statsAndCountsByLevel.iterator(); statsByLevelIt.hasNext(); ) { + statsByLevelIt.advance(); + long levelCardinality = statsByLevelIt.key(); + statisticsBuilder.append(String.format("%d attributes {\n", Long.valueOf(levelCardinality))); + for (TIntObjectIterator> countByLevelIt = statsByLevelIt.value().iterator(); countByLevelIt.hasNext(); ) { + countByLevelIt.advance(); + int usageCount = countByLevelIt.key(); + int numberOfElements = countByLevelIt.value().size(); + statisticsBuilder.append(String.format("\t%d elements used %d times\n", Integer.valueOf(numberOfElements), Integer.valueOf(usageCount))); + } + statisticsBuilder.append("}\n"); + } + + return statisticsBuilder.toString(); + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java b/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java index 8c527a5..f812e92 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/partitions/StrippedPartition.java @@ -1,78 +1,78 @@ -package fdiscovery.partitions; - -import fdiscovery.equivalence.EquivalenceGroupTIntHashSet; -import fdiscovery.equivalence.TEquivalence; -import gnu.trove.iterator.TIntIterator; -import gnu.trove.map.hash.TObjectIntHashMap; - -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.TreeSet; - -public class StrippedPartition extends TreeSet { - - private static final long serialVersionUID = -10500424753490842L; - - // constructor for TANEs strippedProduct - public StrippedPartition() { - - } - - @SuppressWarnings("unused") - public StrippedPartition(StrippedPartition base, StrippedPartition additional) { - - } - - public StrippedPartition(String[] columnContent) { - TObjectIntHashMap valueToIndex = new TObjectIntHashMap<>(); - LinkedHashMap helpMap = new LinkedHashMap<>(); - - for (int rowIndex = 0; rowIndex < columnContent.length; rowIndex++) { - String value = columnContent[rowIndex]; - // if the value wasn't there yet, the row index becomes the representative - // for that equivalence class - if (!valueToIndex.containsKey(value)) { - valueToIndex.put(value, rowIndex); - TEquivalence equivalenceGroup = new EquivalenceGroupTIntHashSet(); - equivalenceGroup.add(rowIndex); - helpMap.put(Integer.valueOf(rowIndex), equivalenceGroup); - } - // otherwise find the right equivalence class and add the current element index - else { - int equivalenceGroupIndex = valueToIndex.get(value); - TEquivalence equivalenceClass = helpMap.get(Integer.valueOf(equivalenceGroupIndex)); - equivalenceClass.add(rowIndex); - } - } - // remove equivalence classes with only one element - for(Iterator> it=helpMap.entrySet().iterator(); it.hasNext();) { - Map.Entry entry = it.next(); - if (entry.getValue().size() <= 1) { - it.remove(); - } - } - - // sort the stripped partition by equivalence group sizes - this.addAll(helpMap.values()); - } - - @Override - public String toString() { - StringBuilder outputBuilder = new StringBuilder(); - outputBuilder.append("{"); - - for(TEquivalence entry : this) { - outputBuilder.append("{"); - for (TIntIterator valueIt=entry.iterator(); valueIt.hasNext(); ) { -// for (TIntIteratorInteger value : entry) { - 
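// The StrippedPartition(String[]) constructor above groups row indices by
// column value and then removes all single-row groups, since a row that is
// unique under X can never violate an FD X -> A. The same construction as a
// compact JDK-only sketch:
static java.util.List<int[]> strippedPartition(String[] column) {
	java.util.LinkedHashMap<String, java.util.List<Integer>> groups = new java.util.LinkedHashMap<>();
	for (int row = 0; row < column.length; row++) {
		groups.computeIfAbsent(column[row], value -> new java.util.ArrayList<>()).add(row);
	}
	java.util.List<int[]> stripped = new java.util.ArrayList<>();
	for (java.util.List<Integer> group : groups.values()) {
		if (group.size() > 1) { // strip equivalence classes of size one
			stripped.add(group.stream().mapToInt(Integer::intValue).toArray());
		}
	}
	return stripped;
}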
outputBuilder.append(valueIt.next()); - outputBuilder.append(","); - } - outputBuilder.append("}"); - } - outputBuilder.append("}"); - - return outputBuilder.toString(); - } -} +package fdiscovery.partitions; + +import fdiscovery.equivalence.EquivalenceGroupTIntHashSet; +import fdiscovery.equivalence.TEquivalence; +import gnu.trove.iterator.TIntIterator; +import gnu.trove.map.hash.TObjectIntHashMap; + +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.TreeSet; + +public class StrippedPartition extends TreeSet { + + private static final long serialVersionUID = -10500424753490842L; + + // constructor for TANEs strippedProduct + public StrippedPartition() { + + } + + @SuppressWarnings("unused") + public StrippedPartition(StrippedPartition base, StrippedPartition additional) { + + } + + public StrippedPartition(String[] columnContent) { + TObjectIntHashMap valueToIndex = new TObjectIntHashMap<>(); + LinkedHashMap helpMap = new LinkedHashMap<>(); + + for (int rowIndex = 0; rowIndex < columnContent.length; rowIndex++) { + String value = columnContent[rowIndex]; + // if the value wasn't there yet, the row index becomes the representative + // for that equivalence class + if (!valueToIndex.containsKey(value)) { + valueToIndex.put(value, rowIndex); + TEquivalence equivalenceGroup = new EquivalenceGroupTIntHashSet(); + equivalenceGroup.add(rowIndex); + helpMap.put(Integer.valueOf(rowIndex), equivalenceGroup); + } + // otherwise find the right equivalence class and add the current element index + else { + int equivalenceGroupIndex = valueToIndex.get(value); + TEquivalence equivalenceClass = helpMap.get(Integer.valueOf(equivalenceGroupIndex)); + equivalenceClass.add(rowIndex); + } + } + // remove equivalence classes with only one element + for(Iterator> it=helpMap.entrySet().iterator(); it.hasNext();) { + Map.Entry entry = it.next(); + if (entry.getValue().size() <= 1) { + it.remove(); + } + } + + // sort the stripped partition by equivalence group sizes + this.addAll(helpMap.values()); + } + + @Override + public String toString() { + StringBuilder outputBuilder = new StringBuilder(); + outputBuilder.append("{"); + + for(TEquivalence entry : this) { + outputBuilder.append("{"); + for (TIntIterator valueIt=entry.iterator(); valueIt.hasNext(); ) { +// for (TIntIteratorInteger value : entry) { + outputBuilder.append(valueIt.next()); + outputBuilder.append(","); + } + outputBuilder.append("}"); + } + outputBuilder.append("}"); + + return outputBuilder.toString(); + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java index d354e37..205b4d0 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/Observations.java @@ -24,7 +24,7 @@ public THashSet getUncheckedMaximalSubsets(ColumnCollection lh THashSet uncheckedMaximalSubsets = new THashSet<>(); // if (lhs.cardinality() > 2) { - for (int columnIndex : order.getOrderHighDistinctCount(lhs)) { + for (int columnIndex : order.getOrderHighDistinctCount(lhs)) { ColumnCollection subsetIndices = lhs.removeColumnCopy(columnIndex); if (!this.containsKey(subsetIndices)) { uncheckedMaximalSubsets.add(subsetIndices); @@ -39,7 +39,7 @@ public THashSet getUncheckedOrCandidateMaximalSubsets(ColumnCo // we only want to check subsets with at least 2 columns if (lhs.cardinality() > 2) { - for (int columnIndex : order.getOrderHighDistinctCount(lhs)) { + for (int columnIndex : 
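// The Observations helpers in the hunks above all enumerate the maximal
// proper subsets of an LHS the same way: visit the set columns in the order
// given by ColumnOrder and clear one at a time, optionally filtering out
// subsets that were already observed. The core enumeration, sketched with
// java.util.BitSet in place of ColumnCollection:
static java.util.List<java.util.BitSet> maximalProperSubsets(java.util.BitSet lhs) {
	java.util.List<java.util.BitSet> subsets = new java.util.ArrayList<>();
	for (int column = lhs.nextSetBit(0); column >= 0; column = lhs.nextSetBit(column + 1)) {
		java.util.BitSet subset = (java.util.BitSet) lhs.clone();
		subset.clear(column); // dropping exactly one column yields a maximal proper subset
		subsets.add(subset);
	}
	return subsets;
}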
order.getOrderHighDistinctCount(lhs)) { ColumnCollection subsetIndices = lhs.removeColumnCopy(columnIndex); if (!this.containsKey(subsetIndices) || this.get(subsetIndices) == Observation.CANDIDATE_MINIMAL_DEPENDENCY) { uncheckedMaximalSubsets.add(subsetIndices); @@ -54,7 +54,7 @@ public THashSet getMaximalSubsets(ColumnCollection lhs, Column // we only want to check subsets with at least 2 columns if (lhs.cardinality() > 2) { - for (int columnIndex : order.getOrderHighDistinctCount(lhs)) { + for (int columnIndex : order.getOrderHighDistinctCount(lhs)) { ColumnCollection subsetIndices = lhs.removeColumnCopy(columnIndex); uncheckedMaximalSubsets.add(subsetIndices); } diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java index 1a11432..25f43d0 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneHashSet.java @@ -1,70 +1,70 @@ -package fdiscovery.pruning; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; - -import fdiscovery.columns.ColumnCollection; - -public class PruneHashSet extends HashMap> implements PruneInterface { - - private static final long serialVersionUID = 8012444410589325434L; - - public PruneHashSet(int numberOfColumns) { - super(numberOfColumns); - ColumnCollection key = new ColumnCollection(numberOfColumns); - for (int columnIndex = 0; columnIndex < numberOfColumns; columnIndex++) { - this.put(key.setCopy(columnIndex), new HashSet()); - } - } - - public static ColumnCollection getNotPrunedKey(Dependencies dependencies, NonDependencies nonDependencies, ArrayList candidates) { - for (ColumnCollection candidate : candidates) { - if (!dependencies.isRepresented(candidate) && !nonDependencies.isRepresented(candidate)) { - return candidate; - } - } - return null; - } - - @Override - public void rebalance() { - boolean rebalancedGroup = false; - - do { - rebalancedGroup = false; - ArrayList groupKeys = new ArrayList<>(this.keySet()); - for (ColumnCollection key : groupKeys) { - if (this.get(key).size() > SPLIT_THRESHOLD) { - rebalanceGroup(key); - rebalancedGroup = true; - } - } - } while (rebalancedGroup); - } - - @Override - public void rebalanceGroup(ColumnCollection groupKey) { - HashSet depsOfGroup = this.get(groupKey); - for (int columnIndex : groupKey.complementCopy().getSetBits()) { - ColumnCollection newKey = groupKey.setCopy(columnIndex); - HashSet newGroup = new HashSet(); - this.put(newKey, newGroup); - - for (ColumnCollection depOfGroup : depsOfGroup) { - // when splitting a group it cannot contain the key itself - // because otherwise the group cannot contain any other - // element since it would be a superset of the key and be pruned - // OR - // when splitting a group it cannot contain the key itself - // because otherwise all supersets of the key would have - // been pruned and it wouldn't need to be split - if (newKey.isSubsetOf(depOfGroup)) { - newGroup.add(depOfGroup); - } - } - } - // remove the old group - this.remove(groupKey); - } -} +package fdiscovery.pruning; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; + +import fdiscovery.columns.ColumnCollection; + +public class PruneHashSet extends HashMap> implements PruneInterface { + + private static final long serialVersionUID = 8012444410589325434L; + + public PruneHashSet(int numberOfColumns) { + super(numberOfColumns); + ColumnCollection key = new ColumnCollection(numberOfColumns); + for (int 
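// rebalanceGroup() above splits an oversized prune group: each column outside
// the old key yields a more specific key, a stored element migrates into
// every new group whose key it is a superset of, and the old group is
// dropped. A sketch of one split with BitSet keys, using bitwise AND for the
// subset test:
static void splitGroup(java.util.Map<java.util.BitSet, java.util.Set<java.util.BitSet>> groups,
		java.util.BitSet groupKey, int numberOfColumns) {
	java.util.Set<java.util.BitSet> members = groups.remove(groupKey);
	for (int column = 0; column < numberOfColumns; column++) {
		if (groupKey.get(column)) continue; // extend only by columns outside the key
		java.util.BitSet newKey = (java.util.BitSet) groupKey.clone();
		newKey.set(column);
		java.util.Set<java.util.BitSet> newGroup = new java.util.HashSet<>();
		for (java.util.BitSet member : members) {
			java.util.BitSet intersection = (java.util.BitSet) newKey.clone();
			intersection.and(member); // newKey subset of member iff newKey & member == newKey
			if (intersection.equals(newKey)) newGroup.add(member);
		}
		groups.put(newKey, newGroup);
	}
}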
columnIndex = 0; columnIndex < numberOfColumns; columnIndex++) { + this.put(key.setCopy(columnIndex), new HashSet()); + } + } + + public static ColumnCollection getNotPrunedKey(Dependencies dependencies, NonDependencies nonDependencies, ArrayList candidates) { + for (ColumnCollection candidate : candidates) { + if (!dependencies.isRepresented(candidate) && !nonDependencies.isRepresented(candidate)) { + return candidate; + } + } + return null; + } + + @Override + public void rebalance() { + boolean rebalancedGroup = false; + + do { + rebalancedGroup = false; + ArrayList groupKeys = new ArrayList<>(this.keySet()); + for (ColumnCollection key : groupKeys) { + if (this.get(key).size() > SPLIT_THRESHOLD) { + rebalanceGroup(key); + rebalancedGroup = true; + } + } + } while (rebalancedGroup); + } + + @Override + public void rebalanceGroup(ColumnCollection groupKey) { + HashSet depsOfGroup = this.get(groupKey); + for (int columnIndex : groupKey.complementCopy().getSetBits()) { + ColumnCollection newKey = groupKey.setCopy(columnIndex); + HashSet newGroup = new HashSet(); + this.put(newKey, newGroup); + + for (ColumnCollection depOfGroup : depsOfGroup) { + // when splitting a group it cannot contain the key itself + // because otherwise the group cannot contain any other + // element since it would be a superset of the key and be pruned + // OR + // when splitting a group it cannot contain the key itself + // because otherwise all supersets of the key would have + // been pruned and it wouldn't need to be split + if (newKey.isSubsetOf(depOfGroup)) { + newGroup.add(depOfGroup); + } + } + } + // remove the old group + this.remove(groupKey); + } +} diff --git a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java index cf045e2..f6fe76a 100755 --- a/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java +++ b/dfd/dfdAlgorithm/src/fdiscovery/pruning/PruneTable.java @@ -1,37 +1,37 @@ -package fdiscovery.pruning; - -import java.util.ArrayList; -import java.util.HashMap; - -import fdiscovery.columns.ColumnCollection; - -// from rhs to lhs -public abstract class PruneTable extends HashMap>> { - - private static final long serialVersionUID = 4470955427882698208L; - - public int getCount(ColumnCollection RHS) { - int count = 0; - if (this.containsKey(RHS)) { - for (ArrayList collection : this.get(RHS).values()) { - count += collection.size(); - } - } - return count; - } - - - public void addValue(ColumnCollection RHS, ColumnCollection LHS) { - if (!this.containsKey(RHS)) { - this.put(RHS, new HashMap>()); - } - if (!this.get(RHS).containsKey(Integer.valueOf(LHS.cardinality()))) { - this.get(RHS).put(Integer.valueOf(LHS.cardinality()), new ArrayList()); - } -// System.out.println(this.get(RHS)); -// System.out.println(String.format("Column:\t%s\t%d", LHS, LHS.cardinality())); - ArrayList dependencies = this.get(RHS).get(Integer.valueOf(LHS.cardinality())); -// System.out.println(dependencies); - dependencies.add(LHS); - } -} +package fdiscovery.pruning; + +import java.util.ArrayList; +import java.util.HashMap; + +import fdiscovery.columns.ColumnCollection; + +// from rhs to lhs +public abstract class PruneTable extends HashMap>> { + + private static final long serialVersionUID = 4470955427882698208L; + + public int getCount(ColumnCollection RHS) { + int count = 0; + if (this.containsKey(RHS)) { + for (ArrayList collection : this.get(RHS).values()) { + count += collection.size(); + } + } + return count; + } + + + public void addValue(ColumnCollection 
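// addValue() above maintains a two-level index from RHS to LHS cardinality to
// the list of LHS column sets, so pruning can walk candidates level by level.
// On JDK 8+ the same bookkeeping collapses into two computeIfAbsent calls;
// a sketch, again with BitSet standing in for ColumnCollection:
static void addValue(java.util.Map<java.util.BitSet, java.util.Map<Integer, java.util.List<java.util.BitSet>>> table,
		java.util.BitSet rhs, java.util.BitSet lhs) {
	table.computeIfAbsent(rhs, key -> new java.util.HashMap<>())
			.computeIfAbsent(lhs.cardinality(), size -> new java.util.ArrayList<>())
			.add(lhs);
}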
RHS, ColumnCollection LHS) {
+		if (!this.containsKey(RHS)) {
+			this.put(RHS, new HashMap<Integer, ArrayList<ColumnCollection>>());
+		}
+		if (!this.get(RHS).containsKey(Integer.valueOf(LHS.cardinality()))) {
+			this.get(RHS).put(Integer.valueOf(LHS.cardinality()), new ArrayList<ColumnCollection>());
+		}
+//		System.out.println(this.get(RHS));
+//		System.out.println(String.format("Column:\t%s\t%d", LHS, LHS.cardinality()));
+		ArrayList<ColumnCollection> dependencies = this.get(RHS).get(Integer.valueOf(LHS.cardinality()));
+//		System.out.println(dependencies);
+		dependencies.add(LHS);
+	}
+}
diff --git a/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java b/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java
index 0909653..5740f8b 100755
--- a/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java
+++ b/dfd/dfdAlgorithm/src/fdiscovery/tane/runner/Tane.java
@@ -1,431 +1,431 @@
-package fdiscovery.tane.runner;
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
-
-import org.apache.commons.cli.CommandLine;
-
-import fdiscovery.columns.ColumnCollection;
-
-import com.rits.cloning.Cloner;
-
-import fdiscovery.equivalence.EquivalenceGroupTIntHashSet;
-import fdiscovery.equivalence.TEquivalence;
-import fdiscovery.partitions.StrippedPartition;
-import fdiscovery.partitions.StrippedPartitions;
-import fdiscovery.preprocessing.SVFileProcessor;
-import fdiscovery.tane.AprioriGeneration;
-import fdiscovery.general.CLIParserMiner;
-import fdiscovery.general.CollectionSet;
-import fdiscovery.general.ColumnFiles;
-import fdiscovery.general.FunctionalDependencies;
-import fdiscovery.general.Miner;
-import gnu.trove.iterator.TIntIterator;
-import gnu.trove.map.hash.THashMap;
-
-public class Tane extends Miner {
-
-	private int numberOfColumns;
-	private int numberOfRows;
-	private int[] T, Te;
-	private FunctionalDependencies minimalDependencies;
-	private StrippedPartitions strippedPartitions;
-	private HashMap<ColumnCollection, ColumnCollection> cPlus;
-	private ArrayList<CollectionSet<ColumnCollection>> levels;
-	private ColumnCollection rSet;
-
-	public FunctionalDependencies getDependencies() {
-		return this.minimalDependencies;
-	}
-
-	@SuppressWarnings("unused")
-	public static void main2(String[] args) {
-		createColumDirectory();
-		createResultDirectory();
-
-		File source = new File(Miner.input);
-		SVFileProcessor inputFileProcessor = null;
-		try {
-			long timeStart = System.currentTimeMillis();
-
-			inputFileProcessor = new SVFileProcessor(source);
-			inputFileProcessor.init();
-			System.out.println("TANE");
-			System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter());
-			System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns());
-			System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows());
-			inputFileProcessor.createColumnFiles();
-			Tane taneRunner = new Tane(inputFileProcessor);
-			taneRunner.run();
-
-			System.out.println(String.format("Number of dependencies:\t%d", Integer.valueOf(taneRunner.minimalDependencies.getCount())));;
-			long timeFindFDs = System.currentTimeMillis();
-			System.out.println("Total time:\t" + (timeFindFDs - timeStart)/1000 + "s");
-			System.out.println(taneRunner.getDependencies());
-
-		} catch (FileNotFoundException e) {
-			System.out.println("The input file could not be found.");
-		} catch (IOException e) {
-			System.out.println("The input reader could not be reset.");
-		}
-	}
-
-	public static void main(String[] args) {
-		CLIParserMiner parser = new CLIParserMiner();
-		CommandLine cli = 
parser.parse(args); - String inputFilename = new String(); - String columnFileDirectory = new String(); - String resultFile = new String(); - int numberOfColumns = 0; - int numberOfRows = 0; - - if (cli.hasOption("file")) { - inputFilename = cli.getOptionValue("file"); - } - if (cli.hasOption("input")) { - columnFileDirectory = cli.getOptionValue("input"); - } - if (cli.hasOption("result")) { - resultFile = cli.getOptionValue("result"); - } - if (cli.hasOption("columns")) { - numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue(); - } - if (cli.hasOption("rows")) { - numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue(); - } - ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows); - long timeStart = System.currentTimeMillis(); - try { - Tane runner = new Tane(columnFiles, numberOfRows); - runner.run(); - long timeEnd = System.currentTimeMillis(); - runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename); - } catch(OutOfMemoryError e) { - System.exit(Miner.STATUS_OOM); - } - System.exit(0); - } - - private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { - String timeString = (time != -1)? String.format("%.1f", Double.valueOf((double)(time) / 1000)) : "-1"; - - StringBuilder outputBuilder = new StringBuilder(); - if (!inputFileName.isEmpty()) { - outputBuilder.append(String.format("%s\t", inputFileName)); - } - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns))); - outputBuilder.append(String.format("%s\t", timeString)); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount()))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5)))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.strippedPartitions.size()))); - outputBuilder.append(String.format("%d\t", Integer.valueOf(this.strippedPartitions.size()))); - outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory()))); - outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false))); - - try { - BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true)); - resultFileWriter.write(outputBuilder.toString()); - System.out.print(outputBuilder.toString()); - resultFileWriter.close(); - } catch (IOException e) { - System.out.println("Couldn't write output."); - } - } - - public Tane(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError { - this.numberOfColumns = columnFiles.getNumberOfColumns(); - this.numberOfRows = numberOfRows; - this.minimalDependencies = new FunctionalDependencies(); - this.strippedPartitions = new 
StrippedPartitions(columnFiles); - columnFiles.clear(); - } - - - public Tane(SVFileProcessor table) throws OutOfMemoryError { - this.numberOfColumns = table.getNumberOfColumns(); - this.numberOfRows = table.getNumberOfRows(); - this.minimalDependencies = new FunctionalDependencies(); - this.strippedPartitions = new StrippedPartitions(table.getColumnFiles()); - } - - public THashMap run() throws OutOfMemoryError { - - levels = new ArrayList<>(); - cPlus = new HashMap<>(); - - // Level 0 is the empty set - levels.add(new CollectionSet()); - // Level 1 initialization - levels.add(new CollectionSet()); - - ColumnCollection emptyLHSSet = new ColumnCollection(this.numberOfColumns); - rSet = new ColumnCollection(this.numberOfColumns); - - cPlus.put(emptyLHSSet, rSet); - - this.T = new int[this.numberOfRows + 1]; - this.Te = new int[this.numberOfRows + 1]; - // initialize T to all -1, because it is specified to be all "NULL" - // (!=0) in TANE - for (int i = 0; i < T.length; i++) { - T[i] = -1; - } - - // Initialization - for (int i = 0; i < this.numberOfColumns; i++) { - // set all bits in R - rSet.set(i); - // build atomic attribute-sets - ColumnCollection subset = new ColumnCollection(this.numberOfColumns); - subset.set(i); - // add to first level - levels.get(1).add(subset); - } - - // main algorithm - int level = 1; - while (!levels.get(level).isEmpty()) { -// System.out.println("Level:\t" + level); - this.computeDependencies(levels.get(level)); - this.prune(levels.get(level)); - levels.add(this.generateNextLevel(levels.get(level))); - levels.get(level).clear(); - level++; - } - return minimalDependencies; - } - - private CollectionSet generateNextLevel(CollectionSet currentLevel) { - CollectionSet nextLevel = new CollectionSet<>(); - - Cloner cloner = new Cloner(); - AprioriGeneration prefixBlockGenerator = new AprioriGeneration<>(cloner.deepClone(currentLevel)); - for (CollectionSet k : prefixBlockGenerator.prefixBlocks()) { - for (ColumnCollection y : k) { - for (ColumnCollection z : k.tailSet(y)) { - ColumnCollection x = y.orCopy(z); - boolean xInNextLevel = true; - for (int a : x.getSetBits()) { - x.clear(a); - if (!currentLevel.contains(x)) { - xInNextLevel = false; - break; - } - x.set(a); - } - if (xInNextLevel) { - nextLevel.add(x); - strippedPartitions.put(x, strippedProduct(strippedPartitions.get(y), strippedPartitions.get(z))); - } - } - } - } - - return nextLevel; - } - - private void computeDependencies(CollectionSet currentLevel) { - for (ColumnCollection x : currentLevel) { - addCPlusOfX(x); - } - - for (ColumnCollection x : currentLevel) { - for (int a : x.andCopy(cPlus.get(x)).getSetBits()) { - boolean isDependency = isValidDependency(x.clearCopy(a), Integer.valueOf(a)); - - if (isDependency) { - minimalDependencies.addRHSColumn(x.clearCopy(a), a); - cPlus.get(x).clear(a); - - for (int B : rSet.removeCopy(x).getSetBits()) { - cPlus.get(x).clear(B); - } - } - } - - } - } - - private ColumnCollection addCPlusOfX(ColumnCollection x) { - ColumnCollection cPlusOfX = cPlus.get(x.clearCopy(x.nextSetBit(0))); - - // if cPlusOfX was not in the list it has to be computed recursively - if (cPlusOfX == null) { - cPlusOfX = (ColumnCollection) addCPlusOfX(x.clearCopy(x.nextSetBit(0))).clone(); - } else { - cPlusOfX = (ColumnCollection) cPlusOfX.clone(); - } - for (int a : x.getSetBits()) { - ColumnCollection nextCPlusOfX = cPlus.get(x.clearCopy(a)); - - if (nextCPlusOfX == null) { - nextCPlusOfX = (ColumnCollection) addCPlusOfX(x.clearCopy(a)).clone(); - } else { - nextCPlusOfX = 
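// generateNextLevel() above is classic Apriori candidate generation: a union
// X = Y | Z of two sets from the same prefix block survives only if every
// subset of X that omits one attribute is still present in the current
// level. The membership test, using the same clear-then-restore trick as the
// code above, sketched with BitSets:
static boolean allOneSmallerSubsetsPresent(java.util.BitSet x, java.util.Set<java.util.BitSet> currentLevel) {
	for (int a = x.nextSetBit(0); a >= 0; a = x.nextSetBit(a + 1)) {
		x.clear(a); // temporarily drop attribute a
		boolean present = currentLevel.contains(x);
		x.set(a);   // restore x before probing the next attribute
		if (!present) {
			return false;
		}
	}
	return true;
}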
(ColumnCollection) nextCPlusOfX.clone(); - } - - cPlusOfX.and(nextCPlusOfX); - } - cPlus.put(x, cPlusOfX); - - return cPlusOfX; - } - - private void prune(CollectionSet currentLevel) { - Iterator currentLevelIterator = currentLevel.iterator(); - - while (currentLevelIterator.hasNext()) { - ColumnCollection x = currentLevelIterator.next(); - - ColumnCollection cPlusOfX = cPlus.get(x); - if (cPlusOfX == null) { - cPlusOfX = addCPlusOfX(x); - } - - if (cPlusOfX.isEmpty()) { - currentLevelIterator.remove(); - continue; - } - - boolean isSuperKey = isSuperKey(x); - if (isSuperKey) { - for (int a : cPlus.get(x).removeCopy(x).getSetBits()) { - ColumnCollection firstCPlusCandidatesKey = x.setCopy(a).clearCopy(x.nextSetBit(0)); - ColumnCollection firstCPlusCandidates = cPlus.get(firstCPlusCandidatesKey); - if (firstCPlusCandidates == null) { - firstCPlusCandidates = (ColumnCollection) addCPlusOfX(firstCPlusCandidatesKey).clone(); - } else { - firstCPlusCandidates = (ColumnCollection) firstCPlusCandidates.clone(); - } - for (int b : x.getSetBits()) { - - ColumnCollection nextCPlusCandidates = cPlus.get(x.setCopy(a).clearCopy(b)); - if (nextCPlusCandidates == null) { - nextCPlusCandidates = (ColumnCollection) addCPlusOfX(x.setCopy(a).clearCopy(b)).clone(); - } else { - nextCPlusCandidates = (ColumnCollection) nextCPlusCandidates.clone(); - } - - firstCPlusCandidates.and(nextCPlusCandidates); - } - if (firstCPlusCandidates.get(a)) { - minimalDependencies.addRHSColumn(x, a); - } - } - currentLevelIterator.remove(); - } - } - } - - protected boolean isSuperKey(ColumnCollection LHS) { - StrippedPartition partitionOfX = strippedPartitions.get(LHS); - - int sumOfSizesOfEquivalenceClasses = 0; - int numberOfEquivalenceClasses = 0; - - for (TEquivalence equivalenceGroup : partitionOfX) { - sumOfSizesOfEquivalenceClasses += equivalenceGroup.size(); - numberOfEquivalenceClasses++; - } - - // equation (1) in the paper - boolean result = (((sumOfSizesOfEquivalenceClasses - numberOfEquivalenceClasses) / (double) this.numberOfColumns) == 0); - - return result; - } - - private double error(StrippedPartition xPartition, StrippedPartition xUnionAPartition) { - int e = 0; - - for (TEquivalence equivalenceGroup : xUnionAPartition) { - Te[equivalenceGroup.getIdentifier()] = equivalenceGroup.size(); - } - for (TEquivalence equivalenceGroup : xPartition) { - int m = 1; - - for (TIntIterator tIt=equivalenceGroup.iterator(); tIt.hasNext(); ) { -// for (Integer t : equivalenceGroup) { - m = Math.max(m, Te[tIt.next()]); - } - e = e + equivalenceGroup.size() - m; - - } - for (TEquivalence equivalenceGroup : xUnionAPartition) { - Te[equivalenceGroup.getIdentifier()] = 0; - } - - return (double)e / this.numberOfRows; - } - - - private boolean isValidDependency(ColumnCollection LHS, Integer RHS) { - if (LHS.isEmpty()) { - return false; - } - - return (this.error(strippedPartitions.get(LHS), strippedPartitions.get(LHS.setCopy(RHS.intValue()))) == 0); - } - - public StrippedPartition strippedProduct(StrippedPartition yPartition, StrippedPartition zPartition) { - StrippedPartition xPartition = new StrippedPartition(); - HashMap S = new HashMap<>(); - - if (yPartition.size() > zPartition.size()) { - StrippedPartition swap = zPartition; - zPartition = yPartition; - yPartition = swap; - } - - // build some kind of probe table - int i = 1; - for (TEquivalence cI : yPartition) { - for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { - int tValue = tIt.next(); - T[tValue] = i; - - } - S.put(Integer.valueOf(i), new 
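// error() above is TANE's partition-error measure: for every equivalence
// class c of pi(X), subtract the size of c's largest fragment within
// pi(X u {A}); the FD X -> A is valid exactly when e == 0. A sketch over
// int[][] stripped partitions, assuming (as the code above does with
// getIdentifier()) that each class is identified by a row index it contains,
// here its first entry:
static double error(int[][] xPartition, int[][] xaPartition, int numberOfRows) {
	int[] te = new int[numberOfRows + 1]; // scratch table, initially all zeros
	for (int[] cls : xaPartition) te[cls[0]] = cls.length;
	int e = 0;
	for (int[] cls : xPartition) {
		int m = 1; // rows stripped from pi(X u {A}) count as singleton fragments
		for (int row : cls) m = Math.max(m, te[row]);
		e += cls.length - m;
	}
	for (int[] cls : xaPartition) te[cls[0]] = 0; // reset the scratch table
	return (double) e / numberOfRows;
}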
EquivalenceGroupTIntHashSet()); - i++; - } - - for (TEquivalence cI : zPartition) { - for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { - int tValue = tIt.next(); - if (T[tValue] != -1) { - TEquivalence sOld = S.get(Integer.valueOf(T[tValue])); - sOld.add(tValue); - } - } - for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { - int tValue = tIt.next(); - TEquivalence s = S.get(Integer.valueOf(T[tValue])); - if (s != null && s.size() > 1) { - xPartition.add(s); - } - S.put(Integer.valueOf(T[tValue]), new EquivalenceGroupTIntHashSet()); - } - } - i = 1; - for (TEquivalence cI : yPartition) { - for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { - int tValue = tIt.next(); - T[tValue] = -1; - } - } - - return xPartition; - } -} +package fdiscovery.tane.runner; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; + +import org.apache.commons.cli.CommandLine; + +import fdiscovery.columns.ColumnCollection; + +import com.rits.cloning.Cloner; + +import fdiscovery.equivalence.EquivalenceGroupTIntHashSet; +import fdiscovery.equivalence.TEquivalence; +import fdiscovery.partitions.StrippedPartition; +import fdiscovery.partitions.StrippedPartitions; +import fdiscovery.preprocessing.SVFileProcessor; +import fdiscovery.tane.AprioriGeneration; +import fdiscovery.general.CLIParserMiner; +import fdiscovery.general.CollectionSet; +import fdiscovery.general.ColumnFiles; +import fdiscovery.general.FunctionalDependencies; +import fdiscovery.general.Miner; +import gnu.trove.iterator.TIntIterator; +import gnu.trove.map.hash.THashMap; + +public class Tane extends Miner { + + private int numberOfColumns; + private int numberOfRows; + private int[] T, Te; + private FunctionalDependencies minimalDependencies; + private StrippedPartitions strippedPartitions; + private HashMap cPlus; + private ArrayList> levels; + private ColumnCollection rSet; + + public FunctionalDependencies getDependencies() { + return this.minimalDependencies; + } + + @SuppressWarnings("unused") + public static void main2(String[] args) { + createColumDirectory(); + createResultDirectory(); + + File source = new File(Miner.input); + SVFileProcessor inputFileProcessor = null; + try { + long timeStart = System.currentTimeMillis(); + + inputFileProcessor = new SVFileProcessor(source); + inputFileProcessor.init(); + System.out.println("TANE"); + System.out.println("Delimiter:\t" + inputFileProcessor.getDelimiter()); + System.out.println("Columns:\t" + inputFileProcessor.getNumberOfColumns()); + System.out.println("Rows:\t" + inputFileProcessor.getNumberOfRows()); + inputFileProcessor.createColumnFiles(); + Tane taneRunner = new Tane(inputFileProcessor); + taneRunner.run(); + + System.out.println(String.format("Number of dependencies:\t%d", Integer.valueOf(taneRunner.minimalDependencies.getCount())));; + long timeFindFDs = System.currentTimeMillis(); + System.out.println("Total time:\t" + (timeFindFDs - timeStart)/1000 + "s"); + System.out.println(taneRunner.getDependencies()); + + } catch (FileNotFoundException e) { + System.out.println("The input file could not be found."); + } catch (IOException e) { + System.out.println("The input reader could not be reset."); + } + } + + public static void main(String[] args) { + CLIParserMiner parser = new CLIParserMiner(); + CommandLine cli = parser.parse(args); + String inputFilename = new String(); + String 
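// The strippedProduct() routine above intersects two stripped partitions in
// linear time: rows are labelled with their class index in the first
// partition (probe table T), every class of the second partition is cut
// along those labels, and only fragments with at least two rows survive.
// (The original swaps first so the smaller partition is the one probed.)
// A JDK-only sketch of the same procedure:
static java.util.List<int[]> strippedProduct(java.util.List<int[]> y, java.util.List<int[]> z, int numberOfRows) {
	int[] t = new int[numberOfRows + 1];
	java.util.Arrays.fill(t, -1);
	java.util.Map<Integer, java.util.List<Integer>> s = new java.util.HashMap<>();
	int label = 1;
	for (int[] cls : y) {
		for (int row : cls) t[row] = label;
		s.put(label++, new java.util.ArrayList<>());
	}
	java.util.List<int[]> product = new java.util.ArrayList<>();
	for (int[] cls : z) {
		for (int row : cls) {
			if (t[row] != -1) s.get(t[row]).add(row); // collect fragments per label
		}
		for (int row : cls) {
			if (t[row] == -1) continue; // row absent from the y-side partition
			java.util.List<Integer> fragment = s.get(t[row]);
			if (fragment.size() > 1) {
				product.add(fragment.stream().mapToInt(Integer::intValue).toArray());
			}
			s.put(t[row], new java.util.ArrayList<>()); // reset bucket for the next class
		}
	}
	for (int[] cls : y) for (int row : cls) t[row] = -1; // clean the probe table
	return product;
}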
columnFileDirectory = new String(); + String resultFile = new String(); + int numberOfColumns = 0; + int numberOfRows = 0; + + if (cli.hasOption("file")) { + inputFilename = cli.getOptionValue("file"); + } + if (cli.hasOption("input")) { + columnFileDirectory = cli.getOptionValue("input"); + } + if (cli.hasOption("result")) { + resultFile = cli.getOptionValue("result"); + } + if (cli.hasOption("columns")) { + numberOfColumns = Integer.valueOf(cli.getOptionValue("columns")).intValue(); + } + if (cli.hasOption("rows")) { + numberOfRows = Integer.valueOf(cli.getOptionValue("rows")).intValue(); + } + ColumnFiles columnFiles = new ColumnFiles(new File(columnFileDirectory), numberOfColumns, numberOfRows); + long timeStart = System.currentTimeMillis(); + try { + Tane runner = new Tane(columnFiles, numberOfRows); + runner.run(); + long timeEnd = System.currentTimeMillis(); + runner.writeOutputSuccessful(resultFile, timeEnd - timeStart, inputFilename); + } catch(OutOfMemoryError e) { + System.exit(Miner.STATUS_OOM); + } + System.exit(0); + } + + private void writeOutputSuccessful(String outputFile, long time, String inputFileName) { + String timeString = (time != -1)? String.format("%.1f", Double.valueOf((double)(time) / 1000)) : "-1"; + + StringBuilder outputBuilder = new StringBuilder(); + if (!inputFileName.isEmpty()) { + outputBuilder.append(String.format("%s\t", inputFileName)); + } + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfRows))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.numberOfColumns))); + outputBuilder.append(String.format("%s\t", timeString)); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCount()))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(2)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(3)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(4)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(5)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeLesserThan(6)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.minimalDependencies.getCountForSizeGreaterThan(5)))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.strippedPartitions.size()))); + outputBuilder.append(String.format("%d\t", Integer.valueOf(this.strippedPartitions.size()))); + outputBuilder.append(String.format("%d\n", Long.valueOf(Runtime.getRuntime().totalMemory()))); + outputBuilder.append(String.format("#Memory: %s\n", Miner.humanReadableByteCount(Runtime.getRuntime().totalMemory(), false))); + + try { + BufferedWriter resultFileWriter = new BufferedWriter(new FileWriter(new File(outputFile), true)); + resultFileWriter.write(outputBuilder.toString()); + System.out.print(outputBuilder.toString()); + resultFileWriter.close(); + } catch (IOException e) { + System.out.println("Couldn't write output."); + } + } + + public Tane(ColumnFiles columnFiles, int numberOfRows) throws OutOfMemoryError { + this.numberOfColumns = columnFiles.getNumberOfColumns(); + this.numberOfRows = numberOfRows; + this.minimalDependencies = new FunctionalDependencies(); + this.strippedPartitions = new StrippedPartitions(columnFiles); + columnFiles.clear(); + } + + + public Tane(SVFileProcessor 
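// For reference, the result row written by writeOutputSuccessful() above is
// tab-separated: input file name (if given), #rows, #columns, runtime in
// seconds, total #minimal FDs, cumulative FD counts for LHS sizes below 2
// through 6, the count for LHS size above 5, two partition-count columns,
// and the JVM's total memory in bytes, followed by a human-readable
// "#Memory:" line. Both partition-count columns are filled with
// strippedPartitions.size() here, so for Tane they always coincide.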
table) throws OutOfMemoryError { + this.numberOfColumns = table.getNumberOfColumns(); + this.numberOfRows = table.getNumberOfRows(); + this.minimalDependencies = new FunctionalDependencies(); + this.strippedPartitions = new StrippedPartitions(table.getColumnFiles()); + } + + public THashMap run() throws OutOfMemoryError { + + levels = new ArrayList<>(); + cPlus = new HashMap<>(); + + // Level 0 is the empty set + levels.add(new CollectionSet()); + // Level 1 initialization + levels.add(new CollectionSet()); + + ColumnCollection emptyLHSSet = new ColumnCollection(this.numberOfColumns); + rSet = new ColumnCollection(this.numberOfColumns); + + cPlus.put(emptyLHSSet, rSet); + + this.T = new int[this.numberOfRows + 1]; + this.Te = new int[this.numberOfRows + 1]; + // initialize T to all -1, because it is specified to be all "NULL" + // (!=0) in TANE + for (int i = 0; i < T.length; i++) { + T[i] = -1; + } + + // Initialization + for (int i = 0; i < this.numberOfColumns; i++) { + // set all bits in R + rSet.set(i); + // build atomic attribute-sets + ColumnCollection subset = new ColumnCollection(this.numberOfColumns); + subset.set(i); + // add to first level + levels.get(1).add(subset); + } + + // main algorithm + int level = 1; + while (!levels.get(level).isEmpty()) { +// System.out.println("Level:\t" + level); + this.computeDependencies(levels.get(level)); + this.prune(levels.get(level)); + levels.add(this.generateNextLevel(levels.get(level))); + levels.get(level).clear(); + level++; + } + return minimalDependencies; + } + + private CollectionSet generateNextLevel(CollectionSet currentLevel) { + CollectionSet nextLevel = new CollectionSet<>(); + + Cloner cloner = new Cloner(); + AprioriGeneration prefixBlockGenerator = new AprioriGeneration<>(cloner.deepClone(currentLevel)); + for (CollectionSet k : prefixBlockGenerator.prefixBlocks()) { + for (ColumnCollection y : k) { + for (ColumnCollection z : k.tailSet(y)) { + ColumnCollection x = y.orCopy(z); + boolean xInNextLevel = true; + for (int a : x.getSetBits()) { + x.clear(a); + if (!currentLevel.contains(x)) { + xInNextLevel = false; + break; + } + x.set(a); + } + if (xInNextLevel) { + nextLevel.add(x); + strippedPartitions.put(x, strippedProduct(strippedPartitions.get(y), strippedPartitions.get(z))); + } + } + } + } + + return nextLevel; + } + + private void computeDependencies(CollectionSet currentLevel) { + for (ColumnCollection x : currentLevel) { + addCPlusOfX(x); + } + + for (ColumnCollection x : currentLevel) { + for (int a : x.andCopy(cPlus.get(x)).getSetBits()) { + boolean isDependency = isValidDependency(x.clearCopy(a), Integer.valueOf(a)); + + if (isDependency) { + minimalDependencies.addRHSColumn(x.clearCopy(a), a); + cPlus.get(x).clear(a); + + for (int B : rSet.removeCopy(x).getSetBits()) { + cPlus.get(x).clear(B); + } + } + } + + } + } + + private ColumnCollection addCPlusOfX(ColumnCollection x) { + ColumnCollection cPlusOfX = cPlus.get(x.clearCopy(x.nextSetBit(0))); + + // if cPlusOfX was not in the list it has to be computed recursively + if (cPlusOfX == null) { + cPlusOfX = (ColumnCollection) addCPlusOfX(x.clearCopy(x.nextSetBit(0))).clone(); + } else { + cPlusOfX = (ColumnCollection) cPlusOfX.clone(); + } + for (int a : x.getSetBits()) { + ColumnCollection nextCPlusOfX = cPlus.get(x.clearCopy(a)); + + if (nextCPlusOfX == null) { + nextCPlusOfX = (ColumnCollection) addCPlusOfX(x.clearCopy(a)).clone(); + } else { + nextCPlusOfX = (ColumnCollection) nextCPlusOfX.clone(); + } + + cPlusOfX.and(nextCPlusOfX); + } + cPlus.put(x, 
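// addCPlusOfX() above memoizes TANE's rhs-candidate sets via the recurrence
// C+(X) = intersection over all A in X of C+(X \ {A}), computing missing
// entries recursively and caching every result. The recurrence sketched with
// BitSets, assuming a non-empty x and a cache pre-seeded with C+({}) = R,
// just as run() seeds cPlus with the empty LHS:
static java.util.BitSet cPlus(java.util.BitSet x, java.util.Map<java.util.BitSet, java.util.BitSet> cache) {
	java.util.BitSet cached = cache.get(x);
	if (cached != null) return cached;
	java.util.BitSet result = null;
	for (int a = x.nextSetBit(0); a >= 0; a = x.nextSetBit(a + 1)) {
		java.util.BitSet subset = (java.util.BitSet) x.clone();
		subset.clear(a);
		java.util.BitSet candidates = (java.util.BitSet) cPlus(subset, cache).clone();
		if (result == null) result = candidates;
		else result.and(candidates);
	}
	cache.put(x, result);
	return result;
}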
cPlusOfX); + + return cPlusOfX; + } + + private void prune(CollectionSet currentLevel) { + Iterator currentLevelIterator = currentLevel.iterator(); + + while (currentLevelIterator.hasNext()) { + ColumnCollection x = currentLevelIterator.next(); + + ColumnCollection cPlusOfX = cPlus.get(x); + if (cPlusOfX == null) { + cPlusOfX = addCPlusOfX(x); + } + + if (cPlusOfX.isEmpty()) { + currentLevelIterator.remove(); + continue; + } + + boolean isSuperKey = isSuperKey(x); + if (isSuperKey) { + for (int a : cPlus.get(x).removeCopy(x).getSetBits()) { + ColumnCollection firstCPlusCandidatesKey = x.setCopy(a).clearCopy(x.nextSetBit(0)); + ColumnCollection firstCPlusCandidates = cPlus.get(firstCPlusCandidatesKey); + if (firstCPlusCandidates == null) { + firstCPlusCandidates = (ColumnCollection) addCPlusOfX(firstCPlusCandidatesKey).clone(); + } else { + firstCPlusCandidates = (ColumnCollection) firstCPlusCandidates.clone(); + } + for (int b : x.getSetBits()) { + + ColumnCollection nextCPlusCandidates = cPlus.get(x.setCopy(a).clearCopy(b)); + if (nextCPlusCandidates == null) { + nextCPlusCandidates = (ColumnCollection) addCPlusOfX(x.setCopy(a).clearCopy(b)).clone(); + } else { + nextCPlusCandidates = (ColumnCollection) nextCPlusCandidates.clone(); + } + + firstCPlusCandidates.and(nextCPlusCandidates); + } + if (firstCPlusCandidates.get(a)) { + minimalDependencies.addRHSColumn(x, a); + } + } + currentLevelIterator.remove(); + } + } + } + + protected boolean isSuperKey(ColumnCollection LHS) { + StrippedPartition partitionOfX = strippedPartitions.get(LHS); + + int sumOfSizesOfEquivalenceClasses = 0; + int numberOfEquivalenceClasses = 0; + + for (TEquivalence equivalenceGroup : partitionOfX) { + sumOfSizesOfEquivalenceClasses += equivalenceGroup.size(); + numberOfEquivalenceClasses++; + } + + // equation (1) in the paper + boolean result = (((sumOfSizesOfEquivalenceClasses - numberOfEquivalenceClasses) / (double) this.numberOfColumns) == 0); + + return result; + } + + private double error(StrippedPartition xPartition, StrippedPartition xUnionAPartition) { + int e = 0; + + for (TEquivalence equivalenceGroup : xUnionAPartition) { + Te[equivalenceGroup.getIdentifier()] = equivalenceGroup.size(); + } + for (TEquivalence equivalenceGroup : xPartition) { + int m = 1; + + for (TIntIterator tIt=equivalenceGroup.iterator(); tIt.hasNext(); ) { +// for (Integer t : equivalenceGroup) { + m = Math.max(m, Te[tIt.next()]); + } + e = e + equivalenceGroup.size() - m; + + } + for (TEquivalence equivalenceGroup : xUnionAPartition) { + Te[equivalenceGroup.getIdentifier()] = 0; + } + + return (double)e / this.numberOfRows; + } + + + private boolean isValidDependency(ColumnCollection LHS, Integer RHS) { + if (LHS.isEmpty()) { + return false; + } + + return (this.error(strippedPartitions.get(LHS), strippedPartitions.get(LHS.setCopy(RHS.intValue()))) == 0); + } + + public StrippedPartition strippedProduct(StrippedPartition yPartition, StrippedPartition zPartition) { + StrippedPartition xPartition = new StrippedPartition(); + HashMap S = new HashMap<>(); + + if (yPartition.size() > zPartition.size()) { + StrippedPartition swap = zPartition; + zPartition = yPartition; + yPartition = swap; + } + + // build some kind of probe table + int i = 1; + for (TEquivalence cI : yPartition) { + for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { + int tValue = tIt.next(); + T[tValue] = i; + + } + S.put(Integer.valueOf(i), new EquivalenceGroupTIntHashSet()); + i++; + } + + for (TEquivalence cI : zPartition) { + for (TIntIterator 
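// isSuperKey() above implements equation (1): X is a (super)key iff the sum
// of the equivalence-class sizes of its stripped partition equals the number
// of classes. Since stripped classes always hold at least two rows, that can
// only happen when no classes remain at all, so the divisor used in the code
// (numberOfColumns rather than the relation size) does not affect the
// comparison with zero. The test therefore reduces to:
static boolean isSuperKey(java.util.List<int[]> strippedPartition) {
	int sumOfSizes = 0;
	int numberOfClasses = 0;
	for (int[] cls : strippedPartition) {
		sumOfSizes += cls.length;
		numberOfClasses++;
	}
	return sumOfSizes - numberOfClasses == 0; // empty stripped partition <=> superkey
}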
tIt=cI.iterator(); tIt.hasNext(); ) { + int tValue = tIt.next(); + if (T[tValue] != -1) { + TEquivalence sOld = S.get(Integer.valueOf(T[tValue])); + sOld.add(tValue); + } + } + for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { + int tValue = tIt.next(); + TEquivalence s = S.get(Integer.valueOf(T[tValue])); + if (s != null && s.size() > 1) { + xPartition.add(s); + } + S.put(Integer.valueOf(T[tValue]), new EquivalenceGroupTIntHashSet()); + } + } + i = 1; + for (TEquivalence cI : yPartition) { + for (TIntIterator tIt=cI.iterator(); tIt.hasNext(); ) { + int tValue = tIt.next(); + T[tValue] = -1; + } + } + + return xPartition; + } +} diff --git a/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/DFDMetanome.java b/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java similarity index 99% rename from dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/DFDMetanome.java rename to dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java index 1c8fd82..e631ab7 100644 --- a/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/DFDMetanome.java +++ b/dfd/dfdMetanome/src/main/java/de/metanome/algorithms/dfd/dfdMetanome/DFDMetanome.java @@ -71,11 +71,11 @@ public void execute() throws AlgorithmExecutionException { DFDMiner dfdMiner = new DFDMiner(inputFileProcessor); dfdMiner.run(); FunctionalDependencies fds = dfdMiner.getDependencies(); - + RelationalInput input = fileInput.generateNewCopy(); String relationName = input.relationName(); List columnNames = input.columnNames(); - + for (ColumnCollection determining : fds.keySet()) { for (int dependentColumn : fds.get(determining).getSetBits()) { ColumnIdentifier[]