Skip to content

Commit

Permalink
feat: add capability to output Duplicate map from deduplicate command (
Browse files Browse the repository at this point in the history
  • Loading branch information
gabynevada authored Feb 14, 2024
1 parent 701678b commit 24b3495
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 11 deletions.
52 changes: 42 additions & 10 deletions src/Biomatch.CLI/Commands/MatchingCommand.Deduplicate.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ private static Command GetDeduplicateTemplateCommand()
var outputOption = new Option<FileInfo>(
name: "--output",
description: "Output file path",
getDefaultValue: () => new FileInfo("Duplicates.csv")
getDefaultValue: () => new FileInfo("Deduplicated.csv")
);
outputOption.AddAlias("-o");

Expand All @@ -34,6 +34,16 @@ private static Command GetDeduplicateTemplateCommand()
getDefaultValue: () => 0.85
);

var mapOption = new Option<bool>(name: "--map", description: "Generate duplicate map");
mapOption.AddAlias("-m");

var outputMapOption = new Option<FileInfo>(
name: "--output-map",
description: "File location for duplicate map",
getDefaultValue: () => new FileInfo("DuplicateMap.csv")
);
outputMapOption.AddAlias("-om");

var dictionaryOptions = GeneralOptions.GetDictionaryOptions();

var command = new Command("deduplicate", "Deduplicate records from a template file")
Expand All @@ -44,16 +54,20 @@ private static Command GetDeduplicateTemplateCommand()
dictionaryOptions.LastNamesDictionaryFilePathOption,
outputOption,
scoreOption,
mapOption,
outputMapOption
};

command.SetHandler(
(
async (
filePathArgumentValue,
firstNamesDictionaryFilePathOptionValue,
middleNamesDictionaryFilePathOptionValue,
lastNamesDictionaryFilePathOptionValue,
outputOptionValue,
scoreOptionValue
scoreOptionValue,
mapOptionValue,
outputMapOptionValue
) =>
{
var records1FromCsv = PersonRecordTemplate
Expand All @@ -67,23 +81,41 @@ private static Command GetDeduplicateTemplateCommand()
lastNamesDictionaryFilePathOptionValue
);

var deduplicatedRecords = Deduplicate.TryDeduplicate(
records1FromCsv,
var preprocessedRecords = records1FromCsv
.PreprocessData(firstNamesDictionary, middleNamesDictionary, lastNamesDictionary)
.ToArray();

var potentialDuplicates = Match.GetPotentialMatchesFromSameDataSet(
preprocessedRecords,
preprocessedRecords,
scoreOptionValue,
firstNamesDictionary,
middleNamesDictionary,
lastNamesDictionary,
1.0,
MatchingProgress.GetMatchingProgressReport
);

return PersonRecordTemplate.WriteToCsv(deduplicatedRecords, outputOptionValue.FullName);
var deduplicatedRecords = Deduplicate.TryDeduplicate(
preprocessedRecords,
potentialDuplicates
);

if (mapOptionValue)
{
await PotentialMatchTemplate.WriteToCsv(
potentialDuplicates,
outputMapOptionValue.FullName
);
}

await PersonRecordTemplate.WriteToCsv(deduplicatedRecords, outputOptionValue.FullName);
},
filePathArgument,
dictionaryOptions.FirstNamesDictionaryFilePathOption,
dictionaryOptions.MiddleNamesDictionaryFilePathOption,
dictionaryOptions.LastNamesDictionaryFilePathOption,
outputOption,
scoreOption
scoreOption,
mapOption,
outputMapOption
);

return command;
Expand Down
2 changes: 1 addition & 1 deletion src/Biomatch.Domain/Biomatch.Domain.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="libphonenumber-csharp" Version="8.13.29" />
<PackageReference Include="libphonenumber-csharp" Version="8.13.30" />
<PackageReference Include="Quickenshtein" Version="1.5.1"/>
<PackageReference Include="SymSpell" Version="6.7.2"/>
<PackageReference Include="System.Private.Uri" Version="4.3.2"/>
Expand Down
36 changes: 36 additions & 0 deletions src/Biomatch.Domain/Deduplicate.cs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,42 @@ public static IEnumerable<IPersonRecord> TryDeduplicate(
}
}

/// <summary>
/// Lazily produces a deduplicated sequence of records from a precomputed
/// collection of potential duplicate pairs.
/// </summary>
/// <remarks>
/// The potential duplicates are grouped by their <c>Value</c> record. For each group whose
/// record id is not yet present in the duplicates map, <c>MarkDuplicates</c> is invoked
/// (presumably it fills that map with the records considered duplicates of the group's
/// record - confirm against its implementation) and the group's record is emitted.
/// Afterwards, every original record that was neither emitted in the first pass nor
/// present in the duplicates map is emitted.
/// Because this is an iterator method, none of the work (including the console message)
/// happens until the returned sequence is enumerated.
/// </remarks>
/// <param name="originalRecords">All records of the data set being deduplicated.</param>
/// <param name="potentialDuplicates">
/// Pairs of records, exposed through <c>Value</c> and <c>Match</c>, that were flagged as
/// potential duplicates of each other.
/// </param>
/// <returns>The records that remain after skipping those marked as duplicates.</returns>
public static IEnumerable<IPersonRecord> TryDeduplicate(
  IEnumerable<PersonRecordForMatch> originalRecords,
  IEnumerable<PotentialMatch> potentialDuplicates
)
{
  // Lookup: record -> every record it was paired with as a potential duplicate.
  var matchesByRecord = potentialDuplicates
    .GroupBy(pair => pair.Value)
    .ToDictionary(
      grouping => grouping.Key,
      grouping => grouping.Select(pair => pair.Match).ToList()
    );

  var duplicates = new Dictionary<string, IPersonRecord>();
  var emittedRecords = new Dictionary<string, IPersonRecord>();
  Console.WriteLine("Processing potential matches...");

  // First pass: emit one record per group, unless an earlier group already
  // caused its id to land in the duplicates map.
  foreach (var entry in matchesByRecord)
  {
    var candidate = entry.Key;
    if (!duplicates.ContainsKey(candidate.RecordId))
    {
      MarkDuplicates(candidate, matchesByRecord, entry.Value, duplicates);
      emittedRecords.Add(candidate.RecordId, candidate);
      yield return candidate;
    }
  }

  // Second pass: emit the remaining originals that were neither emitted above
  // nor registered in the duplicates map.
  foreach (var record in originalRecords)
  {
    var recordId = record.RecordId;
    var alreadyHandled =
      emittedRecords.ContainsKey(recordId) || duplicates.ContainsKey(recordId);
    if (!alreadyHandled)
    {
      yield return record;
    }
  }
}

private static void MarkDuplicates(
IPersonRecord originalRecord,
Dictionary<IPersonRecord, List<IPersonRecord>> potentialDuplicates,
Expand Down

0 comments on commit 24b3495

Please sign in to comment.