Skip to content

Commit

Permalink
Revise date extractor and fix bugs (#637)
Browse files Browse the repository at this point in the history
* Fix "DayOfWeek-Month-Date" being wrongly recognized as a date period

- minor: support "MLK day" as holiday
- revise date extractor

* Fix "by year" being wrongly extracted as a date period.
  • Loading branch information
sanxing-chen authored and tellarin committed Jun 20, 2018
1 parent 7b9fdd6 commit 5185f08
Show file tree
Hide file tree
Showing 38 changed files with 396 additions and 74 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ public static class DateTimeDefinitions
public static readonly string WeekDayOfMonthRegex = $@"(?<wom>(the\s+)?(?<cardinal>first|1st|second|2nd|third|3rd|fourth|4th|fifth|5th|last)\s+{WeekDayRegex}\s+{MonthSuffixRegex})";
public static readonly string RelativeWeekDayRegex = $@"\b({WrittenNumRegex}\s+{WeekDayRegex}\s+(from\s+now|later))\b";
public static readonly string SpecialDate = $@"(?=\b(on|at)\s+the\s+){DayRegex}\b";
public static readonly string DateExtractor1 = $@"\b({WeekDayRegex}(\s+|\s*,\s*))?{MonthRegex}(\.)?\s*[/\\\.\-]?\s*{DayRegex}(\.)?\b";
public static readonly string DateExtractor2 = $@"\b({WeekDayRegex}(\s+|\s*,\s*))?{MonthRegex}(\.)?\s*[\.\-]?\s*{DayRegex}(\.)?(\s+|\s*,\s*|\s+of\s+){DateYearRegex}\b";
public static readonly string DateExtractor1 = $@"\b((this\s+)?{WeekDayRegex}\s*[,-]?\s*)?(({MonthRegex}(\.)?\s*[/\\\.\-]?\s*{DayRegex})|(\({MonthRegex}\s*[-.]\s*{DayRegex}\)))";
public static readonly string DateExtractor2 = $@"\b{DateExtractor1}(\s+|\s*,\s*|\s+of\s+){DateYearRegex}\b";
public static readonly string DateExtractor3 = $@"\b({WeekDayRegex}(\s+|\s*,\s*))?{DayRegex}(\.)?(\s+|\s*,\s*|\s+of\s+|\s*-\s*){MonthRegex}(\.)?((\s+|\s*,\s*){DateYearRegex})?\b";
public static readonly string DateExtractor4 = $@"\b{MonthNumRegex}\s*[/\\\-]\s*{DayRegex}(\.)?\s*[/\\\-]\s*{DateYearRegex}";
public static readonly string DateExtractor5 = $@"\b{DayRegex}\s*[/\\\-\.]\s*{MonthNumRegex}\s*[/\\\-\.]\s*{DateYearRegex}";
Expand All @@ -99,7 +99,7 @@ public static class DateTimeDefinitions
public static readonly string DateExtractorA = $@"\b{DateYearRegex}\s*[/\\\-\.]\s*{MonthNumRegex}\s*[/\\\-\.]\s*{DayRegex}";
public static readonly string OfMonth = $@"^\s*of\s*{MonthRegex}";
public static readonly string MonthEnd = $@"{MonthRegex}\s*(the)?\s*$";
public static readonly string WeekDayEnd = $@"{WeekDayRegex}\s*,?\s*$";
public static readonly string WeekDayEnd = $@"(this\s+)?{WeekDayRegex}\s*,?\s*$";
public const string RangeUnitRegex = @"\b(?<unit>years|year|months|month|weeks|week)\b";
public const string OclockRegex = @"(?<oclock>o\s*’\s*clock|o\s*‘\s*clock|o\s*'\s*clock|o\s*clock)";
public static readonly string DescRegex = $@"((({OclockRegex}\s+)?(?<desc>ampm|am\b|a\.m\.|a m\b|a\. m\.|a\.m\b|a\. m\b|a m\b|pm\b|p\.m\.|p m\b|p\. m\.|p\.m\b|p\. m\b|p\b|p m\b))|{OclockRegex})";
Expand Down Expand Up @@ -181,7 +181,7 @@ public static class DateTimeDefinitions
public const string ConjunctionRegex = @"\b((and(\s+for)?)|with)\b";
public static readonly string HolidayRegex1 = $@"\b(?<holiday>clean monday|good friday|ash wednesday|mardi gras|washington's birthday|mao's birthday|chinese new Year|new years' eve|new year's eve|new year 's eve|new years eve|new year eve|new years'|new year's|new year 's|new years|new year|may\s*day|yuan dan|april fools|christmas eve|christmas|xmas|thanksgiving|halloween|yuandan|easter)(\s+(of\s+)?({YearRegex}|{RelativeRegex}\s+year))?\b";
public static readonly string HolidayRegex2 = $@"\b(?<holiday>all saint's|tree planting day|white lover|st patrick|st george|cinco de mayo|independence|us independence|all hallow|all souls|guy fawkes)(\s+(of\s+)?({YearRegex}|{RelativeRegex}\s+year))?\b";
public static readonly string HolidayRegex3 = $@"(?<holiday>(martin luther king|martin luther king jr|canberra|easter|columbus|thanks\s*giving|christmas|xmas|labour|(international|int'l)\s+workers'?|mother's|mother|mothers|father's|father|fathers|female|single|teacher's|youth|children|arbor|girls|chsmilbuild|lover|labor|inauguration|groundhog|valentine's|baptiste|bastille|halloween|veterans|memorial|mid(-| )autumn|moon|spring|lantern|qingming|dragon boat|new years'|new year's|new year 's|new years|new year)\s+(day))(\s+(of\s+)?({YearRegex}|{RelativeRegex}\s+year))?";
public static readonly string HolidayRegex3 = $@"(?<holiday>(mlk|martin luther king|martin luther king jr|canberra|easter|columbus|thanks\s*giving|christmas|xmas|labour|(international|int'l)\s+workers'?|mother's|mother|mothers|father's|father|fathers|female|single|teacher's|youth|children|arbor|girls|chsmilbuild|lover|labor|inauguration|groundhog|valentine's|baptiste|bastille|halloween|veterans|memorial|mid(-| )autumn|moon|spring|lantern|qingming|dragon boat|new years'|new year's|new year 's|new years|new year)\s+(day))(\s+(of\s+)?({YearRegex}|{RelativeRegex}\s+year))?";
public const string DateTokenPrefix = "on ";
public const string TimeTokenPrefix = "at ";
public const string TokenBeforeDate = "on ";
Expand Down Expand Up @@ -210,6 +210,7 @@ public static class DateTimeDefinitions
public const string FromToRegex = @"\b(from).+(to)\b.+";
public const string SingleAmbiguousMonthRegex = @"^(the\s+)?(may|march)$";
public const string SingleAmbiguousTermsRegex = @"^(the\s+)?(day|week|month|year)$";
public const string UnspecificDatePeriodRegex = @"^(week|weekend|month|year)$";
public const string PrepositionSuffixRegex = @"\b(on|in|at|around|from|to)$";
public const string FlexibleDayRegex = @"(?<DayOfMonth>([A-Za-z]+\s)?[A-Za-z\d]+)";
public static readonly string ForTheRegex = $@"\b(((for the {FlexibleDayRegex})|(on (the\s+)?{FlexibleDayRegex}(?<=(st|nd|rd|th))))(?<end>\s*(,|\.|!|\?|$)))";
Expand Down Expand Up @@ -539,7 +540,7 @@ public static class DateTimeDefinitions
{ "fathers", new string[] { "fatherday", "fathersday" } },
{ "mothers", new string[] { "motherday", "mothersday" } },
{ "thanksgiving", new string[] { "thanksgivingday", "thanksgiving" } },
{ "martinlutherking", new string[] { "martinlutherkingday", "martinlutherkingjrday" } },
{ "martinlutherking", new string[] { "mlkday", "martinlutherkingday", "martinlutherkingjrday" } },
{ "washingtonsbirthday", new string[] { "washingtonsbirthday", "washingtonbirthday" } },
{ "canberra", new string[] { "canberraday" } },
{ "labour", new string[] { "labourday", "laborday" } },
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ public static class DateTimeDefinitions
public const string FromRegex2 = @"((depuis|de)(\s*la(s)?)?)$";
public const string FromToRegex = @"\b(du|de|des|depuis).+(à|a|au)\b.+";
public const string SingleAmbiguousMonthRegex = @"^(le\s+)?(may|march)$";
public const string UnspecificDatePeriodRegex = @"^[.]";
public const string PrepositionSuffixRegex = @"\b(du|de|[àa]|vers|dans)$";
public const string FlexibleDayRegex = @"(?<DayOfMonth>([A-Za-z]+\s)?[A-Za-z\d]+)";
public static readonly string ForTheRegex = $@"\b(((pour le {FlexibleDayRegex})|(dans (le\s+)?{FlexibleDayRegex}(?<=(st|nd|rd|th))))(?<end>\s*(,|\.|!|\?|$)))";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@ public static class DateTimeDefinitions
public const string ConnectorRegex = @"^(-|,|für|t|gegen)$";
public const string FromToRegex = @"\b(vom|von).+(bis(\s*zum)?)\b.+";
public const string SingleAmbiguousMonthRegex = @"^(the\s+)?(may|march)$";
public const string UnspecificDatePeriodRegex = @"^[.]";
public const string PrepositionSuffixRegex = @"\b(am|in|um|gegen|von|vom|zum)$";
public const string FlexibleDayRegex = @"(?<DayOfMonth>([A-Za-z]+\s)?[A-Za-z\d]+)";
public static readonly string ForTheRegex = $@"\b(für den {FlexibleDayRegex})";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,7 @@ public static class DateTimeDefinitions
public const string ReferenceDatePeriodRegex = @"^[.]";
public const string FromToRegex = @"\b(from).+(to)\b.+";
public const string SingleAmbiguousMonthRegex = @"^(the\s+)?(may|march)$";
public const string UnspecificDatePeriodRegex = @"^[.]";
public const string PrepositionSuffixRegex = @"\b(on|in|at|around|from|to)$";
public const string RestOfDateTimeRegex = @"^[\.]";
public const string SetWeekDayRegex = @"^[\.]";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,8 @@ public static class DateTimeDefinitions
public const string ReferenceDatePeriodRegex = @"^[.]";
public const string FromToRegex = @"\b(from).+(to)\b.+";
public const string SingleAmbiguousMonthRegex = @"^(the\s+)?(may|march)$";
public const string PrepositionSuffixRegex = @"\b(on|in|at|around|from|to)$";
public const string UnspecificDatePeriodRegex = @"^[.]";
public const string PrepositionSuffixRegex = @"\b(on|in|at|around|for|during|since|from|to)$";
public const string RestOfDateTimeRegex = @"^[\.]";
public const string SetWeekDayRegex = @"^[\.]";
public const string NightRegex = @"\b(medionoche|noche)\b";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,10 @@ public class EnglishDateExtractorConfiguration : IDateExtractorConfiguration

public static readonly Regex[] DateRegexList =
{
// (Sunday,)? April 5
// ((this)? Sunday,)? April 5
new Regex(DateTimeDefinitions.DateExtractor1, RegexOptions.IgnoreCase | RegexOptions.Singleline),

// (Sunday,)? April 5, 2016
// ((this)? Sunday,)? April 5, 2016
new Regex(DateTimeDefinitions.DateExtractor2, RegexOptions.IgnoreCase | RegexOptions.Singleline),

// (Sunday,)? 6th of April
Expand Down Expand Up @@ -140,6 +140,9 @@ public class EnglishDateExtractorConfiguration : IDateExtractorConfiguration
public static readonly ImmutableDictionary<string, int> DayOfWeek =
DateTimeDefinitions.DayOfWeek.ToImmutableDictionary();

public static readonly ImmutableDictionary<string, int> MonthOfYear =
DateTimeDefinitions.MonthOfYear.ToImmutableDictionary();

public EnglishDateExtractorConfiguration()
{
Options = DateTimeOptions.None;
Expand All @@ -166,6 +169,8 @@ public EnglishDateExtractorConfiguration()

IImmutableDictionary<string, int> IDateExtractorConfiguration.DayOfWeek => DayOfWeek;

IImmutableDictionary<string, int> IDateExtractorConfiguration.MonthOfYear => MonthOfYear;

Regex IDateExtractorConfiguration.OfMonth => OfMonth;

Regex IDateExtractorConfiguration.MonthEnd => MonthEnd;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,15 @@ public class EnglishMergedExtractorConfiguration : IMergedExtractorConfiguration
public static readonly Regex YearAfterRegex =
new Regex(DateTimeDefinitions.YearAfterRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline);

public static readonly Regex UnspecificDatePeriodRegex =
new Regex(DateTimeDefinitions.UnspecificDatePeriodRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline);

public static readonly Regex[] FilterWordRegexList =
{
// one on one
new Regex(DateTimeDefinitions.OneOnOneRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline),
// (the)? (day|week|month|year)
new Regex(DateTimeDefinitions.SingleAmbiguousTermsRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline),
new Regex(DateTimeDefinitions.SingleAmbiguousTermsRegex, RegexOptions.IgnoreCase | RegexOptions.Singleline)
};

public static readonly StringMatcher SuperfluousWordMatcher = new StringMatcher();
Expand Down Expand Up @@ -100,6 +103,7 @@ public EnglishMergedExtractorConfiguration(DateTimeOptions options)
Regex IMergedExtractorConfiguration.PrepositionSuffixRegex => PrepositionSuffixRegex;
Regex IMergedExtractorConfiguration.NumberEndingPattern => NumberEndingPattern;
Regex IMergedExtractorConfiguration.YearAfterRegex => YearAfterRegex;
Regex IMergedExtractorConfiguration.UnspecificDatePeriodRegex => UnspecificDatePeriodRegex;
IEnumerable<Regex> IMergedExtractorConfiguration.FilterWordRegexList => FilterWordRegexList;
StringMatcher IMergedExtractorConfiguration.SuperfluousWordMatcher => SuperfluousWordMatcher;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using DateObject = System.DateTime;

using Microsoft.Recognizers.Text.Number;

using System;
using System.Collections.Generic;
using System.Collections.Immutable;
using System.Text.RegularExpressions;
using DateObject = System.DateTime;

namespace Microsoft.Recognizers.Text.DateTime
{
public class BaseDateExtractor : IDateTimeExtractor
Expand Down Expand Up @@ -42,11 +41,11 @@ public int GetYearFromText(Match match)
if (!string.IsNullOrEmpty(yearStr))
{
year = int.Parse(yearStr);
if (year < 100 && year >= 90)
if (year < 100 && year >= Constants.MinTwoDigitYearPastNum)
{
year += 1900;
}
else if (year < 100 && year < 30)
else if (year >= 0 && year < Constants.MaxTwoDigitYearFutureNum)
{
year += 2000;
}
Expand All @@ -56,10 +55,12 @@ public int GetYearFromText(Match match)
var firstTwoYearNumStr = match.Groups["firsttwoyearnum"].Value;
if (!string.IsNullOrEmpty(firstTwoYearNumStr))
{
ExtractResult er = new ExtractResult();
er.Text = firstTwoYearNumStr;
er.Start = match.Groups["firsttwoyearnum"].Index;
er.Length = match.Groups["firsttwoyearnum"].Length;
var er = new ExtractResult
{
Text = firstTwoYearNumStr,
Start = match.Groups["firsttwoyearnum"].Index,
Length = match.Groups["firsttwoyearnum"].Length
};

var firstTwoYearNum = Convert.ToInt32((double)(this.config.NumberParser.Parse(er).Value ?? 0));

Expand Down Expand Up @@ -157,7 +158,9 @@ private List<Token> NumberWithMonth(string text, DateObject reference)
var startIndex = match.Index;
var endIndex = match.Index + match.Length + (result.Length ?? 0);

ExtendWithWeekdayAndYear(ref startIndex, ref endIndex, text);
ExtendWithWeekdayAndYear(ref startIndex, ref endIndex,
config.MonthOfYear.GetValueOrDefault(match.Groups["month"].Value.ToLower(), reference.Month),
num, text, reference);

ret.Add(new Token(startIndex, endIndex));
continue;
Expand Down Expand Up @@ -204,7 +207,7 @@ private List<Token> NumberWithMonth(string text, DateObject reference)

// Get week day from text directly, compare it with the weekday generated above
// to see whether they refer to the same week day
var extractedWeekDayStr = matchCase.Groups["weekday"].Value.ToString().ToLower();
var extractedWeekDayStr = matchCase.Groups["weekday"].Value.ToLower();
if (!date.Equals(DateObject.MinValue) &&
config.DayOfWeek[numWeekDayStr] == config.DayOfWeek[extractedWeekDayStr])
{
Expand Down Expand Up @@ -258,6 +261,7 @@ private List<Token> NumberWithMonth(string text, DateObject reference)
}
}

// For cases like "I'll go back twenty second of June"
if (result.Start + result.Length < text.Length)
{
var afterStr = text.Substring(result.Start + result.Length ?? 0);
Expand All @@ -268,33 +272,54 @@ private List<Token> NumberWithMonth(string text, DateObject reference)
var startIndex = result.Start ?? 0;
var endIndex = (result.Start + result.Length ?? 0) + match.Length;

ExtendWithWeekdayAndYear(ref startIndex, ref endIndex, text);
ExtendWithWeekdayAndYear(ref startIndex, ref endIndex,
config.MonthOfYear.GetValueOrDefault(match.Groups["month"].Value.ToLower(), reference.Month),
num, text, reference);

ret.Add(new Token(startIndex, endIndex));
continue;
}
}
}

return ret;
}

private void ExtendWithWeekdayAndYear(ref int startIndex, ref int endIndex, string text)
// TODO: Remove the parsing logic from here
// NOTE(review): this span is a rendered diff without +/- markers. The one-line signature above
// and the first weekday check below (which moves startIndex unconditionally) appear to be the
// removed pre-change lines; the documentation here describes the longer signature - confirm
// against the actual file.
/// <summary>
/// Widens the token span delimited by startIndex and endIndex within text.
/// endIndex is advanced past a year that matches at the very start of the text following the span,
/// and startIndex is moved back to a weekday that ends the text preceding the span, but only when
/// that weekday is the same day of the week as the date built from the year, month and day.
/// </summary>
/// <param name="startIndex">Start of the span in text; may be moved back to the weekday match.</param>
/// <param name="endIndex">Index of the first character after the span; may be advanced by the year match length.</param>
/// <param name="month">Month used to build the date that the weekday is checked against.</param>
/// <param name="day">Day of month used to build the date that the weekday is checked against.</param>
/// <param name="text">The full text the indices refer to.</param>
/// <param name="reference">Reference date; its Year is used when no year follows the span.</param>
private void ExtendWithWeekdayAndYear(ref int startIndex,
ref int endIndex, int month, int day, string text, DateObject reference)
{
// Check whether there's weekday
var prefix = text.Substring(0, startIndex);
var matchWeekDay = this.config.WeekDayEnd.Match(prefix);
if (matchWeekDay.Success)
{
startIndex = matchWeekDay.Index;
}
// Default to the reference year; replaced below if an explicit year follows the span
var year = reference.Year;

// Check whether there's year
// Check whether there's a year
var suffix = text.Substring(endIndex);
var matchYear = this.config.YearSuffix.Match(suffix);
// Only accept a year match that begins at the very start of the suffix
if (matchYear.Success && matchYear.Index == 0)
{
year = GetYearFromText(matchYear);
endIndex += matchYear.Length;
}

// NOTE(review): SafeCreateFromValue is defined elsewhere; the MinValue check further down
// suggests it yields DateObject.MinValue for an invalid year/month/day - confirm.
var date = DateObject.MinValue.SafeCreateFromValue(year, month, day);

// Check whether there's a weekday
var prefix = text.Substring(0, startIndex);
var matchWeekDay = this.config.WeekDayEnd.Match(prefix);
if (matchWeekDay.Success)
{

// Take the weekday directly from the text and compare it with the weekday of the date
// computed above to see whether they refer to the same day of the week
var extractedWeekDayStr = matchWeekDay.Groups["weekday"].Value.ToLower();
var numWeekDayStr = date.DayOfWeek.ToString().ToLower();

// Both names must be present in config.DayOfWeek; otherwise the span start is left unchanged
if (config.DayOfWeek.TryGetValue(numWeekDayStr, out var weekDay1) &&
config.DayOfWeek.TryGetValue(extractedWeekDayStr, out var weekDay2))
{
// Include the weekday in the span only when the date is not MinValue and the weekdays agree
if (!date.Equals(DateObject.MinValue) && weekDay1 == weekDay2)
{
startIndex = matchWeekDay.Index;
}
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,12 @@ private void AddMod(List<ExtractResult> ers, string text)
var lastEnd = 0;
foreach (var er in ers)
{
// Skip the unspecific date period
if (this.config.UnspecificDatePeriodRegex.IsMatch(er.Text))
{
continue;
}

var beforeStr = text.Substring(lastEnd, er.Start ?? 0).ToLowerInvariant();

if (HasTokenIndex(beforeStr.TrimEnd(), config.BeforeRegex, out int tokenIndex))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,7 @@ public interface IDateExtractorConfiguration : IOptionsConfiguration
IDateTimeUtilityConfiguration UtilityConfiguration { get; }

IImmutableDictionary<string, int> DayOfWeek { get; }

IImmutableDictionary<string, int> MonthOfYear { get; }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ public interface IMergedExtractorConfiguration : IOptionsConfiguration

Regex YearAfterRegex { get; }

Regex UnspecificDatePeriodRegex { get; }

StringMatcher SuperfluousWordMatcher { get; }

}
Expand Down
Loading

0 comments on commit 5185f08

Please sign in to comment.