diff --git a/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/english/extractors/EnglishTimePeriodExtractorConfiguration.java b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/english/extractors/EnglishTimePeriodExtractorConfiguration.java
new file mode 100644
index 0000000000..d6a79e00c9
--- /dev/null
+++ b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/english/extractors/EnglishTimePeriodExtractorConfiguration.java
@@ -0,0 +1,154 @@
+package com.microsoft.recognizers.text.datetime.english.extractors;
+
+import com.microsoft.recognizers.text.IExtractor;
+import com.microsoft.recognizers.text.datetime.extractors.IDateTimeExtractor;
+import com.microsoft.recognizers.text.datetime.extractors.config.ResultIndex;
+import com.microsoft.recognizers.text.datetime.utilities.IDateTimeUtilityConfiguration;
+import com.microsoft.recognizers.text.number.english.extractors.IntegerExtractor;
+import com.microsoft.recognizers.text.utilities.RegExpUtility;
+import com.microsoft.recognizers.text.datetime.DateTimeOptions;
+import com.microsoft.recognizers.text.datetime.resources.EnglishDateTime;
+import com.microsoft.recognizers.text.datetime.extractors.BaseTimeExtractor;
+import com.microsoft.recognizers.text.datetime.config.BaseOptionsConfiguration;
+import com.microsoft.recognizers.text.datetime.extractors.config.ITimePeriodExtractorConfiguration;
+import com.microsoft.recognizers.text.datetime.english.parsers.EnglishDatetimeUtilityConfiguration;
+
+import java.util.ArrayList;
+import java.util.regex.Pattern;
+
+import static com.microsoft.recognizers.text.datetime.resources.EnglishDateTime.TokenBeforeDate;
+
+/**
+ * English resources for {@code BaseTimePeriodExtractor}: compiled patterns, the single-time and
+ * integer extractors it composes, and the "from" / "between" / "and" token checks.
+ */
+public class EnglishTimePeriodExtractorConfiguration extends BaseOptionsConfiguration implements ITimePeriodExtractorConfiguration {
+
+    public static final Pattern AmRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.AmRegex);
+    public static final Pattern PmRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.PmRegex);
+    public static final Pattern HourRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.HourRegex);
+    public static final Pattern TillRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.TillRegex);
+    public static final Pattern PeriodDescRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.DescRegex);
+    public static final Pattern PureNumFromTo = RegExpUtility.getSafeRegExp(EnglishDateTime.PureNumFromTo);
+    public static final Pattern TimeUnitRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.TimeUnitRegex);
+    public static final Pattern TimeOfDayRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.TimeOfDayRegex);
+    public static final Pattern PrepositionRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.PrepositionRegex);
+    public static final Pattern TimeFollowedUnit = RegExpUtility.getSafeRegExp(EnglishDateTime.TimeFollowedUnit);
+    public static final Pattern PureNumBetweenAnd = RegExpUtility.getSafeRegExp(EnglishDateTime.PureNumBetweenAnd);
+    public static final Pattern GeneralEndingRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.GeneralEndingRegex);
+    public static final Pattern PeriodHourNumRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.PeriodHourNumRegex);
+    public static final Pattern SpecificTimeFromTo = RegExpUtility.getSafeRegExp(EnglishDateTime.SpecificTimeFromTo);
+    public static final Pattern SpecificTimeBetweenAnd = RegExpUtility.getSafeRegExp(EnglishDateTime.SpecificTimeBetweenAnd);
+    public static final Pattern SpecificTimeOfDayRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.SpecificTimeOfDayRegex);
+    public static final Pattern TimeNumberCombinedWithUnit = RegExpUtility.getSafeRegExp(EnglishDateTime.TimeNumberCombinedWithUnit);
+
+    private final String tokenBeforeDate;
+    private final IDateTimeUtilityConfiguration utilityConfiguration;
+    private final IDateTimeExtractor singleTimeExtractor;
+    private final IExtractor integerExtractor;
+
+    /** Simple-case patterns; kept as a public field for source compatibility, prefer {@link #getSimpleCasesRegex()}. */
+    public final Iterable<Pattern> getSimpleCasesRegex;
+
+    /** Creates a configuration with {@link DateTimeOptions#None}. */
+    public EnglishTimePeriodExtractorConfiguration() {
+        this(DateTimeOptions.None);
+    }
+
+    /**
+     * Creates a configuration for the given options.
+     *
+     * @param options passed to the base configuration and to the nested time extractor configuration
+     */
+    public EnglishTimePeriodExtractorConfiguration(DateTimeOptions options) {
+        super(options);
+        tokenBeforeDate = TokenBeforeDate;
+        singleTimeExtractor = new BaseTimeExtractor(new EnglishTimeExtractorConfiguration(options));
+        utilityConfiguration = new EnglishDatetimeUtilityConfiguration();
+        integerExtractor = IntegerExtractor.getInstance();
+
+        // Plain add() calls rather than double-brace initialization, which would create an
+        // anonymous ArrayList subclass holding a reference to this configuration instance.
+        ArrayList<Pattern> simpleCases = new ArrayList<>();
+        simpleCases.add(PureNumFromTo);
+        simpleCases.add(PureNumBetweenAnd);
+        simpleCases.add(SpecificTimeFromTo);
+        simpleCases.add(SpecificTimeBetweenAnd);
+        getSimpleCasesRegex = simpleCases;
+    }
+
+    @Override
+    public final String getTokenBeforeDate() {
+        return tokenBeforeDate;
+    }
+
+    public final IDateTimeUtilityConfiguration getUtilityConfiguration() {
+        return utilityConfiguration;
+    }
+
+    @Override
+    public final IDateTimeExtractor getSingleTimeExtractor() {
+        return singleTimeExtractor;
+    }
+
+    @Override
+    public final IExtractor getIntegerExtractor() {
+        return integerExtractor;
+    }
+
+    @Override
+    public Iterable<Pattern> getSimpleCasesRegex() {
+        return getSimpleCasesRegex;
+    }
+
+    @Override
+    public final Pattern getTillRegex() {
+        return TillRegex;
+    }
+
+    @Override
+    public final Pattern getTimeOfDayRegex() {
+        return TimeOfDayRegex;
+    }
+
+    @Override
+    public final Pattern getGeneralEndingRegex() {
+        return GeneralEndingRegex;
+    }
+
+    /**
+     * Reports whether {@code input} ends with "from" and, if so, where that token starts.
+     *
+     * @param input text preceding the first time point
+     * @return {@code (true, index of the trailing "from")}, or {@code (false, -1)}
+     */
+    @Override
+    public final ResultIndex GetFromTokenIndex(String input) {
+        return trailingTokenIndex(input, "from");
+    }
+
+    /**
+     * Reports whether {@code input} ends with "between" and, if so, where that token starts.
+     *
+     * @param input text preceding the first time point
+     * @return {@code (true, index of the trailing "between")}, or {@code (false, -1)}
+     */
+    @Override
+    public final ResultIndex GetBetweenTokenIndex(String input) {
+        return trailingTokenIndex(input, "between");
+    }
+
+    /** Returns {@code true} when {@code input} is exactly "and"; a {@code null} input yields {@code false}. */
+    @Override
+    public final boolean HasConnectorToken(String input) {
+        return "and".equals(input);
+    }
+
+    /** Shared lookup behind the "from" / "between" checks. */
+    private static ResultIndex trailingTokenIndex(String input, String token) {
+        if (input.endsWith(token)) {
+            return new ResultIndex(true, input.lastIndexOf(token));
+        }
+        return new ResultIndex(false, -1);
+    }
+}
diff --git a/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/extractors/BaseTimePeriodExtractor.java b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/extractors/BaseTimePeriodExtractor.java
new file mode 100644
index 0000000000..452183b3d4
--- /dev/null
+++ b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/extractors/BaseTimePeriodExtractor.java
@@ -0,0 +1,247 @@
+package com.microsoft.recognizers.text.datetime.extractors;
+
+import com.microsoft.recognizers.text.ExtractResult;
+import com.microsoft.recognizers.text.datetime.Constants;
+import com.microsoft.recognizers.text.datetime.extractors.config.ITimePeriodExtractorConfiguration;
+import com.microsoft.recognizers.text.datetime.extractors.config.ResultIndex;
+import com.microsoft.recognizers.text.datetime.utilities.Token;
+import com.microsoft.recognizers.text.utilities.Match;
+import com.microsoft.recognizers.text.utilities.RegExpUtility;
+import com.microsoft.recognizers.text.utilities.StringUtility;
+
+import java.time.LocalDateTime;
+import java.util.*;
+import java.util.regex.Pattern;
+
+/**
+ * Extracts time periods such as "from 3 to 5am" or "between 3:30 and 5" using the patterns and
+ * helper extractors supplied by an {@link ITimePeriodExtractorConfiguration}.
+ */
+public class BaseTimePeriodExtractor implements IDateTimeExtractor {
+
+    private final ITimePeriodExtractorConfiguration config;
+
+    public BaseTimePeriodExtractor(ITimePeriodExtractorConfiguration config) {
+        this.config = config;
+    }
+
+    @Override
+    public String getExtractorName() {
+        return Constants.SYS_DATETIME_TIMEPERIOD;
+    }
+
+    /**
+     * Collects candidate spans from the three strategies below and merges them.
+     *
+     * @param input text to scan
+     * @param reference reference time handed to the single-time extractor
+     * @return merged results produced by {@code Token.mergeAllTokens}
+     */
+    @Override
+    public List<ExtractResult> extract(String input, LocalDateTime reference) {
+        List<Token> tokens = new ArrayList<>();
+        tokens.addAll(matchSimpleCases(input));
+        tokens.addAll(mergeTwoTimePoints(input, reference));
+        tokens.addAll(matchTimeOfDay(input));
+
+        return Token.mergeAllTokens(tokens, input, getExtractorName());
+    }
+
+    /** Same as {@link #extract(String, LocalDateTime)} with the current local time as reference. */
+    @Override
+    public List<ExtractResult> extract(String input) {
+        return this.extract(input, LocalDateTime.now());
+    }
+
+    // Cases like "from 3 to 5am" or "between 3:30 and 5" are extracted here.
+    // Cases like "from 3 to 5" are not, because there is no "am/pm" or "hh:mm" to infer a time period.
+    // Cases like "from 3:30 to 4 people" should not be extracted as a time period either.
+    private List<Token> matchSimpleCases(String input) {
+        List<Token> ret = new ArrayList<>();
+
+        for (Pattern regex : this.config.getSimpleCasesRegex()) {
+            for (Match match : RegExpUtility.getMatches(regex, input)) {
+                int matchEnd = match.index + match.length;
+                boolean accepted;
+
+                if (hasGroup(match, Constants.MinuteGroupName) || hasGroup(match, Constants.SecondGroupName)) {
+                    // "from 10:30 to 11" needs no "am/pm", but "from 3:30 to 4 people" must be rejected,
+                    // so whatever follows the match has to be a valid ending.
+                    accepted = endsWithValidToken(input, matchEnd);
+                } else {
+                    // Without minutes or seconds, require an am/pm/description marker.
+                    accepted = hasGroup(match, Constants.PmGroupName)
+                        || hasGroup(match, Constants.AmGroupName)
+                        || hasGroup(match, Constants.DescGroupName);
+                }
+
+                if (accepted) {
+                    ret.add(new Token(match.index, matchEnd));
+                }
+            }
+        }
+
+        return ret;
+    }
+
+    // A group counts as matched only when it captured text. A plain null check is not enough if
+    // getGroup() reports an unmatched group as an empty string, which the am/pm check already assumes.
+    private static boolean hasGroup(Match match, String groupName) {
+        return !StringUtility.isNullOrEmpty(match.getGroup(groupName).value);
+    }
+
+    // True when nothing follows the match, when the remainder contains a general-ending match, or
+    // when the trimmed remainder starts with the token-before-date (like "on").
+    private boolean endsWithValidToken(String input, int matchEnd) {
+        if (matchEnd == input.length()) {
+            return true;
+        }
+
+        String afterStr = input.substring(matchEnd);
+        return firstMatch(this.config.getGeneralEndingRegex(), afterStr).isPresent()
+            || afterStr.trim().startsWith(this.config.getTokenBeforeDate());
+    }
+
+    private static Optional<Match> firstMatch(Pattern regex, String text) {
+        return Arrays.stream(RegExpUtility.getMatches(regex, text)).findFirst();
+    }
+
+    // Pairs up adjacent time points joined by a "till" connector ("3pm to 5pm") or by the
+    // "between ... and ..." form, extending the span over a leading "from" / "between".
+    private List<Token> mergeTwoTimePoints(String input, LocalDateTime reference) {
+        List<Token> ret = new ArrayList<>();
+
+        // Copy the result: bare numbers may be appended below and the extractor's own list must
+        // not be modified. The reference time is forwarded rather than silently dropped.
+        List<ExtractResult> ers = new ArrayList<>(this.config.getSingleTimeExtractor().extract(input, reference));
+
+        // Handling ending number as a time point.
+        List<ExtractResult> numErs = this.config.getIntegerExtractor().extract(input);
+        if (!numErs.isEmpty()) {
+            addNumberTimePoints(input, ers, numErs);
+        }
+
+        int idx = 0;
+        while (idx < ers.size() - 1) {
+            ExtractResult first = ers.get(idx);
+            ExtractResult second = ers.get(idx + 1);
+            int middleBegin = first.start + first.length;
+            int middleEnd = second.start;
+
+            if (middleEnd <= middleBegin) {
+                idx++;
+                continue;
+            }
+
+            String middleStr = input.substring(middleBegin, middleEnd).trim().toLowerCase(Locale.ROOT);
+            int periodBegin = first.start;
+            int periodEnd = second.start + second.length;
+
+            // NOTE(review): token indexes are relative to this trimmed prefix, so leading whitespace
+            // in input would shift them - confirm whether callers normalize input first.
+            String beforeStr = input.substring(0, periodBegin).trim().toLowerCase(Locale.ROOT);
+
+            // Handle "{TimePoint} to {TimePoint}"
+            Optional<Match> tillMatch = firstMatch(this.config.getTillRegex(), middleStr);
+            if (tillMatch.isPresent() && tillMatch.get().index == 0 && tillMatch.get().length == middleStr.length()) {
+                ResultIndex fromIndex = this.config.GetFromTokenIndex(beforeStr);
+                ResultIndex betweenIndex = this.config.GetBetweenTokenIndex(beforeStr);
+                if (fromIndex.result) {
+                    periodBegin = fromIndex.index;
+                } else if (betweenIndex.result) {
+                    periodBegin = betweenIndex.index;
+                }
+
+                ret.add(new Token(periodBegin, periodEnd));
+                idx += 2;
+                continue;
+            }
+
+            // Handle "between {TimePoint} and {TimePoint}"
+            if (this.config.HasConnectorToken(middleStr)) {
+                ResultIndex betweenIndex = this.config.GetBetweenTokenIndex(beforeStr);
+                if (betweenIndex.result) {
+                    ret.add(new Token(betweenIndex.index, periodEnd));
+                    idx += 2;
+                    continue;
+                }
+            }
+
+            idx++;
+        }
+
+        return ret;
+    }
+
+    // Adds bare integers that act as time points: a number that ends the input (or is followed
+    // by a general ending), and any number joined to a following time point by the till-connector.
+    private void addNumberTimePoints(String input, List<ExtractResult> ers, List<ExtractResult> numErs) {
+        List<ExtractResult> timeNumbers = new ArrayList<>();
+
+        // Check if the last number is an ending number.
+        ExtractResult lastNum = numErs.get(numErs.size() - 1);
+        if (isEndingNumber(input, lastNum)) {
+            timeNumbers.add(lastNum);
+        }
+
+        int j = 0;
+        for (ExtractResult num : numErs) {
+            // Find the subsequent time point.
+            int numEndPoint = num.start + num.length;
+            while (j < ers.size() && ers.get(j).start <= numEndPoint) {
+                j++;
+            }
+
+            if (j >= ers.size()) {
+                break;
+            }
+
+            // Check the connector string: it must consist of the till-connector only.
+            String midStr = input.substring(numEndPoint, ers.get(j).start);
+            Optional<Match> match = firstMatch(this.config.getTillRegex(), midStr);
+            if (match.isPresent() && match.get().length == midStr.trim().length()) {
+                timeNumbers.add(num);
+            }
+        }
+
+        // Skip numbers that overlap a time point already in the list (including ones added here).
+        for (ExtractResult timeNum : timeNumbers) {
+            if (!startsInsideAny(ers, timeNum)) {
+                ers.add(timeNum);
+            }
+        }
+
+        // Integer.compare instead of subtraction, which can overflow.
+        ers.sort((x, y) -> Integer.compare(x.start, y.start));
+    }
+
+    private boolean isEndingNumber(String input, ExtractResult num) {
+        int numEnd = num.start + num.length;
+        if (numEnd == input.length()) {
+            return true;
+        }
+
+        // Match against the text after the number; searching the whole input would also accept
+        // ending tokens that appear before the number.
+        String afterStr = input.substring(numEnd);
+        return firstMatch(this.config.getGeneralEndingRegex(), afterStr).isPresent();
+    }
+
+    private static boolean startsInsideAny(List<ExtractResult> ers, ExtractResult candidate) {
+        for (ExtractResult er : ers) {
+            if (er.start <= candidate.start && er.start + er.length >= candidate.start) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    private List<Token> matchTimeOfDay(String input) {
+        List<Token> ret = new ArrayList<>();
+        for (Match match : RegExpUtility.getMatches(this.config.getTimeOfDayRegex(), input)) {
+            ret.add(new Token(match.index, match.index + match.length));
+        }
+
+        return ret;
+    }
+}
diff --git a/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/extractors/config/ITimePeriodExtractorConfiguration.java b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/extractors/config/ITimePeriodExtractorConfiguration.java
new file mode 100644
index 0000000000..f9c5435df8
--- /dev/null
+++ b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/extractors/config/ITimePeriodExtractorConfiguration.java
@@ -0,0 +1,21 @@
+package com.microsoft.recognizers.text.datetime.extractors.config;
+
+import com.microsoft.recognizers.text.IExtractor;
+import com.microsoft.recognizers.text.datetime.config.IOptionsConfiguration;
+import com.microsoft.recognizers.text.datetime.extractors.IDateTimeExtractor;
+
+import java.util.regex.Pattern;
+
+/** Language-specific resources consumed by {@code BaseTimePeriodExtractor}. */
+public interface ITimePeriodExtractorConfiguration extends IOptionsConfiguration {
+    String getTokenBeforeDate();
+    IExtractor getIntegerExtractor();
+    Iterable<Pattern> getSimpleCasesRegex();
+    Pattern getTillRegex();
+    Pattern getTimeOfDayRegex();
+    Pattern getGeneralEndingRegex();
+    IDateTimeExtractor getSingleTimeExtractor();
+    ResultIndex GetFromTokenIndex(String text);
+    boolean HasConnectorToken(String text);
+    ResultIndex GetBetweenTokenIndex(String text);
+}
diff --git a/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/extractors/config/ResultIndex.java b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/extractors/config/ResultIndex.java
index 3a88b1a5ab..0c7ad0548c 100644
--- a/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/extractors/config/ResultIndex.java
+++ b/Java/libraries/recognizers-text-date-time/src/main/java/com/microsoft/recognizers/text/datetime/extractors/config/ResultIndex.java
@@ -8,4 +8,14 @@ public ResultIndex(boolean result, int index) {
         this.result = result;
         this.index = index;
     }
+
+    /** Returns a copy of this value with {@code result} replaced by {@code newResult}. */
+    public ResultIndex withResult(boolean newResult) {
+        return new ResultIndex(newResult, this.index);
+    }
+
+    /** Returns a copy of this value with {@code index} replaced by {@code newIndex}. */
+    public ResultIndex withIndex(int newIndex) {
+        return new ResultIndex(this.result, newIndex);
+    }
 }
diff --git a/Java/tests/src/test/java/com/microsoft/recognizers/text/tests/datetime/DateTimeExtractorTest.java b/Java/tests/src/test/java/com/microsoft/recognizers/text/tests/datetime/DateTimeExtractorTest.java
index 26684ce540..cca9eb0517 100644
--- a/Java/tests/src/test/java/com/microsoft/recognizers/text/tests/datetime/DateTimeExtractorTest.java
+++ b/Java/tests/src/test/java/com/microsoft/recognizers/text/tests/datetime/DateTimeExtractorTest.java
@@ -8,14 +8,7 @@
 import com.microsoft.recognizers.text.ExtractResult;
 import com.microsoft.recognizers.text.ModelResult;
 import com.microsoft.recognizers.text.datetime.DateTimeOptions;
-import com.microsoft.recognizers.text.datetime.english.extractors.EnglishTimeZoneExtractorConfiguration;
-import com.microsoft.recognizers.text.datetime.english.extractors.EnglishDateExtractorConfiguration;
-import com.microsoft.recognizers.text.datetime.english.extractors.EnglishDatePeriodExtractorConfiguration;
-import com.microsoft.recognizers.text.datetime.english.extractors.EnglishDurationExtractorConfiguration;
-import com.microsoft.recognizers.text.datetime.english.extractors.EnglishHolidayExtractorConfiguration;
-import com.microsoft.recognizers.text.datetime.english.extractors.EnglishTimeExtractorConfiguration;
-import com.microsoft.recognizers.text.datetime.extractors.BaseTimeExtractor;
-import com.microsoft.recognizers.text.datetime.extractors.BaseTimeZoneExtractor;
+import com.microsoft.recognizers.text.datetime.english.extractors.*;
 import com.microsoft.recognizers.text.datetime.extractors.*;
 import com.microsoft.recognizers.text.tests.AbstractTest;
 import com.microsoft.recognizers.text.tests.TestCase;
@@ -100,12 +93,15 @@ private static IDateTimeExtractor getEnglishExtractor(String name) {
                 return new BaseDurationExtractor(new EnglishDurationExtractorConfiguration());
             case "HolidayExtractor":
                 return new BaseHolidayExtractor(new EnglishHolidayExtractorConfiguration());
+            case "TimeExtractor":
+                return new BaseTimeExtractor(new EnglishTimeExtractorConfiguration());
+            case "TimePeriodExtractor":
+                return new BaseTimePeriodExtractor(new EnglishTimePeriodExtractorConfiguration());
             case "TimeZoneExtractor":
                 return new BaseTimeZoneExtractor(new EnglishTimeZoneExtractorConfiguration(DateTimeOptions.EnablePreview));
-            case "TimeExtractor":
-                return new BaseTimeExtractor(new EnglishTimeExtractorConfiguration());
+
             default:
                 throw new AssumptionViolatedException("Extractor Type/Name not supported.");
         }
     }
-}
\ No newline at end of file
+}