Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package com.microsoft.recognizers.text.datetime.english.extractors;

import com.microsoft.recognizers.text.IExtractor;
import com.microsoft.recognizers.text.datetime.extractors.IDateTimeExtractor;
import com.microsoft.recognizers.text.datetime.extractors.config.ResultIndex;
import com.microsoft.recognizers.text.datetime.utilities.IDateTimeUtilityConfiguration;
import com.microsoft.recognizers.text.number.english.extractors.IntegerExtractor;
import com.microsoft.recognizers.text.utilities.RegExpUtility;
import com.microsoft.recognizers.text.datetime.DateTimeOptions;
import com.microsoft.recognizers.text.datetime.resources.EnglishDateTime;
import com.microsoft.recognizers.text.datetime.extractors.BaseTimeExtractor;
import com.microsoft.recognizers.text.datetime.config.BaseOptionsConfiguration;
import com.microsoft.recognizers.text.datetime.extractors.config.ITimePeriodExtractorConfiguration;
import com.microsoft.recognizers.text.datetime.english.parsers.EnglishDatetimeUtilityConfiguration;

import java.util.ArrayList;
import java.util.regex.Pattern;

import static com.microsoft.recognizers.text.datetime.resources.EnglishDateTime.TokenBeforeDate;

public class EnglishTimePeriodExtractorConfiguration extends BaseOptionsConfiguration implements ITimePeriodExtractorConfiguration {
private String TokenBeforeDate;
public final String getTokenBeforeDate() { return TokenBeforeDate; }

public static final Pattern AmRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.AmRegex);
public static final Pattern PmRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.PmRegex);
public static final Pattern HourRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.HourRegex);
public static final Pattern TillRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.TillRegex);
public static final Pattern PeriodDescRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.DescRegex);
public static final Pattern PureNumFromTo = RegExpUtility.getSafeRegExp(EnglishDateTime.PureNumFromTo);
public static final Pattern TimeUnitRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.TimeUnitRegex);
public static final Pattern TimeOfDayRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.TimeOfDayRegex);
public static final Pattern PrepositionRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.PrepositionRegex);
public static final Pattern TimeFollowedUnit = RegExpUtility.getSafeRegExp(EnglishDateTime.TimeFollowedUnit);
public static final Pattern PureNumBetweenAnd = RegExpUtility.getSafeRegExp(EnglishDateTime.PureNumBetweenAnd);
public static final Pattern GeneralEndingRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.GeneralEndingRegex);
public static final Pattern PeriodHourNumRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.PeriodHourNumRegex);
public static final Pattern SpecificTimeFromTo = RegExpUtility.getSafeRegExp(EnglishDateTime.SpecificTimeFromTo);
public static final Pattern SpecificTimeBetweenAnd = RegExpUtility.getSafeRegExp(EnglishDateTime.SpecificTimeBetweenAnd);
public static final Pattern SpecificTimeOfDayRegex = RegExpUtility.getSafeRegExp(EnglishDateTime.SpecificTimeOfDayRegex);
public static final Pattern TimeNumberCombinedWithUnit = RegExpUtility.getSafeRegExp(EnglishDateTime.TimeNumberCombinedWithUnit);

public EnglishTimePeriodExtractorConfiguration()
{
this(DateTimeOptions.None);
}

//C# TO JAVA CONVERTER WARNING: The following constructor is declared outside of its associated class:
//ORIGINAL LINE: public EnglishTimePeriodExtractorConfiguration(DateTimeOptions options = DateTimeOptions.None)
//C# TO JAVA CONVERTER NOTE: Java does not support optional parameters. Overloaded method(s) are created above:
public EnglishTimePeriodExtractorConfiguration(DateTimeOptions options)
{
super(options);
TokenBeforeDate = EnglishDateTime.TokenBeforeDate;
SingleTimeExtractor = new BaseTimeExtractor(new EnglishTimeExtractorConfiguration(options));
UtilityConfiguration = new EnglishDatetimeUtilityConfiguration();
IntegerExtractor = com.microsoft.recognizers.text.number.english.extractors.IntegerExtractor.getInstance();
}

private IDateTimeUtilityConfiguration UtilityConfiguration;
public final IDateTimeUtilityConfiguration getUtilityConfiguration() { return UtilityConfiguration; }

private IDateTimeExtractor SingleTimeExtractor;
public final IDateTimeExtractor getSingleTimeExtractor() { return SingleTimeExtractor; }

private IExtractor IntegerExtractor;
public final IExtractor getIntegerExtractor() { return IntegerExtractor; }

public Iterable<Pattern> getSimpleCasesRegex() {
return getSimpleCasesRegex;
}

public final Iterable<Pattern> getSimpleCasesRegex = new ArrayList<Pattern>() {
{
add(PureNumFromTo);
add(PureNumBetweenAnd);
add(SpecificTimeFromTo);
add(SpecificTimeBetweenAnd);
}
};

public final Pattern getTillRegex() { return TillRegex; }
public final Pattern getTimeOfDayRegex() { return TimeOfDayRegex; }
public final Pattern getGeneralEndingRegex() { return GeneralEndingRegex; }

public final ResultIndex GetFromTokenIndex(String input)
{
ResultIndex result = new ResultIndex(false, -1);
if (input.endsWith("from"))
{
result = result.withIndex( input.lastIndexOf("from" ) );
result = result.withResult(true);
}

return result;
}

public final ResultIndex GetBetweenTokenIndex(String input)
{
ResultIndex result = new ResultIndex(false, -1);
if (input.endsWith("between"))
{
result = result.withIndex( input.lastIndexOf("between" ) );
result = result.withResult(true);
}

return result;
}

public final boolean HasConnectorToken(String input)
{
return input.equals("and");
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
package com.microsoft.recognizers.text.datetime.extractors;

import com.microsoft.recognizers.text.ExtractResult;
import com.microsoft.recognizers.text.datetime.Constants;
import com.microsoft.recognizers.text.datetime.extractors.config.ITimePeriodExtractorConfiguration;
import com.microsoft.recognizers.text.datetime.extractors.config.ResultIndex;
import com.microsoft.recognizers.text.datetime.utilities.Token;
import com.microsoft.recognizers.text.utilities.Match;
import com.microsoft.recognizers.text.utilities.RegExpUtility;
import com.microsoft.recognizers.text.utilities.StringUtility;

import java.time.LocalDateTime;
import java.util.*;
import java.util.regex.Pattern;

/**
 * Extracts time periods (for example "from 3 to 5am" or "between 3:30 and 5")
 * from text, driven by a language-specific
 * {@link ITimePeriodExtractorConfiguration}.
 *
 * <p>Three strategies contribute candidate spans, which are then merged:
 * the configured simple-case patterns, pairs of adjacent time points joined
 * by a connector, and time-of-day expressions.
 */
public class BaseTimePeriodExtractor implements IDateTimeExtractor {

    private final ITimePeriodExtractorConfiguration config;

    /**
     * @param config language-specific patterns and helper extractors
     */
    public BaseTimePeriodExtractor(ITimePeriodExtractorConfiguration config) {
        this.config = config;
    }

    @Override
    public String getExtractorName() {
        return Constants.SYS_DATETIME_TIMEPERIOD;
    }

    /**
     * Extracts every time period found in {@code input}.
     *
     * @param input     text to scan
     * @param reference reference date-time forwarded to the single-time extractor
     * @return the merged extraction results
     */
    @Override
    public List<ExtractResult> extract(String input, LocalDateTime reference) {
        List<Token> tokens = new ArrayList<>();
        tokens.addAll(matchSimpleCases(input));
        tokens.addAll(mergeTwoTimePoints(input, reference));
        tokens.addAll(matchTimeOfDay(input));

        return Token.mergeAllTokens(tokens, input, getExtractorName());
    }

    /**
     * Extracts every time period found in {@code input}, using the current
     * local date-time as the reference.
     */
    @Override
    public List<ExtractResult> extract(String input) {
        return this.extract(input, LocalDateTime.now());
    }

    // Cases like "from 3 to 5am" or "between 3:30 and 5" are extracted here.
    // Cases like "from 3 to 5" are not extracted here because there is no "am/pm" or "hh:mm"
    // to infer that it is a time period.
    // Cases like "from 3:30 to 4 people" should not be extracted as a time period either.
    private List<Token> matchSimpleCases(String input) {
        List<Token> ret = new ArrayList<>();

        for (Pattern regex : this.config.getSimpleCasesRegex()) {
            for (Match match : RegExpUtility.getMatches(regex, input)) {
                int matchEnd = match.index + match.length;

                // Cases like "from 10:30 to 11" don't necessarily need "am/pm".
                // A group counts as present only when its value is non-empty, which is the
                // same test the am/pm/desc branch below uses; a bare null check would treat
                // an empty group value as a captured minute or second.
                if (!StringUtility.isNullOrEmpty(match.getGroup(Constants.MinuteGroupName).value)
                        || !StringUtility.isNullOrEmpty(match.getGroup(Constants.SecondGroupName).value)) {
                    // "from 3:30 to 4" and "from 3:30 to 4 on 1/1/2015" are supported,
                    // "from 3:30 to 4 people" is not.
                    if (endsWithValidToken(input, matchEnd)) {
                        ret.add(new Token(match.index, matchEnd));
                    }
                } else {
                    // Without minutes or seconds, require an am/pm/description marker.
                    String pmStr = match.getGroup(Constants.PmGroupName).value;
                    String amStr = match.getGroup(Constants.AmGroupName).value;
                    String descStr = match.getGroup(Constants.DescGroupName).value;

                    if (!StringUtility.isNullOrEmpty(pmStr)
                            || !StringUtility.isNullOrEmpty(amStr)
                            || !StringUtility.isNullOrEmpty(descStr)) {
                        ret.add(new Token(match.index, matchEnd));
                    }
                }
            }
        }

        return ret;
    }

    // True when nothing follows position `end`, or when what follows matches the general
    // ending pattern or starts with the configured "token before date" (like "on").
    private boolean endsWithValidToken(String input, int end) {
        if (end == input.length()) {
            return true;
        }

        String afterStr = input.substring(end);
        Match[] endingMatches = RegExpUtility.getMatches(this.config.getGeneralEndingRegex(), afterStr);
        return endingMatches.length > 0 || afterStr.trim().startsWith(this.config.getTokenBeforeDate());
    }

    // Merges adjacent time points joined by a "till"-style connector, or by the connector
    // token when preceded by "between", into period tokens. Bare integers are promoted to
    // time points when they end the text or are followed by a connector and a time point.
    private List<Token> mergeTwoTimePoints(String input, LocalDateTime reference) {
        List<Token> ret = new ArrayList<>();

        // The reference is forwarded so the nested extractor sees the caller's date-time.
        // The result is copied because numbers may be added to it and it is re-sorted below;
        // the extractor's own list is left untouched.
        List<ExtractResult> ers = new ArrayList<>(this.config.getSingleTimeExtractor().extract(input, reference));

        // Handling ending number as a time point.
        List<ExtractResult> numErs = this.config.getIntegerExtractor().extract(input);
        Pattern tillRegex = this.config.getTillRegex();

        if (numErs.size() > 0) {
            List<ExtractResult> timeNumbers = new ArrayList<>();

            // Check whether the last number ends the text (ignoring what the general ending
            // pattern accepts).
            ExtractResult num = numErs.get(numErs.size() - 1);
            int lastNumEnd = num.start + num.length;
            boolean endingNumber = lastNumEnd == input.length();
            if (!endingNumber) {
                // Only the text AFTER the number is tested, not the whole input.
                String afterStr = input.substring(lastNumEnd);
                endingNumber = RegExpUtility.getMatches(this.config.getGeneralEndingRegex(), afterStr).length > 0;
            }

            if (endingNumber) {
                timeNumbers.add(num);
            }

            int i = 0;
            int j = 0;

            while (i < numErs.size()) {
                // Find the first time point that starts after this number.
                int numEndPoint = numErs.get(i).start + numErs.get(i).length;
                while (j < ers.size() && ers.get(j).start <= numEndPoint) {
                    j++;
                }

                if (j >= ers.size()) {
                    break;
                }

                // The text between the number and the time point must be exactly a connector.
                String midStr = input.substring(numEndPoint, ers.get(j).start);
                Match[] tillMatches = RegExpUtility.getMatches(tillRegex, midStr);
                if (tillMatches.length > 0 && tillMatches[0].length == midStr.trim().length()) {
                    timeNumbers.add(numErs.get(i));
                }

                i++;
            }

            // Add only the numbers whose start is not already covered by a time point.
            for (ExtractResult timeNum : timeNumbers) {
                boolean overlap = false;
                for (ExtractResult er : ers) {
                    if (er.start <= timeNum.start && er.start + er.length >= timeNum.start) {
                        overlap = true;
                    }
                }

                if (!overlap) {
                    ers.add(timeNum);
                }
            }

            ers.sort((x, y) -> Integer.compare(x.start, y.start));
        }

        int idx = 0;
        while (idx < ers.size() - 1) {
            ExtractResult first = ers.get(idx);
            ExtractResult second = ers.get(idx + 1);
            int middleBegin = first.start + first.length;
            int middleEnd = second.start;

            // Touching or overlapping results leave no room for a connector.
            if (middleEnd - middleBegin <= 0) {
                idx++;
                continue;
            }

            String middleStr = input.substring(middleBegin, middleEnd).trim().toLowerCase(java.util.Locale.ROOT);
            Match[] tillMatches = RegExpUtility.getMatches(tillRegex, middleStr);

            // Handle "{TimePoint} to {TimePoint}": the connector must span the whole gap.
            if (tillMatches.length > 0 && tillMatches[0].index == 0 && tillMatches[0].length == middleStr.length()) {
                int periodBegin = first.start;
                int periodEnd = second.start + second.length;

                // Extend the period backwards over a leading "from" or "between".
                String beforeStr = input.substring(0, periodBegin).trim().toLowerCase(java.util.Locale.ROOT);
                ResultIndex fromIndex = this.config.GetFromTokenIndex(beforeStr);
                ResultIndex betweenIndex = this.config.GetBetweenTokenIndex(beforeStr);
                if (fromIndex.result) {
                    periodBegin = fromIndex.index;
                } else if (betweenIndex.result) {
                    periodBegin = betweenIndex.index;
                }

                ret.add(new Token(periodBegin, periodEnd));
                idx += 2;
                continue;
            }

            // Handle "between {TimePoint} and {TimePoint}": valid only with a leading "between".
            if (this.config.HasConnectorToken(middleStr)) {
                int periodBegin = first.start;
                int periodEnd = second.start + second.length;

                String beforeStr = input.substring(0, periodBegin).trim().toLowerCase(java.util.Locale.ROOT);
                ResultIndex betweenIndex = this.config.GetBetweenTokenIndex(beforeStr);
                if (betweenIndex.result) {
                    periodBegin = betweenIndex.index;
                    ret.add(new Token(periodBegin, periodEnd));
                    idx += 2;
                    continue;
                }
            }

            idx++;
        }

        return ret;
    }

    // Emits one token per match of the configured time-of-day pattern.
    private List<Token> matchTimeOfDay(String input) {
        List<Token> ret = new ArrayList<>();
        for (Match match : RegExpUtility.getMatches(this.config.getTimeOfDayRegex(), input)) {
            ret.add(new Token(match.index, match.index + match.length));
        }

        return ret;
    }
}
Loading