2019-01-24 11:50:00 +01:00
|
|
|
package org.gcube.socialnetworking.tokenization;
|
2019-01-23 16:30:00 +01:00
|
|
|
|
2019-01-23 16:36:14 +01:00
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.regex.Matcher;
|
|
|
|
import java.util.regex.Pattern;
|
2019-01-23 16:30:00 +01:00
|
|
|
|
2021-01-15 10:08:33 +01:00
|
|
|
/**
|
|
|
|
* @author Luca Frosini (ISTI - CNR)
|
|
|
|
*/
|
2019-01-24 11:50:55 +01:00
|
|
|
public class GCubeStringTokenizer {
|
2019-01-23 16:30:00 +01:00
|
|
|
|
2019-01-24 11:55:50 +01:00
|
|
|
/**
|
|
|
|
* The default delimiter regex is any whitespaces '\s'
|
|
|
|
*/
|
|
|
|
public static final String DEFAULT_DELIMITER_REGEX = "\\s";
|
|
|
|
|
2019-01-23 16:36:14 +01:00
|
|
|
private final String originalString;
|
2019-01-24 11:55:50 +01:00
|
|
|
private final String delimiterRegex;
|
2019-01-23 16:36:14 +01:00
|
|
|
|
|
|
|
private Pattern pattern;
|
|
|
|
private Matcher matcher;
|
|
|
|
|
|
|
|
private List<Token> tokens;
|
|
|
|
|
2019-01-24 11:50:55 +01:00
|
|
|
public GCubeStringTokenizer(String string) {
|
2019-01-24 11:55:50 +01:00
|
|
|
this(string, DEFAULT_DELIMITER_REGEX);
|
|
|
|
}
|
|
|
|
|
|
|
|
public GCubeStringTokenizer(String string, String delimiterRegex) {
|
2019-01-23 16:36:14 +01:00
|
|
|
this.originalString = string;
|
2019-01-24 11:55:50 +01:00
|
|
|
this.delimiterRegex = delimiterRegex;
|
|
|
|
this.pattern = Pattern.compile(delimiterRegex);
|
2019-01-23 16:36:14 +01:00
|
|
|
this.matcher = pattern.matcher(originalString);
|
|
|
|
}
|
|
|
|
|
|
|
|
protected Token getToken(int tokenStart) {
|
|
|
|
int tokenEnd = matcher.start();
|
|
|
|
int delimiterStart = tokenEnd;
|
|
|
|
int delimiterEnd = matcher.end();
|
|
|
|
String tokenString = originalString.substring(tokenStart, tokenEnd);
|
|
|
|
String delimiter = originalString.substring(delimiterStart, delimiterEnd);
|
|
|
|
Token token = new Token(tokenString, delimiter, tokenStart, tokenEnd);
|
|
|
|
return token;
|
|
|
|
}
|
|
|
|
|
|
|
|
public List<Token> getTokens() {
|
|
|
|
if(tokens==null) {
|
|
|
|
tokens = new ArrayList<>();
|
|
|
|
int tokenStart = 0;
|
|
|
|
while(matcher.find()) {
|
|
|
|
Token token = getToken(tokenStart);
|
|
|
|
tokens.add(token);
|
|
|
|
tokenStart = matcher.end();
|
|
|
|
}
|
|
|
|
if(tokenStart!=originalString.length()){
|
|
|
|
int tokenEnd = originalString.length();
|
|
|
|
String tokenString = originalString.substring(tokenStart, tokenEnd);
|
|
|
|
Token token = new Token(tokenString, "", tokenStart, tokenEnd);
|
|
|
|
tokens.add(token);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return tokens;
|
2019-01-23 16:30:00 +01:00
|
|
|
}
|
|
|
|
|
2019-01-24 11:55:50 +01:00
|
|
|
public String getOriginalString() {
|
|
|
|
return originalString;
|
|
|
|
}
|
|
|
|
|
|
|
|
public String getDelimiterRegex() {
|
|
|
|
return delimiterRegex;
|
|
|
|
}
|
|
|
|
|
2019-01-23 16:30:00 +01:00
|
|
|
}
|