fixed url recognition in text

git-svn-id: https://svn.d4science.research-infrastructures.eu/gcube/trunk/social-networking/social-util-library@132831 82a268e6-3cf1-43bd-a215-b396298e98cf
This commit is contained in:
Costantino Perciante 2016-10-06 14:54:45 +00:00
parent 483c585f14
commit af643cdc15
3 changed files with 58 additions and 18 deletions

View File

@ -2,6 +2,7 @@
<Changeset component="org.gcube.socialnetworking.social-util-library.1-1-0"
date="2016-10-01">
<Change>Hashtag regular expression updated (See ticket #4937)</Change>
<Change>Url regular expression updated</Change>
</Changeset>
<Changeset component="org.gcube.socialnetworking.social-util-library.1-0-0"
date="2016-06-01">

View File

@ -41,6 +41,21 @@ public class Utils {
*/
private static final String HASHTAG_REGEX = "^#\\w+([.]?\\w+)*|\\s#\\w+([.]?\\w+)*";
/**
* Pattern used to find urls inside free text.
* A match is made of, in order:
* a prefix (http://, https://, ftp://, ftps://, "~/", "/" or "www."),
* optional "user:password@" credentials,
* a dotted host name ending with one of the listed top level domains or with 2 to 5 lowercase letters,
* an optional port (1 to 5 digits),
* an optional path, an optional query string and an optional fragment.
* Percent escapes are accepted with both lowercase and uppercase hexadecimal digits (e.g. %2f and %2F).
*/
private static final Pattern URL_PATTERN = Pattern.compile(
	// prefix: the dot after "www" is escaped so that only a literal "www." is accepted
	"\\b(((ht|f)tp(s?)\\:\\/\\/|~\\/|\\/)|www\\.)" +
	// optional credentials, then host name and top level domain
	"(\\w+:\\w+@)?(([-\\w]+\\.)+(com|org|net|gov" +
	"|mil|biz|info|mobi|name|aero|jobs|museum" +
	// optional port
	"|travel|[a-z]{2,5}))(:[\\d]{1,5})?" +
	// optional path
	"(((\\/([-\\w~!$+|.,=]|%[a-fA-F\\d]{2})+)+|\\/)+|\\?|#)?" +
	// first query parameter: the {2} quantifier sits outside the character class,
	// so a percent escape is exactly '%' followed by two hexadecimal digits
	"((\\?([-\\w~!$+|.,*:]|%[a-fA-F\\d]{2})+=?" +
	"([-\\w~!$+|.,*:=]|%[a-fA-F\\d]{2})*)" +
	// following query parameters, separated by '&'
	"(&(?:[-\\w~!$+|.,*:]|%[a-fA-F\\d]{2})+=?" +
	"([-\\w~!$+|.,*:=]|%[a-fA-F\\d]{2})*)*)*" +
	// optional fragment
	"(#([-\\w~!$+|.,*:=]|%[a-fA-F\\d]{2})*)?\\b");
/**
*
* @param preview
@ -143,7 +158,8 @@ public class Utils {
}
/**
* utility method that extract an url ina text when you paste a link
* utility method that extracts a url in a text when you paste a link.
* It returns the first (if any) meaningful url among the ones available.
* @param feedText
* @return the first valid url found in the text, if any
*/
@ -154,14 +170,14 @@ public class Utils {
// Attempt to convert each item into an URL.
for( String item : parts ) {
String toCheck = getHttpToken(item);
logger.debug("To check is " + toCheck);
if (toCheck != null) {
try {
new URL(toCheck);
return toCheck;
} catch (MalformedURLException e) {
// If there was an URL then it's not valid
logger.error("MalformedURLException returning... ");
return null;
logger.error("MalformedURLException skipping token " + toCheck);
}
}
}
@ -224,8 +240,10 @@ public class Utils {
sb.append("<a class=\"link\" style=\"font-size:14px;\" href=\"").append(url).append("\" target=\"_blank\">").append(url).append("</a> ");
} catch (MalformedURLException e) {
// If there was an URL then it's not valid
logger.error("MalformedURLException returning... ");
return feedText;
logger.error("MalformedURLException not converting token = " + toCheck);
sb.append(parts[i]);
sb.append(" ");
//return feedText;
}
} else {
sb.append(parts[i]);
@ -239,18 +257,31 @@ public class Utils {
* @param originalItem a text token
* @return the actual http link, or null if the token contains no url
*/
public static String getHttpToken(String item) {
if (item.startsWith("http") || item.startsWith("www") || item.startsWith("(www") || item.startsWith("(http")) {
if (item.startsWith("("))
item = item.substring(1, item.length());
if (item.endsWith(".") || item.endsWith(")")) { //sometimes people write the url and close the phrase with a .
item = item.substring(0, item.length()-1);
}
item = item.startsWith("www") ? "http://"+item : item;
logger.debug("getHttpToken returns -> " + item);
return item;
}
return null;
/**
 * Extracts from a text token the first url recognized by URL_PATTERN.
 * When the matched text starts with "www", "http://" is prepended to it.
 *
 * @param originalItem a text token, may be null
 * @return the matched url (with "http://" prepended when it starts with "www"),
 *         or null when the token is null or no url is found in it
 */
public static String getHttpToken(String originalItem) {
	// a null token cannot contain a url: answer "no match" instead of failing with a NullPointerException
	if (originalItem == null) {
		return null;
	}

	Matcher matcher = URL_PATTERN.matcher(originalItem);
	if (!matcher.find()) {
		return null;
	}

	// only the first match within the token is considered
	String item = matcher.group();
	logger.debug("Found match url " + item);

	// the pattern also accepts urls written without the scheme (www.example.org)
	if (item.startsWith("www")) {
		item = "http://" + item;
	}
	logger.debug("getHttpToken returns -> " + item);
	return item;
}
/**

View File

@ -5,7 +5,6 @@ import java.util.List;
public class TestUnit {
//@Test
public void testHashtag() {
String text = "This is a test with hashtag #T6 and #T6.1 but also #T6. that has '.' that is useless and #T43.43 and #gcube4.1.0gcore #gcube4.1.0";
@ -13,4 +12,13 @@ public class TestUnit {
System.out.println("Hashtags are " + hashtags);
}
//@Test
/**
 * Manual check: passes a sample text mixing well formed and broken url tokens
 * to Utils.transformUrls and prints what comes back.
 */
public void extractUrl() {
	final String sampleText = "http tosajndjsa :httphttps://www.google.tv www.google.cloud www https http (http://digirolamo.com: www.google.it";
	System.out.println("urls are " + Utils.transformUrls(sampleText));
}
}