fixed url recognition in text

git-svn-id: https://svn.d4science.research-infrastructures.eu/gcube/trunk/social-networking/social-util-library@132831 82a268e6-3cf1-43bd-a215-b396298e98cf
This commit is contained in:
Costantino Perciante 2016-10-06 14:54:45 +00:00
parent 483c585f14
commit af643cdc15
3 changed files with 58 additions and 18 deletions

View File

@ -2,6 +2,7 @@
<Changeset component="org.gcube.socialnetworking.social-util-library.1-1-0" <Changeset component="org.gcube.socialnetworking.social-util-library.1-1-0"
date="2016-10-01"> date="2016-10-01">
<Change>Hashtag regular expression updated (See ticket #4937)</Change> <Change>Hashtag regular expression updated (See ticket #4937)</Change>
<Change>Url regular expression updated</Change>
</Changeset> </Changeset>
<Changeset component="org.gcube.socialnetworking.social-util-library.1-0-0" <Changeset component="org.gcube.socialnetworking.social-util-library.1-0-0"
date="2016-06-01"> date="2016-06-01">

View File

@ -41,6 +41,21 @@ public class Utils {
*/ */
private static final String HASHTAG_REGEX = "^#\\w+([.]?\\w+)*|\\s#\\w+([.]?\\w+)*"; private static final String HASHTAG_REGEX = "^#\\w+([.]?\\w+)*|\\s#\\w+([.]?\\w+)*";
/**
 * Pattern used to locate a URL inside a piece of text.
 * <p>
 * A match is made of, in order:
 * <ul>
 * <li>a mandatory prefix: {@code http://}, {@code https://}, {@code ftp://},
 * {@code ftps://}, {@code ~/}, {@code /} or a literal {@code www.};</li>
 * <li>optional {@code user:password@} credentials;</li>
 * <li>a dotted host name whose last label is one of the listed top level
 * domains or 2 to 5 lowercase letters;</li>
 * <li>an optional port, path, query string and fragment.</li>
 * </ul>
 * Percent-encoded octets must be exactly two hexadecimal digits; both lower
 * and upper case digits are accepted.
 */
private static final Pattern URL_PATTERN = Pattern.compile(
		// prefix: the dot after "www" is escaped so that it only matches a real '.'
		"\\b(((ht|f)tp(s?)\\:\\/\\/|~\\/|\\/)|www\\.)" +
		// optional credentials, then host name and top level domain
		"(\\w+:\\w+@)?(([-\\w]+\\.)+(com|org|net|gov" +
		"|mil|biz|info|mobi|name|aero|jobs|museum" +
		// optional port
		"|travel|[a-z]{2,5}))(:[\\d]{1,5})?" +
		// optional path
		"(((\\/([-\\w~!$+|.,=]|%[a-fA-F\\d]{2})+)+|\\/)+|\\?|#)?" +
		// optional query string: first parameter ...
		"((\\?([-\\w~!$+|.,*:]|%[a-fA-F\\d]{2})+=?" +
		"([-\\w~!$+|.,*:=]|%[a-fA-F\\d]{2})*)" +
		// ... followed by any number of '&' separated parameters
		"(&(?:[-\\w~!$+|.,*:]|%[a-fA-F\\d]{2})+=?" +
		"([-\\w~!$+|.,*:=]|%[a-fA-F\\d]{2})*)*)*" +
		// optional fragment
		"(#([-\\w~!$+|.,*:=]|%[a-fA-F\\d]{2})*)?\\b");
/** /**
* *
* @param preview * @param preview
@ -143,7 +158,8 @@ public class Utils {
} }
/** /**
* utility method that extract an url ina text when you paste a link * utility method that extract an url ina text when you paste a link.
* It returns the first (if any) meaningful url among the ones available.
* @param feedText * @param feedText
* @return the text with the clickable url in it * @return the text with the clickable url in it
*/ */
@ -154,14 +170,14 @@ public class Utils {
// Attempt to convert each item into an URL. // Attempt to convert each item into an URL.
for( String item : parts ) { for( String item : parts ) {
String toCheck = getHttpToken(item); String toCheck = getHttpToken(item);
logger.debug("To check is " + toCheck);
if (toCheck != null) { if (toCheck != null) {
try { try {
new URL(toCheck); new URL(toCheck);
return toCheck; return toCheck;
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
// If there was an URL then it's not valid // If there was an URL then it's not valid
logger.error("MalformedURLException returning... "); logger.error("MalformedURLException skipping token " + toCheck);
return null;
} }
} }
} }
@ -224,8 +240,10 @@ public class Utils {
sb.append("<a class=\"link\" style=\"font-size:14px;\" href=\"").append(url).append("\" target=\"_blank\">").append(url).append("</a> "); sb.append("<a class=\"link\" style=\"font-size:14px;\" href=\"").append(url).append("\" target=\"_blank\">").append(url).append("</a> ");
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
// If there was an URL then it's not valid // If there was an URL then it's not valid
logger.error("MalformedURLException returning... "); logger.error("MalformedURLException not converting token = " + toCheck);
return feedText; sb.append(parts[i]);
sb.append(" ");
//return feedText;
} }
} else { } else {
sb.append(parts[i]); sb.append(parts[i]);
@ -239,18 +257,31 @@ public class Utils {
* @param item a text token * @param item a text token
* @return the actual http link * @return the actual http link
*/ */
public static String getHttpToken(String item) { public static String getHttpToken(String originalItem) {
if (item.startsWith("http") || item.startsWith("www") || item.startsWith("(www") || item.startsWith("(http")) {
if (item.startsWith("(")) // apply pattern
item = item.substring(1, item.length()); String item = null;
if (item.endsWith(".") || item.endsWith(")")) { //sometimes people write the url and close the phrase with a . Matcher matcher = URL_PATTERN.matcher(originalItem);
item = item.substring(0, item.length()-1); if(matcher.find()){
} logger.debug("Found match url " + matcher.group());
item = matcher.group();
}else
return null;
item = item.startsWith("www") ? "http://"+item : item; item = item.startsWith("www") ? "http://"+item : item;
logger.debug("getHttpToken returns -> " + item); logger.debug("getHttpToken returns -> " + item);
return item; return item;
}
return null; // if (item.startsWith("http") || item.startsWith("www") || item.startsWith("(www") || item.startsWith("(http")) {
// if (item.startsWith("("))
// item = item.substring(1, item.length());
// if (item.endsWith(".") || item.endsWith(")")) { //sometimes people write the url and close the phrase with a .
// item = item.substring(0, item.length()-1);
// }
//
// return item;
// }
// return null;
} }
/** /**

View File

@ -5,7 +5,6 @@ import java.util.List;
public class TestUnit { public class TestUnit {
//@Test //@Test
public void testHashtag() { public void testHashtag() {
String text = "This is a test with hashtag #T6 and #T6.1 but also #T6. that has '.' that is useless and #T43.43 and #gcube4.1.0gcore #gcube4.1.0"; String text = "This is a test with hashtag #T6 and #T6.1 but also #T6. that has '.' that is useless and #T43.43 and #gcube4.1.0gcore #gcube4.1.0";
@ -13,4 +12,13 @@ public class TestUnit {
System.out.println("Hashtags are " + hashtags); System.out.println("Hashtags are " + hashtags);
} }
/**
 * Manual check of the url recognition: feeds a sample sentence mixing
 * plain words and url-like tokens to {@code Utils.transformUrls} and prints
 * what comes back. The test annotation below is commented out.
 */
//@Test
public void extractUrl(){
	final String sampleText = "http tosajndjsa :httphttps://www.google.tv www.google.cloud www https http (http://digirolamo.com: www.google.it";
	System.out.println("urls are " + Utils.transformUrls(sampleText));
}
} }