diff --git a/distro/changelog.xml b/distro/changelog.xml
index 28ced40..3d8e4f9 100644
--- a/distro/changelog.xml
+++ b/distro/changelog.xml
@@ -2,6 +2,7 @@
 Hashtag regular expression updated (See ticket #4937)
+Url regular expression updated
diff --git a/src/main/java/org/gcube/social_networking/socialutillibrary/Utils.java b/src/main/java/org/gcube/social_networking/socialutillibrary/Utils.java
index 5346fb0..bc3d260 100644
--- a/src/main/java/org/gcube/social_networking/socialutillibrary/Utils.java
+++ b/src/main/java/org/gcube/social_networking/socialutillibrary/Utils.java
@@ -41,6 +41,21 @@ public class Utils {
 	 */
 	private static final String HASHTAG_REGEX = "^#\\w+([.]?\\w+)*|\\s#\\w+([.]?\\w+)*";
 
+	/**
+	 * Pattern for URLs (http/https/ftp or www-prefixed) found inside a text token; compiled once and reused.
+	 */
+	private static final Pattern URL_PATTERN = Pattern.compile(
+			"\\b(((ht|f)tp(s?)\\:\\/\\/|~\\/|\\/)|www\\.)" +
+			"(\\w+:\\w+@)?(([-\\w]+\\.)+(com|org|net|gov" +
+			"|mil|biz|info|mobi|name|aero|jobs|museum" +
+			"|travel|[a-z]{2,5}))(:[\\d]{1,5})?" +
+			"(((\\/([-\\w~!$+|.,=]|%[a-fA-F\\d]{2})+)+|\\/)+|\\?|#)?" +
+			"((\\?([-\\w~!$+|.,*:]|%[a-fA-F\\d]{2})+=?" +
+			"([-\\w~!$+|.,*:=]|%[a-fA-F\\d]{2})*)" +
+			"(&(?:[-\\w~!$+|.,*:]|%[a-fA-F\\d]{2})+=?" +
+			"([-\\w~!$+|.,*:=]|%[a-fA-F\\d]{2})*)*)*" +
+			"(#([-\\w~!$+|.,*:=]|%[a-fA-F\\d]{2})*)?\\b");
+
 	/**
 	 * 
 	 * @param preview
@@ -143,7 +158,8 @@ public class Utils {
 	}
 
 	/**
-	 * utility method that extract an url ina text when you paste a link
+	 * utility method that extracts an url in a text when you paste a link.
+	 * It returns the first (if any) well-formed url among the ones available.
 	 * @param feedText
 	 * @return the text with the clickable url in it
 	 */
@@ -154,14 +170,14 @@ public class Utils {
 		// Attempt to convert each item into an URL.
 		for( String item : parts ) {
 			String toCheck = getHttpToken(item);
+			logger.debug("To check is " + toCheck);
 			if (toCheck != null) {
 				try {
 					new URL(toCheck);
 					return toCheck;
 				} catch (MalformedURLException e) {
 					// If there was an URL then it's not valid
-					logger.error("MalformedURLException returning... ");
-					return null;
+					logger.error("MalformedURLException skipping token " + toCheck);
 				}
 			}
 		}
@@ -224,8 +240,9 @@ public class Utils {
 					sb.append("").append(url).append(" ");
 				} catch (MalformedURLException e) {
 					// If there was an URL then it's not valid
-					logger.error("MalformedURLException returning... ");
-					return feedText;
+					logger.error("MalformedURLException not converting token = " + toCheck);
+					sb.append(parts[i]);
+					sb.append(" ");
 				}
 			} else {
 				sb.append(parts[i]);
@@ -239,18 +256,20 @@ public class Utils {
 	 * @param item a text token
 	 * @return the actual http link
 	 */
 	public static String getHttpToken(String item) {
-		if (item.startsWith("http") || item.startsWith("www") || item.startsWith("(www") || item.startsWith("(http")) {
-			if (item.startsWith("("))
-				item = item.substring(1, item.length());
-			if (item.endsWith(".") || item.endsWith(")")) { //sometimes people write the url and close the phrase with a .
-				item = item.substring(0, item.length()-1);
-			}
-			item = item.startsWith("www") ? "http://"+item : item;
-			logger.debug("getHttpToken returns -> " + item);
-			return item;
-		}
-		return null;
+		if (item == null) {
+			return null;
+		}
+		Matcher matcher = URL_PATTERN.matcher(item);
+		if (!matcher.find()) {
+			return null;
+		}
+		String url = matcher.group();
+		logger.debug("Found match url " + url);
+		// a bare "www..." match has no scheme, which new URL(...) in the callers rejects: default to http
+		url = url.startsWith("www") ? "http://" + url : url;
+		logger.debug("getHttpToken returns -> " + url);
+		return url;
 	}
 
 	/**
diff --git a/src/test/java/org/gcube/social_networking/socialutillibrary/TestUnit.java b/src/test/java/org/gcube/social_networking/socialutillibrary/TestUnit.java
index 926e8cc..6e6c364 100644
--- a/src/test/java/org/gcube/social_networking/socialutillibrary/TestUnit.java
+++ b/src/test/java/org/gcube/social_networking/socialutillibrary/TestUnit.java
@@ -5,7 +5,6 @@ import java.util.List;
 
 public class TestUnit {
 
-	//@Test
 	public void testHashtag() {
 
 		String text = "This is a test with hashtag #T6 and #T6.1 but also #T6. that has '.' that is useless and #T43.43 and #gcube4.1.0gcore #gcube4.1.0";
@@ -13,4 +12,13 @@ public class TestUnit {
 		System.out.println("Hashtags are " + hashtags);
 	}
 
+	//@Test
+	public void extractUrl(){
+
+		String url = "http tosajndjsa :httphttps://www.google.tv www.google.cloud www https http (http://digirolamo.com: www.google.it";
+		String result = Utils.transformUrls(url);
+		System.out.println("urls are " + result);
+	}
+
+
 }