fixed url recognition in text

git-svn-id: https://svn.d4science.research-infrastructures.eu/gcube/trunk/social-networking/social-util-library@132831 82a268e6-3cf1-43bd-a215-b396298e98cf
2016-10-06 14:54:45 +00:00 · 2016-10-06 14:54:45 +00:00 · af643cdc15
parent 483c585f14
commit af643cdc15
3 changed files with 58 additions and 18 deletions
--- a/distro/changelog.xml
+++ b/distro/changelog.xml
@ -2,6 +2,7 @@
 	<Changeset component="org.gcube.socialnetworking.social-util-library.1-1-0"
 		date="2016-10-01">
 		<Change>Hashtag regular expression updated (See ticket #4937)</Change>
 		<Change>Url regular expression updated</Change>
 	</Changeset>
 	<Changeset component="org.gcube.socialnetworking.social-util-library.1-0-0"
 		date="2016-06-01">
--- a/src/main/java/org/gcube/social_networking/socialutillibrary/Utils.java
+++ b/src/main/java/org/gcube/social_networking/socialutillibrary/Utils.java
@ -41,6 +41,21 @@ public class Utils {
 	 */
 	private static final String HASHTAG_REGEX = "^#\\w+([.]?\\w+)*|\\s#\\w+([.]?\\w+)*";
 	/**
 	 * Pattern used to locate a URL inside a piece of free text.
 	 * <p>
 	 * A match is made of, in order:
 	 * <ul>
 	 * <li>a prefix: {@code http://}, {@code https://}, {@code ftp://}, {@code ftps://},
 	 * {@code ~/}, {@code /} or the literal {@code www.};</li>
 	 * <li>an optional {@code user:password@} part;</li>
 	 * <li>a dotted host name ending with one of the listed top level domains or with
 	 * 2 to 5 lowercase letters;</li>
 	 * <li>an optional port (1 to 5 digits);</li>
 	 * <li>an optional path, query string and fragment, where {@code %XX} escapes must be
 	 * made of exactly two hexadecimal digits (either case).</li>
 	 * </ul>
 	 * The expression is anchored on word boundaries at both ends, so trailing punctuation
 	 * such as a closing parenthesis or a full stop is not part of the match.
 	 */
 	private static final Pattern URL_PATTERN = Pattern.compile(
 			// the dot after "www" is escaped: it must be a literal dot, not "any character"
 			"\\b(((ht|f)tp(s?)\\:\\/\\/|~\\/|\\/)|www\\.)" + 
 					"(\\w+:\\w+@)?(([-\\w]+\\.)+(com|org|net|gov" + 
 					"|mil|biz|info|mobi|name|aero|jobs|museum" + 
 					"|travel|[a-z]{2,5}))(:[\\d]{1,5})?" + 
 					"(((\\/([-\\w~!$+|.,=]|%[a-fA-F\\d]{2})+)+|\\/)+|\\?|#)?" + 
 					// the {2} quantifier sits outside the character class: %XX is two hex digits
 					"((\\?([-\\w~!$+|.,*:]|%[a-fA-F\\d]{2})+=?" + 
 					"([-\\w~!$+|.,*:=]|%[a-fA-F\\d]{2})*)" + 
 					"(&(?:[-\\w~!$+|.,*:]|%[a-fA-F\\d]{2})+=?" + 
 					"([-\\w~!$+|.,*:=]|%[a-fA-F\\d]{2})*)*)*" + 
 			"(#([-\\w~!$+|.,*:=]|%[a-fA-F\\d]{2})*)?\\b");
 	/**
 	 * 
 	 * @param preview
@ -143,7 +158,8 @@ public class Utils {
 	}
 	/**
-	 * utility method that extract an url ina text when you paste a link
+	 * utility method that extracts a url from a text when you paste a link.
 	 * It returns the first (if any) meaningful url among the ones available.
 	 * @param feedText
 	 * @return the text with the clickable url in it
 	 */
@ -154,14 +170,14 @@ public class Utils {
 		// Attempt to convert each item into an URL.   
 		for( String item : parts ) {
 			String toCheck = getHttpToken(item);
 			logger.debug("To check is " + toCheck);
 			if (toCheck != null) {
 				try {					
 					new URL(toCheck);
 					return toCheck;
 				} catch (MalformedURLException e) {
 					// If there was an URL then it's not valid
-					logger.error("MalformedURLException returning... ");
+					logger.error("MalformedURLException skipping token " + toCheck);
 					return null;
 				}
 			}
 		}
@ -224,8 +240,10 @@ public class Utils {
 					sb.append("<a class=\"link\" style=\"font-size:14px;\" href=\"").append(url).append("\" target=\"_blank\">").append(url).append("</a> ");    
 				} catch (MalformedURLException e) {
 					// If there was an URL then it's not valid
-					logger.error("MalformedURLException returning... ");
+					logger.error("MalformedURLException not converting token = " + toCheck);
-					return feedText;
+					sb.append(parts[i]);
 					sb.append(" ");
 					//return feedText;
 				}
 			} else {
 				sb.append(parts[i]);
@ -239,18 +257,31 @@ public class Utils {
 	 * @param item a text token
 	 * @return the actual http link
 	 */
-	public static String getHttpToken(String item) {
+	public static String getHttpToken(String originalItem) {
-		if (item.startsWith("http") || item.startsWith("www") || item.startsWith("(www") || item.startsWith("(http")) {
+
-			if (item.startsWith("(")) 
+		// apply pattern
-				item = item.substring(1, item.length());
+		String item = null;
-			if (item.endsWith(".") || item.endsWith(")")) { //sometimes people write the url and close the phrase with a .
+		Matcher matcher = URL_PATTERN.matcher(originalItem);
-				item = item.substring(0, item.length()-1);
+		if(matcher.find()){
-			}
+			logger.debug("Found match url " + matcher.group());
 			item = matcher.group();
 		}else
 			return null;
 		item = item.startsWith("www") ? "http://"+item : item;
 		logger.debug("getHttpToken returns -> " + item);
 		return item;
-		}
+
-		return null;
+		//		if (item.startsWith("http") || item.startsWith("www") || item.startsWith("(www") || item.startsWith("(http")) {
 		//			if (item.startsWith("(")) 
 		//				item = item.substring(1, item.length());
 		//			if (item.endsWith(".") || item.endsWith(")")) { //sometimes people write the url and close the phrase with a .
 		//				item = item.substring(0, item.length()-1);
 		//			}
 		//
 		//			return item;
 		//		}
 		//		return null;
 	}
 	/**
--- a/src/test/java/org/gcube/social_networking/socialutillibrary/TestUnit.java
+++ b/src/test/java/org/gcube/social_networking/socialutillibrary/TestUnit.java
@ -5,7 +5,6 @@ import java.util.List;
 public class TestUnit {
 	//@Test
 	public void testHashtag() {
 		String text = "This is a test with hashtag #T6 and #T6.1 but also #T6. that has '.' that is useless and #T43.43 and #gcube4.1.0gcore #gcube4.1.0";
@ -13,4 +12,13 @@ public class TestUnit {
 		System.out.println("Hashtags are " + hashtags);
 	}
 	//@Test
 	/**
 	 * Prints the outcome of {@code Utils.transformUrls} applied to a sample text that mixes
 	 * bare words, glued schemes, a parenthesised link and www-only links.
 	 */
 	public void extractUrl(){
 		final String sample = "http tosajndjsa :httphttps://www.google.tv www.google.cloud  www https http (http://digirolamo.com: www.google.it";
 		System.out.println("urls are " + Utils.transformUrls(sample));
 	}
 }