413 lines
15 KiB
Java
413 lines
15 KiB
Java
package org.gcube.portlets.user.shareupdates.server;
|
|
|
|
import java.io.BufferedReader;
|
|
import java.io.IOException;
|
|
import java.io.InputStreamReader;
|
|
import java.net.MalformedURLException;
|
|
import java.net.URL;
|
|
import java.net.URLConnection;
|
|
import java.nio.charset.Charset;
|
|
import java.util.ArrayList;
|
|
import java.util.Collections;
|
|
import java.util.List;
|
|
import java.util.regex.Matcher;
|
|
import java.util.regex.Pattern;
|
|
|
|
import javax.net.ssl.SSLContext;
|
|
import javax.net.ssl.TrustManager;
|
|
import javax.net.ssl.X509TrustManager;
|
|
|
|
import org.apache.commons.codec.binary.Base64;
|
|
import org.gcube.portal.databook.client.GCubeSocialNetworking;
|
|
import org.gcube.portlets.user.shareupdates.server.metaseeker.MetaSeeker;
|
|
import org.gcube.portlets.user.shareupdates.server.opengraph.OpenGraph;
|
|
import org.gcube.portlets.user.shareupdates.shared.LinkPreview;
|
|
import org.gcube.portlets.widgets.pickitem.shared.ItemBean;
|
|
import org.htmlcleaner.HtmlCleaner;
|
|
import org.htmlcleaner.TagNode;
|
|
import org.htmlparser.beans.StringBean;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import com.sun.net.ssl.HttpsURLConnection;
|
|
/**
|
|
* this class contains utility method for parsing and trasforming users pasted text containing URLs and other utility methods
|
|
* @author Massimiliano Assante, ISTI-CNR
|
|
*
|
|
*/
|
|
@SuppressWarnings({ "deprecation", "restriction" })
|
|
public class TextTransfromUtils {
|
|
/**
|
|
*
|
|
*/
|
|
private static Logger _log = LoggerFactory.getLogger(ShareUpdateServiceImpl.class);
|
|
|
|
/**
|
|
*
|
|
* @param preview
|
|
* @return
|
|
*/
|
|
protected static String convertFileNameAnchorHTML(String url) {
|
|
StringBuilder sb = new StringBuilder();
|
|
sb.append("<span style=\"color:gray; font-size:12px;\">shared </span><a class=\"link\" href=\"").append(url).append("\" target=\"_blank\">").append("a file.").append("</a> ").toString();
|
|
return sb.toString();
|
|
}
|
|
|
|
|
|
/**
|
|
* convert the mentioned people in HTML anchor and also Encode the params Base64
|
|
* @param escapedFeedText
|
|
* @param taggedPeople
|
|
* @return
|
|
*/
|
|
protected static String convertMentionPeopleAnchorHTML(String escapedFeedText, List<ItemBean> taggedPeople) {
|
|
for (ItemBean tagged : taggedPeople) {
|
|
String taggedHTML = "<a class=\"link\" style=\"font-size:14px;\" href=\""+GCubeSocialNetworking.USER_PROFILE_LINK
|
|
+"?"+
|
|
new String(Base64.encodeBase64(GCubeSocialNetworking.USER_PROFILE_OID.getBytes()))+"="+
|
|
new String(Base64.encodeBase64(tagged.getName().getBytes()))+"\">"+tagged.getAlternativeName()+"</a> ";
|
|
escapedFeedText = escapedFeedText.replace(tagged.getAlternativeName(), taggedHTML);
|
|
}
|
|
return escapedFeedText;
|
|
}
|
|
|
|
/**
|
|
* utility method that extract the hashtags from a text
|
|
* @param postText
|
|
* @return the list of hashtags present in the text
|
|
*/
|
|
protected static List<String> getHashTags(String postText) {
|
|
List<String> hashtags = new ArrayList<>();
|
|
Pattern MY_PATTERN = Pattern.compile("#(\\w+)");
|
|
Matcher matcher = MY_PATTERN.matcher(postText);
|
|
while (matcher.find()) {
|
|
hashtags.add("#"+matcher.group(1));
|
|
}
|
|
return hashtags;
|
|
}
|
|
/**
|
|
* convert the hashtag in HTML anchor and also Encode the params Base64
|
|
* @param escapedFeedText
|
|
* @param hashtags
|
|
* @return
|
|
*/
|
|
protected static String convertHashtagsAnchorHTML(String escapedFeedText, List<String> hashtags) {
|
|
for (String hashtag : hashtags) {
|
|
String taggedHTML = "<a class=\"link\" style=\"font-size:14px;\" href=\"?"+
|
|
new String(Base64.encodeBase64(GCubeSocialNetworking.HASHTAG_OID.getBytes()))+"="+
|
|
new String(Base64.encodeBase64(hashtag.getBytes()))+"\">"+hashtag+"</a>";
|
|
|
|
//dirty trick: double replace because \\boundaries do not accept # char
|
|
final String placeholder = "TO_REPLACE_4321234";
|
|
String first = escapedFeedText.replaceAll("(?i)\\b"+hashtag.substring(1)+"\\b", placeholder);
|
|
escapedFeedText = first.replaceAll("#"+placeholder, taggedHTML);
|
|
}
|
|
return escapedFeedText;
|
|
}
|
|
|
|
/**
|
|
* generate the description parsing the content (Best Guess)
|
|
* @param link the link to check
|
|
* @return the description guessed
|
|
*/
|
|
private static String createDescriptionFromContent(String link) {
|
|
StringBean sb = new StringBean();
|
|
sb.setURL(link);
|
|
sb.setLinks(false);
|
|
String description = sb.getStrings();
|
|
description = ((description.length() > 256) ? description.substring(0, 256)+"..." : description);
|
|
return description;
|
|
}
|
|
/**
|
|
* Escape an html string. Escaping data received from the client helps to
|
|
* prevent cross-site script vulnerabilities.
|
|
*
|
|
* @param html the html string to escape
|
|
* @return the escaped string
|
|
*/
|
|
protected static String escapeHtmlAndTransformUrl(String html) {
|
|
if (html == null) {
|
|
return null;
|
|
}
|
|
String toReturn = html.replaceAll("&", "&").replaceAll("<", "<")
|
|
.replaceAll(">", ">");
|
|
|
|
// replace all the line breaks by <br/>
|
|
toReturn = toReturn.replaceAll("(\r\n|\n)"," <br/> ");
|
|
//transfrom the URL in a clickable URL
|
|
toReturn = transformUrls(toReturn);
|
|
// then replace all the double spaces by the html version
|
|
toReturn = toReturn.replaceAll("\\s\\s"," ");
|
|
return toReturn;
|
|
}
|
|
|
|
/**
|
|
* utility method that extract an url ina text when you paste a link
|
|
* @param feedText
|
|
* @return the text with the clickable url in it
|
|
*/
|
|
protected static String extractURL(String feedText) {
|
|
// separate input by spaces ( URLs have no spaces )
|
|
feedText = feedText.replaceAll("(\r\n|\n)"," <br/> ");
|
|
String [] parts = feedText.split("\\s");
|
|
// Attempt to convert each item into an URL.
|
|
for( String item : parts ) {
|
|
String toCheck = getHttpToken(item);
|
|
if (toCheck != null) {
|
|
try {
|
|
new URL(toCheck);
|
|
return toCheck;
|
|
} catch (MalformedURLException e) {
|
|
// If there was an URL then it's not valid
|
|
_log.error("MalformedURLException returning... ");
|
|
return null;
|
|
}
|
|
}
|
|
}
|
|
return null;
|
|
}
|
|
/**
|
|
* check the tokens of a pasted text and see if there's any http link in it
|
|
* @param item a text token
|
|
* @return the actual http link
|
|
*/
|
|
private static String getHttpToken(String item) {
|
|
if (item.startsWith("http") || item.startsWith("www") || item.startsWith("(www") || item.startsWith("(http")) {
|
|
if (item.startsWith("("))
|
|
item = item.substring(1, item.length());
|
|
if (item.endsWith(".") || item.endsWith(")")) { //sometimes people write the url and close the phrase with a .
|
|
item = item.substring(0, item.length()-1);
|
|
}
|
|
item = item.startsWith("www") ? "http://"+item : item;
|
|
System.out.println("getHttpToken returns -> " + item);
|
|
return item;
|
|
}
|
|
return null;
|
|
}
|
|
/**
|
|
* try with HtmlCleaner API to read the images
|
|
* @param pageURL
|
|
* @return the title of the page or null if can't read it
|
|
* @throws IOException
|
|
*/
|
|
protected static ArrayList<String> getImagesWithCleaner(URL pageURL) throws IOException {
|
|
ArrayList<String> images = new ArrayList<String>();
|
|
URLConnection conn = pageURL.openConnection();
|
|
//pretend you're a browser (make my request from Java more “browsery-like”.)
|
|
conn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
|
|
|
|
Charset charset = OpenGraph.getConnectionCharset(conn);
|
|
BufferedReader dis = new BufferedReader(new InputStreamReader(conn.getInputStream(), charset));
|
|
String inputLine;
|
|
StringBuffer headContents = new StringBuffer();
|
|
|
|
// Loop through each line, looking for the closing head element
|
|
while ((inputLine = dis.readLine()) != null) {
|
|
headContents.append(inputLine + "\r\n");
|
|
}
|
|
|
|
String headContentsStr = headContents.toString();
|
|
HtmlCleaner cleaner = new HtmlCleaner();
|
|
// parse the string HTML
|
|
TagNode pageData = cleaner.clean(headContentsStr);
|
|
// open only the title tags
|
|
TagNode[] imgs = pageData.getElementsByName("img", true);
|
|
int upTo = (imgs.length > 15) ? 15 : imgs.length;
|
|
for (int i = 0; i < upTo; i++) {
|
|
if (imgs[i].hasAttribute("src")) {
|
|
String imageUrl = getImageUrlFromSrcAttribute(pageURL, imgs[i].getAttributeByName("src"));
|
|
images.add(imageUrl);
|
|
_log.trace("[FOUND image] " + imageUrl);
|
|
}
|
|
}
|
|
return images;
|
|
}
|
|
/**
|
|
* There are several ways to refer an image in a HTML, this method use an heuristic to get the actual image url
|
|
* @param pageURL the url
|
|
* @param srcAttr the content of the img src attribute
|
|
* @return the image url ready to be referred outside native environment
|
|
*/
|
|
protected static String getImageUrlFromSrcAttribute(URL pageURL, String srcAttr) {
|
|
String imageUrl = srcAttr;
|
|
_log.trace("imageUrl="+imageUrl);
|
|
if (imageUrl.startsWith("http") || imageUrl.startsWith("//")) { // double slash inherits the protocol
|
|
_log.trace("Direct link case");
|
|
return imageUrl;
|
|
}
|
|
if (imageUrl.startsWith("/")) {//referred as absolute path case
|
|
_log.trace("Absolute Path case");
|
|
imageUrl = pageURL.getProtocol()+"://"+pageURL.getHost()+imageUrl;
|
|
}
|
|
else if (imageUrl.startsWith("../")) { //relative path case
|
|
_log.trace("Relative Path case");
|
|
String imageFolder = pageURL.toString().substring(0, pageURL.toString().lastIndexOf("/"));
|
|
imageUrl= imageFolder + "/" + imageUrl;
|
|
}
|
|
else if (!imageUrl.contains("/") || !imageUrl.startsWith("/")) { //the image is probably in the same folder or in a path starting from the last slash
|
|
_log.trace("probably in the same folder");
|
|
// e.g. http://www.adomain.com/docrep/018/i3328e/i3328e00.htm?utm_source
|
|
String checkedURL = pageURL.toString();
|
|
if (! checkedURL.endsWith("/")) {
|
|
checkedURL+="/";
|
|
}
|
|
String imageFolder = pageURL.toString().substring(0, pageURL.toString().lastIndexOf("/"));
|
|
//it means the url was sth like http://www.asite.com without ending slash
|
|
if (imageFolder.compareToIgnoreCase("http:/") == 0 || imageFolder.compareToIgnoreCase("https:/") == 0) {
|
|
imageFolder = pageURL.toString();
|
|
}
|
|
imageUrl= imageFolder + "/" + imageUrl;
|
|
}
|
|
else if (!imageUrl.startsWith("http") ) { //e.g. http://adomain.com/anImage.png
|
|
_log.trace("In the root");
|
|
imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
|
|
}
|
|
return imageUrl;
|
|
}
|
|
|
|
/**
|
|
* to use when OpenGraph is not available, Tries Metadata first, then Best guess from page content
|
|
* @param pageUrl
|
|
* @param link
|
|
* @param host
|
|
* @return a LinPreview object instance filled with the extracted information
|
|
* @throws IOException
|
|
*/
|
|
protected static LinkPreview getInfoFromHTML(URLConnection connection, URL pageUrl, String link, String host) throws Exception {
|
|
LinkPreview toReturn = null;
|
|
String title = "";
|
|
String description = "";
|
|
|
|
URLConnection conn = pageUrl.openConnection();
|
|
//pretend you're a browser (make my request from Java more “browsery-like”.)
|
|
conn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
|
|
|
|
MetaSeeker ms = null;
|
|
try {
|
|
title = getTitleFromHeader(pageUrl);
|
|
_log.trace("Found Title=" + title);
|
|
ms = new MetaSeeker(connection, pageUrl);
|
|
|
|
//try the metadata, otherwise ask the guesser
|
|
description = (ms.getContent("description") != null && ! ms.getContent("description").isEmpty()) ? ms.getContent("description") : createDescriptionFromContent(link);
|
|
|
|
ArrayList<String> images = new ArrayList<String>();
|
|
images = getImagesWithCleaner(pageUrl);
|
|
toReturn = new LinkPreview(title, description, link, host, images);
|
|
|
|
} catch(Exception e) {
|
|
_log.error("[MANUAL-PARSE] Something wrong with the meta seeker returning ... ");
|
|
e.printStackTrace();
|
|
return toReturn;
|
|
}
|
|
return toReturn;
|
|
}
|
|
|
|
/**
|
|
* @param pageURL
|
|
* @return the title of the page or null if can't read it
|
|
* @throws IOException
|
|
*/
|
|
private static String getTitleFromHeader(URL pageURL) throws IOException {
|
|
URLConnection conn = pageURL.openConnection();
|
|
//pretend you're a browser (make my request from Java more “browsery-like”.)
|
|
conn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
|
|
|
|
Charset charset = OpenGraph.getConnectionCharset(conn);
|
|
BufferedReader dis = new BufferedReader(new InputStreamReader(conn.getInputStream(), charset));
|
|
String inputLine;
|
|
StringBuffer headContents = new StringBuffer();
|
|
|
|
// Loop through each line, looking for the closing head element
|
|
while ((inputLine = dis.readLine()) != null)
|
|
{
|
|
if (inputLine.contains("</head>")) {
|
|
inputLine = inputLine.substring(0, inputLine.indexOf("</head>") + 7);
|
|
inputLine = inputLine.concat("<body></body></html>");
|
|
headContents.append(inputLine + "\r\n");
|
|
break;
|
|
}
|
|
headContents.append(inputLine + "\r\n");
|
|
}
|
|
|
|
String headContentsStr = headContents.toString();
|
|
HtmlCleaner cleaner = new HtmlCleaner();
|
|
// parse the string HTML
|
|
TagNode pageData = cleaner.clean(headContentsStr);
|
|
// open only the title tags
|
|
TagNode[] title = pageData.getElementsByName("title", true);
|
|
if (title != null && title.length > 0 && title[0].getChildren().size() > 0) {
|
|
String theTitle = title[0].getChildren().get(0).toString();
|
|
_log.trace("theTitle: " + theTitle);
|
|
return theTitle;
|
|
}
|
|
return "No-title";
|
|
}
|
|
protected static String replaceAmpersand(String toReplace) {
|
|
String toReturn = toReplace.replaceAll("&", "&");
|
|
return toReturn;
|
|
}
|
|
/**
|
|
* utility method that convert a url ina text in a clickable url by the browser
|
|
* and if the user has just pasted a link, converts the link in: shared a link
|
|
* @param feedText
|
|
* @return the text with the clickable url in it
|
|
*/
|
|
protected static String transformUrls(String feedText) {
|
|
StringBuilder sb = new StringBuilder();
|
|
// separate input by spaces ( URLs have no spaces )
|
|
String [] parts = feedText.split("\\s");
|
|
// Attempt to convert each item into an URL.
|
|
for (int i = 0; i < parts.length; i++) {
|
|
String toCheck = getHttpToken(parts[i]);
|
|
if (toCheck != null) {
|
|
try {
|
|
URL url = new URL(toCheck);
|
|
if (i == 0 && parts.length == 1) //then he shared just a link
|
|
return sb.append("<span style=\"color:gray; font-size:12px;\">shared </span><a class=\"link\" href=\"").append(url).append("\" target=\"_blank\">").append("a link.").append("</a> ").toString();
|
|
// If possible then replace with anchor...
|
|
sb.append("<a class=\"link\" style=\"font-size:14px;\" href=\"").append(url).append("\" target=\"_blank\">").append(url).append("</a> ");
|
|
} catch (MalformedURLException e) {
|
|
// If there was an URL then it's not valid
|
|
_log.error("MalformedURLException returning... ");
|
|
return feedText;
|
|
}
|
|
} else {
|
|
sb.append(parts[i]);
|
|
sb.append(" ");
|
|
}
|
|
}
|
|
return sb.toString();
|
|
}
|
|
/**
|
|
* this method handles the non trusted https connections
|
|
*/
|
|
protected static void trustAllHTTPSConnections() {
|
|
// Create a trust manager that does not validate certificate chains
|
|
TrustManager[] trustAllCerts = new TrustManager[]{
|
|
new X509TrustManager() {
|
|
public java.security.cert.X509Certificate[] getAcceptedIssuers() {
|
|
return null;
|
|
}
|
|
|
|
public void checkClientTrusted(
|
|
java.security.cert.X509Certificate[] certs, String authType) {
|
|
}
|
|
|
|
public void checkServerTrusted(
|
|
java.security.cert.X509Certificate[] certs, String authType) {
|
|
}
|
|
}
|
|
};
|
|
try {
|
|
SSLContext sc = SSLContext.getInstance("SSL");
|
|
sc.init(null, trustAllCerts, new java.security.SecureRandom());
|
|
HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
|
|
} catch (Exception e) {
|
|
System.out.println("Error" + e);
|
|
}
|
|
}
|
|
|
|
|
|
}
|