diff --git a/.classpath b/.classpath index 13ea37a..e1c5019 100644 --- a/.classpath +++ b/.classpath @@ -1,6 +1,6 @@ - + @@ -31,5 +31,5 @@ - + diff --git a/.settings/com.google.gdt.eclipse.core.prefs b/.settings/com.google.gdt.eclipse.core.prefs index f7ef8a9..49c320f 100644 --- a/.settings/com.google.gdt.eclipse.core.prefs +++ b/.settings/com.google.gdt.eclipse.core.prefs @@ -1,5 +1,5 @@ eclipse.preferences.version=1 jarsExcludedFromWebInfLib= -lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.1-SNAPSHOT +lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.2-SNAPSHOT warSrcDir=src/main/webapp warSrcDirIsOutput=false diff --git a/.settings/org.eclipse.wst.common.component b/.settings/org.eclipse.wst.common.component index fb4e233..dde9c8f 100644 --- a/.settings/org.eclipse.wst.common.component +++ b/.settings/org.eclipse.wst.common.component @@ -4,6 +4,9 @@ + + uses + diff --git a/pom.xml b/pom.xml index 7b78946..1e3fd03 100644 --- a/pom.xml +++ b/pom.xml @@ -13,7 +13,7 @@ org.gcube.portlets.user share-updates war - 1.2.1-SNAPSHOT + 1.2.2-SNAPSHOT gCube Share Updates Portlet @@ -190,11 +190,6 @@ htmlcleaner 2.2 - - net.sf.jtidy - jtidy - r938 - net.eliasbalasis tibcopagebus4gwt diff --git a/src/main/java/org/gcube/portlets/user/shareupdates/server/ShareUpdateServiceImpl.java b/src/main/java/org/gcube/portlets/user/shareupdates/server/ShareUpdateServiceImpl.java index aa2871d..823ec5e 100644 --- a/src/main/java/org/gcube/portlets/user/shareupdates/server/ShareUpdateServiceImpl.java +++ b/src/main/java/org/gcube/portlets/user/shareupdates/server/ShareUpdateServiceImpl.java @@ -68,9 +68,6 @@ import org.htmlcleaner.TagNode; import org.htmlparser.beans.StringBean; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.w3c.dom.Document; -import org.w3c.dom.NodeList; -import org.w3c.tidy.Tidy; import com.google.gwt.user.server.rpc.RemoteServiceServlet; import com.liferay.portal.kernel.exception.PortalException; @@ -656,14 +653,19 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar toReturn = getInfoFromHTML(siteConnection, pageURL, linkToCheck, host); } else { //there is OpenGraph + _log.info("OpenGraph Found") ; title = ogLink.getContent("title"); description = (ogLink.getContent("description") != null) ? ogLink.getContent("description") : ""; description = ((description.length() > 256) ? description.substring(0, 256)+"..." : description); //look for the image ask the guesser if not present - if (ogLink.getContent("image") != null) - imageUrls.add(ogLink.getContent("image")); + if (ogLink.getContent("image") != null) { + String imageUrl = getImageUrlFromSrcAttribute(pageURL, ogLink.getContent("image")); + imageUrls.add(imageUrl); + _log.trace("OpenGraph getImage = " +imageUrl) ; + } else { - ArrayList images = getImagesFromHTML(siteConnection, pageURL); + _log.trace("OpenGraph No Image, trying manuale parsing"); + ArrayList images = getImagesWithCleaner(pageURL); if (! images.isEmpty()) imageUrls = images; } @@ -677,29 +679,6 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar } return toReturn; } - /** - * get all the image urls from an HTML page up to 15 - * @param pageURL the url - * @return a list of image url - * @throws IOException - */ - private ArrayList getImagesFromHTML(URLConnection connection, URL pageURL) throws IOException { - ArrayList toReturn = new ArrayList(); - try { - Document document = new Tidy().parseDOM(pageURL.openStream(), null); - NodeList imgs = document.getElementsByTagName("img"); - int upTo = (imgs.getLength() > 15) ? 15 : imgs.getLength(); - for (int i = 0; i < upTo; i++) { - System.out.println(i); - toReturn.add(imgs.item(i).getAttributes().getNamedItem("src").getNodeValue()); - } - }catch (NullPointerException e) { - _log.error("Error parsing HTML for images, malformed HTML returning what I found so far ... "); - return toReturn; - } - return toReturn; - } - /** * to use when OpenGraph is not available, Tries Metadata first, then Best guess from page content * @param pageUrl @@ -770,14 +749,14 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar TagNode[] title = pageData.getElementsByName("title", true); if (title != null && title.length > 0) { String theTitle = title[0].getChildren().get(0).toString(); - System.out.println("theTitle: " + theTitle); + _log.trace("theTitle: " + theTitle); return theTitle; } return null; } /** - * if jTidy has problems try with with HtmlCleaner API to read the images + * try with HtmlCleaner API to read the images * @param pageURL * @return the title of the page or null if can't read it * @throws IOException @@ -807,27 +786,36 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar int upTo = (imgs.length > 15) ? 15 : imgs.length; for (int i = 0; i < upTo; i++) { if (imgs[i].hasAttribute("src")) { - String imageUrl = imgs[i].getAttributeByName("src"); - if (imageUrl.startsWith("/")) - imageUrl = pageURL.getProtocol()+"://"+pageURL.getHost()+imageUrl; - else if (imageUrl.startsWith("../")) { - imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl; - } - else if (!imageUrl.contains("/")) { //then the image is probably in the same folder - // e.g. http://www.fao.org/docrep/018/i3328e/i3328e00.htm?utm_source - String imageFolder = pageURL.toString().substring(0, pageURL.toString().lastIndexOf("/")); - imageUrl= imageFolder + "/" + imageUrl; - } - else if (!imageUrl.startsWith("http") ) { //e.g. img/anImage.png - imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl; - } + String imageUrl = getImageUrlFromSrcAttribute(pageURL, imgs[i].getAttributeByName("src")); images.add(imageUrl); _log.trace("[FOUND image] " + imageUrl); } } return images; } - + /** + * There are several ways to refer an image in a HTML, this method use an heuristic to get the actual image url + * @param pageURL the url + * @param srcAttr the content of the img src attribute + * @return the image url ready to be referred outside native environment + */ + private String getImageUrlFromSrcAttribute(URL pageURL, String srcAttr) { + String imageUrl = srcAttr; + if (imageUrl.startsWith("/")) //referred as absolute path case + imageUrl = pageURL.getProtocol()+"://"+pageURL.getHost()+imageUrl; + else if (imageUrl.startsWith("../")) { //relative path case + imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl; + } + else if (!imageUrl.contains("/")) { //the image is probably in the same folder + // e.g. http://www.adomain.com/docrep/018/i3328e/i3328e00.htm?utm_source + String imageFolder = pageURL.toString().substring(0, pageURL.toString().lastIndexOf("/")); + imageUrl= imageFolder + "/" + imageUrl; + } + else if (!imageUrl.startsWith("http") ) { //e.g. http://adomain.com/anImage.png + imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl; + } + return imageUrl; + } /** * generate the description parsing the content (Best Guess) * @param link the link to check @@ -867,7 +855,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory()); } catch (Exception e) { System.out.println("Error" + e); - } + } } /**