diff --git a/.classpath b/.classpath
index 13ea37a..e1c5019 100644
--- a/.classpath
+++ b/.classpath
@@ -1,6 +1,6 @@
-
+
@@ -31,5 +31,5 @@
-
+
diff --git a/.settings/com.google.gdt.eclipse.core.prefs b/.settings/com.google.gdt.eclipse.core.prefs
index f7ef8a9..49c320f 100644
--- a/.settings/com.google.gdt.eclipse.core.prefs
+++ b/.settings/com.google.gdt.eclipse.core.prefs
@@ -1,5 +1,5 @@
eclipse.preferences.version=1
jarsExcludedFromWebInfLib=
-lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.1-SNAPSHOT
+lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.2-SNAPSHOT
warSrcDir=src/main/webapp
warSrcDirIsOutput=false
diff --git a/.settings/org.eclipse.wst.common.component b/.settings/org.eclipse.wst.common.component
index fb4e233..dde9c8f 100644
--- a/.settings/org.eclipse.wst.common.component
+++ b/.settings/org.eclipse.wst.common.component
@@ -4,6 +4,9 @@
+
+ uses
+
diff --git a/pom.xml b/pom.xml
index 7b78946..1e3fd03 100644
--- a/pom.xml
+++ b/pom.xml
@@ -13,7 +13,7 @@
org.gcube.portlets.user
share-updates
war
- 1.2.1-SNAPSHOT
+ 1.2.2-SNAPSHOT
gCube Share Updates Portlet
@@ -190,11 +190,6 @@
htmlcleaner
2.2
-
- net.sf.jtidy
- jtidy
- r938
-
net.eliasbalasis
tibcopagebus4gwt
diff --git a/src/main/java/org/gcube/portlets/user/shareupdates/server/ShareUpdateServiceImpl.java b/src/main/java/org/gcube/portlets/user/shareupdates/server/ShareUpdateServiceImpl.java
index aa2871d..823ec5e 100644
--- a/src/main/java/org/gcube/portlets/user/shareupdates/server/ShareUpdateServiceImpl.java
+++ b/src/main/java/org/gcube/portlets/user/shareupdates/server/ShareUpdateServiceImpl.java
@@ -68,9 +68,6 @@ import org.htmlcleaner.TagNode;
import org.htmlparser.beans.StringBean;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.w3c.dom.Document;
-import org.w3c.dom.NodeList;
-import org.w3c.tidy.Tidy;
import com.google.gwt.user.server.rpc.RemoteServiceServlet;
import com.liferay.portal.kernel.exception.PortalException;
@@ -656,14 +653,19 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
toReturn = getInfoFromHTML(siteConnection, pageURL, linkToCheck, host);
} else {
//there is OpenGraph
+ _log.info("OpenGraph Found") ;
title = ogLink.getContent("title");
description = (ogLink.getContent("description") != null) ? ogLink.getContent("description") : "";
description = ((description.length() > 256) ? description.substring(0, 256)+"..." : description);
//look for the image ask the guesser if not present
- if (ogLink.getContent("image") != null)
- imageUrls.add(ogLink.getContent("image"));
+ if (ogLink.getContent("image") != null) {
+ String imageUrl = getImageUrlFromSrcAttribute(pageURL, ogLink.getContent("image"));
+ imageUrls.add(imageUrl);
+ _log.trace("OpenGraph getImage = " +imageUrl) ;
+ }
else {
- ArrayList images = getImagesFromHTML(siteConnection, pageURL);
+ _log.trace("OpenGraph No Image, trying manuale parsing");
+ ArrayList images = getImagesWithCleaner(pageURL);
if (! images.isEmpty())
imageUrls = images;
}
@@ -677,29 +679,6 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
}
return toReturn;
}
- /**
- * get all the image urls from an HTML page up to 15
- * @param pageURL the url
- * @return a list of image url
- * @throws IOException
- */
- private ArrayList getImagesFromHTML(URLConnection connection, URL pageURL) throws IOException {
- ArrayList toReturn = new ArrayList();
- try {
- Document document = new Tidy().parseDOM(pageURL.openStream(), null);
- NodeList imgs = document.getElementsByTagName("img");
- int upTo = (imgs.getLength() > 15) ? 15 : imgs.getLength();
- for (int i = 0; i < upTo; i++) {
- System.out.println(i);
- toReturn.add(imgs.item(i).getAttributes().getNamedItem("src").getNodeValue());
- }
- }catch (NullPointerException e) {
- _log.error("Error parsing HTML for images, malformed HTML returning what I found so far ... ");
- return toReturn;
- }
- return toReturn;
- }
-
/**
* to use when OpenGraph is not available, Tries Metadata first, then Best guess from page content
* @param pageUrl
@@ -770,14 +749,14 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
TagNode[] title = pageData.getElementsByName("title", true);
if (title != null && title.length > 0) {
String theTitle = title[0].getChildren().get(0).toString();
- System.out.println("theTitle: " + theTitle);
+ _log.trace("theTitle: " + theTitle);
return theTitle;
}
return null;
}
/**
- * if jTidy has problems try with with HtmlCleaner API to read the images
+ * try with HtmlCleaner API to read the images
* @param pageURL
* @return the title of the page or null if can't read it
* @throws IOException
@@ -807,27 +786,36 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
int upTo = (imgs.length > 15) ? 15 : imgs.length;
for (int i = 0; i < upTo; i++) {
if (imgs[i].hasAttribute("src")) {
- String imageUrl = imgs[i].getAttributeByName("src");
- if (imageUrl.startsWith("/"))
- imageUrl = pageURL.getProtocol()+"://"+pageURL.getHost()+imageUrl;
- else if (imageUrl.startsWith("../")) {
- imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
- }
- else if (!imageUrl.contains("/")) { //then the image is probably in the same folder
- // e.g. http://www.fao.org/docrep/018/i3328e/i3328e00.htm?utm_source
- String imageFolder = pageURL.toString().substring(0, pageURL.toString().lastIndexOf("/"));
- imageUrl= imageFolder + "/" + imageUrl;
- }
- else if (!imageUrl.startsWith("http") ) { //e.g. img/anImage.png
- imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
- }
+ String imageUrl = getImageUrlFromSrcAttribute(pageURL, imgs[i].getAttributeByName("src"));
images.add(imageUrl);
_log.trace("[FOUND image] " + imageUrl);
}
}
return images;
}
-
+ /**
+ * Heuristically turns an image reference found in a page (an img src attribute or an OpenGraph image value) into a URL usable outside that page, by plain string concatenation on pageURL; no path normalization is performed.
+ * @param pageURL the URL of the page the reference was found in; its protocol and host (without port) are used in the leading-slash case, its full external form in the other cases
+ * @param srcAttr the content of the img src attribute; must not be null, it is dereferenced without a check
+ * @return srcAttr unchanged when it contains a slash and starts with http, otherwise srcAttr prefixed with a base derived from pageURL
+ */
+ private String getImageUrlFromSrcAttribute(URL pageURL, String srcAttr) {
+ String imageUrl = srcAttr;
+ if (imageUrl.startsWith("/")) //host-absolute path, e.g. /img/anImage.png; NOTE(review): a protocol-relative //host/anImage.png also matches this test
+ imageUrl = pageURL.getProtocol()+"://"+pageURL.getHost()+imageUrl;
+ else if (imageUrl.startsWith("../")) { //parent-relative path: appended to the full page URL as-is, the ../ segment is not resolved here
+ imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
+ }
+ else if (!imageUrl.contains("/")) { //bare file name: the image is probably in the same folder as the page
+ // e.g. for page http://www.adomain.com/docrep/018/i3328e/i3328e00.htm?utm_source the folder is everything before the last slash
+ String imageFolder = pageURL.toString().substring(0, pageURL.toString().lastIndexOf("/"));
+ imageUrl= imageFolder + "/" + imageUrl;
+ }
+ else if (!imageUrl.startsWith("http") ) { //any other value containing a slash that is not an http(s) URL, e.g. img/anImage.png: appended to the page URL
+ imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
+ }
+ return imageUrl;
+ }
/**
* generate the description parsing the content (Best Guess)
* @param link the link to check
@@ -867,7 +855,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
} catch (Exception e) {
System.out.println("Error" + e);
- }
+ }
}
/**