Improved image recognition when parsing HTML pages that have no image indication from OpenGraph

git-svn-id: https://svn.research-infrastructures.eu/d4science/gcube/trunk/portlets/user/share-updates@93875 82a268e6-3cf1-43bd-a215-b396298e98cf
This commit is contained in:
Massimiliano Assante 2014-03-31 09:18:22 +00:00
parent 312692c163
commit d240989512
5 changed files with 42 additions and 56 deletions

View File

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" output="target/share-updates-1.2.1-SNAPSHOT/WEB-INF/classes" path="src/main/java">
<classpathentry kind="src" output="target/share-updates-1.2.2-SNAPSHOT/WEB-INF/classes" path="src/main/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
@ -31,5 +31,5 @@
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="output" path="target/share-updates-1.2.1-SNAPSHOT/WEB-INF/classes"/>
<classpathentry kind="output" path="target/share-updates-1.2.2-SNAPSHOT/WEB-INF/classes"/>
</classpath>

View File

@ -1,5 +1,5 @@
eclipse.preferences.version=1
jarsExcludedFromWebInfLib=
lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.1-SNAPSHOT
lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.2-SNAPSHOT
warSrcDir=src/main/webapp
warSrcDirIsOutput=false

View File

@ -4,6 +4,9 @@
<wb-resource deploy-path="/" source-path="/src/main/webapp" tag="defaultRootSource"/>
<wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/java"/>
<wb-resource deploy-path="/WEB-INF/classes" source-path="/target/generated-sources/gwt"/>
<dependent-module archiveName="fileupload-progress-bar-1.0.0-SNAPSHOT.jar" deploy-path="/WEB-INF/lib" handle="module:/resource/fileupload-progress-bar/fileupload-progress-bar">
<dependency-type>uses</dependency-type>
</dependent-module>
<property name="java-output-path" value="/${module}/target/www/WEB-INF/classes"/>
<property name="context-root" value="share-updates"/>
</wb-module>

View File

@ -13,7 +13,7 @@
<groupId>org.gcube.portlets.user</groupId>
<artifactId>share-updates</artifactId>
<packaging>war</packaging>
<version>1.2.1-SNAPSHOT</version>
<version>1.2.2-SNAPSHOT</version>
<name>gCube Share Updates Portlet</name>
<description>
@ -190,11 +190,6 @@
<artifactId>htmlcleaner</artifactId>
<version>2.2</version>
</dependency>
<dependency>
<groupId>net.sf.jtidy</groupId>
<artifactId>jtidy</artifactId>
<version>r938</version>
</dependency>
<dependency>
<groupId>net.eliasbalasis</groupId>
<artifactId>tibcopagebus4gwt</artifactId>

View File

@ -68,9 +68,6 @@ import org.htmlcleaner.TagNode;
import org.htmlparser.beans.StringBean;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;
import com.google.gwt.user.server.rpc.RemoteServiceServlet;
import com.liferay.portal.kernel.exception.PortalException;
@ -656,14 +653,19 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
toReturn = getInfoFromHTML(siteConnection, pageURL, linkToCheck, host);
} else {
//there is OpenGraph
_log.info("OpenGraph Found") ;
title = ogLink.getContent("title");
description = (ogLink.getContent("description") != null) ? ogLink.getContent("description") : "";
description = ((description.length() > 256) ? description.substring(0, 256)+"..." : description);
//look for the image ask the guesser if not present
if (ogLink.getContent("image") != null)
imageUrls.add(ogLink.getContent("image"));
if (ogLink.getContent("image") != null) {
String imageUrl = getImageUrlFromSrcAttribute(pageURL, ogLink.getContent("image"));
imageUrls.add(imageUrl);
_log.trace("OpenGraph getImage = " +imageUrl) ;
}
else {
ArrayList<String> images = getImagesFromHTML(siteConnection, pageURL);
_log.trace("OpenGraph No Image, trying manuale parsing");
ArrayList<String> images = getImagesWithCleaner(pageURL);
if (! images.isEmpty())
imageUrls = images;
}
@ -677,29 +679,6 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
}
return toReturn;
}
/**
 * Collects the value of the src attribute of the img elements of an HTML page.
 * Only the first 15 img elements of the page are inspected; those lacking a src attribute are skipped.
 * The values are returned as written in the page, they are not resolved against the page url.
 * @param connection not used by this method
 * @param pageURL the url of the page to parse
 * @return a list of image url, empty if none could be read
 * @throws IOException if the page cannot be opened or its stream cannot be closed
 */
private ArrayList<String> getImagesFromHTML(URLConnection connection, URL pageURL) throws IOException {
	ArrayList<String> toReturn = new ArrayList<String>();
	java.io.InputStream pageStream = pageURL.openStream();
	try {
		Document document = new Tidy().parseDOM(pageStream, null);
		if (document == null)
			return toReturn;
		NodeList imgs = document.getElementsByTagName("img");
		int upTo = Math.min(imgs.getLength(), 15);
		for (int i = 0; i < upTo; i++) {
			org.w3c.dom.Node img = imgs.item(i);
			// an img element without a src attribute is skipped instead of aborting the whole scan
			org.w3c.dom.Node src = (img.getAttributes() != null) ? img.getAttributes().getNamedItem("src") : null;
			if (src != null && src.getNodeValue() != null)
				toReturn.add(src.getNodeValue());
		}
	} catch (NullPointerException e) {
		// safety net kept from the previous implementation, which treated any NullPointerException
		// raised while parsing as malformed HTML and returned the images collected so far
		_log.error("Error parsing HTML for images, malformed HTML returning what I found so far ... ");
		return toReturn;
	} finally {
		// always release the connection opened by openStream()
		pageStream.close();
	}
	return toReturn;
}
/**
* to use when OpenGraph is not available, Tries Metadata first, then Best guess from page content
* @param pageUrl
@ -770,14 +749,14 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
TagNode[] title = pageData.getElementsByName("title", true);
if (title != null && title.length > 0) {
String theTitle = title[0].getChildren().get(0).toString();
System.out.println("theTitle: " + theTitle);
_log.trace("theTitle: " + theTitle);
return theTitle;
}
return null;
}
/**
* if jTidy has problems try with with HtmlCleaner API to read the images
* try with HtmlCleaner API to read the images
* @param pageURL
* @return the title of the page or null if can't read it
* @throws IOException
@ -807,27 +786,36 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
int upTo = (imgs.length > 15) ? 15 : imgs.length;
for (int i = 0; i < upTo; i++) {
if (imgs[i].hasAttribute("src")) {
String imageUrl = imgs[i].getAttributeByName("src");
if (imageUrl.startsWith("/"))
imageUrl = pageURL.getProtocol()+"://"+pageURL.getHost()+imageUrl;
else if (imageUrl.startsWith("../")) {
imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
}
else if (!imageUrl.contains("/")) { //then the image is probably in the same folder
// e.g. http://www.fao.org/docrep/018/i3328e/i3328e00.htm?utm_source
String imageFolder = pageURL.toString().substring(0, pageURL.toString().lastIndexOf("/"));
imageUrl= imageFolder + "/" + imageUrl;
}
else if (!imageUrl.startsWith("http") ) { //e.g. img/anImage.png
imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
}
String imageUrl = getImageUrlFromSrcAttribute(pageURL, imgs[i].getAttributeByName("src"));
images.add(imageUrl);
_log.trace("[FOUND image] " + imageUrl);
}
}
return images;
}
/**
 * There are several ways to refer an image in a HTML page (absolute url, protocol relative url,
 * root relative path, path relative to the page folder). This method resolves the reference against
 * the url of the page, the same way a browser does, to get the actual image url.
 * e.g. for the page http://www.adomain.com/docrep/018/i3328e00.htm?utm_source
 * "cover.jpg" becomes http://www.adomain.com/docrep/018/cover.jpg,
 * "../cover.jpg" becomes http://www.adomain.com/docrep/cover.jpg and
 * "/img/cover.jpg" becomes http://www.adomain.com/img/cover.jpg
 * @param pageURL the url of the page the image reference was found in
 * @param srcAttr the content of the img src attribute
 * @return the image url ready to be referred outside native environment; srcAttr itself is returned
 * when it is null, blank, or cannot be resolved (e.g. a data: URI)
 */
private String getImageUrlFromSrcAttribute(URL pageURL, String srcAttr) {
	if (srcAttr == null)
		return null;
	String reference = srcAttr.trim();
	if (reference.length() == 0)
		return srcAttr;
	try {
		// standard relative reference resolution: keeps the port of the page, drops the page file name
		// and query string, collapses "../" segments and leaves absolute urls untouched
		return new URL(pageURL, reference).toExternalForm();
	} catch (java.net.MalformedURLException e) {
		// unknown scheme (e.g. data:) or unusable page url, leave the reference as it was found
		return srcAttr;
	}
}
/**
* generate the description parsing the content (Best Guess)
* @param link the link to check
@ -867,7 +855,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
} catch (Exception e) {
System.out.println("Error" + e);
}
}
}
/**