Improved image recognition when parsing HTML pages that have no image indication from OpenGraph
git-svn-id: https://svn.research-infrastructures.eu/d4science/gcube/trunk/portlets/user/share-updates@93875 82a268e6-3cf1-43bd-a215-b396298e98cf
This commit is contained in:
parent
312692c163
commit
d240989512
|
@ -1,6 +1,6 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
<classpath>
|
<classpath>
|
||||||
<classpathentry kind="src" output="target/share-updates-1.2.1-SNAPSHOT/WEB-INF/classes" path="src/main/java">
|
<classpathentry kind="src" output="target/share-updates-1.2.2-SNAPSHOT/WEB-INF/classes" path="src/main/java">
|
||||||
<attributes>
|
<attributes>
|
||||||
<attribute name="optional" value="true"/>
|
<attribute name="optional" value="true"/>
|
||||||
<attribute name="maven.pomderived" value="true"/>
|
<attribute name="maven.pomderived" value="true"/>
|
||||||
|
@ -31,5 +31,5 @@
|
||||||
<attribute name="maven.pomderived" value="true"/>
|
<attribute name="maven.pomderived" value="true"/>
|
||||||
</attributes>
|
</attributes>
|
||||||
</classpathentry>
|
</classpathentry>
|
||||||
<classpathentry kind="output" path="target/share-updates-1.2.1-SNAPSHOT/WEB-INF/classes"/>
|
<classpathentry kind="output" path="target/share-updates-1.2.2-SNAPSHOT/WEB-INF/classes"/>
|
||||||
</classpath>
|
</classpath>
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
eclipse.preferences.version=1
|
eclipse.preferences.version=1
|
||||||
jarsExcludedFromWebInfLib=
|
jarsExcludedFromWebInfLib=
|
||||||
lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.1-SNAPSHOT
|
lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.2-SNAPSHOT
|
||||||
warSrcDir=src/main/webapp
|
warSrcDir=src/main/webapp
|
||||||
warSrcDirIsOutput=false
|
warSrcDirIsOutput=false
|
||||||
|
|
|
@ -4,6 +4,9 @@
|
||||||
<wb-resource deploy-path="/" source-path="/src/main/webapp" tag="defaultRootSource"/>
|
<wb-resource deploy-path="/" source-path="/src/main/webapp" tag="defaultRootSource"/>
|
||||||
<wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/java"/>
|
<wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/java"/>
|
||||||
<wb-resource deploy-path="/WEB-INF/classes" source-path="/target/generated-sources/gwt"/>
|
<wb-resource deploy-path="/WEB-INF/classes" source-path="/target/generated-sources/gwt"/>
|
||||||
|
<dependent-module archiveName="fileupload-progress-bar-1.0.0-SNAPSHOT.jar" deploy-path="/WEB-INF/lib" handle="module:/resource/fileupload-progress-bar/fileupload-progress-bar">
|
||||||
|
<dependency-type>uses</dependency-type>
|
||||||
|
</dependent-module>
|
||||||
<property name="java-output-path" value="/${module}/target/www/WEB-INF/classes"/>
|
<property name="java-output-path" value="/${module}/target/www/WEB-INF/classes"/>
|
||||||
<property name="context-root" value="share-updates"/>
|
<property name="context-root" value="share-updates"/>
|
||||||
</wb-module>
|
</wb-module>
|
||||||
|
|
7
pom.xml
7
pom.xml
|
@ -13,7 +13,7 @@
|
||||||
<groupId>org.gcube.portlets.user</groupId>
|
<groupId>org.gcube.portlets.user</groupId>
|
||||||
<artifactId>share-updates</artifactId>
|
<artifactId>share-updates</artifactId>
|
||||||
<packaging>war</packaging>
|
<packaging>war</packaging>
|
||||||
<version>1.2.1-SNAPSHOT</version>
|
<version>1.2.2-SNAPSHOT</version>
|
||||||
|
|
||||||
<name>gCube Share Updates Portlet</name>
|
<name>gCube Share Updates Portlet</name>
|
||||||
<description>
|
<description>
|
||||||
|
@ -190,11 +190,6 @@
|
||||||
<artifactId>htmlcleaner</artifactId>
|
<artifactId>htmlcleaner</artifactId>
|
||||||
<version>2.2</version>
|
<version>2.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
|
||||||
<groupId>net.sf.jtidy</groupId>
|
|
||||||
<artifactId>jtidy</artifactId>
|
|
||||||
<version>r938</version>
|
|
||||||
</dependency>
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>net.eliasbalasis</groupId>
|
<groupId>net.eliasbalasis</groupId>
|
||||||
<artifactId>tibcopagebus4gwt</artifactId>
|
<artifactId>tibcopagebus4gwt</artifactId>
|
||||||
|
|
|
@ -68,9 +68,6 @@ import org.htmlcleaner.TagNode;
|
||||||
import org.htmlparser.beans.StringBean;
|
import org.htmlparser.beans.StringBean;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.w3c.dom.Document;
|
|
||||||
import org.w3c.dom.NodeList;
|
|
||||||
import org.w3c.tidy.Tidy;
|
|
||||||
|
|
||||||
import com.google.gwt.user.server.rpc.RemoteServiceServlet;
|
import com.google.gwt.user.server.rpc.RemoteServiceServlet;
|
||||||
import com.liferay.portal.kernel.exception.PortalException;
|
import com.liferay.portal.kernel.exception.PortalException;
|
||||||
|
@ -656,14 +653,19 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
|
||||||
toReturn = getInfoFromHTML(siteConnection, pageURL, linkToCheck, host);
|
toReturn = getInfoFromHTML(siteConnection, pageURL, linkToCheck, host);
|
||||||
} else {
|
} else {
|
||||||
//there is OpenGraph
|
//there is OpenGraph
|
||||||
|
_log.info("OpenGraph Found") ;
|
||||||
title = ogLink.getContent("title");
|
title = ogLink.getContent("title");
|
||||||
description = (ogLink.getContent("description") != null) ? ogLink.getContent("description") : "";
|
description = (ogLink.getContent("description") != null) ? ogLink.getContent("description") : "";
|
||||||
description = ((description.length() > 256) ? description.substring(0, 256)+"..." : description);
|
description = ((description.length() > 256) ? description.substring(0, 256)+"..." : description);
|
||||||
//look for the image ask the guesser if not present
|
//look for the image ask the guesser if not present
|
||||||
if (ogLink.getContent("image") != null)
|
if (ogLink.getContent("image") != null) {
|
||||||
imageUrls.add(ogLink.getContent("image"));
|
String imageUrl = getImageUrlFromSrcAttribute(pageURL, ogLink.getContent("image"));
|
||||||
|
imageUrls.add(imageUrl);
|
||||||
|
_log.trace("OpenGraph getImage = " +imageUrl) ;
|
||||||
|
}
|
||||||
else {
|
else {
|
||||||
ArrayList<String> images = getImagesFromHTML(siteConnection, pageURL);
|
_log.trace("OpenGraph No Image, trying manuale parsing");
|
||||||
|
ArrayList<String> images = getImagesWithCleaner(pageURL);
|
||||||
if (! images.isEmpty())
|
if (! images.isEmpty())
|
||||||
imageUrls = images;
|
imageUrls = images;
|
||||||
}
|
}
|
||||||
|
@ -677,29 +679,6 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
|
||||||
}
|
}
|
||||||
return toReturn;
|
return toReturn;
|
||||||
}
|
}
|
||||||
/**
|
|
||||||
* get all the image urls from an HTML page up to 15
|
|
||||||
* @param pageURL the url
|
|
||||||
* @return a list of image url
|
|
||||||
* @throws IOException
|
|
||||||
*/
|
|
||||||
private ArrayList<String> getImagesFromHTML(URLConnection connection, URL pageURL) throws IOException {
|
|
||||||
ArrayList<String> toReturn = new ArrayList<String>();
|
|
||||||
try {
|
|
||||||
Document document = new Tidy().parseDOM(pageURL.openStream(), null);
|
|
||||||
NodeList imgs = document.getElementsByTagName("img");
|
|
||||||
int upTo = (imgs.getLength() > 15) ? 15 : imgs.getLength();
|
|
||||||
for (int i = 0; i < upTo; i++) {
|
|
||||||
System.out.println(i);
|
|
||||||
toReturn.add(imgs.item(i).getAttributes().getNamedItem("src").getNodeValue());
|
|
||||||
}
|
|
||||||
}catch (NullPointerException e) {
|
|
||||||
_log.error("Error parsing HTML for images, malformed HTML returning what I found so far ... ");
|
|
||||||
return toReturn;
|
|
||||||
}
|
|
||||||
return toReturn;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* to use when OpenGraph is not available, Tries Metadata first, then Best guess from page content
|
* to use when OpenGraph is not available, Tries Metadata first, then Best guess from page content
|
||||||
* @param pageUrl
|
* @param pageUrl
|
||||||
|
@ -770,14 +749,14 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
|
||||||
TagNode[] title = pageData.getElementsByName("title", true);
|
TagNode[] title = pageData.getElementsByName("title", true);
|
||||||
if (title != null && title.length > 0) {
|
if (title != null && title.length > 0) {
|
||||||
String theTitle = title[0].getChildren().get(0).toString();
|
String theTitle = title[0].getChildren().get(0).toString();
|
||||||
System.out.println("theTitle: " + theTitle);
|
_log.trace("theTitle: " + theTitle);
|
||||||
return theTitle;
|
return theTitle;
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* if jTidy has problems try with with HtmlCleaner API to read the images
|
* try with HtmlCleaner API to read the images
|
||||||
* @param pageURL
|
* @param pageURL
|
||||||
* @return the title of the page or null if can't read it
|
* @return the list of image urls found in the page
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
|
@ -807,27 +786,36 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
|
||||||
int upTo = (imgs.length > 15) ? 15 : imgs.length;
|
int upTo = (imgs.length > 15) ? 15 : imgs.length;
|
||||||
for (int i = 0; i < upTo; i++) {
|
for (int i = 0; i < upTo; i++) {
|
||||||
if (imgs[i].hasAttribute("src")) {
|
if (imgs[i].hasAttribute("src")) {
|
||||||
String imageUrl = imgs[i].getAttributeByName("src");
|
String imageUrl = getImageUrlFromSrcAttribute(pageURL, imgs[i].getAttributeByName("src"));
|
||||||
if (imageUrl.startsWith("/"))
|
|
||||||
imageUrl = pageURL.getProtocol()+"://"+pageURL.getHost()+imageUrl;
|
|
||||||
else if (imageUrl.startsWith("../")) {
|
|
||||||
imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
|
|
||||||
}
|
|
||||||
else if (!imageUrl.contains("/")) { //then the image is probably in the same folder
|
|
||||||
// e.g. http://www.fao.org/docrep/018/i3328e/i3328e00.htm?utm_source
|
|
||||||
String imageFolder = pageURL.toString().substring(0, pageURL.toString().lastIndexOf("/"));
|
|
||||||
imageUrl= imageFolder + "/" + imageUrl;
|
|
||||||
}
|
|
||||||
else if (!imageUrl.startsWith("http") ) { //e.g. img/anImage.png
|
|
||||||
imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
|
|
||||||
}
|
|
||||||
images.add(imageUrl);
|
images.add(imageUrl);
|
||||||
_log.trace("[FOUND image] " + imageUrl);
|
_log.trace("[FOUND image] " + imageUrl);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return images;
|
return images;
|
||||||
}
|
}
|
||||||
|
/**
|
||||||
|
* There are several ways to refer an image in a HTML, this method use an heuristic to get the actual image url
|
||||||
|
* @param pageURL the url
|
||||||
|
* @param srcAttr the content of the img src attribute
|
||||||
|
* @return the image url ready to be referred outside native environment
|
||||||
|
*/
|
||||||
|
private String getImageUrlFromSrcAttribute(URL pageURL, String srcAttr) {
|
||||||
|
String imageUrl = srcAttr;
|
||||||
|
if (imageUrl.startsWith("/")) //referred as absolute path case
|
||||||
|
imageUrl = pageURL.getProtocol()+"://"+pageURL.getHost()+imageUrl;
|
||||||
|
else if (imageUrl.startsWith("../")) { //relative path case
|
||||||
|
imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
|
||||||
|
}
|
||||||
|
else if (!imageUrl.contains("/")) { //the image is probably in the same folder
|
||||||
|
// e.g. http://www.adomain.com/docrep/018/i3328e/i3328e00.htm?utm_source
|
||||||
|
String imageFolder = pageURL.toString().substring(0, pageURL.toString().lastIndexOf("/"));
|
||||||
|
imageUrl= imageFolder + "/" + imageUrl;
|
||||||
|
}
|
||||||
|
else if (!imageUrl.startsWith("http") ) { //e.g. http://adomain.com/anImage.png
|
||||||
|
imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
|
||||||
|
}
|
||||||
|
return imageUrl;
|
||||||
|
}
|
||||||
/**
|
/**
|
||||||
* generate the description parsing the content (Best Guess)
|
* generate the description parsing the content (Best Guess)
|
||||||
* @param link the link to check
|
* @param link the link to check
|
||||||
|
@ -867,7 +855,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
|
||||||
HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
|
HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
System.out.println("Error" + e);
|
System.out.println("Error" + e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Loading…
Reference in New Issue