improved image recognition when parsing HTML pages that have no image indication from OpenGraph
git-svn-id: https://svn.research-infrastructures.eu/d4science/gcube/trunk/portlets/user/share-updates@93875 82a268e6-3cf1-43bd-a215-b396298e98cf
This commit is contained in:
parent
312692c163
commit
d240989512
|
@ -1,6 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<classpath>
|
||||
<classpathentry kind="src" output="target/share-updates-1.2.1-SNAPSHOT/WEB-INF/classes" path="src/main/java">
|
||||
<classpathentry kind="src" output="target/share-updates-1.2.2-SNAPSHOT/WEB-INF/classes" path="src/main/java">
|
||||
<attributes>
|
||||
<attribute name="optional" value="true"/>
|
||||
<attribute name="maven.pomderived" value="true"/>
|
||||
|
@ -31,5 +31,5 @@
|
|||
<attribute name="maven.pomderived" value="true"/>
|
||||
</attributes>
|
||||
</classpathentry>
|
||||
<classpathentry kind="output" path="target/share-updates-1.2.1-SNAPSHOT/WEB-INF/classes"/>
|
||||
<classpathentry kind="output" path="target/share-updates-1.2.2-SNAPSHOT/WEB-INF/classes"/>
|
||||
</classpath>
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
eclipse.preferences.version=1
|
||||
jarsExcludedFromWebInfLib=
|
||||
lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.1-SNAPSHOT
|
||||
lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.2-SNAPSHOT
|
||||
warSrcDir=src/main/webapp
|
||||
warSrcDirIsOutput=false
|
||||
|
|
|
@ -4,6 +4,9 @@
|
|||
<wb-resource deploy-path="/" source-path="/src/main/webapp" tag="defaultRootSource"/>
|
||||
<wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/java"/>
|
||||
<wb-resource deploy-path="/WEB-INF/classes" source-path="/target/generated-sources/gwt"/>
|
||||
<dependent-module archiveName="fileupload-progress-bar-1.0.0-SNAPSHOT.jar" deploy-path="/WEB-INF/lib" handle="module:/resource/fileupload-progress-bar/fileupload-progress-bar">
|
||||
<dependency-type>uses</dependency-type>
|
||||
</dependent-module>
|
||||
<property name="java-output-path" value="/${module}/target/www/WEB-INF/classes"/>
|
||||
<property name="context-root" value="share-updates"/>
|
||||
</wb-module>
|
||||
|
|
7
pom.xml
7
pom.xml
|
@ -13,7 +13,7 @@
|
|||
<groupId>org.gcube.portlets.user</groupId>
|
||||
<artifactId>share-updates</artifactId>
|
||||
<packaging>war</packaging>
|
||||
<version>1.2.1-SNAPSHOT</version>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
|
||||
<name>gCube Share Updates Portlet</name>
|
||||
<description>
|
||||
|
@ -190,11 +190,6 @@
|
|||
<artifactId>htmlcleaner</artifactId>
|
||||
<version>2.2</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sf.jtidy</groupId>
|
||||
<artifactId>jtidy</artifactId>
|
||||
<version>r938</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.eliasbalasis</groupId>
|
||||
<artifactId>tibcopagebus4gwt</artifactId>
|
||||
|
|
|
@ -68,9 +68,6 @@ import org.htmlcleaner.TagNode;
|
|||
import org.htmlparser.beans.StringBean;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.NodeList;
|
||||
import org.w3c.tidy.Tidy;
|
||||
|
||||
import com.google.gwt.user.server.rpc.RemoteServiceServlet;
|
||||
import com.liferay.portal.kernel.exception.PortalException;
|
||||
|
@ -656,14 +653,19 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
|
|||
toReturn = getInfoFromHTML(siteConnection, pageURL, linkToCheck, host);
|
||||
} else {
|
||||
//there is OpenGraph
|
||||
_log.info("OpenGraph Found") ;
|
||||
title = ogLink.getContent("title");
|
||||
description = (ogLink.getContent("description") != null) ? ogLink.getContent("description") : "";
|
||||
description = ((description.length() > 256) ? description.substring(0, 256)+"..." : description);
|
||||
//look for the image ask the guesser if not present
|
||||
if (ogLink.getContent("image") != null)
|
||||
imageUrls.add(ogLink.getContent("image"));
|
||||
if (ogLink.getContent("image") != null) {
|
||||
String imageUrl = getImageUrlFromSrcAttribute(pageURL, ogLink.getContent("image"));
|
||||
imageUrls.add(imageUrl);
|
||||
_log.trace("OpenGraph getImage = " +imageUrl) ;
|
||||
}
|
||||
else {
|
||||
ArrayList<String> images = getImagesFromHTML(siteConnection, pageURL);
|
||||
_log.trace("OpenGraph No Image, trying manuale parsing");
|
||||
ArrayList<String> images = getImagesWithCleaner(pageURL);
|
||||
if (! images.isEmpty())
|
||||
imageUrls = images;
|
||||
}
|
||||
|
@ -677,29 +679,6 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
|
|||
}
|
||||
return toReturn;
|
||||
}
|
||||
/**
|
||||
* get all the image urls from an HTML page up to 15
|
||||
* @param pageURL the url
|
||||
* @return a list of image url
|
||||
* @throws IOException
|
||||
*/
|
||||
private ArrayList<String> getImagesFromHTML(URLConnection connection, URL pageURL) throws IOException {
|
||||
ArrayList<String> toReturn = new ArrayList<String>();
|
||||
try {
|
||||
Document document = new Tidy().parseDOM(pageURL.openStream(), null);
|
||||
NodeList imgs = document.getElementsByTagName("img");
|
||||
int upTo = (imgs.getLength() > 15) ? 15 : imgs.getLength();
|
||||
for (int i = 0; i < upTo; i++) {
|
||||
System.out.println(i);
|
||||
toReturn.add(imgs.item(i).getAttributes().getNamedItem("src").getNodeValue());
|
||||
}
|
||||
}catch (NullPointerException e) {
|
||||
_log.error("Error parsing HTML for images, malformed HTML returning what I found so far ... ");
|
||||
return toReturn;
|
||||
}
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
/**
|
||||
* to use when OpenGraph is not available, Tries Metadata first, then Best guess from page content
|
||||
* @param pageUrl
|
||||
|
@ -770,14 +749,14 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
|
|||
TagNode[] title = pageData.getElementsByName("title", true);
|
||||
if (title != null && title.length > 0) {
|
||||
String theTitle = title[0].getChildren().get(0).toString();
|
||||
System.out.println("theTitle: " + theTitle);
|
||||
_log.trace("theTitle: " + theTitle);
|
||||
return theTitle;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* if jTidy has problems, try with the HtmlCleaner API to read the images
|
||||
* try with HtmlCleaner API to read the images
|
||||
* @param pageURL
|
||||
* @return the list of image urls found in the page
|
||||
* @throws IOException
|
||||
|
@ -807,27 +786,36 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
|
|||
int upTo = (imgs.length > 15) ? 15 : imgs.length;
|
||||
for (int i = 0; i < upTo; i++) {
|
||||
if (imgs[i].hasAttribute("src")) {
|
||||
String imageUrl = imgs[i].getAttributeByName("src");
|
||||
if (imageUrl.startsWith("/"))
|
||||
imageUrl = pageURL.getProtocol()+"://"+pageURL.getHost()+imageUrl;
|
||||
else if (imageUrl.startsWith("../")) {
|
||||
imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
|
||||
}
|
||||
else if (!imageUrl.contains("/")) { //then the image is probably in the same folder
|
||||
// e.g. http://www.fao.org/docrep/018/i3328e/i3328e00.htm?utm_source
|
||||
String imageFolder = pageURL.toString().substring(0, pageURL.toString().lastIndexOf("/"));
|
||||
imageUrl= imageFolder + "/" + imageUrl;
|
||||
}
|
||||
else if (!imageUrl.startsWith("http") ) { //e.g. img/anImage.png
|
||||
imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
|
||||
}
|
||||
String imageUrl = getImageUrlFromSrcAttribute(pageURL, imgs[i].getAttributeByName("src"));
|
||||
images.add(imageUrl);
|
||||
_log.trace("[FOUND image] " + imageUrl);
|
||||
}
|
||||
}
|
||||
return images;
|
||||
}
|
||||
|
||||
/**
|
||||
* There are several ways to refer an image in a HTML, this method use an heuristic to get the actual image url
|
||||
* @param pageURL the url
|
||||
* @param srcAttr the content of the img src attribute
|
||||
* @return the image url ready to be referred outside native environment
|
||||
*/
|
||||
private String getImageUrlFromSrcAttribute(URL pageURL, String srcAttr) {
|
||||
String imageUrl = srcAttr;
|
||||
if (imageUrl.startsWith("/")) //referred as absolute path case
|
||||
imageUrl = pageURL.getProtocol()+"://"+pageURL.getHost()+imageUrl;
|
||||
else if (imageUrl.startsWith("../")) { //relative path case
|
||||
imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
|
||||
}
|
||||
else if (!imageUrl.contains("/")) { //the image is probably in the same folder
|
||||
// e.g. http://www.adomain.com/docrep/018/i3328e/i3328e00.htm?utm_source
|
||||
String imageFolder = pageURL.toString().substring(0, pageURL.toString().lastIndexOf("/"));
|
||||
imageUrl= imageFolder + "/" + imageUrl;
|
||||
}
|
||||
else if (!imageUrl.startsWith("http") ) { //e.g. http://adomain.com/anImage.png
|
||||
imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
|
||||
}
|
||||
return imageUrl;
|
||||
}
|
||||
/**
|
||||
* generate the description parsing the content (Best Guess)
|
||||
* @param link the link to check
|
||||
|
@ -867,7 +855,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
|
|||
HttpsURLConnection.setDefaultSSLSocketFactory(sc.getSocketFactory());
|
||||
} catch (Exception e) {
|
||||
System.out.println("Error" + e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
Loading…
Reference in New Issue