added user agent property to http requests to avoid getting 403 errors, refined the way to guess content and images when parsing HTML

git-svn-id: https://svn.research-infrastructures.eu/d4science/gcube/trunk/portlets/user/share-updates@93471 82a268e6-3cf1-43bd-a215-b396298e98cf
This commit is contained in:
Massimiliano Assante 2014-03-22 16:32:46 +00:00
parent bb6f5cb846
commit 91b8e92a47
9 changed files with 424 additions and 336 deletions

View File

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" output="target/share-updates-1.2.0-SNAPSHOT/WEB-INF/classes" path="src/main/java">
<classpathentry kind="src" output="target/share-updates-1.2.1-SNAPSHOT/WEB-INF/classes" path="src/main/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
@ -31,5 +31,5 @@
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="output" path="target/share-updates-1.2.0-SNAPSHOT/WEB-INF/classes"/>
<classpathentry kind="output" path="target/share-updates-1.2.1-SNAPSHOT/WEB-INF/classes"/>
</classpath>

View File

@ -1,5 +1,5 @@
eclipse.preferences.version=1
jarsExcludedFromWebInfLib=
lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.0-SNAPSHOT
lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.1-SNAPSHOT
warSrcDir=src/main/webapp
warSrcDirIsOutput=false

View File

@ -4,9 +4,6 @@
<wb-resource deploy-path="/" source-path="/src/main/webapp" tag="defaultRootSource"/>
<wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/java"/>
<wb-resource deploy-path="/WEB-INF/classes" source-path="/target/generated-sources/gwt"/>
<dependent-module archiveName="fileupload-progress-bar-1.0.0-SNAPSHOT.jar" deploy-path="/WEB-INF/lib" handle="module:/resource/fileupload-progress-bar/fileupload-progress-bar">
<dependency-type>uses</dependency-type>
</dependent-module>
<property name="java-output-path" value="/${module}/target/www/WEB-INF/classes"/>
<property name="context-root" value="share-updates"/>
</wb-module>

View File

@ -13,7 +13,7 @@
<groupId>org.gcube.portlets.user</groupId>
<artifactId>share-updates</artifactId>
<packaging>war</packaging>
<version>1.2.0-SNAPSHOT</version>
<version>1.2.1-SNAPSHOT</version>
<name>gCube Share Updates Portlet</name>
<description>
@ -106,13 +106,13 @@
<groupId>org.gcube.contentmanagement</groupId>
<artifactId>storage-manager-core</artifactId>
<version>[2.0.0-SNAPSHOT, 3.0.0-SNAPSHOT)</version>
<scope>compile</scope>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.gcube.contentmanagement</groupId>
<artifactId>storage-manager-wrapper</artifactId>
<version>[2.0.0-SNAPSHOT, 3.0.0-SNAPSHOT)</version>
<scope>compile</scope>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.gcube.applicationsupportlayer</groupId>

View File

@ -10,9 +10,6 @@ import com.google.gwt.user.client.ui.RootPanel;
*/
public class ShareUpdates implements EntryPoint {
/**
 * GWT entry point: attaches a new {@link ShareUpdateForm} to the host page
 * element whose id is "shareUpdateDiv".
 */
public void onModuleLoad() {
    // look up the host-page slot first, then build the form that goes into it
    final RootPanel shareUpdateSlot = RootPanel.get("shareUpdateDiv");
    shareUpdateSlot.add(new ShareUpdateForm());
}

View File

@ -1,12 +1,17 @@
package org.gcube.portlets.user.shareupdates.server;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
@ -57,6 +62,7 @@ import org.gcube.portlets.user.shareupdates.client.ShareUpdateService;
import org.gcube.portlets.user.shareupdates.client.view.ShareUpdateForm;
import org.gcube.portlets.user.shareupdates.server.metaseeker.MetaSeeker;
import org.gcube.portlets.user.shareupdates.server.opengraph.OpenGraph;
import org.gcube.portlets.user.shareupdates.server.opengraph.OpenGraphNamespace;
import org.gcube.portlets.user.shareupdates.shared.LinkPreview;
import org.gcube.portlets.user.shareupdates.shared.UserSettings;
import org.gcube.portlets.widgets.pickuser.shared.PickingUser;
@ -66,6 +72,8 @@ import org.gcube.vomanagement.usermanagement.impl.liferay.LiferayGroupManager;
import org.gcube.vomanagement.usermanagement.impl.liferay.LiferayUserManager;
import org.gcube.vomanagement.usermanagement.model.GroupModel;
import org.gcube.vomanagement.usermanagement.model.UserModel;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlparser.beans.StringBean;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -134,8 +142,8 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
if (user == null) {
_log.warn("USER IS NULL setting test.user and Running OUTSIDE PORTAL");
user = "test.user";
// user = "massimiliano.assante";
// SessionManager.getInstance().getASLSession(sessionID, user).setScope("/gcube/devsec/devVRE");
user = "massimiliano.assante";
SessionManager.getInstance().getASLSession(sessionID, user).setScope("/gcube/devsec/devVRE");
withinPortal = false;
}
else {
@ -278,10 +286,6 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
return escapedFeedText;
}
/**
 * Reads the user settings stored in the current ASL session under
 * {@code UserInfo.USER_INFO_ATTR}.
 *
 * @return the session attribute cast to {@link UserSettings}
 */
private UserSettings getUserSettingsFromSession() {
    final ASLSession currentSession = getASLSession();
    final Object stored = currentSession.getAttribute(UserInfo.USER_INFO_ATTR);
    return (UserSettings) stored;
}
/**
 * Stores the given user settings in the current ASL session under
 * {@code UserInfo.USER_INFO_ATTR}.
 *
 * @param user the settings to put in the session
 */
private void setUserSettingsInSession(UserSettings user) {
    final ASLSession currentSession = getASLSession();
    currentSession.setAttribute(UserInfo.USER_INFO_ATTR, user);
}
@ -357,13 +361,14 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
return toReturn;
}
else {
_log.info("Returning test USER");
_log.info("Returning test USER = " + session.getUsername());
HashMap<String, String> fakeVreNames = new HashMap<String, String>();
fakeVreNames.put("/gcube/devsec/devVRE","devVRE");
//fakeVreNames.put("/gcube/devsec/devVRE","devVRE");
//fakeVreNames.put("/gcube/devNext/NexNext","NexNext");
UserInfo user = new UserInfo(getASLSession().getUsername(), fullName, thumbnailURL, email, "fakeAccountUrl", true, false, fakeVreNames);
return new UserSettings(user, 0, session.getScopeName(), isInfrastructureScope());
UserInfo user = new UserInfo(session.getUsername(), fullName, thumbnailURL, email, "fakeAccountUrl", true, false, fakeVreNames);
return new UserSettings(user, 0, session.getScopeName(), false);
}
} catch (Exception e) {
@ -638,6 +643,9 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
_log.error("url is not reachable");
return null;
}
//pretend you're a browser (make my request from Java more browsery-like.)
siteConnection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
String title;
String description;
ArrayList<String> imageUrls = new ArrayList<String>();
@ -651,7 +659,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
if (ogLink == null || ogLink.getContent("title") == null) {
//there is no OpenGraph for this link
_log.info("No OpenGraph Found, going Best guess from page content") ;
toReturn = getInfoFromHTML(pageURL, linkToCheck, host);
toReturn = getInfoFromHTML(siteConnection, pageURL, linkToCheck, host);
} else {
//there is OpenGraph
title = ogLink.getContent("title");
@ -661,7 +669,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
if (ogLink.getContent("image") != null)
imageUrls.add(ogLink.getContent("image"));
else {
ArrayList<String> images = getImagesFromHTML(pageURL);
ArrayList<String> images = getImagesFromHTML(siteConnection, pageURL);
if (! images.isEmpty())
imageUrls = images;
}
@ -681,11 +689,10 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
* @return a list of image url
* @throws IOException
*/
private ArrayList<String> getImagesFromHTML(URL pageURL) throws IOException {
private ArrayList<String> getImagesFromHTML(URLConnection connection, URL pageURL) throws IOException {
ArrayList<String> toReturn = new ArrayList<String>();
InputStream input = pageURL.openStream();
try {
Document document = new Tidy().parseDOM(input, null);
Document document = new Tidy().parseDOM(pageURL.openStream(), null);
NodeList imgs = document.getElementsByTagName("img");
int upTo = (imgs.getLength() > 15) ? 15 : imgs.getLength();
for (int i = 0; i < upTo; i++) {
@ -707,49 +714,126 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
* @return a LinkPreview object instance filled with the extracted information
* @throws IOException
*/
private LinkPreview getInfoFromHTML(URL pageUrl, String link, String host) throws Exception {
private LinkPreview getInfoFromHTML(URLConnection connection, URL pageUrl, String link, String host) throws Exception {
LinkPreview toReturn = null;
String title = "";
String description = "";
InputStream input = pageUrl.openStream();
Document document = new Tidy().parseDOM(input, null);
NodeList titles = document.getElementsByTagName("title");
if (titles != null && titles.getLength()>0) {
if (titles.item(0).getFirstChild() == null || titles.item(0).getFirstChild().getNodeValue() == null) {
_log.error("[MANUAL-PARSE] Something wrong with the title element, returning ... ");
return toReturn;
}
title = titles.item(0).getFirstChild().getNodeValue();
URLConnection conn = pageUrl.openConnection();
//pretend you're a browser (make my request from Java more browsery-like.)
conn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
MetaSeeker ms = null;
try {
ms = new MetaSeeker(link);
} catch(Exception e) {
_log.error("[MANUAL-PARSE] Something wrong with the meta seeker returning ... ");
return toReturn;
}
title = getTitleFromHeader(pageUrl);
ms = new MetaSeeker(connection, pageUrl);
//try the metadata, otherwise ask the guesser
description = (ms.getContent("description") != null && ! ms.getContent("description").isEmpty()) ? ms.getContent("description") : createDescriptionFromContent(link);
ArrayList<String> images = new ArrayList<String>();
NodeList imgs = document.getElementsByTagName("img");
int upTo = (imgs.getLength() > 15) ? 15 : imgs.getLength();
images = getImagesWithCleaner(pageUrl);
toReturn = new LinkPreview(title, description, link, host, images);
} catch(Exception e) {
_log.error("[MANUAL-PARSE] Something wrong with the meta seeker returning ... ");
return toReturn;
}
return toReturn;
}
/**
 * Downloads the page at the given URL up to its closing head tag and extracts
 * the content of the first title element found in it.
 * <p>
 * The request is sent with a browser-like User-Agent header; the page is decoded
 * with the charset returned by {@code OpenGraph.getConnectionCharset}.
 *
 * @param pageURL the URL of the page to read
 * @return the title of the page, or null if no title element is found or it has no content
 * @throws IOException if the connection cannot be opened or the page cannot be read
 */
private String getTitleFromHeader(URL pageURL) throws IOException {
    URLConnection conn = pageURL.openConnection();
    //pretend you're a browser (make my request from Java more browsery-like.)
    conn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
    Charset charset = OpenGraph.getConnectionCharset(conn);

    // method-local accumulator: no need for the synchronized StringBuffer
    StringBuilder headContents = new StringBuilder();
    BufferedReader dis = new BufferedReader(new InputStreamReader(conn.getInputStream(), charset));
    try {
        String inputLine;
        // Loop through each line, looking for the closing head element
        while ((inputLine = dis.readLine()) != null) {
            int headEnd = inputLine.indexOf("</head>");
            if (headEnd >= 0) {
                // keep the line up to and including </head>, then append an empty body
                // so that the parser receives a complete document
                headContents.append(inputLine.substring(0, headEnd + "</head>".length()));
                headContents.append("<body></body></html>").append("\r\n");
                break;
            }
            headContents.append(inputLine).append("\r\n");
        }
    } finally {
        // always release the reader and the underlying connection stream, also when reading fails
        dis.close();
    }

    HtmlCleaner cleaner = new HtmlCleaner();
    // parse the string HTML
    TagNode pageData = cleaner.clean(headContents.toString());
    // open only the title tags
    TagNode[] title = pageData.getElementsByName("title", true);
    // a title element without children (e.g. an empty one) would make get(0) throw
    if (title != null && title.length > 0 && !title[0].getChildren().isEmpty()) {
        String theTitle = title[0].getChildren().get(0).toString();
        _log.debug("theTitle: " + theTitle);
        return theTitle;
    }
    return null;
}
/**
* if jTidy has problems, try with the HtmlCleaner API to read the images
* @param pageURL the URL of the page to read the images from
* @return the URLs of the images found in the page (at most 15), empty if none is found
* @throws IOException
*/
private ArrayList<String> getImagesWithCleaner(URL pageURL) throws IOException {
ArrayList<String> images = new ArrayList<String>();
URLConnection conn = pageURL.openConnection();
//pretend you're a browser (make my request from Java more browsery-like.)
conn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
Charset charset = OpenGraph.getConnectionCharset(conn);
BufferedReader dis = new BufferedReader(new InputStreamReader(conn.getInputStream(), charset));
String inputLine;
StringBuffer headContents = new StringBuffer();
// Loop through each line, looking for the closing head element
while ((inputLine = dis.readLine()) != null) {
headContents.append(inputLine + "\r\n");
}
String headContentsStr = headContents.toString();
HtmlCleaner cleaner = new HtmlCleaner();
// parse the string HTML
TagNode pageData = cleaner.clean(headContentsStr);
// open only the title tags
TagNode[] imgs = pageData.getElementsByName("img", true);
int upTo = (imgs.length > 15) ? 15 : imgs.length;
for (int i = 0; i < upTo; i++) {
String imageUrl = imgs.item(i).getAttributes().getNamedItem("src").getNodeValue();
if (imgs[i].hasAttribute("src")) {
String imageUrl = imgs[i].getAttributeByName("src");
if (imageUrl.startsWith("/"))
imageUrl = pageUrl.getProtocol()+"://"+pageUrl.getHost()+imageUrl;
imageUrl = pageURL.getProtocol()+"://"+pageURL.getHost()+imageUrl;
else if (imageUrl.startsWith("../")) {
imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
}
else if (!imageUrl.contains("/")) { //then the image is probably in the same folder
// e.g. http://www.fao.org/docrep/018/i3328e/i3328e00.htm?utm_source
String imageFolder = pageUrl.toString().substring(0, pageUrl.toString().lastIndexOf("/"));
String imageFolder = pageURL.toString().substring(0, pageURL.toString().lastIndexOf("/"));
imageUrl= imageFolder + "/" + imageUrl;
}
else if (!imageUrl.startsWith("http") ) { //e.g. img/anImage.png
imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
}
images.add(imageUrl);
_log.trace("[FOUND image] " + imageUrl);
}
toReturn = new LinkPreview(title, description, link, host, images);
}
return toReturn;
return images;
}
/**
* generate the description parsing the content (Best Guess)
* @param link the link to check
@ -857,18 +941,5 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
}
return portalUsers;
}
/**
 * Looks up the workspace of the user bound to the current ASL session.
 *
 * @return the workspace instance returned by {@code HomeLibrary.getUserWorkspace}
 * @throws InternalErrorException
 * @throws HomeNotFoundException
 * @throws WorkspaceFolderNotFoundException
 */
private Workspace getWorkspace() throws InternalErrorException, HomeNotFoundException, WorkspaceFolderNotFoundException {
    final String username = getASLSession().getUsername();
    return HomeLibrary.getUserWorkspace(username);
}
}

View File

@ -8,6 +8,8 @@ import java.util.Date;
import org.gcube.common.homelibrary.home.HomeLibrary;
import org.gcube.common.homelibrary.home.workspace.Workspace;
import org.gcube.common.homelibrary.home.workspace.exceptions.ItemAlreadyExistException;
import org.gcube.common.portal.PortalContext;
import org.gcube.common.scope.api.ScopeProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -46,26 +48,37 @@ public class UploadToWorkspaceThread implements Runnable {
@Override
public void run() {
try {
String currScope = ScopeProvider.instance.get();
ScopeProvider.instance.set("/"+PortalContext.getConfiguration().getInfrastructureName());
Workspace ws = HomeLibrary
.getHomeManagerFactory()
.getHomeManager()
.getHome(username).getWorkspace();
_log.info("File to upload="+fileabsolutePathOnServer);
File file = new File(fileabsolutePathOnServer);
String mimeType = ShareUpdateServiceImpl.getMimeType(file, fileName);
InputStream fileData = new FileInputStream(file);
String theId = "";
_log.info("mimeType="+mimeType + " fileData null? " + (fileData == null) );
try {
theId = ws.createExternalFile(fileName ,"File added automatically by Share Updates" , mimeType ,fileData, ws.getRoot().getId()).getId();
}
catch (NullPointerException exn) {
_log.warn("null pointer");
exn.printStackTrace();
}
catch (ItemAlreadyExistException ex) {
_log.warn("fileName " + fileName + " exists, appending timestamp");
theId = ws.createExternalFile(fileName+" ("+ new Date()+")" ,"File added automatically by Share Updates" , mimeType ,fileData, ws.getRoot().getId()).getId();
ex.printStackTrace();
} finally {
fileData.close();
}
fileData.close();
_log.debug("Uploaded " + fileName + " - Returned Workspace id=" + theId);
ScopeProvider.instance.set(currScope);
}
catch (Exception e) {

View File

@ -3,9 +3,12 @@ package org.gcube.portlets.user.shareupdates.server.metaseeker;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Hashtable;
import org.gcube.portlets.user.shareupdates.server.opengraph.OpenGraph;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.slf4j.Logger;
@ -61,12 +64,11 @@ public class MetaSeeker {
* @param url The address to the web page to fetch the meta
* @throws java.io.IOException If a network error occurs, the HTML parser will throw an IO Exception
*/
public MetaSeeker(String url) throws java.io.IOException, Exception {
public MetaSeeker(URLConnection connection, URL httpURL) throws java.io.IOException, Exception {
this();
isImported = true;
// download the (X)HTML content, but only up to the closing head tag. We do not want to waste resources parsing irrelevant content
URL httpURL = new URL(url);
BufferedReader dis = new BufferedReader(new InputStreamReader(httpURL.openStream()));
Charset charset = OpenGraph.getConnectionCharset(connection);
BufferedReader dis = new BufferedReader(new InputStreamReader(connection.getInputStream(), charset));
String inputLine;
StringBuffer headContents = new StringBuffer();

View File

@ -4,6 +4,7 @@ import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
@ -163,9 +164,16 @@ public class OpenGraph
* @return the Charset object for response charset name;
* if it's not found then the default charset.
*/
private static Charset getConnectionCharset(URLConnection connection)
{
String contentType = connection.getContentType();
public static Charset getConnectionCharset(URLConnection connection) {
String contentType = null;
try {
contentType = connection.getContentType();
}
catch (Exception e) {
// specified charset is not found,
// skip it to return the default one
return Charset.defaultCharset();
}
if (contentType != null && contentType.length() > 0)
{
contentType = contentType.toLowerCase();