Added a User-Agent property to HTTP requests to avoid 403 errors; refined how content and images are guessed when parsing HTML

git-svn-id: https://svn.research-infrastructures.eu/d4science/gcube/trunk/portlets/user/share-updates@93471 82a268e6-3cf1-43bd-a215-b396298e98cf
2014-03-22 16:32:46 +00:00 · 2014-03-22 16:32:46 +00:00 · 91b8e92a47
parent bb6f5cb846
commit 91b8e92a47
9 changed files with 424 additions and 336 deletions
--- a/.classpath
+++ b/.classpath
@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <classpath>
-	<classpathentry kind="src" output="target/share-updates-1.2.0-SNAPSHOT/WEB-INF/classes" path="src/main/java">
+	<classpathentry kind="src" output="target/share-updates-1.2.1-SNAPSHOT/WEB-INF/classes" path="src/main/java">
 		<attributes>
 			<attribute name="optional" value="true"/>
 			<attribute name="maven.pomderived" value="true"/>
@ -31,5 +31,5 @@
 			<attribute name="maven.pomderived" value="true"/>
 		</attributes>
 	</classpathentry>
-	<classpathentry kind="output" path="target/share-updates-1.2.0-SNAPSHOT/WEB-INF/classes"/>
+	<classpathentry kind="output" path="target/share-updates-1.2.1-SNAPSHOT/WEB-INF/classes"/>
 </classpath>
--- a/.settings/com.google.gdt.eclipse.core.prefs
+++ b/.settings/com.google.gdt.eclipse.core.prefs
@ -1,5 +1,5 @@
 eclipse.preferences.version=1
 jarsExcludedFromWebInfLib=
-lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.0-SNAPSHOT
+lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.1-SNAPSHOT
 warSrcDir=src/main/webapp
 warSrcDirIsOutput=false
--- a/.settings/org.eclipse.wst.common.component
+++ b/.settings/org.eclipse.wst.common.component
@ -4,9 +4,6 @@
    <wb-resource deploy-path="/" source-path="/src/main/webapp" tag="defaultRootSource"/>
    <wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/java"/>
        <wb-resource deploy-path="/WEB-INF/classes" source-path="/target/generated-sources/gwt"/>
-        <dependent-module archiveName="fileupload-progress-bar-1.0.0-SNAPSHOT.jar" deploy-path="/WEB-INF/lib" handle="module:/resource/fileupload-progress-bar/fileupload-progress-bar">
-            <dependency-type>uses</dependency-type>
-        </dependent-module>
    <property name="java-output-path" value="/${module}/target/www/WEB-INF/classes"/>
        <property name="context-root" value="share-updates"/>
  </wb-module>
--- a/pom.xml
+++ b/pom.xml
@ -13,7 +13,7 @@
 	<groupId>org.gcube.portlets.user</groupId>
 	<artifactId>share-updates</artifactId>
 	<packaging>war</packaging>
-	<version>1.2.0-SNAPSHOT</version>
+	<version>1.2.1-SNAPSHOT</version>

 	<name>gCube Share Updates Portlet</name>
 	<description>
@ -106,13 +106,13 @@
 			<groupId>org.gcube.contentmanagement</groupId>
 			<artifactId>storage-manager-core</artifactId>
 			<version>[2.0.0-SNAPSHOT, 3.0.0-SNAPSHOT)</version>
-			<scope>compile</scope>
+			<scope>provided</scope>
 		</dependency>
 		<dependency>
 			<groupId>org.gcube.contentmanagement</groupId>
 			<artifactId>storage-manager-wrapper</artifactId>
 			<version>[2.0.0-SNAPSHOT, 3.0.0-SNAPSHOT)</version>
-			<scope>compile</scope>
+			<scope>provided</scope>
 		</dependency>
 		<dependency>
 			<groupId>org.gcube.applicationsupportlayer</groupId>
--- a/src/main/java/org/gcube/portlets/user/shareupdates/client/ShareUpdates.java
+++ b/src/main/java/org/gcube/portlets/user/shareupdates/client/ShareUpdates.java
@ -10,9 +10,6 @@ import com.google.gwt.user.client.ui.RootPanel;
 */
 public class ShareUpdates implements EntryPoint {
 	
-	/**
-	 * This is the entry point method.
-	 */
 	public void onModuleLoad() {
 		RootPanel.get("shareUpdateDiv").add(new ShareUpdateForm());
 	}
--- a/src/main/java/org/gcube/portlets/user/shareupdates/server/ShareUpdateServiceImpl.java
+++ b/src/main/java/org/gcube/portlets/user/shareupdates/server/ShareUpdateServiceImpl.java
@ -1,12 +1,17 @@
 package org.gcube.portlets.user.shareupdates.server;

+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
 import java.net.HttpURLConnection;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.net.URLConnection;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.HashMap;
@ -57,6 +62,7 @@ import org.gcube.portlets.user.shareupdates.client.ShareUpdateService;
 import org.gcube.portlets.user.shareupdates.client.view.ShareUpdateForm;
 import org.gcube.portlets.user.shareupdates.server.metaseeker.MetaSeeker;
 import org.gcube.portlets.user.shareupdates.server.opengraph.OpenGraph;
+import org.gcube.portlets.user.shareupdates.server.opengraph.OpenGraphNamespace;
 import org.gcube.portlets.user.shareupdates.shared.LinkPreview;
 import org.gcube.portlets.user.shareupdates.shared.UserSettings;
 import org.gcube.portlets.widgets.pickuser.shared.PickingUser;
@ -66,6 +72,8 @@ import org.gcube.vomanagement.usermanagement.impl.liferay.LiferayGroupManager;
 import org.gcube.vomanagement.usermanagement.impl.liferay.LiferayUserManager;
 import org.gcube.vomanagement.usermanagement.model.GroupModel;
 import org.gcube.vomanagement.usermanagement.model.UserModel;
+import org.htmlcleaner.HtmlCleaner;
+import org.htmlcleaner.TagNode;
 import org.htmlparser.beans.StringBean;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -134,8 +142,8 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements	Shar
 		if (user == null) {
 			_log.warn("USER IS NULL setting test.user and Running OUTSIDE PORTAL");
 			user = "test.user";
-//			user = "massimiliano.assante";
-//			SessionManager.getInstance().getASLSession(sessionID, user).setScope("/gcube/devsec/devVRE");
+			user = "massimiliano.assante";
+			SessionManager.getInstance().getASLSession(sessionID, user).setScope("/gcube/devsec/devVRE");
 			withinPortal = false;
 		}
 		else {
@ -278,10 +286,6 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements	Shar
 		return escapedFeedText;
 	}

-	private UserSettings getUserSettingsFromSession() {
-		return (UserSettings) getASLSession().getAttribute(UserInfo.USER_INFO_ATTR);
-	}
-
 	private void setUserSettingsInSession(UserSettings user) {
 		getASLSession().setAttribute(UserInfo.USER_INFO_ATTR, user);
 	}
@ -357,13 +361,14 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements	Shar
 				return toReturn;
 			}
 			else {
-				_log.info("Returning test USER");
+
+				_log.info("Returning test USER = " + session.getUsername());
 				HashMap<String, String> fakeVreNames = new  HashMap<String, String>();
-				fakeVreNames.put("/gcube/devsec/devVRE","devVRE");
+				//fakeVreNames.put("/gcube/devsec/devVRE","devVRE");
 				//fakeVreNames.put("/gcube/devNext/NexNext","NexNext");

-				UserInfo user =  new UserInfo(getASLSession().getUsername(), fullName, thumbnailURL, email, "fakeAccountUrl", true, false, fakeVreNames);
-				return new UserSettings(user, 0, session.getScopeName(), isInfrastructureScope());
+				UserInfo user =  new UserInfo(session.getUsername(), fullName, thumbnailURL, email, "fakeAccountUrl", true, false, fakeVreNames);
+				return new UserSettings(user, 0, session.getScopeName(), false);
 			}

 		} catch (Exception e) {
@ -638,6 +643,9 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements	Shar
 			_log.error("url is not reachable");
 			return null;
 		}
+		//pretend you're a browser (make my request from Java more “browsery-like”.) 
+		siteConnection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
+
 		String title;
 		String description;
 		ArrayList<String> imageUrls = new ArrayList<String>();
@ -651,7 +659,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements	Shar
 			if (ogLink == null || ogLink.getContent("title") == null) { 
 				//there is no OpenGraph for this link
 				_log.info("No OpenGraph Found, going Best guess from page content") ;
-				toReturn =  getInfoFromHTML(pageURL, linkToCheck, host);				
+				toReturn =  getInfoFromHTML(siteConnection, pageURL, linkToCheck, host);				
 			} else {
 				//there is OpenGraph
 				title =  ogLink.getContent("title");
@ -661,7 +669,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements	Shar
 				if (ogLink.getContent("image") != null) 
 					imageUrls.add(ogLink.getContent("image"));	
 				else {
-					ArrayList<String> images = getImagesFromHTML(pageURL);
+					ArrayList<String> images = getImagesFromHTML(siteConnection, pageURL);
 					if (! images.isEmpty())
 						imageUrls = images;
 				}
@ -681,11 +689,10 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements	Shar
 	 * @return a list of image url
 	 * @throws IOException
 	 */
-	private ArrayList<String> getImagesFromHTML(URL pageURL) throws IOException {
+	private ArrayList<String> getImagesFromHTML(URLConnection connection, URL pageURL) throws IOException {
 		ArrayList<String> toReturn = new ArrayList<String>();
-		InputStream input = pageURL.openStream();
 		try {
-			Document document = new Tidy().parseDOM(input, null);
+			Document document = new Tidy().parseDOM(pageURL.openStream(), null);
 			NodeList imgs = document.getElementsByTagName("img");
 			int upTo =  (imgs.getLength() > 15) ? 15 : imgs.getLength();
 			for (int i = 0; i < upTo; i++) {
@ -707,49 +714,126 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements	Shar
 	 * @return a LinPreview object instance filled with the extracted information
 	 * @throws IOException
 	 */
-	private LinkPreview getInfoFromHTML(URL pageUrl, String link, String host)  throws Exception {
+	private LinkPreview getInfoFromHTML(URLConnection connection, URL pageUrl, String link, String host)  throws Exception {
 		LinkPreview toReturn = null;
 		String title = "";
 		String description = "";

-		InputStream input = pageUrl.openStream();
-		Document document = new Tidy().parseDOM(input, null);
-		NodeList titles = document.getElementsByTagName("title");
-		if (titles != null && titles.getLength()>0) {
-			if (titles.item(0).getFirstChild() == null || titles.item(0).getFirstChild().getNodeValue() == null) {
-				_log.error("[MANUAL-PARSE] Something wrong with the title element, returning ... ");
-				return toReturn;
-			}
-			title = titles.item(0).getFirstChild().getNodeValue();
+		URLConnection conn = pageUrl.openConnection();
+		//pretend you're a browser (make my request from Java more “browsery-like”.) 
+		conn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
+
 		MetaSeeker ms = null;
 		try {
-				ms = new MetaSeeker(link);
-			} catch(Exception e) {
-				_log.error("[MANUAL-PARSE] Something wrong with the meta seeker returning ... ");
-				return toReturn;
-			}
+			title = getTitleFromHeader(pageUrl);
+			ms = new MetaSeeker(connection, pageUrl);
+			
 			//try the metadata, otherwise ask the guesser
 			description = (ms.getContent("description") != null &&  ! ms.getContent("description").isEmpty()) ?  ms.getContent("description") : createDescriptionFromContent(link);

 			ArrayList<String> images = new ArrayList<String>();
-			NodeList imgs = document.getElementsByTagName("img");
-			int upTo =  (imgs.getLength() > 15) ? 15 : imgs.getLength();
+			images = getImagesWithCleaner(pageUrl);
+			toReturn = new LinkPreview(title, description, link, host, images);
+			
+		} catch(Exception e) {
+			_log.error("[MANUAL-PARSE] Something wrong with the meta seeker returning ... ");
+			return toReturn;
+		}
+		return toReturn;
+	}
+	/**
+	 * Reads the title of a web page by downloading only its head section.
+	 * <p>
+	 * A new connection to the page is opened with a browser-like User-Agent. The response is read
+	 * line by line, using the charset reported by {@link OpenGraph#getConnectionCharset(URLConnection)},
+	 * up to and including the first line that contains the literal (case-sensitive) string
+	 * "&lt;/head&gt;"; that line is cut right after the closing tag and an empty body is appended so
+	 * that the fragment can be parsed. If no line contains "&lt;/head&gt;" the whole response is buffered.
+	 * The fragment is parsed with HtmlCleaner and the string form of the first child of the first
+	 * title element is returned; the value is also printed on standard output.
+	 * <p>
+	 * NOTE(review): the reader opened here is not closed in this method.
+	 * NOTE(review): presumably a title element without children makes getChildren().get(0) throw;
+	 * verify against the HtmlCleaner API.
+	 *
+	 * @param pageURL the URL of the page whose title is wanted
+	 * @return the title of the page, or null if the parsed head contains no title element
+	 * @throws IOException if the connection cannot be opened or the response cannot be read
+	 */
+	private String getTitleFromHeader(URL pageURL) throws IOException {
+		URLConnection conn = pageURL.openConnection();
+		// send a browser-like User-Agent: the default Java one gets 403 errors from some sites
+		conn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
+
+		Charset charset = OpenGraph.getConnectionCharset(conn);
+		BufferedReader dis = new BufferedReader(new InputStreamReader(conn.getInputStream(), charset));
+		String inputLine;
+		StringBuffer headContents = new StringBuffer();
+
+		// Loop through each line, looking for the closing head element
+		while ((inputLine = dis.readLine()) != null)
+		{
+			if (inputLine.contains("</head>")) {
+				inputLine = inputLine.substring(0, inputLine.indexOf("</head>") + 7);
+				inputLine = inputLine.concat("<body></body></html>");
+				headContents.append(inputLine + "\r\n");
+				break;
+			}
+			headContents.append(inputLine + "\r\n");
+		}
+
+		String headContentsStr = headContents.toString();
+		HtmlCleaner cleaner = new HtmlCleaner();
+		// parse the string HTML
+		TagNode pageData = cleaner.clean(headContentsStr);
+		// open only the title tags
+		TagNode[] title = pageData.getElementsByName("title", true);
+		if (title != null && title.length > 0) {
+			String theTitle = title[0].getChildren().get(0).toString();
+			System.out.println("theTitle: " + theTitle);
+			return theTitle;
+		}
+		return null;
+	}
+
+	/**
+	 * Reads the image URLs of a page with the HtmlCleaner API (alternative to use when jTidy has problems).
+	 * @param pageURL the URL of the page to scan for img elements
+	 * @return the list of image URLs found (at most 15), empty if the page has none
+	 * @throws IOException
+	 */
+	private ArrayList<String> getImagesWithCleaner(URL pageURL) throws IOException {
+		ArrayList<String> images = new ArrayList<String>();
+		URLConnection conn = pageURL.openConnection();
+		//pretend you're a browser (make my request from Java more “browsery-like”.) 
+		conn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
+
+		Charset charset = OpenGraph.getConnectionCharset(conn);
+		BufferedReader dis = new BufferedReader(new InputStreamReader(conn.getInputStream(), charset));
+		String inputLine;
+		StringBuffer headContents = new StringBuffer();
+
+		// Read the whole page line by line (no early stop at the closing head element)
+		while ((inputLine = dis.readLine()) != null) {
+			headContents.append(inputLine + "\r\n");
+		}
+
+		String headContentsStr = headContents.toString();
+		HtmlCleaner cleaner = new HtmlCleaner();
+		// parse the string HTML
+		TagNode pageData = cleaner.clean(headContentsStr);
+		// collect the img elements of the page
+		TagNode[] imgs = pageData.getElementsByName("img", true);
+		int upTo =  (imgs.length > 15) ? 15 : imgs.length;
 		for (int i = 0; i < upTo; i++) {
-				String imageUrl = imgs.item(i).getAttributes().getNamedItem("src").getNodeValue();
+			if (imgs[i].hasAttribute("src")) { 
+				String imageUrl = imgs[i].getAttributeByName("src");
 				if (imageUrl.startsWith("/")) 
-					imageUrl = pageUrl.getProtocol()+"://"+pageUrl.getHost()+imageUrl;
+					imageUrl = pageURL.getProtocol()+"://"+pageURL.getHost()+imageUrl;
+				else if (imageUrl.startsWith("../")) {
+					imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl :  pageURL.toExternalForm() + "/" + imageUrl;
+				}
 				else if (!imageUrl.contains("/")) {  //then the image is probably in the same folder
 					// e.g. http://www.fao.org/docrep/018/i3328e/i3328e00.htm?utm_source
-					String imageFolder = pageUrl.toString().substring(0, pageUrl.toString().lastIndexOf("/"));
+					String imageFolder = pageURL.toString().substring(0, pageURL.toString().lastIndexOf("/"));
 					imageUrl= imageFolder + "/" + imageUrl;
 				}
+				else if (!imageUrl.startsWith("http") ) { //e.g. img/anImage.png
+					imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl :  pageURL.toExternalForm() + "/" + imageUrl;
+				}
 				images.add(imageUrl);						
 				_log.trace("[FOUND image] " + imageUrl);
 			}
-			toReturn = new LinkPreview(title, description, link, host, images);
 		}
-		return toReturn;
+		return images;
 	}
+
 	/**
 	 * generate the description parsing the content (Best Guess)
 	 * @param link the link to check
@ -857,18 +941,5 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements	Shar
 		}
 		return portalUsers;
 	}
-	/**
-	 * 
-	 * @return the workspace instance
-	 * @throws InternalErrorException
-	 * @throws HomeNotFoundException
-	 * @throws WorkspaceFolderNotFoundException
-	 */
-	private Workspace getWorkspace() throws InternalErrorException, HomeNotFoundException, WorkspaceFolderNotFoundException {
-		final ASLSession session = getASLSession();
-		Workspace workspace = HomeLibrary.getUserWorkspace(session.getUsername());
-		return workspace;
-	}
-

 }
--- a/src/main/java/org/gcube/portlets/user/shareupdates/server/UploadToWorkspaceThread.java
+++ b/src/main/java/org/gcube/portlets/user/shareupdates/server/UploadToWorkspaceThread.java
@ -8,6 +8,8 @@ import java.util.Date;
 import org.gcube.common.homelibrary.home.HomeLibrary;
 import org.gcube.common.homelibrary.home.workspace.Workspace;
 import org.gcube.common.homelibrary.home.workspace.exceptions.ItemAlreadyExistException;
+import org.gcube.common.portal.PortalContext;
+import org.gcube.common.scope.api.ScopeProvider;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@ -46,26 +48,37 @@ public class UploadToWorkspaceThread implements Runnable {
 	@Override
 	public void run() {
 		try {
+			String currScope = ScopeProvider.instance.get();
+			ScopeProvider.instance.set("/"+PortalContext.getConfiguration().getInfrastructureName());
+			
 			Workspace ws = HomeLibrary
 					.getHomeManagerFactory()
 					.getHomeManager()
 					.getHome(username).getWorkspace();
 			
+			_log.info("File to upload="+fileabsolutePathOnServer);
 			File file = new File(fileabsolutePathOnServer);
 			String mimeType = ShareUpdateServiceImpl.getMimeType(file, fileName);
 			InputStream fileData = new FileInputStream(file);
 			String theId = "";
+			_log.info("mimeType="+mimeType + " fileData null? " + (fileData == null) );
 			try {
 				theId = ws.createExternalFile(fileName ,"File added automatically by Share Updates" , mimeType ,fileData, ws.getRoot().getId()).getId();
 			}
+			catch (NullPointerException exn) {
+				_log.warn("null pointer");
+				 exn.printStackTrace();
+			}
 			catch (ItemAlreadyExistException ex) {			
 				_log.warn("fileName " + fileName + " exists, appending timestamp");
 				 theId = ws.createExternalFile(fileName+" ("+ new Date()+")" ,"File added automatically by Share Updates" , mimeType ,fileData, ws.getRoot().getId()).getId();
+				 ex.printStackTrace();
 			} finally {
 				fileData.close();
 			}
 			fileData.close();
 			_log.debug("Uploaded " + fileName + " - Returned Workspace id=" + theId);
+			ScopeProvider.instance.set(currScope);
 		}

 		catch (Exception e) {			
--- a/src/main/java/org/gcube/portlets/user/shareupdates/server/metaseeker/MetaSeeker.java
+++ b/src/main/java/org/gcube/portlets/user/shareupdates/server/metaseeker/MetaSeeker.java
@ -3,9 +3,12 @@ package org.gcube.portlets.user.shareupdates.server.metaseeker;
 import java.io.BufferedReader;
 import java.io.InputStreamReader;
 import java.net.URL;
+import java.net.URLConnection;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Hashtable;

+import org.gcube.portlets.user.shareupdates.server.opengraph.OpenGraph;
 import org.htmlcleaner.HtmlCleaner;
 import org.htmlcleaner.TagNode;
 import org.slf4j.Logger;
@ -61,12 +64,11 @@ public class MetaSeeker {
 	 * @param url The address to the web page to fetch the meta
 	 * @throws java.io.IOException If a network error occurs, the HTML parser will throw an IO Exception
 	 */
-	public MetaSeeker(String url) throws java.io.IOException, Exception {
+	public MetaSeeker(URLConnection connection, URL httpURL) throws java.io.IOException, Exception {
 		this();
 		isImported = true;
-		// download the (X)HTML content, but only up to the closing head tag. We do not want to waste resources parsing irrelevant content
-		URL httpURL = new URL(url);
-		BufferedReader dis = new BufferedReader(new InputStreamReader(httpURL.openStream()));
+		Charset charset = OpenGraph.getConnectionCharset(connection);
+		BufferedReader dis  = new BufferedReader(new InputStreamReader(connection.getInputStream(), charset));
 		String inputLine;
 		StringBuffer headContents = new StringBuffer();

--- a/src/main/java/org/gcube/portlets/user/shareupdates/server/opengraph/OpenGraph.java
+++ b/src/main/java/org/gcube/portlets/user/shareupdates/server/opengraph/OpenGraph.java
@ -4,6 +4,7 @@ import org.htmlcleaner.HtmlCleaner;
 import org.htmlcleaner.TagNode;

 import java.io.BufferedReader;
+import java.io.IOException;
 import java.io.InputStreamReader;
 import java.net.URL;
 import java.net.URLConnection;
@ -163,9 +164,16 @@ public class OpenGraph
 	 * @return the Charset object for response charset name;
 	 *         if it's not found then the default charset.
 	 */
-    private static Charset getConnectionCharset(URLConnection connection)
-    {
-        String contentType = connection.getContentType();
+	public static Charset getConnectionCharset(URLConnection connection) {
+		String contentType = null;
+		try {
+			contentType = connection.getContentType();
+		}
+		catch (Exception e) {
+			// the content type could not be read from the connection,
+			// so fall back to the default charset
+			return Charset.defaultCharset();
+		}
 		if (contentType != null && contentType.length() > 0)
 		{
 			contentType = contentType.toLowerCase();