From 91b8e92a479fb4b01d0827f712b5122b8f8a514c Mon Sep 17 00:00:00 2001 From: Massimiliano Assante Date: Sat, 22 Mar 2014 16:32:46 +0000 Subject: [PATCH] added user agent property to http requests to avoid getting 403 errors, refined the way to guess content and images when parsing HTML git-svn-id: https://svn.research-infrastructures.eu/d4science/gcube/trunk/portlets/user/share-updates@93471 82a268e6-3cf1-43bd-a215-b396298e98cf --- .classpath | 4 +- .settings/com.google.gdt.eclipse.core.prefs | 2 +- .settings/org.eclipse.wst.common.component | 3 - pom.xml | 6 +- .../shareupdates/client/ShareUpdates.java | 3 - .../server/ShareUpdateServiceImpl.java | 233 ++++++--- .../server/UploadToWorkspaceThread.java | 15 +- .../server/metaseeker/MetaSeeker.java | 10 +- .../server/opengraph/OpenGraph.java | 484 +++++++++--------- 9 files changed, 424 insertions(+), 336 deletions(-) diff --git a/.classpath b/.classpath index 3762556..13ea37a 100644 --- a/.classpath +++ b/.classpath @@ -1,6 +1,6 @@ - + @@ -31,5 +31,5 @@ - + diff --git a/.settings/com.google.gdt.eclipse.core.prefs b/.settings/com.google.gdt.eclipse.core.prefs index f4fc4f8..f7ef8a9 100644 --- a/.settings/com.google.gdt.eclipse.core.prefs +++ b/.settings/com.google.gdt.eclipse.core.prefs @@ -1,5 +1,5 @@ eclipse.preferences.version=1 jarsExcludedFromWebInfLib= -lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.0-SNAPSHOT +lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.1-SNAPSHOT warSrcDir=src/main/webapp warSrcDirIsOutput=false diff --git a/.settings/org.eclipse.wst.common.component b/.settings/org.eclipse.wst.common.component index dde9c8f..fb4e233 100644 --- a/.settings/org.eclipse.wst.common.component +++ b/.settings/org.eclipse.wst.common.component @@ -4,9 +4,6 @@ - - uses - diff --git a/pom.xml b/pom.xml index 7e5a4d2..8212263 100644 --- a/pom.xml +++ b/pom.xml @@ -13,7 +13,7 @@ org.gcube.portlets.user share-updates war - 1.2.0-SNAPSHOT + 1.2.1-SNAPSHOT gCube Share Updates Portlet @@ -106,13 +106,13 @@ org.gcube.contentmanagement storage-manager-core [2.0.0-SNAPSHOT, 3.0.0-SNAPSHOT) - compile + provided org.gcube.contentmanagement storage-manager-wrapper [2.0.0-SNAPSHOT, 3.0.0-SNAPSHOT) - compile + provided org.gcube.applicationsupportlayer diff --git a/src/main/java/org/gcube/portlets/user/shareupdates/client/ShareUpdates.java b/src/main/java/org/gcube/portlets/user/shareupdates/client/ShareUpdates.java index 6bdd872..08a9860 100644 --- a/src/main/java/org/gcube/portlets/user/shareupdates/client/ShareUpdates.java +++ b/src/main/java/org/gcube/portlets/user/shareupdates/client/ShareUpdates.java @@ -10,9 +10,6 @@ import com.google.gwt.user.client.ui.RootPanel; */ public class ShareUpdates implements EntryPoint { - /** - * This is the entry point method. 
- */ public void onModuleLoad() { RootPanel.get("shareUpdateDiv").add(new ShareUpdateForm()); } diff --git a/src/main/java/org/gcube/portlets/user/shareupdates/server/ShareUpdateServiceImpl.java b/src/main/java/org/gcube/portlets/user/shareupdates/server/ShareUpdateServiceImpl.java index 432d536..217fa2b 100644 --- a/src/main/java/org/gcube/portlets/user/shareupdates/server/ShareUpdateServiceImpl.java +++ b/src/main/java/org/gcube/portlets/user/shareupdates/server/ShareUpdateServiceImpl.java @@ -1,12 +1,17 @@ package org.gcube.portlets.user.shareupdates.server; +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; import java.net.HttpURLConnection; import java.net.MalformedURLException; import java.net.URL; import java.net.URLConnection; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; @@ -57,6 +62,7 @@ import org.gcube.portlets.user.shareupdates.client.ShareUpdateService; import org.gcube.portlets.user.shareupdates.client.view.ShareUpdateForm; import org.gcube.portlets.user.shareupdates.server.metaseeker.MetaSeeker; import org.gcube.portlets.user.shareupdates.server.opengraph.OpenGraph; +import org.gcube.portlets.user.shareupdates.server.opengraph.OpenGraphNamespace; import org.gcube.portlets.user.shareupdates.shared.LinkPreview; import org.gcube.portlets.user.shareupdates.shared.UserSettings; import org.gcube.portlets.widgets.pickuser.shared.PickingUser; @@ -66,6 +72,8 @@ import org.gcube.vomanagement.usermanagement.impl.liferay.LiferayGroupManager; import org.gcube.vomanagement.usermanagement.impl.liferay.LiferayUserManager; import org.gcube.vomanagement.usermanagement.model.GroupModel; import org.gcube.vomanagement.usermanagement.model.UserModel; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.TagNode; import org.htmlparser.beans.StringBean; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -134,9 +142,9 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar if (user == null) { _log.warn("USER IS NULL setting test.user and Running OUTSIDE PORTAL"); user = "test.user"; -// user = "massimiliano.assante"; -// SessionManager.getInstance().getASLSession(sessionID, user).setScope("/gcube/devsec/devVRE"); - withinPortal = false; + user = "massimiliano.assante"; + SessionManager.getInstance().getASLSession(sessionID, user).setScope("/gcube/devsec/devVRE"); + withinPortal = false; } else { withinPortal = true; @@ -176,12 +184,12 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar e.printStackTrace(); } } - + String linkTitle = preview.getTitle(); String linkDesc = preview.getDescription(); String host = preview.getHost(); String url = preview.getUrl(); - + Date feedDate = new Date(); //this means the user has shared a file without text in it. String textToPost = ""; @@ -190,10 +198,10 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar } else { textToPost = transformUrls(escapedFeedText); } - + ScopeBean scope = new ScopeBean(session.getScope()); String vreId2Set = scope.is(Type.VRE) ? 
scope.toString() : ""; - + Feed toShare = new Feed(UUID.randomUUID().toString(), feedType, username, feedDate, vreId2Set, url, urlThumbnail, textToPost, pLevel, fullName, email, thumbnailURL, linkTitle, linkDesc, host); @@ -233,14 +241,14 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar ClientFeed cf = new ClientFeed(toShare.getKey(), toShare.getType().toString(), username, feedDate, toShare.getUri(), replaceAmpersand(toShare.getDescription()), fullName, email, thumbnailURL, toShare.getLinkTitle(), toShare.getLinkDescription(), toShare.getUriThumbnail(), toShare.getLinkHost()); - - + + //send the notification about this posts to everyone in the group if notifyGroup is true if (pLevel == PrivacyLevel.SINGLE_VRE && vreId != null && vreId.compareTo("") != 0 && notifyGroup) { NotificationsManager nm = new ApplicationNotificationsManager(session, NEWS_FEED_PORTLET_CLASSNAME); Thread thread = new Thread(new PostNotificationsThread(toShare.getKey(), escapedFeedText, ""+session.getGroupId(), nm)); thread.start(); - + } //send the notification to the mentioned users if (mentionedUsers != null && mentionedUsers.size() > 0) { @@ -248,7 +256,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar Thread thread = new Thread(new MentionNotificationsThread(toShare.getKey(), escapedFeedText, nm, mentionedUsers)); thread.start(); } - + //it means I also should upload a copy on the user's Workspace root folder if (fileName != null && filePathOnServer != null) { //The workspace uploader Thread starts here asyncronously @@ -278,10 +286,6 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar return escapedFeedText; } - private UserSettings getUserSettingsFromSession() { - return (UserSettings) getASLSession().getAttribute(UserInfo.USER_INFO_ATTR); - } - private void setUserSettingsInSession(UserSettings user) { getASLSession().setAttribute(UserInfo.USER_INFO_ATTR, user); } @@ -330,7 +334,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar sb.append("shared ").append("a file.").append(" ").toString(); return sb.toString(); } - + @Override public UserSettings getUserSettings() { try { @@ -357,13 +361,14 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar return toReturn; } else { - _log.info("Returning test USER"); + + _log.info("Returning test USER = " + session.getUsername()); HashMap fakeVreNames = new HashMap(); - fakeVreNames.put("/gcube/devsec/devVRE","devVRE"); + //fakeVreNames.put("/gcube/devsec/devVRE","devVRE"); //fakeVreNames.put("/gcube/devNext/NexNext","NexNext"); - UserInfo user = new UserInfo(getASLSession().getUsername(), fullName, thumbnailURL, email, "fakeAccountUrl", true, false, fakeVreNames); - return new UserSettings(user, 0, session.getScopeName(), isInfrastructureScope()); + UserInfo user = new UserInfo(session.getUsername(), fullName, thumbnailURL, email, "fakeAccountUrl", true, false, fakeVreNames); + return new UserSettings(user, 0, session.getScopeName(), false); } } catch (Exception e) { @@ -385,11 +390,11 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar ScopeProvider.instance.set("/"+PortalContext.getConfiguration().getInfrastructureName()); IClient storageClient = new StorageClient(STORAGE_OWNER, AccessType.SHARED, MemoryType.PERSISTENT).getClient(); ScopeProvider.instance.set(currScope); - + String httpURL = ""; //get the url to show, before actually uploading it String smpURI = 
storageClient.getUrl().RFile(remoteFilePath); - + //The storage uploader Thread starts here asyncronously Thread thread = new Thread(new UploadToStorageThread(storageClient, fileName, fileabsolutePathOnServer, remoteFilePath)); thread.start(); @@ -426,7 +431,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar return FilePreviewer.getUnhandledTypePreview(fileName, fileabsolutePathOnServer, httpURL, mimeType); } - + } catch (Exception e) { _log.error("Error while resolving or previewing file"); e.printStackTrace(); @@ -449,14 +454,14 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar */ protected static String getMimeType(File file, String filenameWithExtension) throws IOException { TikaConfig config = TikaConfig.getDefaultConfig(); - Detector detector = config.getDetector(); - TikaInputStream stream = TikaInputStream.get(file); - Metadata metadata = new Metadata(); - metadata.add(Metadata.RESOURCE_NAME_KEY, filenameWithExtension); - MediaType mediaType = detector.detect(stream, metadata); - return mediaType.getBaseType().toString(); + Detector detector = config.getDetector(); + TikaInputStream stream = TikaInputStream.get(file); + Metadata metadata = new Metadata(); + metadata.add(Metadata.RESOURCE_NAME_KEY, filenameWithExtension); + MediaType mediaType = detector.detect(stream, metadata); + return mediaType.getBaseType().toString(); } - + /** * return the id as key and the names as value of the vre a user is subscribed to * @param username @@ -564,11 +569,11 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar } String toReturn = html.replaceAll("&", "&").replaceAll("<", "<") .replaceAll(">", ">"); - + // then replace all the line breaks by
, and all the double spaces by the html version &nbsp; toReturn = toReturn.replaceAll("(\r\n|\n)","<br/>
"); toReturn = toReturn.replaceAll("\\s\\s","  "); - + return toReturn; } @@ -638,6 +643,9 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar _log.error("url is not reachable"); return null; } + //pretend you're a browser (make my request from Java more “browsery-like”.) + siteConnection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11"); + String title; String description; ArrayList imageUrls = new ArrayList(); @@ -651,7 +659,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar if (ogLink == null || ogLink.getContent("title") == null) { //there is no OpenGraph for this link _log.info("No OpenGraph Found, going Best guess from page content") ; - toReturn = getInfoFromHTML(pageURL, linkToCheck, host); + toReturn = getInfoFromHTML(siteConnection, pageURL, linkToCheck, host); } else { //there is OpenGraph title = ogLink.getContent("title"); @@ -661,7 +669,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar if (ogLink.getContent("image") != null) imageUrls.add(ogLink.getContent("image")); else { - ArrayList images = getImagesFromHTML(pageURL); + ArrayList images = getImagesFromHTML(siteConnection, pageURL); if (! images.isEmpty()) imageUrls = images; } @@ -681,11 +689,10 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar * @return a list of image url * @throws IOException */ - private ArrayList getImagesFromHTML(URL pageURL) throws IOException { + private ArrayList getImagesFromHTML(URLConnection connection, URL pageURL) throws IOException { ArrayList toReturn = new ArrayList(); - InputStream input = pageURL.openStream(); try { - Document document = new Tidy().parseDOM(input, null); + Document document = new Tidy().parseDOM(pageURL.openStream(), null); NodeList imgs = document.getElementsByTagName("img"); int upTo = (imgs.getLength() > 15) ? 15 : imgs.getLength(); for (int i = 0; i < upTo; i++) { @@ -707,49 +714,126 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar * @return a LinPreview object instance filled with the extracted information * @throws IOException */ - private LinkPreview getInfoFromHTML(URL pageUrl, String link, String host) throws Exception { + private LinkPreview getInfoFromHTML(URLConnection connection, URL pageUrl, String link, String host) throws Exception { LinkPreview toReturn = null; String title = ""; String description = ""; - InputStream input = pageUrl.openStream(); - Document document = new Tidy().parseDOM(input, null); - NodeList titles = document.getElementsByTagName("title"); - if (titles != null && titles.getLength()>0) { - if (titles.item(0).getFirstChild() == null || titles.item(0).getFirstChild().getNodeValue() == null) { - _log.error("[MANUAL-PARSE] Something wrong with the title element, returning ... "); - return toReturn; - } - title = titles.item(0).getFirstChild().getNodeValue(); - MetaSeeker ms = null; - try { - ms = new MetaSeeker(link); - } catch(Exception e) { - _log.error("[MANUAL-PARSE] Something wrong with the meta seeker returning ... "); - return toReturn; - } + URLConnection conn = pageUrl.openConnection(); + //pretend you're a browser (make my request from Java more “browsery-like”.) 
+ conn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11"); + + MetaSeeker ms = null; + try { + title = getTitleFromHeader(pageUrl); + ms = new MetaSeeker(connection, pageUrl); + //try the metadata, otherwise ask the guesser description = (ms.getContent("description") != null && ! ms.getContent("description").isEmpty()) ? ms.getContent("description") : createDescriptionFromContent(link); ArrayList images = new ArrayList(); - NodeList imgs = document.getElementsByTagName("img"); - int upTo = (imgs.getLength() > 15) ? 15 : imgs.getLength(); - for (int i = 0; i < upTo; i++) { - String imageUrl = imgs.item(i).getAttributes().getNamedItem("src").getNodeValue(); - if (imageUrl.startsWith("/")) - imageUrl = pageUrl.getProtocol()+"://"+pageUrl.getHost()+imageUrl; - else if (!imageUrl.contains("/")) { //then the image is probably in the same folder - // e.g. http://www.fao.org/docrep/018/i3328e/i3328e00.htm?utm_source - String imageFolder = pageUrl.toString().substring(0, pageUrl.toString().lastIndexOf("/")); - imageUrl= imageFolder + "/" + imageUrl; - } - images.add(imageUrl); - _log.trace("[FOUND image] " + imageUrl); - } + images = getImagesWithCleaner(pageUrl); toReturn = new LinkPreview(title, description, link, host, images); + + } catch(Exception e) { + _log.error("[MANUAL-PARSE] Something wrong with the meta seeker returning ... "); + return toReturn; } return toReturn; } + /** + * @param pageURL + * @return the title of the page or null if can't read it + * @throws IOException + */ + private String getTitleFromHeader(URL pageURL) throws IOException { + URLConnection conn = pageURL.openConnection(); + //pretend you're a browser (make my request from Java more “browsery-like”.) + conn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11"); + + Charset charset = OpenGraph.getConnectionCharset(conn); + BufferedReader dis = new BufferedReader(new InputStreamReader(conn.getInputStream(), charset)); + String inputLine; + StringBuffer headContents = new StringBuffer(); + + // Loop through each line, looking for the closing head element + while ((inputLine = dis.readLine()) != null) + { + if (inputLine.contains("")) { + inputLine = inputLine.substring(0, inputLine.indexOf("") + 7); + inputLine = inputLine.concat(""); + headContents.append(inputLine + "\r\n"); + break; + } + headContents.append(inputLine + "\r\n"); + } + + String headContentsStr = headContents.toString(); + HtmlCleaner cleaner = new HtmlCleaner(); + // parse the string HTML + TagNode pageData = cleaner.clean(headContentsStr); + // open only the title tags + TagNode[] title = pageData.getElementsByName("title", true); + if (title != null && title.length > 0) { + String theTitle = title[0].getChildren().get(0).toString(); + System.out.println("theTitle: " + theTitle); + return theTitle; + } + return null; + } + + /** + * if jTidy has problems try with with HtmlCleaner API to read the images + * @param pageURL + * @return the title of the page or null if can't read it + * @throws IOException + */ + private ArrayList getImagesWithCleaner(URL pageURL) throws IOException { + ArrayList images = new ArrayList(); + URLConnection conn = pageURL.openConnection(); + //pretend you're a browser (make my request from Java more “browsery-like”.) 
+ conn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11"); + + Charset charset = OpenGraph.getConnectionCharset(conn); + BufferedReader dis = new BufferedReader(new InputStreamReader(conn.getInputStream(), charset)); + String inputLine; + StringBuffer headContents = new StringBuffer(); + + // Loop through each line, looking for the closing head element + while ((inputLine = dis.readLine()) != null) { + headContents.append(inputLine + "\r\n"); + } + + String headContentsStr = headContents.toString(); + HtmlCleaner cleaner = new HtmlCleaner(); + // parse the string HTML + TagNode pageData = cleaner.clean(headContentsStr); + // open only the title tags + TagNode[] imgs = pageData.getElementsByName("img", true); + int upTo = (imgs.length > 15) ? 15 : imgs.length; + for (int i = 0; i < upTo; i++) { + if (imgs[i].hasAttribute("src")) { + String imageUrl = imgs[i].getAttributeByName("src"); + if (imageUrl.startsWith("/")) + imageUrl = pageURL.getProtocol()+"://"+pageURL.getHost()+imageUrl; + else if (imageUrl.startsWith("../")) { + imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl; + } + else if (!imageUrl.contains("/")) { //then the image is probably in the same folder + // e.g. http://www.fao.org/docrep/018/i3328e/i3328e00.htm?utm_source + String imageFolder = pageURL.toString().substring(0, pageURL.toString().lastIndexOf("/")); + imageUrl= imageFolder + "/" + imageUrl; + } + else if (!imageUrl.startsWith("http") ) { //e.g. img/anImage.png + imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl; + } + images.add(imageUrl); + _log.trace("[FOUND image] " + imageUrl); + } + } + return images; + } + /** * generate the description parsing the content (Best Guess) * @param link the link to check @@ -857,18 +941,5 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar } return portalUsers; } - /** - * - * @return the workspace instance - * @throws InternalErrorException - * @throws HomeNotFoundException - * @throws WorkspaceFolderNotFoundException - */ - private Workspace getWorkspace() throws InternalErrorException, HomeNotFoundException, WorkspaceFolderNotFoundException { - final ASLSession session = getASLSession(); - Workspace workspace = HomeLibrary.getUserWorkspace(session.getUsername()); - return workspace; - } - } diff --git a/src/main/java/org/gcube/portlets/user/shareupdates/server/UploadToWorkspaceThread.java b/src/main/java/org/gcube/portlets/user/shareupdates/server/UploadToWorkspaceThread.java index 1d8a0e6..9fc6b82 100644 --- a/src/main/java/org/gcube/portlets/user/shareupdates/server/UploadToWorkspaceThread.java +++ b/src/main/java/org/gcube/portlets/user/shareupdates/server/UploadToWorkspaceThread.java @@ -8,6 +8,8 @@ import java.util.Date; import org.gcube.common.homelibrary.home.HomeLibrary; import org.gcube.common.homelibrary.home.workspace.Workspace; import org.gcube.common.homelibrary.home.workspace.exceptions.ItemAlreadyExistException; +import org.gcube.common.portal.PortalContext; +import org.gcube.common.scope.api.ScopeProvider; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,26 +48,37 @@ public class UploadToWorkspaceThread implements Runnable { @Override public void run() { try { + String currScope = ScopeProvider.instance.get(); + 
ScopeProvider.instance.set("/"+PortalContext.getConfiguration().getInfrastructureName()); + Workspace ws = HomeLibrary .getHomeManagerFactory() .getHomeManager() .getHome(username).getWorkspace(); - + + _log.info("File to upload="+fileabsolutePathOnServer); File file = new File(fileabsolutePathOnServer); String mimeType = ShareUpdateServiceImpl.getMimeType(file, fileName); InputStream fileData = new FileInputStream(file); String theId = ""; + _log.info("mimeType="+mimeType + " fileData null? " + (fileData == null) ); try { theId = ws.createExternalFile(fileName ,"File added automatically by Share Updates" , mimeType ,fileData, ws.getRoot().getId()).getId(); } + catch (NullPointerException exn) { + _log.warn("null pointer"); + exn.printStackTrace(); + } catch (ItemAlreadyExistException ex) { _log.warn("fileName " + fileName + " exists, appending timestamp"); theId = ws.createExternalFile(fileName+" ("+ new Date()+")" ,"File added automatically by Share Updates" , mimeType ,fileData, ws.getRoot().getId()).getId(); + ex.printStackTrace(); } finally { fileData.close(); } fileData.close(); _log.debug("Uploaded " + fileName + " - Returned Workspace id=" + theId); + ScopeProvider.instance.set(currScope); } catch (Exception e) { diff --git a/src/main/java/org/gcube/portlets/user/shareupdates/server/metaseeker/MetaSeeker.java b/src/main/java/org/gcube/portlets/user/shareupdates/server/metaseeker/MetaSeeker.java index c3ce70d..16d2dab 100644 --- a/src/main/java/org/gcube/portlets/user/shareupdates/server/metaseeker/MetaSeeker.java +++ b/src/main/java/org/gcube/portlets/user/shareupdates/server/metaseeker/MetaSeeker.java @@ -3,9 +3,12 @@ package org.gcube.portlets.user.shareupdates.server.metaseeker; import java.io.BufferedReader; import java.io.InputStreamReader; import java.net.URL; +import java.net.URLConnection; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Hashtable; +import org.gcube.portlets.user.shareupdates.server.opengraph.OpenGraph; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import org.slf4j.Logger; @@ -61,12 +64,11 @@ public class MetaSeeker { * @param url The address to the web page to fetch the meta * @throws java.io.IOException If a network error occurs, the HTML parser will throw an IO Exception */ - public MetaSeeker(String url) throws java.io.IOException, Exception { + public MetaSeeker(URLConnection connection, URL httpURL) throws java.io.IOException, Exception { this(); isImported = true; - // download the (X)HTML content, but only up to the closing head tag. 
We do not want to waste resources parsing irrelevant content - URL httpURL = new URL(url); - BufferedReader dis = new BufferedReader(new InputStreamReader(httpURL.openStream())); + Charset charset = OpenGraph.getConnectionCharset(connection); + BufferedReader dis = new BufferedReader(new InputStreamReader(connection.getInputStream(), charset)); String inputLine; StringBuffer headContents = new StringBuffer(); diff --git a/src/main/java/org/gcube/portlets/user/shareupdates/server/opengraph/OpenGraph.java b/src/main/java/org/gcube/portlets/user/shareupdates/server/opengraph/OpenGraph.java index 769b9a7..46a6ffa 100644 --- a/src/main/java/org/gcube/portlets/user/shareupdates/server/opengraph/OpenGraph.java +++ b/src/main/java/org/gcube/portlets/user/shareupdates/server/opengraph/OpenGraph.java @@ -4,6 +4,7 @@ import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.TagNode; import java.io.BufferedReader; +import java.io.IOException; import java.io.InputStreamReader; import java.net.URL; import java.net.URLConnection; @@ -21,74 +22,74 @@ import java.util.regex.Pattern; */ public class OpenGraph { - private String pageUrl; + private String pageUrl; private ArrayList pageNamespaces; - private Hashtable> metaAttributes; - private String baseType; - private boolean isImported; // determine if the object is a new incarnation or representation of a web page - private boolean hasChanged; // track if object has been changed + private Hashtable> metaAttributes; + private String baseType; + private boolean isImported; // determine if the object is a new incarnation or representation of a web page + private boolean hasChanged; // track if object has been changed - public final static String[] REQUIRED_META = new String[]{"title", "type", "image", "url" }; + public final static String[] REQUIRED_META = new String[]{"title", "type", "image", "url" }; - public final static Hashtable BASE_TYPES = new Hashtable(); - static + public final static Hashtable BASE_TYPES = new Hashtable(); + static { BASE_TYPES.put("activity", new String[] {"activity", "sport"}); BASE_TYPES.put("business", new String[] {"bar", "company", "cafe", "hotel", "restaurant"}); BASE_TYPES.put("group", new String[] {"cause", "sports_league", "sports_team"}); - BASE_TYPES.put("organization", new String[] {"band", "government", "non_profit", "school", "university"}); - BASE_TYPES.put("person", new String[] {"actor", "athlete", "author", "director", "musician", "politician", "profile", "public_figure"}); - BASE_TYPES.put("place", new String[] {"city", "country", "landmark", "state_province"}); - BASE_TYPES.put("product", new String[] {"album", "book", "drink", "food", "game", "movie", "product", "song", "tv_show"}); - BASE_TYPES.put("website", new String[] {"blog", "website", "article"}); + BASE_TYPES.put("organization", new String[] {"band", "government", "non_profit", "school", "university"}); + BASE_TYPES.put("person", new String[] {"actor", "athlete", "author", "director", "musician", "politician", "profile", "public_figure"}); + BASE_TYPES.put("place", new String[] {"city", "country", "landmark", "state_province"}); + BASE_TYPES.put("product", new String[] {"album", "book", "drink", "food", "game", "movie", "product", "song", "tv_show"}); + BASE_TYPES.put("website", new String[] {"blog", "website", "article"}); } - /** - * Create an open graph representation for generating your own Open Graph object - */ - public OpenGraph() + /** + * Create an open graph representation for generating your own Open Graph object + */ + public OpenGraph() 
{ pageNamespaces = new ArrayList(); - metaAttributes = new Hashtable>(); - hasChanged = false; - isImported = false; - } + metaAttributes = new Hashtable>(); + hasChanged = false; + isImported = false; + } - /** - * Fetch the open graph representation from a web site - * @param url The address to the web page to fetch Open Graph data - * @param ignoreSpecErrors Set this option to true if you don't wish to have an exception throw if the page does not conform to the basic 4 attributes - * @throws java.io.IOException If a network error occurs, the HTML parser will throw an IO Exception - * @throws java.lang.Exception A generic exception is throw if the specific page fails to conform to the basic Open Graph standard as define by the constant REQUIRED_META - */ - public OpenGraph(String url, boolean ignoreSpecErrors, URLConnection siteConnection) throws java.io.IOException, Exception { - this(); - isImported = true; + /** + * Fetch the open graph representation from a web site + * @param url The address to the web page to fetch Open Graph data + * @param ignoreSpecErrors Set this option to true if you don't wish to have an exception throw if the page does not conform to the basic 4 attributes + * @throws java.io.IOException If a network error occurs, the HTML parser will throw an IO Exception + * @throws java.lang.Exception A generic exception is throw if the specific page fails to conform to the basic Open Graph standard as define by the constant REQUIRED_META + */ + public OpenGraph(String url, boolean ignoreSpecErrors, URLConnection siteConnection) throws java.io.IOException, Exception { + this(); + isImported = true; - // download the (X)HTML content, but only up to the closing head tag. We do not want to waste resources parsing irrelevant content - Charset charset = getConnectionCharset(siteConnection); - BufferedReader dis = new BufferedReader(new InputStreamReader(siteConnection.getInputStream(), charset)); - String inputLine; - StringBuffer headContents = new StringBuffer(); + // download the (X)HTML content, but only up to the closing head tag. 
We do not want to waste resources parsing irrelevant content + Charset charset = getConnectionCharset(siteConnection); + BufferedReader dis = new BufferedReader(new InputStreamReader(siteConnection.getInputStream(), charset)); + String inputLine; + StringBuffer headContents = new StringBuffer(); - // Loop through each line, looking for the closing head element - while ((inputLine = dis.readLine()) != null) + // Loop through each line, looking for the closing head element + while ((inputLine = dis.readLine()) != null) { - if (inputLine.contains("")) + if (inputLine.contains("")) { - inputLine = inputLine.substring(0, inputLine.indexOf("") + 7); - inputLine = inputLine.concat(""); - headContents.append(inputLine + "\r\n"); - break; - } - headContents.append(inputLine + "\r\n"); - } + inputLine = inputLine.substring(0, inputLine.indexOf("") + 7); + inputLine = inputLine.concat(""); + headContents.append(inputLine + "\r\n"); + break; + } + headContents.append(inputLine + "\r\n"); + } - String headContentsStr = headContents.toString(); - HtmlCleaner cleaner = new HtmlCleaner(); - // parse the string HTML - TagNode pageData = cleaner.clean(headContentsStr); + String headContentsStr = headContents.toString(); + HtmlCleaner cleaner = new HtmlCleaner(); + // parse the string HTML + TagNode pageData = cleaner.clean(headContentsStr); // read in the declared namespaces boolean hasOGspec = false; @@ -100,12 +101,12 @@ public class OpenGraph Matcher matcher = pattern.matcher(namespaceData); while (matcher.find()) { - String prefix = matcher.group(2); + String prefix = matcher.group(2); String documentURI = matcher.group(3); pageNamespaces.add(new OpenGraphNamespace(prefix, documentURI)); if (prefix.equals("og")) hasOGspec = true; - } + } } // some pages do not include the new OG spec @@ -113,17 +114,17 @@ public class OpenGraph if (!hasOGspec) pageNamespaces.add(new OpenGraphNamespace("og", "http:// ogp.me/ns#")); - // open only the meta tags - TagNode[] metaData = pageData.getElementsByName("meta", true); - for (TagNode metaElement : metaData) + // open only the meta tags + TagNode[] metaData = pageData.getElementsByName("meta", true); + for (TagNode metaElement : metaData) { for (OpenGraphNamespace namespace : pageNamespaces) { String target = null; - if (metaElement.hasAttribute("property")) - target = "property"; - else if (metaElement.hasAttribute("name")) - target = "name"; + if (metaElement.hasAttribute("property")) + target = "property"; + else if (metaElement.hasAttribute("name")) + target = "name"; if (target != null && metaElement.getAttributeByName(target).startsWith(namespace.getPrefix() + ":")) { @@ -131,134 +132,141 @@ public class OpenGraph break; } } - } + } - /** - * Check that page conforms to Open Graph protocol - */ - if (!ignoreSpecErrors) + /** + * Check that page conforms to Open Graph protocol + */ + if (!ignoreSpecErrors) { - for (String req : REQUIRED_META) + for (String req : REQUIRED_META) { - if (!metaAttributes.containsKey(req)) - throw new Exception("Does not conform to Open Graph protocol"); - } - } + if (!metaAttributes.containsKey(req)) + throw new Exception("Does not conform to Open Graph protocol"); + } + } - /** - * Has conformed, now determine basic sub type. - */ - baseType = null; + /** + * Has conformed, now determine basic sub type. 
+ */ + baseType = null; String currentType = getContent("type"); - // read the original page url - URL realURL = siteConnection.getURL(); - pageUrl = realURL.toExternalForm(); - } + // read the original page url + URL realURL = siteConnection.getURL(); + pageUrl = realURL.toExternalForm(); + } - /** - * Gets the charset for specified connection. - * Content Type header is parsed to get the charset name. - * - * @param connection the connection. - * @return the Charset object for response charset name; - * if it's not found then the default charset. - */ - private static Charset getConnectionCharset(URLConnection connection) - { - String contentType = connection.getContentType(); - if (contentType != null && contentType.length() > 0) - { - contentType = contentType.toLowerCase(); - String charsetName = extractCharsetName(contentType); - if (charsetName != null && charsetName.length() > 0) - { - try - { - return Charset.forName(charsetName); - } - catch (Exception e) { - // specified charset is not found, - // skip it to return the default one - } - } - } + /** + * Gets the charset for specified connection. + * Content Type header is parsed to get the charset name. + * + * @param connection the connection. + * @return the Charset object for response charset name; + * if it's not found then the default charset. + */ + public static Charset getConnectionCharset(URLConnection connection) { + String contentType = null; + try { + contentType = connection.getContentType(); + } + catch (Exception e) { + // specified charset is not found, + // skip it to return the default one + return Charset.defaultCharset(); + } + if (contentType != null && contentType.length() > 0) + { + contentType = contentType.toLowerCase(); + String charsetName = extractCharsetName(contentType); + if (charsetName != null && charsetName.length() > 0) + { + try + { + return Charset.forName(charsetName); + } + catch (Exception e) { + // specified charset is not found, + // skip it to return the default one + } + } + } - // return the default charset - return Charset.defaultCharset(); - } + // return the default charset + return Charset.defaultCharset(); + } - /** - * Extract the charset name form the content type string. - * Content type string is received from Content-Type header. - * - * @param contentType the content type string, must be not null. - * @return the found charset name or null if not found. - */ - private static String extractCharsetName(String contentType) - { - // split onto media types - final String[] mediaTypes = contentType.split(":"); - if (mediaTypes.length > 0) - { - // use only the first one, and split it on parameters - final String[] params = mediaTypes[0].split(";"); - - // find the charset parameter and return it's value - for (String each : params) - { - each = each.trim(); - if (each.startsWith("charset=")) - { - // return the charset name - return each.substring(8).trim(); - } - } - } - - return null; - } - - /** - * Get the basic type of the Open graph page as per the specification - * @return Base type as defined by specification, null otherwise - */ - public String getBaseType() + /** + * Extract the charset name form the content type string. + * Content type string is received from Content-Type header. + * + * @param contentType the content type string, must be not null. + * @return the found charset name or null if not found. 
+ */ + private static String extractCharsetName(String contentType) { - return baseType; - } + // split onto media types + final String[] mediaTypes = contentType.split(":"); + if (mediaTypes.length > 0) + { + // use only the first one, and split it on parameters + final String[] params = mediaTypes[0].split(";"); - /** - * Get a value of a given Open Graph property - * @param property The Open graph property key - * @return Returns the value of the first property defined, null otherwise - */ - public String getContent(String property) + // find the charset parameter and return it's value + for (String each : params) + { + each = each.trim(); + if (each.startsWith("charset=")) + { + // return the charset name + return each.substring(8).trim(); + } + } + } + + return null; + } + + /** + * Get the basic type of the Open graph page as per the specification + * @return Base type as defined by specification, null otherwise + */ + public String getBaseType() { - if (metaAttributes.containsKey(property) && metaAttributes.get(property).size() > 0) + return baseType; + } + + /** + * Get a value of a given Open Graph property + * @param property The Open graph property key + * @return Returns the value of the first property defined, null otherwise + */ + public String getContent(String property) + { + if (metaAttributes.containsKey(property) && metaAttributes.get(property).size() > 0) return metaAttributes.get(property).get(0).getContent(); else return null; - } + } - /** - * Get all the defined properties of the Open Graph object - * @return An array of all currently defined properties - */ - public MetaElement[] getProperties() + /** + * Get all the defined properties of the Open Graph object + * @return An array of all currently defined properties + */ + public MetaElement[] getProperties() { ArrayList allElements = new ArrayList(); - for (ArrayList collection : metaAttributes.values()) + for (ArrayList collection : metaAttributes.values()) allElements.addAll(collection); return (MetaElement[]) allElements.toArray(new MetaElement[allElements.size()]); - } + } - /** - * Get all the defined properties of the Open Graph object + /** + * Get all the defined properties of the Open Graph object * @param property The property to focus on - * @return An array of all currently defined properties - */ - public MetaElement[] getProperties(String property) + * @return An array of all currently defined properties + */ + public MetaElement[] getProperties(String property) { if (metaAttributes.containsKey(property)) { @@ -267,69 +275,69 @@ public class OpenGraph } else return null; - } + } - /** - * Get the original URL the Open Graph page was obtained from - * @return The address to the Open Graph object page - */ - public String getOriginalUrl() + /** + * Get the original URL the Open Graph page was obtained from + * @return The address to the Open Graph object page + */ + public String getOriginalUrl() { - return pageUrl; - } + return pageUrl; + } - /** - * Get the HTML representation of the Open Graph data. - * @return An array of meta elements as Strings - */ - public String[] toHTML() + /** + * Get the HTML representation of the Open Graph data. 
+ * @return An array of meta elements as Strings + */ + public String[] toHTML() { - // allocate the array - ArrayList returnHTML = new ArrayList(); + // allocate the array + ArrayList returnHTML = new ArrayList(); - int index = 0; // keep track of the index to insert into - for (ArrayList elements : metaAttributes.values()) + int index = 0; // keep track of the index to insert into + for (ArrayList elements : metaAttributes.values()) { for (MetaElement element : elements) - returnHTML.add(""); + returnHTML.add(""); } - // return the array - return (String[]) returnHTML.toArray(); - } + // return the array + return (String[]) returnHTML.toArray(); + } - /** - * Get the XHTML representation of the Open Graph data. - * @return An array of meta elements as Strings - */ - public String[] toXHTML() + /** + * Get the XHTML representation of the Open Graph data. + * @return An array of meta elements as Strings + */ + public String[] toXHTML() { - // allocate the array - ArrayList returnHTML = new ArrayList(); + // allocate the array + ArrayList returnHTML = new ArrayList(); - int index = 0; // keep track of the index to insert into - for (ArrayList elements : metaAttributes.values()) + int index = 0; // keep track of the index to insert into + for (ArrayList elements : metaAttributes.values()) { for (MetaElement element : elements) - returnHTML.add(""); + returnHTML.add(""); } - // return the array - return (String[]) returnHTML.toArray(); - } + // return the array + return (String[]) returnHTML.toArray(); + } - /** - * Set the Open Graph property to a specific value + /** + * Set the Open Graph property to a specific value * @param namespace The OpenGraph namespace the content belongs to - * @param property The og:XXXX where XXXX is the property you wish to set - * @param content The value or contents of the property to be set - */ - public void setProperty(OpenGraphNamespace namespace, String property, String content) + * @param property The og:XXXX where XXXX is the property you wish to set + * @param content The value or contents of the property to be set + */ + public void setProperty(OpenGraphNamespace namespace, String property, String content) { - if (!pageNamespaces.contains(namespace)) + if (!pageNamespaces.contains(namespace)) pageNamespaces.add(namespace); property = property.replaceAll(namespace.getPrefix() + ":", ""); @@ -338,41 +346,41 @@ public class OpenGraph metaAttributes.put(property, new ArrayList()); metaAttributes.get(property).add(element); - } + } - /** - * Removed a defined property - * @param property The og:XXXX where XXXX is the property you wish to remove - */ - public void removeProperty(String property) + /** + * Removed a defined property + * @param property The og:XXXX where XXXX is the property you wish to remove + */ + public void removeProperty(String property) { - metaAttributes.remove(property); - } + metaAttributes.remove(property); + } - /** - * Obtain the underlying HashTable - * @return The underlying structure as a Hashtable - */ - public Hashtable> exposeTable() { - return metaAttributes; - } + /** + * Obtain the underlying HashTable + * @return The underlying structure as a Hashtable + */ + public Hashtable> exposeTable() { + return metaAttributes; + } - /** - * Test if the Open Graph object was initially a representation of a web page - * @return True if the object is from a web page, false otherwise - */ - public boolean isFromWeb() + /** + * Test if the Open Graph object was initially a representation of a web page + * @return True if the object is 
from a web page, false otherwise + */ + public boolean isFromWeb() { - return isImported; - } + return isImported; + } - /** - * Test if the object has been modified by setters/deleters. - * This is only relevant if this object initially represented a web page - * @return True True if the object has been modified, false otherwise - */ - public boolean hasChanged() + /** + * Test if the object has been modified by setters/deleters. + * This is only relevant if this object initially represented a web page + * @return True True if the object has been modified, false otherwise + */ + public boolean hasChanged() { - return hasChanged; - } + return hasChanged; + } }
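For context, the pattern this patch repeats at each fetch point is: add a browser-like User-Agent header to the URLConnection before opening the stream, so hosts that reject the default Java agent no longer answer 403, and read the response with the charset declared in the Content-Type header (as done by OpenGraph.getConnectionCharset). The sketch below is a minimal, self-contained illustration of that pattern under those assumptions, not code from the portlet; the class and method names (BrowserLikeFetch, fetchHead, charsetOf) and the example URL are invented for illustration.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;

/**
 * Minimal sketch (names are illustrative): fetch a page head with a
 * browser-like User-Agent and the charset taken from the Content-Type header.
 */
public class BrowserLikeFetch {

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 "
            + "(KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11";

    /** Reads the page up to the closing head element, pretending to be a browser. */
    public static String fetchHead(String url) throws IOException {
        URLConnection conn = new URL(url).openConnection();
        // Some sites answer 403 to the default Java agent; send a browser-like one.
        conn.addRequestProperty("User-Agent", USER_AGENT);

        Charset charset = charsetOf(conn);
        StringBuilder head = new StringBuilder();
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), charset))) {
            String line;
            while ((line = in.readLine()) != null) {
                head.append(line).append("\r\n");
                if (line.contains("</head>")) {
                    break; // stop early: the metadata we need lives in the head
                }
            }
        }
        return head.toString();
    }

    /** Parses the charset from the Content-Type header, falling back to the default. */
    static Charset charsetOf(URLConnection conn) {
        String contentType = conn.getContentType();
        if (contentType != null) {
            for (String param : contentType.split(";")) {
                param = param.trim().toLowerCase();
                if (param.startsWith("charset=")) {
                    try {
                        return Charset.forName(param.substring("charset=".length()));
                    } catch (Exception ignored) {
                        // unknown charset name: fall through to the default
                    }
                }
            }
        }
        return Charset.defaultCharset();
    }

    public static void main(String[] args) throws IOException {
        System.out.println(fetchHead("https://example.org/"));
    }
}

A single helper like this would also avoid repeating the User-Agent string at each call site in ShareUpdateServiceImpl, MetaSeeker and OpenGraph, which is how the patch currently applies it.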