Added a User-Agent header to HTTP requests to avoid 403 errors; refined the way page content and images are guessed when parsing HTML

git-svn-id: https://svn.research-infrastructures.eu/d4science/gcube/trunk/portlets/user/share-updates@93471 82a268e6-3cf1-43bd-a215-b396298e98cf
Massimiliano Assante 2014-03-22 16:32:46 +00:00
parent bb6f5cb846
commit 91b8e92a47
9 changed files with 424 additions and 336 deletions
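The core of the 403 fix is setting a browser-like User-Agent on each URLConnection before reading from it (see the addRequestProperty calls in ShareUpdateServiceImpl below). A minimal, self-contained sketch of the idea — the class name and example URL here are illustrative, not part of this commit:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;

public class UserAgentFetchSketch {
    public static void main(String[] args) throws Exception {
        URLConnection conn = new URL("http://www.fao.org/").openConnection();
        // Some servers reply 403 to the default "Java/1.x" agent string,
        // so present a regular browser User-Agent before opening the stream.
        conn.addRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 "
                + "(KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
        BufferedReader in = new BufferedReader(new InputStreamReader(conn.getInputStream()));
        System.out.println(in.readLine()); // first line of the response body
        in.close();
    }
}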

View File

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry kind="src" output="target/share-updates-1.2.0-SNAPSHOT/WEB-INF/classes" path="src/main/java">
<classpathentry kind="src" output="target/share-updates-1.2.1-SNAPSHOT/WEB-INF/classes" path="src/main/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
@ -31,5 +31,5 @@
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="output" path="target/share-updates-1.2.0-SNAPSHOT/WEB-INF/classes"/>
<classpathentry kind="output" path="target/share-updates-1.2.1-SNAPSHOT/WEB-INF/classes"/>
</classpath>

View File

@ -1,5 +1,5 @@
eclipse.preferences.version=1
jarsExcludedFromWebInfLib=
lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.0-SNAPSHOT
lastWarOutDir=/Users/massi/Documents/workspace/share-updates/target/share-updates-1.2.1-SNAPSHOT
warSrcDir=src/main/webapp
warSrcDirIsOutput=false

View File

@ -4,9 +4,6 @@
<wb-resource deploy-path="/" source-path="/src/main/webapp" tag="defaultRootSource"/>
<wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/java"/>
<wb-resource deploy-path="/WEB-INF/classes" source-path="/target/generated-sources/gwt"/>
<dependent-module archiveName="fileupload-progress-bar-1.0.0-SNAPSHOT.jar" deploy-path="/WEB-INF/lib" handle="module:/resource/fileupload-progress-bar/fileupload-progress-bar">
<dependency-type>uses</dependency-type>
</dependent-module>
<property name="java-output-path" value="/${module}/target/www/WEB-INF/classes"/>
<property name="context-root" value="share-updates"/>
</wb-module>

View File

@ -13,7 +13,7 @@
<groupId>org.gcube.portlets.user</groupId>
<artifactId>share-updates</artifactId>
<packaging>war</packaging>
<version>1.2.0-SNAPSHOT</version>
<version>1.2.1-SNAPSHOT</version>
<name>gCube Share Updates Portlet</name>
<description>
@ -106,13 +106,13 @@
<groupId>org.gcube.contentmanagement</groupId>
<artifactId>storage-manager-core</artifactId>
<version>[2.0.0-SNAPSHOT, 3.0.0-SNAPSHOT)</version>
<scope>compile</scope>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.gcube.contentmanagement</groupId>
<artifactId>storage-manager-wrapper</artifactId>
<version>[2.0.0-SNAPSHOT, 3.0.0-SNAPSHOT)</version>
<scope>compile</scope>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.gcube.applicationsupportlayer</groupId>

View File

@ -10,9 +10,6 @@ import com.google.gwt.user.client.ui.RootPanel;
*/
public class ShareUpdates implements EntryPoint {
/**
* This is the entry point method.
*/
public void onModuleLoad() {
RootPanel.get("shareUpdateDiv").add(new ShareUpdateForm());
}

View File

@ -1,12 +1,17 @@
package org.gcube.portlets.user.shareupdates.server;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
@ -57,6 +62,7 @@ import org.gcube.portlets.user.shareupdates.client.ShareUpdateService;
import org.gcube.portlets.user.shareupdates.client.view.ShareUpdateForm;
import org.gcube.portlets.user.shareupdates.server.metaseeker.MetaSeeker;
import org.gcube.portlets.user.shareupdates.server.opengraph.OpenGraph;
import org.gcube.portlets.user.shareupdates.server.opengraph.OpenGraphNamespace;
import org.gcube.portlets.user.shareupdates.shared.LinkPreview;
import org.gcube.portlets.user.shareupdates.shared.UserSettings;
import org.gcube.portlets.widgets.pickuser.shared.PickingUser;
@ -66,6 +72,8 @@ import org.gcube.vomanagement.usermanagement.impl.liferay.LiferayGroupManager;
import org.gcube.vomanagement.usermanagement.impl.liferay.LiferayUserManager;
import org.gcube.vomanagement.usermanagement.model.GroupModel;
import org.gcube.vomanagement.usermanagement.model.UserModel;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlparser.beans.StringBean;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -134,9 +142,9 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
if (user == null) {
_log.warn("USER IS NULL setting test.user and Running OUTSIDE PORTAL");
user = "test.user";
// user = "massimiliano.assante";
// SessionManager.getInstance().getASLSession(sessionID, user).setScope("/gcube/devsec/devVRE");
withinPortal = false;
user = "massimiliano.assante";
SessionManager.getInstance().getASLSession(sessionID, user).setScope("/gcube/devsec/devVRE");
withinPortal = false;
}
else {
withinPortal = true;
@ -176,12 +184,12 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
e.printStackTrace();
}
}
String linkTitle = preview.getTitle();
String linkDesc = preview.getDescription();
String host = preview.getHost();
String url = preview.getUrl();
Date feedDate = new Date();
//this means the user has shared a file without text in it.
String textToPost = "";
@ -190,10 +198,10 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
} else {
textToPost = transformUrls(escapedFeedText);
}
ScopeBean scope = new ScopeBean(session.getScope());
String vreId2Set = scope.is(Type.VRE) ? scope.toString() : "";
Feed toShare = new Feed(UUID.randomUUID().toString(), feedType, username, feedDate,
vreId2Set, url, urlThumbnail, textToPost, pLevel, fullName, email, thumbnailURL, linkTitle, linkDesc, host);
@ -233,14 +241,14 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
ClientFeed cf = new ClientFeed(toShare.getKey(), toShare.getType().toString(), username, feedDate, toShare.getUri(),
replaceAmpersand(toShare.getDescription()), fullName, email, thumbnailURL, toShare.getLinkTitle(), toShare.getLinkDescription(),
toShare.getUriThumbnail(), toShare.getLinkHost());
//send the notification about this post to everyone in the group if notifyGroup is true
if (pLevel == PrivacyLevel.SINGLE_VRE && vreId != null && vreId.compareTo("") != 0 && notifyGroup) {
NotificationsManager nm = new ApplicationNotificationsManager(session, NEWS_FEED_PORTLET_CLASSNAME);
Thread thread = new Thread(new PostNotificationsThread(toShare.getKey(), escapedFeedText, ""+session.getGroupId(), nm));
thread.start();
}
//send the notification to the mentioned users
if (mentionedUsers != null && mentionedUsers.size() > 0) {
@ -248,7 +256,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
Thread thread = new Thread(new MentionNotificationsThread(toShare.getKey(), escapedFeedText, nm, mentionedUsers));
thread.start();
}
//this means a copy should also be uploaded to the user's Workspace root folder
if (fileName != null && filePathOnServer != null) {
//The workspace uploader Thread starts here asynchronously
@ -278,10 +286,6 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
return escapedFeedText;
}
private UserSettings getUserSettingsFromSession() {
return (UserSettings) getASLSession().getAttribute(UserInfo.USER_INFO_ATTR);
}
private void setUserSettingsInSession(UserSettings user) {
getASLSession().setAttribute(UserInfo.USER_INFO_ATTR, user);
}
@ -330,7 +334,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
sb.append("<span style=\"color:gray; font-size:12px;\">shared </span><a class=\"link\" href=\"").append(url).append("\" target=\"_blank\">").append("a file.").append("</a> ").toString();
return sb.toString();
}
@Override
public UserSettings getUserSettings() {
try {
@ -357,13 +361,14 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
return toReturn;
}
else {
_log.info("Returning test USER");
_log.info("Returning test USER = " + session.getUsername());
HashMap<String, String> fakeVreNames = new HashMap<String, String>();
fakeVreNames.put("/gcube/devsec/devVRE","devVRE");
//fakeVreNames.put("/gcube/devsec/devVRE","devVRE");
//fakeVreNames.put("/gcube/devNext/NexNext","NexNext");
UserInfo user = new UserInfo(getASLSession().getUsername(), fullName, thumbnailURL, email, "fakeAccountUrl", true, false, fakeVreNames);
return new UserSettings(user, 0, session.getScopeName(), isInfrastructureScope());
UserInfo user = new UserInfo(session.getUsername(), fullName, thumbnailURL, email, "fakeAccountUrl", true, false, fakeVreNames);
return new UserSettings(user, 0, session.getScopeName(), false);
}
} catch (Exception e) {
@ -385,11 +390,11 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
ScopeProvider.instance.set("/"+PortalContext.getConfiguration().getInfrastructureName());
IClient storageClient = new StorageClient(STORAGE_OWNER, AccessType.SHARED, MemoryType.PERSISTENT).getClient();
ScopeProvider.instance.set(currScope);
String httpURL = "";
//get the url to show, before actually uploading it
String smpURI = storageClient.getUrl().RFile(remoteFilePath);
//The storage uploader Thread starts here asynchronously
Thread thread = new Thread(new UploadToStorageThread(storageClient, fileName, fileabsolutePathOnServer, remoteFilePath));
thread.start();
@ -426,7 +431,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
return FilePreviewer.getUnhandledTypePreview(fileName, fileabsolutePathOnServer, httpURL, mimeType);
}
} catch (Exception e) {
_log.error("Error while resolving or previewing file");
e.printStackTrace();
@ -449,14 +454,14 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
*/
protected static String getMimeType(File file, String filenameWithExtension) throws IOException {
TikaConfig config = TikaConfig.getDefaultConfig();
Detector detector = config.getDetector();
TikaInputStream stream = TikaInputStream.get(file);
Metadata metadata = new Metadata();
metadata.add(Metadata.RESOURCE_NAME_KEY, filenameWithExtension);
MediaType mediaType = detector.detect(stream, metadata);
return mediaType.getBaseType().toString();
Detector detector = config.getDetector();
TikaInputStream stream = TikaInputStream.get(file);
Metadata metadata = new Metadata();
metadata.add(Metadata.RESOURCE_NAME_KEY, filenameWithExtension);
MediaType mediaType = detector.detect(stream, metadata);
return mediaType.getBaseType().toString();
}
/**
* return a map with the VRE id as key and the VRE name as value, for the VREs the user is subscribed to
* @param username
@ -564,11 +569,11 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
}
String toReturn = html.replaceAll("&", "&amp;").replaceAll("<", "&lt;")
.replaceAll(">", "&gt;");
// then replace all line breaks with <br/>, and all double spaces with the HTML entity &nbsp;
toReturn = toReturn.replaceAll("(\r\n|\n)","<br />");
toReturn = toReturn.replaceAll("\\s\\s","&nbsp;&nbsp;");
return toReturn;
}
@ -638,6 +643,9 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
_log.error("url is not reachable");
return null;
}
//pretend to be a browser (make the request from Java look more like a browser's)
siteConnection.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
String title;
String description;
ArrayList<String> imageUrls = new ArrayList<String>();
@ -651,7 +659,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
if (ogLink == null || ogLink.getContent("title") == null) {
//there is no OpenGraph for this link
_log.info("No OpenGraph Found, going Best guess from page content") ;
toReturn = getInfoFromHTML(pageURL, linkToCheck, host);
toReturn = getInfoFromHTML(siteConnection, pageURL, linkToCheck, host);
} else {
//there is OpenGraph
title = ogLink.getContent("title");
@ -661,7 +669,7 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
if (ogLink.getContent("image") != null)
imageUrls.add(ogLink.getContent("image"));
else {
ArrayList<String> images = getImagesFromHTML(pageURL);
ArrayList<String> images = getImagesFromHTML(siteConnection, pageURL);
if (! images.isEmpty())
imageUrls = images;
}
@ -681,11 +689,10 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
* @return a list of image URLs
* @throws IOException
*/
private ArrayList<String> getImagesFromHTML(URL pageURL) throws IOException {
private ArrayList<String> getImagesFromHTML(URLConnection connection, URL pageURL) throws IOException {
ArrayList<String> toReturn = new ArrayList<String>();
InputStream input = pageURL.openStream();
try {
Document document = new Tidy().parseDOM(input, null);
Document document = new Tidy().parseDOM(pageURL.openStream(), null);
NodeList imgs = document.getElementsByTagName("img");
int upTo = (imgs.getLength() > 15) ? 15 : imgs.getLength();
for (int i = 0; i < upTo; i++) {
@ -707,49 +714,126 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
* @return a LinkPreview object instance filled with the extracted information
* @throws IOException
*/
private LinkPreview getInfoFromHTML(URL pageUrl, String link, String host) throws Exception {
private LinkPreview getInfoFromHTML(URLConnection connection, URL pageUrl, String link, String host) throws Exception {
LinkPreview toReturn = null;
String title = "";
String description = "";
InputStream input = pageUrl.openStream();
Document document = new Tidy().parseDOM(input, null);
NodeList titles = document.getElementsByTagName("title");
if (titles != null && titles.getLength()>0) {
if (titles.item(0).getFirstChild() == null || titles.item(0).getFirstChild().getNodeValue() == null) {
_log.error("[MANUAL-PARSE] Something wrong with the title element, returning ... ");
return toReturn;
}
title = titles.item(0).getFirstChild().getNodeValue();
MetaSeeker ms = null;
try {
ms = new MetaSeeker(link);
} catch(Exception e) {
_log.error("[MANUAL-PARSE] Something wrong with the meta seeker returning ... ");
return toReturn;
}
URLConnection conn = pageUrl.openConnection();
//pretend to be a browser (make the request from Java look more like a browser's)
conn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
MetaSeeker ms = null;
try {
title = getTitleFromHeader(pageUrl);
ms = new MetaSeeker(connection, pageUrl);
//try the metadata, otherwise ask the guesser
description = (ms.getContent("description") != null && ! ms.getContent("description").isEmpty()) ? ms.getContent("description") : createDescriptionFromContent(link);
ArrayList<String> images = new ArrayList<String>();
NodeList imgs = document.getElementsByTagName("img");
int upTo = (imgs.getLength() > 15) ? 15 : imgs.getLength();
for (int i = 0; i < upTo; i++) {
String imageUrl = imgs.item(i).getAttributes().getNamedItem("src").getNodeValue();
if (imageUrl.startsWith("/"))
imageUrl = pageUrl.getProtocol()+"://"+pageUrl.getHost()+imageUrl;
else if (!imageUrl.contains("/")) { //then the image is probably in the same folder
// e.g. http://www.fao.org/docrep/018/i3328e/i3328e00.htm?utm_source
String imageFolder = pageUrl.toString().substring(0, pageUrl.toString().lastIndexOf("/"));
imageUrl= imageFolder + "/" + imageUrl;
}
images.add(imageUrl);
_log.trace("[FOUND image] " + imageUrl);
}
images = getImagesWithCleaner(pageUrl);
toReturn = new LinkPreview(title, description, link, host, images);
} catch(Exception e) {
_log.error("[MANUAL-PARSE] Something wrong with the meta seeker returning ... ");
return toReturn;
}
return toReturn;
}
/**
* @param pageURL
* @return the title of the page, or null if it cannot be read
* @throws IOException
*/
private String getTitleFromHeader(URL pageURL) throws IOException {
URLConnection conn = pageURL.openConnection();
//pretend to be a browser (make the request from Java look more like a browser's)
conn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
Charset charset = OpenGraph.getConnectionCharset(conn);
BufferedReader dis = new BufferedReader(new InputStreamReader(conn.getInputStream(), charset));
String inputLine;
StringBuffer headContents = new StringBuffer();
// Loop through each line, looking for the closing head element
while ((inputLine = dis.readLine()) != null)
{
if (inputLine.contains("</head>")) {
inputLine = inputLine.substring(0, inputLine.indexOf("</head>") + 7);
inputLine = inputLine.concat("<body></body></html>");
headContents.append(inputLine + "\r\n");
break;
}
headContents.append(inputLine + "\r\n");
}
String headContentsStr = headContents.toString();
HtmlCleaner cleaner = new HtmlCleaner();
// parse the string HTML
TagNode pageData = cleaner.clean(headContentsStr);
// open only the title tags
TagNode[] title = pageData.getElementsByName("title", true);
if (title != null && title.length > 0) {
String theTitle = title[0].getChildren().get(0).toString();
System.out.println("theTitle: " + theTitle);
return theTitle;
}
return null;
}
/**
* if JTidy has problems, fall back to the HtmlCleaner API to read the images
* @param pageURL
* @return the image URLs found in the page (possibly empty)
* @throws IOException
*/
private ArrayList<String> getImagesWithCleaner(URL pageURL) throws IOException {
ArrayList<String> images = new ArrayList<String>();
URLConnection conn = pageURL.openConnection();
//pretend to be a browser (make the request from Java look more like a browser's)
conn.addRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11");
Charset charset = OpenGraph.getConnectionCharset(conn);
BufferedReader dis = new BufferedReader(new InputStreamReader(conn.getInputStream(), charset));
String inputLine;
StringBuffer headContents = new StringBuffer();
// Read the whole page line by line (img tags can appear anywhere in the body)
while ((inputLine = dis.readLine()) != null) {
headContents.append(inputLine + "\r\n");
}
String headContentsStr = headContents.toString();
HtmlCleaner cleaner = new HtmlCleaner();
// parse the string HTML
TagNode pageData = cleaner.clean(headContentsStr);
// open only the img tags
TagNode[] imgs = pageData.getElementsByName("img", true);
int upTo = (imgs.length > 15) ? 15 : imgs.length;
for (int i = 0; i < upTo; i++) {
if (imgs[i].hasAttribute("src")) {
String imageUrl = imgs[i].getAttributeByName("src");
if (imageUrl.startsWith("/"))
imageUrl = pageURL.getProtocol()+"://"+pageURL.getHost()+imageUrl;
else if (imageUrl.startsWith("../")) {
imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
}
else if (!imageUrl.contains("/")) { //then the image is probably in the same folder
// e.g. http://www.fao.org/docrep/018/i3328e/i3328e00.htm?utm_source
String imageFolder = pageURL.toString().substring(0, pageURL.toString().lastIndexOf("/"));
imageUrl= imageFolder + "/" + imageUrl;
}
else if (!imageUrl.startsWith("http") ) { //e.g. img/anImage.png
imageUrl = pageURL.toExternalForm().endsWith("/") ? pageURL.toExternalForm() + imageUrl : pageURL.toExternalForm() + "/" + imageUrl;
}
images.add(imageUrl);
_log.trace("[FOUND image] " + imageUrl);
}
}
return images;
}
/**
* generate the description by parsing the page content (best guess)
* @param link the link to check
@ -857,18 +941,5 @@ public class ShareUpdateServiceImpl extends RemoteServiceServlet implements Shar
}
return portalUsers;
}
/**
*
* @return the workspace instance
* @throws InternalErrorException
* @throws HomeNotFoundException
* @throws WorkspaceFolderNotFoundException
*/
private Workspace getWorkspace() throws InternalErrorException, HomeNotFoundException, WorkspaceFolderNotFoundException {
final ASLSession session = getASLSession();
Workspace workspace = HomeLibrary.getUserWorkspace(session.getUsername());
return workspace;
}
}
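As an aside, the createDescriptionFromContent(link) fallback called in getInfoFromHTML above is not part of this hunk. A rough sketch of that kind of "best guess" extraction, using the StringBean class already imported from htmlparser (the real helper may differ in details such as snippet length), could look like:

import org.htmlparser.beans.StringBean;

public class DescriptionGuessSketch {
    // Hypothetical stand-in for createDescriptionFromContent(link):
    // extract the page's visible text and keep only a short snippet.
    public static String guessDescription(String link) {
        StringBean sb = new StringBean();
        sb.setLinks(false);                   // drop link URLs from the extracted text
        sb.setReplaceNonBreakingSpaces(true); // normalise non-breaking spaces
        sb.setCollapse(true);                 // collapse runs of whitespace
        sb.setURL(link);                      // fetch and strip the page
        String text = sb.getStrings();
        if (text == null || text.isEmpty())
            return "";
        return text.length() > 256 ? text.substring(0, 253) + "..." : text;
    }
}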

View File

@ -8,6 +8,8 @@ import java.util.Date;
import org.gcube.common.homelibrary.home.HomeLibrary;
import org.gcube.common.homelibrary.home.workspace.Workspace;
import org.gcube.common.homelibrary.home.workspace.exceptions.ItemAlreadyExistException;
import org.gcube.common.portal.PortalContext;
import org.gcube.common.scope.api.ScopeProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -46,26 +48,37 @@ public class UploadToWorkspaceThread implements Runnable {
@Override
public void run() {
try {
String currScope = ScopeProvider.instance.get();
ScopeProvider.instance.set("/"+PortalContext.getConfiguration().getInfrastructureName());
Workspace ws = HomeLibrary
.getHomeManagerFactory()
.getHomeManager()
.getHome(username).getWorkspace();
_log.info("File to upload="+fileabsolutePathOnServer);
File file = new File(fileabsolutePathOnServer);
String mimeType = ShareUpdateServiceImpl.getMimeType(file, fileName);
InputStream fileData = new FileInputStream(file);
String theId = "";
_log.info("mimeType="+mimeType + " fileData null? " + (fileData == null) );
try {
theId = ws.createExternalFile(fileName ,"File added automatically by Share Updates" , mimeType ,fileData, ws.getRoot().getId()).getId();
}
catch (NullPointerException exn) {
_log.warn("null pointer");
exn.printStackTrace();
}
catch (ItemAlreadyExistException ex) {
_log.warn("fileName " + fileName + " exists, appending timestamp");
theId = ws.createExternalFile(fileName+" ("+ new Date()+")" ,"File added automatically by Share Updates" , mimeType ,fileData, ws.getRoot().getId()).getId();
ex.printStackTrace();
} finally {
fileData.close();
}
fileData.close();
_log.debug("Uploaded " + fileName + " - Returned Workspace id=" + theId);
ScopeProvider.instance.set(currScope);
}
catch (Exception e) {

View File

@ -3,9 +3,12 @@ package org.gcube.portlets.user.shareupdates.server.metaseeker;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Hashtable;
import org.gcube.portlets.user.shareupdates.server.opengraph.OpenGraph;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.slf4j.Logger;
@ -61,12 +64,11 @@ public class MetaSeeker {
* @param url The address to the web page to fetch the meta
* @throws java.io.IOException If a network error occurs, the HTML parser will throw an IO Exception
*/
public MetaSeeker(String url) throws java.io.IOException, Exception {
public MetaSeeker(URLConnection connection, URL httpURL) throws java.io.IOException, Exception {
this();
isImported = true;
// download the (X)HTML content, but only up to the closing head tag. We do not want to waste resources parsing irrelevant content
URL httpURL = new URL(url);
BufferedReader dis = new BufferedReader(new InputStreamReader(httpURL.openStream()));
Charset charset = OpenGraph.getConnectionCharset(connection);
BufferedReader dis = new BufferedReader(new InputStreamReader(connection.getInputStream(), charset));
String inputLine;
StringBuffer headContents = new StringBuffer();

View File

@ -4,6 +4,7 @@ import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
@ -21,74 +22,74 @@ import java.util.regex.Pattern;
*/
public class OpenGraph
{
private String pageUrl;
private String pageUrl;
private ArrayList<OpenGraphNamespace> pageNamespaces;
private Hashtable<String, ArrayList<MetaElement>> metaAttributes;
private String baseType;
private boolean isImported; // determine if the object is a new incarnation or representation of a web page
private boolean hasChanged; // track if object has been changed
private Hashtable<String, ArrayList<MetaElement>> metaAttributes;
private String baseType;
private boolean isImported; // determine if the object is a new incarnation or representation of a web page
private boolean hasChanged; // track if object has been changed
public final static String[] REQUIRED_META = new String[]{"title", "type", "image", "url" };
public final static String[] REQUIRED_META = new String[]{"title", "type", "image", "url" };
public final static Hashtable<String, String[]> BASE_TYPES = new Hashtable<String, String[]>();
static
public final static Hashtable<String, String[]> BASE_TYPES = new Hashtable<String, String[]>();
static
{
BASE_TYPES.put("activity", new String[] {"activity", "sport"});
BASE_TYPES.put("business", new String[] {"bar", "company", "cafe", "hotel", "restaurant"});
BASE_TYPES.put("group", new String[] {"cause", "sports_league", "sports_team"});
BASE_TYPES.put("organization", new String[] {"band", "government", "non_profit", "school", "university"});
BASE_TYPES.put("person", new String[] {"actor", "athlete", "author", "director", "musician", "politician", "profile", "public_figure"});
BASE_TYPES.put("place", new String[] {"city", "country", "landmark", "state_province"});
BASE_TYPES.put("product", new String[] {"album", "book", "drink", "food", "game", "movie", "product", "song", "tv_show"});
BASE_TYPES.put("website", new String[] {"blog", "website", "article"});
BASE_TYPES.put("organization", new String[] {"band", "government", "non_profit", "school", "university"});
BASE_TYPES.put("person", new String[] {"actor", "athlete", "author", "director", "musician", "politician", "profile", "public_figure"});
BASE_TYPES.put("place", new String[] {"city", "country", "landmark", "state_province"});
BASE_TYPES.put("product", new String[] {"album", "book", "drink", "food", "game", "movie", "product", "song", "tv_show"});
BASE_TYPES.put("website", new String[] {"blog", "website", "article"});
}
/**
* Create an open graph representation for generating your own Open Graph object
*/
public OpenGraph()
/**
* Create an open graph representation for generating your own Open Graph object
*/
public OpenGraph()
{
pageNamespaces = new ArrayList<OpenGraphNamespace>();
metaAttributes = new Hashtable<String, ArrayList<MetaElement>>();
hasChanged = false;
isImported = false;
}
metaAttributes = new Hashtable<String, ArrayList<MetaElement>>();
hasChanged = false;
isImported = false;
}
/**
* Fetch the open graph representation from a web site
* @param url The address to the web page to fetch Open Graph data
* @param ignoreSpecErrors Set this option to true if you don't wish to have an exception thrown when the page does not conform to the basic 4 attributes
* @throws java.io.IOException If a network error occurs, the HTML parser will throw an IO Exception
* @throws java.lang.Exception A generic exception is thrown if the page fails to conform to the basic Open Graph standard as defined by the constant REQUIRED_META
*/
public OpenGraph(String url, boolean ignoreSpecErrors, URLConnection siteConnection) throws java.io.IOException, Exception {
this();
isImported = true;
/**
* Fetch the open graph representation from a web site
* @param url The address to the web page to fetch Open Graph data
* @param ignoreSpecErrors Set this option to true if you don't wish to have an exception thrown when the page does not conform to the basic 4 attributes
* @throws java.io.IOException If a network error occurs, the HTML parser will throw an IO Exception
* @throws java.lang.Exception A generic exception is thrown if the page fails to conform to the basic Open Graph standard as defined by the constant REQUIRED_META
*/
public OpenGraph(String url, boolean ignoreSpecErrors, URLConnection siteConnection) throws java.io.IOException, Exception {
this();
isImported = true;
// download the (X)HTML content, but only up to the closing head tag. We do not want to waste resources parsing irrelevant content
Charset charset = getConnectionCharset(siteConnection);
BufferedReader dis = new BufferedReader(new InputStreamReader(siteConnection.getInputStream(), charset));
String inputLine;
StringBuffer headContents = new StringBuffer();
// download the (X)HTML content, but only up to the closing head tag. We do not want to waste resources parsing irrelevant content
Charset charset = getConnectionCharset(siteConnection);
BufferedReader dis = new BufferedReader(new InputStreamReader(siteConnection.getInputStream(), charset));
String inputLine;
StringBuffer headContents = new StringBuffer();
// Loop through each line, looking for the closing head element
while ((inputLine = dis.readLine()) != null)
// Loop through each line, looking for the closing head element
while ((inputLine = dis.readLine()) != null)
{
if (inputLine.contains("</head>"))
if (inputLine.contains("</head>"))
{
inputLine = inputLine.substring(0, inputLine.indexOf("</head>") + 7);
inputLine = inputLine.concat("<body></body></html>");
headContents.append(inputLine + "\r\n");
break;
}
headContents.append(inputLine + "\r\n");
}
inputLine = inputLine.substring(0, inputLine.indexOf("</head>") + 7);
inputLine = inputLine.concat("<body></body></html>");
headContents.append(inputLine + "\r\n");
break;
}
headContents.append(inputLine + "\r\n");
}
String headContentsStr = headContents.toString();
HtmlCleaner cleaner = new HtmlCleaner();
// parse the string HTML
TagNode pageData = cleaner.clean(headContentsStr);
String headContentsStr = headContents.toString();
HtmlCleaner cleaner = new HtmlCleaner();
// parse the string HTML
TagNode pageData = cleaner.clean(headContentsStr);
// read in the declared namespaces
boolean hasOGspec = false;
@ -100,12 +101,12 @@ public class OpenGraph
Matcher matcher = pattern.matcher(namespaceData);
while (matcher.find())
{
String prefix = matcher.group(2);
String prefix = matcher.group(2);
String documentURI = matcher.group(3);
pageNamespaces.add(new OpenGraphNamespace(prefix, documentURI));
if (prefix.equals("og"))
hasOGspec = true;
}
}
}
// some pages do not include the new OG spec
@ -113,17 +114,17 @@ public class OpenGraph
if (!hasOGspec)
pageNamespaces.add(new OpenGraphNamespace("og", "http://ogp.me/ns#"));
// open only the meta tags
TagNode[] metaData = pageData.getElementsByName("meta", true);
for (TagNode metaElement : metaData)
// open only the meta tags
TagNode[] metaData = pageData.getElementsByName("meta", true);
for (TagNode metaElement : metaData)
{
for (OpenGraphNamespace namespace : pageNamespaces)
{
String target = null;
if (metaElement.hasAttribute("property"))
target = "property";
else if (metaElement.hasAttribute("name"))
target = "name";
if (metaElement.hasAttribute("property"))
target = "property";
else if (metaElement.hasAttribute("name"))
target = "name";
if (target != null && metaElement.getAttributeByName(target).startsWith(namespace.getPrefix() + ":"))
{
@ -131,134 +132,141 @@ public class OpenGraph
break;
}
}
}
}
/**
* Check that page conforms to Open Graph protocol
*/
if (!ignoreSpecErrors)
/**
* Check that page conforms to Open Graph protocol
*/
if (!ignoreSpecErrors)
{
for (String req : REQUIRED_META)
for (String req : REQUIRED_META)
{
if (!metaAttributes.containsKey(req))
throw new Exception("Does not conform to Open Graph protocol");
}
}
if (!metaAttributes.containsKey(req))
throw new Exception("Does not conform to Open Graph protocol");
}
}
/**
* Has conformed, now determine basic sub type.
*/
baseType = null;
/**
* Has conformed, now determine basic sub type.
*/
baseType = null;
String currentType = getContent("type");
// read the original page url
URL realURL = siteConnection.getURL();
pageUrl = realURL.toExternalForm();
}
// read the original page url
URL realURL = siteConnection.getURL();
pageUrl = realURL.toExternalForm();
}
/**
* Gets the charset for specified connection.
* Content Type header is parsed to get the charset name.
*
* @param connection the connection.
* @return the Charset object for response charset name;
* if it's not found then the default charset.
*/
private static Charset getConnectionCharset(URLConnection connection)
{
String contentType = connection.getContentType();
if (contentType != null && contentType.length() > 0)
{
contentType = contentType.toLowerCase();
String charsetName = extractCharsetName(contentType);
if (charsetName != null && charsetName.length() > 0)
{
try
{
return Charset.forName(charsetName);
}
catch (Exception e) {
// specified charset is not found,
// skip it to return the default one
}
}
}
/**
* Gets the charset for specified connection.
* Content Type header is parsed to get the charset name.
*
* @param connection the connection.
* @return the Charset object for response charset name;
* if it's not found then the default charset.
*/
public static Charset getConnectionCharset(URLConnection connection) {
String contentType = null;
try {
contentType = connection.getContentType();
}
catch (Exception e) {
// specified charset is not found,
// skip it to return the default one
return Charset.defaultCharset();
}
if (contentType != null && contentType.length() > 0)
{
contentType = contentType.toLowerCase();
String charsetName = extractCharsetName(contentType);
if (charsetName != null && charsetName.length() > 0)
{
try
{
return Charset.forName(charsetName);
}
catch (Exception e) {
// specified charset is not found,
// skip it to return the default one
}
}
}
// return the default charset
return Charset.defaultCharset();
}
// return the default charset
return Charset.defaultCharset();
}
/**
* Extract the charset name from the content type string.
* Content type string is received from Content-Type header.
*
* @param contentType the content type string, must be not null.
* @return the found charset name or null if not found.
*/
private static String extractCharsetName(String contentType)
{
// split onto media types
final String[] mediaTypes = contentType.split(":");
if (mediaTypes.length > 0)
{
// use only the first one, and split it on parameters
final String[] params = mediaTypes[0].split(";");
// find the charset parameter and return it's value
for (String each : params)
{
each = each.trim();
if (each.startsWith("charset="))
{
// return the charset name
return each.substring(8).trim();
}
}
}
return null;
}
/**
* Get the basic type of the Open graph page as per the specification
* @return Base type as defined by specification, null otherwise
*/
public String getBaseType()
/**
* Extract the charset name from the content type string.
* Content type string is received from Content-Type header.
*
* @param contentType the content type string, must be not null.
* @return the found charset name or null if not found.
*/
private static String extractCharsetName(String contentType)
{
return baseType;
}
// split onto media types
final String[] mediaTypes = contentType.split(":");
if (mediaTypes.length > 0)
{
// use only the first one, and split it on parameters
final String[] params = mediaTypes[0].split(";");
/**
* Get a value of a given Open Graph property
* @param property The Open graph property key
* @return Returns the value of the first property defined, null otherwise
*/
public String getContent(String property)
// find the charset parameter and return it's value
for (String each : params)
{
each = each.trim();
if (each.startsWith("charset="))
{
// return the charset name
return each.substring(8).trim();
}
}
}
return null;
}
/**
* Get the basic type of the Open graph page as per the specification
* @return Base type as defined by specification, null otherwise
*/
public String getBaseType()
{
if (metaAttributes.containsKey(property) && metaAttributes.get(property).size() > 0)
return baseType;
}
/**
* Get a value of a given Open Graph property
* @param property The Open graph property key
* @return Returns the value of the first property defined, null otherwise
*/
public String getContent(String property)
{
if (metaAttributes.containsKey(property) && metaAttributes.get(property).size() > 0)
return metaAttributes.get(property).get(0).getContent();
else
return null;
}
}
/**
* Get all the defined properties of the Open Graph object
* @return An array of all currently defined properties
*/
public MetaElement[] getProperties()
/**
* Get all the defined properties of the Open Graph object
* @return An array of all currently defined properties
*/
public MetaElement[] getProperties()
{
ArrayList<MetaElement> allElements = new ArrayList<MetaElement>();
for (ArrayList<MetaElement> collection : metaAttributes.values())
for (ArrayList<MetaElement> collection : metaAttributes.values())
allElements.addAll(collection);
return (MetaElement[]) allElements.toArray(new MetaElement[allElements.size()]);
}
}
/**
* Get all the defined properties of the Open Graph object
/**
* Get all the defined properties of the Open Graph object
* @param property The property to focus on
* @return An array of all currently defined properties
*/
public MetaElement[] getProperties(String property)
* @return An array of all currently defined properties
*/
public MetaElement[] getProperties(String property)
{
if (metaAttributes.containsKey(property))
{
@ -267,69 +275,69 @@ public class OpenGraph
}
else
return null;
}
}
/**
* Get the original URL the Open Graph page was obtained from
* @return The address to the Open Graph object page
*/
public String getOriginalUrl()
/**
* Get the original URL the Open Graph page was obtained from
* @return The address to the Open Graph object page
*/
public String getOriginalUrl()
{
return pageUrl;
}
return pageUrl;
}
/**
* Get the HTML representation of the Open Graph data.
* @return An array of meta elements as Strings
*/
public String[] toHTML()
/**
* Get the HTML representation of the Open Graph data.
* @return An array of meta elements as Strings
*/
public String[] toHTML()
{
// allocate the array
ArrayList<String> returnHTML = new ArrayList<String>();
// allocate the array
ArrayList<String> returnHTML = new ArrayList<String>();
int index = 0; // keep track of the index to insert into
for (ArrayList<MetaElement> elements : metaAttributes.values())
int index = 0; // keep track of the index to insert into
for (ArrayList<MetaElement> elements : metaAttributes.values())
{
for (MetaElement element : elements)
returnHTML.add("<meta property=\"" + element.getNamespace() + ":" +
element.getProperty() + "\" content=\"" + element.getContent() + "\" />");
returnHTML.add("<meta property=\"" + element.getNamespace() + ":" +
element.getProperty() + "\" content=\"" + element.getContent() + "\" />");
}
// return the array
return (String[]) returnHTML.toArray();
}
// return the array
return (String[]) returnHTML.toArray();
}
/**
* Get the XHTML representation of the Open Graph data.
* @return An array of meta elements as Strings
*/
public String[] toXHTML()
/**
* Get the XHTML representation of the Open Graph data.
* @return An array of meta elements as Strings
*/
public String[] toXHTML()
{
// allocate the array
ArrayList<String> returnHTML = new ArrayList<String>();
// allocate the array
ArrayList<String> returnHTML = new ArrayList<String>();
int index = 0; // keep track of the index to insert into
for (ArrayList<MetaElement> elements : metaAttributes.values())
int index = 0; // keep track of the index to insert into
for (ArrayList<MetaElement> elements : metaAttributes.values())
{
for (MetaElement element : elements)
returnHTML.add("<meta name=\"" + element.getNamespace().getPrefix() + ":" +
element.getProperty() + "\" content=\"" + element.getContent() + "\" />");
returnHTML.add("<meta name=\"" + element.getNamespace().getPrefix() + ":" +
element.getProperty() + "\" content=\"" + element.getContent() + "\" />");
}
// return the array
return (String[]) returnHTML.toArray();
}
// return the array
return (String[]) returnHTML.toArray();
}
/**
* Set the Open Graph property to a specific value
/**
* Set the Open Graph property to a specific value
* @param namespace The OpenGraph namespace the content belongs to
* @param property The og:XXXX where XXXX is the property you wish to set
* @param content The value or contents of the property to be set
*/
public void setProperty(OpenGraphNamespace namespace, String property, String content)
* @param property The og:XXXX where XXXX is the property you wish to set
* @param content The value or contents of the property to be set
*/
public void setProperty(OpenGraphNamespace namespace, String property, String content)
{
if (!pageNamespaces.contains(namespace))
if (!pageNamespaces.contains(namespace))
pageNamespaces.add(namespace);
property = property.replaceAll(namespace.getPrefix() + ":", "");
@ -338,41 +346,41 @@ public class OpenGraph
metaAttributes.put(property, new ArrayList<MetaElement>());
metaAttributes.get(property).add(element);
}
}
/**
* Removed a defined property
* @param property The og:XXXX where XXXX is the property you wish to remove
*/
public void removeProperty(String property)
/**
* Removed a defined property
* @param property The og:XXXX where XXXX is the property you wish to remove
*/
public void removeProperty(String property)
{
metaAttributes.remove(property);
}
metaAttributes.remove(property);
}
/**
* Obtain the underlying HashTable
* @return The underlying structure as a Hashtable
*/
public Hashtable<String, ArrayList<MetaElement>> exposeTable() {
return metaAttributes;
}
/**
* Obtain the underlying HashTable
* @return The underlying structure as a Hashtable
*/
public Hashtable<String, ArrayList<MetaElement>> exposeTable() {
return metaAttributes;
}
/**
* Test if the Open Graph object was initially a representation of a web page
* @return True if the object is from a web page, false otherwise
*/
public boolean isFromWeb()
/**
* Test if the Open Graph object was initially a representation of a web page
* @return True if the object is from a web page, false otherwise
*/
public boolean isFromWeb()
{
return isImported;
}
return isImported;
}
/**
* Test if the object has been modified by setters/deleters.
* This is only relevant if this object initially represented a web page
* @return True if the object has been modified, false otherwise
*/
public boolean hasChanged()
/**
* Test if the object has been modified by setters/deleters.
* This is only relevant if this object initially represented a web page
* @return True if the object has been modified, false otherwise
*/
public boolean hasChanged()
{
return hasChanged;
}
return hasChanged;
}
}