collectors

2023-09-08 16:00:28 +02:00 · 2023-09-08 16:00:28 +02:00 · 25917344d6
parent 217dc672be
commit 25917344d6
10 changed files with 526 additions and 45 deletions
--- a/apps/collector/pom.xml
+++ b/apps/collector/pom.xml
@ -28,7 +28,7 @@
 			<groupId>com.vladmihalcea</groupId>
 			<artifactId>hibernate-types-52</artifactId>
 		</dependency>
-
+      
 		<!-- JAXB API, java.xml.bind module -->
 		<dependency>
 			<groupId>jakarta.xml.bind</groupId>
--- a/apps/collector/src/main/java/eu/dnetlib/services/collector/plugins/filesystem/FileSystemIterable.java
+++ b/apps/collector/src/main/java/eu/dnetlib/services/collector/plugins/filesystem/FileSystemIterable.java
@ -0,0 +1,78 @@
+package eu.dnetlib.services.collector.plugins.filesystem;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.Iterator;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import eu.dnetlib.common.exceptions.DnetException;
+import eu.dnetlib.common.utils.DnetStreamSupport;
+import eu.dnetlib.common.utils.XmlCleaner;
+import eu.dnetlib.data.dsm.ApiDesc;
+
+/**
+ * Iterable over the content of the files found under a base directory. The directory is taken from the path of the API base URL, the
+ * admitted file extensions from the API parameter "extensions" (comma separated). Each element is the file content with a leading BOM
+ * removed and its entities cleaned by {@link XmlCleaner#cleanAllEntities(String)}.
+ *
+ * @author Sandro, Michele, Andrea
+ */
+public class FileSystemIterable implements Iterable<String> {
+
+	/**
+	 * The Constant log.
+	 */
+	private static final Log log = LogFactory.getLog(FileSystemIterable.class);
+
+	/**
+	 * The base dir.
+	 */
+	private final File baseDir;
+
+	/**
+	 * The admitted extensions, as a comma separated list.
+	 */
+	private final String extension;
+
+	/**
+	 * Instantiates a new filesystem iterable.
+	 *
+	 * @param api
+	 *            the api, providing the base URL and the "extensions" parameter
+	 * @throws DnetException
+	 *             if the base URL is malformed, if its path does not exist or if the "extensions" parameter is missing
+	 */
+	public FileSystemIterable(final ApiDesc api) throws DnetException {
+		final String baseUrl = api.getBaseUrl();
+
+		final URL basePath;
+		try {
+			basePath = new URL(baseUrl);
+		} catch (final MalformedURLException e) {
+			throw new DnetException("Filesystem collector failed! ", e);
+		}
+
+		final File dir = new File(basePath.getPath());
+		if (!dir.exists()) { throw new DnetException(String.format("The base URL %s, does not exist", basePath.getPath())); }
+
+		// fail here rather than with a NullPointerException when the iterator is created
+		final String ext = api.getParams().get("extensions");
+		if (ext == null) { throw new DnetException("Filesystem collector failed! Missing required parameter: extensions"); }
+
+		this.baseDir = dir;
+		this.extension = ext;
+	}
+
+	/**
+	 * Creates a new walk of the base directory.
+	 *
+	 * @return an iterator over the cleaned content of the matching files; a file that cannot be read, or whose content is blank, is
+	 *         returned as an empty string
+	 */
+	@Override
+	public Iterator<String> iterator() {
+		final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), extension);
+		return DnetStreamSupport.stream(fsi)
+			.map(this::loadFile)
+			.iterator();
+	}
+
+	/**
+	 * Reads a file as UTF-8, drops a leading BOM and cleans its entities.
+	 *
+	 * @param inputFileName
+	 *            the path of the file to read
+	 * @return the cleaned content, or an empty string if the file cannot be read or is blank
+	 */
+	private String loadFile(final String inputFileName) {
+		try (FileInputStream fileInputStream = new FileInputStream(inputFileName)) {
+			final String s = IOUtils.toString(fileInputStream, StandardCharsets.UTF_8.toString());
+			final String cleaned = XmlCleaner.cleanAllEntities(s.startsWith("\uFEFF") ? s.substring(1) : s);
+			// cleanAllEntities returns null for blank content: use the same marker as the error path
+			return cleaned != null ? cleaned : "";
+		} catch (final Exception e) {
+			// best effort: a broken file must not stop the collection, but the cause must be visible in the log
+			log.error("Unable to read " + inputFileName, e);
+			return "";
+		}
+	}
+}
--- a/apps/collector/src/main/java/eu/dnetlib/services/collector/plugins/filesystem/FileSystemIterator.java
+++ b/apps/collector/src/main/java/eu/dnetlib/services/collector/plugins/filesystem/FileSystemIterator.java
@ -0,0 +1,88 @@
+package eu.dnetlib.services.collector.plugins.filesystem;
+
+import java.io.IOException;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayDeque;
+import java.util.Arrays;
+import java.util.Deque;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.Set;
+
+import org.apache.commons.collections.IteratorUtils;
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * Class enabling lazy & recursive iteration of a filesystem tree. The iterator iterates over file paths, returning only the files whose
+ * extension belongs to the admitted set. Directories are listed one at a time, in the order they are discovered, and each directory
+ * stream is closed as soon as it has been fully consumed.
+ *
+ * @author Andrea
+ */
+public class FileSystemIterator implements Iterator<String> {
+
+	/**
+	 * The logger
+	 */
+	private static final Log log = LogFactory.getLog(FileSystemIterator.class);
+
+	private final Set<String> extensions;
+
+	/**
+	 * Directories found during the walk and not yet listed (first in, first out).
+	 */
+	private final Deque<Path> pendingDirs = new ArrayDeque<>();
+
+	/**
+	 * The stream backing pathIterator, null when no directory is currently open.
+	 */
+	private DirectoryStream<Path> currentStream;
+	private Iterator<Path> pathIterator;
+	private String current;
+
+	/**
+	 * Starts the walk and looks for the first matching file.
+	 *
+	 * @param baseDir
+	 *            the directory to walk
+	 * @param extensions
+	 *            the admitted file extensions, comma separated
+	 * @throws IllegalArgumentException
+	 *             if extensions is null
+	 * @throws RuntimeException
+	 *             if the base directory cannot be listed
+	 */
+	public FileSystemIterator(final String baseDir, final String extensions) {
+		if (extensions == null) { throw new IllegalArgumentException("The list of admitted extensions is required"); }
+		this.extensions = new HashSet<>(Arrays.asList(extensions.split(",")));
+		try {
+			openDirectory(Paths.get(baseDir));
+			this.current = walkTillNext();
+		} catch (final IOException e) {
+			log.error("Cannot initialize File System Iterator. Is this path correct? " + baseDir);
+			throw new RuntimeException("Filesystem collection error.", e);
+		}
+	}
+
+	@Override
+	public boolean hasNext() {
+		return current != null;
+	}
+
+	/**
+	 * @return the path of the next matching file
+	 * @throws NoSuchElementException
+	 *             if the walk is over
+	 */
+	@Override
+	public synchronized String next() {
+		if (current == null) { throw new NoSuchElementException("No more files to iterate"); }
+		final String pivot = current;
+		current = walkTillNext();
+		log.debug("Returning: " + pivot);
+		return pivot;
+	}
+
+	/**
+	 * Removal is not supported: the call is silently ignored.
+	 */
+	@Override
+	public void remove() {}
+
+	/**
+	 * Walk the filesystem recursively until it finds a candidate. Strategies: a) Any directory found during the walk is queued and listed
+	 * after the directories found before it; b) Any file is checked against admitted extensions. A directory that cannot be opened is
+	 * logged and skipped.
+	 *
+	 * @return the next element to be returned by next call of this.next(), or null when the walk is over
+	 */
+	private synchronized String walkTillNext() {
+		while (true) {
+			while (pathIterator != null && pathIterator.hasNext()) {
+				final Path nextFilePath = pathIterator.next();
+				if (Files.isDirectory(nextFilePath)) {
+					pendingDirs.add(nextFilePath);
+					log.debug("Adding folder iterator: " + nextFilePath.toString());
+				} else if (extensions.contains(FilenameUtils.getExtension(nextFilePath.toString()))) {
+					log.debug("Returning: " + nextFilePath.toString());
+					return nextFilePath.toString();
+				}
+			}
+
+			// the current directory is exhausted: release its handle before moving on
+			closeCurrentStream();
+
+			final Path nextDir = pendingDirs.poll();
+			if (nextDir == null) { return null; }
+
+			try {
+				openDirectory(nextDir);
+			} catch (final IOException e) {
+				// skip the unreadable directory, the rest of the tree is still walked
+				log.error("Cannot create folder iterator! Is this path correct? " + nextDir.toString(), e);
+			}
+		}
+	}
+
+	/**
+	 * Opens a directory and makes it the current source of paths.
+	 */
+	private void openDirectory(final Path dir) throws IOException {
+		currentStream = Files.newDirectoryStream(dir);
+		pathIterator = currentStream.iterator();
+	}
+
+	/**
+	 * Closes the directory stream currently open, if any.
+	 */
+	private void closeCurrentStream() {
+		if (currentStream == null) { return; }
+		try {
+			currentStream.close();
+		} catch (final IOException e) {
+			log.warn("Cannot close a directory stream", e);
+		}
+		currentStream = null;
+		pathIterator = null;
+	}
+}
--- a/apps/email/pom.xml
+++ b/apps/email/pom.xml
@ -12,42 +12,13 @@
 	<packaging>jar</packaging>

 	<dependencies>
+		
+		<!-- Mail -->
 		<dependency>
-			<groupId>org.springframework.boot</groupId>
-			<artifactId>spring-boot-starter-data-jpa</artifactId>
+			<groupId>javax.mail</groupId>
+			<artifactId>mail</artifactId>
 		</dependency>
-		<dependency>
-			<groupId>org.springframework.boot</groupId>
-			<artifactId>spring-boot-starter-json</artifactId>
-		</dependency>
-		<dependency>
-			<groupId>org.postgresql</groupId>
-			<artifactId>postgresql</artifactId>
-		</dependency>
-		<dependency>
-			<groupId>com.vladmihalcea</groupId>
-			<artifactId>hibernate-types-52</artifactId>
-		</dependency>
-
-		<!-- JAXB API, java.xml.bind module -->
-		<dependency>
-			<groupId>jakarta.xml.bind</groupId>
-			<artifactId>jakarta.xml.bind-api</artifactId>
-		</dependency>
-
-		<!-- JAXB Runtime, com.sun.xml.bind module -->
-		<dependency>
-			<groupId>org.glassfish.jaxb</groupId>
-			<artifactId>jaxb-runtime</artifactId>
-		</dependency>
-
-		<!-- CSV -->
-		<dependency>
-			<groupId>com.opencsv</groupId>
-			<artifactId>opencsv</artifactId>
-			<version>5.4</version>
-		</dependency>
-
+		
 		<!-- hot swapping, disable cache for template, enable live reload -->
 		<dependency>
 			<groupId>org.springframework.boot</groupId>
--- a/apps/email/src/main/java/eu/dnetlib/utils/mail/EmailService.java
+++ b/apps/email/src/main/java/eu/dnetlib/utils/mail/EmailService.java
@ -14,8 +14,8 @@ import javax.mail.internet.InternetAddress;
 import javax.mail.internet.MimeMessage;

 import org.apache.commons.lang3.StringUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.stereotype.Component;

@ -26,7 +26,7 @@ import eu.dnetlib.data.mail.EmailMessage;
@Component
 public class EmailService {

-	private static final Logger log = LoggerFactory.getLogger(EmailService.class);
+	private static final Log log = LogFactory.getLog(EmailService.class);

 	@Autowired
 	private EmailConfiguration conf;
--- a/libs/dnet-common/pom.xml
+++ b/libs/dnet-common/pom.xml
@ -23,11 +23,7 @@
 			<scope>provided</scope>
 		</dependency>
 	-->	
-		<!-- Mail -->
-		<dependency>
-			<groupId>javax.mail</groupId>
-			<artifactId>mail</artifactId>
-		</dependency>
+		

 		<!-- for /metrics and /health controllers -->
 		<dependency>
--- a/libs/dnet-common/src/main/java/eu/dnetlib/common/utils/DnetStreamSupport.java
+++ b/libs/dnet-common/src/main/java/eu/dnetlib/common/utils/DnetStreamSupport.java
@ -0,0 +1,15 @@
+package eu.dnetlib.common.utils;
+
+import java.util.Iterator;
+import java.util.Spliterator;
+import java.util.Spliterators;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+/**
+ * Helper methods to use iterators with the stream API.
+ */
+public class DnetStreamSupport {
+
+	/**
+	 * Wraps an iterator in a sequential stream that keeps the iteration order.
+	 *
+	 * @param iterator
+	 *            the source of the elements
+	 * @return a sequential stream consuming the iterator
+	 */
+	public static <T> Stream<T> stream(final Iterator<T> iterator) {
+		final Spliterator<T> source = Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED);
+		final boolean parallel = false;
+		return StreamSupport.stream(source, parallel);
+	}
+
+}
--- a/libs/dnet-common/src/main/java/eu/dnetlib/common/utils/XmlCleaner.java
+++ b/libs/dnet-common/src/main/java/eu/dnetlib/common/utils/XmlCleaner.java
@ -0,0 +1,262 @@
+package eu.dnetlib.common.utils;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * @author jochen, Andreas Czerniak
+ *
+ */
+public class XmlCleaner {
+
+	/**
+	 * Pattern for numeric character references, anchored at the beginning of the examined text: 2 to 4 decimal digits, or an x followed
+	 * by 2 to 4 hexadecimal digits.
+	 */
+	private static final Pattern validCharacterEntityPattern = Pattern.compile("^&#(?:\\d{2,4}|x[0-9a-fA-F]{2,4});"); //$NON-NLS-1$
+
+	// see https://www.w3.org/TR/REC-xml/#charsets , not only limited to &#11;
+	// NOTE(review): this also matches &#10; and &#13; (line feed and carriage return, both valid in XML) - confirm this is intended
+	private static final Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];");
+
+	/**
+	 * Pattern that negates the allowable XML 4 byte unicode characters. Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
+	 * [#x10000-#x10FFFF]. The last range is expressed with the code point syntax, so that characters outside the basic plane are kept.
+	 */
+	private static final Pattern invalidCharacterPattern =
+		Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD\\x{10000}-\\x{10FFFF}]"); //$NON-NLS-1$
+
+	// Entities allowed in XML, kept as they are
+	private static final Set<String> goodEntities = new HashSet<>();
+	// Map entities to their unicode equivalent
+	private static final Map<String, String> badEntities = new HashMap<>();
+
+	static {
+		// pre-defined XML entities
+		goodEntities.add("&quot;"); //$NON-NLS-1$ // quotation mark
+		goodEntities.add("&amp;"); //$NON-NLS-1$ // ampersand
+		goodEntities.add("&lt;"); //$NON-NLS-1$ // less-than sign
+		goodEntities.add("&gt;"); //$NON-NLS-1$ // greater-than sign
+		// control entities
+		// badEntities.put("&#11;", "");
+		badEntities.put("&#127;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#128;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#129;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#130;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#131;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#132;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#133;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#134;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#135;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#136;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#137;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#138;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#139;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#140;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#141;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#142;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#143;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#144;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#145;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#146;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#147;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#148;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#149;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#150;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#151;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#152;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#153;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#154;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#155;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#156;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#157;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#158;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		badEntities.put("&#159;", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
+		// misc entities
+		badEntities.put("&euro;", "\u20AC"); //$NON-NLS-1$ //$NON-NLS-2$ // euro
+		badEntities.put("&lsquo;", "\u2018"); //$NON-NLS-1$ //$NON-NLS-2$ // left single quotation mark
+		badEntities.put("&rsquo;", "\u2019"); //$NON-NLS-1$ //$NON-NLS-2$ // right single quotation mark
+		// Latin 1 entities
+		badEntities.put("&nbsp;", "\u00A0"); //$NON-NLS-1$ //$NON-NLS-2$ // no-break space
+		badEntities.put("&iexcl;", "\u00A1"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted exclamation mark
+		badEntities.put("&cent;", "\u00A2"); //$NON-NLS-1$ //$NON-NLS-2$ // cent sign
+		badEntities.put("&pound;", "\u00A3"); //$NON-NLS-1$ //$NON-NLS-2$ // pound sign
+		badEntities.put("&curren;", "\u00A4"); //$NON-NLS-1$ //$NON-NLS-2$ // currency sign
+		badEntities.put("&yen;", "\u00A5"); //$NON-NLS-1$ //$NON-NLS-2$ // yen sign
+		badEntities.put("&brvbar;", "\u00A6"); //$NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar
+		badEntities.put("&sect;", "\u00A7"); //$NON-NLS-1$ //$NON-NLS-2$ // section sign
+		badEntities.put("&uml;", "\u00A8"); //$NON-NLS-1$ //$NON-NLS-2$ // diaeresis
+		badEntities.put("&copy;", "\u00A9"); //$NON-NLS-1$ //$NON-NLS-2$ // copyright sign
+		badEntities.put("&ordf;", "\u00AA"); //$NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal indicator
+		badEntities.put("&laquo;", "\u00AB"); //$NON-NLS-1$ //$NON-NLS-2$ // left-pointing double angle quotation mark
+		badEntities.put("&not;", "\u00AC"); //$NON-NLS-1$ //$NON-NLS-2$ // not sign
+		badEntities.put("&shy;", "\u00AD"); //$NON-NLS-1$ //$NON-NLS-2$ // soft hyphen
+		badEntities.put("&reg;", "\u00AE"); //$NON-NLS-1$ //$NON-NLS-2$ // registered sign
+		badEntities.put("&macr;", "\u00AF"); //$NON-NLS-1$ //$NON-NLS-2$ // macron
+		badEntities.put("&deg;", "\u00B0"); //$NON-NLS-1$ //$NON-NLS-2$ // degree sign
+		badEntities.put("&plusmn;", "\u00B1"); //$NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign
+		badEntities.put("&sup2;", "\u00B2"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript two
+		badEntities.put("&sup3;", "\u00B3"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript three
+		badEntities.put("&acute;", "\u00B4"); //$NON-NLS-1$ //$NON-NLS-2$ // acute accent
+		badEntities.put("&micro;", "\u00B5"); //$NON-NLS-1$ //$NON-NLS-2$ // micro sign
+		badEntities.put("&para;", "\u00B6"); //$NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign
+		badEntities.put("&middot;", "\u00B7"); //$NON-NLS-1$ //$NON-NLS-2$ // middle dot
+		badEntities.put("&cedil;", "\u00B8"); //$NON-NLS-1$ //$NON-NLS-2$ // cedilla
+		badEntities.put("&sup1;", "\u00B9"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript one
+		badEntities.put("&ordm;", "\u00BA"); //$NON-NLS-1$ //$NON-NLS-2$ // masculine ordinal indicator
+		badEntities.put("&raquo;", "\u00BB"); //$NON-NLS-1$ //$NON-NLS-2$ // right-pointing double angle quotation mark
+		badEntities.put("&frac14;", "\u00BC"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one quarter
+		badEntities.put("&frac12;", "\u00BD"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one half
+		badEntities.put("&frac34;", "\u00BE"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction three quarters
+		badEntities.put("&iquest;", "\u00BF"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted question mark
+		badEntities.put("&Agrave;", "\u00C0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with grave
+		badEntities.put("&Aacute;", "\u00C1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with acute
+		badEntities.put("&Acirc;", "\u00C2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with circumflex
+		badEntities.put("&Atilde;", "\u00C3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with tilde
+		badEntities.put("&Auml;", "\u00C4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with diaeresis
+		badEntities.put("&Aring;", "\u00C5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with ring above
+		badEntities.put("&AElig;", "\u00C6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter AE
+		badEntities.put("&Ccedil;", "\u00C7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter C with cedilla
+		badEntities.put("&Egrave;", "\u00C8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with grave
+		badEntities.put("&Eacute;", "\u00C9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with acute
+		badEntities.put("&Ecirc;", "\u00CA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with circumflex
+		badEntities.put("&Euml;", "\u00CB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with diaeresis
+		badEntities.put("&Igrave;", "\u00CC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with grave
+		badEntities.put("&Iacute;", "\u00CD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with acute
+		badEntities.put("&Icirc;", "\u00CE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with circumflex
+		badEntities.put("&Iuml;", "\u00CF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with diaeresis
+		badEntities.put("&ETH;", "\u00D0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter ETH
+		badEntities.put("&Ntilde;", "\u00D1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter N with tilde
+		badEntities.put("&Ograve;", "\u00D2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with grave
+		badEntities.put("&Oacute;", "\u00D3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with acute
+		badEntities.put("&Ocirc;", "\u00D4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with circumflex
+		badEntities.put("&Otilde;", "\u00D5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with tilde
+		badEntities.put("&Ouml;", "\u00D6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with diaeresis
+		badEntities.put("&times;", "\u00D7"); //$NON-NLS-1$ //$NON-NLS-2$ // multiplication sign
+		badEntities.put("&Oslash;", "\u00D8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with stroke
+		badEntities.put("&Ugrave;", "\u00D9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with grave
+		badEntities.put("&Uacute;", "\u00DA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with acute
+		badEntities.put("&Ucirc;", "\u00DB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with circumflex
+		badEntities.put("&Uuml;", "\u00DC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with diaeresis
+		badEntities.put("&Yacute;", "\u00DD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter Y with acute
+		badEntities.put("&THORN;", "\u00DE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter THORN
+		badEntities.put("&szlig;", "\u00DF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter sharp s
+		badEntities.put("&agrave;", "\u00E0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with grave
+		badEntities.put("&aacute;", "\u00E1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with acute
+		badEntities.put("&acirc;", "\u00E2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with circumflex
+		badEntities.put("&atilde;", "\u00E3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with tilde
+		badEntities.put("&auml;", "\u00E4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with diaeresis
+		badEntities.put("&aring;", "\u00E5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with ring above
+		badEntities.put("&aelig;", "\u00E6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter ae
+		badEntities.put("&ccedil;", "\u00E7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter c with cedilla
+		badEntities.put("&egrave;", "\u00E8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with grave
+		badEntities.put("&eacute;", "\u00E9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with acute
+		badEntities.put("&ecirc;", "\u00EA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with circumflex
+		badEntities.put("&euml;", "\u00EB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with diaeresis
+		badEntities.put("&igrave;", "\u00EC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with grave
+		badEntities.put("&iacute;", "\u00ED"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with acute
+		badEntities.put("&icirc;", "\u00EE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with circumflex
+		badEntities.put("&iuml;", "\u00EF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with diaeresis
+		badEntities.put("&eth;", "\u00F0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter eth
+		badEntities.put("&ntilde;", "\u00F1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter n with tilde
+		badEntities.put("&ograve;", "\u00F2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with grave
+		badEntities.put("&oacute;", "\u00F3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with acute
+		badEntities.put("&ocirc;", "\u00F4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with circumflex
+		badEntities.put("&otilde;", "\u00F5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with tilde
+		badEntities.put("&ouml;", "\u00F6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with diaeresis
+		badEntities.put("&divide;", "\u00F7"); //$NON-NLS-1$ //$NON-NLS-2$ // division sign
+		badEntities.put("&oslash;", "\u00F8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with stroke
+		badEntities.put("&ugrave;", "\u00F9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with grave
+		badEntities.put("&uacute;", "\u00FA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with acute
+		badEntities.put("&ucirc;", "\u00FB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with circumflex
+		badEntities.put("&uuml;", "\u00FC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with diaeresis
+		badEntities.put("&yacute;", "\u00FD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with acute
+		badEntities.put("&thorn;", "\u00FE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter thorn
+		badEntities.put("&yuml;", "\u00FF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with diaeresis
+	}
+
+	/**
+	 * Makes the entities of the input safe for XML. References to invalid control characters and characters not allowed in XML are
+	 * removed first. Then numeric references and the 4 entities allowed in XML ({@code &amp;}, {@code &quot;}, {@code &lt;} and
+	 * {@code &gt;}) are kept, the other known entities are replaced with their unicode equivalent, the unknown ones are removed and each
+	 * bare {@code &} is replaced with {@code &amp;}. Finally the sequences {@code <<} and {@code >>} are escaped.
+	 *
+	 * @param broken
+	 *            the string to handle entities
+	 * @return the string with entities appropriately fixed up, or null if the input is blank
+	 */
+	static public String cleanAllEntities(final String broken) {
+		if (StringUtils.isBlank(broken)) { return null; }
+
+		String working = invalidControlCharPattern.matcher(broken).replaceAll("");
+		working = invalidCharacterPattern.matcher(working).replaceAll("");
+
+		int cleanfrom = 0;
+
+		while (true) {
+			final int amp = working.indexOf('&', cleanfrom);
+			// If there are no more amps then we are done
+			if (amp == -1) {
+				break;
+			}
+			// Skip references of the kind &#ddd; (the region makes the pattern start at the amp, without copying the string)
+			if (validCharacterEntityPattern.matcher(working).region(amp, working.length()).lookingAt()) {
+				cleanfrom = working.indexOf(';', amp) + 1;
+				continue;
+			}
+
+			// Find the end of the candidate entity name
+			int i = amp + 1;
+			while (i < working.length() && Character.isLetterOrDigit(working.charAt(i))) {
+				i++;
+			}
+
+			if (i < working.length() && working.charAt(i) == ';') {
+				// we have an entity: if it is something that xml can't handle then replace it
+				final String replace = handleEntity(working.substring(amp, i + 1));
+				working = working.substring(0, amp) + replace + working.substring(i + 1);
+				// restart right after the replacement: when the entity is removed, the next char may be another '&'
+				cleanfrom = amp + replace.length();
+			} else {
+				// the name ended without a closing ';' (or the string ended): it is a bare '&'
+				working = working.substring(0, amp) + "&amp;" + working.substring(amp + 1); //$NON-NLS-1$
+				// restart right after the inserted &amp; so that the char ending the name is examined too
+				cleanfrom = amp + 5;
+			}
+		}
+
+		return working.replace("<<", "&lt;&lt;").replace(">>", "&gt;&gt;");
+	}
+
+	/**
+	 * Finds the substitution for an entity. The 4 entities allowed in XML ({@code &amp;}, {@code &quot;}, {@code &lt;} and {@code &gt;})
+	 * are kept, the other known entities are replaced with their unicode equivalent and the unknown ones are stripped out.
+	 *
+	 * @param entity
+	 *            the entity to be replaced
+	 * @return the substitution for the entity, either itself, the unicode equivalent or an empty string.
+	 */
+	private static String handleEntity(final String entity) {
+		if (goodEntities.contains(entity)) { return entity; }
+
+		// badEntities holds no null value, so the default is used only for unknown entities
+		return badEntities.getOrDefault(entity, "");
+	}
+}
--- a/libs/dnet-common/src/main/java/eu/dnetlib/data/dsm/ApiDesc.java
+++ b/libs/dnet-common/src/main/java/eu/dnetlib/data/dsm/ApiDesc.java
@ -0,0 +1,65 @@
+package eu.dnetlib.data.dsm;
+
+import java.util.HashMap;
+import org.dom4j.Node;
+
+import javax.xml.bind.annotation.XmlRootElement;
+
+/**
+ * Description of an API: its identifier, base URL, access protocol and the parameters of the protocol.
+ */
+@XmlRootElement
+public class ApiDesc {
+
+	private String id;
+
+	private String baseUrl;
+
+	private String protocol;
+
+	private HashMap<String, String> params = new HashMap<>();
+
+	/**
+	 * Builds a description from an XML node. The id is read from the attribute "id", the base URL from the element BASE_URL, the
+	 * protocol from the element ACCESS_PROTOCOL and one parameter is added for each attribute of ACCESS_PROTOCOL.
+	 *
+	 * @param node
+	 *            the node to read
+	 * @return the new description
+	 */
+	public static ApiDesc newInstance(final Node node) {
+		final ApiDesc desc = new ApiDesc();
+
+		desc.setId(node.valueOf("./@id"));
+		desc.setBaseUrl(node.valueOf("./BASE_URL"));
+		desc.setProtocol(node.valueOf("./ACCESS_PROTOCOL"));
+
+		// each attribute of the protocol element is a parameter
+		for (final Object item : node.selectNodes("./ACCESS_PROTOCOL/@*")) {
+			final Node attribute = (Node) item;
+			final String name = attribute.getName();
+			final String value = attribute.getText();
+			desc.getParams().put(name, value);
+		}
+
+		return desc;
+	}
+
+	/**
+	 * @return the base URL
+	 */
+	public String getBaseUrl() {
+		return this.baseUrl;
+	}
+
+	/**
+	 * @param baseUrl
+	 *            the base URL
+	 */
+	public void setBaseUrl(final String baseUrl) {
+		this.baseUrl = baseUrl;
+	}
+
+	/**
+	 * @return the identifier
+	 */
+	public String getId() {
+		return this.id;
+	}
+
+	/**
+	 * @param id
+	 *            the identifier
+	 */
+	public void setId(final String id) {
+		this.id = id;
+	}
+
+	/**
+	 * @return the parameters of the protocol (the internal map, not a copy)
+	 */
+	public HashMap<String, String> getParams() {
+		return this.params;
+	}
+
+	/**
+	 * @param params
+	 *            the parameters of the protocol
+	 */
+	public void setParams(final HashMap<String, String> params) {
+		this.params = params;
+	}
+
+	/**
+	 * @return the access protocol
+	 */
+	public String getProtocol() {
+		return this.protocol;
+	}
+
+	/**
+	 * @param protocol
+	 *            the access protocol
+	 */
+	public void setProtocol(final String protocol) {
+		this.protocol = protocol;
+	}
+
+}
--- a/pom.xml
+++ b/pom.xml
@ -103,8 +103,14 @@
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-pool2</artifactId>
-      <version>2.11.1</version>
    </dependency>
+    
+    
+    	<!-- XML -->
+	      <dependency>
+	<groupId>org.dom4j</groupId>
+	<artifactId>dom4j</artifactId>
+      </dependency>

  </dependencies>

@ -136,7 +142,7 @@
      </dependency>

      <dependency>
-	<groupId>dom4j</groupId>
+	<groupId>org.dom4j</groupId>
 	<artifactId>dom4j</artifactId>
 	<version>2.1.4</version>
      </dependency>