collectors
This commit is contained in:
parent
217dc672be
commit
25917344d6
|
@ -28,7 +28,7 @@
|
|||
<groupId>com.vladmihalcea</groupId>
|
||||
<artifactId>hibernate-types-52</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
<!-- JAXB API, java.xml.bind module -->
|
||||
<dependency>
|
||||
<groupId>jakarta.xml.bind</groupId>
|
||||
|
|
|
@ -0,0 +1,78 @@
|
|||
package eu.dnetlib.services.collector.plugins.filesystem;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Iterator;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import eu.dnetlib.common.exceptions.DnetException;
|
||||
import eu.dnetlib.common.utils.DnetStreamSupport;
|
||||
import eu.dnetlib.common.utils.XmlCleaner;
|
||||
import eu.dnetlib.data.dsm.ApiDesc;
|
||||
|
||||
/**
|
||||
* The Class FilesystemIterable.
|
||||
*
|
||||
* @author Sandro, Michele, Andrea
|
||||
*/
|
||||
public class FileSystemIterable implements Iterable<String> {
|
||||
|
||||
/**
|
||||
* The Constant log.
|
||||
*/
|
||||
private static final Log log = LogFactory.getLog(FileSystemIterable.class);
|
||||
|
||||
/**
|
||||
* The base dir.
|
||||
*/
|
||||
private File baseDir;
|
||||
|
||||
/**
|
||||
* The extensions.
|
||||
*/
|
||||
private String extension;
|
||||
|
||||
/**
|
||||
* Instantiates a new filesystem iterable.
|
||||
*
|
||||
* @param api
|
||||
* the api
|
||||
* @throws DnetException
|
||||
* the collector service exception
|
||||
*/
|
||||
public FileSystemIterable(final ApiDesc api) throws DnetException {
|
||||
try {
|
||||
final String baseUrl = api.getBaseUrl();
|
||||
final URL basePath = new URL(baseUrl);
|
||||
this.baseDir = new File(basePath.getPath());
|
||||
if (!baseDir.exists()) { throw new DnetException(String.format("The base ULR %s, does not exist", basePath.getPath())); }
|
||||
this.extension = api.getParams().get("extensions");
|
||||
} catch (final MalformedURLException e) {
|
||||
throw new DnetException("Filesystem collector failed! ", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<String> iterator() {
|
||||
final FileSystemIterator fsi = new FileSystemIterator(baseDir.getAbsolutePath(), extension);
|
||||
return DnetStreamSupport.stream(fsi)
|
||||
.map(this::loadFile)
|
||||
.iterator();
|
||||
}
|
||||
|
||||
private String loadFile(final String inputFileName) {
|
||||
try (FileInputStream fileInputStream = new FileInputStream(inputFileName)) {
|
||||
final String s = IOUtils.toString(fileInputStream, StandardCharsets.UTF_8.toString());
|
||||
return XmlCleaner.cleanAllEntities(s.startsWith("\uFEFF") ? s.substring(1) : s);
|
||||
} catch (final Exception e) {
|
||||
log.error("Unable to read " + inputFileName);
|
||||
return "";
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,88 @@
|
|||
package eu.dnetlib.services.collector.plugins.filesystem;
|
||||
|
||||
import java.io.IOException;
import java.nio.file.DirectoryIteratorException;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Deque;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;

import org.apache.commons.collections.IteratorUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
/**
|
||||
* Class enabling lazy & recursive iteration of a filesystem tree. The iterator iterates over file paths.
|
||||
*
|
||||
* @author Andrea
|
||||
*/
|
||||
public class FileSystemIterator implements Iterator<String> {
|
||||
|
||||
/**
|
||||
* The logger
|
||||
*/
|
||||
private static final Log log = LogFactory.getLog(FileSystemIterator.class);
|
||||
|
||||
private final Set<String> extensions;
|
||||
private Iterator<Path> pathIterator;
|
||||
private String current;
|
||||
|
||||
public FileSystemIterator(final String baseDir, final String extensions) {
|
||||
this.extensions = new HashSet<>(Arrays.asList(extensions.split(",")));
|
||||
try {
|
||||
this.pathIterator = Files.newDirectoryStream(Paths.get(baseDir)).iterator();
|
||||
this.current = walkTillNext();
|
||||
} catch (final IOException e) {
|
||||
log.error("Cannot initialize File System Iterator. Is this path correct? " + baseDir);
|
||||
throw new RuntimeException("Filesystem collection error.", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return current != null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized String next() {
|
||||
final String pivot = new String(current);
|
||||
current = walkTillNext();
|
||||
log.debug("Returning: " + pivot);
|
||||
return pivot;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {}
|
||||
|
||||
/**
|
||||
* Walk the filesystem recursively until it finds a candidate. Strategies: a) For any directory found during the walk, an iterator is
|
||||
* built and concat to the main one; b) Any file is checked against admitted extensions
|
||||
*
|
||||
* @return the next element to be returned by next call of this.next()
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
private synchronized String walkTillNext() {
|
||||
while (pathIterator.hasNext()) {
|
||||
final Path nextFilePath = pathIterator.next();
|
||||
if (Files.isDirectory(nextFilePath)) {
|
||||
// concat
|
||||
try {
|
||||
pathIterator = IteratorUtils.chainedIterator(pathIterator, Files.newDirectoryStream(nextFilePath).iterator());
|
||||
log.debug("Adding folder iterator: " + nextFilePath.toString());
|
||||
} catch (final IOException e) {
|
||||
log.error("Cannot create folder iterator! Is this path correct? " + nextFilePath.toString());
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
if (extensions.contains(FilenameUtils.getExtension(nextFilePath.toString()))) {
|
||||
log.debug("Returning: " + nextFilePath.toString());
|
||||
return nextFilePath.toString();
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -12,42 +12,13 @@
|
|||
<packaging>jar</packaging>
|
||||
|
||||
<dependencies>
|
||||
|
||||
<!-- Mail -->
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-data-jpa</artifactId>
|
||||
<groupId>javax.mail</groupId>
|
||||
<artifactId>mail</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-json</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.postgresql</groupId>
|
||||
<artifactId>postgresql</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.vladmihalcea</groupId>
|
||||
<artifactId>hibernate-types-52</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- JAXB API, java.xml.bind module -->
|
||||
<dependency>
|
||||
<groupId>jakarta.xml.bind</groupId>
|
||||
<artifactId>jakarta.xml.bind-api</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- JAXB Runtime, com.sun.xml.bind module -->
|
||||
<dependency>
|
||||
<groupId>org.glassfish.jaxb</groupId>
|
||||
<artifactId>jaxb-runtime</artifactId>
|
||||
</dependency>
|
||||
|
||||
<!-- CSV -->
|
||||
<dependency>
|
||||
<groupId>com.opencsv</groupId>
|
||||
<artifactId>opencsv</artifactId>
|
||||
<version>5.4</version>
|
||||
</dependency>
|
||||
|
||||
|
||||
<!-- hot swapping, disable cache for template, enable live reload -->
|
||||
<dependency>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
|
|
|
@ -14,8 +14,8 @@ import javax.mail.internet.InternetAddress;
|
|||
import javax.mail.internet.MimeMessage;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
|
@ -26,7 +26,7 @@ import eu.dnetlib.data.mail.EmailMessage;
|
|||
@Component
|
||||
public class EmailService {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(EmailService.class);
|
||||
private static final Log log = LogFactory.getLog(EmailService.class);
|
||||
|
||||
@Autowired
|
||||
private EmailConfiguration conf;
|
||||
|
|
|
@ -23,11 +23,7 @@
|
|||
<scope>provided</scope>
|
||||
</dependency>
|
||||
-->
|
||||
<!-- Mail -->
|
||||
<dependency>
|
||||
<groupId>javax.mail</groupId>
|
||||
<artifactId>mail</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
<!-- for /metrics and /health controllers -->
|
||||
<dependency>
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
package eu.dnetlib.common.utils;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Spliterator;
|
||||
import java.util.Spliterators;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
/**
 * Static helpers bridging {@link Iterator}s and the Stream API. Not meant to be instantiated.
 */
public final class DnetStreamSupport {

    private DnetStreamSupport() {
        // static utility class
    }

    /**
     * Wraps an iterator into a sequential, ordered stream of unknown size. The iterator is consumed lazily, as the
     * stream is traversed.
     *
     * @param <T>
     *            the type of the elements
     * @param iterator
     *            the source of the elements
     * @return a sequential stream over the remaining elements of the iterator
     */
    public static <T> Stream<T> stream(final Iterator<T> iterator) {
        final Spliterator<T> spliterator = Spliterators.spliteratorUnknownSize(iterator, Spliterator.ORDERED);
        return StreamSupport.stream(spliterator, false);
    }

}
|
|
@ -0,0 +1,262 @@
|
|||
package eu.dnetlib.common.utils;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
/**
|
||||
* @author jochen, Andreas Czerniak
|
||||
*
|
||||
*/
|
||||
public class XmlCleaner {
|
||||
|
||||
/**
|
||||
* Pattern for numeric entities.
|
||||
*/
|
||||
private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); //$NON-NLS-1$
|
||||
// private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); //$NON-NLS-1$
|
||||
|
||||
// see https://www.w3.org/TR/REC-xml/#charsets , not only limited to 
|
||||
private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];");
|
||||
|
||||
/**
|
||||
* Pattern that negates the allowable XML 4 byte unicode characters. Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
|
||||
* [#x10000-#x10FFFF]
|
||||
*/
|
||||
private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); //$NON-NLS-1$
|
||||
|
||||
// Map entities to their unicode equivalent
|
||||
private static Set<String> goodEntities = new HashSet<>();
|
||||
private static Map<String, String> badEntities = new HashMap<>();
|
||||
|
||||
static {
|
||||
// pre-defined XML entities
|
||||
goodEntities.add("""); //$NON-NLS-1$ // quotation mark
|
||||
goodEntities.add("&"); //$NON-NLS-1$ // ampersand
|
||||
goodEntities.add("<"); //$NON-NLS-1$ // less-than sign
|
||||
goodEntities.add(">"); //$NON-NLS-1$ // greater-than sign
|
||||
// control entities
|
||||
// badEntities.put("", "");
|
||||
badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("€", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("‚", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("ƒ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("„", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("…", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("†", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("‡", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("ˆ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("‰", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("Š", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("‹", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("Œ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("Ž", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("‘", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("’", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("“", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("”", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("•", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("–", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("—", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("˜", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("™", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("š", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("›", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("œ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("ž", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
badEntities.put("Ÿ", " "); //$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character
|
||||
// misc entities
|
||||
badEntities.put("€", "\u20AC"); //$NON-NLS-1$ //$NON-NLS-2$ // euro
|
||||
badEntities.put("‘", "\u2018"); //$NON-NLS-1$ //$NON-NLS-2$ // left single quotation mark
|
||||
badEntities.put("’", "\u2019"); //$NON-NLS-1$ //$NON-NLS-2$ // right single quotation mark
|
||||
// Latin 1 entities
|
||||
badEntities.put(" ", "\u00A0"); //$NON-NLS-1$ //$NON-NLS-2$ // no-break space
|
||||
badEntities.put("¡", "\u00A1"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted exclamation mark
|
||||
badEntities.put("¢", "\u00A2"); //$NON-NLS-1$ //$NON-NLS-2$ // cent sign
|
||||
badEntities.put("£", "\u00A3"); //$NON-NLS-1$ //$NON-NLS-2$ // pound sign
|
||||
badEntities.put("¤", "\u00A4"); //$NON-NLS-1$ //$NON-NLS-2$ // currency sign
|
||||
badEntities.put("¥", "\u00A5"); //$NON-NLS-1$ //$NON-NLS-2$ // yen sign
|
||||
badEntities.put("¦", "\u00A6"); //$NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar
|
||||
badEntities.put("§", "\u00A7"); //$NON-NLS-1$ //$NON-NLS-2$ // section sign
|
||||
badEntities.put("¨", "\u00A8"); //$NON-NLS-1$ //$NON-NLS-2$ // diaeresis
|
||||
badEntities.put("©", "\u00A9"); //$NON-NLS-1$ //$NON-NLS-2$ // copyright sign
|
||||
badEntities.put("ª", "\u00AA"); //$NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal indicator
|
||||
badEntities.put("«", "\u00AB"); //$NON-NLS-1$ //$NON-NLS-2$ // left-pointing double angle quotation mark
|
||||
badEntities.put("¬", "\u00AC"); //$NON-NLS-1$ //$NON-NLS-2$ // not sign
|
||||
badEntities.put("­", "\u00AD"); //$NON-NLS-1$ //$NON-NLS-2$ // soft hyphen
|
||||
badEntities.put("®", "\u00AE"); //$NON-NLS-1$ //$NON-NLS-2$ // registered sign
|
||||
badEntities.put("¯", "\u00AF"); //$NON-NLS-1$ //$NON-NLS-2$ // macron
|
||||
badEntities.put("°", "\u00B0"); //$NON-NLS-1$ //$NON-NLS-2$ // degree sign
|
||||
badEntities.put("±", "\u00B1"); //$NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign
|
||||
badEntities.put("²", "\u00B2"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript two
|
||||
badEntities.put("³", "\u00B3"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript three
|
||||
badEntities.put("´", "\u00B4"); //$NON-NLS-1$ //$NON-NLS-2$ // acute accent
|
||||
badEntities.put("µ", "\u00B5"); //$NON-NLS-1$ //$NON-NLS-2$ // micro sign
|
||||
badEntities.put("¶", "\u00B6"); //$NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign
|
||||
badEntities.put("·", "\u00B7"); //$NON-NLS-1$ //$NON-NLS-2$ // middle dot
|
||||
badEntities.put("¸", "\u00B8"); //$NON-NLS-1$ //$NON-NLS-2$ // cedilla
|
||||
badEntities.put("¹", "\u00B9"); //$NON-NLS-1$ //$NON-NLS-2$ // superscript one
|
||||
badEntities.put("º", "\u00BA"); //$NON-NLS-1$ //$NON-NLS-2$ // masculine ordinal indicator
|
||||
badEntities.put("»", "\u00BB"); //$NON-NLS-1$ //$NON-NLS-2$ // right-pointing double angle quotation mark
|
||||
badEntities.put("¼", "\u00BC"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one quarter
|
||||
badEntities.put("½", "\u00BD"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one half
|
||||
badEntities.put("¾", "\u00BE"); //$NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction three quarters
|
||||
badEntities.put("¿", "\u00BF"); //$NON-NLS-1$ //$NON-NLS-2$ // inverted question mark
|
||||
badEntities.put("À", "\u00C0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with grave
|
||||
badEntities.put("Á", "\u00C1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with acute
|
||||
badEntities.put("Â", "\u00C2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with circumflex
|
||||
badEntities.put("Ã", "\u00C3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with tilde
|
||||
badEntities.put("Ä", "\u00C4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with diaeresis
|
||||
badEntities.put("Å", "\u00C5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A with ring above
|
||||
badEntities.put("Æ", "\u00C6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter AE
|
||||
badEntities.put("Ç", "\u00C7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter C with cedilla
|
||||
badEntities.put("È", "\u00C8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with grave
|
||||
badEntities.put("É", "\u00C9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with acute
|
||||
badEntities.put("Ê", "\u00CA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with circumflex
|
||||
badEntities.put("Ë", "\u00CB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E with diaeresis
|
||||
badEntities.put("Ì", "\u00CC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with grave
|
||||
badEntities.put("Í", "\u00CD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with acute
|
||||
badEntities.put("Î", "\u00CE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with circumflex
|
||||
badEntities.put("Ï", "\u00CF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I with diaeresis
|
||||
badEntities.put("Ð", "\u00D0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter ETH
|
||||
badEntities.put("Ñ", "\u00D1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter N with tilde
|
||||
badEntities.put("Ò", "\u00D2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with grave
|
||||
badEntities.put("Ó", "\u00D3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with acute
|
||||
badEntities.put("Ô", "\u00D4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with circumflex
|
||||
badEntities.put("Õ", "\u00D5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with tilde
|
||||
badEntities.put("Ö", "\u00D6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with diaeresis
|
||||
badEntities.put("×", "\u00D7"); //$NON-NLS-1$ //$NON-NLS-2$ // multiplication sign
|
||||
badEntities.put("Ø", "\u00D8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O with stroke
|
||||
badEntities.put("Ù", "\u00D9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with grave
|
||||
badEntities.put("Ú", "\u00DA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with acute
|
||||
badEntities.put("Û", "\u00DB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with circumflex
|
||||
badEntities.put("Ü", "\u00DC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U with diaeresis
|
||||
badEntities.put("Ý", "\u00DD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter Y with acute
|
||||
badEntities.put("Þ", "\u00DE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin capital letter THORN
|
||||
badEntities.put("ß", "\u00DF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter sharp s
|
||||
badEntities.put("à", "\u00E0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with grave
|
||||
badEntities.put("á", "\u00E1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with acute
|
||||
badEntities.put("â", "\u00E2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with circumflex
|
||||
badEntities.put("ã", "\u00E3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with tilde
|
||||
badEntities.put("ä", "\u00E4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with diaeresis
|
||||
badEntities.put("å", "\u00E5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter a with ring above
|
||||
badEntities.put("æ", "\u00E6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter ae
|
||||
badEntities.put("ç", "\u00E7"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter c with cedilla
|
||||
badEntities.put("è", "\u00E8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with grave
|
||||
badEntities.put("é", "\u00E9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with acute
|
||||
badEntities.put("ê", "\u00EA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with circumflex
|
||||
badEntities.put("ë", "\u00EB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter e with diaeresis
|
||||
badEntities.put("ì", "\u00EC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with grave
|
||||
badEntities.put("í", "\u00ED"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with acute
|
||||
badEntities.put("î", "\u00EE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with circumflex
|
||||
badEntities.put("ï", "\u00EF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter i with diaeresis
|
||||
badEntities.put("ð", "\u00F0"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter eth
|
||||
badEntities.put("ñ", "\u00F1"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter n with tilde
|
||||
badEntities.put("ò", "\u00F2"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with grave
|
||||
badEntities.put("ó", "\u00F3"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with acute
|
||||
badEntities.put("ô", "\u00F4"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with circumflex
|
||||
badEntities.put("õ", "\u00F5"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with tilde
|
||||
badEntities.put("ö", "\u00F6"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with diaeresis
|
||||
badEntities.put("÷", "\u00F7"); //$NON-NLS-1$ //$NON-NLS-2$ // division sign
|
||||
badEntities.put("ø", "\u00F8"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter o with stroke
|
||||
badEntities.put("ù", "\u00F9"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with grave
|
||||
badEntities.put("ú", "\u00FA"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with acute
|
||||
badEntities.put("û", "\u00FB"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with circumflex
|
||||
badEntities.put("ü", "\u00FC"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter u with diaeresis
|
||||
badEntities.put("ý", "\u00FD"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with acute
|
||||
badEntities.put("þ", "\u00FE"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter thorn
|
||||
badEntities.put("ÿ", "\u00FF"); //$NON-NLS-1$ //$NON-NLS-2$ // latin small letter y with diaeresis
|
||||
}
|
||||
|
||||
/**
|
||||
* For each entity in the input that is not allowed in XML, replace the entity with its unicode equivalent or remove it. For each
|
||||
* instance of a bare {@literal &}, replace it with {@literal &<br/>
|
||||
* } XML only allows 4 entities: {@literal &amp;}, {@literal &quot;}, {@literal &lt;} and {@literal &gt;}.
|
||||
*
|
||||
* @param broken
|
||||
* the string to handle entities
|
||||
* @return the string with entities appropriately fixed up
|
||||
*/
|
||||
static public String cleanAllEntities(final String broken) {
|
||||
if (StringUtils.isBlank(broken)) { return null; }
|
||||
|
||||
String working = invalidControlCharPattern.matcher(broken).replaceAll("");
|
||||
working = invalidCharacterPattern.matcher(working).replaceAll("");
|
||||
|
||||
int cleanfrom = 0;
|
||||
|
||||
while (true) {
|
||||
int amp = working.indexOf('&', cleanfrom);
|
||||
// If there are no more amps then we are done
|
||||
if (amp == -1) {
|
||||
break;
|
||||
}
|
||||
// Skip references of the kind &#ddd;
|
||||
if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
|
||||
cleanfrom = working.indexOf(';', amp) + 1;
|
||||
continue;
|
||||
}
|
||||
int i = amp + 1;
|
||||
while (true) {
|
||||
// if we are at the end of the string then just escape the '&';
|
||||
if (i >= working.length()) {
|
||||
return working.substring(0, amp) + "&" + working.substring(amp + 1); //$NON-NLS-1$
|
||||
}
|
||||
// if we have come to a ; then we have an entity
|
||||
// If it is something that xml can't handle then replace it.
|
||||
final char c = working.charAt(i);
|
||||
if (c == ';') {
|
||||
final String entity = working.substring(amp, i + 1);
|
||||
final String replace = handleEntity(entity);
|
||||
working = working.substring(0, amp) + replace + working.substring(i + 1);
|
||||
break;
|
||||
}
|
||||
// Did we end an entity without finding a closing ;
|
||||
// Then treat it as an '&' that needs to be replaced with &
|
||||
if (!Character.isLetterOrDigit(c)) {
|
||||
working = working.substring(0, amp) + "&" + working.substring(amp + 1); //$NON-NLS-1$
|
||||
amp = i + 4; // account for the 4 extra characters
|
||||
break;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
cleanfrom = amp + 1;
|
||||
}
|
||||
|
||||
if (Pattern.compile("<<").matcher(working).find()) {
|
||||
working = working.replaceAll("<<", "<<");
|
||||
}
|
||||
|
||||
if (Pattern.compile(">>").matcher(working).find()) {
|
||||
working = working.replaceAll(">>", ">>");
|
||||
}
|
||||
|
||||
return working;
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace entity with its unicode equivalent, if it is not a valid XML entity. Otherwise strip it out. XML only allows 4 entities:
|
||||
* &amp;, &quot;, &lt; and &gt;.
|
||||
*
|
||||
* @param entity
|
||||
* the entity to be replaced
|
||||
* @return the substitution for the entity, either itself, the unicode equivalent or an empty string.
|
||||
*/
|
||||
private static String handleEntity(final String entity) {
|
||||
if (goodEntities.contains(entity)) { return entity; }
|
||||
|
||||
final String replace = badEntities.get(entity);
|
||||
if (replace != null) { return replace; }
|
||||
|
||||
return replace != null ? replace : "";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,65 @@
|
|||
package eu.dnetlib.data.dsm;
|
||||
|
||||
import java.util.HashMap;
|
||||
import org.dom4j.Node;
|
||||
|
||||
import javax.xml.bind.annotation.XmlRootElement;
|
||||
|
||||
@XmlRootElement
|
||||
public class ApiDesc {
|
||||
|
||||
private String id;
|
||||
|
||||
private String baseUrl;
|
||||
|
||||
private String protocol;
|
||||
|
||||
private HashMap<String, String> params = new HashMap<>();
|
||||
|
||||
public static ApiDesc newInstance(final Node node) {
|
||||
final ApiDesc ifc = new ApiDesc();
|
||||
ifc.setId(node.valueOf("./@id"));
|
||||
ifc.setBaseUrl(node.valueOf("./BASE_URL"));
|
||||
ifc.setProtocol(node.valueOf("./ACCESS_PROTOCOL"));
|
||||
|
||||
for (final Object o : node.selectNodes("./ACCESS_PROTOCOL/@*")) {
|
||||
final Node n = (Node) o;
|
||||
ifc.getParams().put(n.getName(), n.getText());
|
||||
}
|
||||
|
||||
return ifc;
|
||||
}
|
||||
|
||||
public String getBaseUrl() {
|
||||
return baseUrl;
|
||||
}
|
||||
|
||||
public void setBaseUrl(final String baseUrl) {
|
||||
this.baseUrl = baseUrl;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public HashMap<String, String> getParams() {
|
||||
return params;
|
||||
}
|
||||
|
||||
public void setParams(final HashMap<String, String> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public String getProtocol() {
|
||||
return protocol;
|
||||
}
|
||||
|
||||
public void setProtocol(final String protocol) {
|
||||
this.protocol = protocol;
|
||||
}
|
||||
|
||||
}
|
10
pom.xml
10
pom.xml
|
@ -103,8 +103,14 @@
|
|||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-pool2</artifactId>
|
||||
<version>2.11.1</version>
|
||||
</dependency>
|
||||
|
||||
|
||||
<!-- XML -->
|
||||
<dependency>
|
||||
<groupId>org.dom4j</groupId>
|
||||
<artifactId>dom4j</artifactId>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
@ -136,7 +142,7 @@
|
|||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>dom4j</groupId>
|
||||
<groupId>org.dom4j</groupId>
|
||||
<artifactId>dom4j</artifactId>
|
||||
<version>2.1.4</version>
|
||||
</dependency>
|
||||
|
|
Loading…
Reference in New Issue