diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java index 2ea3f35cc..9d9400068 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java @@ -19,6 +19,8 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.aggregation.common.ReporterCallback; import eu.dnetlib.dhp.aggregation.common.ReportingJob; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; @@ -114,6 +116,10 @@ public class CollectorWorker extends ReportingJob { return new OaiCollectorPlugin(clientParams); case rest_json2xml: return new RestCollectorPlugin(clientParams); + case file: + return new FileCollectorPlugin(fileSystem); + case fileGZip: + return new FileGZipCollectorPlugin(fileSystem); case other: final CollectorPlugin.NAME.OTHER_NAME plugin = Optional .ofNullable(api.getParams().get("other_plugin_type")) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index 841d42fea..08084e22a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -10,7 +10,7 @@ import eu.dnetlib.dhp.common.collection.CollectorException; public interface CollectorPlugin { enum 
NAME { - oai, other, rest_json2xml; + oai, other, rest_json2xml, file, fileGZip; public enum OTHER_NAME { mdstore_mongodb_dump, mdstore_mongodb
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/AbstractSplittedRecordPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/AbstractSplittedRecordPlugin.java
new file mode 100644
index 000000000..f2fa3d2bb
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/AbstractSplittedRecordPlugin.java
@@ -0,0 +1,118 @@
+
+package eu.dnetlib.dhp.collection.plugin.file;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Optional;
+import java.util.Spliterator;
+import java.util.Spliterators;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.collection.ApiDescriptor;
+import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
+import eu.dnetlib.dhp.collection.plugin.utils.XMLIterator;
+import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
+import eu.dnetlib.dhp.common.collection.CollectorException;
+
+/**
+ * Base collector plugin that reads one XML file from the configured {@link FileSystem} and splits it into
+ * records, one per occurrence of the element named by the {@code splitOnElement} API parameter. Subclasses
+ * decide how the file is opened through {@link #getBufferedInputStream(Path)}.
+ */
+public abstract class AbstractSplittedRecordPlugin implements CollectorPlugin {
+
+    private static final Logger log = LoggerFactory.getLogger(AbstractSplittedRecordPlugin.class);
+
+    /** Name of the API parameter that holds the element to split the file on. */
+    public static final String SPLIT_ON_ELEMENT = "splitOnElement";
+
+    private final FileSystem fileSystem;
+
+    public AbstractSplittedRecordPlugin(FileSystem fileSystem) {
+        this.fileSystem = fileSystem;
+    }
+
+    /**
+     * Opens the file referenced by the API base URL and lazily streams the records found in it.
+     *
+     * @param api descriptor providing the file path (base URL) and the {@code splitOnElement} parameter
+     * @param report aggregation report; not used by this plugin
+     * @return a sequential stream of records; closing the stream closes the underlying file
+     * @throws CollectorException if the base URL or the split parameter is missing, if the path does not exist
+     *             or cannot be checked, or if the file cannot be opened
+     */
+    @Override
+    public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {
+
+        // get path to file
+        final Path filePath = Optional
+            .ofNullable(api.getBaseUrl())
+            .map(Path::new)
+            .orElseThrow(() -> new CollectorException("missing baseUrl"));
+
+        log.info("baseUrl: {}", filePath);
+
+        // check that path to file exists
+        try {
+            if (!fileSystem.exists(filePath)) {
+                throw new CollectorException("path does not exist: " + filePath);
+            }
+        } catch (IOException e) {
+            throw new CollectorException(e);
+        }
+
+        // get split element; a missing params map is reported as a missing parameter rather than as an NPE
+        final String splitOnElement = Optional
+            .ofNullable(api.getParams())
+            .map(params -> params.get(SPLIT_ON_ELEMENT))
+            .orElseThrow(
+                () -> new CollectorException(String
+                    .format("missing parameter '%s', required by the AbstractSplittedRecordPlugin", SPLIT_ON_ELEMENT)));
+
+        log.info("splitOnElement: {}", splitOnElement);
+
+        final BufferedInputStream bis = getBufferedInputStream(filePath);
+
+        final Iterator<String> xmlIterator;
+        try {
+            xmlIterator = new XMLIterator(splitOnElement, bis);
+        } catch (RuntimeException e) {
+            // the parser could not be set up: do not leak the stream that was just opened
+            closeQuietly(bis, filePath);
+            throw e;
+        }
+
+        return StreamSupport
+            .stream(Spliterators.spliteratorUnknownSize(xmlIterator, Spliterator.ORDERED), false)
+            .onClose(() -> closeQuietly(bis, filePath));
+    }
+
+    /** Closes the given stream, logging instead of propagating a failure. */
+    private static void closeQuietly(final BufferedInputStream bis, final Path filePath) {
+        try {
+            bis.close();
+        } catch (IOException e) {
+            log.warn("error closing {}", filePath, e);
+        }
+    }
+
+    /**
+     * Opens the file at the given path for reading.
+     *
+     * @param filePath path of the file to open
+     * @return a buffered stream over the content to be parsed
+     * @throws CollectorException if the file cannot be opened
+     */
+    abstract protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException;
+
+    public FileSystem getFileSystem() {
+        return fileSystem;
+    }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/FileCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/FileCollectorPlugin.java new file mode 100644 index 000000000..f771def93 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/FileCollectorPlugin.java @@ -0,0 +1,33 @@ + +package eu.dnetlib.dhp.collection.plugin.file; + +import java.io.BufferedInputStream; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.common.collection.CollectorException; + +public class FileCollectorPlugin extends AbstractSplittedRecordPlugin { + + private static final Logger log = LoggerFactory.getLogger(FileCollectorPlugin.class); + + public FileCollectorPlugin(FileSystem fileSystem) { + super(fileSystem); + } +
+ @Override + protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException { + + log.info("filePath: {}", filePath); + + try { + FileSystem fs = super.getFileSystem(); + return new BufferedInputStream(fs.open(filePath)); + } catch (Exception e) { + throw new CollectorException("Error reading file " + filePath, e); + } + } +}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipCollectorPlugin.java
new file mode 100644
index 000000000..91a6e9f16
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipCollectorPlugin.java
@@ -0,0 +1,54 @@
+
+package eu.dnetlib.dhp.collection.plugin.file;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.common.collection.CollectorException;
+
+/** Collector plugin that reads its records from a gzip-compressed file. */
+public class FileGZipCollectorPlugin extends AbstractSplittedRecordPlugin {
+
+    private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPlugin.class);
+
+    public FileGZipCollectorPlugin(FileSystem fileSystem) {
+        super(fileSystem);
+    }
+
+    /**
+     * Opens the file and wraps it so that the returned stream yields the decompressed content.
+     *
+     * @param filePath path of the gzip file on the plugin file system
+     * @return a buffered stream over the decompressed bytes
+     * @throws CollectorException if the file cannot be opened or does not start with a valid gzip header
+     */
+    @Override
+    protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException {
+
+        log.info("filePath: {}", filePath);
+
+        InputStream raw = null;
+        try {
+            raw = getFileSystem().open(filePath);
+            return new BufferedInputStream(new GZIPInputStream(raw));
+        } catch (Exception e) {
+            // GZIPInputStream reads the gzip header in its constructor: when that fails the raw stream is
+            // already open and nobody else holds a reference to it, so it has to be closed here
+            if (raw != null) {
+                try {
+                    raw.close();
+                } catch (IOException closeError) {
+                    e.addSuppressed(closeError);
+                }
+            }
+            throw new CollectorException("Error reading file " + filePath, e);
+        }
+    }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index 566c6b216..28b2572fb 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -19,7 +19,7 @@ import org.dom4j.io.XMLWriter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import eu.dnetlib.dhp.collection.XmlCleaner; +import eu.dnetlib.dhp.collection.plugin.utils.XmlCleaner; import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.HttpConnector2; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java index 64a041fd4..e4bad2f8d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -30,7 +30,7 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; -import eu.dnetlib.dhp.collection.JsonUtils; +import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils; import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.HttpClientParams; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java similarity index 98% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java index da3768a4a..15401e223 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection; +package eu.dnetlib.dhp.collection.plugin.utils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory;
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java
new file mode 100644
index 000000000..e05fe263a
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java
@@ -0,0 +1,189 @@
+
+package eu.dnetlib.dhp.collection.plugin.utils;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringWriter;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import javax.xml.stream.XMLEventFactory;
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLEventWriter;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLOutputFactory;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.events.StartElement;
+import javax.xml.stream.events.XMLEvent;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * Splits an XML stream into records: each element whose local name equals the splitter element is serialized
+ * to a string and returned by {@link #next()}.
+ * <p>
+ * The input is decoded as UTF-8, replacing malformed or unmappable bytes. A record ends at the first closing
+ * tag carrying the splitter name, so splitter elements nested inside each other are not supported.
+ */
+public class XMLIterator implements Iterator<String> {
+
+    private static final Log log = LogFactory.getLog(XMLIterator.class);
+
+    private final ThreadLocal<XMLInputFactory> inputFactory = ThreadLocal.withInitial(XMLIterator::newInputFactory);
+
+    private final ThreadLocal<XMLOutputFactory> outputFactory = ThreadLocal.withInitial(XMLOutputFactory::newInstance);
+
+    private final ThreadLocal<XMLEventFactory> eventFactory = ThreadLocal.withInitial(XMLEventFactory::newInstance);
+
+    public static final String UTF_8 = "UTF-8";
+
+    final XMLEventReader parser;
+
+    private XMLEvent current = null;
+
+    private String element;
+
+    private InputStream inputStream;
+
+    /**
+     * Creates the iterator and moves to the first splitter element, if there is one.
+     *
+     * @param element local name of the element that delimits a record
+     * @param inputStream the XML content to split
+     * @throws RuntimeException if the XML event reader cannot be created
+     */
+    public XMLIterator(final String element, final InputStream inputStream) {
+        super();
+        this.element = element;
+        this.inputStream = inputStream;
+        this.parser = getParser();
+        try {
+            this.current = findElement(parser);
+        } catch (XMLStreamException e) {
+            log.warn("cannot init parser position. No element found: " + element);
+            current = null;
+        }
+    }
+
+    @Override
+    public boolean hasNext() {
+        return current != null;
+    }
+
+    /**
+     * Returns the current record and moves to the next splitter element.
+     *
+     * @throws NoSuchElementException if no record is left
+     * @throws RuntimeException if the stream cannot be parsed while looking for the next record
+     */
+    @Override
+    public String next() {
+        if (current == null) {
+            throw new NoSuchElementException("no more '" + element + "' elements");
+        }
+        String result = null;
+        try {
+            result = copy(parser);
+            current = findElement(parser);
+            return result;
+        } catch (XMLStreamException e) {
+            throw new RuntimeException(String.format("error copying xml, built so far: '%s'", result), e);
+        }
+    }
+
+    @Override
+    public void remove() {
+        throw new UnsupportedOperationException();
+    }
+
+    /**
+     * Serializes the record that starts at {@code current}, consuming events up to and including the first
+     * closing tag named like the splitter element.
+     * <p>
+     * Best effort: the {@code return} inside {@code finally} discards any exception raised while copying, so a
+     * record cut short by a parse error is returned with whatever was written before the failure.
+     */
+    @SuppressWarnings("finally")
+    private String copy(final XMLEventReader parser) throws XMLStreamException {
+        final StringWriter result = new StringWriter();
+        try {
+            final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(result);
+            final StartElement start = current.asStartElement();
+            final StartElement newRecord = eventFactory
+                .get()
+                .createStartElement(start.getName(), start.getAttributes(), start.getNamespaces());
+
+            // new root record
+            writer.add(newRecord);
+
+            // copy the rest as it is
+            while (parser.hasNext()) {
+                final XMLEvent event = parser.nextEvent();
+
+                // TODO: replace with depth tracking instead of close tag tracking.
+                if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) {
+                    writer.add(event);
+                    break;
+                }
+
+                writer.add(event);
+            }
+            writer.close();
+        } finally {
+            return result.toString();
+        }
+    }
+
+    /**
+     * Looks for the next occurrence of the splitter element, consuming every event up to and including its
+     * opening tag.
+     * <p>
+     * The opening tag has to be consumed here because {@link #copy(XMLEventReader)} re-creates it from
+     * {@code current}: if it were only peeked at and left in the reader, it would be written twice whenever a
+     * record immediately follows the previous one with no text in between.
+     *
+     * @param parser the event reader to advance
+     * @return the start element of the next record, or {@code null} when no further record is found
+     * @throws XMLStreamException if the underlying stream cannot be parsed
+     */
+    private XMLEvent findElement(final XMLEventReader parser) throws XMLStreamException {
+        while (parser.hasNext()) {
+            final XMLEvent event = parser.nextEvent();
+            if (event != null && event.isStartElement()) {
+                String name = event.asStartElement().getName().getLocalPart();
+                if (element.equals(name)) {
+                    return event;
+                }
+            }
+        }
+        return null;
+    }
+
+    /** Creates an input factory that does not resolve external entities (XXE hardening). */
+    private static XMLInputFactory newInputFactory() {
+        final XMLInputFactory factory = XMLInputFactory.newInstance();
+        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
+        return factory;
+    }
+
+    private XMLEventReader getParser() {
+        try {
+            return inputFactory.get().createXMLEventReader(sanitize(inputStream));
+        } catch (XMLStreamException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private Reader sanitize(final InputStream in) {
+        final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
+        charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
+        charsetDecoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+        return new InputStreamReader(in, charsetDecoder);
+    }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/XmlCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XmlCleaner.java similarity index 99% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/XmlCleaner.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XmlCleaner.java index
c674031f6..95d1d2402 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/XmlCleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XmlCleaner.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection; +package eu.dnetlib.dhp.collection.plugin.utils; import java.util.HashMap; import java.util.HashSet;
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileCollectorPluginTest.java
new file mode 100644
index 000000000..6fd101634
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileCollectorPluginTest.java
@@ -0,0 +1,75 @@
+
+package eu.dnetlib.dhp.collection.plugin.file;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Objects;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocalFileSystem;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import eu.dnetlib.dhp.collection.ApiDescriptor;
+import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
+import eu.dnetlib.dhp.common.collection.CollectorException;
+import net.bytebuddy.asm.Advice;
+
+/** Collects records from the plain {@code opendoar.xml} test resource with the {@link FileCollectorPlugin}. */
+public class FileCollectorPluginTest {
+
+    private static final Logger log = LoggerFactory.getLogger(FileCollectorPluginTest.class);
+
+    private final ApiDescriptor api = new ApiDescriptor();
+
+    private FileCollectorPlugin plugin;
+
+    private static final String SPLIT_ON_ELEMENT = "repository";
+
+    @BeforeEach
+    public void setUp() throws IOException {
+
+        // fail with a clear message, not a bare NPE, when the resource is missing from the classpath
+        final String xmlFile = Objects
+            .requireNonNull(
+                this
+                    .getClass()
+                    .getResource("/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml"),
+                "test resource opendoar.xml not found")
+            .getFile();
+
+        api.setBaseUrl(xmlFile);
+
+        HashMap<String, String> params = new HashMap<>();
+        params.put("splitOnElement", SPLIT_ON_ELEMENT);
+
+        api.setParams(params);
+
+        FileSystem fs = FileSystem.get(new Configuration());
+        plugin = new FileCollectorPlugin(fs);
+    }
+
+    @Test
+    void test() throws CollectorException {
+
+        final List<String> records;
+        try (Stream<String> stream = plugin.collect(api, new AggregatorReport())) {
+            records = stream.limit(10).collect(Collectors.toList());
+        }
+
+        // without this check the test would also pass when no record at all is collected
+        Assertions.assertFalse(records.isEmpty());
+
+        records.forEach(s -> {
+            Assertions.assertTrue(s.length() > 0);
+            log.info(s);
+        });
+    }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipCollectorPluginTest.java new file mode 100644 index 000000000..dc24d6f13 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipCollectorPluginTest.java @@ -0,0 +1,68 @@ + +package eu.dnetlib.dhp.collection.plugin.file; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.HashMap; +import java.util.Objects; +import java.util.stream.Stream; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mockito; +import org.mockito.junit.jupiter.MockitoExtension; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.common.aggregation.AggregatorReport; +import eu.dnetlib.dhp.common.collection.CollectorException; + +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) +@ExtendWith(MockitoExtension.class) +public class FileGZipCollectorPluginTest { + + private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class); + + private final ApiDescriptor api = new ApiDescriptor(); + + private FileGZipCollectorPlugin plugin; + + private static final
String SPLIT_ON_ELEMENT = "repository"; + + @BeforeEach + public void setUp() throws IOException { + + final String gzipFile = Objects + .requireNonNull( + this + .getClass() + .getResource("/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml.gz")) + .getFile(); + + api.setBaseUrl(gzipFile); + + HashMap params = new HashMap<>(); + params.put("splitOnElement", SPLIT_ON_ELEMENT); + + api.setParams(params); + + FileSystem fs = FileSystem.get(new Configuration()); + plugin = new FileGZipCollectorPlugin(fs); + } + + @Test + void test() throws CollectorException { + + final Stream stream = plugin.collect(api, new AggregatorReport()); + + stream.limit(10).forEach(s -> { + Assertions.assertTrue(s.length() > 0); + log.info(s); + }); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml new file mode 100644 index 000000000..e5806a60e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml @@ -0,0 +1,1079 @@ + + + Copyright 2012, University of Nottingham + OpenDOAR data is available for re-use under a Creative Commons Attribution-Non-Commercial-Share Alike licence + + + Architektur-Informatik + + Y + http://architektur-informatik.scix.net/ + http://architektur-informatik.scix.net/cgi-bin/works/OAI + + + + + Arbeitskreis Architekturinformatik + AK AI + Y + http://www.architektur-informatik.org/ + + + AT + Austria + + 46.783300 + 12.950000 + + + This is a German language repository on Computer Science. Full-text is not available for all items. 
+ + 68 + 2008-05-15 + + Disciplinary + Operational + SciX + + + + Cin + Computers and IT + + + + + de + German + + + + Journal articles + Conference and workshop papers + Theses and dissertations + Unpublished reports and working papers + + + + Content + Content policies not stated + + No policy registered in OpenDOAR. + + + + Metadata + Metadata policies not stated + + No policy registered in OpenDOAR. + + + + Data + Full data item policies unknown + + No policy registered in OpenDOAR. + + + + Submission + Submission policies not stated + + No policy registered in OpenDOAR. + + + + Preserve + Preservation policies not stated + + No policy registered in OpenDOAR. + + + + + + Bob Martens + Administrator + b.martens@tuwien.ac.at + + + + + OAI Administrator + architektur-informatik@scix.net + + + + + + Dokumentenserver des LBI-HTA + + Y + http://eprints.hta.lbg.ac.at/ + http://eprints.hta.lbg.ac.at/cgi/oai2 + + + + + Ludwig Boltzmann Institut für Health Technology Assessment + LBI-HTA + Y + http://hta.lbg.ac.at/ + + + AT + Austria + + 48.209200 + 16.372800 + + + This site provides access to the institutions outputs. Users may set up Atom and RSS feeds to be alerted to new content. The interface is available in English and German. Many items are not available as full-text. + Special items include: Newsletters and Decision Support Document + 600 + 2010-02-04 + + Institutional + Operational + EPrints + 3.0.3 + + + Ce + Health and Medicine + + + + + en + English + + + + Journal articles + Conference and workshop papers + Unpublished reports and working papers + Books, chapters and sections + Other special item types + + + + Content + Content policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Metadata + Metadata re-use policy explicitly undefined + + No policy registered in OpenDOAR. + + + + Data + Full data item policies explicitly undefined + + No policy registered in OpenDOAR. 
+ + + + Submission + Submission policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Preserve + Preservation policies explicitly undefined + + No policy registered in OpenDOAR. + + + + + + + Elektronisch archivierte Theorie - Sammelpunkt + + Y + http://sammelpunkt.philo.at:8080/ + http://sammelpunkt.philo.at:8080/cgi/oai2 + Institut für Philosophie + + Y + + Universität Wien + + Y + http://www.univie.ac.at/ + Fakultät für Philosopohie und Bildungswissenschaft, Universitätsstraße 7, A-1010 Wien + + AT + Austria + + 48.209190 + 16.372740 + + + This is a subject based institutional repository hosted by the Institute for Philosophy of the University of Vienna. The interface is primarily available in German only, as are virtually all the papers. However the search form is in English and there are plans to make it available in other European languages. + + 1293 + 2010-02-09 + + Institutional + Operational + EPrints + 3.1.3 + + + Cop + Philosophy and Religion + + + + + de + German + + + + Theses and dissertations + Books, chapters and sections + Other special item types + Journal articles + Unpublished reports and working papers + + + + Content + Content policies defined + + This is an institutional or departmental repository. + The repository holds all types of materials. + + + + Metadata + Metadata re-use permitted for not-for-profit purposes + + Anyone may access the metadata free of charge. + The metadata may be re-used in any medium without prior permission for not-for-profit purposes provided: + + the OAI Identifier or a link to the original metadata record are given + the repository is mentioned + + + The metadata must not be re-used in any medium for commercial purposes without formal permission. + + + + Data + Full data item policies explicitly undefined + + Anyone may access full items free of charge. + No full-item re-use policy defined. Assume no rights at all have been granted. 
+ + + + Submission + Submission policies defined + + Items may only be deposited by accredited members of the institution + Authors may only submit their own work for archiving. + The administrator only vets items for relevance to the scope of the repository + The validity and authenticity of the content of submissions is not checked. + No embargo policy defined. + Any copyright violations are entirely the responsibility of the authors/depositors. + If the repository receives proof of copyright violation, the relevant item will be removed immediately. + + + + Preserve + Preservation policies not stated + + No preservation policy defined. + + + + + + H Hracovec + Site Administrator + hrachov@philo.at + + + + + + Elektronische Publikationen der Wirtschaftsuniversität Wien + Epub WU + Y + http://epub.wu.ac.at/ + http://epub.wu.ac.at/cgi/oai2 + Universtätsbibliothek (University Library) + + Y + http://www.wu.ac.at/library + Wirtschaftsuniversität Wien (Vienna University of Economics) + WU + Y + http://www.wu.ac.at/ + Augasse 2-6, A-Wien + + AT + Austria + + 48.230000 + 16.357000 + 0131-3364990 + + This is the institutional repository of the WU Vienna University of Economics and Business. It provides access to the research output of the institution. Documents are available in full text. The interface is accessable in English. + + 1216 + 2012-02-28 + + Institutional + Operational + EPrints + 3 + + + Cub + Business and Economics + + + + + de + German + + + + Conference and workshop papers + Theses and dissertations + Unpublished reports and working papers + + + + Content + Content policies defined + + This is an institutional or departmental repository. + The repository holds all types of materials. 
+ Deposited items may include: + + working drafts + submitted versions (as sent to journals for peer-review) + accepted versions (author's final peer-reviewed drafts) + published versions (publisher-created files) + + + Items are individually tagged with: + + their version type and date. + their peer-review status. + their publication status. + + + For more information, please see webpage: http://epub.wu.ac.at/policies.html + + + + Metadata + Metadata re-use permitted for not-for-profit purposes + + Anyone may access the metadata free of charge. + The metadata may be re-used in any medium without prior permission for not-for-profit purposes provided the OAI Identifier or a link to the original metadata record are given. + The metadata must not be re-used in any medium for commercial purposes without formal permission. + For more information, please see webpage: http://epub.wu.ac.at/policies.html + + + + Data + Rights vary for the re-use of full data items + + Anyone may access full items free of charge. + Copies of full items generally can be: + + reproduced, and displayed or performed in any format or medium + for personal research or study, educational, or not-for-profit purposes without prior permission or charge. + + provided: + + the authors, title and full bibliographic details are given + a hyperlink and/or URL are given for the original metadata page + the content is not changed in any way + + + Full items must not be sold commercially in any format or medium without formal permission of the copyright holders. + Some full items are individually tagged with different rights permissions and conditions. + For more information see webpage: http://epub.wu.ac.at/policies.html. + + + + Submission + Submission policies defined + + Items may only be deposited by accredited members of the organisation, or their delegated agents. + Authors may only submit their own work for archiving. 
+ The administrator only vets items for the eligibility of authors/depositors, relevance to the scope of the repository, valid layout & format, and the exclusion of spam + The validity and authenticity of the content of submissions is the sole responsibility of the depositor. + No embargo policy defined. + Any copyright violations are entirely the responsibility of the authors/depositors. + If the repository receives proof of copyright violation, the relevant item will be removed immediately. + For more information see webpage: http://epub.wu.ac.at/policies.html + + + + Preserve + Preservation policies defined + + Items will be retained indefinitely. + The repository will try to ensure continued readability and accessibility. + The repository regularly backs up its files according to current best practice. + The original bit stream is retained for all items, in addition to any upgraded formats. + Items may be removed at the request of the author/copyright holder, but this is strongly discouraged. + Withdrawn items are not deleted per se, but are removed from public view. + Withdrawn items' identifiers/URLs are retained indefinitely. + URLs will continue to point to 'tombstone' citations, to avoid broken links and to retain item histories. + Changes to deposited items are not permitted. + Errata and corrigenda lists may be included with the original record if required. + If necessary, an updated version may be deposited. + + The item's persistent URL will always link to the latest version. + There will be links between earlier and later versions, with the most recent version clearly identified. + + + In the event of the repository being closed down, the database will be transferred to another appropriate archive. 
+ For more information see webpage: http://epub.wu.ac.at/policies.html + + + + + + Gertraud Novotny + Administrator + gertraud.novotny@wu.ac.at + + + + + OAI Administrator + epub@wu.ac.at + + + + + + Elektronisches Publikationsportal der Österreichischen Akademie der Wissenschaften + epub.oeaw + Y + http://epub.oeaw.ac.at/ + http://epub.oeaw.ac.at/oai + + + + + Austrian Academy of Sciences + + Y + http://www.oeaw.ac.at/ + Postgasse 7, A-1010 Wien + + AT + Austria + + 48.250000 + 16.350000 + + + This site is a repository providing access to the publication output of the organisation. However only a very small proportion of material is available via Open Access as this site is mainly concerned with subscription-only access to its eBook and printed publications. As such that material which is offered freely is intended to induce a purchasing activity from the reader. The main site interface is available in English or German, however the supporting information and help is in the latter only. Users may set up RSS feeds to be alerted to new content. + Partners: Verlag der Österreichischen Akademie der Wissenschaften + + 2012-02-27 + 2006 + Institutional + Operational + Hyperwave + + + + C + Multidisciplinary + + + + + de + German + + + en + English + + + + Books, chapters and sections + + + + Content + Content policies explicitly undefined + + This is an institutional or departmental repository. + No content policy defined. + + + + Metadata + Metadata re-use policy explicitly undefined + + Anyone may access the metadata free of charge. + No metadata re-use policy defined. Assume no rights at all have been granted. + + + + Data + Full data item policies explicitly undefined + + Anyone may access full items free of charge. + No full-item re-use policy defined. Assume no rights at all have been granted. + + + + Submission + Submission policies explicitly undefined + + No submission policy defined. 
+ + + + Preserve + Preservation policies not stated + + No preservation policy defined. + + + + + + Herwig Stöger + Administrator + herwig.stoeger@oeaw.ac.at + + + + + + European Research Papers Archive + ERPA + Y + http://eiop.or.at/erpa/ + http://eiop.or.at/cgi-bin/oaiserv.pl + + + + + European Communities Studies Association Austria + ECSA Austria + Y + http://www2.wu-wien.ac.at/ecsa/ + Institut für Technikfolgen-Abschätzung, Österreichische Akademie der Wissenschaften, Strohgasse 45/5, A-1030 Wien + + AT + Austria + + 48.230100 + 16.359200 + +43 1 51581 6583 + +43 1 710 98 83 + This site is an aggregating repository that contains a collection of research papers from ten European institutions. The site contains working papers on European Integration. The site interface is in English but several papers are written in French and German. + + 1098 + 2007-07-17 + + Aggregating + Operational + + + + + Cog + Geography and Regional Studies + + + Cub + Business and Economics + + + Cup + Law and Politics + + + + + en + English + + + fr + French + + + de + German + + + + Journal articles + Unpublished reports and working papers + + + + Content + Content policies defined + + This is a multi-institution subject-based repository. + Subject Specialities: + + Multidisciplinary + History and Archaeology + Social Sciences General + Business and Economics + Law and Politics + + + The repository is restricted to: + + Journal articles + Conference and workshop papers + Unpublished reports and working papers + + + Deposited items may include: + + submitted versions (as sent to journals for peer-review) + accepted versions (author's final peer-reviewed drafts) + published versions (publisher-created files) + + + Principal Languages: English; German + For more information, please see webpage: http://eiop.or.at/erpa/erpainfo.htm + + + + Metadata + Metadata re-use permitted for not-for-profit purposes + + Anyone may access the metadata free of charge. 
+ The metadata may be re-used in any medium without prior permission for not-for-profit purposes provided: + + the OAI Identifier or a link to the original metadata record are given + the repository is mentioned + + + + + + Data + Re-use of full data items permitted for not-for-profit purposes + + Anyone may access full items free of charge. + Copies of full items generally can be: + + displayed or performed + for personal research or study purposes without prior permission or charge. + + + This repository is not the publisher; it is merely the online archive. + + + + Submission + Submission policies defined + + Items may only be deposited by accredited members of the institution, or their delegated agents. + Eligible depositors must deposit bibliographic metadata for all their publications. + Eligible depositors must deposit full texts of all their publications. + No moderation policy defined. Assume nothing has been vetted. + The validity and authenticity of the content of submissions is the sole responsibility of the depositor. + No embargo policy defined. + Any copyright violations are entirely the responsibility of the authors/depositors. + For more information see webpage: http://eiop.or.at/erpa/policy.htm + + + + Preserve + Preservation policies unclearly stated + + No retention period defined. + The repository will try to ensure continued readability and accessibility. + No file preservation policy defined. + No withdrawal policy defined. + Withdrawn items are deleted entirely from the database. + Withdrawn items' identifiers/URLs are not retained. + In the event of the repository being closed down, the database will be transferred to another appropriate archive. 
+ + + + + + Michael Nentwich + Site Administrator + mnent@oeaw.ac.at + + + + + + OTHES + + Y + http://othes.univie.ac.at/ + https://othes.univie.ac.at/secure/cgi/oai2 + + + + + Universität Wien + + Y + http://www.univie.ac.at/ + Fakultät für Philosopohie und Bildungswissenschaft, Universitätsstraße 7, A-1010 Wien + + AT + Austria + + 48.209190 + 16.372740 + + + This is an institutional repository for the University of Vienna providing access to the thesis and dissertation output of the university. Users may set up Atom and RSS feeds to be alerted to new content. + + 8066 + 2010-04-16 + + Institutional + Operational + EPrints + 3.0.1-beta-2 + + + C + Multidisciplinary + + + + + de + German + + + + Theses and dissertations + + + + Content + Content policies explicitly undefined + + This is an institutional or departmental repository. + No content policy defined. + + + + Metadata + Metadata re-use policy explicitly undefined + + Anyone may access the metadata free of charge. + No metadata re-use policy defined. Assume no rights at all have been granted. + + + + Data + Full data item policies explicitly undefined + + Anyone may access full items free of charge. + No full-item re-use policy defined. Assume no rights at all have been granted. + + + + Submission + Submission policies explicitly undefined + + No submission policy defined. + + + + Preserve + Preservation policies explicitly undefined + + No preservation policy defined. 
+ + + + + + Adelheid Mayer + Administrator + adelheid.mayer@univie.ac.at + + + + + + thesis-help.ub@univie.ac.at + + + + + + Permanent Hosting, Archiving and Indexing of Digital Resources and Assets + Phaidra + N + https://phaidra.univie.ac.at/ + + + + + + Universität Wien + + Y + http://www.univie.ac.at/ + Fakultät für Philosopohie und Bildungswissenschaft, Universitätsstraße 7, A-1010 Wien + + AT + Austria + + 48.209190 + 16.372740 + + + This site provides access to the digitised copies of the institutions collection as well as teaching material. The interface is in German. + + + 2010-02-17 + + Institutional + Operational + Fedora + + + + C + Multidisciplinary + + + + + de + German + + + + Books, chapters and sections + Learning Objects + Multimedia and audio-visual materials + + + + Content + Content policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Metadata + Metadata re-use policy explicitly undefined + + No policy registered in OpenDOAR. + + + + Data + Full data item policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Submission + Submission policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Preserve + Preservation policies explicitly undefined + + No policy registered in OpenDOAR. + + + + + + Paolo Budroni + Administrator + paolo.budroni@univie.ac.at + + + + + + textfeld + + Y + http://www.textfeld.ac.at/ + + + + + + textfeld society for advancement of academic potential + + Y + http://textfeld.ac.at/ + Wien + + AT + Austria + + 48.239300 + 16.369600 + + + This site provides access to publications by students and young scholars of all fields. Interface is in German. + + 514 + 2012-02-13 + + Aggregating + Operational + + + + + C + Multidisciplinary + + + + + de + German + + + + Journal articles + Theses and dissertations + + + + Content + Content policies explicitly undefined + + No policy registered in OpenDOAR. 
+ + + + Metadata + Metadata re-use policy explicitly undefined + + No policy registered in OpenDOAR. + + + + Data + Full data item policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Submission + Submission policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Preserve + Preservation policies explicitly undefined + + No policy registered in OpenDOAR. + + + + + + + Repository of Belarusian National Technical University (BNTU) + + Y + http://rep.bntu.by/ + + + + + + Belarusian National Technical University + + Y + http://www.bntu.by/ + + + BY + Belarus + + 53.922100 + 27.590700 + + + This site provides access to the research output of the institution. The interface is available in Russian and English. Users may set up RSS feeds to be alerted to new content. + + 286 + 2012-01-26 + + Institutional + Operational + DSpace + + + + C + Multidisciplinary + + + + + en + English + + + ru + Russian + + + + Journal articles + Books, chapters and sections + Learning Objects + + + + + Alexey Skalaban + Administrator + skalaban@gmail.com + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml.gz b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml.gz new file mode 100644 index 000000000..f783b69e7 Binary files /dev/null and b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml.gz differ