diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java index 91c6c1825..fac9a7565 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java @@ -1,18 +1,18 @@ package eu.dnetlib.dhp.common; +import java.io.IOException; import java.nio.charset.StandardCharsets; import java.text.Normalizer; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import java.util.*; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.text.WordUtils; +import com.ctc.wstx.dtd.LargePrefixedNameSet; import com.google.common.base.Joiner; import com.google.common.base.Splitter; -import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.hash.Hashing; @@ -29,7 +29,19 @@ public class PacePerson { private List fullname = Lists.newArrayList(); private final String original; - private static Set particles = null; + private static Set particles; + + static { + try { + particles = new HashSet<>(IOUtils + .readLines( + PacePerson.class + .getResourceAsStream( + "/eu/dnetlib/dhp/common/name_particles.txt"))); + } catch (IOException e) { + throw new RuntimeException(e); + } + } /** * Capitalizes a string @@ -37,29 +49,20 @@ public class PacePerson { * @param s the string to capitalize * @return the input string with capital letter */ - public static final String capitalize(final String s) { + public static String capitalize(final String s) { + if (particles.contains(s)) { + return s; + } return WordUtils.capitalize(s.toLowerCase(), ' ', '-'); } /** * Adds a dot to a string with length equals to 1 */ - public static final String dotAbbreviations(final String s) { + public static String dotAbbreviations(final String s) { return s.length() == 1 ? s + "." : s; } - public static Set loadFromClasspath(final String classpath) { - final Set h = new HashSet<>(); - try { - for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) { - h.add(s); - } - } catch (final Throwable e) { - return new HashSet<>(); - } - return h; - } - /** * The constructor of the class. It fills the fields of the class basing on the input fullname. * @@ -128,10 +131,6 @@ public class PacePerson { } private List splitTerms(final String s) { - if (particles == null) { - particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt"); - } - final List list = Lists.newArrayList(); for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) { if (!particles.contains(part.toLowerCase())) { @@ -187,17 +186,36 @@ public class PacePerson { } public List getCapitalFirstnames() { - return Lists - .newArrayList( - Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize)); + return Optional + .ofNullable(getNameWithAbbreviations()) + .map( + name -> name + .stream() + .map(PacePerson::capitalize) + .collect(Collectors.toList())) + .orElse(new ArrayList<>()); } public List getCapitalSurname() { - return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize)); + return Optional + .ofNullable(getSurname()) + .map( + surname -> surname + .stream() + .map(PacePerson::capitalize) + .collect(Collectors.toList())) + .orElse(new ArrayList<>()); } public List getNameWithAbbreviations() { - return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations)); + return Optional + .ofNullable(getName()) + .map( + name -> name + .stream() + .map(PacePerson::dotAbbreviations) + .collect(Collectors.toList())) + .orElse(new ArrayList<>()); } public boolean isAccurate() { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt b/dhp-common/src/main/resources/eu/dnetlib/dhp/common/name_particles.txt similarity index 76% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt rename to dhp-common/src/main/resources/eu/dnetlib/dhp/common/name_particles.txt index dae37c9dc..07cf06a98 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt +++ b/dhp-common/src/main/resources/eu/dnetlib/dhp/common/name_particles.txt @@ -1,7 +1,8 @@ van +von der de dell sig mr -mrs +mrs \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java index e4d458780..6b5bed5b8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java @@ -3,7 +3,6 @@ package eu.dnetlib.dhp.actionmanager.ror; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION; -import static eu.dnetlib.dhp.schema.common.ModelConstants.ORG_ORG_RELTYPE; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues; @@ -39,7 +38,6 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.actionmanager.ror.model.ExternalIdType; -import eu.dnetlib.dhp.actionmanager.ror.model.Relationship; import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; @@ -51,7 +49,6 @@ import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; @@ -168,38 +165,10 @@ public class GenerateRorActionSetJob { final List> res = new ArrayList<>(); res.add(new AtomicAction<>(Organization.class, o)); - for (final Relationship rorRel : r.getRelationships()) { - if (rorRel.getType().equalsIgnoreCase("parent")) { - final String orgId1 = calculateOpenaireId(r.getId()); - final String orgId2 = calculateOpenaireId(rorRel.getId()); - res - .add( - new AtomicAction<>(Relation.class, - calculateHierarchyRel(orgId1, orgId2, ModelConstants.IS_PARENT_OF))); - res - .add( - new AtomicAction<>(Relation.class, - calculateHierarchyRel(orgId2, orgId1, ModelConstants.IS_CHILD_OF))); - } - } - return res; } - private static Relation calculateHierarchyRel(final String source, final String target, final String relClass) { - final Relation rel = new Relation(); - rel.setSource(source); - rel.setTarget(target); - rel.setRelType(ORG_ORG_RELTYPE); - rel.setSubRelType(ModelConstants.RELATIONSHIP); - rel.setRelClass(relClass); - rel.setCollectedfrom(ROR_COLLECTED_FROM); - rel.setDataInfo(ROR_DATA_INFO); - rel.setLastupdatetimestamp(System.currentTimeMillis()); - return rel; - } - private static String calculateOpenaireId(final String rorId) { return String.format("20|%s::%s", ROR_NS_PREFIX, DHPUtils.md5(rorId)); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java index 2ea3f35cc..9d9400068 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java @@ -19,6 +19,8 @@ import org.slf4j.LoggerFactory; import eu.dnetlib.dhp.aggregation.common.ReporterCallback; import eu.dnetlib.dhp.aggregation.common.ReportingJob; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; @@ -114,6 +116,10 @@ public class CollectorWorker extends ReportingJob { return new OaiCollectorPlugin(clientParams); case rest_json2xml: return new RestCollectorPlugin(clientParams); + case file: + return new FileCollectorPlugin(fileSystem); + case fileGZip: + return new FileGZipCollectorPlugin(fileSystem); case other: final CollectorPlugin.NAME.OTHER_NAME plugin = Optional .ofNullable(api.getParams().get("other_plugin_type")) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index 841d42fea..08084e22a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -10,7 +10,7 @@ import eu.dnetlib.dhp.common.collection.CollectorException; public interface CollectorPlugin { enum NAME { - oai, other, rest_json2xml; + oai, other, rest_json2xml, file, fileGZip; public enum OTHER_NAME { mdstore_mongodb_dump, mdstore_mongodb diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/AbstractSplittedRecordPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/AbstractSplittedRecordPlugin.java new file mode 100644 index 000000000..f2fa3d2bb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/AbstractSplittedRecordPlugin.java @@ -0,0 +1,80 @@ + +package eu.dnetlib.dhp.collection.plugin.file; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.util.Iterator; +import java.util.Optional; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.utils.XMLIterator; +import eu.dnetlib.dhp.common.aggregation.AggregatorReport; +import eu.dnetlib.dhp.common.collection.CollectorException; + +public abstract class AbstractSplittedRecordPlugin implements CollectorPlugin { + + private static final Logger log = LoggerFactory.getLogger(AbstractSplittedRecordPlugin.class); + + public static final String SPLIT_ON_ELEMENT = "splitOnElement"; + + private final FileSystem fileSystem; + + public AbstractSplittedRecordPlugin(FileSystem fileSystem) { + this.fileSystem = fileSystem; + } + + @Override + public Stream collect(ApiDescriptor api, AggregatorReport report) throws CollectorException { + + // get path to file + final Path filePath = Optional + .ofNullable(api.getBaseUrl()) + .map(Path::new) + .orElseThrow(() -> new CollectorException("missing baseUrl")); + + log.info("baseUrl: {}", filePath); + + // check that path to file exists + try { + if (!fileSystem.exists(filePath)) { + throw new CollectorException("path does not exist: " + filePath); + } + } catch (IOException e) { + throw new CollectorException(e); + } + + // get split element + final String splitOnElement = Optional + .ofNullable(api.getParams().get(SPLIT_ON_ELEMENT)) + .orElseThrow( + () -> new CollectorException(String + .format("missing parameter '%s', required by the AbstractSplittedRecordPlugin", SPLIT_ON_ELEMENT))); + + log.info("splitOnElement: {}", splitOnElement); + + final BufferedInputStream bis = getBufferedInputStream(filePath); + + Iterator xmlIterator = new XMLIterator(splitOnElement, bis); + + return StreamSupport + .stream( + Spliterators.spliteratorUnknownSize(xmlIterator, Spliterator.ORDERED), + false); + } + + abstract protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException; + + public FileSystem getFileSystem() { + return fileSystem; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/FileCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/FileCollectorPlugin.java new file mode 100644 index 000000000..f771def93 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/FileCollectorPlugin.java @@ -0,0 +1,33 @@ + +package eu.dnetlib.dhp.collection.plugin.file; + +import java.io.BufferedInputStream; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.common.collection.CollectorException; + +public class FileCollectorPlugin extends AbstractSplittedRecordPlugin { + + private static final Logger log = LoggerFactory.getLogger(FileCollectorPlugin.class); + + public FileCollectorPlugin(FileSystem fileSystem) { + super(fileSystem); + } + + @Override + protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException { + + log.info("filePath: {}", filePath); + + try { + FileSystem fs = super.getFileSystem(); + return new BufferedInputStream(fs.open(filePath)); + } catch (Exception e) { + throw new CollectorException("Error reading file " + filePath, e); + } + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipCollectorPlugin.java new file mode 100644 index 000000000..91a6e9f16 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipCollectorPlugin.java @@ -0,0 +1,35 @@ + +package eu.dnetlib.dhp.collection.plugin.file; + +import java.io.BufferedInputStream; +import java.util.zip.GZIPInputStream; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.common.collection.CollectorException; + +public class FileGZipCollectorPlugin extends AbstractSplittedRecordPlugin { + + private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPlugin.class); + + public FileGZipCollectorPlugin(FileSystem fileSystem) { + super(fileSystem); + } + + @Override + protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException { + + log.info("filePath: {}", filePath); + + try { + FileSystem fs = super.getFileSystem(); + GZIPInputStream stream = new GZIPInputStream(fs.open(filePath)); + return new BufferedInputStream(stream); + } catch (Exception e) { + throw new CollectorException("Error reading file " + filePath, e); + } + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index 566c6b216..28b2572fb 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -19,7 +19,7 @@ import org.dom4j.io.XMLWriter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import eu.dnetlib.dhp.collection.XmlCleaner; +import eu.dnetlib.dhp.collection.plugin.utils.XmlCleaner; import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.HttpConnector2; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java index 64a041fd4..e4bad2f8d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/rest/RestIterator.java @@ -30,7 +30,7 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; -import eu.dnetlib.dhp.collection.JsonUtils; +import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils; import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.HttpClientParams; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java similarity index 98% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java index da3768a4a..15401e223 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/JsonUtils.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection; +package eu.dnetlib.dhp.collection.plugin.utils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java new file mode 100644 index 000000000..e05fe263a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java @@ -0,0 +1,177 @@ + +package eu.dnetlib.dhp.collection.plugin.utils; + +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringWriter; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.util.Iterator; + +import javax.xml.stream.XMLEventFactory; +import javax.xml.stream.XMLEventReader; +import javax.xml.stream.XMLEventWriter; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLOutputFactory; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.events.StartElement; +import javax.xml.stream.events.XMLEvent; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class XMLIterator implements Iterator { + + private static final Log log = LogFactory.getLog(XMLIterator.class); + + private ThreadLocal inputFactory = new ThreadLocal() { + + @Override + protected XMLInputFactory initialValue() { + return XMLInputFactory.newInstance(); + } + }; + + private ThreadLocal outputFactory = new ThreadLocal() { + + @Override + protected XMLOutputFactory initialValue() { + return XMLOutputFactory.newInstance(); + } + }; + + private ThreadLocal eventFactory = new ThreadLocal() { + + @Override + protected XMLEventFactory initialValue() { + return XMLEventFactory.newInstance(); + } + }; + + public static final String UTF_8 = "UTF-8"; + + final XMLEventReader parser; + + private XMLEvent current = null; + + private String element; + + private InputStream inputStream; + + public XMLIterator(final String element, final InputStream inputStream) { + super(); + this.element = element; + this.inputStream = inputStream; + this.parser = getParser(); + try { + this.current = findElement(parser); + } catch (XMLStreamException e) { + log.warn("cannot init parser position. No element found: " + element); + current = null; + } + } + + @Override + public boolean hasNext() { + return current != null; + } + + @Override + public String next() { + String result = null; + try { + result = copy(parser); + current = findElement(parser); + return result; + } catch (XMLStreamException e) { + throw new RuntimeException(String.format("error copying xml, built so far: '%s'", result), e); + } + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + @SuppressWarnings("finally") + private String copy(final XMLEventReader parser) throws XMLStreamException { + final StringWriter result = new StringWriter(); + try { + final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(result); + final StartElement start = current.asStartElement(); + final StartElement newRecord = eventFactory + .get() + .createStartElement(start.getName(), start.getAttributes(), start.getNamespaces()); + + // new root record + writer.add(newRecord); + + // copy the rest as it is + while (parser.hasNext()) { + final XMLEvent event = parser.nextEvent(); + + // TODO: replace with depth tracking instead of close tag tracking. + if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) { + writer.add(event); + break; + } + + writer.add(event); + } + writer.close(); + } finally { + return result.toString(); + } + } + + /** + * Looks for the next occurrence of the splitter element. + * + * @param parser + * @return + * @throws XMLStreamException + */ + private XMLEvent findElement(final XMLEventReader parser) throws XMLStreamException { + + /* + * if (current != null && element.equals(current.asStartElement().getName().getLocalPart())) { return current; } + */ + + XMLEvent peek = parser.peek(); + if (peek != null && peek.isStartElement()) { + String name = peek.asStartElement().getName().getLocalPart(); + if (element.equals(name)) { + return peek; + } + } + + while (parser.hasNext()) { + final XMLEvent event = parser.nextEvent(); + if (event != null && event.isStartElement()) { + String name = event.asStartElement().getName().getLocalPart(); + if (element.equals(name)) { + return event; + } + } + } + return null; + } + + private XMLEventReader getParser() { + try { + return inputFactory.get().createXMLEventReader(sanitize(inputStream)); + } catch (XMLStreamException e) { + throw new RuntimeException(e); + } + } + + private Reader sanitize(final InputStream in) { + final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder(); + charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE); + charsetDecoder.onUnmappableCharacter(CodingErrorAction.REPLACE); + return new InputStreamReader(in, charsetDecoder); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/XmlCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XmlCleaner.java similarity index 99% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/XmlCleaner.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XmlCleaner.java index c674031f6..95d1d2402 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/XmlCleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XmlCleaner.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.collection; +package eu.dnetlib.dhp.collection.plugin.utils; import java.util.HashMap; import java.util.HashSet; diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala index 48d0f1497..ff966aaea 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala @@ -47,13 +47,18 @@ object DataciteToOAFTransformation { } /** This method should skip record if json contains invalid text - * defined in gile datacite_filter + * defined in file datacite_filter * - * @param json + * @param record : unparsed datacite record + * @param json : parsed record * @return True if the record should be skipped */ - def skip_record(json: String): Boolean = { - datacite_filter.exists(f => json.contains(f)) + def skip_record(record: String, json: org.json4s.JValue): Boolean = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + datacite_filter.exists(f => record.contains(f)) || (json \\ "publisher") + .extractOrElse[String]("") + .equalsIgnoreCase("FAIRsharing") + } @deprecated("this method will be removed", "dhp") @@ -304,12 +309,13 @@ object DataciteToOAFTransformation { vocabularies: VocabularyGroup, exportLinks: Boolean ): List[Oaf] = { - if (skip_record(input)) - return List() implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats lazy val json = parse(input) + if (skip_record(input, json)) + return List() + val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null) val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null) diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileCollectorPluginTest.java new file mode 100644 index 000000000..6fd101634 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileCollectorPluginTest.java @@ -0,0 +1,61 @@ + +package eu.dnetlib.dhp.collection.plugin.file; + +import java.io.IOException; +import java.util.HashMap; +import java.util.stream.Stream; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.common.aggregation.AggregatorReport; +import eu.dnetlib.dhp.common.collection.CollectorException; +import net.bytebuddy.asm.Advice; + +public class FileCollectorPluginTest { + + private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class); + + private final ApiDescriptor api = new ApiDescriptor(); + + private FileCollectorPlugin plugin; + + private static final String SPLIT_ON_ELEMENT = "repository"; + + @BeforeEach + public void setUp() throws IOException { + + final String gzipFile = this + .getClass() + .getResource("/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml") + .getFile(); + + api.setBaseUrl(gzipFile); + + HashMap params = new HashMap<>(); + params.put("splitOnElement", SPLIT_ON_ELEMENT); + + api.setParams(params); + + FileSystem fs = FileSystem.get(new Configuration()); + plugin = new FileCollectorPlugin(fs); + } + + @Test + void test() throws CollectorException { + + final Stream stream = plugin.collect(api, new AggregatorReport()); + + stream.limit(10).forEach(s -> { + Assertions.assertTrue(s.length() > 0); + log.info(s); + }); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipCollectorPluginTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipCollectorPluginTest.java new file mode 100644 index 000000000..dc24d6f13 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/file/FileGZipCollectorPluginTest.java @@ -0,0 +1,68 @@ + +package eu.dnetlib.dhp.collection.plugin.file; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.HashMap; +import java.util.Objects; +import java.util.stream.Stream; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.junit.jupiter.api.*; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mockito; +import org.mockito.junit.jupiter.MockitoExtension; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.common.aggregation.AggregatorReport; +import eu.dnetlib.dhp.common.collection.CollectorException; + +@TestMethodOrder(MethodOrderer.OrderAnnotation.class) +@ExtendWith(MockitoExtension.class) +public class FileGZipCollectorPluginTest { + + private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class); + + private final ApiDescriptor api = new ApiDescriptor(); + + private FileGZipCollectorPlugin plugin; + + private static final String SPLIT_ON_ELEMENT = "repository"; + + @BeforeEach + public void setUp() throws IOException { + + final String gzipFile = Objects + .requireNonNull( + this + .getClass() + .getResource("/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml.gz")) + .getFile(); + + api.setBaseUrl(gzipFile); + + HashMap params = new HashMap<>(); + params.put("splitOnElement", SPLIT_ON_ELEMENT); + + api.setParams(params); + + FileSystem fs = FileSystem.get(new Configuration()); + plugin = new FileGZipCollectorPlugin(fs); + } + + @Test + void test() throws CollectorException { + + final Stream stream = plugin.collect(api, new AggregatorReport()); + + stream.limit(10).forEach(s -> { + Assertions.assertTrue(s.length() > 0); + log.info(s); + }); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/datacite/record_fairsharing.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/datacite/record_fairsharing.json new file mode 100644 index 000000000..08e80b33b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/datacite/record_fairsharing.json @@ -0,0 +1 @@ +{"id":"10.5517/ccdc.csd.cc25rpzm","type":"dois","attributes":{"doi":"10.5517/ccdc.csd.cc25rpzm","prefix":"10.5517","suffix":"ccdc.csd.cc25rpzm","identifiers":[{"identifier":"2018781","identifierType":"CCDC"}],"alternateIdentifiers":[{"alternateIdentifierType":"CCDC","alternateIdentifier":"2018781"}],"creators":[{"name":"Ling, Irene","affiliation":[],"nameIdentifiers":[]},{"name":"Sobolev, Alexandre N.","affiliation":[],"nameIdentifiers":[]},{"name":"Raston, Colin L.","affiliation":[],"nameIdentifiers":[]}],"titles":[{"title":"CCDC 2018781: Experimental Crystal Structure Determination"}],"publisher":"fairsharing","container":{},"publicationYear":2021,"subjects":[{"subject":"Crystal Structure"},{"subject":"Experimental 3D Coordinates"},{"subject":"Crystal System"},{"subject":"Space Group"},{"subject":"Cell Parameters"},{"subject":"Crystallography"},{"subject":"bis[penta-aqua-copper(ii)] bis(mu-5,11,17,23-tetra-sulfonato-25,26,27,28-tetrahydroxycalix(4)arene)-dodeca-aqua-tri-copper(ii) bis(nitrate) heptahydrate"}],"contributors":[],"dates":[],"language":"en","types":{"ris":"DATA","bibtex":"misc","citeproc":"dataset","schemaOrg":"Dataset","resourceTypeGeneral":"Dataset"},"relatedIdentifiers":[{"relationType":"IsSupplementTo","relatedIdentifier":"10.1080/00958972.2020.1849642","relatedIdentifierType":"DOI"}],"sizes":[],"formats":["CIF"],"version":null,"rightsList":[],"descriptions":[{"description":"Related Article: Irene Ling, Alexandre N. Sobolev, Colin L. Raston|2021|J.Coord.Chem.|74|40|doi:10.1080/00958972.2020.1849642","descriptionType":"Other"}],"geoLocations":[],"fundingReferences":[],"xml":"PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0iVVRGLTgiPz4KPHJlc291cmNlIHhtbG5zOnhzaT0iaHR0cDovL3d3dy53My5vcmcvMjAwMS9YTUxTY2hlbWEtaW5zdGFuY2UiIHhtbG5zPSJodHRwOi8vZGF0YWNpdGUub3JnL3NjaGVtYS9rZXJuZWwtNCIgeHNpOnNjaGVtYUxvY2F0aW9uPSJodHRwOi8vZGF0YWNpdGUub3JnL3NjaGVtYS9rZXJuZWwtNCBodHRwOi8vc2NoZW1hLmRhdGFjaXRlLm9yZy9tZXRhL2tlcm5lbC00L21ldGFkYXRhLnhzZCI+CiAgPGlkZW50aWZpZXIgaWRlbnRpZmllclR5cGU9IkRPSSI+MTAuNTUxNy9DQ0RDLkNTRC5DQzI1UlBaTTwvaWRlbnRpZmllcj4KICA8Y3JlYXRvcnM+CiAgICA8Y3JlYXRvcj4KICAgICAgPGNyZWF0b3JOYW1lPkxpbmcsIElyZW5lPC9jcmVhdG9yTmFtZT4KICAgIDwvY3JlYXRvcj4KICAgIDxjcmVhdG9yPgogICAgICA8Y3JlYXRvck5hbWU+U29ib2xldiwgQWxleGFuZHJlIE4uPC9jcmVhdG9yTmFtZT4KICAgIDwvY3JlYXRvcj4KICAgIDxjcmVhdG9yPgogICAgICA8Y3JlYXRvck5hbWU+UmFzdG9uLCBDb2xpbiBMLjwvY3JlYXRvck5hbWU+CiAgICA8L2NyZWF0b3I+CiAgPC9jcmVhdG9ycz4KICA8dGl0bGVzPgogICAgPHRpdGxlPkNDREMgMjAxODc4MTogRXhwZXJpbWVudGFsIENyeXN0YWwgU3RydWN0dXJlIERldGVybWluYXRpb248L3RpdGxlPgogIDwvdGl0bGVzPgogIDxwdWJsaXNoZXI+Q2FtYnJpZGdlIENyeXN0YWxsb2dyYXBoaWMgRGF0YSBDZW50cmU8L3B1Ymxpc2hlcj4KICA8cHVibGljYXRpb25ZZWFyPjIwMjE8L3B1YmxpY2F0aW9uWWVhcj4KICA8cmVzb3VyY2VUeXBlIHJlc291cmNlVHlwZUdlbmVyYWw9IkRhdGFzZXQiLz4KICA8c3ViamVjdHM+CiAgICA8c3ViamVjdD5DcnlzdGFsIFN0cnVjdHVyZTwvc3ViamVjdD4KICAgIDxzdWJqZWN0PkV4cGVyaW1lbnRhbCAzRCBDb29yZGluYXRlczwvc3ViamVjdD4KICAgIDxzdWJqZWN0PkNyeXN0YWwgU3lzdGVtPC9zdWJqZWN0PgogICAgPHN1YmplY3Q+U3BhY2UgR3JvdXA8L3N1YmplY3Q+CiAgICA8c3ViamVjdD5DZWxsIFBhcmFtZXRlcnM8L3N1YmplY3Q+CiAgICA8c3ViamVjdD5DcnlzdGFsbG9ncmFwaHk8L3N1YmplY3Q+CiAgICA8c3ViamVjdD5iaXNbcGVudGEtYXF1YS1jb3BwZXIoaWkpXSBiaXMobXUtNSwxMSwxNywyMy10ZXRyYS1zdWxmb25hdG8tMjUsMjYsMjcsMjgtdGV0cmFoeWRyb3h5Y2FsaXgoNClhcmVuZSktZG9kZWNhLWFxdWEtdHJpLWNvcHBlcihpaSkgYmlzKG5pdHJhdGUpIGhlcHRhaHlkcmF0ZTwvc3ViamVjdD4KICA8L3N1YmplY3RzPgogIDxsYW5ndWFnZT5lbmc8L2xhbmd1YWdlPgogIDxhbHRlcm5hdGVJZGVudGlmaWVycz4KICAgIDxhbHRlcm5hdGVJZGVudGlmaWVyIGFsdGVybmF0ZUlkZW50aWZpZXJUeXBlPSJDQ0RDIj4yMDE4NzgxPC9hbHRlcm5hdGVJZGVudGlmaWVyPgogIDwvYWx0ZXJuYXRlSWRlbnRpZmllcnM+CiAgPHJlbGF0ZWRJZGVudGlmaWVycz4KICAgIDxyZWxhdGVkSWRlbnRpZmllciByZWxhdGVkSWRlbnRpZmllclR5cGU9IkRPSSIgcmVsYXRpb25UeXBlPSJJc1N1cHBsZW1lbnRUbyI+MTAuMTA4MC8wMDk1ODk3Mi4yMDIwLjE4NDk2NDI8L3JlbGF0ZWRJZGVudGlmaWVyPgogIDwvcmVsYXRlZElkZW50aWZpZXJzPgogIDxzaXplcy8+CiAgPGZvcm1hdHM+CiAgICA8Zm9ybWF0PkNJRjwvZm9ybWF0PgogIDwvZm9ybWF0cz4KICA8dmVyc2lvbi8+CiAgPGRlc2NyaXB0aW9ucz4KICAgIDxkZXNjcmlwdGlvbiBkZXNjcmlwdGlvblR5cGU9Ik90aGVyIj5SZWxhdGVkIEFydGljbGU6IElyZW5lIExpbmcsICBBbGV4YW5kcmUgTi4gU29ib2xldiwgIENvbGluIEwuIFJhc3RvbnwyMDIxfEouQ29vcmQuQ2hlbS58NzR8NDB8ZG9pOjEwLjEwODAvMDA5NTg5NzIuMjAyMC4xODQ5NjQyPC9kZXNjcmlwdGlvbj4KICA8L2Rlc2NyaXB0aW9ucz4KPC9yZXNvdXJjZT4K","url":"http://www.ccdc.cam.ac.uk/services/structure_request?id=doi:10.5517/ccdc.csd.cc25rpzm&sid=DataCite","contentUrl":null,"metadataVersion":3,"schemaVersion":"http://datacite.org/schema/kernel-4","source":"api","isActive":true,"state":"findable","reason":null,"viewCount":0,"viewsOverTime":[],"downloadCount":0,"downloadsOverTime":[],"referenceCount":0,"citationCount":0,"citationsOverTime":[],"partCount":0,"partOfCount":0,"versionCount":0,"versionOfCount":0,"created":"2021-03-09T13:25:35.000Z","registered":"2021-03-09T13:25:36.000Z","published":"2021","updated":"2021-03-31T21:49:56.000Z"},"relationships":{"client":{"data":{"id":"ccdc.csd","type":"clients"}},"provider":{"data":{"id":"ccdc","type":"providers"}},"media":{"data":{"id":"10.5517/ccdc.csd.cc25rpzm","type":"media"}},"references":{"data":[]},"citations":{"data":[]},"parts":{"data":[]},"partOf":{"data":[]},"versions":{"data":[]},"versionOf":{"data":[]}}} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml new file mode 100644 index 000000000..e5806a60e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml @@ -0,0 +1,1079 @@ + + + Copyright 2012, University of Nottingham + OpenDOAR data is available for re-use under a Creative Commons Attribution-Non-Commercial-Share Alike licence + + + Architektur-Informatik + + Y + http://architektur-informatik.scix.net/ + http://architektur-informatik.scix.net/cgi-bin/works/OAI + + + + + Arbeitskreis Architekturinformatik + AK AI + Y + http://www.architektur-informatik.org/ + + + AT + Austria + + 46.783300 + 12.950000 + + + This is a German language repository on Computer Science. Full-text is not available for all items. + + 68 + 2008-05-15 + + Disciplinary + Operational + SciX + + + + Cin + Computers and IT + + + + + de + German + + + + Journal articles + Conference and workshop papers + Theses and dissertations + Unpublished reports and working papers + + + + Content + Content policies not stated + + No policy registered in OpenDOAR. + + + + Metadata + Metadata policies not stated + + No policy registered in OpenDOAR. + + + + Data + Full data item policies unknown + + No policy registered in OpenDOAR. + + + + Submission + Submission policies not stated + + No policy registered in OpenDOAR. + + + + Preserve + Preservation policies not stated + + No policy registered in OpenDOAR. + + + + + + Bob Martens + Administrator + b.martens@tuwien.ac.at + + + + + OAI Administrator + architektur-informatik@scix.net + + + + + + Dokumentenserver des LBI-HTA + + Y + http://eprints.hta.lbg.ac.at/ + http://eprints.hta.lbg.ac.at/cgi/oai2 + + + + + Ludwig Boltzmann Institut für Health Technology Assessment + LBI-HTA + Y + http://hta.lbg.ac.at/ + + + AT + Austria + + 48.209200 + 16.372800 + + + This site provides access to the institutions outputs. Users may set up Atom and RSS feeds to be alerted to new content. The interface is available in English and German. Many items are not available as full-text. + Special items include: Newsletters and Decision Support Document + 600 + 2010-02-04 + + Institutional + Operational + EPrints + 3.0.3 + + + Ce + Health and Medicine + + + + + en + English + + + + Journal articles + Conference and workshop papers + Unpublished reports and working papers + Books, chapters and sections + Other special item types + + + + Content + Content policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Metadata + Metadata re-use policy explicitly undefined + + No policy registered in OpenDOAR. + + + + Data + Full data item policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Submission + Submission policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Preserve + Preservation policies explicitly undefined + + No policy registered in OpenDOAR. + + + + + + + Elektronisch archivierte Theorie - Sammelpunkt + + Y + http://sammelpunkt.philo.at:8080/ + http://sammelpunkt.philo.at:8080/cgi/oai2 + Institut für Philosophie + + Y + + Universität Wien + + Y + http://www.univie.ac.at/ + Fakultät für Philosopohie und Bildungswissenschaft, Universitätsstraße 7, A-1010 Wien + + AT + Austria + + 48.209190 + 16.372740 + + + This is a subject based institutional repository hosted by the Institute for Philosophy of the University of Vienna. The interface is primarily available in German only, as are virtually all the papers. However the search form is in English and there are plans to make it available in other European languages. + + 1293 + 2010-02-09 + + Institutional + Operational + EPrints + 3.1.3 + + + Cop + Philosophy and Religion + + + + + de + German + + + + Theses and dissertations + Books, chapters and sections + Other special item types + Journal articles + Unpublished reports and working papers + + + + Content + Content policies defined + + This is an institutional or departmental repository. + The repository holds all types of materials. + + + + Metadata + Metadata re-use permitted for not-for-profit purposes + + Anyone may access the metadata free of charge. + The metadata may be re-used in any medium without prior permission for not-for-profit purposes provided: + + the OAI Identifier or a link to the original metadata record are given + the repository is mentioned + + + The metadata must not be re-used in any medium for commercial purposes without formal permission. + + + + Data + Full data item policies explicitly undefined + + Anyone may access full items free of charge. + No full-item re-use policy defined. Assume no rights at all have been granted. + + + + Submission + Submission policies defined + + Items may only be deposited by accredited members of the institution + Authors may only submit their own work for archiving. + The administrator only vets items for relevance to the scope of the repository + The validity and authenticity of the content of submissions is not checked. + No embargo policy defined. + Any copyright violations are entirely the responsibility of the authors/depositors. + If the repository receives proof of copyright violation, the relevant item will be removed immediately. + + + + Preserve + Preservation policies not stated + + No preservation policy defined. + + + + + + H Hracovec + Site Administrator + hrachov@philo.at + + + + + + Elektronische Publikationen der Wirtschaftsuniversität Wien + Epub WU + Y + http://epub.wu.ac.at/ + http://epub.wu.ac.at/cgi/oai2 + Universtätsbibliothek (University Library) + + Y + http://www.wu.ac.at/library + Wirtschaftsuniversität Wien (Vienna University of Economics) + WU + Y + http://www.wu.ac.at/ + Augasse 2-6, A-Wien + + AT + Austria + + 48.230000 + 16.357000 + 0131-3364990 + + This is the institutional repository of the WU Vienna University of Economics and Business. It provides access to the research output of the institution. Documents are available in full text. The interface is accessable in English. + + 1216 + 2012-02-28 + + Institutional + Operational + EPrints + 3 + + + Cub + Business and Economics + + + + + de + German + + + + Conference and workshop papers + Theses and dissertations + Unpublished reports and working papers + + + + Content + Content policies defined + + This is an institutional or departmental repository. + The repository holds all types of materials. + Deposited items may include: + + working drafts + submitted versions (as sent to journals for peer-review) + accepted versions (author's final peer-reviewed drafts) + published versions (publisher-created files) + + + Items are individually tagged with: + + their version type and date. + their peer-review status. + their publication status. + + + For more information, please see webpage: http://epub.wu.ac.at/policies.html + + + + Metadata + Metadata re-use permitted for not-for-profit purposes + + Anyone may access the metadata free of charge. + The metadata may be re-used in any medium without prior permission for not-for-profit purposes provided the OAI Identifier or a link to the original metadata record are given. + The metadata must not be re-used in any medium for commercial purposes without formal permission. + For more information, please see webpage: http://epub.wu.ac.at/policies.html + + + + Data + Rights vary for the re-use of full data items + + Anyone may access full items free of charge. + Copies of full items generally can be: + + reproduced, and displayed or performed in any format or medium + for personal research or study, educational, or not-for-profit purposes without prior permission or charge. + + provided: + + the authors, title and full bibliographic details are given + a hyperlink and/or URL are given for the original metadata page + the content is not changed in any way + + + Full items must not be sold commercially in any format or medium without formal permission of the copyright holders. + Some full items are individually tagged with different rights permissions and conditions. + For more information see webpage: http://epub.wu.ac.at/policies.html. + + + + Submission + Submission policies defined + + Items may only be deposited by accredited members of the organisation, or their delegated agents. + Authors may only submit their own work for archiving. + The administrator only vets items for the eligibility of authors/depositors, relevance to the scope of the repository, valid layout & format, and the exclusion of spam + The validity and authenticity of the content of submissions is the sole responsibility of the depositor. + No embargo policy defined. + Any copyright violations are entirely the responsibility of the authors/depositors. + If the repository receives proof of copyright violation, the relevant item will be removed immediately. + For more information see webpage: http://epub.wu.ac.at/policies.html + + + + Preserve + Preservation policies defined + + Items will be retained indefinitely. + The repository will try to ensure continued readability and accessibility. + The repository regularly backs up its files according to current best practice. + The original bit stream is retained for all items, in addition to any upgraded formats. + Items may be removed at the request of the author/copyright holder, but this is strongly discouraged. + Withdrawn items are not deleted per se, but are removed from public view. + Withdrawn items' identifiers/URLs are retained indefinitely. + URLs will continue to point to 'tombstone' citations, to avoid broken links and to retain item histories. + Changes to deposited items are not permitted. + Errata and corrigenda lists may be included with the original record if required. + If necessary, an updated version may be deposited. + + The item's persistent URL will always link to the latest version. + There will be links between earlier and later versions, with the most recent version clearly identified. + + + In the event of the repository being closed down, the database will be transferred to another appropriate archive. + For more information see webpage: http://epub.wu.ac.at/policies.html + + + + + + Gertraud Novotny + Administrator + gertraud.novotny@wu.ac.at + + + + + OAI Administrator + epub@wu.ac.at + + + + + + Elektronisches Publikationsportal der Österreichischen Akademie der Wissenschaften + epub.oeaw + Y + http://epub.oeaw.ac.at/ + http://epub.oeaw.ac.at/oai + + + + + Austrian Academy of Sciences + + Y + http://www.oeaw.ac.at/ + Postgasse 7, A-1010 Wien + + AT + Austria + + 48.250000 + 16.350000 + + + This site is a repository providing access to the publication output of the organisation. However only a very small proportion of material is available via Open Access as this site is mainly concerned with subscription-only access to its eBook and printed publications. As such that material which is offered freely is intended to induce a purchasing activity from the reader. The main site interface is available in English or German, however the supporting information and help is in the latter only. Users may set up RSS feeds to be alerted to new content. + Partners: Verlag der Österreichischen Akademie der Wissenschaften + + 2012-02-27 + 2006 + Institutional + Operational + Hyperwave + + + + C + Multidisciplinary + + + + + de + German + + + en + English + + + + Books, chapters and sections + + + + Content + Content policies explicitly undefined + + This is an institutional or departmental repository. + No content policy defined. + + + + Metadata + Metadata re-use policy explicitly undefined + + Anyone may access the metadata free of charge. + No metadata re-use policy defined. Assume no rights at all have been granted. + + + + Data + Full data item policies explicitly undefined + + Anyone may access full items free of charge. + No full-item re-use policy defined. Assume no rights at all have been granted. + + + + Submission + Submission policies explicitly undefined + + No submission policy defined. + + + + Preserve + Preservation policies not stated + + No preservation policy defined. + + + + + + Herwig Stöger + Administrator + herwig.stoeger@oeaw.ac.at + + + + + + European Research Papers Archive + ERPA + Y + http://eiop.or.at/erpa/ + http://eiop.or.at/cgi-bin/oaiserv.pl + + + + + European Communities Studies Association Austria + ECSA Austria + Y + http://www2.wu-wien.ac.at/ecsa/ + Institut für Technikfolgen-Abschätzung, Österreichische Akademie der Wissenschaften, Strohgasse 45/5, A-1030 Wien + + AT + Austria + + 48.230100 + 16.359200 + +43 1 51581 6583 + +43 1 710 98 83 + This site is an aggregating repository that contains a collection of research papers from ten European institutions. The site contains working papers on European Integration. The site interface is in English but several papers are written in French and German. + + 1098 + 2007-07-17 + + Aggregating + Operational + + + + + Cog + Geography and Regional Studies + + + Cub + Business and Economics + + + Cup + Law and Politics + + + + + en + English + + + fr + French + + + de + German + + + + Journal articles + Unpublished reports and working papers + + + + Content + Content policies defined + + This is a multi-institution subject-based repository. + Subject Specialities: + + Multidisciplinary + History and Archaeology + Social Sciences General + Business and Economics + Law and Politics + + + The repository is restricted to: + + Journal articles + Conference and workshop papers + Unpublished reports and working papers + + + Deposited items may include: + + submitted versions (as sent to journals for peer-review) + accepted versions (author's final peer-reviewed drafts) + published versions (publisher-created files) + + + Principal Languages: English; German + For more information, please see webpage: http://eiop.or.at/erpa/erpainfo.htm + + + + Metadata + Metadata re-use permitted for not-for-profit purposes + + Anyone may access the metadata free of charge. + The metadata may be re-used in any medium without prior permission for not-for-profit purposes provided: + + the OAI Identifier or a link to the original metadata record are given + the repository is mentioned + + + + + + Data + Re-use of full data items permitted for not-for-profit purposes + + Anyone may access full items free of charge. + Copies of full items generally can be: + + displayed or performed + for personal research or study purposes without prior permission or charge. + + + This repository is not the publisher; it is merely the online archive. + + + + Submission + Submission policies defined + + Items may only be deposited by accredited members of the institution, or their delegated agents. + Eligible depositors must deposit bibliographic metadata for all their publications. + Eligible depositors must deposit full texts of all their publications. + No moderation policy defined. Assume nothing has been vetted. + The validity and authenticity of the content of submissions is the sole responsibility of the depositor. + No embargo policy defined. + Any copyright violations are entirely the responsibility of the authors/depositors. + For more information see webpage: http://eiop.or.at/erpa/policy.htm + + + + Preserve + Preservation policies unclearly stated + + No retention period defined. + The repository will try to ensure continued readability and accessibility. + No file preservation policy defined. + No withdrawal policy defined. + Withdrawn items are deleted entirely from the database. + Withdrawn items' identifiers/URLs are not retained. + In the event of the repository being closed down, the database will be transferred to another appropriate archive. + + + + + + Michael Nentwich + Site Administrator + mnent@oeaw.ac.at + + + + + + OTHES + + Y + http://othes.univie.ac.at/ + https://othes.univie.ac.at/secure/cgi/oai2 + + + + + Universität Wien + + Y + http://www.univie.ac.at/ + Fakultät für Philosopohie und Bildungswissenschaft, Universitätsstraße 7, A-1010 Wien + + AT + Austria + + 48.209190 + 16.372740 + + + This is an institutional repository for the University of Vienna providing access to the thesis and dissertation output of the university. Users may set up Atom and RSS feeds to be alerted to new content. + + 8066 + 2010-04-16 + + Institutional + Operational + EPrints + 3.0.1-beta-2 + + + C + Multidisciplinary + + + + + de + German + + + + Theses and dissertations + + + + Content + Content policies explicitly undefined + + This is an institutional or departmental repository. + No content policy defined. + + + + Metadata + Metadata re-use policy explicitly undefined + + Anyone may access the metadata free of charge. + No metadata re-use policy defined. Assume no rights at all have been granted. + + + + Data + Full data item policies explicitly undefined + + Anyone may access full items free of charge. + No full-item re-use policy defined. Assume no rights at all have been granted. + + + + Submission + Submission policies explicitly undefined + + No submission policy defined. + + + + Preserve + Preservation policies explicitly undefined + + No preservation policy defined. + + + + + + Adelheid Mayer + Administrator + adelheid.mayer@univie.ac.at + + + + + + thesis-help.ub@univie.ac.at + + + + + + Permanent Hosting, Archiving and Indexing of Digital Resources and Assets + Phaidra + N + https://phaidra.univie.ac.at/ + + + + + + Universität Wien + + Y + http://www.univie.ac.at/ + Fakultät für Philosopohie und Bildungswissenschaft, Universitätsstraße 7, A-1010 Wien + + AT + Austria + + 48.209190 + 16.372740 + + + This site provides access to the digitised copies of the institutions collection as well as teaching material. The interface is in German. + + + 2010-02-17 + + Institutional + Operational + Fedora + + + + C + Multidisciplinary + + + + + de + German + + + + Books, chapters and sections + Learning Objects + Multimedia and audio-visual materials + + + + Content + Content policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Metadata + Metadata re-use policy explicitly undefined + + No policy registered in OpenDOAR. + + + + Data + Full data item policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Submission + Submission policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Preserve + Preservation policies explicitly undefined + + No policy registered in OpenDOAR. + + + + + + Paolo Budroni + Administrator + paolo.budroni@univie.ac.at + + + + + + textfeld + + Y + http://www.textfeld.ac.at/ + + + + + + textfeld society for advancement of academic potential + + Y + http://textfeld.ac.at/ + Wien + + AT + Austria + + 48.239300 + 16.369600 + + + This site provides access to publications by students and young scholars of all fields. Interface is in German. + + 514 + 2012-02-13 + + Aggregating + Operational + + + + + C + Multidisciplinary + + + + + de + German + + + + Journal articles + Theses and dissertations + + + + Content + Content policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Metadata + Metadata re-use policy explicitly undefined + + No policy registered in OpenDOAR. + + + + Data + Full data item policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Submission + Submission policies explicitly undefined + + No policy registered in OpenDOAR. + + + + Preserve + Preservation policies explicitly undefined + + No policy registered in OpenDOAR. + + + + + + + Repository of Belarusian National Technical University (BNTU) + + Y + http://rep.bntu.by/ + + + + + + Belarusian National Technical University + + Y + http://www.bntu.by/ + + + BY + Belarus + + 53.922100 + 27.590700 + + + This site provides access to the research output of the institution. The interface is available in Russian and English. Users may set up RSS feeds to be alerted to new content. + + 286 + 2012-01-26 + + Institutional + Operational + DSpace + + + + C + Multidisciplinary + + + + + en + English + + + ru + Russian + + + + Journal articles + Books, chapters and sections + Learning Objects + + + + + Alexey Skalaban + Administrator + skalaban@gmail.com + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml.gz b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml.gz new file mode 100644 index 000000000..f783b69e7 Binary files /dev/null and b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml.gz differ diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala index 8e41de83c..31784c7e9 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala @@ -107,4 +107,19 @@ class DataciteToOAFTest extends AbstractVocabularyTest { } + @Test + def testFilter(): Unit = { + val record = Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record_fairsharing.json") + ) + .mkString + + val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT) + val res: List[Oaf] = DataciteToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true) + + assertTrue(res.isEmpty) + + } + } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java index 56aa953b4..d9f6433a0 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java @@ -102,21 +102,28 @@ public class SparkCountryPropagationJob { private static MapFunction, R> getCountryMergeFn() { return t -> { Optional.ofNullable(t._2()).ifPresent(r -> { - t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet())); + if (Optional.ofNullable(t._1().getCountry()).isPresent()) + t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet())); + else + t._1().setCountry(merge(null, t._2().getCountrySet())); }); return t._1(); }; } private static List merge(List c1, List c2) { - HashSet countries = c1 - .stream() - .map(Qualifier::getClassid) - .collect(Collectors.toCollection(HashSet::new)); + HashSet countries = new HashSet<>(); + if (Optional.ofNullable(c1).isPresent()) { + countries = c1 + .stream() + .map(Qualifier::getClassid) + .collect(Collectors.toCollection(HashSet::new)); + } + HashSet finalCountries = countries; return c2 .stream() - .filter(c -> !countries.contains(c.getClassid())) + .filter(c -> !finalCountries.contains(c.getClassid())) .map(c -> getCountry(c.getClassid(), c.getClassname())) .collect(Collectors.toList()); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 3e8ca1763..739be3df4 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -18,14 +18,7 @@ import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.oaiIProvenance; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; +import java.util.*; import org.apache.commons.lang3.StringUtils; import org.dom4j.*; @@ -35,6 +28,7 @@ import com.google.common.collect.Sets; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.AccessRight; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Context; @@ -199,8 +193,13 @@ public abstract class AbstractMdRecordToOafMapper { final List oafs = Lists.newArrayList(entity); if (!oafs.isEmpty()) { - oafs.addAll(addProjectRels(doc, entity)); - oafs.addAll(addOtherResultRels(doc, entity)); + Set rels = Sets.newHashSet(); + + rels.addAll(addProjectRels(doc, entity)); + rels.addAll(addOtherResultRels(doc, entity)); + rels.addAll(addRelations(doc, entity)); + + oafs.addAll(rels); } return oafs; @@ -278,6 +277,46 @@ public abstract class AbstractMdRecordToOafMapper { return res; } + private List addRelations(Document doc, OafEntity entity) { + + final List rels = Lists.newArrayList(); + + for (Object o : doc.selectNodes("//oaf:relation")) { + Element element = (Element) o; + + final String target = StringUtils.trim(element.getText()); + final String relType = element.attributeValue("relType"); + final String subRelType = element.attributeValue("subRelType"); + final String relClass = element.attributeValue("relClass"); + + if (StringUtils.isNotBlank(target) && StringUtils.isNotBlank(relType) && StringUtils.isNotBlank(subRelType) + && StringUtils.isNotBlank(relClass)) { + + final String relClassInverse = ModelSupport + .findInverse(ModelSupport.rel(relType, subRelType, relClass)) + .getInverseRelClass(); + final String validationdDate = ((Node) o).valueOf("@validationDate"); + + if (StringUtils.isNotBlank(target)) { + final String targetType = element.attributeValue("targetType"); + if (StringUtils.isNotBlank(targetType)) { + final String targetId = createOpenaireId(targetType, target, true); + rels + .add( + getRelation( + entity.getId(), targetId, relType, subRelType, relClass, entity, validationdDate)); + rels + .add( + getRelation( + targetId, entity.getId(), relType, subRelType, relClassInverse, entity, + validationdDate)); + } + } + } + } + return rels; + } + protected Relation getRelation(final String source, final String target, final String relType, diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index bdb73abf5..f5cb86bfd 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -57,14 +57,10 @@ class MappersTest { final List list = new OafToOafMapper(vocs, false, true).processMdRecord(xml); - assertEquals(3, list.size()); - assertTrue(list.get(0) instanceof Publication); - assertTrue(list.get(1) instanceof Relation); - assertTrue(list.get(2) instanceof Relation); + assertEquals(1, list.stream().filter(o -> o instanceof Publication).count()); + assertEquals(4, list.stream().filter(o -> o instanceof Relation).count()); - final Publication p = (Publication) list.get(0); - final Relation r1 = (Relation) list.get(1); - final Relation r2 = (Relation) list.get(2); + Publication p = (Publication) list.stream().filter(o -> o instanceof Publication).findFirst().get(); assertValidId(p.getId()); @@ -125,26 +121,58 @@ class MappersTest { assertNotNull(p.getBestaccessright()); assertEquals("OPEN", p.getBestaccessright().getClassid()); - assertValidId(r1.getSource()); - assertValidId(r1.getTarget()); - assertValidId(r2.getSource()); - assertValidId(r2.getTarget()); - assertValidId(r1.getCollectedfrom().get(0).getKey()); - assertValidId(r2.getCollectedfrom().get(0).getKey()); - assertNotNull(r1.getDataInfo()); - assertNotNull(r2.getDataInfo()); - assertNotNull(r1.getDataInfo().getTrust()); - assertNotNull(r2.getDataInfo().getTrust()); - assertEquals(r1.getSource(), r2.getTarget()); - assertEquals(r2.getSource(), r1.getTarget()); - assertTrue(StringUtils.isNotBlank(r1.getRelClass())); - assertTrue(StringUtils.isNotBlank(r2.getRelClass())); - assertTrue(StringUtils.isNotBlank(r1.getRelType())); - assertTrue(StringUtils.isNotBlank(r2.getRelType())); - assertTrue(r1.getValidated()); - assertTrue(r2.getValidated()); - assertEquals("2020-01-01", r1.getValidationDate()); - assertEquals("2020-01-01", r2.getValidationDate()); + + // RESULT PROJECT + List resultProject = list + .stream() + .filter(o -> o instanceof Relation) + .map(o -> (Relation) o) + .filter(r -> ModelConstants.RESULT_PROJECT.equals(r.getRelType())) + .collect(Collectors.toList()); + + assertEquals(2, resultProject.size()); + final Relation rp1 = resultProject.get(0); + final Relation rp2 = resultProject.get(1); + + verifyRelation(rp1); + verifyRelation(rp2); + + assertTrue(rp1.getValidated()); + assertTrue(rp2.getValidated()); + assertEquals("2020-01-01", rp1.getValidationDate()); + assertEquals("2020-01-01", rp2.getValidationDate()); + + assertEquals(rp1.getSource(), rp2.getTarget()); + assertEquals(rp2.getSource(), rp1.getTarget()); + + // AFFILIATIONS + List affiliation = list + .stream() + .filter(o -> o instanceof Relation) + .map(o -> (Relation) o) + .filter(r -> ModelConstants.RESULT_ORGANIZATION.equals(r.getRelType())) + .collect(Collectors.toList()); + + assertEquals(2, affiliation.size()); + final Relation aff1 = affiliation.get(0); + final Relation aff2 = affiliation.get(1); + + verifyRelation(aff1); + verifyRelation(aff2); + + assertEquals(aff1.getSource(), aff2.getTarget()); + assertEquals(aff2.getSource(), aff1.getTarget()); + } + + private void verifyRelation(Relation r) { + assertValidId(r.getSource()); + assertValidId(r.getTarget()); + assertValidId(r.getCollectedfrom().get(0).getKey()); + assertNotNull(r.getDataInfo()); + assertNotNull(r.getDataInfo().getTrust()); + assertTrue(StringUtils.isNotBlank(r.getRelClass())); + assertTrue(StringUtils.isNotBlank(r.getRelType())); + } @Test @@ -734,6 +762,51 @@ class MappersTest { assertFalse(p_cleaned.getTitle().isEmpty()); } + @Test + void testZenodo() throws IOException, DocumentException { + final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_zenodo.xml"))); + final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); + + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); + + final Publication p = (Publication) list.get(0); + assertValidId(p.getId()); + assertValidId(p.getCollectedfrom().get(0).getKey()); + + assertNotNull(p.getTitle()); + assertFalse(p.getTitle().isEmpty()); + assertEquals(1, p.getTitle().size()); + assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + + assertNotNull(p.getAuthor()); + assertEquals(2, p.getAuthor().size()); + + Author author = p + .getAuthor() + .stream() + .filter(a -> a.getPid().stream().anyMatch(pi -> pi.getValue().equals("0000-0003-3272-8007"))) + .findFirst() + .get(); + assertNotNull(author); + assertTrue(StringUtils.isBlank(author.getSurname())); + assertTrue(StringUtils.isBlank(author.getName())); + assertEquals("Anne van Weerden", author.getFullname()); + + author = p + .getAuthor() + .stream() + .filter(a -> a.getPid().stream().anyMatch(pi -> pi.getValue().equals("0000-0003-3272-8008"))) + .findFirst() + .get(); + assertNotNull(author); + assertFalse(StringUtils.isBlank(author.getSurname())); + assertFalse(StringUtils.isBlank(author.getName())); + assertFalse(StringUtils.isBlank(author.getFullname())); + + } + @Test void testOdfFromHdfs() throws IOException, DocumentException { final String xml = IOUtils @@ -835,6 +908,20 @@ class MappersTest { assertEquals("EUR", p.getProcessingchargecurrency().getValue()); } + @Test + void testROHub() throws IOException, DocumentException { + final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub.xml"))); + final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); +// final Dataset p = (Dataset) list.get(0); +// assertValidId(p.getId()); +// assertValidId(p.getCollectedfrom().get(0).getKey()); +// System.out.println(p.getTitle().get(0).getValue()); +// assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + } + private void assertValidId(final String id) { // System.out.println(id); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt index 09bd58aeb..39ed0cef1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt @@ -497,6 +497,7 @@ dnet:publication_resource @=@ 0044 @=@ Graduate diploma dnet:publication_resource @=@ 0044 @=@ Undergraduate diploma dnet:publication_resource @=@ 0000 @=@ UNKNOWN dnet:publication_resource @=@ 0042 @=@ EGI Virtual Appliance +dnet:publication_resource @=@ 0048 @=@ RO-crate dnet:languages @=@ abk @=@ ab dnet:languages @=@ aar @=@ aa dnet:languages @=@ afr @=@ af diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt index bb1e5fbf9..59311d5a7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt @@ -164,6 +164,7 @@ dnet:publication_resource @=@ dnet:publication_resource @=@ 0030 @=@ Sound dnet:publication_resource @=@ dnet:publication_resource @=@ 0044 @=@ Thesis dnet:publication_resource @=@ dnet:publication_resource @=@ 0000 @=@ Unknown dnet:publication_resource @=@ dnet:publication_resource @=@ 0042 @=@ Virtual Appliance +dnet:publication_resource @=@ dnet:publication_resource @=@ 0048 @=@ Research Object ec:funding_typologies @=@ ec:funding_typologies @=@ ec:frameworkprogram @=@ frameworkprogram ec:funding_typologies @=@ ec:funding_typologies @=@ ec:program @=@ program ec:funding_typologies @=@ ec:funding_typologies @=@ ec:specificprogram @=@ specificprogram diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml index f4b0c477f..277578185 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_record.xml @@ -60,6 +60,15 @@ https://oneecosystem.pensoft.net/article/13718/ One Ecosystem 0001 + ror_________::https://ror.org/02gdcn153 + corda_______::226852 diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_zenodo.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_zenodo.xml new file mode 100644 index 000000000..0fc568e56 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_zenodo.xml @@ -0,0 +1,69 @@ + +> +
+ oai:zenodo.org:3406824 + 2020-01-20T16:45:20Z + openaire + 2022-06-07T10:21:24.06Z + test________::92fe3efa47883b2f3401e6a4bd92e9d7 + 2020-05-21T05:26:15.93Z + 2020-08-01T11:06:26.977Z +
+ + + 10.5281/zenodo.3406824 + + http://dx.doi.org/10.5281/zenodo.3406824 + + + + Anne van Weerden + 0000-0003-3272-8007 + Utrecht University Library + + + Anne van, Weerden + 0000-0003-3272-8008 + Utrecht University Library + + + + Helen Bayly and Catherine Disney as influences in the life of Sir William Rowan Hamilton + + Zenodo + 2018 + + Sir William Rowan Hamilton, Lady Helena Maria Hamilton Bayly, Catherine Disney, Ireland, history, biography, nineteenth century + + + 2018-12-28 + + en + + + 10.5281/zenodo.3406823 + + + Creative Commons Attribution 4.0 International + Open Access + + +

In the 1880s Robert Graves published a biography about Sir William Rowan Hamilton (1805-1865), to which in a 1980 biography Thomas Hankins added further information. From these biographies a picture emerged of a man who was unhappily married because he had lost the love of his life, which raised the question how such an unhappy man could produce so much beautiful mathematics. In this article it is stated that a main cause for the unhappy picture is that Graves ignored the influence on one another of Hamilton and his wife Helen Bayly, and Hankins that of Hamilton and his first and lost love Catherine Disney. It is then shown that if these influences are taken into account a very different view on Hamilton;s private life arises, in which he was happily married to a wife who enabled him to work as he needed to.

+
+
+ 10.5281/zenodo.3406824 + 0001 + 2018-12-28 + OPEN + https://creativecommons.org/licenses/by/4.0/legalcode + eng + + +
+
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub.xml new file mode 100644 index 000000000..ca3ebe6c2 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/rohub.xml @@ -0,0 +1,103 @@ + + +
+ eosca5322f5f::4dd1aaf93ae136b65dc9ee4e6f76eac9 + 53aa90bf-c593-4e6d-923f-d4711ac4b0e1 + 2022-05-25T15:35:48.262Z + eosca5322f5f + 53aa90bf-c593-4e6d-923f-d4711ac4b0e1 + 2022-05-25T15:35:38Z + rohub_data + ro-crate_data + 2022-05-25T15:36:11.094Z +
+ + + https://w3id.org/ro-id/53aa90bf-c593-4e6d-923f-d4711ac4b0e1 + + http://api.rohub.org/api/ros/53aa90bf-c593-4e6d-923f-d4711ac4b0e1/ + + + + https://github.com/NordicESMhub/RELIANCE/blob/main/content/science/notebooks/air_quality_lockdown.ipynb + + https://github.com/NordicESMhub/RELIANCE/blob/main/content/science/notebooks/air_quality_lockdown.ipynb + + https://nordicesmhub.github.io/RELIANCE/science/notebooks/air_quality_lockdown.html + + https://nordicesmhub.github.io/RELIANCE/science/notebooks/air_quality_lockdown.html + + + + + Anne Fouilloux + + + + + 2021-12-19T21:18:33Z + + + The COVID-19 pandemic has led to significant reductions in economic activity, especially during lockdowns. Several studies has shown that the concentration of nitrogen dioxyde and particulate matter levels have reduced during lockdown events. Reductions in transportation sector emissions are most likely largely responsible for the NO2 anomalies. In this study, we analyze the impact of lockdown events on the air quality using data from Copernicus Atmosphere Monitoring Service over Europe and at selected locations. + + + + European Commission + 10.13039/501100000781 + 101017502 + Research Lifecycle Management for Earth Science Communities and Copernicus Users + + + MIT License + University of Oslo + 2021 + RO-crate + + open access + + + 11.971 MB + + + Applied sciences + Meteorology + EOSC::RO-crate + + + Impact of the Covid-19 Lockdown on Air quality over Europe + + + https://w3id.org/ro-id/53aa90bf-c593-4e6d-923f-d4711ac4b0e1 + 0048 + + OPEN + https://opensource.org/licenses/MIT + und + + + + + + + https%3A%2F%2Fapi.rohub.org%2Fapi%2Foai2d%2F + 53aa90bf-c593-4e6d-923f-d4711ac4b0e1 + 2022-05-25T15:35:38Z + + + + + false + false + 0.9 + + + + +
\ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql index a5839da11..aee66fd5e 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step13.sql @@ -80,4 +80,34 @@ where reltype='resultResult' and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE and r1.resulttype.classname != 'other' and r2.resulttype.classname != 'other' - and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; \ No newline at end of file + and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; + +create table ${stats_db_name}.result_citations_oc stored as parquet as +select substr(target, 4) as id, count(distinct substr(source, 4)) as citations +from ${openaire_db_name}.relation rel +join ${openaire_db_name}.result r1 on rel.source=r1.id +join ${openaire_db_name}.result r2 on r2.id=rel.target +where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:crosswalk:opencitations' + and reltype='resultResult' + and r1.resulttype.classname!=r2.resulttype.classname + and r1.datainfo.deletedbyinference=false and r1.datainfo.invisible = FALSE + and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE + and r1.resulttype.classname != 'other' + and r2.resulttype.classname != 'other' + and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE +group by substr(target, 4); + +create table ${stats_db_name}.result_references_oc stored as parquet as +select substr(source, 4) as id, count(distinct substr(target, 4)) as references +from ${openaire_db_name}.relation rel + join ${openaire_db_name}.result r1 on rel.source=r1.id + join ${openaire_db_name}.result r2 on r2.id=rel.target +where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:crosswalk:opencitations' + and reltype='resultResult' + and r1.resulttype.classname!=r2.resulttype.classname + and r1.datainfo.deletedbyinference=false and r1.datainfo.invisible = FALSE + and r2.datainfo.deletedbyinference=false and r2.datainfo.invisible = FALSE + and r1.resulttype.classname != 'other' + and r2.resulttype.classname != 'other' + and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE +group by substr(source, 4); \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 24e6bff7e..db40cf973 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -82,31 +82,31 @@ on r.id= tmp.id; compute stats indi_funded_result_with_fundref; -create table indi_result_org_country_collab stored as parquet as -with tmp as -(select o.id as id, o.country , ro.id as result,r.type from organization o -join result_organization ro on o.id=ro.organization -join result r on r.id=ro.id where o.country <> 'UNKNOWN') -select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations -from tmp as o1 -join tmp as o2 on o1.result=o2.result -where o1.id<>o2.id and o1.country<>o2.country -group by o1.id, o1.type,o2.country; +-- create table indi_result_org_country_collab stored as parquet as +-- with tmp as +-- (select o.id as id, o.country , ro.id as result,r.type from organization o +-- join result_organization ro on o.id=ro.organization +-- join result r on r.id=ro.id where o.country <> 'UNKNOWN') +-- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations +-- from tmp as o1 +-- join tmp as o2 on o1.result=o2.result +-- where o1.id<>o2.id and o1.country<>o2.country +-- group by o1.id, o1.type,o2.country; +-- +-- compute stats indi_result_org_country_collab; -compute stats indi_result_org_country_collab; - -create table indi_result_org_collab stored as parquet as -with tmp as -(select o.id, ro.id as result,r.type from organization o -join result_organization ro on o.id=ro.organization -join result r on r.id=ro.id) -select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations -from tmp as o1 -join tmp as o2 on o1.result=o2.result -where o1.id<>o2.id -group by o1.id, o2.id, o1.type; - -compute stats indi_result_org_collab; +-- create table indi_result_org_collab stored as parquet as +-- with tmp as +-- (select o.id, ro.id as result,r.type from organization o +-- join result_organization ro on o.id=ro.organization +-- join result r on r.id=ro.id) +-- select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations +-- from tmp as o1 +-- join tmp as o2 on o1.result=o2.result +-- where o1.id<>o2.id +-- group by o1.id, o2.id, o1.type; +-- +-- compute stats indi_result_org_collab; create table indi_funder_country_collab stored as parquet as with tmp as (select funder, project, country from organization_projects op diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index bcc9f0b5d..7412910a9 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -18,28 +18,45 @@ create table TARGET.result stored as parquet as select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) union all select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( - 'openorgs____::759d59f05d77188faee99b7493b46805', - 'openorgs____::b84450f9864182c67b8611b5593f4250', - 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', - 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', - 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', - 'openorgs____::d169c7407dd417152596908d48c11460', - 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', - 'openorgs____::2fb1e47b4612688d9de9169d579939a7', - 'openorgs____::759d59f05d77188faee99b7493b46805', - 'openorgs____::cad284878801b9465fa51a95b1d779db', - 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', - 'openorgs____::c0286313e36479eff8676dba9b724b40' - -- ,'openorgs____::c80a8243a5e5c620d7931c88d93bf17a' -- Paris Diderot - ) )) foo; + 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC" + 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council + 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ?? + 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University + 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade + 'openorgs____::2fb1e47b4612688d9de9169d579939a7', --University of Helsinki + 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho + 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid + 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen + 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens + -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot + 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University + 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark + 'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin + 'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt + 'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven + 'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape + 'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute + 'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University + 'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg + 'openorgs____::6445d7758d3a40c4d997953b6632a368' --National Institute of Informatics (NII) + ) )) foo; compute stats TARGET.result; create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_citations; +create table TARGET.result_references_oc stored as parquet as select * from SOURCE.result_references_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_references_oc; + +create table TARGET.result_citations_oc stored as parquet as select * from SOURCE.result_citations_oc orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_citations_oc; + create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_classifications; +create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id); +compute stats TARGET.result_apc; + create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_concepts; @@ -90,11 +107,6 @@ compute stats TARGET.result_sources; create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.result_topics; -create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.result_apc; - - - create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql index e1c36cbc0..b5eba6111 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step7.sql @@ -127,6 +127,7 @@ CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r WHERE r.reltype = 'resultOrganization' + and r.target like '50|%' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index fa3eca1a9..6fa0e6fdf 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -93,7 +93,7 @@ where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r -WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; +WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.target like '20|%' and r.datainfo.invisible=false; -- datasource sources: -- where the datasource info have been collected from.