Merge branch 'beta' of code-repo.d4science.org:D-Net/dnet-hadoop into doiboost_refactor

This commit is contained in:
Sandro La Bruzzo 2022-06-27 16:24:04 +02:00
commit e517f52e30
33 changed files with 2067 additions and 159 deletions

View File

@ -1,18 +1,18 @@
package eu.dnetlib.dhp.common; package eu.dnetlib.dhp.common;
import java.io.IOException;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.HashSet; import java.util.*;
import java.util.List; import java.util.stream.Collectors;
import java.util.Set;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.text.WordUtils; import org.apache.commons.lang3.text.WordUtils;
import com.ctc.wstx.dtd.LargePrefixedNameSet;
import com.google.common.base.Joiner; import com.google.common.base.Joiner;
import com.google.common.base.Splitter; import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.hash.Hashing; import com.google.common.hash.Hashing;
@ -29,7 +29,19 @@ public class PacePerson {
private List<String> fullname = Lists.newArrayList(); private List<String> fullname = Lists.newArrayList();
private final String original; private final String original;
private static Set<String> particles = null; private static Set<String> particles;
static {
try {
particles = new HashSet<>(IOUtils
.readLines(
PacePerson.class
.getResourceAsStream(
"/eu/dnetlib/dhp/common/name_particles.txt")));
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/** /**
* Capitalizes a string * Capitalizes a string
@ -37,29 +49,20 @@ public class PacePerson {
* @param s the string to capitalize * @param s the string to capitalize
* @return the input string with capital letter * @return the input string with capital letter
*/ */
public static final String capitalize(final String s) { public static String capitalize(final String s) {
if (particles.contains(s)) {
return s;
}
return WordUtils.capitalize(s.toLowerCase(), ' ', '-'); return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
} }
/** /**
* Adds a dot to a string with length equals to 1 * Adds a dot to a string with length equals to 1
*/ */
public static final String dotAbbreviations(final String s) { public static String dotAbbreviations(final String s) {
return s.length() == 1 ? s + "." : s; return s.length() == 1 ? s + "." : s;
} }
public static Set<String> loadFromClasspath(final String classpath) {
final Set<String> h = new HashSet<>();
try {
for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) {
h.add(s);
}
} catch (final Throwable e) {
return new HashSet<>();
}
return h;
}
/** /**
* The constructor of the class. It fills the fields of the class basing on the input fullname. * The constructor of the class. It fills the fields of the class basing on the input fullname.
* *
@ -128,10 +131,6 @@ public class PacePerson {
} }
private List<String> splitTerms(final String s) { private List<String> splitTerms(final String s) {
if (particles == null) {
particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt");
}
final List<String> list = Lists.newArrayList(); final List<String> list = Lists.newArrayList();
for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) { for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
if (!particles.contains(part.toLowerCase())) { if (!particles.contains(part.toLowerCase())) {
@ -187,17 +186,36 @@ public class PacePerson {
} }
public List<String> getCapitalFirstnames() { public List<String> getCapitalFirstnames() {
return Lists return Optional
.newArrayList( .ofNullable(getNameWithAbbreviations())
Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize)); .map(
name -> name
.stream()
.map(PacePerson::capitalize)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
} }
public List<String> getCapitalSurname() { public List<String> getCapitalSurname() {
return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize)); return Optional
.ofNullable(getSurname())
.map(
surname -> surname
.stream()
.map(PacePerson::capitalize)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
} }
public List<String> getNameWithAbbreviations() { public List<String> getNameWithAbbreviations() {
return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations)); return Optional
.ofNullable(getName())
.map(
name -> name
.stream()
.map(PacePerson::dotAbbreviations)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
} }
public boolean isAccurate() { public boolean isAccurate() {

View File

@ -3,7 +3,6 @@ package eu.dnetlib.dhp.actionmanager.ror;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION; import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORG_ORG_RELTYPE;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues;
@ -39,7 +38,6 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.ror.model.ExternalIdType; import eu.dnetlib.dhp.actionmanager.ror.model.ExternalIdType;
import eu.dnetlib.dhp.actionmanager.ror.model.Relationship;
import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization; import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.HdfsSupport;
@ -51,7 +49,6 @@ import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2; import scala.Tuple2;
@ -168,38 +165,10 @@ public class GenerateRorActionSetJob {
final List<AtomicAction<? extends Oaf>> res = new ArrayList<>(); final List<AtomicAction<? extends Oaf>> res = new ArrayList<>();
res.add(new AtomicAction<>(Organization.class, o)); res.add(new AtomicAction<>(Organization.class, o));
for (final Relationship rorRel : r.getRelationships()) {
if (rorRel.getType().equalsIgnoreCase("parent")) {
final String orgId1 = calculateOpenaireId(r.getId());
final String orgId2 = calculateOpenaireId(rorRel.getId());
res
.add(
new AtomicAction<>(Relation.class,
calculateHierarchyRel(orgId1, orgId2, ModelConstants.IS_PARENT_OF)));
res
.add(
new AtomicAction<>(Relation.class,
calculateHierarchyRel(orgId2, orgId1, ModelConstants.IS_CHILD_OF)));
}
}
return res; return res;
} }
private static Relation calculateHierarchyRel(final String source, final String target, final String relClass) {
final Relation rel = new Relation();
rel.setSource(source);
rel.setTarget(target);
rel.setRelType(ORG_ORG_RELTYPE);
rel.setSubRelType(ModelConstants.RELATIONSHIP);
rel.setRelClass(relClass);
rel.setCollectedfrom(ROR_COLLECTED_FROM);
rel.setDataInfo(ROR_DATA_INFO);
rel.setLastupdatetimestamp(System.currentTimeMillis());
return rel;
}
private static String calculateOpenaireId(final String rorId) { private static String calculateOpenaireId(final String rorId) {
return String.format("20|%s::%s", ROR_NS_PREFIX, DHPUtils.md5(rorId)); return String.format("20|%s::%s", ROR_NS_PREFIX, DHPUtils.md5(rorId));
} }

View File

@ -19,6 +19,8 @@ import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.aggregation.common.ReporterCallback; import eu.dnetlib.dhp.aggregation.common.ReporterCallback;
import eu.dnetlib.dhp.aggregation.common.ReportingJob; import eu.dnetlib.dhp.aggregation.common.ReportingJob;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.file.FileGZipCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MDStoreCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin;
@ -114,6 +116,10 @@ public class CollectorWorker extends ReportingJob {
return new OaiCollectorPlugin(clientParams); return new OaiCollectorPlugin(clientParams);
case rest_json2xml: case rest_json2xml:
return new RestCollectorPlugin(clientParams); return new RestCollectorPlugin(clientParams);
case file:
return new FileCollectorPlugin(fileSystem);
case fileGZip:
return new FileGZipCollectorPlugin(fileSystem);
case other: case other:
final CollectorPlugin.NAME.OTHER_NAME plugin = Optional final CollectorPlugin.NAME.OTHER_NAME plugin = Optional
.ofNullable(api.getParams().get("other_plugin_type")) .ofNullable(api.getParams().get("other_plugin_type"))

View File

@ -10,7 +10,7 @@ import eu.dnetlib.dhp.common.collection.CollectorException;
public interface CollectorPlugin { public interface CollectorPlugin {
enum NAME { enum NAME {
oai, other, rest_json2xml; oai, other, rest_json2xml, file, fileGZip;
public enum OTHER_NAME { public enum OTHER_NAME {
mdstore_mongodb_dump, mdstore_mongodb mdstore_mongodb_dump, mdstore_mongodb

View File

@ -0,0 +1,80 @@
package eu.dnetlib.dhp.collection.plugin.file;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.Iterator;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.plugin.utils.XMLIterator;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
/**
 * Base collector plugin that reads one XML file from a Hadoop {@link FileSystem} and emits one string
 * per occurrence of a configurable element.
 * <p>
 * The file location is taken from the API descriptor's base URL, the element name from the
 * {@value #SPLIT_ON_ELEMENT} parameter. Subclasses decide how the file is opened by implementing
 * {@link #getBufferedInputStream(Path)}.
 */
public abstract class AbstractSplittedRecordPlugin implements CollectorPlugin {

	private static final Logger log = LoggerFactory.getLogger(AbstractSplittedRecordPlugin.class);

	/** Name of the API parameter holding the local name of the element to split the file on. */
	public static final String SPLIT_ON_ELEMENT = "splitOnElement";

	private final FileSystem fileSystem;

	/**
	 * @param fileSystem the file system used to check for and read the input file
	 */
	public AbstractSplittedRecordPlugin(FileSystem fileSystem) {
		this.fileSystem = fileSystem;
	}

	/**
	 * Opens the file referenced by the API base URL and returns its records as a lazy, ordered stream.
	 * <p>
	 * The underlying input stream is released when the returned stream is closed, so callers should
	 * close it (e.g. with try-with-resources) once done.
	 *
	 * @param api    descriptor providing the base URL (file path) and the {@value #SPLIT_ON_ELEMENT} parameter
	 * @param report aggregation report (not used by this implementation)
	 * @return a sequential stream with one XML string per split element
	 * @throws CollectorException if the base URL or the split parameter is missing, the path does not exist,
	 *                            or the file cannot be opened
	 */
	@Override
	public Stream<String> collect(ApiDescriptor api, AggregatorReport report) throws CollectorException {

		// get path to file
		final Path filePath = Optional
			.ofNullable(api.getBaseUrl())
			.map(Path::new)
			.orElseThrow(() -> new CollectorException("missing baseUrl"));

		log.info("baseUrl: {}", filePath);

		// check that path to file exists
		try {
			if (!fileSystem.exists(filePath)) {
				throw new CollectorException("path does not exist: " + filePath);
			}
		} catch (IOException e) {
			throw new CollectorException(e);
		}

		// get split element; a missing params map is reported like a missing parameter
		final String splitOnElement = Optional
			.ofNullable(api.getParams())
			.map(params -> params.get(SPLIT_ON_ELEMENT))
			.orElseThrow(
				() -> new CollectorException(String
					.format("missing parameter '%s', required by the AbstractSplittedRecordPlugin", SPLIT_ON_ELEMENT)));

		log.info("splitOnElement: {}", splitOnElement);

		final BufferedInputStream bis = getBufferedInputStream(filePath);

		final Iterator<String> xmlIterator;
		try {
			xmlIterator = new XMLIterator(splitOnElement, bis);
		} catch (RuntimeException e) {
			// the iterator could not be created: nobody else will ever close the stream
			closeQuietly(bis, filePath);
			throw e;
		}

		return StreamSupport
			.stream(
				Spliterators.spliteratorUnknownSize(xmlIterator, Spliterator.ORDERED),
				false)
			.onClose(() -> closeQuietly(bis, filePath));
	}

	/**
	 * Closes the given input stream, logging (not propagating) any failure.
	 */
	private static void closeQuietly(final BufferedInputStream bis, final Path filePath) {
		try {
			bis.close();
		} catch (IOException e) {
			log.warn("error closing input stream for {}", filePath, e);
		}
	}

	/**
	 * Opens the given path for reading.
	 *
	 * @param filePath the file to open
	 * @return a buffered stream over the file content
	 * @throws CollectorException if the file cannot be opened
	 */
	abstract protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException;

	/**
	 * @return the file system passed to the constructor
	 */
	public FileSystem getFileSystem() {
		return fileSystem;
	}
}

View File

@ -0,0 +1,33 @@
package eu.dnetlib.dhp.collection.plugin.file;
import java.io.BufferedInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.common.collection.CollectorException;
public class FileCollectorPlugin extends AbstractSplittedRecordPlugin {
private static final Logger log = LoggerFactory.getLogger(FileCollectorPlugin.class);
public FileCollectorPlugin(FileSystem fileSystem) {
super(fileSystem);
}
@Override
protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException {
log.info("filePath: {}", filePath);
try {
FileSystem fs = super.getFileSystem();
return new BufferedInputStream(fs.open(filePath));
} catch (Exception e) {
throw new CollectorException("Error reading file " + filePath, e);
}
}
}

View File

@ -0,0 +1,35 @@
package eu.dnetlib.dhp.collection.plugin.file;
import java.io.BufferedInputStream;
import java.util.zip.GZIPInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.common.collection.CollectorException;
/**
 * {@link AbstractSplittedRecordPlugin} that reads a gzip-compressed input file: the stream opened on the
 * configured {@link FileSystem} is decompressed through a {@link GZIPInputStream}.
 */
public class FileGZipCollectorPlugin extends AbstractSplittedRecordPlugin {

	private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPlugin.class);

	/**
	 * @param fileSystem the file system the input file is read from
	 */
	public FileGZipCollectorPlugin(FileSystem fileSystem) {
		super(fileSystem);
	}

	/**
	 * Opens {@code filePath} and wraps it in a decompressing, buffered stream.
	 *
	 * @param filePath the gzip file to open
	 * @return a buffered stream over the decompressed content
	 * @throws CollectorException wrapping any exception raised while opening the file or reading the gzip header
	 */
	@Override
	protected BufferedInputStream getBufferedInputStream(final Path filePath) throws CollectorException {

		log.info("filePath: {}", filePath);

		java.io.InputStream raw = null;
		try {
			raw = super.getFileSystem().open(filePath);
			// the GZIPInputStream constructor reads the gzip header and may throw
			return new BufferedInputStream(new GZIPInputStream(raw));
		} catch (Exception e) {
			// do not leak the file handle when the gzip wrapper could not be created
			if (raw != null) {
				try {
					raw.close();
				} catch (java.io.IOException closeError) {
					e.addSuppressed(closeError);
				}
			}
			throw new CollectorException("Error reading file " + filePath, e);
		}
	}
}

View File

@ -19,7 +19,7 @@ import org.dom4j.io.XMLWriter;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.XmlCleaner; import eu.dnetlib.dhp.collection.plugin.utils.XmlCleaner;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpConnector2; import eu.dnetlib.dhp.common.collection.HttpConnector2;

View File

@ -30,7 +30,7 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList; import org.w3c.dom.NodeList;
import org.xml.sax.InputSource; import org.xml.sax.InputSource;
import eu.dnetlib.dhp.collection.JsonUtils; import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams; import eu.dnetlib.dhp.common.collection.HttpClientParams;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.collection; package eu.dnetlib.dhp.collection.plugin.utils;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;

View File

@ -0,0 +1,177 @@
package eu.dnetlib.dhp.collection.plugin.utils;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.util.Iterator;
import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Iterator that splits an XML input stream into records: every element whose local name equals the
 * configured splitter element is serialized to a string and returned by {@link #next()}.
 * <p>
 * The input is decoded as UTF-8; malformed or unmappable byte sequences are replaced rather than rejected.
 */
public class XMLIterator implements Iterator<String> {

	private static final Log log = LogFactory.getLog(XMLIterator.class);

	private final ThreadLocal<XMLInputFactory> inputFactory = ThreadLocal.withInitial(XMLIterator::newInputFactory);

	private final ThreadLocal<XMLOutputFactory> outputFactory = ThreadLocal
		.withInitial(XMLOutputFactory::newInstance);

	private final ThreadLocal<XMLEventFactory> eventFactory = ThreadLocal.withInitial(XMLEventFactory::newInstance);

	public static final String UTF_8 = "UTF-8";

	final XMLEventReader parser;

	/** Start event of the next record to return, or null when there is none. */
	private XMLEvent current = null;

	/** Local name of the element the input is split on. */
	private final String element;

	private final InputStream inputStream;

	/**
	 * Creates the iterator and positions it on the first occurrence of the splitter element.
	 *
	 * @param element     local name of the element delimiting a record
	 * @param inputStream the XML content to split
	 * @throws RuntimeException if the XML event reader cannot be created
	 */
	public XMLIterator(final String element, final InputStream inputStream) {
		super();
		this.element = element;
		this.inputStream = inputStream;
		this.parser = getParser();
		try {
			this.current = findElement(parser);
		} catch (XMLStreamException e) {
			// keep the iterator usable (and empty), but do not lose the cause
			log.warn("cannot init parser position. No element found: " + element, e);
			current = null;
		}
	}

	@Override
	public boolean hasNext() {
		return current != null;
	}

	/**
	 * Serializes the current record and advances to the next occurrence of the splitter element.
	 *
	 * @return the XML of the current record
	 * @throws java.util.NoSuchElementException if there are no more records
	 * @throws RuntimeException                 if the parser fails while looking for the next record
	 */
	@Override
	public String next() {
		if (current == null) {
			// honour the Iterator contract instead of returning an empty string
			throw new java.util.NoSuchElementException("no more '" + element + "' elements");
		}
		String result = null;
		try {
			result = copy(parser);
			current = findElement(parser);
			return result;
		} catch (XMLStreamException e) {
			throw new RuntimeException(String.format("error copying xml, built so far: '%s'", result), e);
		}
	}

	@Override
	public void remove() {
		throw new UnsupportedOperationException();
	}

	/**
	 * Writes the current start element and all following events, up to and including the first end
	 * element named like the splitter element, into a string.
	 * <p>
	 * Note: because of the {@code return} in the {@code finally} block, any exception raised while
	 * copying is discarded and whatever has been serialized so far is returned.
	 */
	@SuppressWarnings("finally")
	private String copy(final XMLEventReader parser) throws XMLStreamException {
		final StringWriter result = new StringWriter();
		try {
			final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(result);
			final StartElement start = current.asStartElement();
			final StartElement newRecord = eventFactory
				.get()
				.createStartElement(start.getName(), start.getAttributes(), start.getNamespaces());

			// new root record
			writer.add(newRecord);

			// copy the rest as it is
			while (parser.hasNext()) {
				final XMLEvent event = parser.nextEvent();

				// TODO: replace with depth tracking instead of close tag tracking.
				if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) {
					writer.add(event);
					break;
				}

				writer.add(event);
			}
			writer.close();
		} finally {
			return result.toString();
		}
	}

	/**
	 * Looks for the next occurrence of the splitter element and consumes its start event.
	 *
	 * @param parser the event reader to advance
	 * @return the start event of the next splitter element, or null if the input is exhausted
	 * @throws XMLStreamException if the parser fails while reading
	 */
	private XMLEvent findElement(final XMLEventReader parser) throws XMLStreamException {

		final XMLEvent peek = parser.peek();
		if (peek != null && peek.isStartElement()
			&& element.equals(peek.asStartElement().getName().getLocalPart())) {
			// consume the peeked event: copy() re-creates the start element itself, so leaving it
			// in the reader would make the start tag appear twice in the record
			return parser.nextEvent();
		}

		while (parser.hasNext()) {
			final XMLEvent event = parser.nextEvent();
			if (event != null && event.isStartElement()) {
				final String name = event.asStartElement().getName().getLocalPart();
				if (element.equals(name)) {
					return event;
				}
			}
		}
		return null;
	}

	/**
	 * Creates the input factory; external entities are not resolved, since the parsed content comes
	 * from collected files.
	 */
	private static XMLInputFactory newInputFactory() {
		final XMLInputFactory factory = XMLInputFactory.newInstance();
		factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
		return factory;
	}

	private XMLEventReader getParser() {
		try {
			return inputFactory.get().createXMLEventReader(sanitize(inputStream));
		} catch (XMLStreamException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Wraps the stream in a UTF-8 reader that replaces malformed or unmappable input instead of failing.
	 */
	private Reader sanitize(final InputStream in) {
		final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder();
		charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE);
		charsetDecoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
		return new InputStreamReader(in, charsetDecoder);
	}

}

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.collection; package eu.dnetlib.dhp.collection.plugin.utils;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;

View File

@ -47,13 +47,18 @@ object DataciteToOAFTransformation {
} }
/** This method should skip record if json contains invalid text /** This method should skip record if json contains invalid text
* defined in gile datacite_filter * defined in file datacite_filter
* *
* @param json * @param record : unparsed datacite record
* @param json : parsed record
* @return True if the record should be skipped * @return True if the record should be skipped
*/ */
def skip_record(json: String): Boolean = { def skip_record(record: String, json: org.json4s.JValue): Boolean = {
datacite_filter.exists(f => json.contains(f)) implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
datacite_filter.exists(f => record.contains(f)) || (json \\ "publisher")
.extractOrElse[String]("")
.equalsIgnoreCase("FAIRsharing")
} }
@deprecated("this method will be removed", "dhp") @deprecated("this method will be removed", "dhp")
@ -304,12 +309,13 @@ object DataciteToOAFTransformation {
vocabularies: VocabularyGroup, vocabularies: VocabularyGroup,
exportLinks: Boolean exportLinks: Boolean
): List[Oaf] = { ): List[Oaf] = {
if (skip_record(input))
return List()
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input) lazy val json = parse(input)
if (skip_record(input, json))
return List()
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null) val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
val resourceTypeGeneral = val resourceTypeGeneral =
(json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null) (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)

View File

@ -0,0 +1,61 @@
package eu.dnetlib.dhp.collection.plugin.file;
import java.io.IOException;
import java.util.HashMap;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
import net.bytebuddy.asm.Advice;
/**
 * Smoke test for {@link FileCollectorPlugin}: collects from the {@code opendoar.xml} test resource,
 * splitting on the {@code repository} element.
 */
public class FileCollectorPluginTest {

	// logger bound to this test class (it previously referenced FileGZipCollectorPluginTest)
	private static final Logger log = LoggerFactory.getLogger(FileCollectorPluginTest.class);

	private final ApiDescriptor api = new ApiDescriptor();

	private FileCollectorPlugin plugin;

	private static final String SPLIT_ON_ELEMENT = "repository";

	/**
	 * Points the API descriptor at the XML test resource and creates the plugin under test.
	 */
	@BeforeEach
	public void setUp() throws IOException {

		final String xmlFile = this
			.getClass()
			.getResource("/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml")
			.getFile();

		api.setBaseUrl(xmlFile);

		HashMap<String, String> params = new HashMap<>();
		params.put("splitOnElement", SPLIT_ON_ELEMENT);

		api.setParams(params);

		FileSystem fs = FileSystem.get(new Configuration());
		plugin = new FileCollectorPlugin(fs);
	}

	/**
	 * Checks that each of the first (at most 10) collected records is a non-empty string.
	 */
	@Test
	void test() throws CollectorException {

		final Stream<String> stream = plugin.collect(api, new AggregatorReport());

		stream.limit(10).forEach(s -> {
			Assertions.assertTrue(s.length() > 0);
			log.info(s);
		});
	}
}

View File

@ -0,0 +1,68 @@
package eu.dnetlib.dhp.collection.plugin.file;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.HashMap;
import java.util.Objects;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
/**
 * Smoke test for {@link FileGZipCollectorPlugin}: collects from the {@code opendoar.xml.gz} test
 * resource, splitting on the {@code repository} element.
 */
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@ExtendWith(MockitoExtension.class)
public class FileGZipCollectorPluginTest {

	private static final Logger log = LoggerFactory.getLogger(FileGZipCollectorPluginTest.class);

	private static final String SPLIT_ON_ELEMENT = "repository";

	private final ApiDescriptor api = new ApiDescriptor();

	private FileGZipCollectorPlugin plugin;

	/**
	 * Points the API descriptor at the gzip test resource and creates the plugin under test.
	 */
	@BeforeEach
	public void setUp() throws IOException {

		final String resourcePath = "/eu/dnetlib/dhp/collection/plugin/file/opendoar.xml.gz";
		final String gzipPath = Objects.requireNonNull(getClass().getResource(resourcePath)).getFile();
		api.setBaseUrl(gzipPath);

		final HashMap<String, String> apiParams = new HashMap<>();
		apiParams.put("splitOnElement", SPLIT_ON_ELEMENT);
		api.setParams(apiParams);

		plugin = new FileGZipCollectorPlugin(FileSystem.get(new Configuration()));
	}

	/**
	 * Checks that each of the first (at most 10) collected records is a non-empty string.
	 */
	@Test
	void test() throws CollectorException {

		final Stream<String> firstRecords = plugin.collect(api, new AggregatorReport()).limit(10);

		firstRecords.forEach(xml -> {
			Assertions.assertTrue(xml.length() > 0);
			log.info(xml);
		});
	}
}

View File

@ -107,4 +107,19 @@ class DataciteToOAFTest extends AbstractVocabularyTest {
} }
/** Checks that generateOAF returns no entities for the record_fairsharing.json fixture. */
@Test
def testFilter(): Unit = {
  val record = Source
    .fromInputStream(
      getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record_fairsharing.json")
    )
    .mkString

  // the record is expected to be discarded, so the transformation must produce an empty list
  val res: List[Oaf] = DataciteToOAFTransformation.generateOAF(record, 0L, 0L, vocabularies, true)
  assertTrue(res.isEmpty)
}
} }

View File

@ -102,21 +102,28 @@ public class SparkCountryPropagationJob {
private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() { private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
return t -> { return t -> {
Optional.ofNullable(t._2()).ifPresent(r -> { Optional.ofNullable(t._2()).ifPresent(r -> {
if (Optional.ofNullable(t._1().getCountry()).isPresent())
t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet())); t._1().getCountry().addAll(merge(t._1().getCountry(), r.getCountrySet()));
else
t._1().setCountry(merge(null, t._2().getCountrySet()));
}); });
return t._1(); return t._1();
}; };
} }
private static List<Country> merge(List<Country> c1, List<CountrySbs> c2) { private static List<Country> merge(List<Country> c1, List<CountrySbs> c2) {
HashSet<String> countries = c1 HashSet<String> countries = new HashSet<>();
if (Optional.ofNullable(c1).isPresent()) {
countries = c1
.stream() .stream()
.map(Qualifier::getClassid) .map(Qualifier::getClassid)
.collect(Collectors.toCollection(HashSet::new)); .collect(Collectors.toCollection(HashSet::new));
}
HashSet<String> finalCountries = countries;
return c2 return c2
.stream() .stream()
.filter(c -> !countries.contains(c.getClassid())) .filter(c -> !finalCountries.contains(c.getClassid()))
.map(c -> getCountry(c.getClassid(), c.getClassname())) .map(c -> getCountry(c.getClassid(), c.getClassname()))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }

View File

@ -18,14 +18,7 @@ import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.oaiIProvenance;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
import java.util.ArrayList; import java.util.*;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.dom4j.*; import org.dom4j.*;
@ -35,6 +28,7 @@ import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.AccessRight; import eu.dnetlib.dhp.schema.oaf.AccessRight;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Context; import eu.dnetlib.dhp.schema.oaf.Context;
@ -199,8 +193,13 @@ public abstract class AbstractMdRecordToOafMapper {
final List<Oaf> oafs = Lists.newArrayList(entity); final List<Oaf> oafs = Lists.newArrayList(entity);
if (!oafs.isEmpty()) { if (!oafs.isEmpty()) {
oafs.addAll(addProjectRels(doc, entity)); Set<Oaf> rels = Sets.newHashSet();
oafs.addAll(addOtherResultRels(doc, entity));
rels.addAll(addProjectRels(doc, entity));
rels.addAll(addOtherResultRels(doc, entity));
rels.addAll(addRelations(doc, entity));
oafs.addAll(rels);
} }
return oafs; return oafs;
@ -278,6 +277,46 @@ public abstract class AbstractMdRecordToOafMapper {
return res; return res;
} }
/**
 * Builds the relations declared by the {@code oaf:relation} elements of the document.
 * <p>
 * An element is considered only when its text (the target) and its {@code relType}, {@code subRelType},
 * {@code relClass} and {@code targetType} attributes are all non-blank. For each such element two
 * relations are produced: one from the entity to the target with the declared relClass, and one from
 * the target to the entity with the inverse relClass.
 *
 * @param doc    the parsed metadata record
 * @param entity the entity the relations are attached to
 * @return the list of created relations, possibly empty
 */
private List<Oaf> addRelations(Document doc, OafEntity entity) {

	final List<Oaf> rels = Lists.newArrayList();

	for (Object o : doc.selectNodes("//oaf:relation")) {
		final Element element = (Element) o;

		final String target = StringUtils.trim(element.getText());
		final String relType = element.attributeValue("relType");
		final String subRelType = element.attributeValue("subRelType");
		final String relClass = element.attributeValue("relClass");

		// skip elements lacking the target or any part of the relation semantics
		if (StringUtils.isBlank(target) || StringUtils.isBlank(relType) || StringUtils.isBlank(subRelType)
			|| StringUtils.isBlank(relClass)) {
			continue;
		}

		final String relClassInverse = ModelSupport
			.findInverse(ModelSupport.rel(relType, subRelType, relClass))
			.getInverseRelClass();
		final String validationDate = ((Node) o).valueOf("@validationDate");

		// the target type is needed to build the target identifier
		final String targetType = element.attributeValue("targetType");
		if (StringUtils.isBlank(targetType)) {
			continue;
		}

		final String targetId = createOpenaireId(targetType, target, true);

		// direct relation: entity -> target
		rels
			.add(
				getRelation(
					entity.getId(), targetId, relType, subRelType, relClass, entity, validationDate));
		// inverse relation: target -> entity
		rels
			.add(
				getRelation(
					targetId, entity.getId(), relType, subRelType, relClassInverse, entity,
					validationDate));
	}
	return rels;
}
protected Relation getRelation(final String source, protected Relation getRelation(final String source,
final String target, final String target,
final String relType, final String relType,

View File

@ -57,14 +57,10 @@ class MappersTest {
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml); final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
assertEquals(3, list.size()); assertEquals(1, list.stream().filter(o -> o instanceof Publication).count());
assertTrue(list.get(0) instanceof Publication); assertEquals(4, list.stream().filter(o -> o instanceof Relation).count());
assertTrue(list.get(1) instanceof Relation);
assertTrue(list.get(2) instanceof Relation);
final Publication p = (Publication) list.get(0); Publication p = (Publication) list.stream().filter(o -> o instanceof Publication).findFirst().get();
final Relation r1 = (Relation) list.get(1);
final Relation r2 = (Relation) list.get(2);
assertValidId(p.getId()); assertValidId(p.getId());
@ -125,26 +121,58 @@ class MappersTest {
assertNotNull(p.getBestaccessright()); assertNotNull(p.getBestaccessright());
assertEquals("OPEN", p.getBestaccessright().getClassid()); assertEquals("OPEN", p.getBestaccessright().getClassid());
assertValidId(r1.getSource());
assertValidId(r1.getTarget()); // RESULT PROJECT
assertValidId(r2.getSource()); List<Relation> resultProject = list
assertValidId(r2.getTarget()); .stream()
assertValidId(r1.getCollectedfrom().get(0).getKey()); .filter(o -> o instanceof Relation)
assertValidId(r2.getCollectedfrom().get(0).getKey()); .map(o -> (Relation) o)
assertNotNull(r1.getDataInfo()); .filter(r -> ModelConstants.RESULT_PROJECT.equals(r.getRelType()))
assertNotNull(r2.getDataInfo()); .collect(Collectors.toList());
assertNotNull(r1.getDataInfo().getTrust());
assertNotNull(r2.getDataInfo().getTrust()); assertEquals(2, resultProject.size());
assertEquals(r1.getSource(), r2.getTarget()); final Relation rp1 = resultProject.get(0);
assertEquals(r2.getSource(), r1.getTarget()); final Relation rp2 = resultProject.get(1);
assertTrue(StringUtils.isNotBlank(r1.getRelClass()));
assertTrue(StringUtils.isNotBlank(r2.getRelClass())); verifyRelation(rp1);
assertTrue(StringUtils.isNotBlank(r1.getRelType())); verifyRelation(rp2);
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
assertTrue(r1.getValidated()); assertTrue(rp1.getValidated());
assertTrue(r2.getValidated()); assertTrue(rp2.getValidated());
assertEquals("2020-01-01", r1.getValidationDate()); assertEquals("2020-01-01", rp1.getValidationDate());
assertEquals("2020-01-01", r2.getValidationDate()); assertEquals("2020-01-01", rp2.getValidationDate());
assertEquals(rp1.getSource(), rp2.getTarget());
assertEquals(rp2.getSource(), rp1.getTarget());
// AFFILIATIONS
List<Relation> affiliation = list
.stream()
.filter(o -> o instanceof Relation)
.map(o -> (Relation) o)
.filter(r -> ModelConstants.RESULT_ORGANIZATION.equals(r.getRelType()))
.collect(Collectors.toList());
assertEquals(2, affiliation.size());
final Relation aff1 = affiliation.get(0);
final Relation aff2 = affiliation.get(1);
verifyRelation(aff1);
verifyRelation(aff2);
assertEquals(aff1.getSource(), aff2.getTarget());
assertEquals(aff2.getSource(), aff1.getTarget());
}
private void verifyRelation(Relation r) {
assertValidId(r.getSource());
assertValidId(r.getTarget());
assertValidId(r.getCollectedfrom().get(0).getKey());
assertNotNull(r.getDataInfo());
assertNotNull(r.getDataInfo().getTrust());
assertTrue(StringUtils.isNotBlank(r.getRelClass()));
assertTrue(StringUtils.isNotBlank(r.getRelType()));
} }
@Test @Test
@ -734,6 +762,51 @@ class MappersTest {
assertFalse(p_cleaned.getTitle().isEmpty()); assertFalse(p_cleaned.getTitle().isEmpty());
} }
/**
 * Maps the {@code odf_zenodo.xml} fixture with {@link OdfToOafMapper} and checks the first mapped
 * element, a {@link Publication}: valid identifiers, exactly one non-blank title and two authors.
 *
 * <p>The two authors are looked up by pid value. The one carrying {@code 0000-0003-3272-8007}
 * (creator name written without a comma in the fixture) is expected to keep only the full name,
 * with blank name and surname; the one carrying {@code 0000-0003-3272-8008} (creator name written
 * with a comma) is expected to have name, surname and full name all populated.
 *
 * @throws IOException if the fixture cannot be read or the mapped list cannot be serialized
 * @throws DocumentException declared by the mapping call
 */
@Test
void testZenodo() throws IOException, DocumentException {
	final String record = IOUtils
		.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_zenodo.xml")));
	final List<Oaf> mapped = new OdfToOafMapper(vocs, false, true).processMdRecord(record);

	// Dump the mapped objects as JSON to ease debugging of a failing run.
	System.out.println("***************");
	System.out.println(new ObjectMapper().writeValueAsString(mapped));
	System.out.println("***************");

	final Publication publication = (Publication) mapped.get(0);

	// Identifiers
	assertValidId(publication.getId());
	assertValidId(publication.getCollectedfrom().get(0).getKey());

	// Title: exactly one, not blank
	assertNotNull(publication.getTitle());
	assertFalse(publication.getTitle().isEmpty());
	assertEquals(1, publication.getTitle().size());
	assertTrue(StringUtils.isNotBlank(publication.getTitle().get(0).getValue()));

	// Authors: both creators of the fixture must be present
	assertNotNull(publication.getAuthor());
	assertEquals(2, publication.getAuthor().size());

	// First creator: only the full name is expected to be set.
	final Author fullnameOnly = publication
		.getAuthor()
		.stream()
		.filter(a -> a.getPid().stream().anyMatch(pi -> pi.getValue().equals("0000-0003-3272-8007")))
		.findFirst()
		.get();
	assertNotNull(fullnameOnly);
	assertTrue(StringUtils.isBlank(fullnameOnly.getSurname()));
	assertTrue(StringUtils.isBlank(fullnameOnly.getName()));
	assertEquals("Anne van Weerden", fullnameOnly.getFullname());

	// Second creator: name, surname and full name are all expected to be set.
	final Author splitName = publication
		.getAuthor()
		.stream()
		.filter(a -> a.getPid().stream().anyMatch(pi -> pi.getValue().equals("0000-0003-3272-8008")))
		.findFirst()
		.get();
	assertNotNull(splitName);
	assertFalse(StringUtils.isBlank(splitName.getSurname()));
	assertFalse(StringUtils.isBlank(splitName.getName()));
	assertFalse(StringUtils.isBlank(splitName.getFullname()));
}
@Test @Test
void testOdfFromHdfs() throws IOException, DocumentException { void testOdfFromHdfs() throws IOException, DocumentException {
final String xml = IOUtils final String xml = IOUtils
@ -835,6 +908,20 @@ class MappersTest {
assertEquals("EUR", p.getProcessingchargecurrency().getValue()); assertEquals("EUR", p.getProcessingchargecurrency().getValue());
} }
/**
 * Smoke test: maps the {@code rohub.xml} fixture with {@link OdfToOafMapper} and prints the mapped
 * objects as JSON. It currently makes no assertions; the checks on the first mapped element are
 * still disabled (see the commented lines at the end of the method).
 *
 * @throws IOException if the fixture cannot be read or the mapped list cannot be serialized
 * @throws DocumentException declared by the mapping call
 */
@Test
void testROHub() throws IOException, DocumentException {
	final String record = IOUtils
		.toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub.xml")));
	final List<Oaf> mapped = new OdfToOafMapper(vocs, false, true).processMdRecord(record);

	final String separator = "***************";
	System.out.println(separator);
	System.out.println(new ObjectMapper().writeValueAsString(mapped));
	System.out.println(separator);

	// Disabled assertions, kept for when the expected mapping is settled:
	// final Dataset p = (Dataset) mapped.get(0);
	// assertValidId(p.getId());
	// assertValidId(p.getCollectedfrom().get(0).getKey());
	// System.out.println(p.getTitle().get(0).getValue());
	// assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
}
private void assertValidId(final String id) { private void assertValidId(final String id) {
// System.out.println(id); // System.out.println(id);

View File

@ -497,6 +497,7 @@ dnet:publication_resource @=@ 0044 @=@ Graduate diploma
dnet:publication_resource @=@ 0044 @=@ Undergraduate diploma dnet:publication_resource @=@ 0044 @=@ Undergraduate diploma
dnet:publication_resource @=@ 0000 @=@ UNKNOWN dnet:publication_resource @=@ 0000 @=@ UNKNOWN
dnet:publication_resource @=@ 0042 @=@ EGI Virtual Appliance dnet:publication_resource @=@ 0042 @=@ EGI Virtual Appliance
dnet:publication_resource @=@ 0048 @=@ RO-crate
dnet:languages @=@ abk @=@ ab dnet:languages @=@ abk @=@ ab
dnet:languages @=@ aar @=@ aa dnet:languages @=@ aar @=@ aa
dnet:languages @=@ afr @=@ af dnet:languages @=@ afr @=@ af

View File

@ -164,6 +164,7 @@ dnet:publication_resource @=@ dnet:publication_resource @=@ 0030 @=@ Sound
dnet:publication_resource @=@ dnet:publication_resource @=@ 0044 @=@ Thesis dnet:publication_resource @=@ dnet:publication_resource @=@ 0044 @=@ Thesis
dnet:publication_resource @=@ dnet:publication_resource @=@ 0000 @=@ Unknown dnet:publication_resource @=@ dnet:publication_resource @=@ 0000 @=@ Unknown
dnet:publication_resource @=@ dnet:publication_resource @=@ 0042 @=@ Virtual Appliance dnet:publication_resource @=@ dnet:publication_resource @=@ 0042 @=@ Virtual Appliance
dnet:publication_resource @=@ dnet:publication_resource @=@ 0048 @=@ Research Object
ec:funding_typologies @=@ ec:funding_typologies @=@ ec:frameworkprogram @=@ frameworkprogram ec:funding_typologies @=@ ec:funding_typologies @=@ ec:frameworkprogram @=@ frameworkprogram
ec:funding_typologies @=@ ec:funding_typologies @=@ ec:program @=@ program ec:funding_typologies @=@ ec:funding_typologies @=@ ec:program @=@ program
ec:funding_typologies @=@ ec:funding_typologies @=@ ec:specificprogram @=@ specificprogram ec:funding_typologies @=@ ec:funding_typologies @=@ ec:specificprogram @=@ specificprogram

View File

@ -60,6 +60,15 @@
<oaf:fulltext>https://oneecosystem.pensoft.net/article/13718/</oaf:fulltext> <oaf:fulltext>https://oneecosystem.pensoft.net/article/13718/</oaf:fulltext>
<oaf:journal eissn="2367-8194" issn="">One Ecosystem</oaf:journal> <oaf:journal eissn="2367-8194" issn="">One Ecosystem</oaf:journal>
<oaf:refereed>0001</oaf:refereed> <oaf:refereed>0001</oaf:refereed>
<oaf:relation relClass="hasAuthorInstitution"
relType="resultOrganization"
subRelType="affiliation"
targetType="organization">ror_________::https://ror.org/02gdcn153</oaf:relation>
<oaf:relation relClass="isProducedBy"
relType="resultProject"
subRelType="outcome"
targetType="project"
validationDate="2020-01-01">corda_______::226852</oaf:relation>
</metadata> </metadata>
<about xmlns:oai="http://www.openarchives.org/OAI/2.0/"> <about xmlns:oai="http://www.openarchives.org/OAI/2.0/">
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd"> <provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">

View File

@ -0,0 +1,69 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:datacite="http://datacite.org/schema/kernel-3"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri">
<header xmlns="http://www.openarchives.org/OAI/2.0/">
<identifier>oai:zenodo.org:3406824</identifier>
<datestamp>2020-01-20T16:45:20Z</datestamp>
<setSpec>openaire</setSpec>
<dr:dateOfTransformation>2022-06-07T10:21:24.06Z</dr:dateOfTransformation>
<dri:objIdentifier>test________::92fe3efa47883b2f3401e6a4bd92e9d7</dri:objIdentifier>
<dri:dateOfCollection>2020-05-21T05:26:15.93Z</dri:dateOfCollection>
<dri:dateOfTransformation>2020-08-01T11:06:26.977Z</dri:dateOfTransformation>
</header>
<metadata>
<resource xmlns="http://datacite.org/schema/kernel-4"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.1/metadata.xsd">
<identifier identifierType="DOI">10.5281/zenodo.3406824</identifier>
<alternateIdentifiers xmlns="http://datacite.org/schema/kernel-3">
<alternateIdentifier alternateIdentifierType="URL">http://dx.doi.org/10.5281/zenodo.3406824</alternateIdentifier>
</alternateIdentifiers>
<creators>
<creator>
<creatorName>Anne van Weerden</creatorName>
<nameIdentifier nameIdentifierScheme="ORCID" schemeURI="http://orcid.org/">0000-0003-3272-8007</nameIdentifier>
<affiliation>Utrecht University Library</affiliation>
</creator>
<creator>
<creatorName>Anne van, Weerden</creatorName>
<nameIdentifier nameIdentifierScheme="ORCID" schemeURI="http://orcid.org/">0000-0003-3272-8008</nameIdentifier>
<affiliation>Utrecht University Library</affiliation>
</creator>
</creators>
<titles>
<title>Helen Bayly and Catherine Disney as influences in the life of Sir William Rowan Hamilton</title>
</titles>
<publisher>Zenodo</publisher>
<publicationYear>2018</publicationYear>
<subjects>
<subject>Sir William Rowan Hamilton, Lady Helena Maria Hamilton Bayly, Catherine Disney, Ireland, history, biography, nineteenth century</subject>
</subjects>
<dates>
<date dateType="Issued">2018-12-28</date>
</dates>
<language>en</language>
<resourceType resourceTypeGeneral="JournalArticle"/>
<relatedIdentifiers>
<relatedIdentifier relatedIdentifierType="DOI" relationType="IsVersionOf">10.5281/zenodo.3406823</relatedIdentifier>
</relatedIdentifiers>
<rightsList>
<rights rightsURI="https://creativecommons.org/licenses/by/4.0/legalcode">Creative Commons Attribution 4.0 International</rights>
<rights rightsURI="info:eu-repo/semantics/openAccess">Open Access</rights>
</rightsList>
<descriptions>
<description descriptionType="Abstract"><p>In the 1880s Robert Graves published a biography about Sir William Rowan Hamilton (1805-1865), to which in a 1980 biography Thomas Hankins added further information. From these biographies a picture emerged of a man who was unhappily married because he had lost the love of his life, which raised the question how such an unhappy man could produce so much beautiful mathematics. In this article it is stated that a main cause for the unhappy picture is that Graves ignored the influence on one another of Hamilton and his wife Helen Bayly, and Hankins that of Hamilton and his first and lost love Catherine Disney. It is then shown that if these influences are taken into account a very different view on Hamilton;s private life arises, in which he was happily married to a wife who enabled him to work as he needed to.</p></description>
</descriptions>
</resource>
<oaf:identifier identifierType="doi">10.5281/zenodo.3406824</oaf:identifier>
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
<oaf:dateAccepted>2018-12-28</oaf:dateAccepted>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:license>https://creativecommons.org/licenses/by/4.0/legalcode</oaf:license>
<oaf:language>eng</oaf:language>
<oaf:hostedBy name="ZENODO" id="opendoar____::2659"/>
<oaf:collectedFrom name="ZENODO" id="opendoar____::2659"/>
</metadata>
</record>

View File

@ -0,0 +1,103 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:xs="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header xmlns="http://www.openarchives.org/OAI/2.0/">
<dri:objIdentifier>eosca5322f5f::4dd1aaf93ae136b65dc9ee4e6f76eac9</dri:objIdentifier>
<dri:recordIdentifier>53aa90bf-c593-4e6d-923f-d4711ac4b0e1</dri:recordIdentifier>
<dri:dateOfCollection>2022-05-25T15:35:48.262Z</dri:dateOfCollection>
<oaf:datasourceprefix>eosca5322f5f</oaf:datasourceprefix>
<identifier>53aa90bf-c593-4e6d-923f-d4711ac4b0e1</identifier>
<datestamp>2022-05-25T15:35:38Z</datestamp>
<setSpec>rohub_data</setSpec>
<setSpec>ro-crate_data</setSpec>
<dr:dateOfTransformation>2022-05-25T15:36:11.094Z</dr:dateOfTransformation>
</header>
<metadata>
<oaire:resource xmlns="http://namespace.openaire.eu/schema/oaire/">
<datacite:identifier identifierType="landingPage">https://w3id.org/ro-id/53aa90bf-c593-4e6d-923f-d4711ac4b0e1</datacite:identifier>
<datacite:alternateIdentifiers>
<datacite:alternateIdentifier alternateIdentifierType="URL">http://api.rohub.org/api/ros/53aa90bf-c593-4e6d-923f-d4711ac4b0e1/</datacite:alternateIdentifier>
</datacite:alternateIdentifiers>
<datacite:relatedIdentifiers>
<datacite:relatedIdentifier relatedIdentifierType="" relationType="">
https://github.com/NordicESMhub/RELIANCE/blob/main/content/science/notebooks/air_quality_lockdown.ipynb
</datacite:relatedIdentifier>
<datacite:relatedIdentifier relatedIdentifierType="URI" relationType="IsPartOf">https://github.com/NordicESMhub/RELIANCE/blob/main/content/science/notebooks/air_quality_lockdown.ipynb</datacite:relatedIdentifier>
<datacite:relatedIdentifier relatedIdentifierType="" relationType="">
https://nordicesmhub.github.io/RELIANCE/science/notebooks/air_quality_lockdown.html
</datacite:relatedIdentifier>
<datacite:relatedIdentifier relatedIdentifierType="URI" relationType="IsPartOf">https://nordicesmhub.github.io/RELIANCE/science/notebooks/air_quality_lockdown.html</datacite:relatedIdentifier>
</datacite:relatedIdentifiers>
<creators xmlns="http://datacite.org/schema/kernel-4">
<creator>
<creator>
<creatorName>Anne Fouilloux</creatorName>
</creator>
</creator>
</creators>
<dates xmlns="http://datacite.org/schema/kernel-4">
<date dateType="Created">2021-12-19T21:18:33Z</date>
</dates>
<dc:descriptions>
<dc:description descriptionType="Abstract">The COVID-19 pandemic has led to significant reductions in economic activity, especially during lockdowns. Several studies has shown that the concentration of nitrogen dioxyde and particulate matter levels have reduced during lockdown events. Reductions in transportation sector emissions are most likely largely responsible for the NO2 anomalies. In this study, we analyze the impact of lockdown events on the air quality using data from Copernicus Atmosphere Monitoring Service over Europe and at selected locations.</dc:description>
</dc:descriptions>
<oaire:fundingReferences>
<oaire:fundingReference>
<oaire:funderName>European Commission</oaire:funderName>
<oaire:funderIdentifier funderIdentifierType="Crossref Funder ID">10.13039/501100000781</oaire:funderIdentifier>
<oaire:awardNumber awardURI="">101017502</oaire:awardNumber>
<oaire:awardTitle>Research Lifecycle Management for Earth Science Communities and Copernicus Users</oaire:awardTitle>
</oaire:fundingReference>
</oaire:fundingReferences>
<oaire:licenseCondition uri="https://opensource.org/licenses/MIT">MIT License</oaire:licenseCondition>
<dc:publisher>University of Oslo</dc:publisher>
<dc:publicationYear>2021</dc:publicationYear>
<oaire:resourceType resourceTypeGeneral="other research product" uri="http://purl.org/coar/resource_type/c_1843">RO-crate</oaire:resourceType>
<rightsList xmlns="http://datacite.org/schema/kernel-4">
<rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</rights>
</rightsList>
<sizes xmlns="http://datacite.org/schema/kernel-4">
<size>11.971 MB</size>
</sizes>
<subjects xmlns="http://datacite.org/schema/kernel-4">
<subject>Applied sciences</subject>
<subject>Meteorology</subject>
<subject>EOSC::RO-crate</subject>
</subjects>
<titles xmlns="http://datacite.org/schema/kernel-4">
<title>Impact of the Covid-19 Lockdown on Air quality over Europe</title>
</titles>
</oaire:resource>
<oaf:identifier identifierType="URL">https://w3id.org/ro-id/53aa90bf-c593-4e6d-923f-d4711ac4b0e1</oaf:identifier>
<dr:CobjCategory type="other">0048</dr:CobjCategory>
<oaf:dateAccepted/>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:license>https://opensource.org/licenses/MIT</oaf:license>
<oaf:language>und</oaf:language>
<oaf:hostedBy id="eosc________::psnc::psnc.rohub" name="ROHub"/>
<oaf:collectedFrom id="eosc________::psnc::psnc.rohub" name="ROHub"/>
</metadata>
<about xmlns:oai="http://www.openarchives.org/OAI/2.0/" xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance">
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2022-05-25T15:35:48.262Z">
<baseURL>https%3A%2F%2Fapi.rohub.org%2Fapi%2Foai2d%2F</baseURL>
<identifier>53aa90bf-c593-4e6d-923f-d4711ac4b0e1</identifier>
<datestamp>2022-05-25T15:35:38Z</datestamp>
<metadataNamespace/>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk"
classname="Harvested" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</record>

View File

@ -81,3 +81,33 @@ where reltype='resultResult'
and r1.resulttype.classname != 'other' and r1.resulttype.classname != 'other'
and r2.resulttype.classname != 'other' and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE; and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;
-- Number of distinct citing results per cited result, counting only Cites relations
-- whose provenance action classid is sysimport:crosswalk:opencitations.
-- Ids are emitted without their first three characters (substr from position 4).
-- NOTE(review): the classname inequality below keeps only pairs of results with
-- different result types - confirm this is intended.
CREATE TABLE ${stats_db_name}.result_citations_oc STORED AS PARQUET AS
SELECT substr(target, 4)                 AS id,
       count(DISTINCT substr(source, 4)) AS citations
FROM ${openaire_db_name}.relation rel
         JOIN ${openaire_db_name}.result r1 ON rel.source = r1.id
         JOIN ${openaire_db_name}.result r2 ON r2.id = rel.target
WHERE reltype = 'resultResult'
  AND relClass = 'Cites'
  AND rel.datainfo.provenanceaction.classid = 'sysimport:crosswalk:opencitations'
  AND rel.datainfo.deletedbyinference = false AND rel.datainfo.invisible = FALSE
  AND r1.datainfo.deletedbyinference = false AND r1.datainfo.invisible = FALSE
  AND r2.datainfo.deletedbyinference = false AND r2.datainfo.invisible = FALSE
  AND r1.resulttype.classname != 'other'
  AND r2.resulttype.classname != 'other'
  AND r1.resulttype.classname != r2.resulttype.classname
GROUP BY substr(target, 4);
-- Number of distinct cited results per citing result, counting only Cites relations
-- whose provenance action classid is sysimport:crosswalk:opencitations.
-- Ids are emitted without their first three characters (substr from position 4).
-- NOTE(review): the classname inequality below keeps only pairs of results with
-- different result types - confirm this is intended.
CREATE TABLE ${stats_db_name}.result_references_oc STORED AS PARQUET AS
SELECT substr(source, 4)                 AS id,
       count(DISTINCT substr(target, 4)) AS references
FROM ${openaire_db_name}.relation rel
         JOIN ${openaire_db_name}.result r1 ON rel.source = r1.id
         JOIN ${openaire_db_name}.result r2 ON r2.id = rel.target
WHERE reltype = 'resultResult'
  AND relClass = 'Cites'
  AND rel.datainfo.provenanceaction.classid = 'sysimport:crosswalk:opencitations'
  AND rel.datainfo.deletedbyinference = false AND rel.datainfo.invisible = FALSE
  AND r1.datainfo.deletedbyinference = false AND r1.datainfo.invisible = FALSE
  AND r2.datainfo.deletedbyinference = false AND r2.datainfo.invisible = FALSE
  AND r1.resulttype.classname != 'other'
  AND r2.resulttype.classname != 'other'
  AND r1.resulttype.classname != r2.resulttype.classname
GROUP BY substr(source, 4);

View File

@ -82,31 +82,31 @@ on r.id= tmp.id;
compute stats indi_funded_result_with_fundref; compute stats indi_funded_result_with_fundref;
create table indi_result_org_country_collab stored as parquet as -- create table indi_result_org_country_collab stored as parquet as
with tmp as -- with tmp as
(select o.id as id, o.country , ro.id as result,r.type from organization o -- (select o.id as id, o.country , ro.id as result,r.type from organization o
join result_organization ro on o.id=ro.organization -- join result_organization ro on o.id=ro.organization
join result r on r.id=ro.id where o.country <> 'UNKNOWN') -- join result r on r.id=ro.id where o.country <> 'UNKNOWN')
select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations -- select o1.id org1,o2.country country2, o1.type, count(distinct o1.result) as collaborations
from tmp as o1 -- from tmp as o1
join tmp as o2 on o1.result=o2.result -- join tmp as o2 on o1.result=o2.result
where o1.id<>o2.id and o1.country<>o2.country -- where o1.id<>o2.id and o1.country<>o2.country
group by o1.id, o1.type,o2.country; -- group by o1.id, o1.type,o2.country;
--
-- compute stats indi_result_org_country_collab;
compute stats indi_result_org_country_collab; -- create table indi_result_org_collab stored as parquet as
-- with tmp as
create table indi_result_org_collab stored as parquet as -- (select o.id, ro.id as result,r.type from organization o
with tmp as -- join result_organization ro on o.id=ro.organization
(select o.id, ro.id as result,r.type from organization o -- join result r on r.id=ro.id)
join result_organization ro on o.id=ro.organization -- select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations
join result r on r.id=ro.id) -- from tmp as o1
select o1.id org1,o2.id org2, o1.type, count(distinct o1.result) as collaborations -- join tmp as o2 on o1.result=o2.result
from tmp as o1 -- where o1.id<>o2.id
join tmp as o2 on o1.result=o2.result -- group by o1.id, o2.id, o1.type;
where o1.id<>o2.id --
group by o1.id, o2.id, o1.type; -- compute stats indi_result_org_collab;
compute stats indi_result_org_collab;
create table indi_funder_country_collab stored as parquet as create table indi_funder_country_collab stored as parquet as
with tmp as (select funder, project, country from organization_projects op with tmp as (select funder, project, country from organization_projects op

View File

@ -18,28 +18,45 @@ create table TARGET.result stored as parquet as
select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id) select * from SOURCE.result r where exists (select 1 from SOURCE.result_concepts rc where rc.id=r.id)
union all union all
select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in ( select * from SOURCE.result r where exists (select 1 from SOURCE.result_organization ro where ro.id=r.id and ro.organization in (
'openorgs____::759d59f05d77188faee99b7493b46805', 'openorgs____::b84450f9864182c67b8611b5593f4250', --"Athena Research and Innovation Center In Information Communication & Knowledge Technologies', --ARC"
'openorgs____::b84450f9864182c67b8611b5593f4250', 'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', --National Research Council
'openorgs____::d41cf6bd4ab1b1362a44397e0b95c975', 'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', --??? Not exists ??
'openorgs____::eadc8da90a546e98c03f896661a2e4d4', 'openorgs____::d169c7407dd417152596908d48c11460', --Masaryk University
'openorgs____::d2a09b9d5eabb10c95f9470e172d05d2', 'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', --University of Belgrade
'openorgs____::d169c7407dd417152596908d48c11460', 'openorgs____::2fb1e47b4612688d9de9169d579939a7', --University of Helsinki
'openorgs____::1ec924b1759bb16d0a02f2dad8689b21', 'openorgs____::759d59f05d77188faee99b7493b46805', --University of Minho
'openorgs____::2fb1e47b4612688d9de9169d579939a7', 'openorgs____::cad284878801b9465fa51a95b1d779db', --Universidad Politécnica de Madrid
'openorgs____::759d59f05d77188faee99b7493b46805', 'openorgs____::eadc8da90a546e98c03f896661a2e4d4', --University of Göttingen
'openorgs____::cad284878801b9465fa51a95b1d779db', 'openorgs____::c0286313e36479eff8676dba9b724b40', --National and Kapodistrian University of Athens
'openorgs____::eadc8da90a546e98c03f896661a2e4d4', -- 'openorgs____::c80a8243a5e5c620d7931c88d93bf17a', --Université Paris Diderot
'openorgs____::c0286313e36479eff8676dba9b724b40' 'openorgs____::c08634f0a6b0081c3dc6e6c93a4314f3', --Bielefeld University
-- ,'openorgs____::c80a8243a5e5c620d7931c88d93bf17a' -- Paris Diderot 'openorgs____::6fc85e4a8f7ecaf4b0c738d010e967ea', --University of Southern Denmark
'openorgs____::3d6122f87f9a97a99d8f6e3d73313720', --Humboldt-Universität zu Berlin
'openorgs____::16720ada63d0fa8ca41601feae7d1aa5', --TU Darmstadt
'openorgs____::ccc0a066b56d2cfaf90c2ae369df16f5', --KU Leuven
'openorgs____::4c6f119632adf789746f0a057ed73e90', --University of the Western Cape
'openorgs____::ec3665affa01aeafa28b7852c4176dbd', --Rudjer Boskovic Institute
'openorgs____::5f31346d444a7f06a28c880fb170b0f6', --Ghent University
'openorgs____::2dbe47117fd5409f9c61620813456632', --University of Luxembourg
'openorgs____::6445d7758d3a40c4d997953b6632a368' --National Institute of Informatics (NII)
) )) foo; ) )) foo;
compute stats TARGET.result; compute stats TARGET.result;
create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_citations stored as parquet as select * from SOURCE.result_citations orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_citations; compute stats TARGET.result_citations;
-- Copy the result_references_oc rows whose id matches a result already selected into this database.
CREATE TABLE TARGET.result_references_oc STORED AS PARQUET AS
SELECT *
FROM SOURCE.result_references_oc orig
WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
COMPUTE STATS TARGET.result_references_oc;
-- Same selection for the result_citations_oc rows.
CREATE TABLE TARGET.result_citations_oc STORED AS PARQUET AS
SELECT *
FROM SOURCE.result_citations_oc orig
WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
COMPUTE STATS TARGET.result_citations_oc;
create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_classifications stored as parquet as select * from SOURCE.result_classifications orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_classifications; compute stats TARGET.result_classifications;
-- Copy the result_apc rows whose id matches a result already selected into this database.
CREATE TABLE TARGET.result_apc STORED AS PARQUET AS
SELECT *
FROM SOURCE.result_apc orig
WHERE EXISTS (SELECT 1 FROM TARGET.result r WHERE r.id = orig.id);
COMPUTE STATS TARGET.result_apc;
create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_concepts stored as parquet as select * from SOURCE.result_concepts orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_concepts; compute stats TARGET.result_concepts;
@ -90,11 +107,6 @@ compute stats TARGET.result_sources;
create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id); create table TARGET.result_topics stored as parquet as select * from SOURCE.result_topics orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_topics; compute stats TARGET.result_topics;
create table TARGET.result_apc stored as parquet as select * from SOURCE.result_apc orig where exists (select 1 from TARGET.result r where r.id=orig.id);
compute stats TARGET.result_apc;
create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result); create view TARGET.foo1 as select * from SOURCE.result_result rr where rr.source in (select id from TARGET.result);
create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result); create view TARGET.foo2 as select * from SOURCE.result_result rr where rr.target in (select id from TARGET.result);
create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou; create table TARGET.result_result STORED AS PARQUET as select distinct * from (select * from TARGET.foo1 union all select * from TARGET.foo2) foufou;

View File

@ -127,6 +127,7 @@ CREATE TABLE ${stats_db_name}.result_organization STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'resultOrganization' WHERE r.reltype = 'resultOrganization'
and r.target like '50|%'
and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.result_projects STORED AS PARQUET AS

View File

@ -93,7 +93,7 @@ where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false;
CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS
SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization
FROM ${openaire_db_name}.relation r FROM ${openaire_db_name}.relation r
WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.datainfo.invisible=false; WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.target like '20|%' and r.datainfo.invisible=false;
-- datasource sources: -- datasource sources:
-- where the datasource info have been collected from. -- where the datasource info have been collected from.