forked from antonis.lempesis/dnet-hadoop
merge branch with fork master
This commit is contained in:
commit
9dd3ef22c5
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-build-assembly-resources</artifactId>
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-build-properties-maven-plugin</artifactId>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-code-style</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
|
||||
<packaging>jar</packaging>
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<packaging>pom</packaging>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
/** Provides serializable and throwing extensions to standard functional interfaces. */
|
||||
|
@ -10,6 +11,16 @@ public class FunctionalInterfaceSupport {
|
|||
private FunctionalInterfaceSupport() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Serializable consumer of any kind of objects. To be used withing spark processing pipelines when supplying
|
||||
* functions externally.
|
||||
*
|
||||
* @param <T>
|
||||
*/
|
||||
@FunctionalInterface
|
||||
public interface SerializableConsumer<T> extends Consumer<T>, Serializable {
|
||||
}
|
||||
|
||||
/**
|
||||
* Serializable supplier of any kind of objects. To be used withing spark processing pipelines when supplying
|
||||
* functions externally.
|
||||
|
|
|
@ -16,6 +16,12 @@ import com.google.common.collect.Iterables;
|
|||
import com.google.common.collect.Lists;
|
||||
import com.google.common.hash.Hashing;
|
||||
|
||||
/**
|
||||
* PacePerson tries to derive information from the fullname string of an author. Such informations are Names, Surnames
|
||||
* an Fullname split into terms. It provides also an additional field for the original data. The calculation of the
|
||||
* names and the surnames is not always possible. When it is impossible to assert which are the names and the surnames,
|
||||
* the lists are empty.
|
||||
*/
|
||||
public class PacePerson {
|
||||
|
||||
private static final String UTF8 = "UTF-8";
|
||||
|
@ -26,10 +32,19 @@ public class PacePerson {
|
|||
|
||||
private static Set<String> particles = null;
|
||||
|
||||
/**
|
||||
* Capitalizes a string
|
||||
*
|
||||
* @param s the string to capitalize
|
||||
* @return the input string with capital letter
|
||||
*/
|
||||
public static final String capitalize(final String s) {
|
||||
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a dot to a string with length equals to 1
|
||||
*/
|
||||
public static final String dotAbbreviations(final String s) {
|
||||
return s.length() == 1 ? s + "." : s;
|
||||
}
|
||||
|
@ -46,6 +61,12 @@ public class PacePerson {
|
|||
return h;
|
||||
}
|
||||
|
||||
/**
|
||||
* The constructor of the class. It fills the fields of the class basing on the input fullname.
|
||||
*
|
||||
* @param s the input string (fullname of the author)
|
||||
* @param aggressive set the string normalization type
|
||||
*/
|
||||
public PacePerson(String s, final boolean aggressive) {
|
||||
original = s;
|
||||
s = Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
|
@ -64,6 +85,7 @@ public class PacePerson {
|
|||
// s = s.replaceAll("[\\W&&[^,-]]", "");
|
||||
}
|
||||
|
||||
// if the string contains a comma, it can derive surname and name by splitting on it
|
||||
if (s.contains(",")) {
|
||||
final String[] arr = s.split(",");
|
||||
if (arr.length == 1) {
|
||||
|
@ -74,21 +96,23 @@ public class PacePerson {
|
|||
fullname.addAll(surname);
|
||||
fullname.addAll(name);
|
||||
}
|
||||
} else {
|
||||
} else { // otherwise, it should rely on CAPS terms and short terms
|
||||
fullname = splitTerms(s);
|
||||
|
||||
int lastInitialPosition = fullname.size();
|
||||
boolean hasSurnameInUpperCase = false;
|
||||
|
||||
// computes lastInitialPosition and hasSurnameInUpperCase
|
||||
for (int i = 0; i < fullname.size(); i++) {
|
||||
final String term = fullname.get(i);
|
||||
if (term.length() == 1) {
|
||||
lastInitialPosition = i;
|
||||
lastInitialPosition = i; // first word in the name longer than 1 (to avoid name with dots)
|
||||
} else if (term.equals(term.toUpperCase())) {
|
||||
hasSurnameInUpperCase = true;
|
||||
hasSurnameInUpperCase = true; // if one of the words is CAPS
|
||||
}
|
||||
}
|
||||
|
||||
// manages particular cases of fullnames
|
||||
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
|
||||
name = fullname.subList(0, lastInitialPosition + 1);
|
||||
surname = fullname.subList(lastInitialPosition + 1, fullname.size());
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
|
||||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class PacePersonTest {
|
||||
|
||||
@Test
|
||||
public void pacePersonTest1() {
|
||||
|
||||
PacePerson p = new PacePerson("Artini, Michele", false);
|
||||
assertEquals("Artini", p.getSurnameString());
|
||||
assertEquals("Michele", p.getNameString());
|
||||
assertEquals("Artini, Michele", p.getNormalisedFullname());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pacePersonTest2() {
|
||||
PacePerson p = new PacePerson("Michele G. Artini", false);
|
||||
assertEquals("Artini, Michele G.", p.getNormalisedFullname());
|
||||
assertEquals("Michele G", p.getNameString());
|
||||
assertEquals("Artini", p.getSurnameString());
|
||||
}
|
||||
|
||||
}
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
|
||||
|
|
|
@ -14,6 +14,7 @@ public class ModelConstants {
|
|||
public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource";
|
||||
public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions";
|
||||
public static final String DNET_COUNTRY_TYPE = "dnet:countries";
|
||||
public static final String DNET_REVIEW_LEVELS = "dnet:review_levels";
|
||||
|
||||
public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository";
|
||||
public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry";
|
||||
|
@ -25,6 +26,10 @@ public class ModelConstants {
|
|||
public static final String ORP_RESULTTYPE_CLASSID = "other";
|
||||
|
||||
public static final String RESULT_RESULT = "resultResult";
|
||||
/**
|
||||
* @deprecated Use {@link ModelConstants#RELATIONSHIP} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public static final String PUBLICATION_DATASET = "publicationDataset";
|
||||
public static final String IS_RELATED_TO = "isRelatedTo";
|
||||
public static final String SUPPLEMENT = "supplement";
|
||||
|
@ -34,6 +39,12 @@ public class ModelConstants {
|
|||
public static final String IS_PART_OF = "IsPartOf";
|
||||
public static final String HAS_PARTS = "HasParts";
|
||||
public static final String RELATIONSHIP = "relationship";
|
||||
public static final String CITATION = "citation";
|
||||
public static final String CITES = "cites";
|
||||
public static final String IS_CITED_BY = "IsCitedBy";
|
||||
public static final String REVIEW = "review";
|
||||
public static final String REVIEWS = "reviews";
|
||||
public static final String IS_REVIEWED_BY = "IsReviewedBy";
|
||||
|
||||
public static final String RESULT_PROJECT = "resultProject";
|
||||
public static final String OUTCOME = "outcome";
|
||||
|
|
|
@ -31,7 +31,7 @@ public class Instance implements Serializable {
|
|||
// typed results
|
||||
private Field<String> processingchargecurrency;
|
||||
|
||||
private Field<String> refereed; // peer-review status
|
||||
private Qualifier refereed; // peer-review status
|
||||
|
||||
public Field<String> getLicense() {
|
||||
return license;
|
||||
|
@ -113,11 +113,11 @@ public class Instance implements Serializable {
|
|||
this.processingchargecurrency = processingchargecurrency;
|
||||
}
|
||||
|
||||
public Field<String> getRefereed() {
|
||||
public Qualifier getRefereed() {
|
||||
return refereed;
|
||||
}
|
||||
|
||||
public void setRefereed(Field<String> refereed) {
|
||||
public void setRefereed(Qualifier refereed) {
|
||||
this.refereed = refereed;
|
||||
}
|
||||
|
||||
|
|
|
@ -254,28 +254,25 @@ public class Result extends OafEntity implements Serializable {
|
|||
final StructuredProperty p = baseMainTitle;
|
||||
title = title.stream().filter(t -> t != p).collect(Collectors.toList());
|
||||
}
|
||||
//
|
||||
//
|
||||
// title.remove(baseMainTitle);
|
||||
}
|
||||
|
||||
StructuredProperty newMainTitle = null;
|
||||
if (r.getTitle() != null) {
|
||||
newMainTitle = getMainTitle(r.getTitle());
|
||||
if (newMainTitle != null) {
|
||||
if (newMainTitle != null && title != null) {
|
||||
final StructuredProperty p = newMainTitle;
|
||||
title = title.stream().filter(t -> t != p).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
// r.getTitle().remove(newMainTitle);
|
||||
}
|
||||
|
||||
if (newMainTitle != null && compareTrust(this, r) < 0)
|
||||
if (newMainTitle != null && compareTrust(this, r) < 0) {
|
||||
baseMainTitle = newMainTitle;
|
||||
}
|
||||
|
||||
title = mergeLists(title, r.getTitle());
|
||||
if (title != null && baseMainTitle != null)
|
||||
if (title != null && baseMainTitle != null) {
|
||||
title.add(baseMainTitle);
|
||||
}
|
||||
|
||||
relevantdate = mergeLists(relevantdate, r.getRelevantdate());
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-actionmanager</artifactId>
|
||||
|
||||
|
|
|
@ -96,12 +96,21 @@ public class ProtoConverter implements Serializable {
|
|||
.stream()
|
||||
.distinct()
|
||||
.collect(Collectors.toCollection(ArrayList::new)) : null);
|
||||
i.setRefereed(mapStringField(ri.getRefereed()));
|
||||
i.setRefereed(mapRefereed(ri.getRefereed()));
|
||||
i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
|
||||
i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
|
||||
return i;
|
||||
}
|
||||
|
||||
private static Qualifier mapRefereed(FieldTypeProtos.StringField refereed) {
|
||||
Qualifier q = new Qualifier();
|
||||
q.setClassid(refereed.getValue());
|
||||
q.setSchemename(refereed.getValue());
|
||||
q.setSchemeid("dnet:review_levels");
|
||||
q.setSchemename("dnet:review_levels");
|
||||
return q;
|
||||
}
|
||||
|
||||
private static List<ExternalReference> convertExternalRefs(OafProtos.Oaf oaf) {
|
||||
ResultProtos.Result r = oaf.getEntity().getResult();
|
||||
if (r.getExternalReferenceCount() > 0) {
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-aggregation</artifactId>
|
||||
|
||||
|
|
|
@ -8,6 +8,7 @@ import java.io.File;
|
|||
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
@ -19,6 +20,7 @@ import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
|
|||
import eu.dnetlib.message.Message;
|
||||
import eu.dnetlib.message.MessageManager;
|
||||
|
||||
@Disabled
|
||||
public class DnetCollectorWorkerApplicationTests {
|
||||
|
||||
private final ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class);
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -1,66 +1,70 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>dhp-broker-events</artifactId>
|
||||
<artifactId>dhp-broker-events</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-hive_2.11</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-hive_2.11</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-pace-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>dom4j</groupId>
|
||||
<artifactId>dom4j</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>jaxen</groupId>
|
||||
<artifactId>jaxen</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>dom4j</groupId>
|
||||
<artifactId>dom4j</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>jaxen</groupId>
|
||||
<artifactId>jaxen</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaire-broker-common</artifactId>
|
||||
<version>[2.0.1,3.0.0)</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaire-broker-common</artifactId>
|
||||
<version>[2.0.1,3.0.0)</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
|
@ -37,7 +37,7 @@ public class EventFactory {
|
|||
final Map<String, Object> map = createMapFromResult(updateInfo);
|
||||
|
||||
final String eventId = calculateEventId(
|
||||
updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId().get(0),
|
||||
updateInfo.getTopicPath(), updateInfo.getTarget().getResult().getOriginalId().get(0),
|
||||
updateInfo.getHighlightValueAsString());
|
||||
|
||||
res.setEventId(eventId);
|
||||
|
@ -54,8 +54,8 @@ public class EventFactory {
|
|||
private static Map<String, Object> createMapFromResult(final UpdateInfo<?> updateInfo) {
|
||||
final Map<String, Object> map = new HashMap<>();
|
||||
|
||||
final Result source = updateInfo.getSource();
|
||||
final Result target = updateInfo.getTarget();
|
||||
final Result source = updateInfo.getSource().getResult();
|
||||
final Result target = updateInfo.getTarget().getResult();
|
||||
|
||||
final List<KeyValue> collectedFrom = target.getCollectedfrom();
|
||||
if (collectedFrom.size() == 1) {
|
||||
|
|
|
@ -3,15 +3,9 @@ package eu.dnetlib.dhp.broker.oa;
|
|||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
|
@ -26,94 +20,38 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.model.Event;
|
||||
import eu.dnetlib.dhp.broker.model.EventFactory;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsRelatedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetReferences;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMoreProject;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsReferencedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsRelatedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationReferences;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMissingSoftware;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMoreSoftware;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPid;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSubject;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
|
||||
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||
import eu.dnetlib.dhp.broker.oa.util.EventFinder;
|
||||
import eu.dnetlib.dhp.broker.oa.util.EventGroup;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ResultAggregator;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ResultGroup;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultAggregator;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedEntityFactory;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelationsAggregator;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class GenerateEventsApplication {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class);
|
||||
|
||||
// Simple Matchers
|
||||
private static final UpdateMatcher<Result, ?> enrichMissingAbstract = new EnrichMissingAbstract();
|
||||
private static final UpdateMatcher<Result, ?> enrichMissingAuthorOrcid = new EnrichMissingAuthorOrcid();
|
||||
private static final UpdateMatcher<Result, ?> enrichMissingOpenAccess = new EnrichMissingOpenAccess();
|
||||
private static final UpdateMatcher<Result, ?> enrichMissingPid = new EnrichMissingPid();
|
||||
private static final UpdateMatcher<Result, ?> enrichMissingPublicationDate = new EnrichMissingPublicationDate();
|
||||
private static final UpdateMatcher<Result, ?> enrichMissingSubject = new EnrichMissingSubject();
|
||||
private static final UpdateMatcher<Result, ?> enrichMoreOpenAccess = new EnrichMoreOpenAccess();
|
||||
private static final UpdateMatcher<Result, ?> enrichMorePid = new EnrichMorePid();
|
||||
private static final UpdateMatcher<Result, ?> enrichMoreSubject = new EnrichMoreSubject();
|
||||
|
||||
// Advanced matchers
|
||||
private static final UpdateMatcher<Pair<Result, List<Project>>, ?> enrichMissingProject = new EnrichMissingProject();
|
||||
private static final UpdateMatcher<Pair<Result, List<Project>>, ?> enrichMoreProject = new EnrichMoreProject();
|
||||
|
||||
private static final UpdateMatcher<Pair<Result, List<Software>>, ?> enrichMissingSoftware = new EnrichMissingSoftware();
|
||||
private static final UpdateMatcher<Pair<Result, List<Software>>, ?> enrichMoreSoftware = new EnrichMoreSoftware();
|
||||
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMisissingPublicationIsRelatedTo = new EnrichMissingPublicationIsRelatedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsReferencedBy =
|
||||
new EnrichMissingPublicationIsReferencedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationReferences = new EnrichMissingPublicationReferences();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedTo =
|
||||
new EnrichMissingPublicationIsSupplementedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedBy =
|
||||
new EnrichMissingPublicationIsSupplementedBy();
|
||||
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMisissingDatasetIsRelatedTo =
|
||||
new EnrichMissingDatasetIsRelatedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsReferencedBy =
|
||||
new EnrichMissingDatasetIsReferencedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetReferences =
|
||||
new EnrichMissingDatasetReferences();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedTo =
|
||||
new EnrichMissingDatasetIsSupplementedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedBy =
|
||||
new EnrichMissingDatasetIsSupplementedBy();
|
||||
|
||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(GenerateEventsApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json")));
|
||||
.toString(
|
||||
GenerateEventsApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_broker_events.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
|
@ -128,8 +66,16 @@ public class GenerateEventsApplication {
|
|||
final String eventsPath = parser.get("eventsPath");
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
log.info("isLookupUrl: {}", isLookupUrl);
|
||||
|
||||
final String dedupConfigProfileId = parser.get("dedupConfProfile");
|
||||
log.info("dedupConfigProfileId: {}", dedupConfigProfileId);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId);
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
||||
removeOutputDir(spark, eventsPath);
|
||||
|
@ -137,14 +83,10 @@ public class GenerateEventsApplication {
|
|||
final Dataset<Event> all = spark.emptyDataset(Encoders.kryo(Event.class));
|
||||
|
||||
for (final Class<? extends Result> r1 : BrokerConstants.RESULT_CLASSES) {
|
||||
all.union(generateSimpleEvents(spark, graphPath, r1));
|
||||
|
||||
for (final Class<? extends Result> r2 : BrokerConstants.RESULT_CLASSES) {
|
||||
all.union(generateRelationEvents(spark, graphPath, r1, r2));
|
||||
}
|
||||
all.union(generateEvents(spark, graphPath, r1, dedupConfig));
|
||||
}
|
||||
|
||||
all.write().mode(SaveMode.Overwrite).json(eventsPath);
|
||||
all.write().mode(SaveMode.Overwrite).option("compression", "gzip").json(eventsPath);
|
||||
});
|
||||
|
||||
}
|
||||
|
@ -153,155 +95,84 @@ public class GenerateEventsApplication {
|
|||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
private static <R extends Result> Dataset<Event> generateSimpleEvents(final SparkSession spark,
|
||||
final String graphPath,
|
||||
final Class<R> resultClazz) {
|
||||
|
||||
final Dataset<Result> results = readPath(spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), Result.class)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference());
|
||||
|
||||
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||
|
||||
final TypedColumn<Tuple2<Result, Relation>, ResultGroup> aggr = new ResultAggregator().toColumn();
|
||||
|
||||
return results.joinWith(rels, results.col("id").equalTo(rels.col("source")), "inner")
|
||||
.groupByKey((MapFunction<Tuple2<Result, Relation>, String>) t -> t._2.getTarget(), Encoders.STRING())
|
||||
.agg(aggr)
|
||||
.map((MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class))
|
||||
.filter(ResultGroup::isValid)
|
||||
.map((MapFunction<ResultGroup, EventGroup>) g -> GenerateEventsApplication.generateSimpleEvents(g), Encoders.kryo(EventGroup.class))
|
||||
.flatMap(group -> group.getData().iterator(), Encoders.kryo(Event.class));
|
||||
}
|
||||
|
||||
private static EventGroup generateSimpleEvents(final ResultGroup results) {
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
|
||||
for (final Result target : results.getData()) {
|
||||
list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, results.getData()));
|
||||
list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, results.getData()));
|
||||
list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, results.getData()));
|
||||
list.addAll(enrichMissingPid.searchUpdatesForRecord(target, results.getData()));
|
||||
list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, results.getData()));
|
||||
list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, results.getData()));
|
||||
list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, results.getData()));
|
||||
list.addAll(enrichMorePid.searchUpdatesForRecord(target, results.getData()));
|
||||
list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, results.getData()));
|
||||
}
|
||||
|
||||
final EventGroup events = new EventGroup();
|
||||
list.stream().map(EventFactory::newBrokerEvent).forEach(events::addElement);
|
||||
return events;
|
||||
}
|
||||
|
||||
private static <SRC extends Result, TRG extends OafEntity> Dataset<Event> generateRelationEvents(final SparkSession spark,
|
||||
private static <SRC extends Result> Dataset<Event> generateEvents(
|
||||
final SparkSession spark,
|
||||
final String graphPath,
|
||||
final Class<SRC> sourceClass,
|
||||
final Class<TRG> targetClass) {
|
||||
final DedupConfig dedupConfig) {
|
||||
|
||||
final Dataset<SRC> sources = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference());
|
||||
|
||||
final Dataset<TRG> targets = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), targetClass);
|
||||
final Dataset<ResultWithRelations> results = expandResultsWithRelations(spark, graphPath, sourceClass);
|
||||
|
||||
final Dataset<Relation> mergedRels = readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||
|
||||
final TypedColumn<Tuple2<ResultWithRelations, Relation>, ResultGroup> aggr = new ResultAggregator()
|
||||
.toColumn();
|
||||
|
||||
return results
|
||||
.joinWith(mergedRels, results.col("result.id").equalTo(mergedRels.col("source")), "inner")
|
||||
.groupByKey(
|
||||
(MapFunction<Tuple2<ResultWithRelations, Relation>, String>) t -> t._2.getTarget(), Encoders.STRING())
|
||||
.agg(aggr)
|
||||
.map((MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class))
|
||||
.filter(ResultGroup::isValid)
|
||||
.map(
|
||||
(MapFunction<ResultGroup, EventGroup>) g -> EventFinder.generateEvents(g, dedupConfig),
|
||||
Encoders.kryo(EventGroup.class))
|
||||
.flatMap(group -> group.getData().iterator(), Encoders.kryo(Event.class));
|
||||
}
|
||||
|
||||
private static <SRC extends Result> Dataset<ResultWithRelations> expandResultsWithRelations(
|
||||
final SparkSession spark,
|
||||
final String graphPath,
|
||||
final Class<SRC> sourceClass) {
|
||||
final Dataset<Project> projects = readPath(spark, graphPath + "/project", Project.class);
|
||||
final Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> datasets = readPath(
|
||||
spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
|
||||
final Dataset<Software> softwares = readPath(spark, graphPath + "/software", Software.class);
|
||||
final Dataset<Publication> publications = readPath(spark, graphPath + "/publication", Publication.class);
|
||||
|
||||
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||
|
||||
if (targetClass == Project.class) {
|
||||
// TODO join using: generateProjectsEvents
|
||||
} else if (targetClass == Software.class) {
|
||||
// TODO join using: generateSoftwareEvents
|
||||
} else if (targetClass == Publication.class) {
|
||||
// TODO join using: generatePublicationRelatedEvents
|
||||
} else if (targetClass == eu.dnetlib.dhp.schema.oaf.Dataset.class) {
|
||||
// TODO join using: generateDatasetRelatedEvents
|
||||
}
|
||||
final Dataset<ResultWithRelations> r0 = readPath(
|
||||
spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), Result.class)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||
.map(r -> new ResultWithRelations(r), Encoders.kryo(ResultWithRelations.class));
|
||||
final Dataset<ResultWithRelations> r1 = join(r0, rels, relatedEntities(projects, rels, RelatedProject.class));
|
||||
final Dataset<ResultWithRelations> r2 = join(r1, rels, relatedEntities(softwares, rels, RelatedProject.class));
|
||||
final Dataset<ResultWithRelations> r3 = join(r2, rels, relatedEntities(datasets, rels, RelatedProject.class));
|
||||
final Dataset<ResultWithRelations> r4 = join(
|
||||
r3, rels, relatedEntities(publications, rels, RelatedProject.class));
|
||||
;
|
||||
|
||||
return null;
|
||||
return r4;
|
||||
}
|
||||
|
||||
private List<Event> generateProjectsEvents(final Collection<Pair<Result, List<Project>>> childrenWithProjects) {
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
|
||||
for (final Pair<Result, List<Project>> target : childrenWithProjects) {
|
||||
list.addAll(enrichMissingProject.searchUpdatesForRecord(target, childrenWithProjects));
|
||||
list.addAll(enrichMoreProject.searchUpdatesForRecord(target, childrenWithProjects));
|
||||
}
|
||||
|
||||
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||
private static <T, RT> Dataset<RT> relatedEntities(final Dataset<T> targets,
|
||||
final Dataset<Relation> rels,
|
||||
final Class<RT> clazz) {
|
||||
return rels
|
||||
.joinWith(targets, targets.col("id").equalTo(rels.col("target")), "inner")
|
||||
.map(
|
||||
t -> RelatedEntityFactory.newRelatedEntity(t._1.getSource(), t._1.getRelType(), t._2, clazz),
|
||||
Encoders.kryo(clazz));
|
||||
}
|
||||
|
||||
private List<Event> generateSoftwareEvents(final Collection<Pair<Result, List<Software>>> childrenWithSoftwares) {
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
private static <T> Dataset<ResultWithRelations> join(final Dataset<ResultWithRelations> sources,
|
||||
final Dataset<Relation> rels,
|
||||
final Dataset<T> typedRels) {
|
||||
|
||||
for (final Pair<Result, List<Software>> target : childrenWithSoftwares) {
|
||||
list.addAll(enrichMissingSoftware.searchUpdatesForRecord(target, childrenWithSoftwares));
|
||||
list.addAll(enrichMoreSoftware.searchUpdatesForRecord(target, childrenWithSoftwares));
|
||||
}
|
||||
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private List<Event> generatePublicationRelatedEvents(final String relType,
|
||||
final Collection<Pair<Result, Map<String, List<Publication>>>> childrenWithRels) {
|
||||
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
|
||||
final List<Pair<Result, List<Publication>>> cleanedChildrens = childrenWithRels
|
||||
.stream()
|
||||
.filter(p -> p.getRight().containsKey(relType))
|
||||
.map(p -> Pair.of(p.getLeft(), p.getRight().get(relType)))
|
||||
.filter(p -> p.getRight().size() > 0)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
for (final Pair<Result, List<Publication>> target : cleanedChildrens) {
|
||||
if (relType.equals("isRelatedTo")) {
|
||||
list.addAll(enrichMisissingPublicationIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("references")) {
|
||||
list.addAll(enrichMissingPublicationReferences.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("isReferencedBy")) {
|
||||
list.addAll(enrichMissingPublicationIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("isSupplementedTo")) {
|
||||
list.addAll(enrichMissingPublicationIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("isSupplementedBy")) {
|
||||
list.addAll(enrichMissingPublicationIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
}
|
||||
}
|
||||
|
||||
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||
|
||||
}
|
||||
|
||||
private List<Event> generateDatasetRelatedEvents(final String relType,
|
||||
final Collection<Pair<Result, Map<String, List<eu.dnetlib.dhp.schema.oaf.Dataset>>>> childrenWithRels) {
|
||||
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
|
||||
final List<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>> cleanedChildrens = childrenWithRels
|
||||
.stream()
|
||||
.filter(p -> p.getRight().containsKey(relType))
|
||||
.map(p -> Pair.of(p.getLeft(), p.getRight().get(relType)))
|
||||
.filter(p -> p.getRight().size() > 0)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
for (final Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>> target : cleanedChildrens) {
|
||||
if (relType.equals("isRelatedTo")) {
|
||||
list.addAll(enrichMisissingDatasetIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("references")) {
|
||||
list.addAll(enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("isReferencedBy")) {
|
||||
list.addAll(enrichMissingDatasetIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("isSupplementedTo")) {
|
||||
list.addAll(enrichMissingDatasetIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("isSupplementedBy")) {
|
||||
list.addAll(enrichMissingDatasetIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
}
|
||||
}
|
||||
|
||||
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||
final TypedColumn<Tuple2<ResultWithRelations, T>, ResultWithRelations> aggr = new ResultWithRelationsAggregator<T>()
|
||||
.toColumn();
|
||||
;
|
||||
|
||||
return sources
|
||||
.joinWith(typedRels, sources.col("result.id").equalTo(rels.col("source")), "left_outer")
|
||||
.groupByKey(
|
||||
(MapFunction<Tuple2<ResultWithRelations, T>, String>) t -> t._1.getResult().getId(), Encoders.STRING())
|
||||
.agg(aggr)
|
||||
.map(t -> t._2, Encoders.kryo(ResultWithRelations.class));
|
||||
}
|
||||
|
||||
public static <R> Dataset<R> readPath(
|
||||
|
@ -313,4 +184,23 @@ public class GenerateEventsApplication {
|
|||
.textFile(inputPath)
|
||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception {
|
||||
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
|
||||
final String conf = isLookUpService
|
||||
.getResourceProfileByQuery(
|
||||
String
|
||||
.format(
|
||||
"for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()",
|
||||
profId));
|
||||
|
||||
final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class);
|
||||
dedupConfig.getPace().initModel();
|
||||
dedupConfig.getPace().initTranslationMap();
|
||||
// dedupConfig.getWf().setConfigurationId("???");
|
||||
|
||||
return dedupConfig;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -6,28 +6,48 @@ import java.util.Collection;
|
|||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.broker.objects.Publication;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
|
||||
public abstract class UpdateMatcher<K, T> {
|
||||
public abstract class UpdateMatcher<T> {
|
||||
|
||||
private final boolean multipleUpdate;
|
||||
private final Function<T, Topic> topicFunction;
|
||||
private final BiConsumer<Publication, T> compileHighlightFunction;
|
||||
private final Function<T, String> highlightToStringFunction;
|
||||
|
||||
public UpdateMatcher(final boolean multipleUpdate) {
|
||||
public UpdateMatcher(final boolean multipleUpdate, final Function<T, Topic> topicFunction,
|
||||
final BiConsumer<Publication, T> compileHighlightFunction,
|
||||
final Function<T, String> highlightToStringFunction) {
|
||||
this.multipleUpdate = multipleUpdate;
|
||||
this.topicFunction = topicFunction;
|
||||
this.compileHighlightFunction = compileHighlightFunction;
|
||||
this.highlightToStringFunction = highlightToStringFunction;
|
||||
}
|
||||
|
||||
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final K res, final Collection<K> others) {
|
||||
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final ResultWithRelations res,
|
||||
final Collection<ResultWithRelations> others,
|
||||
final DedupConfig dedupConfig) {
|
||||
|
||||
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
|
||||
|
||||
for (final K source : others) {
|
||||
for (final ResultWithRelations source : others) {
|
||||
if (source != res) {
|
||||
for (final UpdateInfo<T> info : findUpdates(source, res)) {
|
||||
for (final T hl : findDifferences(source, res)) {
|
||||
final Topic topic = getTopicFunction().apply(hl);
|
||||
final UpdateInfo<T> info = new UpdateInfo<>(topic, hl, source, res, getCompileHighlightFunction(),
|
||||
getHighlightToStringFunction(),
|
||||
dedupConfig);
|
||||
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
|
||||
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
|
||||
} else {
|
||||
|
@ -51,11 +71,7 @@ public abstract class UpdateMatcher<K, T> {
|
|||
}
|
||||
}
|
||||
|
||||
protected abstract List<UpdateInfo<T>> findUpdates(K source, K target);
|
||||
|
||||
protected abstract UpdateInfo<T> generateUpdateInfo(final T highlightValue,
|
||||
final K source,
|
||||
final K target);
|
||||
protected abstract List<T> findDifferences(ResultWithRelations source, ResultWithRelations target);
|
||||
|
||||
protected static boolean isMissing(final List<Field<String>> list) {
|
||||
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue());
|
||||
|
@ -65,4 +81,20 @@ public abstract class UpdateMatcher<K, T> {
|
|||
return field == null || StringUtils.isBlank(field.getValue());
|
||||
}
|
||||
|
||||
public boolean isMultipleUpdate() {
|
||||
return multipleUpdate;
|
||||
}
|
||||
|
||||
public Function<T, Topic> getTopicFunction() {
|
||||
return topicFunction;
|
||||
}
|
||||
|
||||
public BiConsumer<Publication, T> getCompileHighlightFunction() {
|
||||
return compileHighlightFunction;
|
||||
}
|
||||
|
||||
public Function<T, String> getHighlightToStringFunction() {
|
||||
return highlightToStringFunction;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,59 +5,47 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public abstract class AbstractEnrichMissingDataset
|
||||
extends UpdateMatcher<Pair<Result, List<Dataset>>, eu.dnetlib.broker.objects.Dataset> {
|
||||
|
||||
private final Topic topic;
|
||||
extends UpdateMatcher<eu.dnetlib.broker.objects.Dataset> {
|
||||
|
||||
public AbstractEnrichMissingDataset(final Topic topic) {
|
||||
super(true);
|
||||
this.topic = topic;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Dataset>> findUpdates(
|
||||
final Pair<Result, List<Dataset>> source,
|
||||
final Pair<Result, List<Dataset>> target) {
|
||||
|
||||
final Set<String> existingDatasets = target
|
||||
.getRight()
|
||||
.stream()
|
||||
.map(Dataset::getId)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getRight()
|
||||
.stream()
|
||||
.filter(d -> !existingDatasets.contains(d.getId()))
|
||||
.map(ConversionUtils::oafDatasetToBrokerDataset)
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
protected final UpdateInfo<eu.dnetlib.broker.objects.Dataset> generateUpdateInfo(
|
||||
final eu.dnetlib.broker.objects.Dataset highlightValue,
|
||||
final Pair<Result, List<Dataset>> source,
|
||||
final Pair<Result, List<Dataset>> target) {
|
||||
return new UpdateInfo<>(
|
||||
getTopic(),
|
||||
highlightValue, source.getLeft(), target.getLeft(),
|
||||
super(true,
|
||||
rel -> topic,
|
||||
(p, rel) -> p.getDatasets().add(rel),
|
||||
rel -> rel.getInstances().get(0).getUrl());
|
||||
}
|
||||
|
||||
public Topic getTopic() {
|
||||
return topic;
|
||||
protected abstract boolean filterByType(String relType);
|
||||
|
||||
@Override
|
||||
protected final List<eu.dnetlib.broker.objects.Dataset> findDifferences(
|
||||
final ResultWithRelations source,
|
||||
final ResultWithRelations target) {
|
||||
|
||||
final Set<String> existingDatasets = target
|
||||
.getDatasets()
|
||||
.stream()
|
||||
.filter(rel -> filterByType(rel.getRelType()))
|
||||
.map(RelatedDataset::getRelDataset)
|
||||
.map(Dataset::getId)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getDatasets()
|
||||
.stream()
|
||||
.filter(rel -> filterByType(rel.getRelType()))
|
||||
.map(RelatedDataset::getRelDataset)
|
||||
.filter(d -> !existingDatasets.contains(d.getId()))
|
||||
.map(ConversionUtils::oafDatasetToBrokerDataset)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDat
|
|||
super(Topic.ENRICH_MISSING_DATASET_IS_REFERENCED_BY);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isReferencedBy");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDatase
|
|||
super(Topic.ENRICH_MISSING_DATASET_IS_RELATED_TO);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isRelatedTo");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingD
|
|||
super(Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isSupplementedBy");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingD
|
|||
super(Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isSupplementedTo");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset
|
|||
super(Topic.ENRICH_MISSING_DATASET_REFERENCES);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("references");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,48 +5,34 @@ import java.util.Arrays;
|
|||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import eu.dnetlib.broker.objects.Project;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
|
||||
public class EnrichMissingProject
|
||||
extends UpdateMatcher<Pair<Result, List<Project>>, eu.dnetlib.broker.objects.Project> {
|
||||
extends UpdateMatcher<eu.dnetlib.broker.objects.Project> {
|
||||
|
||||
public EnrichMissingProject() {
|
||||
super(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final Pair<Result, List<Project>> source,
|
||||
final Pair<Result, List<Project>> target) {
|
||||
|
||||
if (source.getRight().isEmpty()) {
|
||||
return Arrays.asList();
|
||||
} else {
|
||||
return target
|
||||
.getRight()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafProjectToBrokerProject)
|
||||
.map(p -> generateUpdateInfo(p, source, target))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo(
|
||||
final eu.dnetlib.broker.objects.Project highlightValue,
|
||||
final Pair<Result, List<Project>> source,
|
||||
final Pair<Result, List<Project>> target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MISSING_PROJECT,
|
||||
highlightValue, source.getLeft(), target.getLeft(),
|
||||
super(true,
|
||||
prj -> Topic.ENRICH_MISSING_PROJECT,
|
||||
(p, prj) -> p.getProjects().add(prj),
|
||||
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Project> findDifferences(final ResultWithRelations source, final ResultWithRelations target) {
|
||||
if (source.getProjects().isEmpty()) {
|
||||
return Arrays.asList();
|
||||
} else {
|
||||
return target
|
||||
.getProjects()
|
||||
.stream()
|
||||
.map(RelatedProject::getRelProject)
|
||||
.map(ConversionUtils::oafProjectToBrokerProject)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,50 +5,40 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class EnrichMoreProject extends UpdateMatcher<Pair<Result, List<Project>>, eu.dnetlib.broker.objects.Project> {
|
||||
public class EnrichMoreProject extends UpdateMatcher<eu.dnetlib.broker.objects.Project> {
|
||||
|
||||
public EnrichMoreProject() {
|
||||
super(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final Pair<Result, List<Project>> source,
|
||||
final Pair<Result, List<Project>> target) {
|
||||
|
||||
final Set<String> existingProjects = source
|
||||
.getRight()
|
||||
.stream()
|
||||
.map(Project::getId)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return target
|
||||
.getRight()
|
||||
.stream()
|
||||
.filter(p -> !existingProjects.contains(p.getId()))
|
||||
.map(ConversionUtils::oafProjectToBrokerProject)
|
||||
.map(p -> generateUpdateInfo(p, source, target))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo(
|
||||
final eu.dnetlib.broker.objects.Project highlightValue,
|
||||
final Pair<Result, List<Project>> source,
|
||||
final Pair<Result, List<Project>> target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MORE_PROJECT,
|
||||
highlightValue, source.getLeft(), target.getLeft(),
|
||||
super(true,
|
||||
prj -> Topic.ENRICH_MORE_PROJECT,
|
||||
(p, prj) -> p.getProjects().add(prj),
|
||||
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<eu.dnetlib.broker.objects.Project> findDifferences(final ResultWithRelations source,
|
||||
final ResultWithRelations target) {
|
||||
|
||||
final Set<String> existingProjects = source
|
||||
.getProjects()
|
||||
.stream()
|
||||
.map(RelatedProject::getRelProject)
|
||||
.map(Project::getId)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return target
|
||||
.getProjects()
|
||||
.stream()
|
||||
.map(RelatedProject::getRelProject)
|
||||
.filter(p -> !existingProjects.contains(p.getId()))
|
||||
.map(ConversionUtils::oafProjectToBrokerProject)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,59 +5,47 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public abstract class AbstractEnrichMissingPublication
|
||||
extends UpdateMatcher<Pair<Result, List<Publication>>, eu.dnetlib.broker.objects.Publication> {
|
||||
|
||||
private final Topic topic;
|
||||
extends UpdateMatcher<eu.dnetlib.broker.objects.Publication> {
|
||||
|
||||
public AbstractEnrichMissingPublication(final Topic topic) {
|
||||
super(true);
|
||||
this.topic = topic;
|
||||
super(true,
|
||||
rel -> topic,
|
||||
(p, rel) -> p.getPublications().add(rel),
|
||||
rel -> rel.getInstances().get(0).getUrl());
|
||||
|
||||
}
|
||||
|
||||
protected abstract boolean filterByType(String relType);
|
||||
|
||||
@Override
|
||||
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Publication>> findUpdates(
|
||||
final Pair<Result, List<Publication>> source,
|
||||
final Pair<Result, List<Publication>> target) {
|
||||
protected final List<eu.dnetlib.broker.objects.Publication> findDifferences(
|
||||
final ResultWithRelations source,
|
||||
final ResultWithRelations target) {
|
||||
|
||||
final Set<String> existingPublications = target
|
||||
.getRight()
|
||||
.getPublications()
|
||||
.stream()
|
||||
.filter(rel -> filterByType(rel.getRelType()))
|
||||
.map(RelatedPublication::getRelPublication)
|
||||
.map(Publication::getId)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getRight()
|
||||
.getPublications()
|
||||
.stream()
|
||||
.filter(rel -> filterByType(rel.getRelType()))
|
||||
.map(RelatedPublication::getRelPublication)
|
||||
.filter(d -> !existingPublications.contains(d.getId()))
|
||||
.map(ConversionUtils::oafResultToBrokerPublication)
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
protected final UpdateInfo<eu.dnetlib.broker.objects.Publication> generateUpdateInfo(
|
||||
final eu.dnetlib.broker.objects.Publication highlightValue,
|
||||
final Pair<Result, List<Publication>> source,
|
||||
final Pair<Result, List<Publication>> target) {
|
||||
return new UpdateInfo<>(
|
||||
getTopic(),
|
||||
highlightValue, source.getLeft(), target.getLeft(),
|
||||
(p, rel) -> p.getPublications().add(rel),
|
||||
rel -> rel.getInstances().get(0).getUrl());
|
||||
}
|
||||
|
||||
public Topic getTopic() {
|
||||
return topic;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,4 +9,8 @@ public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissin
|
|||
super(Topic.ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isReferencedBy");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPu
|
|||
super(Topic.ENRICH_MISSING_PUBLICATION_IS_RELATED_TO);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isRelatedTo");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,4 +9,8 @@ public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMiss
|
|||
super(Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isSupplementedBy");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMiss
|
|||
super(Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isSupplementedTo");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPub
|
|||
super(Topic.ENRICH_MISSING_PUBLICATION_REFERENCES);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("references");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,49 +5,37 @@ import java.util.Arrays;
|
|||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
|
||||
public class EnrichMissingSoftware
|
||||
extends UpdateMatcher<Pair<Result, List<Software>>, eu.dnetlib.broker.objects.Software> {
|
||||
extends UpdateMatcher<eu.dnetlib.broker.objects.Software> {
|
||||
|
||||
public EnrichMissingSoftware() {
|
||||
super(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates(
|
||||
final Pair<Result, List<Software>> source,
|
||||
final Pair<Result, List<Software>> target) {
|
||||
|
||||
if (source.getRight().isEmpty()) {
|
||||
return Arrays.asList();
|
||||
} else {
|
||||
return target
|
||||
.getRight()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafSoftwareToBrokerSoftware)
|
||||
.map(p -> generateUpdateInfo(p, source, target))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo(
|
||||
final eu.dnetlib.broker.objects.Software highlightValue,
|
||||
final Pair<Result, List<Software>> source,
|
||||
final Pair<Result, List<Software>> target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MISSING_SOFTWARE,
|
||||
highlightValue, source.getLeft(), target.getLeft(),
|
||||
super(true,
|
||||
s -> Topic.ENRICH_MISSING_SOFTWARE,
|
||||
(p, s) -> p.getSoftwares().add(s),
|
||||
s -> s.getName());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<eu.dnetlib.broker.objects.Software> findDifferences(
|
||||
final ResultWithRelations source,
|
||||
final ResultWithRelations target) {
|
||||
|
||||
if (source.getSoftwares().isEmpty()) {
|
||||
return Arrays.asList();
|
||||
} else {
|
||||
return target
|
||||
.getSoftwares()
|
||||
.stream()
|
||||
.map(RelatedSoftware::getRelSoftware)
|
||||
.map(ConversionUtils::oafSoftwareToBrokerSoftware)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,52 +5,42 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
|
||||
public class EnrichMoreSoftware
|
||||
extends UpdateMatcher<Pair<Result, List<Software>>, eu.dnetlib.broker.objects.Software> {
|
||||
extends UpdateMatcher<eu.dnetlib.broker.objects.Software> {
|
||||
|
||||
public EnrichMoreSoftware() {
|
||||
super(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates(
|
||||
final Pair<Result, List<Software>> source,
|
||||
final Pair<Result, List<Software>> target) {
|
||||
|
||||
final Set<String> existingSoftwares = source
|
||||
.getRight()
|
||||
.stream()
|
||||
.map(Software::getId)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return target
|
||||
.getRight()
|
||||
.stream()
|
||||
.filter(p -> !existingSoftwares.contains(p.getId()))
|
||||
.map(ConversionUtils::oafSoftwareToBrokerSoftware)
|
||||
.map(p -> generateUpdateInfo(p, source, target))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo(
|
||||
final eu.dnetlib.broker.objects.Software highlightValue,
|
||||
final Pair<Result, List<Software>> source,
|
||||
final Pair<Result, List<Software>> target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MORE_SOFTWARE,
|
||||
highlightValue, source.getLeft(), target.getLeft(),
|
||||
super(true,
|
||||
s -> Topic.ENRICH_MORE_SOFTWARE,
|
||||
(p, s) -> p.getSoftwares().add(s),
|
||||
s -> s.getName());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<eu.dnetlib.broker.objects.Software> findDifferences(
|
||||
final ResultWithRelations source,
|
||||
final ResultWithRelations target) {
|
||||
|
||||
final Set<String> existingSoftwares = source
|
||||
.getSoftwares()
|
||||
.stream()
|
||||
.map(RelatedSoftware::getRelSoftware)
|
||||
.map(Software::getId)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return target
|
||||
.getSoftwares()
|
||||
.stream()
|
||||
.map(RelatedSoftware::getRelSoftware)
|
||||
.filter(p -> !existingSoftwares.contains(p.getId()))
|
||||
.map(ConversionUtils::oafSoftwareToBrokerSoftware)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -7,32 +7,25 @@ import java.util.List;
|
|||
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
|
||||
public class EnrichMissingAbstract extends UpdateMatcher<Result, String> {
|
||||
public class EnrichMissingAbstract extends UpdateMatcher<String> {
|
||||
|
||||
public EnrichMissingAbstract() {
|
||||
super(false);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target) {
|
||||
if (isMissing(target.getDescription()) && !isMissing(source.getDescription())) {
|
||||
return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target));
|
||||
}
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<String> generateUpdateInfo(final String highlightValue,
|
||||
final Result source,
|
||||
final Result target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MISSING_ABSTRACT,
|
||||
highlightValue, source, target,
|
||||
super(false,
|
||||
s -> Topic.ENRICH_MISSING_ABSTRACT,
|
||||
(p, s) -> p.getAbstracts().add(s),
|
||||
s -> s);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> findDifferences(final ResultWithRelations source,
|
||||
final ResultWithRelations target) {
|
||||
if (isMissing(target.getResult().getDescription()) && !isMissing(source.getResult().getDescription())) {
|
||||
return Arrays
|
||||
.asList(source.getResult().getDescription().get(0).getValue());
|
||||
}
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,36 +1,53 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class EnrichMissingAuthorOrcid extends UpdateMatcher<Result, Pair<String, String>> {
|
||||
public class EnrichMissingAuthorOrcid extends UpdateMatcher<String> {
|
||||
|
||||
public EnrichMissingAuthorOrcid() {
|
||||
super(true);
|
||||
super(true,
|
||||
aut -> Topic.ENRICH_MISSING_AUTHOR_ORCID,
|
||||
(p, aut) -> p.getCreators().add(aut),
|
||||
aut -> aut);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
|
||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
||||
return Arrays.asList();
|
||||
}
|
||||
protected List<String> findDifferences(final ResultWithRelations source,
|
||||
final ResultWithRelations target) {
|
||||
|
||||
@Override
|
||||
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
|
||||
final Result source,
|
||||
final Result target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MISSING_AUTHOR_ORCID,
|
||||
highlightValue, source, target,
|
||||
(p, pair) -> p.getCreators().add(pair.getLeft() + " - ORCID: " + pair.getRight()),
|
||||
pair -> pair.getLeft() + "::" + pair.getRight());
|
||||
final Set<String> existingOrcids = target
|
||||
.getResult()
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.map(Author::getPid)
|
||||
.flatMap(List::stream)
|
||||
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
|
||||
.map(pid -> pid.getValue())
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
final List<String> list = new ArrayList<>();
|
||||
|
||||
for (final Author author : source.getResult().getAuthor()) {
|
||||
final String name = author.getFullname();
|
||||
|
||||
for (final StructuredProperty pid : author.getPid()) {
|
||||
if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid")
|
||||
&& !existingOrcids.contains(pid.getValue())) {
|
||||
list.add(name + " [ORCID: " + pid.getValue() + "]");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return list;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,18 +10,22 @@ import eu.dnetlib.dhp.broker.model.Topic;
|
|||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
|
||||
public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> {
|
||||
public class EnrichMissingOpenAccess extends UpdateMatcher<Instance> {
|
||||
|
||||
public EnrichMissingOpenAccess() {
|
||||
super(true);
|
||||
super(true,
|
||||
i -> Topic.ENRICH_MISSING_OA_VERSION,
|
||||
(p, i) -> p.getInstances().add(i),
|
||||
Instance::getUrl);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target) {
|
||||
protected List<Instance> findDifferences(final ResultWithRelations source,
|
||||
final ResultWithRelations target) {
|
||||
final long count = target
|
||||
.getResult()
|
||||
.getInstance()
|
||||
.stream()
|
||||
.map(i -> i.getAccessright().getClassid())
|
||||
|
@ -33,24 +37,13 @@ public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> {
|
|||
}
|
||||
|
||||
return source
|
||||
.getResult()
|
||||
.getInstance()
|
||||
.stream()
|
||||
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
|
||||
.map(ConversionUtils::oafInstanceToBrokerInstances)
|
||||
.flatMap(List::stream)
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue,
|
||||
final Result source,
|
||||
final Result target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MISSING_OA_VERSION,
|
||||
highlightValue, source, target,
|
||||
(p, i) -> p.getInstances().add(i),
|
||||
Instance::getUrl);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,38 +9,32 @@ import eu.dnetlib.broker.objects.Pid;
|
|||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
|
||||
public class EnrichMissingPid extends UpdateMatcher<Result, Pid> {
|
||||
public class EnrichMissingPid extends UpdateMatcher<Pid> {
|
||||
|
||||
public EnrichMissingPid() {
|
||||
super(true);
|
||||
super(true,
|
||||
pid -> Topic.ENRICH_MISSING_PID,
|
||||
(p, pid) -> p.getPids().add(pid),
|
||||
pid -> pid.getType() + "::" + pid.getValue());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target) {
|
||||
final long count = target.getPid().size();
|
||||
protected List<Pid> findDifferences(final ResultWithRelations source,
|
||||
final ResultWithRelations target) {
|
||||
final long count = target.getResult().getPid().size();
|
||||
|
||||
if (count > 0) {
|
||||
return Arrays.asList();
|
||||
}
|
||||
|
||||
return source
|
||||
.getResult()
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafPidToBrokerPid)
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MISSING_PID,
|
||||
highlightValue, source, target,
|
||||
(p, pid) -> p.getPids().add(pid),
|
||||
pid -> pid.getType() + "::" + pid.getValue());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -7,32 +7,25 @@ import java.util.List;
|
|||
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
|
||||
public class EnrichMissingPublicationDate extends UpdateMatcher<Result, String> {
|
||||
public class EnrichMissingPublicationDate extends UpdateMatcher<String> {
|
||||
|
||||
public EnrichMissingPublicationDate() {
|
||||
super(false);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target) {
|
||||
if (isMissing(target.getDateofacceptance()) && !isMissing(source.getDateofacceptance())) {
|
||||
return Arrays.asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target));
|
||||
}
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<String> generateUpdateInfo(final String highlightValue,
|
||||
final Result source,
|
||||
final Result target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MISSING_PUBLICATION_DATE,
|
||||
highlightValue, source, target,
|
||||
super(false,
|
||||
date -> Topic.ENRICH_MISSING_PUBLICATION_DATE,
|
||||
(p, date) -> p.setPublicationdate(date),
|
||||
s -> s);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> findDifferences(final ResultWithRelations source,
|
||||
final ResultWithRelations target) {
|
||||
if (isMissing(target.getResult().getDateofacceptance())
|
||||
&& !isMissing(source.getResult().getDateofacceptance())) {
|
||||
return Arrays.asList(source.getResult().getDateofacceptance().getValue());
|
||||
}
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -10,20 +10,24 @@ import org.apache.commons.lang3.tuple.Pair;
|
|||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class EnrichMissingSubject extends UpdateMatcher<Result, Pair<String, String>> {
|
||||
public class EnrichMissingSubject extends UpdateMatcher<Pair<String, String>> {
|
||||
|
||||
public EnrichMissingSubject() {
|
||||
super(true);
|
||||
super(true,
|
||||
pair -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + pair.getLeft()),
|
||||
(p, pair) -> p.getSubjects().add(pair.getRight()),
|
||||
pair -> pair.getLeft() + "::" + pair.getRight());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
|
||||
protected List<Pair<String, String>> findDifferences(final ResultWithRelations source,
|
||||
final ResultWithRelations target) {
|
||||
final Set<String> existingTypes = target
|
||||
.getResult()
|
||||
.getSubject()
|
||||
.stream()
|
||||
.map(StructuredProperty::getQualifier)
|
||||
|
@ -31,24 +35,12 @@ public class EnrichMissingSubject extends UpdateMatcher<Result, Pair<String, Str
|
|||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getResult()
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(pid -> !existingTypes.contains(pid.getQualifier().getClassid()))
|
||||
.map(ConversionUtils::oafSubjectToPair)
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
|
||||
final Result source,
|
||||
final Result target) {
|
||||
|
||||
return new UpdateInfo<>(
|
||||
Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()),
|
||||
highlightValue, source, target,
|
||||
(p, pair) -> p.getSubjects().add(pair.getRight()),
|
||||
pair -> pair.getLeft() + "::" + pair.getRight());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -10,18 +10,22 @@ import eu.dnetlib.dhp.broker.model.Topic;
|
|||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
|
||||
public class EnrichMoreOpenAccess extends UpdateMatcher<Result, Instance> {
|
||||
public class EnrichMoreOpenAccess extends UpdateMatcher<Instance> {
|
||||
|
||||
public EnrichMoreOpenAccess() {
|
||||
super(true);
|
||||
super(true,
|
||||
i -> Topic.ENRICH_MORE_OA_VERSION,
|
||||
(p, i) -> p.getInstances().add(i),
|
||||
Instance::getUrl);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target) {
|
||||
protected List<Instance> findDifferences(final ResultWithRelations source,
|
||||
final ResultWithRelations target) {
|
||||
final Set<String> urls = target
|
||||
.getResult()
|
||||
.getInstance()
|
||||
.stream()
|
||||
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
|
||||
|
@ -30,25 +34,14 @@ public class EnrichMoreOpenAccess extends UpdateMatcher<Result, Instance> {
|
|||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getResult()
|
||||
.getInstance()
|
||||
.stream()
|
||||
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
|
||||
.map(ConversionUtils::oafInstanceToBrokerInstances)
|
||||
.flatMap(List::stream)
|
||||
.filter(i -> !urls.contains(i.getUrl()))
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue,
|
||||
final Result source,
|
||||
final Result target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MORE_OA_VERSION,
|
||||
highlightValue, source, target,
|
||||
(p, i) -> p.getInstances().add(i),
|
||||
Instance::getUrl);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,39 +9,34 @@ import eu.dnetlib.broker.objects.Pid;
|
|||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
|
||||
public class EnrichMorePid extends UpdateMatcher<Result, Pid> {
|
||||
public class EnrichMorePid extends UpdateMatcher<Pid> {
|
||||
|
||||
public EnrichMorePid() {
|
||||
super(true);
|
||||
super(true,
|
||||
pid -> Topic.ENRICH_MORE_PID,
|
||||
(p, pid) -> p.getPids().add(pid),
|
||||
pid -> pid.getType() + "::" + pid.getValue());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target) {
|
||||
protected List<Pid> findDifferences(final ResultWithRelations source,
|
||||
final ResultWithRelations target) {
|
||||
final Set<String> existingPids = target
|
||||
.getResult()
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue())
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getResult()
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(pid -> !existingPids.contains(pid.getQualifier().getClassid() + "::" + pid.getValue()))
|
||||
.map(ConversionUtils::oafPidToBrokerPid)
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MORE_PID,
|
||||
highlightValue, source, target,
|
||||
(p, pid) -> p.getPids().add(pid),
|
||||
pid -> pid.getType() + "::" + pid.getValue());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -10,42 +10,34 @@ import org.apache.commons.lang3.tuple.Pair;
|
|||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
|
||||
public class EnrichMoreSubject extends UpdateMatcher<Result, Pair<String, String>> {
|
||||
public class EnrichMoreSubject extends UpdateMatcher<Pair<String, String>> {
|
||||
|
||||
public EnrichMoreSubject() {
|
||||
super(true);
|
||||
super(true,
|
||||
pair -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + pair.getLeft()),
|
||||
(p, pair) -> p.getSubjects().add(pair.getRight()),
|
||||
pair -> pair.getLeft() + "::" + pair.getRight());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
|
||||
protected List<Pair<String, String>> findDifferences(final ResultWithRelations source,
|
||||
final ResultWithRelations target) {
|
||||
final Set<String> existingSubjects = target
|
||||
.getResult()
|
||||
.getSubject()
|
||||
.stream()
|
||||
.map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue())
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getResult()
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(pid -> !existingSubjects.contains(pid.getQualifier().getClassid() + "::" + pid.getValue()))
|
||||
.map(ConversionUtils::oafSubjectToPair)
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
|
||||
final Result source,
|
||||
final Result target) {
|
||||
|
||||
return new UpdateInfo<>(
|
||||
Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()),
|
||||
highlightValue, source, target,
|
||||
(p, pair) -> p.getSubjects().add(pair.getRight()),
|
||||
pair -> pair.getLeft() + "::" + pair.getRight());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -15,6 +15,9 @@ public class BrokerConstants {
|
|||
public static final String OPEN_ACCESS = "OPEN";
|
||||
public static final String IS_MERGED_IN_CLASS = "isMergedIn";
|
||||
|
||||
public static final float MIN_TRUST = 0.25f;
|
||||
public static final float MAX_TRUST = 1.00f;
|
||||
|
||||
public static final List<Class<? extends Result>> RESULT_CLASSES = Arrays
|
||||
.asList(Publication.class, Dataset.class, Software.class, OtherResearchProduct.class);
|
||||
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.dhp.broker.model.EventFactory;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsRelatedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetReferences;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMoreProject;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsReferencedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsRelatedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationReferences;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMissingSoftware;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMoreSoftware;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPid;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSubject;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
|
||||
public class EventFinder {
|
||||
|
||||
private static List<UpdateMatcher<?>> matchers = new ArrayList<>();
|
||||
static {
|
||||
matchers.add(new EnrichMissingAbstract());
|
||||
matchers.add(new EnrichMissingAuthorOrcid());
|
||||
matchers.add(new EnrichMissingOpenAccess());
|
||||
matchers.add(new EnrichMissingPid());
|
||||
matchers.add(new EnrichMissingPublicationDate());
|
||||
matchers.add(new EnrichMissingSubject());
|
||||
matchers.add(new EnrichMoreOpenAccess());
|
||||
matchers.add(new EnrichMorePid());
|
||||
matchers.add(new EnrichMoreSubject());
|
||||
|
||||
// Advanced matchers
|
||||
matchers.add(new EnrichMissingProject());
|
||||
matchers.add(new EnrichMoreProject());
|
||||
matchers.add(new EnrichMissingSoftware());
|
||||
matchers.add(new EnrichMoreSoftware());
|
||||
matchers.add(new EnrichMissingPublicationIsRelatedTo());
|
||||
matchers.add(new EnrichMissingPublicationIsReferencedBy());
|
||||
matchers.add(new EnrichMissingPublicationReferences());
|
||||
matchers.add(new EnrichMissingPublicationIsSupplementedTo());
|
||||
matchers.add(new EnrichMissingPublicationIsSupplementedBy());
|
||||
matchers.add(new EnrichMissingDatasetIsRelatedTo());
|
||||
matchers.add(new EnrichMissingDatasetIsReferencedBy());
|
||||
matchers.add(new EnrichMissingDatasetReferences());
|
||||
matchers.add(new EnrichMissingDatasetIsSupplementedTo());
|
||||
matchers.add(new EnrichMissingDatasetIsSupplementedBy());
|
||||
matchers.add(new EnrichMissingAbstract());
|
||||
}
|
||||
|
||||
public static EventGroup generateEvents(final ResultGroup results, final DedupConfig dedupConfig) {
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
|
||||
for (final ResultWithRelations target : results.getData()) {
|
||||
for (final UpdateMatcher<?> matcher : matchers) {
|
||||
list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), dedupConfig));
|
||||
}
|
||||
}
|
||||
|
||||
return asEventGroup(list);
|
||||
}
|
||||
|
||||
private static EventGroup asEventGroup(final List<UpdateInfo<?>> list) {
|
||||
final EventGroup events = new EventGroup();
|
||||
list.stream().map(EventFactory::newBrokerEvent).forEach(events::addElement);
|
||||
return events;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
public class TrustUtils {
|
||||
|
||||
public static float rescale(final double score, final double threshold) {
|
||||
if (score >= BrokerConstants.MAX_TRUST) {
|
||||
return BrokerConstants.MAX_TRUST;
|
||||
}
|
||||
|
||||
final double val = (score - threshold) * (BrokerConstants.MAX_TRUST - BrokerConstants.MIN_TRUST)
|
||||
/ (BrokerConstants.MAX_TRUST - threshold);
|
||||
|
||||
if (val < BrokerConstants.MIN_TRUST) {
|
||||
return BrokerConstants.MIN_TRUST;
|
||||
}
|
||||
if (val > BrokerConstants.MAX_TRUST) {
|
||||
return BrokerConstants.MAX_TRUST;
|
||||
}
|
||||
|
||||
return (float) val;
|
||||
}
|
||||
}
|
|
@ -5,13 +5,23 @@ import java.util.List;
|
|||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
||||
import eu.dnetlib.broker.objects.Provenance;
|
||||
import eu.dnetlib.broker.objects.Publication;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
|
||||
public final class UpdateInfo<T> {
|
||||
|
||||
|
@ -19,9 +29,9 @@ public final class UpdateInfo<T> {
|
|||
|
||||
private final T highlightValue;
|
||||
|
||||
private final Result source;
|
||||
private final ResultWithRelations source;
|
||||
|
||||
private final Result target;
|
||||
private final ResultWithRelations target;
|
||||
|
||||
private final BiConsumer<Publication, T> compileHighlight;
|
||||
|
||||
|
@ -29,33 +39,50 @@ public final class UpdateInfo<T> {
|
|||
|
||||
private final float trust;
|
||||
|
||||
public UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target,
|
||||
private static final Logger log = LoggerFactory.getLogger(UpdateInfo.class);
|
||||
|
||||
public UpdateInfo(final Topic topic, final T highlightValue, final ResultWithRelations source,
|
||||
final ResultWithRelations target,
|
||||
final BiConsumer<Publication, T> compileHighlight,
|
||||
final Function<T, String> highlightToString) {
|
||||
final Function<T, String> highlightToString,
|
||||
final DedupConfig dedupConfig) {
|
||||
this.topic = topic;
|
||||
this.highlightValue = highlightValue;
|
||||
this.source = source;
|
||||
this.target = target;
|
||||
this.compileHighlight = compileHighlight;
|
||||
this.highlightToString = highlightToString;
|
||||
this.trust = calculateTrust(source, target);
|
||||
this.trust = calculateTrust(dedupConfig, source.getResult(), target.getResult());
|
||||
}
|
||||
|
||||
public T getHighlightValue() {
|
||||
return highlightValue;
|
||||
}
|
||||
|
||||
public Result getSource() {
|
||||
public ResultWithRelations getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public Result getTarget() {
|
||||
public ResultWithRelations getTarget() {
|
||||
return target;
|
||||
}
|
||||
|
||||
private float calculateTrust(final Result source, final Result target) {
|
||||
// TODO
|
||||
return 0.9f;
|
||||
private float calculateTrust(final DedupConfig dedupConfig, final Result r1, final Result r2) {
|
||||
try {
|
||||
final ObjectMapper objectMapper = new ObjectMapper();
|
||||
final MapDocument doc1 = MapDocumentUtil
|
||||
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1));
|
||||
final MapDocument doc2 = MapDocumentUtil
|
||||
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2));
|
||||
|
||||
final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
|
||||
final double threshold = dedupConfig.getWf().getThreshold();
|
||||
|
||||
return TrustUtils.rescale(score, threshold);
|
||||
} catch (final Exception e) {
|
||||
log.error("Error computing score between results", e);
|
||||
return BrokerConstants.MIN_TRUST;
|
||||
}
|
||||
}
|
||||
|
||||
protected Topic getTopic() {
|
||||
|
@ -76,20 +103,22 @@ public final class UpdateInfo<T> {
|
|||
|
||||
public OpenAireEventPayload asBrokerPayload() {
|
||||
|
||||
final Publication p = ConversionUtils.oafResultToBrokerPublication(getSource());
|
||||
final Publication p = ConversionUtils.oafResultToBrokerPublication(getSource().getResult());
|
||||
compileHighlight.accept(p, getHighlightValue());
|
||||
|
||||
final Publication hl = new Publication();
|
||||
compileHighlight.accept(hl, getHighlightValue());
|
||||
|
||||
final String provId = getSource().getOriginalId().stream().findFirst().orElse(null);
|
||||
final String provId = getSource().getResult().getOriginalId().stream().findFirst().orElse(null);
|
||||
final String provRepo = getSource()
|
||||
.getResult()
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.map(KeyValue::getValue)
|
||||
.findFirst()
|
||||
.orElse(null);
|
||||
final String provUrl = getSource()
|
||||
.getResult()
|
||||
.getInstance()
|
||||
.stream()
|
||||
.map(Instance::getUrl)
|
||||
|
|
|
@ -1,14 +1,15 @@
|
|||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.simple;
|
||||
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.expressions.Aggregator;
|
||||
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class ResultAggregator extends Aggregator<Tuple2<Result, Relation>, ResultGroup, ResultGroup> {
|
||||
public class ResultAggregator extends Aggregator<Tuple2<ResultWithRelations, Relation>, ResultGroup, ResultGroup> {
|
||||
|
||||
/**
|
||||
*
|
||||
|
@ -21,7 +22,7 @@ public class ResultAggregator extends Aggregator<Tuple2<Result, Relation>, Resul
|
|||
}
|
||||
|
||||
@Override
|
||||
public ResultGroup reduce(final ResultGroup group, final Tuple2<Result, Relation> t) {
|
||||
public ResultGroup reduce(final ResultGroup group, final Tuple2<ResultWithRelations, Relation> t) {
|
||||
return group.addElement(t._1);
|
||||
}
|
||||
|
|
@ -1,10 +1,11 @@
|
|||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.simple;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
|
||||
|
||||
public class ResultGroup implements Serializable {
|
||||
|
||||
|
@ -13,13 +14,13 @@ public class ResultGroup implements Serializable {
|
|||
*/
|
||||
private static final long serialVersionUID = -3360828477088669296L;
|
||||
|
||||
private final List<Result> data = new ArrayList<>();
|
||||
private final List<ResultWithRelations> data = new ArrayList<>();
|
||||
|
||||
public List<Result> getData() {
|
||||
public List<ResultWithRelations> getData() {
|
||||
return data;
|
||||
}
|
||||
|
||||
public ResultGroup addElement(final Result elem) {
|
||||
public ResultGroup addElement(final ResultWithRelations elem) {
|
||||
data.add(elem);
|
||||
return this;
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
|
||||
public class RelatedDataset {
|
||||
|
||||
private final String source;
|
||||
private final String relType;
|
||||
private final Dataset relDataset;
|
||||
|
||||
public RelatedDataset(final String source, final String relType, final Dataset relDataset) {
|
||||
this.source = source;
|
||||
this.relType = relType;
|
||||
this.relDataset = relDataset;
|
||||
}
|
||||
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
public Dataset getRelDataset() {
|
||||
return relDataset;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
|
||||
public class RelatedEntityFactory {
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public static <RT, T> RT newRelatedEntity(final String sourceId, final String relType, final T target,
|
||||
final Class<RT> clazz) {
|
||||
if (clazz == RelatedProject.class) {
|
||||
return (RT) new RelatedProject(sourceId, relType, (Project) target);
|
||||
}
|
||||
if (clazz == RelatedSoftware.class) {
|
||||
return (RT) new RelatedSoftware(sourceId, relType, (Software) target);
|
||||
}
|
||||
if (clazz == RelatedDataset.class) {
|
||||
return (RT) new RelatedDataset(sourceId, relType, (Dataset) target);
|
||||
}
|
||||
if (clazz == RelatedPublication.class) {
|
||||
return (RT) new RelatedPublication(sourceId, relType, (Publication) target);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
|
||||
public class RelatedProject {
|
||||
|
||||
private final String source;
|
||||
private final String relType;
|
||||
private final Project relProject;
|
||||
|
||||
public RelatedProject(final String source, final String relType, final Project relProject) {
|
||||
this.source = source;
|
||||
this.relType = relType;
|
||||
this.relProject = relProject;
|
||||
}
|
||||
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
public Project getRelProject() {
|
||||
return relProject;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
|
||||
public class RelatedPublication {
|
||||
|
||||
private final String source;
|
||||
private final String relType;
|
||||
private final Publication relPublication;
|
||||
|
||||
public RelatedPublication(final String source, final String relType, final Publication relPublication) {
|
||||
this.source = source;
|
||||
this.relType = relType;
|
||||
this.relPublication = relPublication;
|
||||
}
|
||||
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
public Publication getRelPublication() {
|
||||
return relPublication;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
|
||||
public class RelatedSoftware {
|
||||
|
||||
private final String source;
|
||||
private final String relType;
|
||||
private final Software relSoftware;
|
||||
|
||||
public RelatedSoftware(final String source, final String relType, final Software relSoftware) {
|
||||
this.source = source;
|
||||
this.relType = relType;
|
||||
this.relSoftware = relSoftware;
|
||||
}
|
||||
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
public Software getRelSoftware() {
|
||||
return relSoftware;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class ResultWithRelations implements Serializable {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = -1368401915974311571L;
|
||||
|
||||
private Result result;
|
||||
|
||||
private final List<RelatedDataset> datasets = new ArrayList<>();
|
||||
private final List<RelatedPublication> publications = new ArrayList<>();
|
||||
private final List<RelatedSoftware> softwares = new ArrayList<>();
|
||||
private final List<RelatedProject> projects = new ArrayList<>();
|
||||
|
||||
public ResultWithRelations() {
|
||||
}
|
||||
|
||||
public ResultWithRelations(final Result result) {
|
||||
this.result = result;
|
||||
}
|
||||
|
||||
public Result getResult() {
|
||||
return result;
|
||||
}
|
||||
|
||||
public List<RelatedDataset> getDatasets() {
|
||||
return datasets;
|
||||
}
|
||||
|
||||
public List<RelatedPublication> getPublications() {
|
||||
return publications;
|
||||
}
|
||||
|
||||
public List<RelatedSoftware> getSoftwares() {
|
||||
return softwares;
|
||||
}
|
||||
|
||||
public List<RelatedProject> getProjects() {
|
||||
return projects;
|
||||
}
|
||||
|
||||
public void setResult(final Result result) {
|
||||
this.result = result;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,68 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.expressions.Aggregator;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
public class ResultWithRelationsAggregator<T>
|
||||
extends Aggregator<Tuple2<ResultWithRelations, T>, ResultWithRelations, ResultWithRelations> {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = -3687878788861013488L;
|
||||
|
||||
@Override
|
||||
public ResultWithRelations zero() {
|
||||
return new ResultWithRelations();
|
||||
}
|
||||
|
||||
@Override
|
||||
public ResultWithRelations finish(final ResultWithRelations g) {
|
||||
return g;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ResultWithRelations reduce(final ResultWithRelations g, final Tuple2<ResultWithRelations, T> t) {
|
||||
if (g.getResult() == null) {
|
||||
return t._1;
|
||||
} else if (t._2 instanceof RelatedSoftware) {
|
||||
g.getSoftwares().add((RelatedSoftware) t._2);
|
||||
} else if (t._2 instanceof RelatedDataset) {
|
||||
g.getDatasets().add((RelatedDataset) t._2);
|
||||
} else if (t._2 instanceof RelatedPublication) {
|
||||
g.getPublications().add((RelatedPublication) t._2);
|
||||
} else if (t._2 instanceof RelatedProject) {
|
||||
g.getProjects().add((RelatedProject) t._2);
|
||||
}
|
||||
return g;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public ResultWithRelations merge(final ResultWithRelations g1, final ResultWithRelations g2) {
|
||||
if (g1.getResult() != null) {
|
||||
g1.getSoftwares().addAll(g2.getSoftwares());
|
||||
g1.getDatasets().addAll(g2.getDatasets());
|
||||
g1.getPublications().addAll(g2.getPublications());
|
||||
g1.getProjects().addAll(g2.getProjects());
|
||||
return g1;
|
||||
} else {
|
||||
return g2;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<ResultWithRelations> bufferEncoder() {
|
||||
return Encoders.kryo(ResultWithRelations.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<ResultWithRelations> outputEncoder() {
|
||||
return Encoders.kryo(ResultWithRelations.class);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,99 @@
|
|||
<workflow-app name="create broker events" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<parameters>
|
||||
<property>
|
||||
<name>graphInputPath</name>
|
||||
<description>the path where the graph is stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>eventsOutputPath</name>
|
||||
<description>the path where the the events will be stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>the address of the lookUp service</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>dedupConfProfId</name>
|
||||
<description>the id of a valid Dedup Configuration Profile</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="generate_events"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="generate_events">
|
||||
<java>
|
||||
<prepare>
|
||||
<delete path="${eventsOutputPath}"/>
|
||||
</prepare>
|
||||
<main-class>eu.dnetlib.dhp.broker.oa.GenerateEventsApplication</main-class>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--eventsPath</arg><arg>${eventsOutputPath}</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--dedupConfProfile</arg><arg>${dedupConfProfId}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -0,0 +1,26 @@
|
|||
[
|
||||
{
|
||||
"paramName": "g",
|
||||
"paramLongName": "graphPath",
|
||||
"paramDescription": "the path where there the graph is stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "eventsPath",
|
||||
"paramDescription": "the path where the generated events will be stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "lu",
|
||||
"paramLongName": "isLookupUrl",
|
||||
"paramDescription": "the address of the ISLookUpService",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "d",
|
||||
"paramLongName": "dedupConfProfile",
|
||||
"paramDescription": "the id of a valid Dedup Configuration Profile",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,73 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class TrustUtilsTest {
|
||||
|
||||
private static final double THRESHOLD = 0.95;
|
||||
|
||||
@Test
|
||||
public void rescaleTest_1() {
|
||||
verifyValue(-0.3, BrokerConstants.MIN_TRUST);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_2() {
|
||||
verifyValue(0.0, BrokerConstants.MIN_TRUST);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_3() {
|
||||
verifyValue(0.5, BrokerConstants.MIN_TRUST);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_4() {
|
||||
verifyValue(0.95, BrokerConstants.MIN_TRUST);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_5() {
|
||||
verifyValue(0.96, BrokerConstants.MIN_TRUST);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_6() {
|
||||
verifyValue(0.97, 0.3f);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_7() {
|
||||
verifyValue(0.98, 0.45f);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_8() {
|
||||
verifyValue(0.99, 0.6f);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_9() {
|
||||
verifyValue(1.00, BrokerConstants.MAX_TRUST);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_10() {
|
||||
verifyValue(1.01, BrokerConstants.MAX_TRUST);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_11() {
|
||||
verifyValue(2.00, BrokerConstants.MAX_TRUST);
|
||||
}
|
||||
|
||||
private void verifyValue(final double originalScore, final float expectedTrust) {
|
||||
final float trust = TrustUtils.rescale(originalScore, THRESHOLD);
|
||||
System.out.println(trust);
|
||||
assertTrue(Math.abs(trust - expectedTrust) < 0.01);
|
||||
}
|
||||
|
||||
}
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>dhp-dedup-openaire</artifactId>
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -1,10 +1,9 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -166,8 +166,10 @@ case object Crossref2Oaf {
|
|||
|
||||
val has_review = (json \ "relation" \"has-review" \ "id")
|
||||
|
||||
if(has_review != JNothing)
|
||||
instance.setRefereed(asField("peerReviewed"))
|
||||
if(has_review != JNothing) {
|
||||
instance.setRefereed(
|
||||
createQualifier("0001", "peerReviewed", "dnet:review_levels", "dnet:review_levels"))
|
||||
}
|
||||
|
||||
|
||||
instance.setAccessright(getRestrictedQualifier())
|
||||
|
@ -243,7 +245,7 @@ case object Crossref2Oaf {
|
|||
val queue = new mutable.Queue[Relation]
|
||||
|
||||
|
||||
def snfRule(award:String): String = {
|
||||
def snsfRule(award:String): String = {
|
||||
var tmp1 = StringUtils.substringAfter(award,"_")
|
||||
val tmp2 = StringUtils.substringBefore(tmp1,"/")
|
||||
logger.debug(s"From $award to $tmp2")
|
||||
|
@ -318,7 +320,7 @@ case object Crossref2Oaf {
|
|||
case "10.13039/501100006588" |
|
||||
"10.13039/501100004488" => generateSimpleRelationFromAward(funder, "irb_hr______", a=>a.replaceAll("Project No.", "").replaceAll("HRZZ-","") )
|
||||
case "10.13039/501100006769"=> generateSimpleRelationFromAward(funder, "rsf_________", a=>a)
|
||||
case "10.13039/501100001711"=> generateSimpleRelationFromAward(funder, "snsf________", snfRule)
|
||||
case "10.13039/501100001711"=> generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
|
||||
case "10.13039/501100004410"=> generateSimpleRelationFromAward(funder, "tubitakf____", a =>a)
|
||||
case "10.10.13039/100004440"=> generateSimpleRelationFromAward(funder, "wt__________", a =>a)
|
||||
case "10.13039/100004440"=> queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "wt__________" )
|
||||
|
|
|
@ -1,16 +1,10 @@
|
|||
package eu.dnetlib.doiboost.orcid
|
||||
|
||||
import java.io.IOException
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.{Author, Publication}
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{ORCID, PID_TYPES, createSP, generateDataInfo, generateIdentifier}
|
||||
import eu.dnetlib.doiboost.crossref.Crossref2Oaf
|
||||
import org.apache.commons.lang.StringUtils
|
||||
import org.codehaus.jackson.map.ObjectMapper
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
@ -22,7 +16,7 @@ case class ORCIDItem(oid:String,name:String,surname:String,creditName:String,err
|
|||
|
||||
case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {}
|
||||
object ORCIDToOAF {
|
||||
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
||||
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
|
||||
val mapper = new ObjectMapper
|
||||
|
||||
def isJsonValid(inputStr: String): Boolean = {
|
||||
|
@ -57,7 +51,7 @@ object ORCIDToOAF {
|
|||
pub.setId(generateIdentifier(pub, doi.toLowerCase))
|
||||
try{
|
||||
pub.setAuthor(input.authors.map(a=> {
|
||||
generateAuhtor(a.name, a.surname, a.creditName, a.oid)
|
||||
generateAuthor(a.name, a.surname, a.creditName, a.oid)
|
||||
}).asJava)
|
||||
pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava)
|
||||
pub.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
|
||||
|
@ -69,7 +63,7 @@ object ORCIDToOAF {
|
|||
}
|
||||
}
|
||||
|
||||
def generateAuhtor(given: String, family: String, fullName:String, orcid: String): Author = {
|
||||
def generateAuthor(given: String, family: String, fullName:String, orcid: String): Author = {
|
||||
val a = new Author
|
||||
a.setName(given)
|
||||
a.setSurname(family)
|
||||
|
|
|
@ -58,3 +58,5 @@ reference,Array of Reference,No,List of references made by the work,,Future impr
|
|||
content-domain,Content Domain,No,Information on domains that support Crossmark for this work,N/A,
|
||||
relation,Relations,No,Relations to other works,Result/Instance/refereed,"if(relation.has-review) instance.refereed = ""peerReviewed"". "
|
||||
review,Review,No,Peer review metadata,N/A,
|
||||
funder,Array of Funder,No,,Relation between Result and Project,"The mapping between Crossref funder elements and OpenAIRE projects is implemented in eu.dnetlib.doiboost.crossref.Crossref2Oaf.mappingFunderToRelations.
|
||||
The matching is based on Funder DOIs, Funder names, and grant codes. Different mapping approaches are applied to cover the different cases in Crossref records."
|
||||
|
|
|
|
@ -0,0 +1,37 @@
|
|||
Micorsoft Academic Graph Field,OAF field,Comments
|
||||
Papers/DOI,OafEntity/originalId,
|
||||
,OafEntity/PID,
|
||||
,Oaf/id in the form 50|doiboost____::md5(DOI),
|
||||
Papers/PaperId,OafEntity/originalId,
|
||||
Papers/PaperTitle,"Result/Title with Qualifier(""main title"", ""dnet:dataCite_title"")",
|
||||
Papers/OriginalTitle,"Result/Title with Qualifier(""alternative title"", ""dnet:dataCite_title"")",
|
||||
Papers/BookTitle,Publication/Source,
|
||||
Papers/Year,N/A,
|
||||
Papers/Date,Result/dateofacceptance,
|
||||
Papers/Publisher,Result/Publisher,Possibly overridden by Journal/Publisher
|
||||
Papers/JournalId,N/A,
|
||||
Journal/Rank,N/A,
|
||||
Journal/NormalizedName,N/A,
|
||||
Journal/DisplayName,Publication/Journal/Name,
|
||||
Journal/Issn,Publication/Journal/issnPrinted,
|
||||
Journal/Publisher,Result/publisher,"If avalable, it overrides value in Papers/Publisher"
|
||||
Journal/Webpage,N/A,
|
||||
Journal/PaperCount,N/A,
|
||||
Journal/CitationCount,N/A,
|
||||
Journal/CreatedDate,N/A,
|
||||
ConferenceInstances/DisplayName,Publication/Journal/Name,
|
||||
ConferenceInstances/Location,Publication/Journal/Conferenceplace,
|
||||
ConferenceInstances/StartDate,Publication/Journal/Conferencedate,subjectTo be constructed as StartDate - EndDate (only the first 10 chars ofthe dates are kept)
|
||||
ConferenceInstances/EndDate,Publication/Journal/Conferencedate,To be constructed as StartDate - EndDate (only the first 10 chars ofthe dates are kept)
|
||||
Papers/Volume,Publication/Journal/vol,
|
||||
Papers/Issue,Publication/Journal/iss,
|
||||
Papers/FirstPage,Publication/Journal/sp,
|
||||
Papers/LastPage,Publication/Journal/ep,
|
||||
Papers/ReferenceCount,N/A,
|
||||
Papers/CitationCount,N/A,
|
||||
Papers/EstimatedCitation,N/A,
|
||||
Papers/OriginalVenue,N/A,
|
||||
Papers/FamilyId,N/A,
|
||||
Papers/CreatedDate,Result/LastUpdateTimeStamp,
|
||||
Papers/FieldOfStudy/DisplayName,"Result/subject with Qualifier(""keyword"", ""dnet:subject_classification_typologies"")",Some FieldOfStudy come from the UMLS controlled vocabulary. Future releases of DOIBoost will include such terms with a specific Qualifier (i.e. not as generic keywords)
|
||||
Papers/FieldOfStudy/MainType,"Result/subject with Qualifier(""keyword"", ""dnet:subject_classification_typologies"")","If MainType is splittable on . (like 'food.type_of_dish', 'aviation.airport', 'soccer.team'), then the first token (i.e. food, aviation, soccer) is also added as additional subject."
|
|
|
@ -0,0 +1,6 @@
|
|||
ORCID field,OAF mapping,Comment
|
||||
doi,Result/pid,
|
||||
oid,Result/Author/pid,
|
||||
name,Result/Author/name,
|
||||
surname,Result/Author/surname,
|
||||
creditName,N/A,Result/Author/fullname is generated by concatenating name and surname
|
|
|
@ -0,0 +1,11 @@
|
|||
Unpaywall field,Description,OAF Mapping,Comment
|
||||
doi,The DOI of this resource.,Result/pid,
|
||||
is_oa,Is there an OA copy of this resource.,N/A,Add a Result/Instance only if is_oa is true
|
||||
best_oa_location,The best OA Location Object we could find for this DOI.,,Create one Result/Instance with Open Access right
|
||||
best_oa_location.url,The url_for_pdf if there is one; otherwise landing page URL.,Result/instance/url,
|
||||
best_oa_location.license,The license under which this copy is published.,Result/instance/license,
|
||||
oa_status,"The OA status, or color, of this resource. Classifies OA resources by location and license terms as one of: gold, hybrid, bronze, green or closed.",N/A,New field to be introduced in the OpenAIRE data model
|
||||
published_date,"The date this resource was published. As reported by the publishers, who unfortunately have inconsistent definitions of what counts as officially ""published.""",N/A,"Not used in the mapping. Could be used to improve the coverage of dates, if needed."
|
||||
title,The title of this resource.,N/A,
|
||||
oa_locations,List of all the OA Location objects associated with this resource.,N/A,"Coud be useful if we plan to ""patch"" existing instances. If the instance URL is a URL of a oa_location, then it's Open.
|
||||
"
|
|
|
@ -1,16 +1,14 @@
|
|||
package eu.dnetlib.doiboost.orcid
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication
|
||||
import eu.dnetlib.doiboost.crossref.Crossref2Oaf
|
||||
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
|
||||
import org.codehaus.jackson.map.ObjectMapper
|
||||
import org.junit.jupiter.api.Assertions._
|
||||
import org.junit.jupiter.api.Test
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
import org.junit.jupiter.api.Assertions._
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
class MappingORCIDToOAFTest {
|
||||
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
||||
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
@Test
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -0,0 +1,109 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
public class CleanGraphSparkJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
CleanGraphSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json"));
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
String isLookupUrl = parser.get("isLookupUrl");
|
||||
log.info("isLookupUrl: {}", isLookupUrl);
|
||||
|
||||
String graphTableClassName = parser.get("graphTableClassName");
|
||||
log.info("graphTableClassName: {}", graphTableClassName);
|
||||
|
||||
Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);
|
||||
|
||||
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
fixGraphTable(spark, vocs, inputPath, entityClazz, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static <T extends Oaf> void fixGraphTable(
|
||||
SparkSession spark,
|
||||
VocabularyGroup vocs,
|
||||
String inputPath,
|
||||
Class<T> clazz,
|
||||
String outputPath) {
|
||||
|
||||
final CleaningRuleMap mapping = CleaningRuleMap.create(vocs);
|
||||
|
||||
readTableFromPath(spark, inputPath, clazz)
|
||||
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
private static <T extends Oaf> Dataset<T> readTableFromPath(
|
||||
SparkSession spark, String inputEntityPath, Class<T> clazz) {
|
||||
|
||||
log.info("Reading Graph table from: {}", inputEntityPath);
|
||||
return spark
|
||||
.read()
|
||||
.textFile(inputEntityPath)
|
||||
.map(
|
||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, clazz),
|
||||
Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
private static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
|
||||
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class CleaningRuleMap extends HashMap<Class, SerializableConsumer<Object>> implements Serializable {
|
||||
|
||||
/**
|
||||
* Creates the mapping for the Oaf types subject to cleaning
|
||||
*
|
||||
* @param vocabularies
|
||||
*/
|
||||
public static CleaningRuleMap create(VocabularyGroup vocabularies) {
|
||||
CleaningRuleMap mapping = new CleaningRuleMap();
|
||||
mapping.put(Qualifier.class, o -> {
|
||||
Qualifier q = (Qualifier) o;
|
||||
if (vocabularies.vocabularyExists(q.getSchemeid())) {
|
||||
Qualifier newValue = vocabularies.lookup(q.getSchemeid(), q.getClassid());
|
||||
q.setClassid(newValue.getClassid());
|
||||
q.setClassname(newValue.getClassname());
|
||||
}
|
||||
});
|
||||
mapping.put(StructuredProperty.class, o -> {
|
||||
StructuredProperty sp = (StructuredProperty) o;
|
||||
// TODO implement a policy
|
||||
/*
|
||||
* if (StringUtils.isBlank(sp.getValue())) { sp.setValue(null); sp.setQualifier(null); sp.setDataInfo(null);
|
||||
* }
|
||||
*/
|
||||
});
|
||||
return mapping;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.Field;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
|
||||
public class OafCleaner implements Serializable {
|
||||
|
||||
public static <E extends Oaf> E apply(E oaf, CleaningRuleMap mapping) {
|
||||
try {
|
||||
navigate(oaf, mapping);
|
||||
} catch (IllegalAccessException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
return oaf;
|
||||
}
|
||||
|
||||
private static void navigate(Object o, CleaningRuleMap mapping) throws IllegalAccessException {
|
||||
if (isPrimitive(o)) {
|
||||
return;
|
||||
} else if (isIterable(o.getClass())) {
|
||||
for (final Object elem : (Iterable<?>) o) {
|
||||
navigate(elem, mapping);
|
||||
}
|
||||
} else if (hasMapping(o, mapping)) {
|
||||
mapping.get(o.getClass()).accept(o);
|
||||
} else {
|
||||
for (final Field f : getAllFields(o.getClass())) {
|
||||
f.setAccessible(true);
|
||||
final Object val = f.get(o);
|
||||
if (!isPrimitive(val) && hasMapping(val, mapping)) {
|
||||
mapping.get(val.getClass()).accept(val);
|
||||
} else {
|
||||
navigate(f.get(o), mapping);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean hasMapping(Object o, CleaningRuleMap mapping) {
|
||||
return mapping.containsKey(o.getClass());
|
||||
}
|
||||
|
||||
private static boolean isIterable(final Class<?> cl) {
|
||||
return Iterable.class.isAssignableFrom(cl);
|
||||
}
|
||||
|
||||
private static boolean isPrimitive(Object o) {
|
||||
return Objects.isNull(o)
|
||||
|| o.getClass().isPrimitive()
|
||||
|| o instanceof Class
|
||||
|| o instanceof Integer
|
||||
|| o instanceof Double
|
||||
|| o instanceof Float
|
||||
|| o instanceof Long
|
||||
|| o instanceof Boolean
|
||||
|| o instanceof String
|
||||
|| o instanceof Byte;
|
||||
}
|
||||
|
||||
private static List<Field> getAllFields(Class<?> clazz) {
|
||||
return getAllFields(new LinkedList<>(), clazz);
|
||||
}
|
||||
|
||||
private static List<Field> getAllFields(List<Field> fields, Class<?> clazz) {
|
||||
fields.addAll(Arrays.asList(clazz.getDeclaredFields()));
|
||||
|
||||
final Class<?> superclass = clazz.getSuperclass();
|
||||
if (Objects.nonNull(superclass) && superclass.getPackage().equals(Oaf.class.getPackage())) {
|
||||
getAllFields(fields, superclass);
|
||||
}
|
||||
|
||||
return fields;
|
||||
}
|
||||
|
||||
}
|
|
@ -63,6 +63,8 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
|
||||
protected final VocabularyGroup vocs;
|
||||
|
||||
private final boolean invisible;
|
||||
|
||||
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
|
||||
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
|
||||
protected static final Qualifier ORCID_PID_TYPE = qualifier(
|
||||
|
@ -85,8 +87,9 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier(
|
||||
"main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
|
||||
|
||||
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs) {
|
||||
protected AbstractMdRecordToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
|
||||
this.vocs = vocs;
|
||||
this.invisible = invisible;
|
||||
}
|
||||
|
||||
public List<Oaf> processMdRecord(final String xml) {
|
||||
|
@ -112,7 +115,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
return null;
|
||||
}
|
||||
|
||||
final DataInfo info = prepareDataInfo(doc);
|
||||
final DataInfo info = prepareDataInfo(doc, invisible);
|
||||
final long lastUpdateTimestamp = new Date().getTime();
|
||||
|
||||
return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp);
|
||||
|
@ -510,11 +513,11 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate);
|
||||
}
|
||||
|
||||
protected DataInfo prepareDataInfo(final Document doc) {
|
||||
protected DataInfo prepareDataInfo(final Document doc, final boolean invisible) {
|
||||
final Node n = doc.selectSingleNode("//oaf:datainfo");
|
||||
|
||||
if (n == null) {
|
||||
return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9");
|
||||
return dataInfo(false, null, false, invisible, REPOSITORY_PROVENANCE_ACTIONS, "0.9");
|
||||
}
|
||||
|
||||
final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
|
||||
|
@ -528,7 +531,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
final String trust = n.valueOf("./oaf:trust");
|
||||
|
||||
return dataInfo(
|
||||
deletedbyinference, inferenceprovenance, inferred, false,
|
||||
deletedbyinference, inferenceprovenance, inferred, invisible,
|
||||
qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
|
||||
}
|
||||
|
||||
|
|
|
@ -39,6 +39,8 @@ import eu.dnetlib.dhp.schema.oaf.Project;
|
|||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class GenerateEntitiesApplication {
|
||||
|
@ -71,7 +73,8 @@ public class GenerateEntitiesApplication {
|
|||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
log.info("isLookupUrl: {}", isLookupUrl);
|
||||
|
||||
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupUrl);
|
||||
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
@ -137,10 +140,16 @@ public class GenerateEntitiesApplication {
|
|||
final String type = StringUtils.substringAfter(id, ":");
|
||||
|
||||
switch (type.toLowerCase()) {
|
||||
case "native_oaf":
|
||||
return new OafToOafMapper(vocs).processMdRecord(s);
|
||||
case "native_odf":
|
||||
return new OdfToOafMapper(vocs).processMdRecord(s);
|
||||
case "oaf-store-cleaned":
|
||||
case "oaf-store-claim":
|
||||
return new OafToOafMapper(vocs, false).processMdRecord(s);
|
||||
case "odf-store-cleaned":
|
||||
case "odf-store-claim":
|
||||
return new OdfToOafMapper(vocs, false).processMdRecord(s);
|
||||
case "oaf-store-intersection":
|
||||
return new OafToOafMapper(vocs, true).processMdRecord(s);
|
||||
case "odf-store-intersection":
|
||||
return new OdfToOafMapper(vocs, true).processMdRecord(s);
|
||||
case "datasource":
|
||||
return Arrays.asList(convertFromJson(s, Datasource.class));
|
||||
case "organization":
|
||||
|
|
|
@ -71,6 +71,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
|
|||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
|
||||
public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable {
|
||||
|
||||
|
@ -151,7 +152,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
super(hdfsPath);
|
||||
this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
|
||||
this.lastUpdateTimestamp = new Date().getTime();
|
||||
this.vocs = VocabularyGroup.loadVocsFromIS(isLookupUrl);
|
||||
this.vocs = VocabularyGroup.loadVocsFromIS(ISLookupClientFactory.getLookUpService(isLookupUrl));
|
||||
}
|
||||
|
||||
public void execute(final String sqlFile, final Function<ResultSet, List<Oaf>> producer)
|
||||
|
|
|
@ -26,8 +26,7 @@ public class MigrateMongoMdstoresApplication extends AbstractMigrationApplicatio
|
|||
IOUtils
|
||||
.toString(
|
||||
MigrateMongoMdstoresApplication.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/migrate_mongo_mstores_parameters.json")));
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_mongo_mstores_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String mongoBaseUrl = parser.get("mongoBaseUrl");
|
||||
|
@ -60,7 +59,7 @@ public class MigrateMongoMdstoresApplication extends AbstractMigrationApplicatio
|
|||
final String currentColl = entry.getValue();
|
||||
|
||||
for (final String xml : mdstoreClient.listRecords(currentColl)) {
|
||||
emit(xml, "native_" + format);
|
||||
emit(xml, String.format("%s-%s-%s", format, layout, interpretation));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,13 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw;
|
|||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
@ -37,8 +31,8 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|||
|
||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
public OafToOafMapper(final VocabularyGroup vocs) {
|
||||
super(vocs);
|
||||
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
|
||||
super(vocs, invisible);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -139,7 +133,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
instance
|
||||
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
|
||||
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
||||
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
||||
instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS));
|
||||
instance
|
||||
.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||
instance
|
||||
|
@ -281,12 +275,12 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
res
|
||||
.add(
|
||||
getRelation(
|
||||
docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info,
|
||||
docId, otherId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, collectedFrom, info,
|
||||
lastUpdateTimestamp));
|
||||
res
|
||||
.add(
|
||||
getRelation(
|
||||
otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info,
|
||||
otherId, docId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, collectedFrom, info,
|
||||
lastUpdateTimestamp));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,19 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw;
|
|||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_RESOURCE;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTS;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PART_OF;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENTED_BY;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENT_TO;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.PART;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.SUPPLEMENT;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
@ -44,8 +32,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
|
||||
public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";
|
||||
|
||||
public OdfToOafMapper(final VocabularyGroup vocs) {
|
||||
super(vocs);
|
||||
public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
|
||||
super(vocs, invisible);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -129,7 +117,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
instance
|
||||
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
|
||||
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
||||
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info));
|
||||
instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS));
|
||||
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
||||
instance
|
||||
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
||||
|
|
|
@ -60,6 +60,10 @@ public class OafMapperUtils {
|
|||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static Qualifier unknown(final String schemeid, final String schemename) {
|
||||
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
|
||||
}
|
||||
|
||||
public static Qualifier qualifier(
|
||||
final String classid,
|
||||
final String classname,
|
||||
|
|
|
@ -4,14 +4,29 @@ package eu.dnetlib.dhp.oa.graph.raw.common;
|
|||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
|
||||
public class Vocabulary implements Serializable {
|
||||
|
||||
private final String id;
|
||||
private final String name;
|
||||
|
||||
/**
|
||||
* Code to Term mappings for this Vocabulary.
|
||||
*/
|
||||
private final Map<String, VocabularyTerm> terms = new HashMap<>();
|
||||
|
||||
/**
|
||||
* Synonym to Code mappings for this Vocabulary.
|
||||
*/
|
||||
private final Map<String, String> synonyms = Maps.newHashMap();
|
||||
|
||||
public Vocabulary(final String id, final String name) {
|
||||
this.id = id;
|
||||
this.name = name;
|
||||
|
@ -30,7 +45,7 @@ public class Vocabulary implements Serializable {
|
|||
}
|
||||
|
||||
public VocabularyTerm getTerm(final String id) {
|
||||
return terms.get(id.toLowerCase());
|
||||
return Optional.ofNullable(id).map(s -> s.toLowerCase()).map(s -> terms.get(s)).orElse(null);
|
||||
}
|
||||
|
||||
protected void addTerm(final String id, final String name) {
|
||||
|
@ -40,4 +55,32 @@ public class Vocabulary implements Serializable {
|
|||
protected boolean termExists(final String id) {
|
||||
return terms.containsKey(id.toLowerCase());
|
||||
}
|
||||
|
||||
protected void addSynonym(final String syn, final String termCode) {
|
||||
synonyms.put(syn, termCode.toLowerCase());
|
||||
}
|
||||
|
||||
public VocabularyTerm getTermBySynonym(final String syn) {
|
||||
return getTerm(synonyms.get(syn));
|
||||
}
|
||||
|
||||
public Qualifier getTermAsQualifier(final String termId) {
|
||||
if (StringUtils.isBlank(termId)) {
|
||||
return OafMapperUtils.unknown(getId(), getName());
|
||||
} else if (termExists(termId)) {
|
||||
final VocabularyTerm t = getTerm(termId);
|
||||
return OafMapperUtils.qualifier(t.getId(), t.getName(), getId(), getName());
|
||||
} else {
|
||||
return OafMapperUtils.qualifier(termId, termId, getId(), getName());
|
||||
}
|
||||
}
|
||||
|
||||
public Qualifier getSynonymAsQualifier(final String syn) {
|
||||
return Optional
|
||||
.ofNullable(getTermBySynonym(syn))
|
||||
.map(term -> getTermAsQualifier(term.getId()))
|
||||
.orElse(null);
|
||||
// .orElse(OafMapperUtils.unknown(getId(), getName()));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,33 +1,39 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
public class VocabularyGroup implements Serializable {
|
||||
|
||||
public static VocabularyGroup loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException {
|
||||
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
public static final String VOCABULARIES_XQUERY = "for $x in collection(' /db/DRIVER/VocabularyDSResources/VocabularyDSResourceType') \n"
|
||||
+
|
||||
"let $vocid := $x//VOCABULARY_NAME/@code\n" +
|
||||
"let $vocname := $x//VOCABULARY_NAME/text()\n" +
|
||||
"for $term in ($x//TERM)\n" +
|
||||
"return concat($vocid,' @=@ ',$vocname,' @=@ ',$term/@code,' @=@ ',$term/@english_name)";
|
||||
|
||||
final String xquery = IOUtils
|
||||
.toString(
|
||||
GenerateEntitiesApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery"));
|
||||
public static final String VOCABULARY_SYNONYMS_XQUERY = "for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType')\n"
|
||||
+
|
||||
"let $vocid := $x//VOCABULARY_NAME/@code\n" +
|
||||
"let $vocname := $x//VOCABULARY_NAME/text()\n" +
|
||||
"for $term in ($x//TERM)\n" +
|
||||
"for $syn in ($term//SYNONYM/@term)\n" +
|
||||
"return concat($vocid,' @=@ ',$term/@code,' @=@ ', $syn)\n";
|
||||
|
||||
public static VocabularyGroup loadVocsFromIS(ISLookUpService isLookUpService) throws ISLookUpException {
|
||||
|
||||
final VocabularyGroup vocs = new VocabularyGroup();
|
||||
|
||||
for (final String s : isLookUpService.quickSearchProfile(xquery)) {
|
||||
for (final String s : isLookUpService.quickSearchProfile(VOCABULARIES_XQUERY)) {
|
||||
final String[] arr = s.split("@=@");
|
||||
if (arr.length == 4) {
|
||||
final String vocId = arr[0].trim();
|
||||
|
@ -40,6 +46,19 @@ public class VocabularyGroup implements Serializable {
|
|||
}
|
||||
|
||||
vocs.addTerm(vocId, termId, termName);
|
||||
vocs.addSynonyms(vocId, termId, termId);
|
||||
}
|
||||
}
|
||||
|
||||
for (final String s : isLookUpService.quickSearchProfile(VOCABULARY_SYNONYMS_XQUERY)) {
|
||||
final String[] arr = s.split("@=@");
|
||||
if (arr.length == 3) {
|
||||
final String vocId = arr[0].trim();
|
||||
final String termId = arr[1].trim();
|
||||
final String syn = arr[2].trim();
|
||||
|
||||
vocs.addSynonyms(vocId, termId, syn);
|
||||
vocs.addSynonyms(vocId, termId, termId);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -66,16 +85,37 @@ public class VocabularyGroup implements Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
public Qualifier getTermAsQualifier(final String vocId, final String id) {
|
||||
if (StringUtils.isBlank(id)) {
|
||||
return OafMapperUtils.qualifier("UNKNOWN", "UNKNOWN", vocId, vocId);
|
||||
} else if (termExists(vocId, id)) {
|
||||
final Vocabulary v = vocs.get(vocId.toLowerCase());
|
||||
final VocabularyTerm t = v.getTerm(id);
|
||||
return OafMapperUtils.qualifier(t.getId(), t.getName(), v.getId(), v.getName());
|
||||
} else {
|
||||
return OafMapperUtils.qualifier(id, id, vocId, vocId);
|
||||
public Set<String> getTerms(String vocId) {
|
||||
if (!vocabularyExists(vocId)) {
|
||||
return new HashSet<>();
|
||||
}
|
||||
return vocs
|
||||
.get(vocId.toLowerCase())
|
||||
.getTerms()
|
||||
.values()
|
||||
.stream()
|
||||
.map(t -> t.getId())
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
public Qualifier lookup(String vocId, String id) {
|
||||
return Optional
|
||||
.ofNullable(getSynonymAsQualifier(vocId, id))
|
||||
.orElse(getTermAsQualifier(vocId, id));
|
||||
}
|
||||
|
||||
public Qualifier getTermAsQualifier(final String vocId, final String id) {
|
||||
if (vocabularyExists(vocId)) {
|
||||
return vocs.get(vocId.toLowerCase()).getTermAsQualifier(id);
|
||||
}
|
||||
return OafMapperUtils.qualifier(id, id, "", "");
|
||||
}
|
||||
|
||||
public Qualifier getSynonymAsQualifier(final String vocId, final String syn) {
|
||||
if (StringUtils.isBlank(vocId)) {
|
||||
return OafMapperUtils.unknown("", "");
|
||||
}
|
||||
return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn);
|
||||
}
|
||||
|
||||
public boolean termExists(final String vocId, final String id) {
|
||||
|
@ -86,4 +126,16 @@ public class VocabularyGroup implements Serializable {
|
|||
return vocs.containsKey(vocId.toLowerCase());
|
||||
}
|
||||
|
||||
private void addSynonyms(final String vocId, final String termId, final String syn) {
|
||||
String id = Optional
|
||||
.ofNullable(vocId)
|
||||
.map(s -> s.toLowerCase())
|
||||
.orElseThrow(
|
||||
() -> new IllegalArgumentException(String.format("empty vocabulary id for [term:%s, synonym:%s]")));
|
||||
Optional
|
||||
.ofNullable(vocs.get(id))
|
||||
.orElseThrow(() -> new IllegalArgumentException("missing vocabulary id: " + vocId))
|
||||
.addSynonym(syn, termId);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,281 @@
|
|||
<workflow-app name="clean graph" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<parameters>
|
||||
<property>
|
||||
<name>graphInputPath</name>
|
||||
<description>the input path to read graph content</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>graphOutputPath</name>
|
||||
<description>the target path to store cleaned graph</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>the address of the lookUp service</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="fork_clean_graph"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<fork name="fork_clean_graph">
|
||||
<path start="clean_publication"/>
|
||||
<path start="clean_dataset"/>
|
||||
<path start="clean_otherresearchproduct"/>
|
||||
<path start="clean_software"/>
|
||||
<path start="clean_datasource"/>
|
||||
<path start="clean_organization"/>
|
||||
<path start="clean_project"/>
|
||||
<path start="clean_relation"/>
|
||||
</fork>
|
||||
|
||||
<action name="clean_publication">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean publications</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
|
||||
<arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="clean_dataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean datasets</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
|
||||
<arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="clean_otherresearchproduct">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean otherresearchproducts</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
|
||||
<arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="clean_software">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean softwares</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
|
||||
<arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="clean_datasource">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean datasources</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
|
||||
<arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="clean_organization">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean organizations</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
|
||||
<arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="clean_project">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean projects</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
|
||||
<arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="clean_relation">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean relations</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
|
||||
<arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait_clean" to="End"/>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,32 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "in",
|
||||
"paramLongName": "inputPath",
|
||||
"paramDescription": "the path to the graph data dump to read",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "out",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the path to store the output graph",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "isu",
|
||||
"paramLongName": "isLookupUrl",
|
||||
"paramDescription": "url to the ISLookup Service",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "class",
|
||||
"paramLongName": "graphTableClassName",
|
||||
"paramDescription": "class name moelling the graph table",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -210,6 +210,23 @@
|
|||
<arg>--mdLayout</arg><arg>store</arg>
|
||||
<arg>--mdInterpretation</arg><arg>cleaned</arg>
|
||||
</java>
|
||||
<ok to="ImportOAF_invisible"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportOAF_invisible">
|
||||
<java>
|
||||
<prepare>
|
||||
<delete path="${contentPath}/oaf_records_invisible"/>
|
||||
</prepare>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>--hdfsPath</arg><arg>${contentPath}/oaf_records_invisible</arg>
|
||||
<arg>--mongoBaseUrl</arg><arg>${mongoURL}</arg>
|
||||
<arg>--mongoDb</arg><arg>${mongoDb}</arg>
|
||||
<arg>--mdFormat</arg><arg>OAF</arg>
|
||||
<arg>--mdLayout</arg><arg>store</arg>
|
||||
<arg>--mdInterpretation</arg><arg>intersection</arg>
|
||||
</java>
|
||||
<ok to="wait_import"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
@ -237,7 +254,7 @@
|
|||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePaths</arg><arg>${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims</arg>
|
||||
<arg>--sourcePaths</arg><arg>${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims,${contentPath}/oaf_records_invisible</arg>
|
||||
<arg>--targetPath</arg><arg>${workingDir}/entities_claim</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
</spark>
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType')
|
||||
let $vocid := $x//VOCABULARY_NAME/@code
|
||||
let $vocname := $x//VOCABULARY_NAME/text()
|
||||
for $term in ($x//TERM)
|
||||
for $syn in ($term//SYNONYM/@term)
|
||||
return concat($vocid,' @=@ ',$term/@code,' @=@ ', $syn)
|
|
@ -0,0 +1,106 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
public class CleaningFunctionTest {
|
||||
|
||||
public static final ObjectMapper MAPPER = new ObjectMapper();
|
||||
|
||||
@Mock
|
||||
private ISLookUpService isLookUpService;
|
||||
|
||||
private VocabularyGroup vocabularies;
|
||||
|
||||
private CleaningRuleMap mapping;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws ISLookUpException, IOException {
|
||||
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
|
||||
lenient()
|
||||
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
|
||||
.thenReturn(synonyms());
|
||||
|
||||
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
|
||||
mapping = CleaningRuleMap.create(vocabularies);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCleaning() throws Exception {
|
||||
|
||||
assertNotNull(vocabularies);
|
||||
assertNotNull(mapping);
|
||||
|
||||
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
|
||||
Publication p_in = MAPPER.readValue(json, Publication.class);
|
||||
|
||||
Publication p_out = OafCleaner.apply(p_in, mapping);
|
||||
|
||||
assertNotNull(p_out);
|
||||
|
||||
assertEquals("eng", p_out.getLanguage().getClassid());
|
||||
assertEquals("English", p_out.getLanguage().getClassname());
|
||||
|
||||
assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
|
||||
assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());
|
||||
|
||||
assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid());
|
||||
assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname());
|
||||
|
||||
Set<String> pidTerms = vocabularies.getTerms("dnet:pid_types");
|
||||
assertTrue(
|
||||
p_out
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(p -> p.getQualifier())
|
||||
.allMatch(q -> pidTerms.contains(q.getClassid())));
|
||||
|
||||
// TODO add more assertions to verity the cleaned values
|
||||
System.out.println(MAPPER.writeValueAsString(p_out));
|
||||
|
||||
/*
|
||||
* assertTrue( p_out .getPid() .stream() .allMatch(sp -> StringUtils.isNotBlank(sp.getValue())));
|
||||
*/
|
||||
}
|
||||
|
||||
private Stream<Qualifier> getAuthorPidTypes(Publication pub) {
|
||||
return pub
|
||||
.getAuthor()
|
||||
.stream()
|
||||
.map(a -> a.getPid())
|
||||
.flatMap(p -> p.stream())
|
||||
.map(s -> s.getQualifier());
|
||||
}
|
||||
|
||||
private List<String> vocs() throws IOException {
|
||||
return IOUtils
|
||||
.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
|
||||
}
|
||||
|
||||
private List<String> synonyms() throws IOException {
|
||||
return IOUtils
|
||||
.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
|
||||
}
|
||||
}
|
|
@ -2,6 +2,7 @@
|
|||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.mockito.ArgumentMatchers.anyString;
|
||||
|
@ -55,7 +56,7 @@ public class MappersTest {
|
|||
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml"));
|
||||
|
||||
final List<Oaf> list = new OafToOafMapper(vocs).processMdRecord(xml);
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, false).processMdRecord(xml);
|
||||
|
||||
assertEquals(3, list.size());
|
||||
assertTrue(list.get(0) instanceof Publication);
|
||||
|
@ -69,6 +70,7 @@ public class MappersTest {
|
|||
assertValidId(p.getId());
|
||||
assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
||||
assertFalse(p.getDataInfo().getInvisible());
|
||||
|
||||
assertTrue(p.getAuthor().size() > 0);
|
||||
final Optional<Author> author = p
|
||||
|
@ -134,11 +136,27 @@ public class MappersTest {
|
|||
// System.out.println(new ObjectMapper().writeValueAsString(r2));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testPublicationInvisible() throws IOException {
|
||||
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml"));
|
||||
|
||||
final List<Oaf> list = new OafToOafMapper(vocs, true).processMdRecord(xml);
|
||||
|
||||
assertTrue(list.size() > 0);
|
||||
assertTrue(list.get(0) instanceof Publication);
|
||||
|
||||
final Publication p = (Publication) list.get(0);
|
||||
|
||||
assertTrue(p.getDataInfo().getInvisible());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testDataset() throws IOException {
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml"));
|
||||
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs).processMdRecord(xml);
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
|
||||
|
||||
assertEquals(3, list.size());
|
||||
assertTrue(list.get(0) instanceof Dataset);
|
||||
|
@ -220,7 +238,7 @@ public class MappersTest {
|
|||
void testSoftware() throws IOException {
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml"));
|
||||
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs).processMdRecord(xml);
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false).processMdRecord(xml);
|
||||
|
||||
assertEquals(1, list.size());
|
||||
assertTrue(list.get(0) instanceof Software);
|
||||
|
|
|
@ -0,0 +1,757 @@
|
|||
{
|
||||
"author": [
|
||||
{
|
||||
"affiliation": [
|
||||
],
|
||||
"fullname": "Brien, Tom",
|
||||
"name": "Tom",
|
||||
"pid": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "ORCID12",
|
||||
"classname": "ORCID12",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "0000-0001-9613-6639"
|
||||
}
|
||||
],
|
||||
"rank": 1,
|
||||
"surname": "Brien"
|
||||
},
|
||||
{
|
||||
"affiliation": [
|
||||
],
|
||||
"fullname": "Ade, Peter",
|
||||
"name": "Peter",
|
||||
"pid": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "xyz",
|
||||
"classname": "XYZ",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "qwerty"
|
||||
}
|
||||
],
|
||||
"rank": 2,
|
||||
"surname": "Ade"
|
||||
},
|
||||
{
|
||||
"affiliation": [
|
||||
],
|
||||
"fullname": "Barry, Peter S.",
|
||||
"name": "Peter S.",
|
||||
"pid": [
|
||||
],
|
||||
"rank": 3,
|
||||
"surname": "Barry"
|
||||
},
|
||||
{
|
||||
"affiliation": [
|
||||
],
|
||||
"fullname": "Dunscombe, Chris J.",
|
||||
"name": "Chris J.",
|
||||
"pid": [
|
||||
],
|
||||
"rank": 4,
|
||||
"surname": "Dunscombe"
|
||||
},
|
||||
{
|
||||
"affiliation": [
|
||||
],
|
||||
"fullname": "Leadley, David R.",
|
||||
"name": "David R.",
|
||||
"pid": [
|
||||
],
|
||||
"rank": 5,
|
||||
"surname": "Leadley"
|
||||
},
|
||||
{
|
||||
"affiliation": [
|
||||
],
|
||||
"fullname": "Morozov, Dmitry V.",
|
||||
"name": "Dmitry V.",
|
||||
"pid": [
|
||||
],
|
||||
"rank": 6,
|
||||
"surname": "Morozov"
|
||||
},
|
||||
{
|
||||
"affiliation": [
|
||||
],
|
||||
"fullname": "Myronov, Maksym",
|
||||
"name": "Maksym",
|
||||
"pid": [
|
||||
],
|
||||
"rank": 7,
|
||||
"surname": "Myronov"
|
||||
},
|
||||
{
|
||||
"affiliation": [
|
||||
],
|
||||
"fullname": "Parker, Evan",
|
||||
"name": "Evan",
|
||||
"pid": [
|
||||
],
|
||||
"rank": 8,
|
||||
"surname": "Parker"
|
||||
},
|
||||
{
|
||||
"affiliation": [
|
||||
],
|
||||
"fullname": "Prest, Martin J.",
|
||||
"name": "Martin J.",
|
||||
"pid": [
|
||||
],
|
||||
"rank": 9,
|
||||
"surname": "Prest"
|
||||
},
|
||||
{
|
||||
"affiliation": [
|
||||
],
|
||||
"fullname": "Prunnila, Mika",
|
||||
"name": "Mika",
|
||||
"pid": [
|
||||
],
|
||||
"rank": 10,
|
||||
"surname": "Prunnila"
|
||||
},
|
||||
{
|
||||
"affiliation": [
|
||||
],
|
||||
"fullname": "Sudiwala, Rashmi V.",
|
||||
"name": "Rashmi V.",
|
||||
"pid": [
|
||||
],
|
||||
"rank": 11,
|
||||
"surname": "Sudiwala"
|
||||
},
|
||||
{
|
||||
"affiliation": [
|
||||
],
|
||||
"fullname": "Whall, Terry E.",
|
||||
"name": "Terry E.",
|
||||
"pid": [
|
||||
],
|
||||
"rank": 12,
|
||||
"surname": "Whall"
|
||||
},
|
||||
{
|
||||
"affiliation": [
|
||||
],
|
||||
"fullname": "Mauskopf",
|
||||
"name": "",
|
||||
"pid": [
|
||||
],
|
||||
"rank": 13,
|
||||
"surname": ""
|
||||
},
|
||||
{
|
||||
"affiliation": [
|
||||
],
|
||||
"fullname": " P. D. ",
|
||||
"name": "",
|
||||
"pid": [
|
||||
],
|
||||
"rank": 14,
|
||||
"surname": ""
|
||||
}
|
||||
],
|
||||
"bestaccessright": {
|
||||
"classid": "CLOSED",
|
||||
"classname": "Closed Access",
|
||||
"schemeid": "dnet:access_modes",
|
||||
"schemename": "dnet:access_modes"
|
||||
},
|
||||
"collectedfrom": [
|
||||
{
|
||||
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
|
||||
"value": "VIRTA"
|
||||
}
|
||||
],
|
||||
"context": [
|
||||
],
|
||||
"contributor": [
|
||||
],
|
||||
"country": [
|
||||
],
|
||||
"coverage": [
|
||||
],
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"dateofacceptance": {
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"value": "2016-01-01"
|
||||
},
|
||||
"dateofcollection": "",
|
||||
"dateoftransformation": "2020-04-22T12:34:08.009Z",
|
||||
"description": [
|
||||
],
|
||||
"externalReference": [
|
||||
],
|
||||
"extraInfo": [
|
||||
],
|
||||
"format": [
|
||||
],
|
||||
"fulltext": [
|
||||
],
|
||||
"id": "50|CSC_________::2250a70c903c6ac6e4c01438259e9375",
|
||||
"instance": [
|
||||
{
|
||||
"accessright": {
|
||||
"classid": "CLOSED",
|
||||
"classname": "CLOSED",
|
||||
"schemeid": "dnet:access_modes",
|
||||
"schemename": "dnet:access_modes"
|
||||
},
|
||||
"collectedfrom": {
|
||||
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
|
||||
"value": "VIRTA"
|
||||
},
|
||||
"dateofacceptance": {
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"value": "2016-01-01"
|
||||
},
|
||||
"distributionlocation": "",
|
||||
"hostedby": {
|
||||
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
|
||||
"value": "VIRTA"
|
||||
},
|
||||
"instancetype": {
|
||||
"classid": "Comentario",
|
||||
"classname": "Comentario",
|
||||
"schemeid": "dnet:publication_resource",
|
||||
"schemename": "dnet:publication_resource"
|
||||
},
|
||||
"url": [
|
||||
"http://juuli.fi/Record/0275158616",
|
||||
"http://dx.doi.org/10.1007/s109090161569x"
|
||||
]
|
||||
}
|
||||
],
|
||||
"journal": {
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"edition": "",
|
||||
"ep": " 7",
|
||||
"iss": "9 March",
|
||||
"issnLinking": "",
|
||||
"issnOnline": "",
|
||||
"issnPrinted": "0022-2291",
|
||||
"name": "Journal of Low Temperature Physics - Early Acces",
|
||||
"sp": "1 ",
|
||||
"vol": ""
|
||||
},
|
||||
"language": {
|
||||
"classid": "en",
|
||||
"classname": "en",
|
||||
"schemeid": "dnet:languages",
|
||||
"schemename": "dnet:languages"
|
||||
},
|
||||
"lastupdatetimestamp": 1591283286319,
|
||||
"oaiprovenance": {
|
||||
"originDescription": {
|
||||
"altered": true,
|
||||
"baseURL": "https%3A%2F%2Fvirta-jtp.csc.fi%2Fapi%2Fcerif",
|
||||
"datestamp": "2019-07-30",
|
||||
"harvestDate": "2020-04-22T11:04:38.685Z",
|
||||
"identifier": "oai:virta-jtp.csc.fi:Publications/0275158616",
|
||||
"metadataNamespace": ""
|
||||
}
|
||||
},
|
||||
"originalId": [
|
||||
"CSC_________::2250a70c903c6ac6e4c01438259e9375"
|
||||
],
|
||||
"pid": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "doi",
|
||||
"classname": "doi",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "10.1007/s109090161569x"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "doi",
|
||||
"classname": "doi",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": "10.1007/s109090161569x"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "doi",
|
||||
"classname": "doi",
|
||||
"schemeid": "dnet:pid_types",
|
||||
"schemename": "dnet:pid_types"
|
||||
},
|
||||
"value": ""
|
||||
}
|
||||
],
|
||||
"relevantdate": [
|
||||
],
|
||||
"resourcetype": {
|
||||
"classid": "0001",
|
||||
"classname": "0001",
|
||||
"schemeid": "dnet:dataCite_resource",
|
||||
"schemename": "dnet:dataCite_resource"
|
||||
},
|
||||
"resulttype": {
|
||||
"classid": "publication",
|
||||
"classname": "publication",
|
||||
"schemeid": "dnet:result_typologies",
|
||||
"schemename": "dnet:result_typologies"
|
||||
},
|
||||
"source": [
|
||||
],
|
||||
"subject": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"value": "ta213"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"value": "infrared detectors"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"value": "lens antennas"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"value": "silicon"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"value": "slot antennas"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"value": "strained silicon"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"value": "cold electron bolometers"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"value": "doped silicon"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"value": "measure noise"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"value": "noise equivalent power"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"value": "optical characterisation"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"value": "optical response"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"value": "photon noise"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
},
|
||||
"value": "silicon absorbers"
|
||||
}
|
||||
],
|
||||
"title": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "main title",
|
||||
"classname": "main title",
|
||||
"schemeid": "dnet:dataCite_title",
|
||||
"schemename": "dnet:dataCite_title"
|
||||
},
|
||||
"value": "Optical response of strained- and unstrained-silicon cold-electron bolometers"
|
||||
}
|
||||
]
|
||||
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue