Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop into orcid-no-doi

pull/42/head
Enrico Ottonello 4 years ago
commit c5e149c46e

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-assembly-resources</artifactId>

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId>

@ -5,7 +5,7 @@
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-code-style</artifactId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
<packaging>jar</packaging>

@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<artifactId>dhp-build</artifactId>
<packaging>pom</packaging>

@ -5,7 +5,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
<relativePath>../</relativePath>
</parent>

@ -5,7 +5,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
<relativePath>../</relativePath>
</parent>

@ -1,6 +1,10 @@
package eu.dnetlib.dhp.schema.common;
import java.security.Key;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
public class ModelConstants {
@ -95,6 +99,9 @@ public class ModelConstants {
SYSIMPORT_CROSSWALK_ENTITYREGISTRY, SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS);
public static final KeyValue UNKNOWN_REPOSITORY = keyValue(
"10|openaire____::55045bd2a65019fd8e6741a755395c8c", "Unknown Repository");
private static Qualifier qualifier(
final String classid,
final String classname,
@ -107,4 +114,12 @@ public class ModelConstants {
q.setSchemename(schemename);
return q;
}
private static KeyValue keyValue(String key, String value) {
KeyValue kv = new KeyValue();
kv.setKey(key);
kv.setValue(value);
kv.setDataInfo(new DataInfo());
return kv;
}
}

@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<artifactId>dhp-actionmanager</artifactId>

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.actionmanager.promote;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass;
import java.io.IOException;
import java.util.Objects;
import java.util.Optional;
import java.util.function.BiFunction;
@ -20,6 +21,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier;
@ -134,24 +136,39 @@ public class PromoteActionPayloadForGraphTableJob {
.map(
(MapFunction<String, G>) value -> OBJECT_MAPPER.readValue(value, rowClazz),
Encoders.bean(rowClazz));
/*
* return spark .read() .parquet(path) .as(Encoders.bean(rowClazz));
*/
}
private static <A extends Oaf> Dataset<A> readActionPayload(
SparkSession spark, String path, Class<A> actionPayloadClazz) {
logger.info("Reading action payload from path: {}", path);
return spark
.read()
.parquet(path)
.map((MapFunction<Row, String>) value -> extractPayload(value), Encoders.STRING())
.map(
(MapFunction<Row, A>) value -> OBJECT_MAPPER
.readValue(value.<String> getAs("payload"), actionPayloadClazz),
(MapFunction<String, A>) value -> decodePayload(actionPayloadClazz, value),
Encoders.bean(actionPayloadClazz));
}
private static String extractPayload(Row value) {
try {
return value.<String> getAs("payload");
} catch (IllegalArgumentException | ClassCastException e) {
logger.error("cannot extract payload from action: {}", value.toString());
throw e;
}
}
private static <A extends Oaf> A decodePayload(Class<A> actionPayloadClazz, String payload) throws IOException {
try {
return OBJECT_MAPPER.readValue(payload, actionPayloadClazz);
} catch (UnrecognizedPropertyException e) {
logger.error("error decoding payload: {}", payload);
throw e;
}
}
private static <G extends Oaf, A extends Oaf> Dataset<G> promoteActionPayloadForGraphTable(
Dataset<G> rowDS,
Dataset<A> actionPayloadDS,

@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<artifactId>dhp-aggregation</artifactId>

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
@ -53,7 +53,7 @@
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaire-broker-common</artifactId>
<version>[3.0.1,4.0.0)</version>
<version>[3.0.3,4.0.0)</version>
</dependency>
</dependencies>

@ -2,7 +2,6 @@
package eu.dnetlib.dhp.broker.model;
import java.io.Serializable;
import java.util.Map;
public class Event implements Serializable {
@ -25,7 +24,7 @@ public class Event implements Serializable {
private boolean instantMessage;
private Map<String, Object> map;
private MappedFields map;
public Event() {
}
@ -33,7 +32,7 @@ public class Event implements Serializable {
public Event(final String producerId, final String eventId, final String topic, final String payload,
final Long creationDate, final Long expiryDate,
final boolean instantMessage,
final Map<String, Object> map) {
final MappedFields map) {
this.producerId = producerId;
this.eventId = eventId;
this.topic = topic;
@ -100,11 +99,11 @@ public class Event implements Serializable {
this.instantMessage = instantMessage;
}
public Map<String, Object> getMap() {
public MappedFields getMap() {
return this.map;
}
public void setMap(final Map<String, Object> map) {
public void setMap(final MappedFields map) {
this.map = map;
}
}

@ -3,15 +3,14 @@ package eu.dnetlib.dhp.broker.model;
import java.text.ParseException;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateUtils;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
public class EventFactory {
@ -30,10 +29,10 @@ public class EventFactory {
final Event res = new Event();
final Map<String, Object> map = createMapFromResult(updateInfo);
final MappedFields map = createMapFromResult(updateInfo);
final String eventId = calculateEventId(
updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId(), updateInfo.getHighlightValueAsString());
updateInfo.getTopicPath(), updateInfo.getTarget().getOpenaireId(), updateInfo.getHighlightValueAsString());
res.setEventId(eventId);
res.setProducerId(PRODUCER_ID);
@ -46,35 +45,35 @@ public class EventFactory {
return res;
}
private static Map<String, Object> createMapFromResult(final UpdateInfo<?> updateInfo) {
final Map<String, Object> map = new HashMap<>();
private static MappedFields createMapFromResult(final UpdateInfo<?> updateInfo) {
final MappedFields map = new MappedFields();
final OpenaireBrokerResult source = updateInfo.getSource();
final OpenaireBrokerResult target = updateInfo.getTarget();
final OaBrokerMainEntity source = updateInfo.getSource();
final OaBrokerMainEntity target = updateInfo.getTarget();
map.put("target_datasource_id", target.getCollectedFromId());
map.put("target_datasource_name", target.getCollectedFromName());
map.setTargetDatasourceId(target.getCollectedFromId());
map.setTargetDatasourceName(target.getCollectedFromName());
map.put("target_publication_id", target.getOriginalId());
map.setTargetResultId(target.getOpenaireId());
final List<String> titles = target.getTitles();
if (titles.size() > 0) {
map.put("target_publication_title", titles.get(0));
map.setTargetResultTitle(titles.get(0));
}
final long date = parseDateTolong(target.getPublicationdate());
if (date > 0) {
map.put("target_dateofacceptance", date);
map.setTargetDateofacceptance(date);
}
map.put("target_publication_subject_list", target.getSubjects());
map.put("target_publication_author_list", target.getCreators());
map.setTargetSubjects(target.getSubjects().stream().map(s -> s.getValue()).collect(Collectors.toList()));
map.setTargetAuthors(target.getCreators().stream().map(a -> a.getFullname()).collect(Collectors.toList()));
// PROVENANCE INFO
map.put("trust", updateInfo.getTrust());
map.put("provenance_datasource_id", source.getCollectedFromId());
map.put("provenance_datasource_name", source.getCollectedFromName());
map.put("provenance_publication_id_list", source.getOriginalId());
map.setTrust(updateInfo.getTrust());
map.setProvenanceDatasourceId(source.getCollectedFromId());
map.setProvenanceDatasourceName(source.getCollectedFromName());
map.setProvenanceResultId(source.getOpenaireId());
return map;
}

@ -0,0 +1,114 @@
package eu.dnetlib.dhp.broker.model;
import java.io.Serializable;
import java.util.List;
public class MappedFields implements Serializable {
/**
*
*/
private static final long serialVersionUID = -7999704113195802008L;
private String targetDatasourceId;
private String targetDatasourceName;
private String targetResultId;
private String targetResultTitle;
private long targetDateofacceptance;
private List<String> targetSubjects;
private List<String> targetAuthors;
private float trust;
private String provenanceDatasourceId;
private String provenanceDatasourceName;
private String provenanceResultId;
public String getTargetDatasourceId() {
return targetDatasourceId;
}
public void setTargetDatasourceId(final String targetDatasourceId) {
this.targetDatasourceId = targetDatasourceId;
}
public String getTargetDatasourceName() {
return targetDatasourceName;
}
public void setTargetDatasourceName(final String targetDatasourceName) {
this.targetDatasourceName = targetDatasourceName;
}
public String getTargetResultId() {
return targetResultId;
}
public void setTargetResultId(final String targetResultId) {
this.targetResultId = targetResultId;
}
public String getTargetResultTitle() {
return targetResultTitle;
}
public void setTargetResultTitle(final String targetResultTitle) {
this.targetResultTitle = targetResultTitle;
}
public long getTargetDateofacceptance() {
return targetDateofacceptance;
}
public void setTargetDateofacceptance(final long targetDateofacceptance) {
this.targetDateofacceptance = targetDateofacceptance;
}
public List<String> getTargetSubjects() {
return targetSubjects;
}
public void setTargetSubjects(final List<String> targetSubjects) {
this.targetSubjects = targetSubjects;
}
public List<String> getTargetAuthors() {
return targetAuthors;
}
public void setTargetAuthors(final List<String> targetAuthors) {
this.targetAuthors = targetAuthors;
}
public float getTrust() {
return trust;
}
public void setTrust(final float trust) {
this.trust = trust;
}
public String getProvenanceDatasourceId() {
return provenanceDatasourceId;
}
public void setProvenanceDatasourceId(final String provenanceDatasourceId) {
this.provenanceDatasourceId = provenanceDatasourceId;
}
public String getProvenanceDatasourceName() {
return provenanceDatasourceName;
}
public void setProvenanceDatasourceName(final String provenanceDatasourceName) {
this.provenanceDatasourceName = provenanceDatasourceName;
}
public String getProvenanceResultId() {
return provenanceResultId;
}
public void setProvenanceResultId(final String provenanceResultId) {
this.provenanceResultId = provenanceResultId;
}
}

@ -1,228 +0,0 @@
package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.TypedColumn;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.model.Event;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.EventFinder;
import eu.dnetlib.dhp.broker.oa.util.EventGroup;
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultAggregator;
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OpenaireBrokerResultAggregator;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedEntityFactory;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig;
import scala.Tuple2;
public class GenerateEventsApplication {
private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
GenerateEventsApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_broker_events.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
final String eventsPath = parser.get("eventsPath");
log.info("eventsPath: {}", eventsPath);
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final String dedupConfigProfileId = parser.get("dedupConfProfile");
log.info("dedupConfigProfileId: {}", dedupConfigProfileId);
final SparkConf conf = new SparkConf();
// conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
// conf.registerKryoClasses(BrokerConstants.getModelClasses());
// TODO UNCOMMENT
// final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId);
final DedupConfig dedupConfig = null;
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
removeOutputDir(spark, eventsPath);
// TODO REMOVE THIS
expandResultsWithRelations(spark, graphPath, Publication.class)
.write()
.mode(SaveMode.Overwrite)
.json(eventsPath);
// TODO UNCOMMENT THIS
// spark
// .emptyDataset(Encoders.bean(Event.class))
// .union(generateEvents(spark, graphPath, Publication.class, dedupConfig))
// .union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig))
// .union(generateEvents(spark, graphPath, Software.class, dedupConfig))
// .union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig))
// .write()
// .mode(SaveMode.Overwrite)
// .option("compression", "gzip")
// .json(eventsPath);
});
}
private static void removeOutputDir(final SparkSession spark, final String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
private static <SRC extends Result> Dataset<Event> generateEvents(
final SparkSession spark,
final String graphPath,
final Class<SRC> sourceClass,
final DedupConfig dedupConfig) {
final Dataset<OpenaireBrokerResult> results = expandResultsWithRelations(spark, graphPath, sourceClass);
final Dataset<Relation> mergedRels = readPath(spark, graphPath + "/relation", Relation.class)
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
final TypedColumn<Tuple2<OpenaireBrokerResult, Relation>, ResultGroup> aggr = new ResultAggregator()
.toColumn();
return results
.joinWith(mergedRels, results.col("openaireId").equalTo(mergedRels.col("source")), "inner")
.groupByKey(
(MapFunction<Tuple2<OpenaireBrokerResult, Relation>, String>) t -> t._2.getTarget(), Encoders.STRING())
.agg(aggr)
.map((MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class))
.filter(ResultGroup::isValid)
.map(
(MapFunction<ResultGroup, EventGroup>) g -> EventFinder.generateEvents(g, dedupConfig),
Encoders.bean(EventGroup.class))
.flatMap(group -> group.getData().iterator(), Encoders.bean(Event.class));
}
private static <SRC extends Result> Dataset<OpenaireBrokerResult> expandResultsWithRelations(
final SparkSession spark,
final String graphPath,
final Class<SRC> sourceClass) {
final Dataset<Project> projects = readPath(spark, graphPath + "/project", Project.class);
final Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> datasets = readPath(
spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
final Dataset<Software> softwares = readPath(spark, graphPath + "/software", Software.class);
final Dataset<Publication> publications = readPath(spark, graphPath + "/publication", Publication.class);
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
.cache();
final Dataset<OpenaireBrokerResult> r0 = readPath(
spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
.filter(r -> r.getDataInfo().getDeletedbyinference())
.map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OpenaireBrokerResult.class));
// TODO UNCOMMENT THIS
// final Dataset<OpenaireBrokerResult> r1 = join(r0, rels, relatedEntities(projects, rels,
// RelatedProject.class));
// final Dataset<OpenaireBrokerResult> r2 = join(r1, rels, relatedEntities(softwares, rels,
// RelatedSoftware.class));
// final Dataset<OpenaireBrokerResult> r3 = join(r2, rels, relatedEntities(datasets, rels,
// RelatedDataset.class));
// final Dataset<OpenaireBrokerResult> r4 = join(r3, rels, relatedEntities(publications, rels,
// RelatedPublication.class));;
return r0; // TODO it should be r4
}
private static <T, RT> Dataset<RT> relatedEntities(final Dataset<T> targets,
final Dataset<Relation> rels,
final Class<RT> clazz) {
return rels
.joinWith(targets, targets.col("id").equalTo(rels.col("target")), "inner")
.map(
t -> RelatedEntityFactory.newRelatedEntity(t._1.getSource(), t._1.getRelType(), t._2, clazz),
Encoders.bean(clazz));
}
private static <T> Dataset<OpenaireBrokerResult> join(final Dataset<OpenaireBrokerResult> sources,
final Dataset<Relation> rels,
final Dataset<T> typedRels) {
final TypedColumn<Tuple2<OpenaireBrokerResult, T>, OpenaireBrokerResult> aggr = new OpenaireBrokerResultAggregator<T>()
.toColumn();
;
return sources
.joinWith(typedRels, sources.col("openaireId").equalTo(rels.col("source")), "left_outer")
.groupByKey(
(MapFunction<Tuple2<OpenaireBrokerResult, T>, String>) t -> t._1.getOpenaireId(), Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.bean(OpenaireBrokerResult.class));
}
public static <R> Dataset<R> readPath(
final SparkSession spark,
final String inputPath,
final Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception {
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
final String conf = isLookUpService
.getResourceProfileByQuery(
String
.format(
"for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()",
profId));
final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class);
dedupConfig.getPace().initModel();
dedupConfig.getPace().initTranslationMap();
// dedupConfig.getWf().setConfigurationId("???");
return dedupConfig;
}
}

@ -0,0 +1,103 @@
package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.model.Event;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.EventFinder;
import eu.dnetlib.dhp.broker.oa.util.EventGroup;
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig;
public class GenerateEventsJob {
private static final Logger log = LoggerFactory.getLogger(GenerateEventsJob.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
GenerateEventsJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_events.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final String dedupConfigProfileId = parser.get("dedupConfProfile");
log.info("dedupConfigProfileId: {}", dedupConfigProfileId);
final String eventsPath = workingPath + "/events";
log.info("eventsPath: {}", eventsPath);
final SparkConf conf = new SparkConf();
// TODO UNCOMMENT
// final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId);
final DedupConfig dedupConfig = null;
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
ClusterUtils.removeDir(spark, eventsPath);
final Dataset<ResultGroup> groups = ClusterUtils
.readPath(spark, workingPath + "/duplicates", ResultGroup.class);
final Dataset<Event> events = groups
.map(
(MapFunction<ResultGroup, EventGroup>) g -> EventFinder.generateEvents(g, dedupConfig),
Encoders.bean(EventGroup.class))
.flatMap(group -> group.getData().iterator(), Encoders.bean(Event.class));
events.write().mode(SaveMode.Overwrite).json(eventsPath);
});
}
private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception {
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
final String conf = isLookUpService
.getResourceProfileByQuery(
String
.format(
"for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()",
profId));
final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class);
dedupConfig.getPace().initModel();
dedupConfig.getPace().initTranslationMap();
// dedupConfig.getWf().setConfigurationId("???");
return dedupConfig;
}
}

@ -0,0 +1,79 @@
package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.TypedColumn;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProjectAggregator;
import scala.Tuple2;
public class JoinStep1Job {
private static final Logger log = LoggerFactory.getLogger(JoinStep1Job.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
JoinStep1Job.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String joinedEntitiesPath = workingPath + "/joinedEntities_step1";
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
ClusterUtils.removeDir(spark, joinedEntitiesPath);
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
.readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class);
final Dataset<RelatedProject> typedRels = ClusterUtils
.readPath(spark, workingPath + "/relatedProjects", RelatedProject.class);
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedProject>, OaBrokerMainEntity> aggr = new RelatedProjectAggregator()
.toColumn();
sources
.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
.groupByKey(
(MapFunction<Tuple2<OaBrokerMainEntity, RelatedProject>, String>) t -> t._1.getOpenaireId(),
Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class))
.write()
.mode(SaveMode.Overwrite)
.json(joinedEntitiesPath);
});
}
}

@ -0,0 +1,79 @@
package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.TypedColumn;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftwareAggregator;
import scala.Tuple2;
public class JoinStep2Job {
private static final Logger log = LoggerFactory.getLogger(JoinStep2Job.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
JoinStep2Job.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String joinedEntitiesPath = workingPath + "/joinedEntities_step2";
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
ClusterUtils.removeDir(spark, joinedEntitiesPath);
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
.readPath(spark, workingPath + "/joinedEntities_step1", OaBrokerMainEntity.class);
final Dataset<RelatedSoftware> typedRels = ClusterUtils
.readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class);
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedSoftware>, OaBrokerMainEntity> aggr = new RelatedSoftwareAggregator()
.toColumn();
sources
.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
.groupByKey(
(MapFunction<Tuple2<OaBrokerMainEntity, RelatedSoftware>, String>) t -> t._1.getOpenaireId(),
Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class))
.write()
.mode(SaveMode.Overwrite)
.json(joinedEntitiesPath);
});
}
}

@ -0,0 +1,79 @@
package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.TypedColumn;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDatasetAggregator;
import scala.Tuple2;
public class JoinStep3Job {
private static final Logger log = LoggerFactory.getLogger(JoinStep3Job.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
JoinStep3Job.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String joinedEntitiesPath = workingPath + "/joinedEntities_step3";
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
ClusterUtils.removeDir(spark, joinedEntitiesPath);
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
.readPath(spark, workingPath + "/joinedEntities_step2", OaBrokerMainEntity.class);
final Dataset<RelatedDataset> typedRels = ClusterUtils
.readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class);
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDataset>, OaBrokerMainEntity> aggr = new RelatedDatasetAggregator()
.toColumn();
sources
.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
.groupByKey(
(MapFunction<Tuple2<OaBrokerMainEntity, RelatedDataset>, String>) t -> t._1.getOpenaireId(),
Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class))
.write()
.mode(SaveMode.Overwrite)
.json(joinedEntitiesPath);
});
}
}

@ -0,0 +1,79 @@
package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.TypedColumn;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublicationAggregator;
import scala.Tuple2;
public class JoinStep4Job {
private static final Logger log = LoggerFactory.getLogger(JoinStep4Job.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
JoinStep4Job.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String joinedEntitiesPath = workingPath + "/joinedEntities_step4";
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
ClusterUtils.removeDir(spark, joinedEntitiesPath);
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
.readPath(spark, workingPath + "/joinedEntities_step3", OaBrokerMainEntity.class);
final Dataset<RelatedPublication> typedRels = ClusterUtils
.readPath(spark, workingPath + "/relatedPublications", RelatedPublication.class);
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedPublication>, OaBrokerMainEntity> aggr = new RelatedPublicationAggregator()
.toColumn();
sources
.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
.groupByKey(
(MapFunction<Tuple2<OaBrokerMainEntity, RelatedPublication>, String>) t -> t._1.getOpenaireId(),
Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class))
.write()
.mode(SaveMode.Overwrite)
.json(joinedEntitiesPath);
});
}
}

@ -0,0 +1,88 @@
package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.TypedColumn;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultAggregator;
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
public class PrepareGroupsJob {
private static final Logger log = LoggerFactory.getLogger(PrepareGroupsJob.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
PrepareGroupsJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String groupsPath = workingPath + "/duplicates";
log.info("groupsPath: {}", groupsPath);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
ClusterUtils.removeDir(spark, groupsPath);
final Dataset<OaBrokerMainEntity> results = ClusterUtils
.readPath(spark, workingPath + "/joinedEntities_step4", OaBrokerMainEntity.class);
final Dataset<Relation> mergedRels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class)
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
final TypedColumn<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup> aggr = new ResultAggregator()
.toColumn();
final Dataset<ResultGroup> groups = results
.joinWith(mergedRels, results.col("openaireId").equalTo(mergedRels.col("source")), "inner")
.groupByKey(
(MapFunction<Tuple2<OaBrokerMainEntity, Relation>, String>) t -> t._2.getTarget(),
Encoders.STRING())
.agg(aggr)
.map(
(MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class))
.filter(rg -> rg.getData().size() > 1);
groups
.write()
.mode(SaveMode.Overwrite)
.json(groupsPath);
});
}
}

@ -0,0 +1,85 @@
package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.broker.objects.OaBrokerRelatedDataset;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class PrepareRelatedDatasetsJob {
private static final Logger log = LoggerFactory.getLogger(PrepareRelatedDatasetsJob.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
PrepareRelatedDatasetsJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String relsPath = workingPath + "/relatedDatasets";
log.info("relsPath: {}", relsPath);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
ClusterUtils.removeDir(spark, relsPath);
final Dataset<OaBrokerRelatedDataset> datasets = ClusterUtils
.readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class)
.filter(d -> !ClusterUtils.isDedupRoot(d.getId()))
.map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));
final Dataset<Relation> rels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class)
.filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
rels
.joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
.map(t -> {
final RelatedDataset rel = new RelatedDataset(t._1.getSource(), t._2);
rel.getRelDataset().setRelType(t._1.getRelClass());
return rel;
}, Encoders.bean(RelatedDataset.class))
.write()
.mode(SaveMode.Overwrite)
.json(relsPath);
});
}
}

@ -0,0 +1,82 @@
package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.broker.objects.OaBrokerProject;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class PrepareRelatedProjectsJob {
private static final Logger log = LoggerFactory.getLogger(PrepareRelatedProjectsJob.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
PrepareRelatedProjectsJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String relsPath = workingPath + "/relatedProjects";
log.info("relsPath: {}", relsPath);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
ClusterUtils.removeDir(spark, relsPath);
final Dataset<OaBrokerProject> projects = ClusterUtils
.readPath(spark, graphPath + "/project", Project.class)
.filter(p -> !ClusterUtils.isDedupRoot(p.getId()))
.map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class));
final Dataset<Relation> rels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class)
.filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
rels
.joinWith(projects, projects.col("openaireId").equalTo(rels.col("target")), "inner")
.map(t -> new RelatedProject(t._1.getSource(), t._2), Encoders.bean(RelatedProject.class))
.write()
.mode(SaveMode.Overwrite)
.json(relsPath);
});
}
}

@ -0,0 +1,88 @@
package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class PrepareRelatedPublicationsJob {
private static final Logger log = LoggerFactory.getLogger(PrepareRelatedPublicationsJob.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
PrepareRelatedPublicationsJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String relsPath = workingPath + "/relatedPublications";
log.info("relsPath: {}", relsPath);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
ClusterUtils.removeDir(spark, relsPath);
final Dataset<OaBrokerRelatedPublication> pubs = ClusterUtils
.readPath(spark, graphPath + "/publication", Publication.class)
.filter(p -> !ClusterUtils.isDedupRoot(p.getId()))
.map(
ConversionUtils::oafPublicationToBrokerPublication,
Encoders.bean(OaBrokerRelatedPublication.class));
final Dataset<Relation> rels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class)
.filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
rels
.joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
.map(t -> {
final RelatedPublication rel = new RelatedPublication(t._1.getSource(), t._2);
rel.getRelPublication().setRelType(t._1.getRelClass());
return rel;
}, Encoders.bean(RelatedPublication.class))
.write()
.mode(SaveMode.Overwrite)
.json(relsPath);
});
}
}

@ -0,0 +1,83 @@
package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
public class PrepareRelatedSoftwaresJob {
private static final Logger log = LoggerFactory.getLogger(PrepareRelatedSoftwaresJob.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
PrepareRelatedSoftwaresJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String relsPath = workingPath + "/relatedSoftwares";
log.info("relsPath: {}", relsPath);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
ClusterUtils.removeDir(spark, relsPath);
final Dataset<OaBrokerRelatedSoftware> softwares = ClusterUtils
.readPath(spark, graphPath + "/software", Software.class)
.filter(sw -> !ClusterUtils.isDedupRoot(sw.getId()))
.map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class));
final Dataset<Relation> rels = ClusterUtils
.readPath(spark, graphPath + "/relation", Relation.class)
.filter(r -> r.getDataInfo().getDeletedbyinference())
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
rels
.joinWith(softwares, softwares.col("openaireId").equalTo(rels.col("target")), "inner")
.map(t -> new RelatedSoftware(t._1.getSource(), t._2), Encoders.bean(RelatedSoftware.class))
.write()
.mode(SaveMode.Overwrite)
.json(relsPath);
});
}
}

@ -0,0 +1,82 @@
package eu.dnetlib.dhp.broker.oa;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
public class PrepareSimpleEntititiesJob {
private static final Logger log = LoggerFactory.getLogger(PrepareSimpleEntititiesJob.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
PrepareSimpleEntititiesJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String graphPath = parser.get("graphPath");
log.info("graphPath: {}", graphPath);
final String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
final String simpleEntitiesPath = workingPath + "/simpleEntities";
log.info("simpleEntitiesPath: {}", simpleEntitiesPath);
final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
ClusterUtils.removeDir(spark, simpleEntitiesPath);
prepareSimpleEntities(spark, graphPath, Publication.class)
.union(prepareSimpleEntities(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class))
.union(prepareSimpleEntities(spark, graphPath, Software.class))
.union(prepareSimpleEntities(spark, graphPath, OtherResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.json(simpleEntitiesPath);
});
}
private static <SRC extends Result> Dataset<OaBrokerMainEntity> prepareSimpleEntities(
final SparkSession spark,
final String graphPath,
final Class<SRC> sourceClass) {
return ClusterUtils
.readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
.filter(r -> !ClusterUtils.isDedupRoot(r.getId()))
.filter(r -> r.getDataInfo().getDeletedbyinference())
.map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class));
}
}

@ -1,74 +1,80 @@
package eu.dnetlib.dhp.broker.oa.matchers;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.pace.config.DedupConfig;
public abstract class UpdateMatcher<T> {
private final boolean multipleUpdate;
private final int maxNumber;
private final Function<T, Topic> topicFunction;
private final BiConsumer<OpenaireBrokerResult, T> compileHighlightFunction;
private final BiConsumer<OaBrokerMainEntity, T> compileHighlightFunction;
private final Function<T, String> highlightToStringFunction;
public UpdateMatcher(final boolean multipleUpdate, final Function<T, Topic> topicFunction,
final BiConsumer<OpenaireBrokerResult, T> compileHighlightFunction,
public UpdateMatcher(final int maxNumber, final Function<T, Topic> topicFunction,
final BiConsumer<OaBrokerMainEntity, T> compileHighlightFunction,
final Function<T, String> highlightToStringFunction) {
this.multipleUpdate = multipleUpdate;
this.maxNumber = maxNumber;
this.topicFunction = topicFunction;
this.compileHighlightFunction = compileHighlightFunction;
this.highlightToStringFunction = highlightToStringFunction;
}
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final OpenaireBrokerResult res,
final Collection<OpenaireBrokerResult> others,
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final OaBrokerMainEntity res,
final Collection<OaBrokerMainEntity> others,
final DedupConfig dedupConfig) {
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
for (final OpenaireBrokerResult source : others) {
for (final OaBrokerMainEntity source : others) {
if (source != res) {
for (final T hl : findDifferences(source, res)) {
final Topic topic = getTopicFunction().apply(hl);
final UpdateInfo<T> info = new UpdateInfo<>(topic, hl, source, res, getCompileHighlightFunction(),
getHighlightToStringFunction(), dedupConfig);
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
} else {
infoMap.put(s, info);
if (topic != null) {
final UpdateInfo<T> info = new UpdateInfo<>(topic, hl, source, res,
getCompileHighlightFunction(),
getHighlightToStringFunction(), dedupConfig);
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
infoMap.put(s, info);
}
}
}
}
}
final Collection<UpdateInfo<T>> values = infoMap.values();
if (values.isEmpty() || multipleUpdate) {
return values;
final List<UpdateInfo<T>> values = infoMap
.values()
.stream()
.sorted((o1, o2) -> Float.compare(o2.getTrust(), o1.getTrust())) // DESCENDING
.collect(Collectors.toList());
if (values.isEmpty()) {
return new ArrayList<>();
} else if (values.size() > maxNumber) {
System.err.println("Too many events (" + values.size() + ") matched by " + getClass().getSimpleName());
return values.subList(0, maxNumber);
} else {
final UpdateInfo<T> v = values
.stream()
.sorted((o1, o2) -> Float.compare(o1.getTrust(), o2.getTrust()))
.findFirst()
.get();
return Arrays.asList(v);
return values;
}
}
protected abstract List<T> findDifferences(OpenaireBrokerResult source, OpenaireBrokerResult target);
protected abstract List<T> findDifferences(OaBrokerMainEntity source, OaBrokerMainEntity target);
protected static boolean isMissing(final List<String> list) {
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0));
@ -78,15 +84,15 @@ public abstract class UpdateMatcher<T> {
return StringUtils.isBlank(field);
}
public boolean isMultipleUpdate() {
return multipleUpdate;
public int getMaxNumber() {
return maxNumber;
}
public Function<T, Topic> getTopicFunction() {
return topicFunction;
}
public BiConsumer<OpenaireBrokerResult, T> getCompileHighlightFunction() {
public BiConsumer<OaBrokerMainEntity, T> getCompileHighlightFunction() {
return compileHighlightFunction;
}

@ -5,39 +5,38 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.Dataset;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerRelatedDataset;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public abstract class AbstractEnrichMissingDataset
extends UpdateMatcher<Dataset> {
public abstract class AbstractEnrichMissingDataset extends UpdateMatcher<OaBrokerRelatedDataset> {
public AbstractEnrichMissingDataset(final Topic topic) {
super(true,
super(10,
rel -> topic,
(p, rel) -> p.getDatasets().add(rel),
rel -> rel.getOriginalId());
rel -> rel.getOpenaireId());
}
protected abstract boolean filterByType(String relType);
@Override
protected final List<Dataset> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
protected final List<OaBrokerRelatedDataset> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
final Set<String> existingDatasets = target
.getDatasets()
.stream()
.filter(rel -> filterByType(rel.getRelType()))
.map(Dataset::getOriginalId)
.map(OaBrokerRelatedDataset::getOpenaireId)
.collect(Collectors.toSet());
return source
.getDatasets()
.stream()
.filter(rel -> filterByType(rel.getRelType()))
.filter(d -> !existingDatasets.contains(d.getOriginalId()))
.filter(d -> !existingDatasets.contains(d.getOpenaireId()))
.collect(Collectors.toList());
}

@ -4,22 +4,22 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects;
import java.util.ArrayList;
import java.util.List;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.Project;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerProject;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMissingProject extends UpdateMatcher<Project> {
public class EnrichMissingProject extends UpdateMatcher<OaBrokerProject> {
public EnrichMissingProject() {
super(true,
super(20,
prj -> Topic.ENRICH_MISSING_PROJECT,
(p, prj) -> p.getProjects().add(prj),
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
}
@Override
protected List<Project> findDifferences(final OpenaireBrokerResult source, final OpenaireBrokerResult target) {
protected List<OaBrokerProject> findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) {
if (target.getProjects().isEmpty()) {
return source.getProjects();
} else {

@ -5,27 +5,27 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.Project;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerProject;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMoreProject extends UpdateMatcher<Project> {
public class EnrichMoreProject extends UpdateMatcher<OaBrokerProject> {
public EnrichMoreProject() {
super(true,
super(20,
prj -> Topic.ENRICH_MORE_PROJECT,
(p, prj) -> p.getProjects().add(prj),
prj -> projectAsString(prj));
}
private static String projectAsString(final Project prj) {
private static String projectAsString(final OaBrokerProject prj) {
return prj.getFunder() + "::" + prj.getFundingProgram() + "::" + prj.getCode();
}
@Override
protected List<eu.dnetlib.broker.objects.Project> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
protected List<OaBrokerProject> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
final Set<String> existingProjects = target
.getProjects()

@ -5,40 +5,40 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.Publication;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public abstract class AbstractEnrichMissingPublication extends UpdateMatcher<Publication> {
public abstract class AbstractEnrichMissingPublication extends UpdateMatcher<OaBrokerRelatedPublication> {
public AbstractEnrichMissingPublication(final Topic topic) {
super(true,
super(10,
rel -> topic,
(p, rel) -> p.getPublications().add(rel),
rel -> rel.getOriginalId());
rel -> rel.getOpenaireId());
}
protected abstract boolean filterByType(String relType);
@Override
protected final List<eu.dnetlib.broker.objects.Publication> findDifferences(
final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
protected final List<OaBrokerRelatedPublication> findDifferences(
final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
final Set<String> existingPublications = target
.getPublications()
.stream()
.filter(rel -> filterByType(rel.getRelType()))
.map(Publication::getOriginalId)
.map(OaBrokerRelatedPublication::getOpenaireId)
.collect(Collectors.toSet());
return source
.getPublications()
.stream()
.filter(rel -> filterByType(rel.getRelType()))
.filter(p -> !existingPublications.contains(p.getOriginalId()))
.filter(p -> !existingPublications.contains(p.getOpenaireId()))
.collect(Collectors.toList());
}

@ -4,24 +4,25 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware;
import java.util.ArrayList;
import java.util.List;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMissingSoftware
extends UpdateMatcher<eu.dnetlib.broker.objects.Software> {
extends UpdateMatcher<OaBrokerRelatedSoftware> {
public EnrichMissingSoftware() {
super(true,
super(10,
s -> Topic.ENRICH_MISSING_SOFTWARE,
(p, s) -> p.getSoftwares().add(s),
s -> s.getName());
s -> s.getOpenaireId());
}
@Override
protected List<eu.dnetlib.broker.objects.Software> findDifferences(
final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
protected List<OaBrokerRelatedSoftware> findDifferences(
final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
if (target.getSoftwares().isEmpty()) {
return source.getSoftwares();

@ -5,29 +5,29 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.Software;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMoreSoftware extends UpdateMatcher<Software> {
public class EnrichMoreSoftware extends UpdateMatcher<OaBrokerRelatedSoftware> {
public EnrichMoreSoftware() {
super(true,
super(10,
s -> Topic.ENRICH_MORE_SOFTWARE,
(p, s) -> p.getSoftwares().add(s),
s -> s.getName());
s -> s.getOpenaireId());
}
@Override
protected List<eu.dnetlib.broker.objects.Software> findDifferences(
final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
protected List<OaBrokerRelatedSoftware> findDifferences(
final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
final Set<String> existingSoftwares = source
.getSoftwares()
.stream()
.map(Software::getName)
.map(OaBrokerRelatedSoftware::getName)
.collect(Collectors.toSet());
return target

@ -5,21 +5,21 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMissingAbstract extends UpdateMatcher<String> {
public EnrichMissingAbstract() {
super(false,
super(1,
s -> Topic.ENRICH_MISSING_ABSTRACT,
(p, s) -> p.getAbstracts().add(s),
s -> s);
}
@Override
protected List<String> findDifferences(final OpenaireBrokerResult source, final OpenaireBrokerResult target) {
protected List<String> findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) {
if (isMissing(target.getAbstracts()) && !isMissing(source.getAbstracts())) {
return Arrays.asList(source.getAbstracts().get(0));
} else {

@ -7,28 +7,28 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.broker.objects.Author;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.OaBrokerAuthor;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMissingAuthorOrcid extends UpdateMatcher<Author> {
public class EnrichMissingAuthorOrcid extends UpdateMatcher<OaBrokerAuthor> {
public EnrichMissingAuthorOrcid() {
super(true,
super(40,
aut -> Topic.ENRICH_MISSING_AUTHOR_ORCID,
(p, aut) -> p.getCreators().add(aut),
aut -> aut.getOrcid());
}
@Override
protected List<Author> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
protected List<OaBrokerAuthor> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
final Set<String> existingOrcids = target
.getCreators()
.stream()
.map(Author::getOrcid)
.map(OaBrokerAuthor::getOrcid)
.filter(StringUtils::isNotBlank)
.collect(Collectors.toSet());

@ -5,28 +5,28 @@ import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.Instance;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.OaBrokerInstance;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
public class EnrichMissingOpenAccess extends UpdateMatcher<Instance> {
public class EnrichMissingOpenAccess extends UpdateMatcher<OaBrokerInstance> {
public EnrichMissingOpenAccess() {
super(true,
super(20,
i -> Topic.ENRICH_MISSING_OA_VERSION,
(p, i) -> p.getInstances().add(i),
Instance::getUrl);
OaBrokerInstance::getUrl);
}
@Override
protected List<Instance> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
protected List<OaBrokerInstance> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
final long count = target
.getInstances()
.stream()
.map(Instance::getLicense)
.map(OaBrokerInstance::getLicense)
.filter(right -> right.equals(BrokerConstants.OPEN_ACCESS))
.count();

@ -5,23 +5,23 @@ import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.TypedValue;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMissingPid extends UpdateMatcher<TypedValue> {
public class EnrichMissingPid extends UpdateMatcher<OaBrokerTypedValue> {
public EnrichMissingPid() {
super(true,
super(10,
pid -> Topic.ENRICH_MISSING_PID,
(p, pid) -> p.getPids().add(pid),
pid -> pid.getType() + "::" + pid.getValue());
}
@Override
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
final long count = target.getPids().size();
if (count > 0) {

@ -5,22 +5,22 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMissingPublicationDate extends UpdateMatcher<String> {
public EnrichMissingPublicationDate() {
super(false,
super(1,
date -> Topic.ENRICH_MISSING_PUBLICATION_DATE,
(p, date) -> p.setPublicationdate(date),
s -> s);
}
@Override
protected List<String> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
protected List<String> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
if (isMissing(target.getPublicationdate()) && !isMissing(source.getPublicationdate())) {
return Arrays.asList(source.getPublicationdate());

@ -5,23 +5,23 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.TypedValue;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMissingSubject extends UpdateMatcher<TypedValue> {
public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {
public EnrichMissingSubject() {
super(true,
super(20,
s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()),
(p, s) -> p.getSubjects().add(s),
s -> subjectAsString(s));
}
@Override
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
final Set<String> existingSubject = target
.getSubjects()
.stream()
@ -35,7 +35,7 @@ public class EnrichMissingSubject extends UpdateMatcher<TypedValue> {
.collect(Collectors.toList());
}
private static String subjectAsString(final TypedValue s) {
private static String subjectAsString(final OaBrokerTypedValue s) {
return s.getType() + "::" + s.getValue();
}

@ -5,24 +5,24 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.Instance;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.OaBrokerInstance;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
public class EnrichMoreOpenAccess extends UpdateMatcher<Instance> {
public class EnrichMoreOpenAccess extends UpdateMatcher<OaBrokerInstance> {
public EnrichMoreOpenAccess() {
super(true,
super(20,
i -> Topic.ENRICH_MORE_OA_VERSION,
(p, i) -> p.getInstances().add(i),
Instance::getUrl);
OaBrokerInstance::getUrl);
}
@Override
protected List<Instance> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
protected List<OaBrokerInstance> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
final Set<String> urls = target
.getInstances()
.stream()

@ -5,23 +5,23 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.TypedValue;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMorePid extends UpdateMatcher<TypedValue> {
public class EnrichMorePid extends UpdateMatcher<OaBrokerTypedValue> {
public EnrichMorePid() {
super(true,
super(20,
pid -> Topic.ENRICH_MORE_PID,
(p, pid) -> p.getPids().add(pid),
pid -> pidAsString(pid));
}
@Override
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
final Set<String> existingPids = target
.getPids()
.stream()
@ -35,7 +35,7 @@ public class EnrichMorePid extends UpdateMatcher<TypedValue> {
.collect(Collectors.toList());
}
private static String pidAsString(final TypedValue pid) {
private static String pidAsString(final OaBrokerTypedValue pid) {
return pid.getType() + "::" + pid.getValue();
}
}

@ -5,23 +5,24 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.TypedValue;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
public class EnrichMoreSubject extends UpdateMatcher<TypedValue> {
public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
public EnrichMoreSubject() {
super(true,
super(20,
s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()),
(p, s) -> p.getSubjects().add(s),
s -> subjectAsString(s));
}
@Override
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
final OpenaireBrokerResult target) {
protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
final Set<String> existingSubjects = target
.getSubjects()
.stream()
@ -35,7 +36,7 @@ public class EnrichMoreSubject extends UpdateMatcher<TypedValue> {
.collect(Collectors.toList());
}
private static String subjectAsString(final TypedValue s) {
private static String subjectAsString(final OaBrokerTypedValue s) {
return s.getType() + "::" + s.getValue();
}
}

@ -17,6 +17,8 @@ public class BrokerConstants {
public static final float MIN_TRUST = 0.25f;
public static final float MAX_TRUST = 1.00f;
public static final int MAX_NUMBER_OF_RELS = 20;
public static Class<?>[] getModelClasses() {
final Set<Class<?>> list = new HashSet<>();
list.addAll(Arrays.asList(ModelSupport.getOafModelClasses()));

@ -0,0 +1,47 @@
package eu.dnetlib.dhp.broker.oa.util;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.HdfsSupport;
public class ClusterUtils {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void createDirIfMissing(final SparkSession spark, final String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static void removeDir(final SparkSession spark, final String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
public static <R> Dataset<R> readPath(
final SparkSession spark,
final String inputPath,
final Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
public static boolean isDedupRoot(final String id) {
return id.contains("dedup_wf_");
}
public static final boolean isValidResultResultClass(final String s) {
return s.equals("isReferencedBy")
|| s.equals("isRelatedTo")
|| s.equals("references")
|| s.equals("isSupplementedBy")
|| s.equals("isSupplementedTo");
}
}

@ -3,18 +3,28 @@ package eu.dnetlib.dhp.broker.oa.util;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.TypedValue;
import com.google.common.base.Function;
import eu.dnetlib.broker.objects.OaBrokerAuthor;
import eu.dnetlib.broker.objects.OaBrokerExternalReference;
import eu.dnetlib.broker.objects.OaBrokerInstance;
import eu.dnetlib.broker.objects.OaBrokerJournal;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerProject;
import eu.dnetlib.broker.objects.OaBrokerRelatedDataset;
import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.ExternalReference;
@ -24,6 +34,7 @@ import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
@ -32,145 +43,148 @@ public class ConversionUtils {
private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class);
public static List<eu.dnetlib.broker.objects.Instance> oafInstanceToBrokerInstances(final Instance i) {
return i.getUrl().stream().map(url -> {
return new eu.dnetlib.broker.objects.Instance()
.setUrl(url)
.setInstancetype(i.getInstancetype().getClassid())
.setLicense(BrokerConstants.OPEN_ACCESS)
.setHostedby(i.getHostedby().getValue());
}).collect(Collectors.toList());
}
public static TypedValue oafPidToBrokerPid(final StructuredProperty sp) {
return sp != null ? new TypedValue()
.setValue(sp.getValue())
.setType(sp.getQualifier().getClassid()) : null;
}
public static final Pair<String, String> oafSubjectToPair(final StructuredProperty sp) {
return sp != null ? Pair.of(sp.getQualifier().getClassid(), sp.getValue()) : null;
}
public static final eu.dnetlib.broker.objects.Dataset oafDatasetToBrokerDataset(final Dataset d) {
return d != null ? new eu.dnetlib.broker.objects.Dataset()
.setOriginalId(d.getOriginalId().get(0))
.setTitle(structPropValue(d.getTitle()))
.setPids(d.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList()))
.setInstances(
d
.getInstance()
.stream()
.map(ConversionUtils::oafInstanceToBrokerInstances)
.flatMap(List::stream)
.collect(Collectors.toList()))
.setCollectedFrom(d.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null))
: null;
public static List<OaBrokerInstance> oafInstanceToBrokerInstances(final Instance i) {
if (i == null) {
return new ArrayList<>();
}
return mappedList(i.getUrl(), url -> {
final OaBrokerInstance res = new OaBrokerInstance();
res.setUrl(url);
res.setInstancetype(classId(i.getInstancetype()));
res.setLicense(BrokerConstants.OPEN_ACCESS);
res.setHostedby(kvValue(i.getHostedby()));
return res;
});
}
public static eu.dnetlib.broker.objects.Publication oafPublicationToBrokerPublication(final Publication p) {
return p != null ? new eu.dnetlib.broker.objects.Publication()
.setOriginalId(p.getOriginalId().get(0))
.setTitle(structPropValue(p.getTitle()))
.setPids(p.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList()))
.setInstances(
p
.getInstance()
.stream()
.map(ConversionUtils::oafInstanceToBrokerInstances)
.flatMap(List::stream)
.collect(Collectors.toList()))
.setCollectedFrom(p.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null))
: null;
public static OaBrokerTypedValue oafPidToBrokerPid(final StructuredProperty sp) {
return oafStructPropToBrokerTypedValue(sp);
}
public static OaBrokerTypedValue oafStructPropToBrokerTypedValue(final StructuredProperty sp) {
return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null;
}
public static final OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) {
if (d == null) {
return null;
}
final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset();
res.setOpenaireId(d.getId());
res.setOriginalId(first(d.getOriginalId()));
res.setTitle(structPropValue(d.getTitle()));
res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid));
res.setInstances(flatMappedList(d.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
res.setCollectedFrom(mappedFirst(d.getCollectedfrom(), KeyValue::getValue));
return res;
}
public static final OpenaireBrokerResult oafResultToBrokerResult(final Result result) {
return result != null ? new OpenaireBrokerResult()
.setOpenaireId(result.getId())
.setOriginalId(result.getOriginalId().get(0))
.setTypology(result.getResulttype().getClassid())
.setTitles(structPropList(result.getTitle()))
.setAbstracts(fieldList(result.getDescription()))
.setLanguage(result.getLanguage().getClassid())
.setSubjects(structPropTypedList(result.getSubject()))
.setCreators(
result.getAuthor().stream().map(ConversionUtils::oafAuthorToBrokerAuthor).collect(Collectors.toList()))
.setPublicationdate(result.getDateofacceptance().getValue())
.setPublisher(fieldValue(result.getPublisher()))
.setEmbargoenddate(fieldValue(result.getEmbargoenddate()))
.setContributor(fieldList(result.getContributor()))
public static OaBrokerRelatedPublication oafPublicationToBrokerPublication(final Publication p) {
if (p == null) {
return null;
}
final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication();
res.setOpenaireId(p.getId());
res.setOriginalId(first(p.getOriginalId()));
res.setTitle(structPropValue(p.getTitle()));
res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid));
res.setInstances(flatMappedList(p.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
res.setCollectedFrom(mappedFirst(p.getCollectedfrom(), KeyValue::getValue));
return res;
}
public static final OaBrokerMainEntity oafResultToBrokerResult(final Result result) {
if (result == null) {
return null;
}
final OaBrokerMainEntity res = new OaBrokerMainEntity();
res.setOpenaireId(result.getId());
res.setOriginalId(first(result.getOriginalId()));
res.setTypology(classId(result.getResulttype()));
res.setTitles(structPropList(result.getTitle()));
res.setAbstracts(fieldList(result.getDescription()));
res.setLanguage(classId(result.getLanguage()));
res.setSubjects(structPropTypedList(result.getSubject()));
res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor));
res.setPublicationdate(fieldValue(result.getDateofacceptance()));
res.setPublisher(fieldValue(result.getPublisher()));
res.setEmbargoenddate(fieldValue(result.getEmbargoenddate()));
res.setContributor(fieldList(result.getContributor()));
res
.setJournal(
result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null)
.setCollectedFromId(result.getCollectedfrom().stream().map(KeyValue::getKey).findFirst().orElse(null))
.setCollectedFromName(result.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null))
.setPids(result.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList()))
.setInstances(
result
.getInstance()
.stream()
.map(ConversionUtils::oafInstanceToBrokerInstances)
.flatMap(List::stream)
.collect(Collectors.toList()))
.setExternalReferences(
result
.getExternalReference()
.stream()
.map(ConversionUtils::oafExtRefToBrokerExtRef)
.collect(Collectors.toList()))
: null;
result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null);
res.setCollectedFromId(mappedFirst(result.getCollectedfrom(), KeyValue::getKey));
res.setCollectedFromName(mappedFirst(result.getCollectedfrom(), KeyValue::getValue));
res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid));
res.setInstances(flatMappedList(result.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
res.setExternalReferences(mappedList(result.getExternalReference(), ConversionUtils::oafExtRefToBrokerExtRef));
return res;
}
private static List<TypedValue> structPropTypedList(final List<StructuredProperty> list) {
return list
private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) {
if (author == null) {
return null;
}
final String pids = author.getPid() != null ? author
.getPid()
.stream()
.map(
p -> new TypedValue()
.setValue(p.getValue())
.setType(p.getQualifier().getClassid()))
.collect(Collectors.toList());
}
.filter(pid -> pid != null)
.filter(pid -> pid.getQualifier() != null)
.filter(pid -> pid.getQualifier().getClassid() != null)
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
.map(pid -> pid.getValue())
.filter(StringUtils::isNotBlank)
.findFirst()
.orElse(null) : null;
private static eu.dnetlib.broker.objects.Author oafAuthorToBrokerAuthor(final Author author) {
return author != null ? new eu.dnetlib.broker.objects.Author()
.setFullname(author.getFullname())
.setOrcid(
author
.getPid()
.stream()
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
.map(pid -> pid.getValue())
.findFirst()
.orElse(null))
: null;
return new OaBrokerAuthor(author.getFullname(), pids);
}
private static eu.dnetlib.broker.objects.Journal oafJournalToBrokerJournal(final Journal journal) {
return journal != null ? new eu.dnetlib.broker.objects.Journal()
.setName(journal.getName())
.setIssn(journal.getIssnPrinted())
.setEissn(journal.getIssnOnline())
.setLissn(journal.getIssnLinking()) : null;
private static OaBrokerJournal oafJournalToBrokerJournal(final Journal journal) {
if (journal == null) {
return null;
}
final OaBrokerJournal res = new OaBrokerJournal();
res.setName(journal.getName());
res.setIssn(journal.getIssnPrinted());
res.setEissn(journal.getIssnOnline());
res.setLissn(journal.getIssnLinking());
return res;
}
private static eu.dnetlib.broker.objects.ExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) {
return ref != null ? new eu.dnetlib.broker.objects.ExternalReference()
.setRefidentifier(ref.getRefidentifier())
.setSitename(ref.getSitename())
.setType(ref.getQualifier().getClassid())
.setUrl(ref.getUrl())
: null;
private static OaBrokerExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) {
if (ref == null) {
return null;
}
final OaBrokerExternalReference res = new OaBrokerExternalReference();
res.setRefidentifier(ref.getRefidentifier());
res.setSitename(ref.getSitename());
res.setType(classId(ref.getQualifier()));
res.setUrl(ref.getUrl());
return res;
}
public static final eu.dnetlib.broker.objects.Project oafProjectToBrokerProject(final Project p) {
public static final OaBrokerProject oafProjectToBrokerProject(final Project p) {
if (p == null) {
return null;
}
final eu.dnetlib.broker.objects.Project res = new eu.dnetlib.broker.objects.Project()
.setTitle(fieldValue(p.getTitle()))
.setAcronym(fieldValue(p.getAcronym()))
.setCode(fieldValue(p.getCode()));
final OaBrokerProject res = new OaBrokerProject();
res.setOpenaireId(p.getId());
res.setTitle(fieldValue(p.getTitle()));
res.setAcronym(fieldValue(p.getAcronym()));
res.setCode(fieldValue(p.getCode()));
final String ftree = fieldValue(p.getFundingtree());
if (StringUtils.isNotBlank(ftree)) {
@ -187,13 +201,27 @@ public class ConversionUtils {
return res;
}
public static final eu.dnetlib.broker.objects.Software oafSoftwareToBrokerSoftware(final Software sw) {
return sw != null ? new eu.dnetlib.broker.objects.Software()
.setName(structPropValue(sw.getTitle()))
.setDescription(fieldValue(sw.getDescription()))
.setRepository(fieldValue(sw.getCodeRepositoryUrl()))
.setLandingPage(fieldValue(sw.getDocumentationUrl()))
: null;
public static final OaBrokerRelatedSoftware oafSoftwareToBrokerSoftware(final Software sw) {
if (sw == null) {
return null;
}
final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware();
res.setOpenaireId(sw.getId());
res.setName(structPropValue(sw.getTitle()));
res.setDescription(fieldValue(sw.getDescription()));
res.setRepository(fieldValue(sw.getCodeRepositoryUrl()));
res.setLandingPage(fieldValue(sw.getDocumentationUrl()));
return res;
}
private static String first(final List<String> list) {
return list != null && list.size() > 0 ? list.get(0) : null;
}
private static String kvValue(final KeyValue kv) {
return kv != null ? kv.getValue() : null;
}
private static String fieldValue(final Field<String> f) {
@ -205,6 +233,10 @@ public class ConversionUtils {
: null;
}
private static String classId(final Qualifier q) {
return q != null ? q.getClassid() : null;
}
private static String structPropValue(final List<StructuredProperty> props) {
return props != null
? props.stream().map(StructuredProperty::getValue).filter(StringUtils::isNotBlank).findFirst().orElse(null)
@ -226,4 +258,55 @@ public class ConversionUtils {
.collect(Collectors.toList())
: new ArrayList<>();
}
private static List<OaBrokerTypedValue> structPropTypedList(final List<StructuredProperty> list) {
if (list == null) {
return new ArrayList<>();
}
return list
.stream()
.map(ConversionUtils::oafStructPropToBrokerTypedValue)
.filter(Objects::nonNull)
.collect(Collectors.toList());
}
private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func) {
if (list == null) {
return new ArrayList<>();
}
return list
.stream()
.map(func::apply)
.filter(Objects::nonNull)
.collect(Collectors.toList());
}
private static <F, T> List<T> flatMappedList(final List<F> list, final Function<F, List<T>> func) {
if (list == null) {
return new ArrayList<>();
}
return list
.stream()
.map(func::apply)
.flatMap(List::stream)
.filter(Objects::nonNull)
.collect(Collectors.toList());
}
private static <F, T> T mappedFirst(final List<F> list, final Function<F, T> func) {
if (list == null) {
return null;
}
return list
.stream()
.map(func::apply)
.filter(Objects::nonNull)
.findFirst()
.orElse(null);
}
}

@ -4,23 +4,10 @@ package eu.dnetlib.dhp.broker.oa.util;
import java.util.ArrayList;
import java.util.List;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.model.EventFactory;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsRelatedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetReferences;
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject;
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMoreProject;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsReferencedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsRelatedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationReferences;
import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMissingSoftware;
import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMoreSoftware;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess;
@ -47,28 +34,28 @@ public class EventFinder {
matchers.add(new EnrichMorePid());
matchers.add(new EnrichMoreSubject());
// Advanced matchers
// // Advanced matchers
matchers.add(new EnrichMissingProject());
matchers.add(new EnrichMoreProject());
matchers.add(new EnrichMissingSoftware());
matchers.add(new EnrichMoreSoftware());
matchers.add(new EnrichMissingPublicationIsRelatedTo());
matchers.add(new EnrichMissingPublicationIsReferencedBy());
matchers.add(new EnrichMissingPublicationReferences());
matchers.add(new EnrichMissingPublicationIsSupplementedTo());
matchers.add(new EnrichMissingPublicationIsSupplementedBy());
matchers.add(new EnrichMissingDatasetIsRelatedTo());
matchers.add(new EnrichMissingDatasetIsReferencedBy());
matchers.add(new EnrichMissingDatasetReferences());
matchers.add(new EnrichMissingDatasetIsSupplementedTo());
matchers.add(new EnrichMissingDatasetIsSupplementedBy());
matchers.add(new EnrichMissingAbstract());
// matchers.add(new EnrichMoreProject());
// matchers.add(new EnrichMissingSoftware());
// matchers.add(new EnrichMoreSoftware());
// matchers.add(new EnrichMissingPublicationIsRelatedTo());
// matchers.add(new EnrichMissingPublicationIsReferencedBy());
// matchers.add(new EnrichMissingPublicationReferences());
// matchers.add(new EnrichMissingPublicationIsSupplementedTo());
// matchers.add(new EnrichMissingPublicationIsSupplementedBy());
// matchers.add(new EnrichMissingDatasetIsRelatedTo());
// matchers.add(new EnrichMissingDatasetIsReferencedBy());
// matchers.add(new EnrichMissingDatasetReferences());
// matchers.add(new EnrichMissingDatasetIsSupplementedTo());
// matchers.add(new EnrichMissingDatasetIsSupplementedBy());
// matchers.add(new EnrichMissingAbstract());
}
public static EventGroup generateEvents(final ResultGroup results, final DedupConfig dedupConfig) {
final List<UpdateInfo<?>> list = new ArrayList<>();
for (final OpenaireBrokerResult target : results.getData()) {
for (final OaBrokerMainEntity target : results.getData()) {
for (final UpdateMatcher<?> matcher : matchers) {
list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), dedupConfig));
}

@ -14,12 +14,16 @@ public class EventGroup implements Serializable {
*/
private static final long serialVersionUID = 765977943803533130L;
private final List<Event> data = new ArrayList<>();
private List<Event> data = new ArrayList<>();
public List<Event> getData() {
return data;
}
public void setData(final List<Event> data) {
this.data = data;
}
public EventGroup addElement(final Event elem) {
data.add(elem);
return this;

@ -9,10 +9,10 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.broker.objects.Instance;
import eu.dnetlib.broker.objects.OpenAireEventPayload;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.Provenance;
import eu.dnetlib.broker.objects.OaBrokerEventPayload;
import eu.dnetlib.broker.objects.OaBrokerInstance;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerProvenance;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
@ -25,11 +25,11 @@ public final class UpdateInfo<T> {
private final T highlightValue;
private final OpenaireBrokerResult source;
private final OaBrokerMainEntity source;
private final OpenaireBrokerResult target;
private final OaBrokerMainEntity target;
private final BiConsumer<OpenaireBrokerResult, T> compileHighlight;
private final BiConsumer<OaBrokerMainEntity, T> compileHighlight;
private final Function<T, String> highlightToString;
@ -37,9 +37,9 @@ public final class UpdateInfo<T> {
private static final Logger log = LoggerFactory.getLogger(UpdateInfo.class);
public UpdateInfo(final Topic topic, final T highlightValue, final OpenaireBrokerResult source,
final OpenaireBrokerResult target,
final BiConsumer<OpenaireBrokerResult, T> compileHighlight,
public UpdateInfo(final Topic topic, final T highlightValue, final OaBrokerMainEntity source,
final OaBrokerMainEntity target,
final BiConsumer<OaBrokerMainEntity, T> compileHighlight,
final Function<T, String> highlightToString,
final DedupConfig dedupConfig) {
this.topic = topic;
@ -55,17 +55,17 @@ public final class UpdateInfo<T> {
return highlightValue;
}
public OpenaireBrokerResult getSource() {
public OaBrokerMainEntity getSource() {
return source;
}
public OpenaireBrokerResult getTarget() {
public OaBrokerMainEntity getTarget() {
return target;
}
private float calculateTrust(final DedupConfig dedupConfig,
final OpenaireBrokerResult r1,
final OpenaireBrokerResult r2) {
final OaBrokerMainEntity r1,
final OaBrokerMainEntity r2) {
if (dedupConfig == null) {
return BrokerConstants.MIN_TRUST;
@ -104,31 +104,33 @@ public final class UpdateInfo<T> {
return highlightToString.apply(getHighlightValue());
}
public OpenAireEventPayload asBrokerPayload() {
public OaBrokerEventPayload asBrokerPayload() {
compileHighlight.accept(target, getHighlightValue());
final OpenaireBrokerResult hl = new OpenaireBrokerResult();
final OaBrokerMainEntity hl = new OaBrokerMainEntity();
compileHighlight.accept(hl, getHighlightValue());
final String provId = getSource().getOriginalId();
final String provId = getSource().getOpenaireId();
final String provRepo = getSource().getCollectedFromName();
final String provUrl = getSource()
.getInstances()
.stream()
.map(Instance::getUrl)
.map(OaBrokerInstance::getUrl)
.findFirst()
.orElse(null);
;
final Provenance provenance = new Provenance().setId(provId).setRepositoryName(provRepo).setUrl(provUrl);
final OaBrokerProvenance provenance = new OaBrokerProvenance(provId, provRepo, provUrl);
return new OpenAireEventPayload()
.setPublication(target)
.setHighlight(hl)
.setTrust(trust)
.setProvenance(provenance);
final OaBrokerEventPayload res = new OaBrokerEventPayload();
res.setResult(target);
res.setHighlight(hl);
res.setTrust(trust);
res.setProvenance(provenance);
return res;
}
}

@ -5,11 +5,11 @@ import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.expressions.Aggregator;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
public class ResultAggregator extends Aggregator<Tuple2<OpenaireBrokerResult, Relation>, ResultGroup, ResultGroup> {
public class ResultAggregator extends Aggregator<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup, ResultGroup> {
/**
*
@ -22,13 +22,15 @@ public class ResultAggregator extends Aggregator<Tuple2<OpenaireBrokerResult, Re
}
@Override
public ResultGroup reduce(final ResultGroup group, final Tuple2<OpenaireBrokerResult, Relation> t) {
return group.addElement(t._1);
public ResultGroup reduce(final ResultGroup group, final Tuple2<OaBrokerMainEntity, Relation> t) {
group.getData().add(t._1);
return group;
}
@Override
public ResultGroup merge(final ResultGroup g1, final ResultGroup g2) {
return g1.addGroup(g2);
g1.getData().addAll(g2.getData());
return g1;
}
@Override

@ -5,7 +5,7 @@ import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
public class ResultGroup implements Serializable {
@ -14,23 +14,14 @@ public class ResultGroup implements Serializable {
*/
private static final long serialVersionUID = -3360828477088669296L;
private final List<OpenaireBrokerResult> data = new ArrayList<>();
private List<OaBrokerMainEntity> data = new ArrayList<>();
public List<OpenaireBrokerResult> getData() {
public List<OaBrokerMainEntity> getData() {
return data;
}
public ResultGroup addElement(final OpenaireBrokerResult elem) {
data.add(elem);
return this;
public void setData(final List<OaBrokerMainEntity> data) {
this.data = data;
}
public ResultGroup addGroup(final ResultGroup group) {
data.addAll(group.getData());
return this;
}
public boolean isValid() {
return data.size() > 1;
}
}

@ -1,69 +0,0 @@
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.expressions.Aggregator;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import scala.Tuple2;
public class OpenaireBrokerResultAggregator<T>
extends Aggregator<Tuple2<OpenaireBrokerResult, T>, OpenaireBrokerResult, OpenaireBrokerResult> {
/**
*
*/
private static final long serialVersionUID = -3687878788861013488L;
@Override
public OpenaireBrokerResult zero() {
return new OpenaireBrokerResult();
}
@Override
public OpenaireBrokerResult finish(final OpenaireBrokerResult g) {
return g;
}
@Override
public OpenaireBrokerResult reduce(final OpenaireBrokerResult g, final Tuple2<OpenaireBrokerResult, T> t) {
if (g.getOriginalId() == null) {
return t._1;
} else if (t._2 instanceof RelatedSoftware) {
g.getSoftwares().add(((RelatedSoftware) t._2).getRelSoftware());
} else if (t._2 instanceof RelatedDataset) {
g.getDatasets().add(((RelatedDataset) t._2).getRelDataset());
} else if (t._2 instanceof RelatedPublication) {
g.getPublications().add(((RelatedPublication) t._2).getRelPublication());
} else if (t._2 instanceof RelatedProject) {
g.getProjects().add(((RelatedProject) t._2).getRelProject());
}
return g;
}
@Override
public OpenaireBrokerResult merge(final OpenaireBrokerResult g1, final OpenaireBrokerResult g2) {
if (g1.getOriginalId() != null) {
g1.getSoftwares().addAll(g2.getSoftwares());
g1.getDatasets().addAll(g2.getDatasets());
g1.getPublications().addAll(g2.getPublications());
g1.getProjects().addAll(g2.getProjects());
return g1;
} else {
return g2;
}
}
@Override
public Encoder<OpenaireBrokerResult> bufferEncoder() {
return Encoders.bean(OpenaireBrokerResult.class);
}
@Override
public Encoder<OpenaireBrokerResult> outputEncoder() {
return Encoders.bean(OpenaireBrokerResult.class);
}
}

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import java.io.Serializable;
import eu.dnetlib.broker.objects.Dataset;
import eu.dnetlib.broker.objects.OaBrokerRelatedDataset;
public class RelatedDataset implements Serializable {
@ -11,13 +11,15 @@ public class RelatedDataset implements Serializable {
*
*/
private static final long serialVersionUID = 774487705184038324L;
private final String source;
private final String relType;
private final Dataset relDataset;
public RelatedDataset(final String source, final String relType, final Dataset relDataset) {
private String source;
private OaBrokerRelatedDataset relDataset;
public RelatedDataset() {
}
public RelatedDataset(final String source, final OaBrokerRelatedDataset relDataset) {
this.source = source;
this.relType = relType;
this.relDataset = relDataset;
}
@ -25,12 +27,16 @@ public class RelatedDataset implements Serializable {
return source;
}
public String getRelType() {
return relType;
public void setSource(final String source) {
this.source = source;
}
public Dataset getRelDataset() {
public OaBrokerRelatedDataset getRelDataset() {
return relDataset;
}
public void setRelDataset(final OaBrokerRelatedDataset relDataset) {
this.relDataset = relDataset;
}
}

@ -0,0 +1,68 @@
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.expressions.Aggregator;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import scala.Tuple2;
public class RelatedDatasetAggregator
extends Aggregator<Tuple2<OaBrokerMainEntity, RelatedDataset>, OaBrokerMainEntity, OaBrokerMainEntity> {
/**
*
*/
private static final long serialVersionUID = 6969761680131482557L;
@Override
public OaBrokerMainEntity zero() {
return new OaBrokerMainEntity();
}
@Override
public OaBrokerMainEntity finish(final OaBrokerMainEntity g) {
return g;
}
@Override
public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2<OaBrokerMainEntity, RelatedDataset> t) {
final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1;
if (t._2 != null && res.getDatasets().size() < BrokerConstants.MAX_NUMBER_OF_RELS) {
res.getDatasets().add(t._2.getRelDataset());
}
return res;
}
@Override
public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) {
if (StringUtils.isNotBlank(g1.getOpenaireId())) {
final int availables = BrokerConstants.MAX_NUMBER_OF_RELS - g1.getDatasets().size();
if (availables > 0) {
if (g2.getDatasets().size() <= availables) {
g1.getDatasets().addAll(g2.getDatasets());
} else {
g1.getDatasets().addAll(g2.getDatasets().subList(0, availables));
}
}
return g1;
} else {
return g2;
}
}
@Override
public Encoder<OaBrokerMainEntity> bufferEncoder() {
return Encoders.bean(OaBrokerMainEntity.class);
}
@Override
public Encoder<OaBrokerMainEntity> outputEncoder() {
return Encoders.bean(OaBrokerMainEntity.class);
}
}

@ -1,34 +0,0 @@
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Software;
public class RelatedEntityFactory {
@SuppressWarnings("unchecked")
public static <RT, T> RT newRelatedEntity(final String sourceId,
final String relType,
final T target,
final Class<RT> clazz) {
if (clazz == RelatedProject.class) {
return (RT) new RelatedProject(sourceId, relType,
ConversionUtils.oafProjectToBrokerProject((Project) target));
} else if (clazz == RelatedSoftware.class) {
return (RT) new RelatedSoftware(sourceId, relType,
ConversionUtils.oafSoftwareToBrokerSoftware((Software) target));
} else if (clazz == RelatedDataset.class) {
return (RT) new RelatedDataset(sourceId, relType,
ConversionUtils.oafDatasetToBrokerDataset((Dataset) target));
} else if (clazz == RelatedPublication.class) {
return (RT) new RelatedPublication(sourceId, relType,
ConversionUtils.oafPublicationToBrokerPublication((Publication) target));
} else {
return null;
}
}
}

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import java.io.Serializable;
import eu.dnetlib.broker.objects.Project;
import eu.dnetlib.broker.objects.OaBrokerProject;
public class RelatedProject implements Serializable {
@ -12,13 +12,14 @@ public class RelatedProject implements Serializable {
*/
private static final long serialVersionUID = 4941437626549329870L;
private final String source;
private final String relType;
private final Project relProject;
private String source;
private OaBrokerProject relProject;
public RelatedProject(final String source, final String relType, final Project relProject) {
public RelatedProject() {
}
public RelatedProject(final String source, final OaBrokerProject relProject) {
this.source = source;
this.relType = relType;
this.relProject = relProject;
}
@ -26,12 +27,16 @@ public class RelatedProject implements Serializable {
return source;
}
public String getRelType() {
return relType;
public void setSource(final String source) {
this.source = source;
}
public Project getRelProject() {
public OaBrokerProject getRelProject() {
return relProject;
}
public void setRelProject(final OaBrokerProject relProject) {
this.relProject = relProject;
}
}

@ -0,0 +1,68 @@
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.expressions.Aggregator;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import scala.Tuple2;
public class RelatedProjectAggregator
extends Aggregator<Tuple2<OaBrokerMainEntity, RelatedProject>, OaBrokerMainEntity, OaBrokerMainEntity> {
/**
*
*/
private static final long serialVersionUID = 8559808519152275763L;
@Override
public OaBrokerMainEntity zero() {
return new OaBrokerMainEntity();
}
@Override
public OaBrokerMainEntity finish(final OaBrokerMainEntity g) {
return g;
}
@Override
public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2<OaBrokerMainEntity, RelatedProject> t) {
final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1;
if (t._2 != null && res.getProjects().size() < BrokerConstants.MAX_NUMBER_OF_RELS) {
res.getProjects().add(t._2.getRelProject());
}
return res;
}
@Override
public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) {
if (StringUtils.isNotBlank(g1.getOpenaireId())) {
final int availables = BrokerConstants.MAX_NUMBER_OF_RELS - g1.getProjects().size();
if (availables > 0) {
if (g2.getProjects().size() <= availables) {
g1.getProjects().addAll(g2.getProjects());
} else {
g1.getProjects().addAll(g2.getProjects().subList(0, availables));
}
}
return g1;
} else {
return g2;
}
}
@Override
public Encoder<OaBrokerMainEntity> bufferEncoder() {
return Encoders.bean(OaBrokerMainEntity.class);
}
@Override
public Encoder<OaBrokerMainEntity> outputEncoder() {
return Encoders.bean(OaBrokerMainEntity.class);
}
}

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import java.io.Serializable;
import eu.dnetlib.broker.objects.Publication;
import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
public class RelatedPublication implements Serializable {
@ -12,13 +12,14 @@ public class RelatedPublication implements Serializable {
*/
private static final long serialVersionUID = 9021609640411395128L;
private final String source;
private final String relType;
private final Publication relPublication;
private String source;
private OaBrokerRelatedPublication relPublication;
public RelatedPublication(final String source, final String relType, final Publication relPublication) {
public RelatedPublication() {
}
public RelatedPublication(final String source, final OaBrokerRelatedPublication relPublication) {
this.source = source;
this.relType = relType;
this.relPublication = relPublication;
}
@ -26,12 +27,16 @@ public class RelatedPublication implements Serializable {
return source;
}
public String getRelType() {
return relType;
public void setSource(final String source) {
this.source = source;
}
public Publication getRelPublication() {
public OaBrokerRelatedPublication getRelPublication() {
return relPublication;
}
public void setRelPublication(final OaBrokerRelatedPublication relPublication) {
this.relPublication = relPublication;
}
}

@ -0,0 +1,70 @@
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.expressions.Aggregator;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import scala.Tuple2;
public class RelatedPublicationAggregator
extends Aggregator<Tuple2<OaBrokerMainEntity, RelatedPublication>, OaBrokerMainEntity, OaBrokerMainEntity> {
/**
*
*/
private static final long serialVersionUID = 4656934981558135919L;
@Override
public OaBrokerMainEntity zero() {
return new OaBrokerMainEntity();
}
@Override
public OaBrokerMainEntity finish(final OaBrokerMainEntity g) {
return g;
}
@Override
public OaBrokerMainEntity reduce(final OaBrokerMainEntity g,
final Tuple2<OaBrokerMainEntity, RelatedPublication> t) {
final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1;
if (t._2 != null && res.getPublications().size() < BrokerConstants.MAX_NUMBER_OF_RELS) {
res.getPublications().add(t._2.getRelPublication());
}
return res;
}
@Override
public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) {
if (StringUtils.isNotBlank(g1.getOpenaireId())) {
final int availables = BrokerConstants.MAX_NUMBER_OF_RELS - g1.getPublications().size();
if (availables > 0) {
if (g2.getPublications().size() <= availables) {
g1.getPublications().addAll(g2.getPublications());
} else {
g1.getPublications().addAll(g2.getPublications().subList(0, availables));
}
}
return g1;
} else {
return g2;
}
}
@Override
public Encoder<OaBrokerMainEntity> bufferEncoder() {
return Encoders.bean(OaBrokerMainEntity.class);
}
@Override
public Encoder<OaBrokerMainEntity> outputEncoder() {
return Encoders.bean(OaBrokerMainEntity.class);
}
}

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import java.io.Serializable;
import eu.dnetlib.broker.objects.Software;
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
public class RelatedSoftware implements Serializable {
@ -11,13 +11,15 @@ public class RelatedSoftware implements Serializable {
*
*/
private static final long serialVersionUID = 7573383356943300157L;
private final String source;
private final String relType;
private final Software relSoftware;
public RelatedSoftware(final String source, final String relType, final Software relSoftware) {
private String source;
private OaBrokerRelatedSoftware relSoftware;
public RelatedSoftware() {
}
public RelatedSoftware(final String source, final OaBrokerRelatedSoftware relSoftware) {
this.source = source;
this.relType = relType;
this.relSoftware = relSoftware;
}
@ -25,12 +27,16 @@ public class RelatedSoftware implements Serializable {
return source;
}
public String getRelType() {
return relType;
public void setSource(final String source) {
this.source = source;
}
public Software getRelSoftware() {
public OaBrokerRelatedSoftware getRelSoftware() {
return relSoftware;
}
public void setRelSoftware(final OaBrokerRelatedSoftware relSoftware) {
this.relSoftware = relSoftware;
}
}

@ -0,0 +1,68 @@
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.expressions.Aggregator;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import scala.Tuple2;
public class RelatedSoftwareAggregator
extends Aggregator<Tuple2<OaBrokerMainEntity, RelatedSoftware>, OaBrokerMainEntity, OaBrokerMainEntity> {
/**
*
*/
private static final long serialVersionUID = -8987959389106443702L;
@Override
public OaBrokerMainEntity zero() {
return new OaBrokerMainEntity();
}
@Override
public OaBrokerMainEntity finish(final OaBrokerMainEntity g) {
return g;
}
@Override
public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2<OaBrokerMainEntity, RelatedSoftware> t) {
final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1;
if (t._2 != null && res.getSoftwares().size() < BrokerConstants.MAX_NUMBER_OF_RELS) {
res.getSoftwares().add(t._2.getRelSoftware());
}
return res;
}
@Override
public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) {
if (StringUtils.isNotBlank(g1.getOpenaireId())) {
final int availables = BrokerConstants.MAX_NUMBER_OF_RELS - g1.getSoftwares().size();
if (availables > 0) {
if (g2.getSoftwares().size() <= availables) {
g1.getSoftwares().addAll(g2.getSoftwares());
} else {
g1.getSoftwares().addAll(g2.getSoftwares().subList(0, availables));
}
}
return g1;
} else {
return g2;
}
}
@Override
public Encoder<OaBrokerMainEntity> bufferEncoder() {
return Encoders.bean(OaBrokerMainEntity.class);
}
@Override
public Encoder<OaBrokerMainEntity> outputEncoder() {
return Encoders.bean(OaBrokerMainEntity.class);
}
}

@ -0,0 +1,14 @@
[
{
"paramName": "g",
"paramLongName": "graphPath",
"paramDescription": "the path where there the graph is stored",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "workingPath",
"paramDescription": "the path where the temporary data will be stored",
"paramRequired": true
}
]

@ -6,8 +6,8 @@
<description>the path where the graph is stored</description>
</property>
<property>
<name>eventsOutputPath</name>
<description>the path where the the events will be stored</description>
<name>workingPath</name>
<description>the path where the the generated data will be stored</description>
</property>
<property>
<name>isLookupUrl</name>
@ -73,18 +73,107 @@
</configuration>
</global>
<start to="generate_events"/>
<start to="ensure_working_path"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="generate_events">
<action name="ensure_working_path">
<fs>
<mkdir path='${workingPath}'/>
</fs>
<ok to="start_entities_and_rels"/>
<error to="Kill"/>
</action>
<fork name="start_entities_and_rels">
<path start="prepare_simple_entities"/>
<path start="prepare_related_softwares"/>
<path start="prepare_related_datasets"/>
<path start="prepare_related_projects"/>
<path start="prepare_related_publications"/>
</fork>
<action name="prepare_simple_entities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareSimpleEntititiesJob</name>
<class>eu.dnetlib.dhp.broker.oa.PrepareSimpleEntititiesJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="wait_entities_and_rels"/>
<error to="Kill"/>
</action>
<action name="prepare_related_datasets">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareRelatedDatasetsJob</name>
<class>eu.dnetlib.dhp.broker.oa.PrepareRelatedDatasetsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="wait_entities_and_rels"/>
<error to="Kill"/>
</action>
<action name="prepare_related_projects">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareRelatedProjectsJob</name>
<class>eu.dnetlib.dhp.broker.oa.PrepareRelatedProjectsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="wait_entities_and_rels"/>
<error to="Kill"/>
</action>
<action name="prepare_related_publications">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateEvents</name>
<class>eu.dnetlib.dhp.broker.oa.GenerateEventsApplication</class>
<name>PrepareRelatedPublicationsJob</name>
<class>eu.dnetlib.dhp.broker.oa.PrepareRelatedPublicationsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
@ -97,14 +186,182 @@
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--eventsPath</arg><arg>${eventsOutputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="wait_entities_and_rels"/>
<error to="Kill"/>
</action>
<action name="prepare_related_softwares">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareRelatedSoftwaresJob</name>
<class>eu.dnetlib.dhp.broker.oa.PrepareRelatedSoftwaresJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="wait_entities_and_rels"/>
<error to="Kill"/>
</action>
<join name="wait_entities_and_rels" to="join_entities_step1"/>
<action name="join_entities_step1">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>JoinStep1</name>
<class>eu.dnetlib.dhp.broker.oa.JoinStep1Job</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="join_entities_step2"/>
<error to="Kill"/>
</action>
<action name="join_entities_step2">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>JoinStep2</name>
<class>eu.dnetlib.dhp.broker.oa.JoinStep2Job</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="join_entities_step3"/>
<error to="Kill"/>
</action>
<action name="join_entities_step3">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>JoinStep3</name>
<class>eu.dnetlib.dhp.broker.oa.JoinStep3Job</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="join_entities_step4"/>
<error to="Kill"/>
</action>
<action name="join_entities_step4">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>JoinStep4</name>
<class>eu.dnetlib.dhp.broker.oa.JoinStep4Job</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="prepare_groups"/>
<error to="Kill"/>
</action>
<action name="prepare_groups">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareGroupsJob</name>
<class>eu.dnetlib.dhp.broker.oa.PrepareGroupsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</spark>
<ok to="generate_events"/>
<error to="Kill"/>
</action>
<action name="generate_events">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateEventsJob</name>
<class>eu.dnetlib.dhp.broker.oa.GenerateEventsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--dedupConfProfile</arg><arg>${dedupConfProfId}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>

@ -1,13 +1,7 @@
[
{
"paramName": "g",
"paramLongName": "graphPath",
"paramDescription": "the path where there the graph is stored",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "eventsPath",
"paramLongName": "workingPath",
"paramDescription": "the path where the generated events will be stored",
"paramRequired": true
},

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

@ -0,0 +1,110 @@
<workflow-app name="create broker events" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>graphInputPath</name>
<description>the path where the graph is stored</description>
</property>
<property>
<name>workingPath</name>
<description>the path where the the generated data will be stored</description>
</property>
<property>
<name>isLookupUrl</name>
<description>the address of the lookUp service</description>
</property>
<property>
<name>dedupConfProfId</name>
<description>the id of a valid Dedup Configuration Profile</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="generate_events"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="generate_events">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateEventsJob</name>
<class>eu.dnetlib.dhp.broker.oa.GenerateEventsJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--dedupConfProfile</arg><arg>${dedupConfProfId}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dhp-dedup-openaire</artifactId>

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -271,6 +271,26 @@ object DoiBoostMappingUtil {
}
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(createQualifier(classId,className, schemeId, schemeName))
sp.setValue(value)
sp
}
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(createQualifier(classId,className, schemeId, schemeName))
sp.setValue(value)
sp.setDataInfo(dataInfo)
sp
}
def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(createQualifier(classId, schemeId))
@ -279,6 +299,8 @@ object DoiBoostMappingUtil {
}
def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
val sp = new StructuredProperty
sp.setQualifier(createQualifier(classId, schemeId))

@ -129,16 +129,16 @@ case object ConversionUtil {
val fieldOfStudy = item._2
if (fieldOfStudy != null && fieldOfStudy.subjects != null && fieldOfStudy.subjects.nonEmpty) {
val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => {
val s1 = createSP(s.DisplayName, "keyword", "dnet:subject_classification_typologies")
val s1 = createSP(s.DisplayName, "MAG","Microsoft Academic Graph classification", "dnet:subject_classification_typologies", "dnet:subject_classification_typologies")
val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)
var resList: List[StructuredProperty] = List(s1)
if (s.MainType.isDefined) {
val maintp = s.MainType.get
val s2 = createSP(s.MainType.get, "keyword", "dnet:subject_classification_typologies")
val s2 = createSP(s.MainType.get, "MAG","Microsoft Academic Graph classification", "dnet:subject_classification_typologies", "dnet:subject_classification_typologies")
s2.setDataInfo(di)
resList = resList ::: List(s2)
if (maintp.contains(".")) {
val s3 = createSP(maintp.split("\\.").head, "keyword", "dnet:subject_classification_typologies")
val s3 = createSP(maintp.split("\\.").head, "MAG","Microsoft Academic Graph classification", "dnet:subject_classification_typologies", "dnet:subject_classification_typologies")
s3.setDataInfo(di)
resList = resList ::: List(s3)
}

@ -43,7 +43,7 @@ object SparkPreProcessMAG {
val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
distinctPaper.write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/Papers_distinct")
logger.info("Phase 6) Enrich Publication with description")
logger.info("Phase 0) Enrich Publication with description")
val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract")

@ -32,7 +32,7 @@
<start to="GenerateActionSet"/>
<start to="CreateDOIBoost"/>
<kill name="Kill">

@ -18,6 +18,9 @@ class CrossrefMappingTest {
val mapper = new ObjectMapper()
@Test
def testFunderRelationshipsMapping(): Unit = {
val template = Source.fromInputStream(getClass.getResourceAsStream("article_funder_template.json")).mkString
@ -58,6 +61,27 @@ class CrossrefMappingTest {
}
@Test
def testOrcidID() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("orcid_data.json")).mkString
assertNotNull(json)
assertFalse(json.isEmpty);
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
assertTrue(resultList.nonEmpty)
val items = resultList.filter(p => p.isInstanceOf[Result])
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
items.foreach(p => println(mapper.writeValueAsString(p)))
}
@Test
def testEmptyTitle() :Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("empty_title.json")).mkString

@ -0,0 +1,271 @@
{
"DOI":"10.1016/j.carbpol.2020.115930",
"issued":{
"date-parts":[
[
2020,
4
]
]
},
"published-print":{
"date-parts":[
[
2020,
4
]
]
},
"prefix":"10.1016",
"subject":[
"Organic Chemistry",
"Materials Chemistry",
"Polymers and Plastics"
],
"author":[
{
"affiliation":[
],
"given":"Lei",
"family":"Fang",
"sequence":"first"
},
{
"affiliation":[
],
"given":"Hua",
"family":"Lin",
"sequence":"additional"
},
{
"affiliation":[
],
"given":"Zhenfeng",
"family":"Wu",
"sequence":"additional"
},
{
"affiliation":[
],
"given":"Zhen",
"family":"Wang",
"sequence":"additional"
},
{
"affiliation":[
],
"given":"Xinxin",
"family":"Fan",
"sequence":"additional"
},
{
"affiliation":[
],
"given":"Ziting",
"family":"Cheng",
"sequence":"additional"
},
{
"affiliation":[
],
"given":"Xiaoya",
"family":"Hou",
"sequence":"additional"
},
{
"authenticated-orcid":false,
"given":"Daquan",
"family":"Chen",
"sequence":"additional",
"affiliation":[
],
"ORCID":"http://orcid.org/0000-0002-6796-0204"
}
],
"reference-count":41,
"ISSN":[
"0144-8617"
],
"assertion":[
{
"name":"publisher",
"value":"Elsevier",
"label":"This article is maintained by"
},
{
"name":"articletitle",
"value":"In vitro/vivo evaluation of novel mitochondrial targeting charge-reversal polysaccharide-based antitumor nanoparticle",
"label":"Article Title"
},
{
"name":"journaltitle",
"value":"Carbohydrate Polymers",
"label":"Journal Title"
},
{
"name":"articlelink",
"value":"https://doi.org/10.1016/j.carbpol.2020.115930",
"label":"CrossRef DOI link to publisher maintained version"
},
{
"name":"content_type",
"value":"article",
"label":"Content Type"
},
{
"name":"copyright",
"value":"\\u00a9 2020 Elsevier Ltd. All rights reserved.",
"label":"Copyright"
}
],
"member":"78",
"source":"Crossref",
"score":1.0,
"deposited":{
"timestamp":1584590965000,
"date-time":"2020-03-19T04:09:25Z",
"date-parts":[
[
2020,
3,
19
]
]
},
"indexed":{
"timestamp":1584592912467,
"date-time":"2020-03-19T04:41:52Z",
"date-parts":[
[
2020,
3,
19
]
]
},
"type":"journal-article",
"URL":"http://dx.doi.org/10.1016/j.carbpol.2020.115930",
"is-referenced-by-count":0,
"volume":"234",
"issn-type":[
{
"type":"print",
"value":"0144-8617"
}
],
"link":[
{
"URL":"https://api.elsevier.com/content/article/PII:S0144861720301041?httpAccept=text/xml",
"intended-application":"text-mining",
"content-version":"vor",
"content-type":"text/xml"
},
{
"URL":"https://api.elsevier.com/content/article/PII:S0144861720301041?httpAccept=text/plain",
"intended-application":"text-mining",
"content-version":"vor",
"content-type":"text/plain"
}
],
"update-policy":"http://dx.doi.org/10.1016/elsevier_cm_policy",
"references-count":41,
"short-container-title":[
"Carbohydrate Polymers"
],
"publisher":"Elsevier BV",
"content-domain":{
"domain":[
"elsevier.com",
"sciencedirect.com"
],
"crossmark-restriction":true
},
"language":"en",
"license":[
{
"URL":"https://www.elsevier.com/tdm/userlicense/1.0/",
"start":{
"timestamp":1585699200000,
"date-time":"2020-04-01T00:00:00Z",
"date-parts":[
[
2020,
4,
1
]
]
},
"content-version":"tdm",
"delay-in-days":0
}
],
"created":{
"timestamp":1581759678000,
"date-time":"2020-02-15T09:41:18Z",
"date-parts":[
[
2020,
2,
15
]
]
},
"title":[
"In vitro/vivo evaluation of novel mitochondrial targeting charge-reversal polysaccharide-based antitumor nanoparticle"
],
"alternative-id":[
"S0144861720301041"
],
"container-title":[
"Carbohydrate Polymers"
],
"funder":[
{
"doi-asserted-by":"publisher",
"DOI":"10.13039/501100007129",
"name":"Natural Science Foundation of Shandong Province",
"award":[
"ZR2019ZD24",
"ZR2019YQ30"
]
},
{
"doi-asserted-by":"publisher",
"DOI":"10.13039/100010449",
"name":"Ministry of Education, Libya",
"award":[
]
},
{
"doi-asserted-by":"publisher",
"DOI":"10.13039/501100012249",
"name":"Jiangxi University of Traditional Chinese Medicine",
"award":[
"TCM-0906"
]
},
{
"name":"Taishan Scholar Program",
"award":[
"qnts20161035"
]
},
{
"name":"Open fund project of Key Laboratory of Modern Preparation of TCM",
"award":[
]
}
],
"page":"115930",
"article-number":"115930"
}

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -3,9 +3,13 @@ package eu.dnetlib.dhp.oa.graph.clean;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.BufferedInputStream;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
@ -19,7 +23,9 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -84,12 +90,100 @@ public class CleanGraphSparkJob {
readTableFromPath(spark, inputPath, clazz)
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
.map((MapFunction<T, T>) value -> fixDefaults(value), Encoders.bean(clazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
private static <T extends Oaf> T fixDefaults(T value) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
// nothing to clean here
} else if (value instanceof Organization) {
Organization o = (Organization) value;
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
o.setCountry(qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_COUNTRY_TYPE));
}
} else if (value instanceof Relation) {
// nothing to clean here
} else if (value instanceof Result) {
Result r = (Result) value;
if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
r
.setLanguage(
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
}
if (Objects.nonNull(r.getSubject())) {
r
.setSubject(
r
.getSubject()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.collect(Collectors.toList()));
}
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
r
.setResourcetype(
qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
}
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
r
.setBestaccessright(
qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
}
if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) {
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
}
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
}
if (Objects.isNull(i.getRefereed())) {
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
}
}
}
if (Objects.nonNull(r.getAuthor())) {
boolean nullRank = r
.getAuthor()
.stream()
.anyMatch(a -> Objects.isNull(a.getRank()));
if (nullRank) {
int i = 1;
for (Author author : r.getAuthor()) {
author.setRank(i++);
}
}
}
if (value instanceof Publication) {
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
} else if (value instanceof OtherResearchProduct) {
} else if (value instanceof Software) {
}
}
return value;
}
private static Qualifier qualifier(String classid, String classname, String scheme) {
return OafMapperUtils
.qualifier(
classid, classname, scheme, scheme);
}
private static <T extends Oaf> Dataset<T> readTableFromPath(
SparkSession spark, String inputEntityPath, Class<T> clazz) {

@ -4,10 +4,13 @@ package eu.dnetlib.dhp.oa.graph.clean;
import java.io.Serializable;
import java.util.HashMap;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class CleaningRuleMap extends HashMap<Class, SerializableConsumer<Object>> implements Serializable {
@ -18,23 +21,24 @@ public class CleaningRuleMap extends HashMap<Class, SerializableConsumer<Object>
*/
public static CleaningRuleMap create(VocabularyGroup vocabularies) {
CleaningRuleMap mapping = new CleaningRuleMap();
mapping.put(Qualifier.class, o -> {
Qualifier q = (Qualifier) o;
if (vocabularies.vocabularyExists(q.getSchemeid())) {
Qualifier newValue = vocabularies.lookup(q.getSchemeid(), q.getClassid());
q.setClassid(newValue.getClassid());
q.setClassname(newValue.getClassname());
mapping.put(Qualifier.class, o -> cleanQualifier(vocabularies, (Qualifier) o));
mapping.put(Country.class, o -> {
final Country c = (Country) o;
if (StringUtils.isBlank(c.getSchemeid())) {
c.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE);
c.setSchemename(ModelConstants.DNET_COUNTRY_TYPE);
}
});
mapping.put(StructuredProperty.class, o -> {
StructuredProperty sp = (StructuredProperty) o;
// TODO implement a policy
/*
* if (StringUtils.isBlank(sp.getValue())) { sp.setValue(null); sp.setQualifier(null); sp.setDataInfo(null);
* }
*/
cleanQualifier(vocabularies, c);
});
return mapping;
}
private static <Q extends Qualifier> void cleanQualifier(VocabularyGroup vocabularies, Q q) {
if (vocabularies.vocabularyExists(q.getSchemeid())) {
Qualifier newValue = vocabularies.lookup(q.getSchemeid(), q.getClassid());
q.setClassid(newValue.getClassid());
q.setClassname(newValue.getClassname());
}
}
}

@ -50,8 +50,6 @@ import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication;
@ -106,6 +104,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
final String dbPassword = parser.get("postgresPassword");
log.info("postgresPassword: xxx");
final String dbSchema = parser.get("dbschema");
log.info("dbSchema {}: " + dbSchema);
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
@ -125,7 +126,11 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
smdbe.execute("queryDatasources.sql", smdbe::processDatasource);
log.info("Processing projects...");
smdbe.execute("queryProjects.sql", smdbe::processProject);
if (dbSchema.equalsIgnoreCase("beta")) {
smdbe.execute("queryProjects.sql", smdbe::processProject);
} else {
smdbe.execute("queryProjects_production.sql", smdbe::processProject);
}
log.info("Processing orgs...");
smdbe.execute("queryOrganizations.sql", smdbe::processOrganization);

@ -9,7 +9,15 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.OriginDescription;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
public class OafMapperUtils {
@ -89,7 +97,9 @@ public class OafMapperUtils {
}
public static StructuredProperty structuredProperty(
final String value, final Qualifier qualifier, final DataInfo dataInfo) {
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
@ -192,8 +202,12 @@ public class OafMapperUtils {
}
public static String createOpenaireId(
final int prefix, final String originalId, final boolean to_md5) {
if (to_md5) {
final int prefix,
final String originalId,
final boolean to_md5) {
if (StringUtils.isBlank(originalId)) {
return null;
} else if (to_md5) {
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
final String rest = StringUtils.substringAfter(originalId, "::");
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
@ -203,7 +217,9 @@ public class OafMapperUtils {
}
public static String createOpenaireId(
final String type, final String originalId, final boolean to_md5) {
final String type,
final String originalId,
final boolean to_md5) {
switch (type) {
case "datasource":
return createOpenaireId(10, originalId, to_md5);

@ -122,7 +122,11 @@ public class VocabularyGroup implements Serializable {
}
public boolean vocabularyExists(final String vocId) {
return vocs.containsKey(vocId.toLowerCase());
return Optional
.ofNullable(vocId)
.map(String::toLowerCase)
.map(id -> vocs.containsKey(id))
.orElse(false);
}
private void addSynonyms(final String vocId, final String termId, final String syn) {

@ -34,5 +34,11 @@
"paramLongName": "isLookupUrl",
"paramDescription": "the url of the ISLookupService",
"paramRequired": true
},
{
"paramName": "dbschema",
"paramLongName": "dbschema",
"paramDescription": "the database schema according to the D-Net infrastructure (beta or production)",
"paramRequired": true
}
]

@ -25,6 +25,11 @@
<property>
<name>postgresPassword</name>
<description>the password postgres</description>
</property>
<property>
<name>dbSchema</name>
<value>beta</value>
<description>the database schema according to the D-Net infrastructure (beta or production)</description>
</property>
<property>
<name>mongoURL</name>
@ -125,6 +130,7 @@
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--action</arg><arg>claims</arg>
<arg>--dbschema</arg><arg>${dbSchema}</arg>
</java>
<ok to="ImportODF_claims"/>
<error to="Kill"/>
@ -175,6 +181,7 @@
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--dbschema</arg><arg>${dbSchema}</arg>
</java>
<ok to="ImportODF"/>
<error to="Kill"/>

@ -21,6 +21,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -56,6 +57,9 @@ public class CleaningFunctionTest {
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
Publication p_in = MAPPER.readValue(json, Publication.class);
assertTrue(p_in instanceof Result);
assertTrue(p_in instanceof Publication);
Publication p_out = OafCleaner.apply(p_in, mapping);
assertNotNull(p_out);
@ -63,6 +67,9 @@ public class CleaningFunctionTest {
assertEquals("und", p_out.getLanguage().getClassid());
assertEquals("Undetermined", p_out.getLanguage().getClassname());
assertEquals("DE", p_out.getCountry().get(0).getClassid());
assertEquals("Germany", p_out.getCountry().get(0).getClassname());
assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());

@ -111,6 +111,7 @@ public class MappersTest {
assertNotNull(i.getAccessright());
assertEquals("OPEN", i.getAccessright().getClassid());
});
assertEquals("0001", p.getInstance().get(0).getRefereed().getClassid());
assertNotNull(p.getBestaccessright());
assertEquals("OPEN", p.getBestaccessright().getClassid());
@ -217,6 +218,7 @@ public class MappersTest {
assertNotNull(i.getAccessright());
assertEquals("OPEN", i.getAccessright().getClassid());
});
assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid());
assertValidId(r1.getSource());
assertValidId(r1.getTarget());

@ -202,6 +202,12 @@
"contributor": [
],
"country": [
{
"classid": "DE",
"classname": "DE",
"schemeid": "dnet:countries",
"schemename": "dnet:countries"
}
],
"coverage": [
],

@ -57,6 +57,7 @@
<oaf:identifier identifierType="doi">10.3897/oneeco.2.e13718</oaf:identifier>
<oaf:fulltext>https://oneecosystem.pensoft.net/article/13718/</oaf:fulltext>
<oaf:journal eissn="2367-8194" issn="">One Ecosystem</oaf:journal>
<oaf:refereed>0001</oaf:refereed>
</metadata>
<about xmlns:oai="http://www.openarchives.org/OAI/2.0/">
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">

@ -90,6 +90,7 @@
<oaf:hostedBy id="re3data_____::r3d100010468" name="Zenodo"/>
<oaf:projectid>corda_______::226852</oaf:projectid>
<oaf:collectedFrom id="re3data_____::r3d100010468" name="Zenodo"/>
<oaf:refereed>0001</oaf:refereed>s
</metadata>
<about xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -4,9 +4,12 @@ import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import eu.dnetlib.dhp.common.PacePerson
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, Field, Instance, KeyValue, Publication, Qualifier, Relation, StructuredProperty}
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, StructuredProperty}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.codehaus.jackson.map.ObjectMapper
import scala.collection.JavaConverters._
@ -77,6 +80,76 @@ object DLIToOAF {
)
val rel_inverse: Map[String, String] = Map(
"isRelatedTo" -> "isRelatedTo",
"IsSupplementedBy" -> "isSupplementTo",
"cites" -> "IsCitedBy",
"IsCitedBy" -> "cites",
"reviews" -> "IsReviewedBy"
)
val PidTypeMap: Map[String, String] = Map(
"pbmid" -> "pmid",
"pmcid" -> "pmc",
"pmid" -> "pmid",
"pubmedid" -> "pmid",
"DOI" -> "doi",
"doi" -> "doi"
)
def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper()
item match {
case dataset: Dataset =>
val a: AtomicAction[Dataset] = new AtomicAction[Dataset]
a.setClazz(classOf[Dataset])
a.setPayload(dataset)
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
case publication: Publication =>
val a: AtomicAction[Publication] = new AtomicAction[Publication]
a.setClazz(classOf[Publication])
a.setPayload(publication)
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
case relation: Relation =>
val a: AtomicAction[Relation] = new AtomicAction[Relation]
a.setClazz(classOf[Relation])
a.setPayload(relation)
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
case _ =>
null
}
}
def convertClinicalTrial(dataset: DLIDataset): (String, String) = {
val currentId = generateId(dataset.getId)
val pids = dataset.getPid.asScala.filter(p => "clinicaltrials.gov".equalsIgnoreCase(p.getQualifier.getClassname)).map(p => s"50|r3111dacbab5::${DHPUtils.md5(p.getValue.toLowerCase())}")
if (pids.isEmpty)
null
else
(currentId, pids.head)
}
def insertExternalRefs(publication: Publication, externalReferences: List[DLIExternalReference]): Publication = {
val eRefs = externalReferences.map(e => {
val result = new ExternalReference()
result.setSitename(e.sitename)
result.setLabel(e.label)
result.setUrl(e.url)
result.setRefidentifier(e.pid)
result.setDataInfo(generateDataInfo())
result.setQualifier(createQualifier(e.classId, "dnet:externalReference_typologies"))
result
})
publication.setExternalReference(eRefs.asJava)
publication
}
def filterPid(p: StructuredProperty): Boolean = {
if (expectecdPidType.contains(p.getQualifier.getClassname) && p.getQualifier.getClassname.equalsIgnoreCase("url"))
if (filteredURL.exists(u => p.getValue.contains(u)))
@ -97,7 +170,6 @@ object DLIToOAF {
}
def convertDLIDatasetToExternalReference(dataset: DLIDataset): DLIExternalReference = {
val currentId = generateId(dataset.getId)
val pids = dataset.getPid.asScala.filter(filterPid)
if (pids == null || pids.isEmpty)
@ -109,7 +181,7 @@ object DLIToOAF {
pid.getQualifier.getClassname match {
case "uniprot" => DLIExternalReference(generateId(dataset.getId), s"https://www.uniprot.org/uniprot/${pid.getValue}", "UniProt", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber")
case "ena" =>
if(pid.getValue!= null && pid.getValue.nonEmpty && pid.getValue.length>7)
if (pid.getValue != null && pid.getValue.nonEmpty && pid.getValue.length > 7)
DLIExternalReference(generateId(dataset.getId), s"https://www.ebi.ac.uk/ena/data/view/${pid.getValue.substring(0, 8)}", "European Nucleotide Archive", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber")
else
null
@ -126,43 +198,50 @@ object DLIToOAF {
}
def convertDLIPublicationToOAF(p: DLIPublication): Publication = {
def convertDLIPublicationToOAF(inputPublication: DLIPublication): Publication = {
val result = new Publication
result.setId(generateId(p.getId))
val cleanedPids = inputPublication.getPid.asScala.filter(p => PidTypeMap.contains(p.getQualifier.getClassid))
.map(p => {
p.setQualifier(createQualifier(PidTypeMap(p.getQualifier.getClassid), p.getQualifier.getSchemeid))
p
})
if (cleanedPids.isEmpty)
return null
result.setId(generateId(inputPublication.getId))
result.setDataInfo(generateDataInfo(invisibile = true))
if (p.getCollectedfrom == null || p.getCollectedfrom.size() == 0 || (p.getCollectedfrom.size() == 1 && p.getCollectedfrom.get(0) == null))
if (inputPublication.getCollectedfrom == null || inputPublication.getCollectedfrom.size() == 0 || (inputPublication.getCollectedfrom.size() == 1 && inputPublication.getCollectedfrom.get(0) == null))
return null
result.setCollectedfrom(p.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).asJava)
result.setPid(p.getPid)
result.setDateofcollection(p.getDateofcollection)
result.setOriginalId(p.getPid.asScala.map(p => p.getValue).asJava)
result.setCollectedfrom(inputPublication.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava)
if(result.getCollectedfrom.isEmpty)
return null
result.setPid(cleanedPids.asJava)
result.setDateofcollection(inputPublication.getDateofcollection)
result.setOriginalId(inputPublication.getPid.asScala.map(p => p.getValue).asJava)
result.setDateoftransformation(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")))
if (p.getAuthor == null || p.getAuthor.isEmpty)
if (inputPublication.getAuthor == null || inputPublication.getAuthor.isEmpty)
return null
result.setAuthor(p.getAuthor.asScala.map(convertAuthor).asJava)
result.setResulttype(createQualifier(p.getResulttype.getClassid, p.getResulttype.getClassname, "dnet:result_typologies", "dnet:result_typologies"))
result.setAuthor(inputPublication.getAuthor.asScala.map(convertAuthor).asJava)
result.setResulttype(createQualifier(inputPublication.getResulttype.getClassid, inputPublication.getResulttype.getClassname, "dnet:result_typologies", "dnet:result_typologies"))
if (p.getSubject != null)
result.setSubject(p.getSubject.asScala.map(convertSubject).asJava)
if (inputPublication.getSubject != null)
result.setSubject(inputPublication.getSubject.asScala.map(convertSubject).asJava)
if (p.getTitle == null || p.getTitle.isEmpty)
if (inputPublication.getTitle == null || inputPublication.getTitle.isEmpty)
return null
result.setTitle(List(patchTitle(p.getTitle.get(0))).asJava)
result.setTitle(List(patchTitle(inputPublication.getTitle.get(0))).asJava)
if (p.getRelevantdate == null || p.getRelevantdate.size() == 0)
if (inputPublication.getRelevantdate == null || inputPublication.getRelevantdate.size() == 0)
return null
result.setRelevantdate(p.getRelevantdate.asScala.map(patchRelevantDate).asJava)
result.setRelevantdate(inputPublication.getRelevantdate.asScala.map(patchRelevantDate).asJava)
result.setDescription(p.getDescription)
result.setDescription(inputPublication.getDescription)
result.setDateofacceptance(asField(p.getRelevantdate.get(0).getValue))
result.setPublisher(p.getPublisher)
result.setSource(p.getSource)
result.setDateofacceptance(asField(inputPublication.getRelevantdate.get(0).getValue))
result.setPublisher(inputPublication.getPublisher)
result.setSource(inputPublication.getSource)
result.setBestaccessright(createQualifier("UNKNOWN", "not available", "dnet:access_modes", "dnet:access_modes"))
val dois = result.getPid.asScala.filter(p => "doi".equalsIgnoreCase(p.getQualifier.getClassname)).map(p => p.getValue)
@ -170,7 +249,7 @@ object DLIToOAF {
return null
val i: Instance = createInstance(s"https://dx.doi.org/${dois.head}", firstInstanceOrNull(p.getInstance()), result.getDateofacceptance)
val i: Instance = createInstance(s"https://dx.doi.org/${dois.head}", firstInstanceOrNull(inputPublication.getInstance()), result.getDateofacceptance)
if (i != null)
result.setInstance(List(i).asJava)
@ -211,7 +290,9 @@ object DLIToOAF {
val result: Dataset = new Dataset
result.setId(generateId(d.getId))
result.setDataInfo(generateDataInfo())
result.setCollectedfrom(d.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).asJava)
result.setCollectedfrom(d.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava)
if(result.getCollectedfrom.isEmpty)
return null
result.setPid(d.getPid)
@ -280,7 +361,7 @@ object DLIToOAF {
if (dataset)
i.setInstancetype(createQualifier("0021", "Dataset", "dnet:publication_resource", "dnet:publication_resource"))
else
i.setInstancetype(createQualifier("0000", "UNKNOWN", "dnet:publication_resource", "dnet:publication_resource"))
i.setInstancetype(createQualifier("0000", "Unknown", "dnet:publication_resource", "dnet:publication_resource"))
if (originalInstance != null && originalInstance.getHostedby != null)
i.setHostedby(originalInstance.getHostedby)

@ -4,10 +4,16 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.{SparkConf, SparkContext}
import org.codehaus.jackson.map.ObjectMapper
import scala.collection.mutable.ArrayBuffer
@ -36,57 +42,66 @@ object SparkExportContentForOpenAire {
implicit val dliRelEncoder: Encoder[DLIRelation] = Encoders.bean(classOf[DLIRelation])
import spark.implicits._
//
// val relRDD:RDD[Relation] = sc.textFile(s"$workingPath/relation_j")
// .map(s => new ObjectMapper().readValue(s, classOf[DLIRelation]))
// .filter(p => p.getDataInfo.getDeletedbyinference == false)
// .map(DLIToOAF.convertDLIRelation).filter(p=>p!= null)
// spark.createDataset(relRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS")
//
// val datRDD:RDD[OafDataset] = sc.textFile(s"$workingPath/dataset")
// .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset]))
// .filter(p => p.getDataInfo.getDeletedbyinference == false)
// .map(DLIToOAF.convertDLIDatasetTOOAF).filter(p=>p!= null)
// spark.createDataset(datRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetDS")
//
//
// val pubRDD:RDD[Publication] = sc.textFile(s"$workingPath/publication")
// .map(s => new ObjectMapper().readValue(s, classOf[DLIPublication]))
// .filter(p => p.getDataInfo.getDeletedbyinference == false)
// .map(DLIToOAF.convertDLIPublicationToOAF).filter(p=>p!= null)
// spark.createDataset(pubRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS")
//
//
//
// val pubs:Dataset[Publication] = spark.read.load(s"$workingPath/publicationDS").as[Publication]
// val dats :Dataset[OafDataset] = spark.read.load(s"$workingPath/datasetDS").as[OafDataset]
var relDS :Dataset[Relation] = spark.read.load(s"$workingPath/relationDS").as[Relation]
//
//
// pubs.joinWith(relDS, pubs("id").equalTo(relDS("source"))).map(k => k._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_f1")
//
// relDS= spark.read.load(s"$workingPath/relationDS_f1").as[Relation]
//
// relDS.joinWith(dats, relDS("target").equalTo(dats("id"))).map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_filtered")
//
//
// val r_source = relDS.select(relDS("source")).distinct()
// val r_target = relDS.select(relDS("source")).distinct()
//
//
// pubs.joinWith(r_source, pubs("id").equalTo(r_source("source")), "inner").map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS_filtered")
//
// dats.joinWith(r_target, dats("id").equalTo(r_target("target")), "inner").map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetDS_filtered")
//
// spark.createDataset(sc.textFile(s"$workingPath/dataset")
// .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset]))
// .map(DLIToOAF.convertDLIDatasetToExternalReference)
// .filter(p => p != null)).as[DLIExternalReference].write.mode(SaveMode.Overwrite).save(s"$workingPath/externalReference")
//
val relRDD:RDD[Relation] = sc.textFile(s"$workingPath/relation_j")
.map(s => new ObjectMapper().readValue(s, classOf[DLIRelation]))
.filter(p => p.getDataInfo.getDeletedbyinference == false)
.map(DLIToOAF.convertDLIRelation).filter(p=>p!= null)
spark.createDataset(relRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS")
val datRDD:RDD[OafDataset] = sc.textFile(s"$workingPath/dataset")
.map(s => new ObjectMapper().readValue(s, classOf[DLIDataset]))
.filter(p => p.getDataInfo.getDeletedbyinference == false)
.map(DLIToOAF.convertDLIDatasetTOOAF).filter(p=>p!= null)
spark.createDataset(datRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetDS")
val pubRDD:RDD[Publication] = sc.textFile(s"$workingPath/publication")
.map(s => new ObjectMapper().readValue(s, classOf[DLIPublication]))
.filter(p => p.getDataInfo.getDeletedbyinference == false)
.map(DLIToOAF.convertDLIPublicationToOAF).filter(p=>p!= null)
spark.createDataset(pubRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS")
val pubs:Dataset[Publication] = spark.read.load(s"$workingPath/publicationDS").as[Publication]
val dats :Dataset[OafDataset] = spark.read.load(s"$workingPath/datasetDS").as[OafDataset]
val relDS1 :Dataset[Relation] = spark.read.load(s"$workingPath/relationDS").as[Relation]
val pub_id = pubs.select("id").distinct()
val dat_id = dats.select("id").distinct()
pub_id.joinWith(relDS1, pub_id("id").equalTo(relDS1("source"))).map(k => k._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_f1")
val relDS2= spark.read.load(s"$workingPath/relationDS_f1").as[Relation]
relDS2.joinWith(dat_id, relDS2("target").equalTo(dats("id"))).map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_filtered")
val r_source = relDS2.select(relDS2("source")).distinct()
val r_target = relDS2.select(relDS2("target")).distinct()
val w2 = Window.partitionBy("id").orderBy("lastupdatetimestamp")
pubs.joinWith(r_source, pubs("id").equalTo(r_source("source")), "inner").map(k => k._1)
.withColumn("row",row_number.over(w2)).where($"row" === 1).drop("row")
.write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS_filtered")
dats.joinWith(r_target, dats("id").equalTo(r_target("target")), "inner").map(k => k._1)
.withColumn("row",row_number.over(w2)).where($"row" === 1).drop("row")
.write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetAS")
spark.createDataset(sc.textFile(s"$workingPath/dataset")
.map(s => new ObjectMapper().readValue(s, classOf[DLIDataset]))
.map(DLIToOAF.convertDLIDatasetToExternalReference)
.filter(p => p != null)).as[DLIExternalReference].write.mode(SaveMode.Overwrite).save(s"$workingPath/externalReference")
val pf = spark.read.load(s"$workingPath/publicationDS_filtered").select("id")
relDS = spark.read.load(s"$workingPath/relationDS").as[Relation]
val relationTo = pf.joinWith(relDS, pf("id").equalTo(relDS("source")),"inner").map(t =>t._2)
val relDS3 = spark.read.load(s"$workingPath/relationDS").as[Relation]
val relationTo = pf.joinWith(relDS3, pf("id").equalTo(relDS3("source")),"inner").map(t =>t._2)
val extRef = spark.read.load(s"$workingPath/externalReference").as[DLIExternalReference]
@ -100,19 +115,70 @@ object SparkExportContentForOpenAire {
(f._1, dli_ext)
})).write.mode(SaveMode.Overwrite).save(s"$workingPath/externalReference_grouped")
val pubf :Dataset[Publication] = spark.read.load(s"$workingPath/publicationDS_filtered").as[Publication]
val groupedERf:Dataset[(String, List[DLIExternalReference])]= spark.read.load(s"$workingPath/externalReference_grouped").as[(String, List[DLIExternalReference])]
groupedERf.joinWith(pubf,pubf("id").equalTo(groupedERf("_1"))).map(t =>
{
val publication = t._2
if (t._1 != null) {
val eRefs = t._1._2
DLIToOAF.insertExternalRefs(publication, eRefs)
} else
publication
}
).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationAS")
spark.createDataset(sc.textFile(s"$workingPath/dataset")
.map(s => new ObjectMapper().readValue(s, classOf[DLIDataset]))
.map(DLIToOAF.convertClinicalTrial)
.filter(p => p != null))
.write.mode(SaveMode.Overwrite).save(s"$workingPath/clinicalTrials")
val ct:Dataset[(String,String)] = spark.read.load(s"$workingPath/clinicalTrials").as[(String,String)]
val relDS= spark.read.load(s"$workingPath/relationDS_f1").as[Relation]
relDS.joinWith(ct, relDS("target").equalTo(ct("_1")), "inner")
.map(k =>{
val currentRel = k._1
currentRel.setTarget(k._2._2)
currentRel
}).write.mode(SaveMode.Overwrite).save(s"$workingPath/clinicalTrialsRels")
val clRels:Dataset[Relation] = spark.read.load(s"$workingPath/clinicalTrialsRels").as[Relation]
val rels:Dataset[Relation] = spark.read.load(s"$workingPath/relationDS_filtered").as[Relation]
rels.union(clRels).flatMap(r => {
val inverseRel = new Relation
inverseRel.setSource(r.getTarget)
inverseRel.setTarget(r.getSource)
inverseRel.setDataInfo(r.getDataInfo)
inverseRel.setCollectedfrom(r.getCollectedfrom)
inverseRel.setRelType(r.getRelType)
inverseRel.setSubRelType(r.getSubRelType)
inverseRel.setRelClass(DLIToOAF.rel_inverse(r.getRelClass))
List(r, inverseRel)
}).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationAS")
val fRels:Dataset[(String,String)] = spark.read.load(s"$workingPath/relationAS").as[Relation].map(DLIToOAF.toActionSet)
val fpubs:Dataset[(String,String)] = spark.read.load(s"$workingPath/publicationAS").as[Publication].map(DLIToOAF.toActionSet)
val fdats:Dataset[(String,String)] = spark.read.load(s"$workingPath/datasetAS").as[OafDataset].map(DLIToOAF.toActionSet)
fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
}
}

@ -3,7 +3,7 @@
<parent>
<artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<version>1.2.3-SNAPSHOT</version>
<version>1.2.4-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>

@ -25,9 +25,7 @@ import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
@ -109,11 +107,12 @@ public class CreateRelatedEntitiesJob_phase1 {
Class<E> clazz,
String outputPath) {
Dataset<Tuple2<String, SortableRelation>> relsByTarget = readPathRelation(spark, inputRelationsPath)
Dataset<Tuple2<String, Relation>> relsByTarget = readPathRelation(spark, inputRelationsPath)
.filter("dataInfo.deletedbyinference == false")
.map(
(MapFunction<SortableRelation, Tuple2<String, SortableRelation>>) r -> new Tuple2<>(r.getTarget(), r),
Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class)))
(MapFunction<Relation, Tuple2<String, Relation>>) r -> new Tuple2<>(r.getTarget(),
r),
Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class)))
.cache();
Dataset<Tuple2<String, RelatedEntity>> entities = readPathEntity(spark, inputEntityPath, clazz)
@ -129,7 +128,7 @@ public class CreateRelatedEntitiesJob_phase1 {
relsByTarget
.joinWith(entities, entities.col("_1").equalTo(relsByTarget.col("_1")), "inner")
.map(
(MapFunction<Tuple2<Tuple2<String, SortableRelation>, Tuple2<String, RelatedEntity>>, RelatedEntityWrapper>) t -> new RelatedEntityWrapper(
(MapFunction<Tuple2<Tuple2<String, Relation>, Tuple2<String, RelatedEntity>>, RelatedEntityWrapper>) t -> new RelatedEntityWrapper(
t._1()._2(), t._2()._2()),
Encoders.kryo(RelatedEntityWrapper.class))
.write()
@ -232,11 +231,11 @@ public class CreateRelatedEntitiesJob_phase1 {
* @param relationPath
* @return the Dataset<SortableRelation> containing all the relationships
*/
private static Dataset<SortableRelation> readPathRelation(
private static Dataset<Relation> readPathRelation(
SparkSession spark, final String relationPath) {
log.info("Reading relations from: {}", relationPath);
return spark.read().load(relationPath).as(Encoders.bean(SortableRelation.class));
return spark.read().load(relationPath).as(Encoders.bean(Relation.class));
}
private static void removeOutputDir(SparkSession spark, String path) {

@ -3,35 +3,31 @@ package eu.dnetlib.dhp.oa.provision;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.*;
import java.util.function.Function;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.clearspring.analytics.util.Lists;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner;
import scala.Function1;
import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
/**
@ -133,22 +129,35 @@ public class PrepareRelationsJob {
SparkSession spark, String inputRelationsPath, String outputPath, Set<String> relationFilter, int maxRelations,
int relPartitions) {
RDD<SortableRelation> cappedRels = readPathRelationRDD(spark, inputRelationsPath)
.repartition(relPartitions)
.filter(rel -> !rel.getDataInfo().getDeletedbyinference())
.filter(rel -> !relationFilter.contains(rel.getRelClass()))
// group by SOURCE and apply limit
.mapToPair(rel -> new Tuple2<>(rel.getSource(), rel))
.groupByKey(new RelationPartitioner(relPartitions))
.flatMap(group -> Iterables.limit(group._2(), maxRelations).iterator())
// group by TARGET and apply limit
.mapToPair(rel -> new Tuple2<>(rel.getTarget(), rel))
.groupByKey(new RelationPartitioner(relPartitions))
.flatMap(group -> Iterables.limit(group._2(), maxRelations).iterator())
// group by SOURCE and apply limit
RDD<Relation> bySource = readPathRelationRDD(spark, inputRelationsPath)
.filter(rel -> rel.getDataInfo().getDeletedbyinference() == false)
.filter(rel -> relationFilter.contains(rel.getRelClass()) == false)
.mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, r.getSource()), r))
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
.groupBy(Tuple2::_1)
.map(Tuple2::_2)
.map(t -> Iterables.limit(t, maxRelations))
.flatMap(Iterable::iterator)
.map(Tuple2::_2)
.rdd();
// group by TARGET and apply limit
RDD<Relation> byTarget = readPathRelationRDD(spark, inputRelationsPath)
.filter(rel -> rel.getDataInfo().getDeletedbyinference() == false)
.filter(rel -> relationFilter.contains(rel.getRelClass()) == false)
.mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, r.getTarget()), r))
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
.groupBy(Tuple2::_1)
.map(Tuple2::_2)
.map(t -> Iterables.limit(t, maxRelations))
.flatMap(Iterable::iterator)
.map(Tuple2::_2)
.rdd();
spark
.createDataset(cappedRels, Encoders.bean(SortableRelation.class))
.createDataset(bySource.union(byTarget), Encoders.bean(Relation.class))
.repartition(relPartitions)
.write()
.mode(SaveMode.Overwrite)
.parquet(outputPath);
@ -162,10 +171,10 @@ public class PrepareRelationsJob {
* @param inputPath
* @return the JavaRDD<SortableRelation> containing all the relationships
*/
private static JavaRDD<SortableRelation> readPathRelationRDD(
private static JavaRDD<Relation> readPathRelationRDD(
SparkSession spark, final String inputPath) {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, SortableRelation.class));
return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, Relation.class));
}
private static void removeOutputDir(SparkSession spark, String path) {

@ -19,7 +19,7 @@ public class ProvisionModelSupport {
RelatedEntityWrapper.class,
JoinedEntity.class,
RelatedEntity.class,
SortableRelation.class));
SortableRelationKey.class));
return modelClasses.toArray(new Class[] {});
}
}

@ -5,28 +5,30 @@ import java.io.Serializable;
import com.google.common.base.Objects;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class RelatedEntityWrapper implements Serializable {
private SortableRelation relation;
private Relation relation;
private RelatedEntity target;
public RelatedEntityWrapper() {
}
public RelatedEntityWrapper(SortableRelation relation, RelatedEntity target) {
public RelatedEntityWrapper(Relation relation, RelatedEntity target) {
this(null, relation, target);
}
public RelatedEntityWrapper(TypedRow entity, SortableRelation relation, RelatedEntity target) {
public RelatedEntityWrapper(TypedRow entity, Relation relation, RelatedEntity target) {
this.relation = relation;
this.target = target;
}
public SortableRelation getRelation() {
public Relation getRelation() {
return relation;
}
public void setRelation(SortableRelation relation) {
public void setRelation(Relation relation) {
this.relation = relation;
}

@ -1,38 +0,0 @@
package eu.dnetlib.dhp.oa.provision.model;
import java.io.Serializable;
import java.util.Map;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class SortableRelation extends Relation implements Comparable<Relation>, Serializable {
private static final Map<String, Integer> weights = Maps.newHashMap();
static {
weights.put("outcome", 0);
weights.put("supplement", 1);
weights.put("affiliation", 2);
weights.put("relationship", 3);
weights.put("publicationDataset", 4);
weights.put("similarity", 5);
weights.put("provision", 6);
weights.put("participation", 7);
weights.put("dedup", 8);
}
@Override
public int compareTo(Relation o) {
return ComparisonChain
.start()
.compare(weights.get(getSubRelType()), weights.get(o.getSubRelType()))
.compare(getSource(), o.getSource())
.compare(getTarget(), o.getTarget())
.result();
}
}

@ -0,0 +1,90 @@
package eu.dnetlib.dhp.oa.provision.model;
import java.io.Serializable;
import java.util.Map;
import java.util.Optional;
import com.google.common.base.Objects;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class SortableRelationKey implements Comparable<SortableRelationKey>, Serializable {
private static final Map<String, Integer> weights = Maps.newHashMap();
static {
weights.put("outcome", 0);
weights.put("supplement", 1);
weights.put("review", 2);
weights.put("citation", 3);
weights.put("affiliation", 4);
weights.put("relationship", 5);
weights.put("publicationDataset", 6);
weights.put("similarity", 7);
weights.put("provision", 8);
weights.put("participation", 9);
weights.put("dedup", 10);
}
private static final long serialVersionUID = 3232323;
private String groupingKey;
private String subRelType;
public static SortableRelationKey create(Relation r, String groupingKey) {
SortableRelationKey sr = new SortableRelationKey();
sr.setGroupingKey(groupingKey);
sr.setSubRelType(r.getSubRelType());
return sr;
}
@Override
public boolean equals(Object o) {
if (this == o)
return true;
if (o == null || getClass() != o.getClass())
return false;
SortableRelationKey that = (SortableRelationKey) o;
return getGroupingKey().equals(that.getGroupingKey());
}
@Override
public int hashCode() {
return Objects.hashCode(getGroupingKey());
}
@Override
public int compareTo(SortableRelationKey o) {
return ComparisonChain
.start()
.compare(getGroupingKey(), o.getGroupingKey())
.compare(getWeight(this), getWeight(o))
.result();
}
private Integer getWeight(SortableRelationKey o) {
return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE);
}
public String getSubRelType() {
return subRelType;
}
public void setSubRelType(String subRelType) {
this.subRelType = subRelType;
}
public String getGroupingKey() {
return groupingKey;
}
public void setGroupingKey(String groupingKey) {
this.groupingKey = groupingKey;
}
}

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save