forked from D-Net/dnet-hadoop
merge branch with fork master
This commit is contained in:
commit
25a7205549
|
@ -1,6 +1,10 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.common;
|
||||
|
||||
import java.security.Key;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
|
||||
public class ModelConstants {
|
||||
|
@ -95,6 +99,9 @@ public class ModelConstants {
|
|||
SYSIMPORT_CROSSWALK_ENTITYREGISTRY, SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
|
||||
DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS);
|
||||
|
||||
/**
 * Key/value constant built through {@code keyValue}, carrying the label "Unknown Repository".
 * NOTE(review): presumably a placeholder for records whose repository is not known - confirm against callers.
 */
public static final KeyValue UNKNOWN_REPOSITORY = keyValue(
|
||||
"10|openaire____::55045bd2a65019fd8e6741a755395c8c", "Unknown Repository");
|
||||
|
||||
private static Qualifier qualifier(
|
||||
final String classid,
|
||||
final String classname,
|
||||
|
@ -107,4 +114,12 @@ public class ModelConstants {
|
|||
q.setSchemename(schemename);
|
||||
return q;
|
||||
}
|
||||
|
||||
private static KeyValue keyValue(String key, String value) {
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setKey(key);
|
||||
kv.setValue(value);
|
||||
kv.setDataInfo(new DataInfo());
|
||||
return kv;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.actionmanager.promote;
|
|||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.function.BiFunction;
|
||||
|
@ -20,6 +21,7 @@ import org.slf4j.Logger;
|
|||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.fasterxml.jackson.databind.exc.UnrecognizedPropertyException;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier;
|
||||
|
@ -134,24 +136,39 @@ public class PromoteActionPayloadForGraphTableJob {
|
|||
.map(
|
||||
(MapFunction<String, G>) value -> OBJECT_MAPPER.readValue(value, rowClazz),
|
||||
Encoders.bean(rowClazz));
|
||||
|
||||
/*
|
||||
* return spark .read() .parquet(path) .as(Encoders.bean(rowClazz));
|
||||
*/
|
||||
}
|
||||
|
||||
private static <A extends Oaf> Dataset<A> readActionPayload(
|
||||
SparkSession spark, String path, Class<A> actionPayloadClazz) {
|
||||
logger.info("Reading action payload from path: {}", path);
|
||||
|
||||
return spark
|
||||
.read()
|
||||
.parquet(path)
|
||||
.map((MapFunction<Row, String>) value -> extractPayload(value), Encoders.STRING())
|
||||
.map(
|
||||
(MapFunction<Row, A>) value -> OBJECT_MAPPER
|
||||
.readValue(value.<String> getAs("payload"), actionPayloadClazz),
|
||||
(MapFunction<String, A>) value -> decodePayload(actionPayloadClazz, value),
|
||||
Encoders.bean(actionPayloadClazz));
|
||||
}
|
||||
|
||||
private static String extractPayload(Row value) {
|
||||
try {
|
||||
return value.<String> getAs("payload");
|
||||
} catch (IllegalArgumentException | ClassCastException e) {
|
||||
logger.error("cannot extract payload from action: {}", value.toString());
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
private static <A extends Oaf> A decodePayload(Class<A> actionPayloadClazz, String payload) throws IOException {
|
||||
try {
|
||||
return OBJECT_MAPPER.readValue(payload, actionPayloadClazz);
|
||||
} catch (UnrecognizedPropertyException e) {
|
||||
logger.error("error decoding payload: {}", payload);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
private static <G extends Oaf, A extends Oaf> Dataset<G> promoteActionPayloadForGraphTable(
|
||||
Dataset<G> rowDS,
|
||||
Dataset<A> actionPayloadDS,
|
||||
|
|
|
@ -24,11 +24,7 @@
|
|||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-hive_2.11</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
|
@ -45,10 +41,6 @@
|
|||
<artifactId>dnet-pace-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>dom4j</groupId>
|
||||
<artifactId>dom4j</artifactId>
|
||||
|
@ -61,7 +53,7 @@
|
|||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaire-broker-common</artifactId>
|
||||
<version>[3.0.1,4.0.0)</version>
|
||||
<version>[3.0.3,4.0.0)</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
|
|
@ -11,7 +11,7 @@ import org.apache.commons.codec.digest.DigestUtils;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.time.DateUtils;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
|
||||
public class EventFactory {
|
||||
|
@ -49,8 +49,8 @@ public class EventFactory {
|
|||
private static Map<String, Object> createMapFromResult(final UpdateInfo<?> updateInfo) {
|
||||
final Map<String, Object> map = new HashMap<>();
|
||||
|
||||
final OpenaireBrokerResult source = updateInfo.getSource();
|
||||
final OpenaireBrokerResult target = updateInfo.getTarget();
|
||||
final OaBrokerMainEntity source = updateInfo.getSource();
|
||||
final OaBrokerMainEntity target = updateInfo.getTarget();
|
||||
|
||||
map.put("target_datasource_id", target.getCollectedFromId());
|
||||
map.put("target_datasource_name", target.getCollectedFromName());
|
||||
|
|
|
@ -18,25 +18,20 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.model.Event;
|
||||
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.EventFinder;
|
||||
import eu.dnetlib.dhp.broker.oa.util.EventGroup;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultAggregator;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OpenaireBrokerResultAggregator;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedEntityFactory;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OaBrokerMainEntityAggregator;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
|
@ -46,8 +41,6 @@ public class GenerateEventsApplication {
|
|||
|
||||
private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
|
@ -75,126 +68,108 @@ public class GenerateEventsApplication {
|
|||
log.info("dedupConfigProfileId: {}", dedupConfigProfileId);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
conf.registerKryoClasses(BrokerConstants.getModelClasses());
|
||||
// conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
// conf.registerKryoClasses(BrokerConstants.getModelClasses());
|
||||
|
||||
final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId);
|
||||
// TODO UNCOMMENT
|
||||
// final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId);
|
||||
final DedupConfig dedupConfig = null;
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
||||
removeOutputDir(spark, eventsPath);
|
||||
ClusterUtils.removeDir(spark, eventsPath);
|
||||
|
||||
spark
|
||||
.emptyDataset(Encoders.kryo(Event.class))
|
||||
.union(generateEvents(spark, graphPath, Publication.class, dedupConfig))
|
||||
.union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig))
|
||||
.union(generateEvents(spark, graphPath, Software.class, dedupConfig))
|
||||
.union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig))
|
||||
// TODO REMOVE THIS
|
||||
|
||||
expandResultsWithRelations(spark, graphPath, Publication.class)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(eventsPath);
|
||||
|
||||
// TODO UNCOMMENT THIS
|
||||
// spark
|
||||
// .emptyDataset(Encoders.bean(Event.class))
|
||||
// .union(generateEvents(spark, graphPath, Publication.class, dedupConfig))
|
||||
// .union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig))
|
||||
// .union(generateEvents(spark, graphPath, Software.class, dedupConfig))
|
||||
// .union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig))
|
||||
// .write()
|
||||
// .mode(SaveMode.Overwrite)
|
||||
// .option("compression", "gzip")
|
||||
// .json(eventsPath);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
/**
 * Removes the given path through {@code HdfsSupport.remove}, using the Hadoop configuration
 * of the Spark context attached to the session.
 *
 * @param spark the active Spark session
 * @param path  the path to remove
 */
private static void removeOutputDir(final SparkSession spark, final String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
private static <SRC extends Result> Dataset<Event> generateEvents(
|
||||
final SparkSession spark,
|
||||
final String graphPath,
|
||||
final Class<SRC> sourceClass,
|
||||
final DedupConfig dedupConfig) {
|
||||
|
||||
final Dataset<OpenaireBrokerResult> results = expandResultsWithRelations(spark, graphPath, sourceClass);
|
||||
final Dataset<OaBrokerMainEntity> results = expandResultsWithRelations(spark, graphPath, sourceClass);
|
||||
|
||||
final Dataset<Relation> mergedRels = readPath(spark, graphPath + "/relation", Relation.class)
|
||||
final Dataset<Relation> mergedRels = ClusterUtils
|
||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||
|
||||
final TypedColumn<Tuple2<OpenaireBrokerResult, Relation>, ResultGroup> aggr = new ResultAggregator()
|
||||
final TypedColumn<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup> aggr = new ResultAggregator()
|
||||
.toColumn();
|
||||
|
||||
return results
|
||||
.joinWith(mergedRels, results.col("result.id").equalTo(mergedRels.col("source")), "inner")
|
||||
.joinWith(mergedRels, results.col("openaireId").equalTo(mergedRels.col("source")), "inner")
|
||||
.groupByKey(
|
||||
(MapFunction<Tuple2<OpenaireBrokerResult, Relation>, String>) t -> t._2.getTarget(), Encoders.STRING())
|
||||
(MapFunction<Tuple2<OaBrokerMainEntity, Relation>, String>) t -> t._2.getTarget(), Encoders.STRING())
|
||||
.agg(aggr)
|
||||
.map((MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class))
|
||||
.filter(ResultGroup::isValid)
|
||||
.map((MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class))
|
||||
.filter(rg -> rg.getData().size() > 1)
|
||||
.map(
|
||||
(MapFunction<ResultGroup, EventGroup>) g -> EventFinder.generateEvents(g, dedupConfig),
|
||||
Encoders.kryo(EventGroup.class))
|
||||
.flatMap(group -> group.getData().iterator(), Encoders.kryo(Event.class));
|
||||
Encoders.bean(EventGroup.class))
|
||||
.flatMap(group -> group.getData().iterator(), Encoders.bean(Event.class));
|
||||
}
|
||||
|
||||
private static <SRC extends Result> Dataset<OpenaireBrokerResult> expandResultsWithRelations(
|
||||
private static <SRC extends Result> Dataset<OaBrokerMainEntity> expandResultsWithRelations(
|
||||
final SparkSession spark,
|
||||
final String graphPath,
|
||||
final Class<SRC> sourceClass) {
|
||||
final Dataset<Project> projects = readPath(spark, graphPath + "/project", Project.class);
|
||||
final Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> datasets = readPath(
|
||||
spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
|
||||
final Dataset<Software> softwares = readPath(spark, graphPath + "/software", Software.class);
|
||||
final Dataset<Publication> publications = readPath(spark, graphPath + "/publication", Publication.class);
|
||||
|
||||
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
|
||||
.cache();
|
||||
// final Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> datasets = readPath(
|
||||
// spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
|
||||
// final Dataset<Software> softwares = readPath(spark, graphPath + "/software", Software.class);
|
||||
// final Dataset<Publication> publications = readPath(spark, graphPath + "/publication", Publication.class);
|
||||
|
||||
final Dataset<OpenaireBrokerResult> r0 = readPath(
|
||||
spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), Result.class)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||
.map(ConversionUtils::oafResultToBrokerResult, Encoders.kryo(OpenaireBrokerResult.class));
|
||||
final Dataset<OaBrokerMainEntity> r0 = ClusterUtils
|
||||
.readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||
.map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class));
|
||||
|
||||
final Dataset<OpenaireBrokerResult> r1 = join(r0, rels, relatedEntities(projects, rels, RelatedProject.class));
|
||||
final Dataset<OpenaireBrokerResult> r2 = join(r1, rels, relatedEntities(softwares, rels, RelatedProject.class));
|
||||
final Dataset<OpenaireBrokerResult> r3 = join(r2, rels, relatedEntities(datasets, rels, RelatedProject.class));
|
||||
final Dataset<OpenaireBrokerResult> r4 = join(
|
||||
r3, rels, relatedEntities(publications, rels, RelatedProject.class));
|
||||
;
|
||||
// TODO UNCOMMENT THIS
|
||||
// final Dataset<OaBrokerMainEntity> r1 = join(r0, relatedProjects(spark, graphPath));
|
||||
// final Dataset<OaBrokerMainEntity> r2 = join(r1, relatedDataset(spark, graphPath));
|
||||
// final Dataset<OaBrokerMainEntity> r3 = join(r2, relatedPublications(spark, graphPath));
|
||||
// final Dataset<OaBrokerMainEntity> r4 = join(r3, relatedSoftwares(spark, graphPath));
|
||||
|
||||
return r4;
|
||||
return r0; // TODO it should be r4
|
||||
}
|
||||
|
||||
/**
 * Inner-joins the relations with the target entities on {@code targets.id == rels.target} and turns every
 * matching pair into an instance of {@code clazz} through {@code RelatedEntityFactory.newRelatedEntity},
 * passing the relation source, the relation type and the joined target. The result is Kryo-encoded.
 *
 * @param targets the entities that relations may point to
 * @param rels    the relations to join
 * @param clazz   the class of the related-entity objects to produce
 * @return the dataset of related entities
 */
private static <T, RT> Dataset<RT> relatedEntities(final Dataset<T> targets,
|
||||
final Dataset<Relation> rels,
|
||||
final Class<RT> clazz) {
|
||||
return rels
|
||||
.joinWith(targets, targets.col("id").equalTo(rels.col("target")), "inner")
|
||||
.map(
|
||||
t -> RelatedEntityFactory.newRelatedEntity(t._1.getSource(), t._1.getRelType(), t._2, clazz),
|
||||
Encoders.kryo(clazz));
|
||||
}
|
||||
|
||||
private static <T> Dataset<OpenaireBrokerResult> join(final Dataset<OpenaireBrokerResult> sources,
|
||||
final Dataset<Relation> rels,
|
||||
private static <T> Dataset<OaBrokerMainEntity> join(final Dataset<OaBrokerMainEntity> sources,
|
||||
final Dataset<T> typedRels) {
|
||||
|
||||
final TypedColumn<Tuple2<OpenaireBrokerResult, T>, OpenaireBrokerResult> aggr = new OpenaireBrokerResultAggregator<T>()
|
||||
final TypedColumn<Tuple2<OaBrokerMainEntity, T>, OaBrokerMainEntity> aggr = new OaBrokerMainEntityAggregator<T>()
|
||||
.toColumn();
|
||||
;
|
||||
|
||||
return sources
|
||||
.joinWith(typedRels, sources.col("result.id").equalTo(rels.col("source")), "left_outer")
|
||||
.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
|
||||
.groupByKey(
|
||||
(MapFunction<Tuple2<OpenaireBrokerResult, T>, String>) t -> t._1.getOpenaireId(), Encoders.STRING())
|
||||
(MapFunction<Tuple2<OaBrokerMainEntity, T>, String>) t -> t._1.getOpenaireId(), Encoders.STRING())
|
||||
.agg(aggr)
|
||||
.map(t -> t._2, Encoders.kryo(OpenaireBrokerResult.class));
|
||||
}
|
||||
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
|
||||
|
||||
/**
 * Reads {@code inputPath} as a text file and deserializes every line with {@code OBJECT_MAPPER}
 * into an instance of {@code clazz}, using a bean encoder for the resulting dataset.
 *
 * @param spark     the active Spark session
 * @param inputPath the location of the text file(s) to read
 * @param clazz     the class each line is mapped to
 * @return the dataset of deserialized objects
 */
public static <R> Dataset<R> readPath(
|
||||
final SparkSession spark,
|
||||
final String inputPath,
|
||||
final Class<R> clazz) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception {
|
||||
|
||||
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
|
||||
final String conf = isLookUpService
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
public class GenerateRelatedDatasets {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GenerateRelatedDatasets.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
GenerateRelatedDatasets.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String relsPath = parser.get("relsPath");
|
||||
log.info("relsPath: {}", relsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
||||
ClusterUtils.removeDir(spark, relsPath);
|
||||
|
||||
final Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> datasets = ClusterUtils
|
||||
.readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
|
||||
|
||||
final Dataset<Relation> rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class);
|
||||
|
||||
rels
|
||||
.joinWith(datasets, datasets.col("id").equalTo(rels.col("target")), "inner")
|
||||
.map(
|
||||
t -> new RelatedDataset(
|
||||
t._1.getSource(),
|
||||
t._1.getRelType(),
|
||||
ConversionUtils.oafDatasetToBrokerDataset(t._2)),
|
||||
Encoders.bean(RelatedDataset.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(relsPath);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,79 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
public class GenerateRelatedProjects {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GenerateRelatedProjects.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
GenerateRelatedProjects.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String relsPath = parser.get("relsPath");
|
||||
log.info("relsPath: {}", relsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
||||
ClusterUtils.removeDir(spark, relsPath);
|
||||
|
||||
final Dataset<Project> projects = ClusterUtils.readPath(spark, graphPath + "/project", Project.class);
|
||||
|
||||
final Dataset<Relation> rels = ClusterUtils
|
||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT));
|
||||
|
||||
rels
|
||||
.joinWith(projects, projects.col("id").equalTo(rels.col("target")), "inner")
|
||||
.map(
|
||||
t -> new RelatedProject(
|
||||
t._1.getSource(),
|
||||
t._1.getRelType(),
|
||||
ConversionUtils.oafProjectToBrokerProject(t._2)),
|
||||
Encoders.bean(RelatedProject.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(relsPath);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,78 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
public class GenerateRelatedPublications {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GenerateRelatedPublications.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
GenerateRelatedPublications.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String relsPath = parser.get("relsPath");
|
||||
log.info("relsPath: {}", relsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
||||
ClusterUtils.removeDir(spark, relsPath);
|
||||
|
||||
final Dataset<Publication> pubs = ClusterUtils
|
||||
.readPath(spark, graphPath + "/publication", Publication.class);
|
||||
|
||||
final Dataset<Relation> rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class);
|
||||
|
||||
rels
|
||||
.joinWith(pubs, pubs.col("id").equalTo(rels.col("target")), "inner")
|
||||
.map(
|
||||
t -> new RelatedPublication(
|
||||
t._1.getSource(),
|
||||
t._1.getRelType(),
|
||||
ConversionUtils.oafPublicationToBrokerPublication(t._2)),
|
||||
Encoders.bean(RelatedPublication.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(relsPath);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,76 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
|
||||
public class GenerateRelatedSoftwares {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GenerateRelatedSoftwares.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
GenerateRelatedSoftwares.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_relations.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String relsPath = parser.get("relsPath");
|
||||
log.info("relsPath: {}", relsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
||||
ClusterUtils.removeDir(spark, relsPath);
|
||||
final Dataset<Software> softwares = ClusterUtils.readPath(spark, graphPath + "/software", Software.class);
|
||||
|
||||
final Dataset<Relation> rels = ClusterUtils.readPath(spark, graphPath + "/relation", Relation.class);
|
||||
|
||||
rels
|
||||
.joinWith(softwares, softwares.col("id").equalTo(rels.col("target")), "inner")
|
||||
.map(
|
||||
t -> new RelatedSoftware(
|
||||
t._1.getSource(),
|
||||
t._1.getRelType(),
|
||||
ConversionUtils.oafSoftwareToBrokerSoftware(t._2)),
|
||||
Encoders.bean(RelatedSoftware.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(relsPath);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,85 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class GenerateSimpleEntitities {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GenerateSimpleEntitities.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
GenerateSimpleEntitities.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_simple_entities.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String simpleEntitiesPath = parser.get("simpleEntitiesPath");
|
||||
log.info("simpleEntitiesPath: {}", simpleEntitiesPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
||||
ClusterUtils.removeDir(spark, simpleEntitiesPath);
|
||||
|
||||
expandResultsWithRelations(spark, graphPath, Publication.class)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(simpleEntitiesPath);
|
||||
|
||||
// TODO UNCOMMENT THIS
|
||||
// spark
|
||||
// .emptyDataset(Encoders.bean(Event.class))
|
||||
// .union(generateEvents(spark, graphPath, Publication.class, dedupConfig))
|
||||
// .union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig))
|
||||
// .union(generateEvents(spark, graphPath, Software.class, dedupConfig))
|
||||
// .union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig))
|
||||
// .write()
|
||||
// .mode(SaveMode.Overwrite)
|
||||
// .option("compression", "gzip")
|
||||
// .json(eventsPath);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static <SRC extends Result> Dataset<OaBrokerMainEntity> expandResultsWithRelations(
|
||||
final SparkSession spark,
|
||||
final String graphPath,
|
||||
final Class<SRC> sourceClass) {
|
||||
|
||||
return ClusterUtils
|
||||
.readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||
.map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OaBrokerMainEntity.class));
|
||||
}
|
||||
|
||||
}
|
|
@ -12,7 +12,7 @@ import java.util.function.Function;
|
|||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
|
@ -21,11 +21,11 @@ public abstract class UpdateMatcher<T> {
|
|||
|
||||
private final boolean multipleUpdate;
|
||||
private final Function<T, Topic> topicFunction;
|
||||
private final BiConsumer<OpenaireBrokerResult, T> compileHighlightFunction;
|
||||
private final BiConsumer<OaBrokerMainEntity, T> compileHighlightFunction;
|
||||
private final Function<T, String> highlightToStringFunction;
|
||||
|
||||
public UpdateMatcher(final boolean multipleUpdate, final Function<T, Topic> topicFunction,
|
||||
final BiConsumer<OpenaireBrokerResult, T> compileHighlightFunction,
|
||||
final BiConsumer<OaBrokerMainEntity, T> compileHighlightFunction,
|
||||
final Function<T, String> highlightToStringFunction) {
|
||||
this.multipleUpdate = multipleUpdate;
|
||||
this.topicFunction = topicFunction;
|
||||
|
@ -33,13 +33,13 @@ public abstract class UpdateMatcher<T> {
|
|||
this.highlightToStringFunction = highlightToStringFunction;
|
||||
}
|
||||
|
||||
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final OpenaireBrokerResult res,
|
||||
final Collection<OpenaireBrokerResult> others,
|
||||
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final OaBrokerMainEntity res,
|
||||
final Collection<OaBrokerMainEntity> others,
|
||||
final DedupConfig dedupConfig) {
|
||||
|
||||
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
|
||||
|
||||
for (final OpenaireBrokerResult source : others) {
|
||||
for (final OaBrokerMainEntity source : others) {
|
||||
if (source != res) {
|
||||
for (final T hl : findDifferences(source, res)) {
|
||||
final Topic topic = getTopicFunction().apply(hl);
|
||||
|
@ -68,7 +68,7 @@ public abstract class UpdateMatcher<T> {
|
|||
}
|
||||
}
|
||||
|
||||
protected abstract List<T> findDifferences(OpenaireBrokerResult source, OpenaireBrokerResult target);
|
||||
protected abstract List<T> findDifferences(OaBrokerMainEntity source, OaBrokerMainEntity target);
|
||||
|
||||
protected static boolean isMissing(final List<String> list) {
|
||||
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0));
|
||||
|
@ -86,7 +86,7 @@ public abstract class UpdateMatcher<T> {
|
|||
return topicFunction;
|
||||
}
|
||||
|
||||
public BiConsumer<OpenaireBrokerResult, T> getCompileHighlightFunction() {
|
||||
public BiConsumer<OaBrokerMainEntity, T> getCompileHighlightFunction() {
|
||||
return compileHighlightFunction;
|
||||
}
|
||||
|
||||
|
|
|
@ -5,13 +5,12 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.broker.objects.Dataset;
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerRelatedDataset;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
|
||||
public abstract class AbstractEnrichMissingDataset
|
||||
extends UpdateMatcher<Dataset> {
|
||||
public abstract class AbstractEnrichMissingDataset extends UpdateMatcher<OaBrokerRelatedDataset> {
|
||||
|
||||
public AbstractEnrichMissingDataset(final Topic topic) {
|
||||
super(true,
|
||||
|
@ -23,14 +22,14 @@ public abstract class AbstractEnrichMissingDataset
|
|||
protected abstract boolean filterByType(String relType);
|
||||
|
||||
@Override
|
||||
protected final List<Dataset> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
protected final List<OaBrokerRelatedDataset> findDifferences(final OaBrokerMainEntity source,
|
||||
final OaBrokerMainEntity target) {
|
||||
|
||||
final Set<String> existingDatasets = target
|
||||
.getDatasets()
|
||||
.stream()
|
||||
.filter(rel -> filterByType(rel.getRelType()))
|
||||
.map(Dataset::getOriginalId)
|
||||
.map(OaBrokerRelatedDataset::getOriginalId)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
|
|
|
@ -4,12 +4,12 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.Project;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerProject;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
|
||||
public class EnrichMissingProject extends UpdateMatcher<Project> {
|
||||
public class EnrichMissingProject extends UpdateMatcher<OaBrokerProject> {
|
||||
|
||||
public EnrichMissingProject() {
|
||||
super(true,
|
||||
|
@ -19,7 +19,7 @@ public class EnrichMissingProject extends UpdateMatcher<Project> {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<Project> findDifferences(final OpenaireBrokerResult source, final OpenaireBrokerResult target) {
|
||||
protected List<OaBrokerProject> findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) {
|
||||
if (target.getProjects().isEmpty()) {
|
||||
return source.getProjects();
|
||||
} else {
|
||||
|
|
|
@ -5,12 +5,12 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.Project;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerProject;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
|
||||
public class EnrichMoreProject extends UpdateMatcher<Project> {
|
||||
public class EnrichMoreProject extends UpdateMatcher<OaBrokerProject> {
|
||||
|
||||
public EnrichMoreProject() {
|
||||
super(true,
|
||||
|
@ -19,13 +19,13 @@ public class EnrichMoreProject extends UpdateMatcher<Project> {
|
|||
prj -> projectAsString(prj));
|
||||
}
|
||||
|
||||
private static String projectAsString(final Project prj) {
|
||||
private static String projectAsString(final OaBrokerProject prj) {
|
||||
return prj.getFunder() + "::" + prj.getFundingProgram() + "::" + prj.getCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<eu.dnetlib.broker.objects.Project> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
protected List<OaBrokerProject> findDifferences(final OaBrokerMainEntity source,
|
||||
final OaBrokerMainEntity target) {
|
||||
|
||||
final Set<String> existingProjects = target
|
||||
.getProjects()
|
||||
|
|
|
@ -5,12 +5,12 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.Publication;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
|
||||
public abstract class AbstractEnrichMissingPublication extends UpdateMatcher<Publication> {
|
||||
public abstract class AbstractEnrichMissingPublication extends UpdateMatcher<OaBrokerRelatedPublication> {
|
||||
|
||||
public AbstractEnrichMissingPublication(final Topic topic) {
|
||||
super(true,
|
||||
|
@ -23,15 +23,15 @@ public abstract class AbstractEnrichMissingPublication extends UpdateMatcher<Pub
|
|||
protected abstract boolean filterByType(String relType);
|
||||
|
||||
@Override
|
||||
protected final List<eu.dnetlib.broker.objects.Publication> findDifferences(
|
||||
final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
protected final List<OaBrokerRelatedPublication> findDifferences(
|
||||
final OaBrokerMainEntity source,
|
||||
final OaBrokerMainEntity target) {
|
||||
|
||||
final Set<String> existingPublications = target
|
||||
.getPublications()
|
||||
.stream()
|
||||
.filter(rel -> filterByType(rel.getRelType()))
|
||||
.map(Publication::getOriginalId)
|
||||
.map(OaBrokerRelatedPublication::getOriginalId)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
|
|
|
@ -4,12 +4,13 @@ package eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
|
||||
public class EnrichMissingSoftware
|
||||
extends UpdateMatcher<eu.dnetlib.broker.objects.Software> {
|
||||
extends UpdateMatcher<OaBrokerRelatedSoftware> {
|
||||
|
||||
public EnrichMissingSoftware() {
|
||||
super(true,
|
||||
|
@ -19,9 +20,9 @@ public class EnrichMissingSoftware
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<eu.dnetlib.broker.objects.Software> findDifferences(
|
||||
final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
protected List<OaBrokerRelatedSoftware> findDifferences(
|
||||
final OaBrokerMainEntity source,
|
||||
final OaBrokerMainEntity target) {
|
||||
|
||||
if (target.getSoftwares().isEmpty()) {
|
||||
return source.getSoftwares();
|
||||
|
|
|
@ -5,12 +5,12 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.Software;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
|
||||
public class EnrichMoreSoftware extends UpdateMatcher<Software> {
|
||||
public class EnrichMoreSoftware extends UpdateMatcher<OaBrokerRelatedSoftware> {
|
||||
|
||||
public EnrichMoreSoftware() {
|
||||
super(true,
|
||||
|
@ -20,14 +20,14 @@ public class EnrichMoreSoftware extends UpdateMatcher<Software> {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<eu.dnetlib.broker.objects.Software> findDifferences(
|
||||
final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
protected List<OaBrokerRelatedSoftware> findDifferences(
|
||||
final OaBrokerMainEntity source,
|
||||
final OaBrokerMainEntity target) {
|
||||
|
||||
final Set<String> existingSoftwares = source
|
||||
.getSoftwares()
|
||||
.stream()
|
||||
.map(Software::getName)
|
||||
.map(OaBrokerRelatedSoftware::getName)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return target
|
||||
|
|
|
@ -5,7 +5,7 @@ import java.util.ArrayList;
|
|||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
|
||||
|
@ -19,7 +19,7 @@ public class EnrichMissingAbstract extends UpdateMatcher<String> {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<String> findDifferences(final OpenaireBrokerResult source, final OpenaireBrokerResult target) {
|
||||
protected List<String> findDifferences(final OaBrokerMainEntity source, final OaBrokerMainEntity target) {
|
||||
if (isMissing(target.getAbstracts()) && !isMissing(source.getAbstracts())) {
|
||||
return Arrays.asList(source.getAbstracts().get(0));
|
||||
} else {
|
||||
|
|
|
@ -7,12 +7,12 @@ import java.util.stream.Collectors;
|
|||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.broker.objects.Author;
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.OaBrokerAuthor;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
|
||||
public class EnrichMissingAuthorOrcid extends UpdateMatcher<Author> {
|
||||
public class EnrichMissingAuthorOrcid extends UpdateMatcher<OaBrokerAuthor> {
|
||||
|
||||
public EnrichMissingAuthorOrcid() {
|
||||
super(true,
|
||||
|
@ -22,13 +22,13 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher<Author> {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<Author> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
protected List<OaBrokerAuthor> findDifferences(final OaBrokerMainEntity source,
|
||||
final OaBrokerMainEntity target) {
|
||||
|
||||
final Set<String> existingOrcids = target
|
||||
.getCreators()
|
||||
.stream()
|
||||
.map(Author::getOrcid)
|
||||
.map(OaBrokerAuthor::getOrcid)
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
|
|
|
@ -5,28 +5,28 @@ import java.util.Arrays;
|
|||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.broker.objects.Instance;
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.OaBrokerInstance;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||
|
||||
public class EnrichMissingOpenAccess extends UpdateMatcher<Instance> {
|
||||
public class EnrichMissingOpenAccess extends UpdateMatcher<OaBrokerInstance> {
|
||||
|
||||
public EnrichMissingOpenAccess() {
|
||||
super(true,
|
||||
i -> Topic.ENRICH_MISSING_OA_VERSION,
|
||||
(p, i) -> p.getInstances().add(i),
|
||||
Instance::getUrl);
|
||||
OaBrokerInstance::getUrl);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Instance> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
protected List<OaBrokerInstance> findDifferences(final OaBrokerMainEntity source,
|
||||
final OaBrokerMainEntity target) {
|
||||
final long count = target
|
||||
.getInstances()
|
||||
.stream()
|
||||
.map(Instance::getLicense)
|
||||
.map(OaBrokerInstance::getLicense)
|
||||
.filter(right -> right.equals(BrokerConstants.OPEN_ACCESS))
|
||||
.count();
|
||||
|
||||
|
|
|
@ -5,12 +5,12 @@ import java.util.Arrays;
|
|||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.TypedValue;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
|
||||
public class EnrichMissingPid extends UpdateMatcher<TypedValue> {
|
||||
public class EnrichMissingPid extends UpdateMatcher<OaBrokerTypedValue> {
|
||||
|
||||
public EnrichMissingPid() {
|
||||
super(true,
|
||||
|
@ -20,8 +20,8 @@ public class EnrichMissingPid extends UpdateMatcher<TypedValue> {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
|
||||
final OaBrokerMainEntity target) {
|
||||
final long count = target.getPids().size();
|
||||
|
||||
if (count > 0) {
|
||||
|
|
|
@ -5,7 +5,7 @@ import java.util.ArrayList;
|
|||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
|
||||
|
@ -19,8 +19,8 @@ public class EnrichMissingPublicationDate extends UpdateMatcher<String> {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<String> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
protected List<String> findDifferences(final OaBrokerMainEntity source,
|
||||
final OaBrokerMainEntity target) {
|
||||
|
||||
if (isMissing(target.getPublicationdate()) && !isMissing(source.getPublicationdate())) {
|
||||
return Arrays.asList(source.getPublicationdate());
|
||||
|
|
|
@ -5,12 +5,12 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.TypedValue;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
|
||||
public class EnrichMissingSubject extends UpdateMatcher<TypedValue> {
|
||||
public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {
|
||||
|
||||
public EnrichMissingSubject() {
|
||||
super(true,
|
||||
|
@ -20,8 +20,8 @@ public class EnrichMissingSubject extends UpdateMatcher<TypedValue> {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
|
||||
final OaBrokerMainEntity target) {
|
||||
final Set<String> existingSubject = target
|
||||
.getSubjects()
|
||||
.stream()
|
||||
|
@ -35,7 +35,7 @@ public class EnrichMissingSubject extends UpdateMatcher<TypedValue> {
|
|||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static String subjectAsString(final TypedValue s) {
|
||||
private static String subjectAsString(final OaBrokerTypedValue s) {
|
||||
return s.getType() + "::" + s.getValue();
|
||||
}
|
||||
|
||||
|
|
|
@ -5,24 +5,24 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.broker.objects.Instance;
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.OaBrokerInstance;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||
|
||||
public class EnrichMoreOpenAccess extends UpdateMatcher<Instance> {
|
||||
public class EnrichMoreOpenAccess extends UpdateMatcher<OaBrokerInstance> {
|
||||
|
||||
public EnrichMoreOpenAccess() {
|
||||
super(true,
|
||||
i -> Topic.ENRICH_MORE_OA_VERSION,
|
||||
(p, i) -> p.getInstances().add(i),
|
||||
Instance::getUrl);
|
||||
OaBrokerInstance::getUrl);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Instance> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
protected List<OaBrokerInstance> findDifferences(final OaBrokerMainEntity source,
|
||||
final OaBrokerMainEntity target) {
|
||||
final Set<String> urls = target
|
||||
.getInstances()
|
||||
.stream()
|
||||
|
|
|
@ -5,12 +5,12 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.TypedValue;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
|
||||
public class EnrichMorePid extends UpdateMatcher<TypedValue> {
|
||||
public class EnrichMorePid extends UpdateMatcher<OaBrokerTypedValue> {
|
||||
|
||||
public EnrichMorePid() {
|
||||
super(true,
|
||||
|
@ -20,8 +20,8 @@ public class EnrichMorePid extends UpdateMatcher<TypedValue> {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
|
||||
final OaBrokerMainEntity target) {
|
||||
final Set<String> existingPids = target
|
||||
.getPids()
|
||||
.stream()
|
||||
|
@ -35,7 +35,7 @@ public class EnrichMorePid extends UpdateMatcher<TypedValue> {
|
|||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static String pidAsString(final TypedValue pid) {
|
||||
private static String pidAsString(final OaBrokerTypedValue pid) {
|
||||
return pid.getType() + "::" + pid.getValue();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,12 +5,12 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.TypedValue;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
|
||||
public class EnrichMoreSubject extends UpdateMatcher<TypedValue> {
|
||||
public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
|
||||
|
||||
public EnrichMoreSubject() {
|
||||
super(true,
|
||||
|
@ -20,8 +20,8 @@ public class EnrichMoreSubject extends UpdateMatcher<TypedValue> {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
|
||||
final OaBrokerMainEntity target) {
|
||||
final Set<String> existingSubjects = target
|
||||
.getSubjects()
|
||||
.stream()
|
||||
|
@ -35,7 +35,7 @@ public class EnrichMoreSubject extends UpdateMatcher<TypedValue> {
|
|||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static String subjectAsString(final TypedValue s) {
|
||||
private static String subjectAsString(final OaBrokerTypedValue s) {
|
||||
return s.getType() + "::" + s.getValue();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
|
||||
public class ClusterUtils {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void removeDir(final SparkSession spark, final String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
public static <R> Dataset<R> readPath(
|
||||
final SparkSession spark,
|
||||
final String inputPath,
|
||||
final Class<R> clazz) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
}
|
|
@ -3,18 +3,28 @@ package eu.dnetlib.dhp.broker.oa.util;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.TypedValue;
|
||||
import com.google.common.base.Function;
|
||||
|
||||
import eu.dnetlib.broker.objects.OaBrokerAuthor;
|
||||
import eu.dnetlib.broker.objects.OaBrokerExternalReference;
|
||||
import eu.dnetlib.broker.objects.OaBrokerInstance;
|
||||
import eu.dnetlib.broker.objects.OaBrokerJournal;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerProject;
|
||||
import eu.dnetlib.broker.objects.OaBrokerRelatedDataset;
|
||||
import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
|
||||
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
|
||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.ExternalReference;
|
||||
|
@ -24,6 +34,7 @@ import eu.dnetlib.dhp.schema.oaf.Journal;
|
|||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
@ -32,145 +43,148 @@ public class ConversionUtils {
|
|||
|
||||
private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class);
|
||||
|
||||
public static List<eu.dnetlib.broker.objects.Instance> oafInstanceToBrokerInstances(final Instance i) {
|
||||
return i.getUrl().stream().map(url -> {
|
||||
return new eu.dnetlib.broker.objects.Instance()
|
||||
.setUrl(url)
|
||||
.setInstancetype(i.getInstancetype().getClassid())
|
||||
.setLicense(BrokerConstants.OPEN_ACCESS)
|
||||
.setHostedby(i.getHostedby().getValue());
|
||||
}).collect(Collectors.toList());
|
||||
public static List<OaBrokerInstance> oafInstanceToBrokerInstances(final Instance i) {
|
||||
if (i == null) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
return mappedList(i.getUrl(), url -> {
|
||||
final OaBrokerInstance res = new OaBrokerInstance();
|
||||
res.setUrl(url);
|
||||
res.setInstancetype(classId(i.getInstancetype()));
|
||||
res.setLicense(BrokerConstants.OPEN_ACCESS);
|
||||
res.setHostedby(kvValue(i.getHostedby()));
|
||||
return res;
|
||||
});
|
||||
}
|
||||
|
||||
public static TypedValue oafPidToBrokerPid(final StructuredProperty sp) {
|
||||
return sp != null ? new TypedValue()
|
||||
.setValue(sp.getValue())
|
||||
.setType(sp.getQualifier().getClassid()) : null;
|
||||
public static OaBrokerTypedValue oafPidToBrokerPid(final StructuredProperty sp) {
|
||||
return oafStructPropToBrokerTypedValue(sp);
|
||||
}
|
||||
|
||||
public static final Pair<String, String> oafSubjectToPair(final StructuredProperty sp) {
|
||||
return sp != null ? Pair.of(sp.getQualifier().getClassid(), sp.getValue()) : null;
|
||||
public static OaBrokerTypedValue oafStructPropToBrokerTypedValue(final StructuredProperty sp) {
|
||||
return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null;
|
||||
}
|
||||
|
||||
public static final eu.dnetlib.broker.objects.Dataset oafDatasetToBrokerDataset(final Dataset d) {
|
||||
return d != null ? new eu.dnetlib.broker.objects.Dataset()
|
||||
.setOriginalId(d.getOriginalId().get(0))
|
||||
.setTitle(structPropValue(d.getTitle()))
|
||||
.setPids(d.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList()))
|
||||
.setInstances(
|
||||
d
|
||||
.getInstance()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafInstanceToBrokerInstances)
|
||||
.flatMap(List::stream)
|
||||
.collect(Collectors.toList()))
|
||||
.setCollectedFrom(d.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null))
|
||||
: null;
|
||||
public static final OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) {
|
||||
if (d == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset();
|
||||
res.setOpenaireId(d.getId());
|
||||
res.setOriginalId(first(d.getOriginalId()));
|
||||
res.setTitle(structPropValue(d.getTitle()));
|
||||
res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid));
|
||||
res.setInstances(flatMappedList(d.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
|
||||
res.setCollectedFrom(mappedFirst(d.getCollectedfrom(), KeyValue::getValue));
|
||||
return res;
|
||||
}
|
||||
|
||||
public static eu.dnetlib.broker.objects.Publication oafPublicationToBrokerPublication(final Publication p) {
|
||||
return p != null ? new eu.dnetlib.broker.objects.Publication()
|
||||
.setOriginalId(p.getOriginalId().get(0))
|
||||
.setTitle(structPropValue(p.getTitle()))
|
||||
.setPids(p.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList()))
|
||||
.setInstances(
|
||||
p
|
||||
.getInstance()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafInstanceToBrokerInstances)
|
||||
.flatMap(List::stream)
|
||||
.collect(Collectors.toList()))
|
||||
.setCollectedFrom(p.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null))
|
||||
: null;
|
||||
}
|
||||
|
||||
public static final OpenaireBrokerResult oafResultToBrokerResult(final Result result) {
|
||||
|
||||
return result != null ? new OpenaireBrokerResult()
|
||||
.setOpenaireId(result.getId())
|
||||
.setOriginalId(result.getOriginalId().get(0))
|
||||
.setTypology(result.getResulttype().getClassid())
|
||||
.setTitles(structPropList(result.getTitle()))
|
||||
.setAbstracts(fieldList(result.getDescription()))
|
||||
.setLanguage(result.getLanguage().getClassid())
|
||||
.setSubjects(structPropTypedList(result.getSubject()))
|
||||
.setCreators(
|
||||
result.getAuthor().stream().map(ConversionUtils::oafAuthorToBrokerAuthor).collect(Collectors.toList()))
|
||||
.setPublicationdate(result.getDateofacceptance().getValue())
|
||||
.setPublisher(fieldValue(result.getPublisher()))
|
||||
.setEmbargoenddate(fieldValue(result.getEmbargoenddate()))
|
||||
.setContributor(fieldList(result.getContributor()))
|
||||
.setJournal(
|
||||
result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null)
|
||||
.setCollectedFromId(result.getCollectedfrom().stream().map(KeyValue::getKey).findFirst().orElse(null))
|
||||
.setCollectedFromName(result.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null))
|
||||
.setPids(result.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList()))
|
||||
.setInstances(
|
||||
result
|
||||
.getInstance()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafInstanceToBrokerInstances)
|
||||
.flatMap(List::stream)
|
||||
.collect(Collectors.toList()))
|
||||
.setExternalReferences(
|
||||
result
|
||||
.getExternalReference()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafExtRefToBrokerExtRef)
|
||||
.collect(Collectors.toList()))
|
||||
: null;
|
||||
}
|
||||
|
||||
private static List<TypedValue> structPropTypedList(final List<StructuredProperty> list) {
|
||||
return list
|
||||
.stream()
|
||||
.map(
|
||||
p -> new TypedValue()
|
||||
.setValue(p.getValue())
|
||||
.setType(p.getQualifier().getClassid()))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static eu.dnetlib.broker.objects.Author oafAuthorToBrokerAuthor(final Author author) {
|
||||
return author != null ? new eu.dnetlib.broker.objects.Author()
|
||||
.setFullname(author.getFullname())
|
||||
.setOrcid(
|
||||
author
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
|
||||
.map(pid -> pid.getValue())
|
||||
.findFirst()
|
||||
.orElse(null))
|
||||
: null;
|
||||
}
|
||||
|
||||
private static eu.dnetlib.broker.objects.Journal oafJournalToBrokerJournal(final Journal journal) {
|
||||
return journal != null ? new eu.dnetlib.broker.objects.Journal()
|
||||
.setName(journal.getName())
|
||||
.setIssn(journal.getIssnPrinted())
|
||||
.setEissn(journal.getIssnOnline())
|
||||
.setLissn(journal.getIssnLinking()) : null;
|
||||
}
|
||||
|
||||
private static eu.dnetlib.broker.objects.ExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) {
|
||||
return ref != null ? new eu.dnetlib.broker.objects.ExternalReference()
|
||||
.setRefidentifier(ref.getRefidentifier())
|
||||
.setSitename(ref.getSitename())
|
||||
.setType(ref.getQualifier().getClassid())
|
||||
.setUrl(ref.getUrl())
|
||||
: null;
|
||||
}
|
||||
|
||||
public static final eu.dnetlib.broker.objects.Project oafProjectToBrokerProject(final Project p) {
|
||||
public static OaBrokerRelatedPublication oafPublicationToBrokerPublication(final Publication p) {
|
||||
if (p == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final eu.dnetlib.broker.objects.Project res = new eu.dnetlib.broker.objects.Project()
|
||||
.setTitle(fieldValue(p.getTitle()))
|
||||
.setAcronym(fieldValue(p.getAcronym()))
|
||||
.setCode(fieldValue(p.getCode()));
|
||||
final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication();
|
||||
res.setOpenaireId(p.getId());
|
||||
res.setOriginalId(first(p.getOriginalId()));
|
||||
res.setTitle(structPropValue(p.getTitle()));
|
||||
res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid));
|
||||
res.setInstances(flatMappedList(p.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
|
||||
res.setCollectedFrom(mappedFirst(p.getCollectedfrom(), KeyValue::getValue));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
public static final OaBrokerMainEntity oafResultToBrokerResult(final Result result) {
|
||||
if (result == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
|
||||
res.setOpenaireId(result.getId());
|
||||
res.setOriginalId(first(result.getOriginalId()));
|
||||
res.setTypology(classId(result.getResulttype()));
|
||||
res.setTitles(structPropList(result.getTitle()));
|
||||
res.setAbstracts(fieldList(result.getDescription()));
|
||||
res.setLanguage(classId(result.getLanguage()));
|
||||
res.setSubjects(structPropTypedList(result.getSubject()));
|
||||
res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor));
|
||||
res.setPublicationdate(fieldValue(result.getDateofacceptance()));
|
||||
res.setPublisher(fieldValue(result.getPublisher()));
|
||||
res.setEmbargoenddate(fieldValue(result.getEmbargoenddate()));
|
||||
res.setContributor(fieldList(result.getContributor()));
|
||||
res
|
||||
.setJournal(
|
||||
result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null);
|
||||
res.setCollectedFromId(mappedFirst(result.getCollectedfrom(), KeyValue::getKey));
|
||||
res.setCollectedFromName(mappedFirst(result.getCollectedfrom(), KeyValue::getValue));
|
||||
res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid));
|
||||
res.setInstances(flatMappedList(result.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
|
||||
res.setExternalReferences(mappedList(result.getExternalReference(), ConversionUtils::oafExtRefToBrokerExtRef));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) {
|
||||
if (author == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final String pids = author.getPid() != null ? author
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(pid -> pid != null)
|
||||
.filter(pid -> pid.getQualifier() != null)
|
||||
.filter(pid -> pid.getQualifier().getClassid() != null)
|
||||
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
|
||||
.map(pid -> pid.getValue())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.findFirst()
|
||||
.orElse(null) : null;
|
||||
|
||||
return new OaBrokerAuthor(author.getFullname(), pids);
|
||||
}
|
||||
|
||||
private static OaBrokerJournal oafJournalToBrokerJournal(final Journal journal) {
|
||||
if (journal == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final OaBrokerJournal res = new OaBrokerJournal();
|
||||
res.setName(journal.getName());
|
||||
res.setIssn(journal.getIssnPrinted());
|
||||
res.setEissn(journal.getIssnOnline());
|
||||
res.setLissn(journal.getIssnLinking());
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
private static OaBrokerExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) {
|
||||
if (ref == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final OaBrokerExternalReference res = new OaBrokerExternalReference();
|
||||
res.setRefidentifier(ref.getRefidentifier());
|
||||
res.setSitename(ref.getSitename());
|
||||
res.setType(classId(ref.getQualifier()));
|
||||
res.setUrl(ref.getUrl());
|
||||
return res;
|
||||
}
|
||||
|
||||
public static final OaBrokerProject oafProjectToBrokerProject(final Project p) {
|
||||
if (p == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final OaBrokerProject res = new OaBrokerProject();
|
||||
res.setOpenaireId(p.getId());
|
||||
res.setTitle(fieldValue(p.getTitle()));
|
||||
res.setAcronym(fieldValue(p.getAcronym()));
|
||||
res.setCode(fieldValue(p.getCode()));
|
||||
|
||||
final String ftree = fieldValue(p.getFundingtree());
|
||||
if (StringUtils.isNotBlank(ftree)) {
|
||||
|
@ -187,13 +201,27 @@ public class ConversionUtils {
|
|||
return res;
|
||||
}
|
||||
|
||||
public static final eu.dnetlib.broker.objects.Software oafSoftwareToBrokerSoftware(final Software sw) {
|
||||
return sw != null ? new eu.dnetlib.broker.objects.Software()
|
||||
.setName(structPropValue(sw.getTitle()))
|
||||
.setDescription(fieldValue(sw.getDescription()))
|
||||
.setRepository(fieldValue(sw.getCodeRepositoryUrl()))
|
||||
.setLandingPage(fieldValue(sw.getDocumentationUrl()))
|
||||
: null;
|
||||
public static final OaBrokerRelatedSoftware oafSoftwareToBrokerSoftware(final Software sw) {
|
||||
if (sw == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware();
|
||||
res.setOpenaireId(sw.getId());
|
||||
res.setName(structPropValue(sw.getTitle()));
|
||||
res.setDescription(fieldValue(sw.getDescription()));
|
||||
res.setRepository(fieldValue(sw.getCodeRepositoryUrl()));
|
||||
res.setLandingPage(fieldValue(sw.getDocumentationUrl()));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
private static String first(final List<String> list) {
|
||||
return list != null && list.size() > 0 ? list.get(0) : null;
|
||||
}
|
||||
|
||||
private static String kvValue(final KeyValue kv) {
|
||||
return kv != null ? kv.getValue() : null;
|
||||
}
|
||||
|
||||
private static String fieldValue(final Field<String> f) {
|
||||
|
@ -205,6 +233,10 @@ public class ConversionUtils {
|
|||
: null;
|
||||
}
|
||||
|
||||
private static String classId(final Qualifier q) {
|
||||
return q != null ? q.getClassid() : null;
|
||||
}
|
||||
|
||||
private static String structPropValue(final List<StructuredProperty> props) {
|
||||
return props != null
|
||||
? props.stream().map(StructuredProperty::getValue).filter(StringUtils::isNotBlank).findFirst().orElse(null)
|
||||
|
@ -226,4 +258,55 @@ public class ConversionUtils {
|
|||
.collect(Collectors.toList())
|
||||
: new ArrayList<>();
|
||||
}
|
||||
|
||||
private static List<OaBrokerTypedValue> structPropTypedList(final List<StructuredProperty> list) {
|
||||
if (list == null) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
return list
|
||||
.stream()
|
||||
.map(ConversionUtils::oafStructPropToBrokerTypedValue)
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func) {
|
||||
if (list == null) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
return list
|
||||
.stream()
|
||||
.map(func::apply)
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static <F, T> List<T> flatMappedList(final List<F> list, final Function<F, List<T>> func) {
|
||||
if (list == null) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
return list
|
||||
.stream()
|
||||
.map(func::apply)
|
||||
.flatMap(List::stream)
|
||||
.filter(Objects::nonNull)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static <F, T> T mappedFirst(final List<F> list, final Function<F, T> func) {
|
||||
if (list == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return list
|
||||
.stream()
|
||||
.map(func::apply)
|
||||
.filter(Objects::nonNull)
|
||||
.findFirst()
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -4,7 +4,7 @@ package eu.dnetlib.dhp.broker.oa.util;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.dhp.broker.model.EventFactory;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy;
|
||||
|
@ -68,7 +68,7 @@ public class EventFinder {
|
|||
public static EventGroup generateEvents(final ResultGroup results, final DedupConfig dedupConfig) {
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
|
||||
for (final OpenaireBrokerResult target : results.getData()) {
|
||||
for (final OaBrokerMainEntity target : results.getData()) {
|
||||
for (final UpdateMatcher<?> matcher : matchers) {
|
||||
list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), dedupConfig));
|
||||
}
|
||||
|
|
|
@ -9,10 +9,10 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.broker.objects.Instance;
|
||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.Provenance;
|
||||
import eu.dnetlib.broker.objects.OaBrokerEventPayload;
|
||||
import eu.dnetlib.broker.objects.OaBrokerInstance;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerProvenance;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
|
@ -25,11 +25,11 @@ public final class UpdateInfo<T> {
|
|||
|
||||
private final T highlightValue;
|
||||
|
||||
private final OpenaireBrokerResult source;
|
||||
private final OaBrokerMainEntity source;
|
||||
|
||||
private final OpenaireBrokerResult target;
|
||||
private final OaBrokerMainEntity target;
|
||||
|
||||
private final BiConsumer<OpenaireBrokerResult, T> compileHighlight;
|
||||
private final BiConsumer<OaBrokerMainEntity, T> compileHighlight;
|
||||
|
||||
private final Function<T, String> highlightToString;
|
||||
|
||||
|
@ -37,9 +37,9 @@ public final class UpdateInfo<T> {
|
|||
|
||||
private static final Logger log = LoggerFactory.getLogger(UpdateInfo.class);
|
||||
|
||||
public UpdateInfo(final Topic topic, final T highlightValue, final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target,
|
||||
final BiConsumer<OpenaireBrokerResult, T> compileHighlight,
|
||||
public UpdateInfo(final Topic topic, final T highlightValue, final OaBrokerMainEntity source,
|
||||
final OaBrokerMainEntity target,
|
||||
final BiConsumer<OaBrokerMainEntity, T> compileHighlight,
|
||||
final Function<T, String> highlightToString,
|
||||
final DedupConfig dedupConfig) {
|
||||
this.topic = topic;
|
||||
|
@ -55,16 +55,22 @@ public final class UpdateInfo<T> {
|
|||
return highlightValue;
|
||||
}
|
||||
|
||||
public OpenaireBrokerResult getSource() {
|
||||
public OaBrokerMainEntity getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public OpenaireBrokerResult getTarget() {
|
||||
public OaBrokerMainEntity getTarget() {
|
||||
return target;
|
||||
}
|
||||
|
||||
private float calculateTrust(final DedupConfig dedupConfig, final OpenaireBrokerResult r1,
|
||||
final OpenaireBrokerResult r2) {
|
||||
private float calculateTrust(final DedupConfig dedupConfig,
|
||||
final OaBrokerMainEntity r1,
|
||||
final OaBrokerMainEntity r2) {
|
||||
|
||||
if (dedupConfig == null) {
|
||||
return BrokerConstants.MIN_TRUST;
|
||||
}
|
||||
|
||||
try {
|
||||
final ObjectMapper objectMapper = new ObjectMapper();
|
||||
final MapDocument doc1 = MapDocumentUtil
|
||||
|
@ -98,11 +104,11 @@ public final class UpdateInfo<T> {
|
|||
return highlightToString.apply(getHighlightValue());
|
||||
}
|
||||
|
||||
public OpenAireEventPayload asBrokerPayload() {
|
||||
public OaBrokerEventPayload asBrokerPayload() {
|
||||
|
||||
compileHighlight.accept(target, getHighlightValue());
|
||||
|
||||
final OpenaireBrokerResult hl = new OpenaireBrokerResult();
|
||||
final OaBrokerMainEntity hl = new OaBrokerMainEntity();
|
||||
compileHighlight.accept(hl, getHighlightValue());
|
||||
|
||||
final String provId = getSource().getOriginalId();
|
||||
|
@ -111,18 +117,20 @@ public final class UpdateInfo<T> {
|
|||
final String provUrl = getSource()
|
||||
.getInstances()
|
||||
.stream()
|
||||
.map(Instance::getUrl)
|
||||
.map(OaBrokerInstance::getUrl)
|
||||
.findFirst()
|
||||
.orElse(null);
|
||||
;
|
||||
|
||||
final Provenance provenance = new Provenance().setId(provId).setRepositoryName(provRepo).setUrl(provUrl);
|
||||
final OaBrokerProvenance provenance = new OaBrokerProvenance(provId, provRepo, provUrl);
|
||||
|
||||
return new OpenAireEventPayload()
|
||||
.setPublication(target)
|
||||
.setHighlight(hl)
|
||||
.setTrust(trust)
|
||||
.setProvenance(provenance);
|
||||
final OaBrokerEventPayload res = new OaBrokerEventPayload();
|
||||
res.setResult(target);
|
||||
res.setHighlight(hl);
|
||||
res.setTrust(trust);
|
||||
res.setProvenance(provenance);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,11 +5,11 @@ import org.apache.spark.sql.Encoder;
|
|||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.expressions.Aggregator;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class ResultAggregator extends Aggregator<Tuple2<OpenaireBrokerResult, Relation>, ResultGroup, ResultGroup> {
|
||||
public class ResultAggregator extends Aggregator<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup, ResultGroup> {
|
||||
|
||||
/**
|
||||
*
|
||||
|
@ -22,13 +22,15 @@ public class ResultAggregator extends Aggregator<Tuple2<OpenaireBrokerResult, Re
|
|||
}
|
||||
|
||||
@Override
|
||||
public ResultGroup reduce(final ResultGroup group, final Tuple2<OpenaireBrokerResult, Relation> t) {
|
||||
return group.addElement(t._1);
|
||||
public ResultGroup reduce(final ResultGroup group, final Tuple2<OaBrokerMainEntity, Relation> t) {
|
||||
group.getData().add(t._1);
|
||||
return group;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ResultGroup merge(final ResultGroup g1, final ResultGroup g2) {
|
||||
return g1.addGroup(g2);
|
||||
g1.getData().addAll(g2.getData());
|
||||
return g1;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -38,13 +40,13 @@ public class ResultAggregator extends Aggregator<Tuple2<OpenaireBrokerResult, Re
|
|||
|
||||
@Override
|
||||
public Encoder<ResultGroup> bufferEncoder() {
|
||||
return Encoders.kryo(ResultGroup.class);
|
||||
return Encoders.bean(ResultGroup.class);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<ResultGroup> outputEncoder() {
|
||||
return Encoders.kryo(ResultGroup.class);
|
||||
return Encoders.bean(ResultGroup.class);
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ import java.io.Serializable;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
|
||||
public class ResultGroup implements Serializable {
|
||||
|
||||
|
@ -14,23 +14,14 @@ public class ResultGroup implements Serializable {
|
|||
*/
|
||||
private static final long serialVersionUID = -3360828477088669296L;
|
||||
|
||||
private final List<OpenaireBrokerResult> data = new ArrayList<>();
|
||||
private List<OaBrokerMainEntity> data = new ArrayList<>();
|
||||
|
||||
public List<OpenaireBrokerResult> getData() {
|
||||
public List<OaBrokerMainEntity> getData() {
|
||||
return data;
|
||||
}
|
||||
|
||||
public ResultGroup addElement(final OpenaireBrokerResult elem) {
|
||||
data.add(elem);
|
||||
return this;
|
||||
public void setData(final List<OaBrokerMainEntity> data) {
|
||||
this.data = data;
|
||||
}
|
||||
|
||||
public ResultGroup addGroup(final ResultGroup group) {
|
||||
data.addAll(group.getData());
|
||||
return this;
|
||||
}
|
||||
|
||||
public boolean isValid() {
|
||||
return data.size() > 1;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,11 +5,11 @@ import org.apache.spark.sql.Encoder;
|
|||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.expressions.Aggregator;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class OpenaireBrokerResultAggregator<T>
|
||||
extends Aggregator<Tuple2<OpenaireBrokerResult, T>, OpenaireBrokerResult, OpenaireBrokerResult> {
|
||||
public class OaBrokerMainEntityAggregator<T>
|
||||
extends Aggregator<Tuple2<OaBrokerMainEntity, T>, OaBrokerMainEntity, OaBrokerMainEntity> {
|
||||
|
||||
/**
|
||||
*
|
||||
|
@ -17,17 +17,17 @@ public class OpenaireBrokerResultAggregator<T>
|
|||
private static final long serialVersionUID = -3687878788861013488L;
|
||||
|
||||
@Override
|
||||
public OpenaireBrokerResult zero() {
|
||||
return new OpenaireBrokerResult();
|
||||
public OaBrokerMainEntity zero() {
|
||||
return new OaBrokerMainEntity();
|
||||
}
|
||||
|
||||
@Override
|
||||
public OpenaireBrokerResult finish(final OpenaireBrokerResult g) {
|
||||
public OaBrokerMainEntity finish(final OaBrokerMainEntity g) {
|
||||
return g;
|
||||
}
|
||||
|
||||
@Override
|
||||
public OpenaireBrokerResult reduce(final OpenaireBrokerResult g, final Tuple2<OpenaireBrokerResult, T> t) {
|
||||
public OaBrokerMainEntity reduce(final OaBrokerMainEntity g, final Tuple2<OaBrokerMainEntity, T> t) {
|
||||
if (g.getOriginalId() == null) {
|
||||
return t._1;
|
||||
} else if (t._2 instanceof RelatedSoftware) {
|
||||
|
@ -38,13 +38,15 @@ public class OpenaireBrokerResultAggregator<T>
|
|||
g.getPublications().add(((RelatedPublication) t._2).getRelPublication());
|
||||
} else if (t._2 instanceof RelatedProject) {
|
||||
g.getProjects().add(((RelatedProject) t._2).getRelProject());
|
||||
} else {
|
||||
throw new RuntimeException("Invalid Object: " + t._2.getClass());
|
||||
}
|
||||
return g;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public OpenaireBrokerResult merge(final OpenaireBrokerResult g1, final OpenaireBrokerResult g2) {
|
||||
public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) {
|
||||
if (g1.getOriginalId() != null) {
|
||||
g1.getSoftwares().addAll(g2.getSoftwares());
|
||||
g1.getDatasets().addAll(g2.getDatasets());
|
||||
|
@ -57,13 +59,13 @@ public class OpenaireBrokerResultAggregator<T>
|
|||
}
|
||||
|
||||
@Override
|
||||
public Encoder<OpenaireBrokerResult> bufferEncoder() {
|
||||
return Encoders.kryo(OpenaireBrokerResult.class);
|
||||
public Encoder<OaBrokerMainEntity> bufferEncoder() {
|
||||
return Encoders.bean(OaBrokerMainEntity.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<OpenaireBrokerResult> outputEncoder() {
|
||||
return Encoders.kryo(OpenaireBrokerResult.class);
|
||||
public Encoder<OaBrokerMainEntity> outputEncoder() {
|
||||
return Encoders.bean(OaBrokerMainEntity.class);
|
||||
}
|
||||
|
||||
}
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
|||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.broker.objects.Dataset;
|
||||
import eu.dnetlib.broker.objects.OaBrokerRelatedDataset;
|
||||
|
||||
public class RelatedDataset implements Serializable {
|
||||
|
||||
|
@ -11,11 +11,14 @@ public class RelatedDataset implements Serializable {
|
|||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 774487705184038324L;
|
||||
private final String source;
|
||||
private final String relType;
|
||||
private final Dataset relDataset;
|
||||
private String source;
|
||||
private String relType;
|
||||
private OaBrokerRelatedDataset relDataset;
|
||||
|
||||
public RelatedDataset(final String source, final String relType, final Dataset relDataset) {
|
||||
public RelatedDataset() {
|
||||
}
|
||||
|
||||
public RelatedDataset(final String source, final String relType, final OaBrokerRelatedDataset relDataset) {
|
||||
this.source = source;
|
||||
this.relType = relType;
|
||||
this.relDataset = relDataset;
|
||||
|
@ -25,12 +28,24 @@ public class RelatedDataset implements Serializable {
|
|||
return source;
|
||||
}
|
||||
|
||||
public void setSource(final String source) {
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
public Dataset getRelDataset() {
|
||||
public void setRelType(final String relType) {
|
||||
this.relType = relType;
|
||||
}
|
||||
|
||||
public OaBrokerRelatedDataset getRelDataset() {
|
||||
return relDataset;
|
||||
}
|
||||
|
||||
public void setRelDataset(final OaBrokerRelatedDataset relDataset) {
|
||||
this.relDataset = relDataset;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,34 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
|
||||
public class RelatedEntityFactory {
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public static <RT, T> RT newRelatedEntity(final String sourceId,
|
||||
final String relType,
|
||||
final T target,
|
||||
final Class<RT> clazz) {
|
||||
|
||||
if (clazz == RelatedProject.class) {
|
||||
return (RT) new RelatedProject(sourceId, relType,
|
||||
ConversionUtils.oafProjectToBrokerProject((Project) target));
|
||||
} else if (clazz == RelatedSoftware.class) {
|
||||
return (RT) new RelatedSoftware(sourceId, relType,
|
||||
ConversionUtils.oafSoftwareToBrokerSoftware((Software) target));
|
||||
} else if (clazz == RelatedDataset.class) {
|
||||
return (RT) new RelatedDataset(sourceId, relType,
|
||||
ConversionUtils.oafDatasetToBrokerDataset((Dataset) target));
|
||||
} else if (clazz == RelatedPublication.class) {
|
||||
return (RT) new RelatedPublication(sourceId, relType,
|
||||
ConversionUtils.oafPublicationToBrokerPublication((Publication) target));
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
|||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.broker.objects.Project;
|
||||
import eu.dnetlib.broker.objects.OaBrokerProject;
|
||||
|
||||
public class RelatedProject implements Serializable {
|
||||
|
||||
|
@ -12,11 +12,14 @@ public class RelatedProject implements Serializable {
|
|||
*/
|
||||
private static final long serialVersionUID = 4941437626549329870L;
|
||||
|
||||
private final String source;
|
||||
private final String relType;
|
||||
private final Project relProject;
|
||||
private String source;
|
||||
private String relType;
|
||||
private OaBrokerProject relProject;
|
||||
|
||||
public RelatedProject(final String source, final String relType, final Project relProject) {
|
||||
public RelatedProject() {
|
||||
}
|
||||
|
||||
public RelatedProject(final String source, final String relType, final OaBrokerProject relProject) {
|
||||
this.source = source;
|
||||
this.relType = relType;
|
||||
this.relProject = relProject;
|
||||
|
@ -26,12 +29,24 @@ public class RelatedProject implements Serializable {
|
|||
return source;
|
||||
}
|
||||
|
||||
public void setSource(final String source) {
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
public Project getRelProject() {
|
||||
public void setRelType(final String relType) {
|
||||
this.relType = relType;
|
||||
}
|
||||
|
||||
public OaBrokerProject getRelProject() {
|
||||
return relProject;
|
||||
}
|
||||
|
||||
public void setRelProject(final OaBrokerProject relProject) {
|
||||
this.relProject = relProject;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
|||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.broker.objects.Publication;
|
||||
import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
|
||||
|
||||
public class RelatedPublication implements Serializable {
|
||||
|
||||
|
@ -12,11 +12,15 @@ public class RelatedPublication implements Serializable {
|
|||
*/
|
||||
private static final long serialVersionUID = 9021609640411395128L;
|
||||
|
||||
private final String source;
|
||||
private final String relType;
|
||||
private final Publication relPublication;
|
||||
private String source;
|
||||
private String relType;
|
||||
private OaBrokerRelatedPublication relPublication;
|
||||
|
||||
public RelatedPublication(final String source, final String relType, final Publication relPublication) {
|
||||
public RelatedPublication() {
|
||||
}
|
||||
|
||||
public RelatedPublication(final String source, final String relType,
|
||||
final OaBrokerRelatedPublication relPublication) {
|
||||
this.source = source;
|
||||
this.relType = relType;
|
||||
this.relPublication = relPublication;
|
||||
|
@ -26,12 +30,24 @@ public class RelatedPublication implements Serializable {
|
|||
return source;
|
||||
}
|
||||
|
||||
public void setSource(final String source) {
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
public Publication getRelPublication() {
|
||||
public void setRelType(final String relType) {
|
||||
this.relType = relType;
|
||||
}
|
||||
|
||||
public OaBrokerRelatedPublication getRelPublication() {
|
||||
return relPublication;
|
||||
}
|
||||
|
||||
public void setRelPublication(final OaBrokerRelatedPublication relPublication) {
|
||||
this.relPublication = relPublication;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
|||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.broker.objects.Software;
|
||||
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
|
||||
|
||||
public class RelatedSoftware implements Serializable {
|
||||
|
||||
|
@ -11,11 +11,14 @@ public class RelatedSoftware implements Serializable {
|
|||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 7573383356943300157L;
|
||||
private final String source;
|
||||
private final String relType;
|
||||
private final Software relSoftware;
|
||||
private String source;
|
||||
private String relType;
|
||||
private OaBrokerRelatedSoftware relSoftware;
|
||||
|
||||
public RelatedSoftware(final String source, final String relType, final Software relSoftware) {
|
||||
public RelatedSoftware() {
|
||||
}
|
||||
|
||||
public RelatedSoftware(final String source, final String relType, final OaBrokerRelatedSoftware relSoftware) {
|
||||
this.source = source;
|
||||
this.relType = relType;
|
||||
this.relSoftware = relSoftware;
|
||||
|
@ -25,12 +28,24 @@ public class RelatedSoftware implements Serializable {
|
|||
return source;
|
||||
}
|
||||
|
||||
public void setSource(final String source) {
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
public Software getRelSoftware() {
|
||||
public void setRelType(final String relType) {
|
||||
this.relType = relType;
|
||||
}
|
||||
|
||||
public OaBrokerRelatedSoftware getRelSoftware() {
|
||||
return relSoftware;
|
||||
}
|
||||
|
||||
public void setRelSoftware(final OaBrokerRelatedSoftware relSoftware) {
|
||||
this.relSoftware = relSoftware;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -78,21 +78,33 @@
|
|||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="generate_events">
|
||||
<java>
|
||||
<prepare>
|
||||
<delete path="${eventsOutputPath}"/>
|
||||
</prepare>
|
||||
<main-class>eu.dnetlib.dhp.broker.oa.GenerateEventsApplication</main-class>
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateEvents</name>
|
||||
<class>eu.dnetlib.dhp.broker.oa.GenerateEventsApplication</class>
|
||||
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--eventsPath</arg><arg>${eventsOutputPath}</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--dedupConfProfile</arg><arg>${dedupConfProfId}</arg>
|
||||
</java>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
[
|
||||
{
|
||||
"paramName": "g",
|
||||
"paramLongName": "graphPath",
|
||||
"paramDescription": "the path where the graph is stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "relsPath",
|
||||
"paramDescription": "the path where the generated relations will be stored",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,14 @@
|
|||
[
|
||||
{
|
||||
"paramName": "g",
|
||||
"paramLongName": "graphPath",
|
||||
"paramDescription": "the path where the graph is stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "simpleEntitiesPath",
|
||||
"paramDescription": "the path where the generated simple entities (without relations) will be stored",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -3,9 +3,13 @@ package eu.dnetlib.dhp.oa.graph.clean;
|
|||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
|
@ -19,7 +23,9 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
@ -84,12 +90,86 @@ public class CleanGraphSparkJob {
|
|||
|
||||
readTableFromPath(spark, inputPath, clazz)
|
||||
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
|
||||
.map((MapFunction<T, T>) value -> fixDefaults(value), Encoders.bean(clazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
private static <T extends Oaf> T fixDefaults(T value) {
|
||||
if (value instanceof Datasource) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Project) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Organization) {
|
||||
Organization o = (Organization) value;
|
||||
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
|
||||
o.setCountry(qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_COUNTRY_TYPE));
|
||||
}
|
||||
} else if (value instanceof Relation) {
|
||||
// nothing to clean here
|
||||
} else if (value instanceof Result) {
|
||||
|
||||
Result r = (Result) value;
|
||||
if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
|
||||
r
|
||||
.setLanguage(
|
||||
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
|
||||
}
|
||||
if (Objects.nonNull(r.getSubject())) {
|
||||
r
|
||||
.setSubject(
|
||||
r
|
||||
.getSubject()
|
||||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
||||
r
|
||||
.setResourcetype(
|
||||
qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
|
||||
}
|
||||
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
|
||||
r
|
||||
.setBestaccessright(
|
||||
qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
|
||||
}
|
||||
if (Objects.nonNull(r.getInstance())) {
|
||||
for (Instance i : r.getInstance()) {
|
||||
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
|
||||
i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
|
||||
}
|
||||
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
|
||||
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (value instanceof Publication) {
|
||||
|
||||
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
|
||||
|
||||
} else if (value instanceof OtherResearchProduct) {
|
||||
|
||||
} else if (value instanceof Software) {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
private static Qualifier qualifier(String classid, String classname, String scheme) {
|
||||
return OafMapperUtils
|
||||
.qualifier(
|
||||
classid, classname, scheme, scheme);
|
||||
}
|
||||
|
||||
private static <T extends Oaf> Dataset<T> readTableFromPath(
|
||||
SparkSession spark, String inputEntityPath, Class<T> clazz) {
|
||||
|
||||
|
|
|
@ -4,10 +4,13 @@ package eu.dnetlib.dhp.oa.graph.clean;
|
|||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Country;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class CleaningRuleMap extends HashMap<Class, SerializableConsumer<Object>> implements Serializable {
|
||||
|
||||
|
@ -18,23 +21,24 @@ public class CleaningRuleMap extends HashMap<Class, SerializableConsumer<Object>
|
|||
*/
|
||||
public static CleaningRuleMap create(VocabularyGroup vocabularies) {
|
||||
CleaningRuleMap mapping = new CleaningRuleMap();
|
||||
mapping.put(Qualifier.class, o -> {
|
||||
Qualifier q = (Qualifier) o;
|
||||
if (vocabularies.vocabularyExists(q.getSchemeid())) {
|
||||
Qualifier newValue = vocabularies.lookup(q.getSchemeid(), q.getClassid());
|
||||
q.setClassid(newValue.getClassid());
|
||||
q.setClassname(newValue.getClassname());
|
||||
mapping.put(Qualifier.class, o -> cleanQualifier(vocabularies, (Qualifier) o));
|
||||
mapping.put(Country.class, o -> {
|
||||
final Country c = (Country) o;
|
||||
if (StringUtils.isBlank(c.getSchemeid())) {
|
||||
c.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE);
|
||||
c.setSchemename(ModelConstants.DNET_COUNTRY_TYPE);
|
||||
}
|
||||
});
|
||||
mapping.put(StructuredProperty.class, o -> {
|
||||
StructuredProperty sp = (StructuredProperty) o;
|
||||
// TODO implement a policy
|
||||
/*
|
||||
* if (StringUtils.isBlank(sp.getValue())) { sp.setValue(null); sp.setQualifier(null); sp.setDataInfo(null);
|
||||
* }
|
||||
*/
|
||||
cleanQualifier(vocabularies, c);
|
||||
});
|
||||
return mapping;
|
||||
}
|
||||
|
||||
private static <Q extends Qualifier> void cleanQualifier(VocabularyGroup vocabularies, Q q) {
|
||||
if (vocabularies.vocabularyExists(q.getSchemeid())) {
|
||||
Qualifier newValue = vocabularies.lookup(q.getSchemeid(), q.getClassid());
|
||||
q.setClassid(newValue.getClassid());
|
||||
q.setClassname(newValue.getClassname());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -122,7 +122,11 @@ public class VocabularyGroup implements Serializable {
|
|||
}
|
||||
|
||||
public boolean vocabularyExists(final String vocId) {
|
||||
return vocs.containsKey(vocId.toLowerCase());
|
||||
return Optional
|
||||
.ofNullable(vocId)
|
||||
.map(String::toLowerCase)
|
||||
.map(id -> vocs.containsKey(id))
|
||||
.orElse(false);
|
||||
}
|
||||
|
||||
private void addSynonyms(final String vocId, final String termId, final String syn) {
|
||||
|
|
|
@ -21,6 +21,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
|
@ -56,6 +57,9 @@ public class CleaningFunctionTest {
|
|||
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
|
||||
Publication p_in = MAPPER.readValue(json, Publication.class);
|
||||
|
||||
assertTrue(p_in instanceof Result);
|
||||
assertTrue(p_in instanceof Publication);
|
||||
|
||||
Publication p_out = OafCleaner.apply(p_in, mapping);
|
||||
|
||||
assertNotNull(p_out);
|
||||
|
@ -63,6 +67,9 @@ public class CleaningFunctionTest {
|
|||
assertEquals("und", p_out.getLanguage().getClassid());
|
||||
assertEquals("Undetermined", p_out.getLanguage().getClassname());
|
||||
|
||||
assertEquals("DE", p_out.getCountry().get(0).getClassid());
|
||||
assertEquals("Germany", p_out.getCountry().get(0).getClassname());
|
||||
|
||||
assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
|
||||
assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());
|
||||
|
||||
|
|
|
@ -202,6 +202,12 @@
|
|||
"contributor": [
|
||||
],
|
||||
"country": [
|
||||
{
|
||||
"classid": "DE",
|
||||
"classname": "DE",
|
||||
"schemeid": "dnet:countries",
|
||||
"schemename": "dnet:countries"
|
||||
}
|
||||
],
|
||||
"coverage": [
|
||||
],
|
||||
|
|
|
@ -4,9 +4,12 @@ import java.time.LocalDateTime
|
|||
import java.time.format.DateTimeFormatter
|
||||
|
||||
import eu.dnetlib.dhp.common.PacePerson
|
||||
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, Field, Instance, KeyValue, Publication, Qualifier, Relation, StructuredProperty}
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, StructuredProperty}
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import org.codehaus.jackson.map.ObjectMapper
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
|
@ -77,6 +80,76 @@ object DLIToOAF {
|
|||
)
|
||||
|
||||
|
||||
// Maps a relation class name to the class name used for the relation in the opposite direction.
// NOTE(review): the table is not closed under inversion - "isSupplementTo" and "IsReviewedBy"
// appear only as values, so a direct rel_inverse(...) lookup with either of them throws
// NoSuchElementException. Confirm that callers only pass the keys listed here.
val rel_inverse: Map[String, String] = Map(
|
||||
"isRelatedTo" -> "isRelatedTo",
|
||||
"IsSupplementedBy" -> "isSupplementTo",
|
||||
"cites" -> "IsCitedBy",
|
||||
"IsCitedBy" -> "cites",
|
||||
"reviews" -> "IsReviewedBy"
|
||||
)
|
||||
|
||||
|
||||
// Maps the pid type names found in the input (several spellings per type) to the pid type
// name written to the output: everything PubMed-like becomes "pmid", "pmcid" becomes "pmc",
// and both spellings of DOI become "doi". Keys are matched exactly (case-sensitive).
val PidTypeMap: Map[String, String] = Map(
|
||||
"pbmid" -> "pmid",
|
||||
"pmcid" -> "pmc",
|
||||
"pmid" -> "pmid",
|
||||
"pubmedid" -> "pmid",
|
||||
"DOI" -> "doi",
|
||||
"doi" -> "doi"
|
||||
)
|
||||
|
||||
|
||||
/**
 * Wraps a Dataset, Publication or Relation into an AtomicAction and serialises it to JSON.
 *
 * @param item the object to wrap
 * @return a pair made of the canonical name of the item's runtime class and the JSON of the
 *         AtomicAction carrying it, or null when the item is none of the three supported types
 */
def toActionSet(item: Oaf): (String, String) = {
  val jsonMapper = new ObjectMapper()

  // Builds the action for one payload; clazz is the declared model class, while the first
  // element of the pair uses the runtime class of the payload.
  def asAction[T <: Oaf](payload: T, clazz: Class[T]): (String, String) = {
    val action: AtomicAction[T] = new AtomicAction[T]
    action.setClazz(clazz)
    action.setPayload(payload)
    (payload.getClass.getCanonicalName, jsonMapper.writeValueAsString(action))
  }

  item match {
    case dataset: Dataset => asAction(dataset, classOf[Dataset])
    case publication: Publication => asAction(publication, classOf[Publication])
    case relation: Relation => asAction(relation, classOf[Relation])
    case _ => null
  }
}
|
||||
|
||||
/**
 * Pairs the generated id of a dataset with the identifier derived from its first pid whose
 * qualifier class name is "clinicaltrials.gov" (case-insensitive).
 *
 * @param dataset the dataset to inspect
 * @return (generated dataset id, "50|r3111dacbab5::" followed by the md5 of the lower-cased pid
 *         value), or null when the dataset has no such pid
 */
def convertClinicalTrial(dataset: DLIDataset): (String, String) = {
  val sourceId = generateId(dataset.getId)
  val trialTargets = dataset.getPid.asScala
    .filter(pid => "clinicaltrials.gov".equalsIgnoreCase(pid.getQualifier.getClassname))
    .map(pid => s"50|r3111dacbab5::${DHPUtils.md5(pid.getValue.toLowerCase())}")

  trialTargets.headOption match {
    case Some(target) => (sourceId, target)
    case None => null
  }
}
|
||||
|
||||
|
||||
/**
 * Replaces the external references of a publication with ones built from the given list.
 *
 * @param publication        the publication to update in place
 * @param externalReferences the references to convert and attach
 * @return the same publication instance that was passed in
 */
def insertExternalRefs(publication: Publication, externalReferences: List[DLIExternalReference]): Publication = {

  // Converts one DLIExternalReference into an ExternalReference.
  def toOafReference(ref: DLIExternalReference): ExternalReference = {
    val oafRef = new ExternalReference()
    oafRef.setSitename(ref.sitename)
    oafRef.setLabel(ref.label)
    oafRef.setUrl(ref.url)
    oafRef.setRefidentifier(ref.pid)
    oafRef.setDataInfo(generateDataInfo())
    oafRef.setQualifier(createQualifier(ref.classId, "dnet:externalReference_typologies"))
    oafRef
  }

  publication.setExternalReference(externalReferences.map(toOafReference).asJava)
  publication
}
|
||||
|
||||
def filterPid(p: StructuredProperty): Boolean = {
|
||||
if (expectecdPidType.contains(p.getQualifier.getClassname) && p.getQualifier.getClassname.equalsIgnoreCase("url"))
|
||||
if (filteredURL.exists(u => p.getValue.contains(u)))
|
||||
|
@ -97,7 +170,6 @@ object DLIToOAF {
|
|||
}
|
||||
|
||||
def convertDLIDatasetToExternalReference(dataset: DLIDataset): DLIExternalReference = {
|
||||
val currentId = generateId(dataset.getId)
|
||||
val pids = dataset.getPid.asScala.filter(filterPid)
|
||||
|
||||
if (pids == null || pids.isEmpty)
|
||||
|
@ -109,7 +181,7 @@ object DLIToOAF {
|
|||
pid.getQualifier.getClassname match {
|
||||
case "uniprot" => DLIExternalReference(generateId(dataset.getId), s"https://www.uniprot.org/uniprot/${pid.getValue}", "UniProt", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber")
|
||||
case "ena" =>
|
||||
if(pid.getValue!= null && pid.getValue.nonEmpty && pid.getValue.length>7)
|
||||
if (pid.getValue != null && pid.getValue.nonEmpty && pid.getValue.length > 7)
|
||||
DLIExternalReference(generateId(dataset.getId), s"https://www.ebi.ac.uk/ena/data/view/${pid.getValue.substring(0, 8)}", "European Nucleotide Archive", extractTitle(dataset.getTitle), pid.getValue, "accessionNumber")
|
||||
else
|
||||
null
|
||||
|
@ -126,43 +198,50 @@ object DLIToOAF {
|
|||
}
|
||||
|
||||
|
||||
def convertDLIPublicationToOAF(p: DLIPublication): Publication = {
|
||||
|
||||
def convertDLIPublicationToOAF(inputPublication: DLIPublication): Publication = {
|
||||
val result = new Publication
|
||||
result.setId(generateId(p.getId))
|
||||
val cleanedPids = inputPublication.getPid.asScala.filter(p => PidTypeMap.contains(p.getQualifier.getClassid))
|
||||
.map(p => {
|
||||
p.setQualifier(createQualifier(PidTypeMap(p.getQualifier.getClassid), p.getQualifier.getSchemeid))
|
||||
p
|
||||
})
|
||||
if (cleanedPids.isEmpty)
|
||||
return null
|
||||
result.setId(generateId(inputPublication.getId))
|
||||
result.setDataInfo(generateDataInfo(invisibile = true))
|
||||
if (p.getCollectedfrom == null || p.getCollectedfrom.size() == 0 || (p.getCollectedfrom.size() == 1 && p.getCollectedfrom.get(0) == null))
|
||||
if (inputPublication.getCollectedfrom == null || inputPublication.getCollectedfrom.size() == 0 || (inputPublication.getCollectedfrom.size() == 1 && inputPublication.getCollectedfrom.get(0) == null))
|
||||
return null
|
||||
|
||||
result.setCollectedfrom(p.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).asJava)
|
||||
result.setPid(p.getPid)
|
||||
result.setDateofcollection(p.getDateofcollection)
|
||||
result.setOriginalId(p.getPid.asScala.map(p => p.getValue).asJava)
|
||||
result.setCollectedfrom(inputPublication.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava)
|
||||
if(result.getCollectedfrom.isEmpty)
|
||||
return null
|
||||
result.setPid(cleanedPids.asJava)
|
||||
result.setDateofcollection(inputPublication.getDateofcollection)
|
||||
result.setOriginalId(inputPublication.getPid.asScala.map(p => p.getValue).asJava)
|
||||
result.setDateoftransformation(LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss'Z'")))
|
||||
if (p.getAuthor == null || p.getAuthor.isEmpty)
|
||||
if (inputPublication.getAuthor == null || inputPublication.getAuthor.isEmpty)
|
||||
return null
|
||||
result.setAuthor(p.getAuthor.asScala.map(convertAuthor).asJava)
|
||||
result.setResulttype(createQualifier(p.getResulttype.getClassid, p.getResulttype.getClassname, "dnet:result_typologies", "dnet:result_typologies"))
|
||||
result.setAuthor(inputPublication.getAuthor.asScala.map(convertAuthor).asJava)
|
||||
result.setResulttype(createQualifier(inputPublication.getResulttype.getClassid, inputPublication.getResulttype.getClassname, "dnet:result_typologies", "dnet:result_typologies"))
|
||||
|
||||
if (p.getSubject != null)
|
||||
result.setSubject(p.getSubject.asScala.map(convertSubject).asJava)
|
||||
if (inputPublication.getSubject != null)
|
||||
result.setSubject(inputPublication.getSubject.asScala.map(convertSubject).asJava)
|
||||
|
||||
if (p.getTitle == null || p.getTitle.isEmpty)
|
||||
if (inputPublication.getTitle == null || inputPublication.getTitle.isEmpty)
|
||||
return null
|
||||
|
||||
result.setTitle(List(patchTitle(p.getTitle.get(0))).asJava)
|
||||
result.setTitle(List(patchTitle(inputPublication.getTitle.get(0))).asJava)
|
||||
|
||||
if (p.getRelevantdate == null || p.getRelevantdate.size() == 0)
|
||||
if (inputPublication.getRelevantdate == null || inputPublication.getRelevantdate.size() == 0)
|
||||
return null
|
||||
|
||||
result.setRelevantdate(p.getRelevantdate.asScala.map(patchRelevantDate).asJava)
|
||||
result.setRelevantdate(inputPublication.getRelevantdate.asScala.map(patchRelevantDate).asJava)
|
||||
|
||||
|
||||
result.setDescription(p.getDescription)
|
||||
result.setDescription(inputPublication.getDescription)
|
||||
|
||||
result.setDateofacceptance(asField(p.getRelevantdate.get(0).getValue))
|
||||
result.setPublisher(p.getPublisher)
|
||||
result.setSource(p.getSource)
|
||||
result.setDateofacceptance(asField(inputPublication.getRelevantdate.get(0).getValue))
|
||||
result.setPublisher(inputPublication.getPublisher)
|
||||
result.setSource(inputPublication.getSource)
|
||||
result.setBestaccessright(createQualifier("UNKNOWN", "not available", "dnet:access_modes", "dnet:access_modes"))
|
||||
|
||||
val dois = result.getPid.asScala.filter(p => "doi".equalsIgnoreCase(p.getQualifier.getClassname)).map(p => p.getValue)
|
||||
|
@ -170,7 +249,7 @@ object DLIToOAF {
|
|||
return null
|
||||
|
||||
|
||||
val i: Instance = createInstance(s"https://dx.doi.org/${dois.head}", firstInstanceOrNull(p.getInstance()), result.getDateofacceptance)
|
||||
val i: Instance = createInstance(s"https://dx.doi.org/${dois.head}", firstInstanceOrNull(inputPublication.getInstance()), result.getDateofacceptance)
|
||||
|
||||
if (i != null)
|
||||
result.setInstance(List(i).asJava)
|
||||
|
@ -211,7 +290,9 @@ object DLIToOAF {
|
|||
val result: Dataset = new Dataset
|
||||
result.setId(generateId(d.getId))
|
||||
result.setDataInfo(generateDataInfo())
|
||||
result.setCollectedfrom(d.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).asJava)
|
||||
result.setCollectedfrom(d.getCollectedfrom.asScala.map(c => collectedFromMap.getOrElse(c.getKey, null)).filter(p => p != null).asJava)
|
||||
if(result.getCollectedfrom.isEmpty)
|
||||
return null
|
||||
|
||||
|
||||
result.setPid(d.getPid)
|
||||
|
@ -280,7 +361,7 @@ object DLIToOAF {
|
|||
if (dataset)
|
||||
i.setInstancetype(createQualifier("0021", "Dataset", "dnet:publication_resource", "dnet:publication_resource"))
|
||||
else
|
||||
i.setInstancetype(createQualifier("0000", "UNKNOWN", "dnet:publication_resource", "dnet:publication_resource"))
|
||||
i.setInstancetype(createQualifier("0000", "Unknown", "dnet:publication_resource", "dnet:publication_resource"))
|
||||
if (originalInstance != null && originalInstance.getHostedby != null)
|
||||
i.setHostedby(originalInstance.getHostedby)
|
||||
|
||||
|
|
|
@ -4,10 +4,16 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
|||
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIRelation}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.io.Text
|
||||
import org.apache.hadoop.io.compress.GzipCodec
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.expressions.Window
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.codehaus.jackson.map.ObjectMapper
|
||||
|
||||
import scala.collection.mutable.ArrayBuffer
|
||||
|
||||
|
||||
|
@ -36,57 +42,66 @@ object SparkExportContentForOpenAire {
|
|||
implicit val dliRelEncoder: Encoder[DLIRelation] = Encoders.bean(classOf[DLIRelation])
|
||||
import spark.implicits._
|
||||
|
||||
//
|
||||
// val relRDD:RDD[Relation] = sc.textFile(s"$workingPath/relation_j")
|
||||
// .map(s => new ObjectMapper().readValue(s, classOf[DLIRelation]))
|
||||
// .filter(p => p.getDataInfo.getDeletedbyinference == false)
|
||||
// .map(DLIToOAF.convertDLIRelation).filter(p=>p!= null)
|
||||
// spark.createDataset(relRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS")
|
||||
//
|
||||
// val datRDD:RDD[OafDataset] = sc.textFile(s"$workingPath/dataset")
|
||||
// .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset]))
|
||||
// .filter(p => p.getDataInfo.getDeletedbyinference == false)
|
||||
// .map(DLIToOAF.convertDLIDatasetTOOAF).filter(p=>p!= null)
|
||||
// spark.createDataset(datRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetDS")
|
||||
//
|
||||
//
|
||||
// val pubRDD:RDD[Publication] = sc.textFile(s"$workingPath/publication")
|
||||
// .map(s => new ObjectMapper().readValue(s, classOf[DLIPublication]))
|
||||
// .filter(p => p.getDataInfo.getDeletedbyinference == false)
|
||||
// .map(DLIToOAF.convertDLIPublicationToOAF).filter(p=>p!= null)
|
||||
// spark.createDataset(pubRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS")
|
||||
//
|
||||
//
|
||||
//
|
||||
// val pubs:Dataset[Publication] = spark.read.load(s"$workingPath/publicationDS").as[Publication]
|
||||
// val dats :Dataset[OafDataset] = spark.read.load(s"$workingPath/datasetDS").as[OafDataset]
|
||||
var relDS :Dataset[Relation] = spark.read.load(s"$workingPath/relationDS").as[Relation]
|
||||
//
|
||||
//
|
||||
// pubs.joinWith(relDS, pubs("id").equalTo(relDS("source"))).map(k => k._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_f1")
|
||||
//
|
||||
// relDS= spark.read.load(s"$workingPath/relationDS_f1").as[Relation]
|
||||
//
|
||||
// relDS.joinWith(dats, relDS("target").equalTo(dats("id"))).map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_filtered")
|
||||
//
|
||||
//
|
||||
// val r_source = relDS.select(relDS("source")).distinct()
|
||||
// val r_target = relDS.select(relDS("source")).distinct()
|
||||
//
|
||||
//
|
||||
// pubs.joinWith(r_source, pubs("id").equalTo(r_source("source")), "inner").map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS_filtered")
|
||||
//
|
||||
// dats.joinWith(r_target, dats("id").equalTo(r_target("target")), "inner").map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetDS_filtered")
|
||||
//
|
||||
// spark.createDataset(sc.textFile(s"$workingPath/dataset")
|
||||
// .map(s => new ObjectMapper().readValue(s, classOf[DLIDataset]))
|
||||
// .map(DLIToOAF.convertDLIDatasetToExternalReference)
|
||||
// .filter(p => p != null)).as[DLIExternalReference].write.mode(SaveMode.Overwrite).save(s"$workingPath/externalReference")
|
||||
//
|
||||
|
||||
val relRDD:RDD[Relation] = sc.textFile(s"$workingPath/relation_j")
|
||||
.map(s => new ObjectMapper().readValue(s, classOf[DLIRelation]))
|
||||
.filter(p => p.getDataInfo.getDeletedbyinference == false)
|
||||
.map(DLIToOAF.convertDLIRelation).filter(p=>p!= null)
|
||||
spark.createDataset(relRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS")
|
||||
|
||||
val datRDD:RDD[OafDataset] = sc.textFile(s"$workingPath/dataset")
|
||||
.map(s => new ObjectMapper().readValue(s, classOf[DLIDataset]))
|
||||
.filter(p => p.getDataInfo.getDeletedbyinference == false)
|
||||
.map(DLIToOAF.convertDLIDatasetTOOAF).filter(p=>p!= null)
|
||||
spark.createDataset(datRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetDS")
|
||||
|
||||
|
||||
val pubRDD:RDD[Publication] = sc.textFile(s"$workingPath/publication")
|
||||
.map(s => new ObjectMapper().readValue(s, classOf[DLIPublication]))
|
||||
.filter(p => p.getDataInfo.getDeletedbyinference == false)
|
||||
.map(DLIToOAF.convertDLIPublicationToOAF).filter(p=>p!= null)
|
||||
spark.createDataset(pubRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS")
|
||||
|
||||
|
||||
|
||||
val pubs:Dataset[Publication] = spark.read.load(s"$workingPath/publicationDS").as[Publication]
|
||||
val dats :Dataset[OafDataset] = spark.read.load(s"$workingPath/datasetDS").as[OafDataset]
|
||||
val relDS1 :Dataset[Relation] = spark.read.load(s"$workingPath/relationDS").as[Relation]
|
||||
|
||||
|
||||
val pub_id = pubs.select("id").distinct()
|
||||
val dat_id = dats.select("id").distinct()
|
||||
|
||||
|
||||
pub_id.joinWith(relDS1, pub_id("id").equalTo(relDS1("source"))).map(k => k._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_f1")
|
||||
|
||||
val relDS2= spark.read.load(s"$workingPath/relationDS_f1").as[Relation]
|
||||
|
||||
relDS2.joinWith(dat_id, relDS2("target").equalTo(dats("id"))).map(k => k._1).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationDS_filtered")
|
||||
|
||||
|
||||
val r_source = relDS2.select(relDS2("source")).distinct()
|
||||
val r_target = relDS2.select(relDS2("target")).distinct()
|
||||
|
||||
|
||||
val w2 = Window.partitionBy("id").orderBy("lastupdatetimestamp")
|
||||
|
||||
pubs.joinWith(r_source, pubs("id").equalTo(r_source("source")), "inner").map(k => k._1)
|
||||
.withColumn("row",row_number.over(w2)).where($"row" === 1).drop("row")
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationDS_filtered")
|
||||
|
||||
dats.joinWith(r_target, dats("id").equalTo(r_target("target")), "inner").map(k => k._1)
|
||||
.withColumn("row",row_number.over(w2)).where($"row" === 1).drop("row")
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/datasetAS")
|
||||
|
||||
spark.createDataset(sc.textFile(s"$workingPath/dataset")
|
||||
.map(s => new ObjectMapper().readValue(s, classOf[DLIDataset]))
|
||||
.map(DLIToOAF.convertDLIDatasetToExternalReference)
|
||||
.filter(p => p != null)).as[DLIExternalReference].write.mode(SaveMode.Overwrite).save(s"$workingPath/externalReference")
|
||||
|
||||
val pf = spark.read.load(s"$workingPath/publicationDS_filtered").select("id")
|
||||
relDS = spark.read.load(s"$workingPath/relationDS").as[Relation]
|
||||
val relationTo = pf.joinWith(relDS, pf("id").equalTo(relDS("source")),"inner").map(t =>t._2)
|
||||
val relDS3 = spark.read.load(s"$workingPath/relationDS").as[Relation]
|
||||
val relationTo = pf.joinWith(relDS3, pf("id").equalTo(relDS3("source")),"inner").map(t =>t._2)
|
||||
|
||||
val extRef = spark.read.load(s"$workingPath/externalReference").as[DLIExternalReference]
|
||||
|
||||
|
@ -100,19 +115,70 @@ object SparkExportContentForOpenAire {
|
|||
(f._1, dli_ext)
|
||||
})).write.mode(SaveMode.Overwrite).save(s"$workingPath/externalReference_grouped")
|
||||
|
||||
val pubf :Dataset[Publication] = spark.read.load(s"$workingPath/publicationDS_filtered").as[Publication]
|
||||
|
||||
val groupedERf:Dataset[(String, List[DLIExternalReference])]= spark.read.load(s"$workingPath/externalReference_grouped").as[(String, List[DLIExternalReference])]
|
||||
|
||||
groupedERf.joinWith(pubf,pubf("id").equalTo(groupedERf("_1"))).map(t =>
|
||||
{
|
||||
val publication = t._2
|
||||
if (t._1 != null) {
|
||||
val eRefs = t._1._2
|
||||
DLIToOAF.insertExternalRefs(publication, eRefs)
|
||||
|
||||
} else
|
||||
publication
|
||||
}
|
||||
).write.mode(SaveMode.Overwrite).save(s"$workingPath/publicationAS")
|
||||
|
||||
|
||||
spark.createDataset(sc.textFile(s"$workingPath/dataset")
|
||||
.map(s => new ObjectMapper().readValue(s, classOf[DLIDataset]))
|
||||
.map(DLIToOAF.convertClinicalTrial)
|
||||
.filter(p => p != null))
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/clinicalTrials")
|
||||
|
||||
val ct:Dataset[(String,String)] = spark.read.load(s"$workingPath/clinicalTrials").as[(String,String)]
|
||||
|
||||
val relDS= spark.read.load(s"$workingPath/relationDS_f1").as[Relation]
|
||||
|
||||
relDS.joinWith(ct, relDS("target").equalTo(ct("_1")), "inner")
|
||||
.map(k =>{
|
||||
val currentRel = k._1
|
||||
currentRel.setTarget(k._2._2)
|
||||
currentRel
|
||||
}).write.mode(SaveMode.Overwrite).save(s"$workingPath/clinicalTrialsRels")
|
||||
|
||||
|
||||
val clRels:Dataset[Relation] = spark.read.load(s"$workingPath/clinicalTrialsRels").as[Relation]
|
||||
val rels:Dataset[Relation] = spark.read.load(s"$workingPath/relationDS_filtered").as[Relation]
|
||||
|
||||
rels.union(clRels).flatMap(r => {
|
||||
val inverseRel = new Relation
|
||||
inverseRel.setSource(r.getTarget)
|
||||
inverseRel.setTarget(r.getSource)
|
||||
inverseRel.setDataInfo(r.getDataInfo)
|
||||
inverseRel.setCollectedfrom(r.getCollectedfrom)
|
||||
inverseRel.setRelType(r.getRelType)
|
||||
inverseRel.setSubRelType(r.getSubRelType)
|
||||
inverseRel.setRelClass(DLIToOAF.rel_inverse(r.getRelClass))
|
||||
List(r, inverseRel)
|
||||
}).write.mode(SaveMode.Overwrite).save(s"$workingPath/relationAS")
|
||||
|
||||
|
||||
val fRels:Dataset[(String,String)] = spark.read.load(s"$workingPath/relationAS").as[Relation].map(DLIToOAF.toActionSet)
|
||||
val fpubs:Dataset[(String,String)] = spark.read.load(s"$workingPath/publicationAS").as[Publication].map(DLIToOAF.toActionSet)
|
||||
val fdats:Dataset[(String,String)] = spark.read.load(s"$workingPath/datasetAS").as[OafDataset].map(DLIToOAF.toActionSet)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue