|
|
|
@ -31,7 +31,6 @@ import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OpenaireBrokerResultAg
|
|
|
|
|
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedEntityFactory;
|
|
|
|
|
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject;
|
|
|
|
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.Project;
|
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
|
|
@ -51,8 +50,7 @@ public class GenerateEventsApplication {
|
|
|
|
|
public static void main(final String[] args) throws Exception {
|
|
|
|
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
|
|
|
|
IOUtils
|
|
|
|
|
.toString(
|
|
|
|
|
GenerateEventsApplication.class
|
|
|
|
|
.toString(GenerateEventsApplication.class
|
|
|
|
|
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_broker_events.json")));
|
|
|
|
|
parser.parseArgument(args);
|
|
|
|
|
|
|
|
|
@ -78,18 +76,21 @@ public class GenerateEventsApplication {
|
|
|
|
|
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
|
|
|
|
conf.registerKryoClasses(BrokerConstants.getModelClasses());
|
|
|
|
|
|
|
|
|
|
final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId);
|
|
|
|
|
// TODO UNCOMMENT
|
|
|
|
|
// final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId);
|
|
|
|
|
final DedupConfig dedupConfig = null;
|
|
|
|
|
|
|
|
|
|
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
|
|
|
|
|
|
|
|
|
removeOutputDir(spark, eventsPath);
|
|
|
|
|
|
|
|
|
|
// TODO UNCOMMENT
|
|
|
|
|
spark
|
|
|
|
|
.emptyDataset(Encoders.kryo(Event.class))
|
|
|
|
|
.union(generateEvents(spark, graphPath, Publication.class, dedupConfig))
|
|
|
|
|
.union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig))
|
|
|
|
|
.union(generateEvents(spark, graphPath, Software.class, dedupConfig))
|
|
|
|
|
.union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig))
|
|
|
|
|
// .union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig))
|
|
|
|
|
// .union(generateEvents(spark, graphPath, Software.class, dedupConfig))
|
|
|
|
|
// .union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig))
|
|
|
|
|
.write()
|
|
|
|
|
.mode(SaveMode.Overwrite)
|
|
|
|
|
.option("compression", "gzip")
|
|
|
|
@ -117,15 +118,12 @@ public class GenerateEventsApplication {
|
|
|
|
|
.toColumn();
|
|
|
|
|
|
|
|
|
|
return results
|
|
|
|
|
.joinWith(mergedRels, results.col("result.id").equalTo(mergedRels.col("source")), "inner")
|
|
|
|
|
.groupByKey(
|
|
|
|
|
(MapFunction<Tuple2<OpenaireBrokerResult, Relation>, String>) t -> t._2.getTarget(), Encoders.STRING())
|
|
|
|
|
.joinWith(mergedRels, results.col("openaireId").equalTo(mergedRels.col("source")), "inner")
|
|
|
|
|
.groupByKey((MapFunction<Tuple2<OpenaireBrokerResult, Relation>, String>) t -> t._2.getTarget(), Encoders.STRING())
|
|
|
|
|
.agg(aggr)
|
|
|
|
|
.map((MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class))
|
|
|
|
|
.filter(ResultGroup::isValid)
|
|
|
|
|
.map(
|
|
|
|
|
(MapFunction<ResultGroup, EventGroup>) g -> EventFinder.generateEvents(g, dedupConfig),
|
|
|
|
|
Encoders.kryo(EventGroup.class))
|
|
|
|
|
.map((MapFunction<ResultGroup, EventGroup>) g -> EventFinder.generateEvents(g, dedupConfig), Encoders.kryo(EventGroup.class))
|
|
|
|
|
.flatMap(group -> group.getData().iterator(), Encoders.kryo(Event.class));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -133,9 +131,9 @@ public class GenerateEventsApplication {
|
|
|
|
|
final SparkSession spark,
|
|
|
|
|
final String graphPath,
|
|
|
|
|
final Class<SRC> sourceClass) {
|
|
|
|
|
|
|
|
|
|
final Dataset<Project> projects = readPath(spark, graphPath + "/project", Project.class);
|
|
|
|
|
final Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> datasets = readPath(
|
|
|
|
|
spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
|
|
|
|
|
final Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> datasets = readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
|
|
|
|
|
final Dataset<Software> softwares = readPath(spark, graphPath + "/software", Software.class);
|
|
|
|
|
final Dataset<Publication> publications = readPath(spark, graphPath + "/publication", Publication.class);
|
|
|
|
|
|
|
|
|
@ -143,17 +141,14 @@ public class GenerateEventsApplication {
|
|
|
|
|
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
|
|
|
|
|
.cache();
|
|
|
|
|
|
|
|
|
|
final Dataset<OpenaireBrokerResult> r0 = readPath(
|
|
|
|
|
spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), Result.class)
|
|
|
|
|
final Dataset<OpenaireBrokerResult> r0 = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), Result.class)
|
|
|
|
|
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
|
|
|
|
.map(ConversionUtils::oafResultToBrokerResult, Encoders.kryo(OpenaireBrokerResult.class));
|
|
|
|
|
|
|
|
|
|
final Dataset<OpenaireBrokerResult> r1 = join(r0, rels, relatedEntities(projects, rels, RelatedProject.class));
|
|
|
|
|
final Dataset<OpenaireBrokerResult> r2 = join(r1, rels, relatedEntities(softwares, rels, RelatedProject.class));
|
|
|
|
|
final Dataset<OpenaireBrokerResult> r3 = join(r2, rels, relatedEntities(datasets, rels, RelatedProject.class));
|
|
|
|
|
final Dataset<OpenaireBrokerResult> r4 = join(
|
|
|
|
|
r3, rels, relatedEntities(publications, rels, RelatedProject.class));
|
|
|
|
|
;
|
|
|
|
|
final Dataset<OpenaireBrokerResult> r4 = join(r3, rels, relatedEntities(publications, rels, RelatedProject.class));;
|
|
|
|
|
|
|
|
|
|
return r4;
|
|
|
|
|
}
|
|
|
|
@ -163,9 +158,7 @@ public class GenerateEventsApplication {
|
|
|
|
|
final Class<RT> clazz) {
|
|
|
|
|
return rels
|
|
|
|
|
.joinWith(targets, targets.col("id").equalTo(rels.col("target")), "inner")
|
|
|
|
|
.map(
|
|
|
|
|
t -> RelatedEntityFactory.newRelatedEntity(t._1.getSource(), t._1.getRelType(), t._2, clazz),
|
|
|
|
|
Encoders.kryo(clazz));
|
|
|
|
|
.map(t -> RelatedEntityFactory.newRelatedEntity(t._1.getSource(), t._1.getRelType(), t._2, clazz), Encoders.kryo(clazz));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static <T> Dataset<OpenaireBrokerResult> join(final Dataset<OpenaireBrokerResult> sources,
|
|
|
|
@ -173,15 +166,13 @@ public class GenerateEventsApplication {
|
|
|
|
|
final Dataset<T> typedRels) {
|
|
|
|
|
|
|
|
|
|
final TypedColumn<Tuple2<OpenaireBrokerResult, T>, OpenaireBrokerResult> aggr = new OpenaireBrokerResultAggregator<T>()
|
|
|
|
|
.toColumn();
|
|
|
|
|
;
|
|
|
|
|
.toColumn();;
|
|
|
|
|
|
|
|
|
|
return sources
|
|
|
|
|
.joinWith(typedRels, sources.col("result.id").equalTo(rels.col("source")), "left_outer")
|
|
|
|
|
.groupByKey(
|
|
|
|
|
(MapFunction<Tuple2<OpenaireBrokerResult, T>, String>) t -> t._1.getOpenaireId(), Encoders.STRING())
|
|
|
|
|
return sources.joinWith(typedRels, sources.col("openaireId").equalTo(rels.col("source")), "left_outer")
|
|
|
|
|
.groupByKey((MapFunction<Tuple2<OpenaireBrokerResult, T>, String>) t -> t._1.getOpenaireId(), Encoders.STRING())
|
|
|
|
|
.agg(aggr)
|
|
|
|
|
.map(t -> t._2, Encoders.kryo(OpenaireBrokerResult.class));
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static <R> Dataset<R> readPath(
|
|
|
|
@ -195,14 +186,12 @@ public class GenerateEventsApplication {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception {
|
|
|
|
|
|
|
|
|
|
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
|
|
|
|
|
|
|
|
|
final String conf = isLookUpService
|
|
|
|
|
.getResourceProfileByQuery(
|
|
|
|
|
String
|
|
|
|
|
.format(
|
|
|
|
|
"for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()",
|
|
|
|
|
profId));
|
|
|
|
|
.getResourceProfileByQuery(String
|
|
|
|
|
.format("for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", profId));
|
|
|
|
|
|
|
|
|
|
final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class);
|
|
|
|
|
dedupConfig.getPace().initModel();
|
|
|
|
|