forked from D-Net/dnet-hadoop
indexing, accumulators, limited lists
This commit is contained in:
parent
6f13673464
commit
59a5421c24
|
@ -10,10 +10,8 @@ import java.util.stream.Collectors;
|
|||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.SparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -73,17 +71,17 @@ public class GenerateEventsJob {
|
|||
|
||||
final Map<String, LongAccumulator> accumulators = prepareAccumulators(spark.sparkContext());
|
||||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_events");
|
||||
|
||||
final Dataset<ResultGroup> groups = ClusterUtils
|
||||
.readPath(spark, workingPath + "/duplicates", ResultGroup.class);
|
||||
|
||||
final Dataset<Event> events = groups
|
||||
.map(
|
||||
(MapFunction<ResultGroup, EventGroup>) g -> EventFinder
|
||||
.generateEvents(g, dedupConfig, accumulators),
|
||||
Encoders.bean(EventGroup.class))
|
||||
.flatMap(group -> group.getData().iterator(), Encoders.bean(Event.class));
|
||||
final Dataset<Event> dataset = groups
|
||||
.map(g -> EventFinder.generateEvents(g, dedupConfig, accumulators), Encoders.bean(EventGroup.class))
|
||||
.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class))
|
||||
.map(e -> ClusterUtils.incrementAccumulator(e, total), Encoders.bean(Event.class));
|
||||
|
||||
events.write().mode(SaveMode.Overwrite).json(eventsPath);
|
||||
ClusterUtils.save(dataset, eventsPath, Event.class, total);
|
||||
|
||||
});
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.model.Event;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
|
||||
public class IndexOnESJob {
|
||||
|
||||
|
@ -45,10 +46,8 @@ public class IndexOnESJob {
|
|||
|
||||
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
||||
|
||||
final JavaRDD<String> inputRdd = spark
|
||||
.read()
|
||||
.load(eventsPath)
|
||||
.as(Encoders.bean(Event.class))
|
||||
final JavaRDD<String> inputRdd = ClusterUtils
|
||||
.readPath(spark, eventsPath, Event.class)
|
||||
.map(IndexOnESJob::eventAsJsonString, Encoders.STRING())
|
||||
.javaRDD();
|
||||
|
||||
|
|
|
@ -10,8 +10,8 @@ import org.apache.spark.SparkConf;
|
|||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.TypedColumn;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -52,6 +52,8 @@ public class JoinStep1Job {
|
|||
|
||||
ClusterUtils.removeDir(spark, joinedEntitiesPath);
|
||||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||
.readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class);
|
||||
|
||||
|
@ -61,16 +63,15 @@ public class JoinStep1Job {
|
|||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedProject>, OaBrokerMainEntity> aggr = new RelatedProjectAggregator()
|
||||
.toColumn();
|
||||
|
||||
sources
|
||||
final Dataset<OaBrokerMainEntity> dataset = sources
|
||||
.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
|
||||
.groupByKey(
|
||||
(MapFunction<Tuple2<OaBrokerMainEntity, RelatedProject>, String>) t -> t._1.getOpenaireId(),
|
||||
Encoders.STRING())
|
||||
.agg(aggr)
|
||||
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(joinedEntitiesPath);
|
||||
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
|
||||
|
||||
ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);
|
||||
|
||||
});
|
||||
|
||||
|
|
|
@ -10,8 +10,8 @@ import org.apache.spark.SparkConf;
|
|||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.TypedColumn;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -52,6 +52,8 @@ public class JoinStep2Job {
|
|||
|
||||
ClusterUtils.removeDir(spark, joinedEntitiesPath);
|
||||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||
.readPath(spark, workingPath + "/joinedEntities_step1", OaBrokerMainEntity.class);
|
||||
|
||||
|
@ -61,16 +63,15 @@ public class JoinStep2Job {
|
|||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedSoftware>, OaBrokerMainEntity> aggr = new RelatedSoftwareAggregator()
|
||||
.toColumn();
|
||||
|
||||
sources
|
||||
final Dataset<OaBrokerMainEntity> dataset = sources
|
||||
.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
|
||||
.groupByKey(
|
||||
(MapFunction<Tuple2<OaBrokerMainEntity, RelatedSoftware>, String>) t -> t._1.getOpenaireId(),
|
||||
Encoders.STRING())
|
||||
.agg(aggr)
|
||||
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(joinedEntitiesPath);
|
||||
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
|
||||
|
||||
ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);
|
||||
|
||||
});
|
||||
|
||||
|
|
|
@ -10,8 +10,8 @@ import org.apache.spark.SparkConf;
|
|||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.TypedColumn;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -52,6 +52,8 @@ public class JoinStep3Job {
|
|||
|
||||
ClusterUtils.removeDir(spark, joinedEntitiesPath);
|
||||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||
.readPath(spark, workingPath + "/joinedEntities_step2", OaBrokerMainEntity.class);
|
||||
|
||||
|
@ -61,16 +63,15 @@ public class JoinStep3Job {
|
|||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDataset>, OaBrokerMainEntity> aggr = new RelatedDatasetAggregator()
|
||||
.toColumn();
|
||||
|
||||
sources
|
||||
final Dataset<OaBrokerMainEntity> dataset = sources
|
||||
.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
|
||||
.groupByKey(
|
||||
(MapFunction<Tuple2<OaBrokerMainEntity, RelatedDataset>, String>) t -> t._1.getOpenaireId(),
|
||||
Encoders.STRING())
|
||||
.agg(aggr)
|
||||
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(joinedEntitiesPath);
|
||||
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
|
||||
|
||||
ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);
|
||||
|
||||
});
|
||||
|
||||
|
|
|
@ -10,8 +10,8 @@ import org.apache.spark.SparkConf;
|
|||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.TypedColumn;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -52,6 +52,8 @@ public class JoinStep4Job {
|
|||
|
||||
ClusterUtils.removeDir(spark, joinedEntitiesPath);
|
||||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||
.readPath(spark, workingPath + "/joinedEntities_step3", OaBrokerMainEntity.class);
|
||||
|
||||
|
@ -61,16 +63,15 @@ public class JoinStep4Job {
|
|||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedPublication>, OaBrokerMainEntity> aggr = new RelatedPublicationAggregator()
|
||||
.toColumn();
|
||||
|
||||
sources
|
||||
final Dataset<OaBrokerMainEntity> dataset = sources
|
||||
.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
|
||||
.groupByKey(
|
||||
(MapFunction<Tuple2<OaBrokerMainEntity, RelatedPublication>, String>) t -> t._1.getOpenaireId(),
|
||||
Encoders.STRING())
|
||||
.agg(aggr)
|
||||
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(joinedEntitiesPath);
|
||||
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
|
||||
|
||||
ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);
|
||||
|
||||
});
|
||||
|
||||
|
|
|
@ -10,8 +10,8 @@ import org.apache.spark.SparkConf;
|
|||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.TypedColumn;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -57,6 +57,8 @@ public class PrepareGroupsJob {
|
|||
|
||||
ClusterUtils.removeDir(spark, groupsPath);
|
||||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_groups");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> results = ClusterUtils
|
||||
.readPath(spark, workingPath + "/joinedEntities_step4", OaBrokerMainEntity.class);
|
||||
|
||||
|
@ -67,20 +69,16 @@ public class PrepareGroupsJob {
|
|||
final TypedColumn<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup> aggr = new ResultAggregator()
|
||||
.toColumn();
|
||||
|
||||
final Dataset<ResultGroup> groups = results
|
||||
final Dataset<ResultGroup> dataset = results
|
||||
.joinWith(mergedRels, results.col("openaireId").equalTo(mergedRels.col("source")), "inner")
|
||||
.groupByKey(
|
||||
(MapFunction<Tuple2<OaBrokerMainEntity, Relation>, String>) t -> t._2.getTarget(),
|
||||
Encoders.STRING())
|
||||
.agg(aggr)
|
||||
.map(
|
||||
(MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class))
|
||||
.map(t -> t._2, Encoders.bean(ResultGroup.class))
|
||||
.filter(rg -> rg.getData().size() > 1);
|
||||
|
||||
groups
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(groupsPath);
|
||||
ClusterUtils.save(dataset, groupsPath, ResultGroup.class, total);
|
||||
|
||||
});
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@ import org.apache.commons.io.IOUtils;
|
|||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -54,6 +54,8 @@ public class PrepareRelatedDatasetsJob {
|
|||
|
||||
ClusterUtils.removeDir(spark, relsPath);
|
||||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_rels");
|
||||
|
||||
final Dataset<OaBrokerRelatedDataset> datasets = ClusterUtils
|
||||
.readPath(spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class)
|
||||
.filter(d -> !ClusterUtils.isDedupRoot(d.getId()))
|
||||
|
@ -67,16 +69,15 @@ public class PrepareRelatedDatasetsJob {
|
|||
.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
|
||||
.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
|
||||
|
||||
rels
|
||||
final Dataset<RelatedDataset> dataset = rels
|
||||
.joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
|
||||
.map(t -> {
|
||||
final RelatedDataset rel = new RelatedDataset(t._1.getSource(), t._2);
|
||||
rel.getRelDataset().setRelType(t._1.getRelClass());
|
||||
return rel;
|
||||
}, Encoders.bean(RelatedDataset.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(relsPath);
|
||||
}, Encoders.bean(RelatedDataset.class));
|
||||
|
||||
ClusterUtils.save(dataset, relsPath, RelatedDataset.class, total);
|
||||
|
||||
});
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ import org.apache.commons.io.IOUtils;
|
|||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -56,6 +56,8 @@ public class PrepareRelatedProjectsJob {
|
|||
|
||||
ClusterUtils.removeDir(spark, relsPath);
|
||||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_rels");
|
||||
|
||||
final Dataset<OaBrokerProject> projects = ClusterUtils
|
||||
.readPath(spark, graphPath + "/project", Project.class)
|
||||
.filter(p -> !ClusterUtils.isDedupRoot(p.getId()))
|
||||
|
@ -69,12 +71,12 @@ public class PrepareRelatedProjectsJob {
|
|||
.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
|
||||
.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
|
||||
|
||||
rels
|
||||
final Dataset<RelatedProject> dataset = rels
|
||||
.joinWith(projects, projects.col("openaireId").equalTo(rels.col("target")), "inner")
|
||||
.map(t -> new RelatedProject(t._1.getSource(), t._2), Encoders.bean(RelatedProject.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(relsPath);
|
||||
.map(t -> new RelatedProject(t._1.getSource(), t._2), Encoders.bean(RelatedProject.class));
|
||||
|
||||
ClusterUtils.save(dataset, relsPath, RelatedProject.class, total);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@ import org.apache.commons.io.IOUtils;
|
|||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -55,6 +55,8 @@ public class PrepareRelatedPublicationsJob {
|
|||
|
||||
ClusterUtils.removeDir(spark, relsPath);
|
||||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_rels");
|
||||
|
||||
final Dataset<OaBrokerRelatedPublication> pubs = ClusterUtils
|
||||
.readPath(spark, graphPath + "/publication", Publication.class)
|
||||
.filter(p -> !ClusterUtils.isDedupRoot(p.getId()))
|
||||
|
@ -70,16 +72,15 @@ public class PrepareRelatedPublicationsJob {
|
|||
.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
|
||||
.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
|
||||
|
||||
rels
|
||||
final Dataset<RelatedPublication> dataset = rels
|
||||
.joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
|
||||
.map(t -> {
|
||||
final RelatedPublication rel = new RelatedPublication(t._1.getSource(), t._2);
|
||||
rel.getRelPublication().setRelType(t._1.getRelClass());
|
||||
return rel;
|
||||
}, Encoders.bean(RelatedPublication.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(relsPath);
|
||||
}, Encoders.bean(RelatedPublication.class));
|
||||
|
||||
ClusterUtils.save(dataset, relsPath, RelatedPublication.class, total);
|
||||
|
||||
});
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ import org.apache.commons.io.IOUtils;
|
|||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -56,6 +56,8 @@ public class PrepareRelatedSoftwaresJob {
|
|||
|
||||
ClusterUtils.removeDir(spark, relsPath);
|
||||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_rels");
|
||||
|
||||
final Dataset<OaBrokerRelatedSoftware> softwares = ClusterUtils
|
||||
.readPath(spark, graphPath + "/software", Software.class)
|
||||
.filter(sw -> !ClusterUtils.isDedupRoot(sw.getId()))
|
||||
|
@ -69,12 +71,11 @@ public class PrepareRelatedSoftwaresJob {
|
|||
.filter(r -> !ClusterUtils.isDedupRoot(r.getSource()))
|
||||
.filter(r -> !ClusterUtils.isDedupRoot(r.getTarget()));
|
||||
|
||||
rels
|
||||
final Dataset<RelatedSoftware> dataset = rels
|
||||
.joinWith(softwares, softwares.col("openaireId").equalTo(rels.col("target")), "inner")
|
||||
.map(t -> new RelatedSoftware(t._1.getSource(), t._2), Encoders.bean(RelatedSoftware.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(relsPath);
|
||||
.map(t -> new RelatedSoftware(t._1.getSource(), t._2), Encoders.bean(RelatedSoftware.class));
|
||||
|
||||
ClusterUtils.save(dataset, relsPath, RelatedSoftware.class, total);
|
||||
|
||||
});
|
||||
|
||||
|
|
|
@ -9,8 +9,8 @@ import org.apache.commons.io.IOUtils;
|
|||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -56,13 +56,14 @@ public class PrepareSimpleEntititiesJob {
|
|||
|
||||
ClusterUtils.removeDir(spark, simpleEntitiesPath);
|
||||
|
||||
prepareSimpleEntities(spark, graphPath, Publication.class)
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> dataset = prepareSimpleEntities(spark, graphPath, Publication.class)
|
||||
.union(prepareSimpleEntities(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class))
|
||||
.union(prepareSimpleEntities(spark, graphPath, Software.class))
|
||||
.union(prepareSimpleEntities(spark, graphPath, OtherResearchProduct.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(simpleEntitiesPath);
|
||||
.union(prepareSimpleEntities(spark, graphPath, OtherResearchProduct.class));
|
||||
|
||||
ClusterUtils.save(dataset, simpleEntitiesPath, OaBrokerMainEntity.class, total);
|
||||
});
|
||||
|
||||
}
|
||||
|
|
|
@ -83,8 +83,8 @@ public abstract class UpdateMatcher<T> {
|
|||
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0));
|
||||
}
|
||||
|
||||
protected boolean isMissing(final String field) {
|
||||
return StringUtils.isBlank(field);
|
||||
protected boolean isMissing(final String s) {
|
||||
return StringUtils.isBlank(s);
|
||||
}
|
||||
|
||||
public int getMaxNumber() {
|
||||
|
@ -108,7 +108,7 @@ public abstract class UpdateMatcher<T> {
|
|||
}
|
||||
|
||||
public void incrementAccumulator(final Map<String, LongAccumulator> accumulators, final long n) {
|
||||
if (accumulators.containsKey(accumulatorName())) {
|
||||
if (accumulators != null && accumulators.containsKey(accumulatorName())) {
|
||||
accumulators.get(accumulatorName()).add(n);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -4,7 +4,9 @@ package eu.dnetlib.dhp.broker.oa.util;
|
|||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
|
@ -44,4 +46,20 @@ public class ClusterUtils {
|
|||
|| s.equals("isSupplementedTo");
|
||||
}
|
||||
|
||||
public static <T> T incrementAccumulator(final T o, final LongAccumulator acc) {
|
||||
if (acc != null) {
|
||||
acc.add(1);
|
||||
}
|
||||
return o;
|
||||
}
|
||||
|
||||
public static <T> void save(final Dataset<T> dataset, final String path, final Class<T> clazz,
|
||||
final LongAccumulator acc) {
|
||||
dataset
|
||||
.map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(path);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -55,7 +55,7 @@ public class ConversionUtils {
|
|||
res.setLicense(BrokerConstants.OPEN_ACCESS);
|
||||
res.setHostedby(kvValue(i.getHostedby()));
|
||||
return res;
|
||||
});
|
||||
}, 20);
|
||||
}
|
||||
|
||||
public static OaBrokerTypedValue oafPidToBrokerPid(final StructuredProperty sp) {
|
||||
|
@ -75,8 +75,8 @@ public class ConversionUtils {
|
|||
res.setOpenaireId(d.getId());
|
||||
res.setOriginalId(first(d.getOriginalId()));
|
||||
res.setTitle(structPropValue(d.getTitle()));
|
||||
res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid));
|
||||
res.setInstances(flatMappedList(d.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
|
||||
res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid, 20));
|
||||
res.setInstances(flatMappedList(d.getInstance(), ConversionUtils::oafInstanceToBrokerInstances, 20));
|
||||
res.setCollectedFrom(mappedFirst(d.getCollectedfrom(), KeyValue::getValue));
|
||||
return res;
|
||||
}
|
||||
|
@ -90,8 +90,8 @@ public class ConversionUtils {
|
|||
res.setOpenaireId(p.getId());
|
||||
res.setOriginalId(first(p.getOriginalId()));
|
||||
res.setTitle(structPropValue(p.getTitle()));
|
||||
res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid));
|
||||
res.setInstances(flatMappedList(p.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
|
||||
res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid, 20));
|
||||
res.setInstances(flatMappedList(p.getInstance(), ConversionUtils::oafInstanceToBrokerInstances, 20));
|
||||
res.setCollectedFrom(mappedFirst(p.getCollectedfrom(), KeyValue::getValue));
|
||||
|
||||
return res;
|
||||
|
@ -107,23 +107,25 @@ public class ConversionUtils {
|
|||
res.setOpenaireId(result.getId());
|
||||
res.setOriginalId(first(result.getOriginalId()));
|
||||
res.setTypology(classId(result.getResulttype()));
|
||||
res.setTitles(structPropList(result.getTitle()));
|
||||
res.setAbstracts(fieldList(result.getDescription()));
|
||||
res.setTitles(structPropList(result.getTitle(), 10));
|
||||
res.setAbstracts(fieldList(result.getDescription(), 10));
|
||||
res.setLanguage(classId(result.getLanguage()));
|
||||
res.setSubjects(structPropTypedList(result.getSubject()));
|
||||
res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor));
|
||||
res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor, 30));
|
||||
res.setPublicationdate(fieldValue(result.getDateofacceptance()));
|
||||
res.setPublisher(fieldValue(result.getPublisher()));
|
||||
res.setEmbargoenddate(fieldValue(result.getEmbargoenddate()));
|
||||
res.setContributor(fieldList(result.getContributor()));
|
||||
res.setContributor(fieldList(result.getContributor(), 20));
|
||||
res
|
||||
.setJournal(
|
||||
result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null);
|
||||
res.setCollectedFromId(mappedFirst(result.getCollectedfrom(), KeyValue::getKey));
|
||||
res.setCollectedFromName(mappedFirst(result.getCollectedfrom(), KeyValue::getValue));
|
||||
res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid));
|
||||
res.setInstances(flatMappedList(result.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
|
||||
res.setExternalReferences(mappedList(result.getExternalReference(), ConversionUtils::oafExtRefToBrokerExtRef));
|
||||
res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid, 20));
|
||||
res.setInstances(flatMappedList(result.getInstance(), ConversionUtils::oafInstanceToBrokerInstances, 20));
|
||||
res
|
||||
.setExternalReferences(
|
||||
mappedList(result.getExternalReference(), ConversionUtils::oafExtRefToBrokerExtRef, 20));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
@ -243,18 +245,25 @@ public class ConversionUtils {
|
|||
: null;
|
||||
}
|
||||
|
||||
private static List<String> fieldList(final List<Field<String>> fl) {
|
||||
private static List<String> fieldList(final List<Field<String>> fl, final long maxSize) {
|
||||
return fl != null
|
||||
? fl.stream().map(Field::getValue).filter(StringUtils::isNotBlank).collect(Collectors.toList())
|
||||
? fl
|
||||
.stream()
|
||||
.map(Field::getValue)
|
||||
.map(s -> StringUtils.abbreviate(s, 3000)) // MAX 3000 CHARS
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.limit(maxSize)
|
||||
.collect(Collectors.toList())
|
||||
: new ArrayList<>();
|
||||
}
|
||||
|
||||
private static List<String> structPropList(final List<StructuredProperty> props) {
|
||||
private static List<String> structPropList(final List<StructuredProperty> props, final long maxSize) {
|
||||
return props != null
|
||||
? props
|
||||
.stream()
|
||||
.map(StructuredProperty::getValue)
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.limit(maxSize)
|
||||
.collect(Collectors.toList())
|
||||
: new ArrayList<>();
|
||||
}
|
||||
|
@ -271,7 +280,7 @@ public class ConversionUtils {
|
|||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func) {
|
||||
private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func, final long maxSize) {
|
||||
if (list == null) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
@ -280,10 +289,12 @@ public class ConversionUtils {
|
|||
.stream()
|
||||
.map(func::apply)
|
||||
.filter(Objects::nonNull)
|
||||
.limit(maxSize)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static <F, T> List<T> flatMappedList(final List<F> list, final Function<F, List<T>> func) {
|
||||
private static <F, T> List<T> flatMappedList(final List<F> list, final Function<F, List<T>> func,
|
||||
final long maxSize) {
|
||||
if (list == null) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
@ -293,6 +304,7 @@ public class ConversionUtils {
|
|||
.map(func::apply)
|
||||
.flatMap(List::stream)
|
||||
.filter(Objects::nonNull)
|
||||
.limit(maxSize)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
|
|
@ -378,9 +378,9 @@
|
|||
<class>eu.dnetlib.dhp.broker.oa.IndexOnESJob</class>
|
||||
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.dynamicAllocation.maxExecutors="2"
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
|
|
@ -79,7 +79,6 @@
|
|||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="generate_events">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
|
@ -101,31 +100,6 @@
|
|||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--dedupConfProfile</arg><arg>${dedupConfProfId}</arg>
|
||||
</spark>
|
||||
<ok to="index_es"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="index_es">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>IndexOnESJob</name>
|
||||
<class>eu.dnetlib.dhp.broker.oa.IndexOnESJob</class>
|
||||
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>--index</arg><arg>${esIndexName}</arg>
|
||||
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
|
|
@ -0,0 +1,125 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.matchers;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
|
||||
class UpdateMatcherTest {
|
||||
|
||||
UpdateMatcher<String> matcher = new EnrichMissingPublicationDate();
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws Exception {
|
||||
}
|
||||
|
||||
@Test
|
||||
void testSearchUpdatesForRecord_1() {
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p1 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p2 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p3 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p4 = new OaBrokerMainEntity();
|
||||
|
||||
final Collection<UpdateInfo<String>> list = matcher
|
||||
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null);
|
||||
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testSearchUpdatesForRecord_2() {
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p1 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p2 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p3 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p4 = new OaBrokerMainEntity();
|
||||
|
||||
res.setPublicationdate("2018");
|
||||
|
||||
final Collection<UpdateInfo<String>> list = matcher
|
||||
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null);
|
||||
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testSearchUpdatesForRecord_3() {
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p1 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p2 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p3 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p4 = new OaBrokerMainEntity();
|
||||
|
||||
p2.setPublicationdate("2018");
|
||||
|
||||
final Collection<UpdateInfo<String>> list = matcher
|
||||
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null);
|
||||
|
||||
assertTrue(list.size() == 1);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testSearchUpdatesForRecord_4() {
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p1 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p2 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p3 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p4 = new OaBrokerMainEntity();
|
||||
|
||||
res.setPublicationdate("2018");
|
||||
p2.setPublicationdate("2018");
|
||||
|
||||
final Collection<UpdateInfo<String>> list = matcher
|
||||
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null);
|
||||
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testSearchUpdatesForRecord_5() {
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p1 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p2 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p3 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p4 = new OaBrokerMainEntity();
|
||||
res.setPublicationdate("2018");
|
||||
p1.setPublicationdate("2018");
|
||||
p2.setPublicationdate("2018");
|
||||
p3.setPublicationdate("2018");
|
||||
p4.setPublicationdate("2018");
|
||||
|
||||
final Collection<UpdateInfo<String>> list = matcher
|
||||
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null);
|
||||
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testSearchUpdatesForRecord_6() {
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p1 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p2 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p3 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p4 = new OaBrokerMainEntity();
|
||||
|
||||
p1.setPublicationdate("2018");
|
||||
p2.setPublicationdate("2018");
|
||||
p3.setPublicationdate("2018");
|
||||
p4.setPublicationdate("2018");
|
||||
|
||||
final Collection<UpdateInfo<String>> list = matcher
|
||||
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null);
|
||||
|
||||
assertTrue(list.size() == 1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,57 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
|
||||
class EnrichMissingPublicationDateTest {
|
||||
|
||||
final EnrichMissingPublicationDate matcher = new EnrichMissingPublicationDate();
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws Exception {
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_1() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
final List<String> list = matcher.findDifferences(source, target);
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_2() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
source.setPublicationdate("2018");
|
||||
final List<String> list = matcher.findDifferences(source, target);
|
||||
assertTrue(list.size() == 1);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_3() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
target.setPublicationdate("2018");
|
||||
final List<String> list = matcher.findDifferences(source, target);
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_4() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
source.setPublicationdate("2018");
|
||||
target.setPublicationdate("2018");
|
||||
final List<String> list = matcher.findDifferences(source, target);
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue