Merge branch 'master' of https://code-repo.d4science.org/D-Net/dnet-hadoop into orcid-no-doi
commit
a6acb37689
@ -0,0 +1,90 @@
|
||||
package eu.dnetlib.dhp.schema.scholexplorer
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Field, KeyValue, Qualifier, StructuredProperty}
|
||||
|
||||
object OafUtils {
|
||||
|
||||
|
||||
|
||||
def generateKeyValue(key: String, value: String): KeyValue = {
|
||||
val kv: KeyValue = new KeyValue()
|
||||
kv.setKey(key)
|
||||
kv.setValue(value)
|
||||
kv.setDataInfo(generateDataInfo("0.9"))
|
||||
kv
|
||||
}
|
||||
|
||||
|
||||
def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = {
|
||||
val di = new DataInfo
|
||||
di.setDeletedbyinference(false)
|
||||
di.setInferred(false)
|
||||
di.setInvisible(false)
|
||||
di.setTrust(trust)
|
||||
di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
|
||||
di
|
||||
}
|
||||
|
||||
def createQualifier(cls: String, sch: String): Qualifier = {
|
||||
createQualifier(cls, cls, sch, sch)
|
||||
}
|
||||
|
||||
|
||||
def createQualifier(classId: String, className: String, schemeId: String, schemeName: String): Qualifier = {
|
||||
val q: Qualifier = new Qualifier
|
||||
q.setClassid(classId)
|
||||
q.setClassname(className)
|
||||
q.setSchemeid(schemeId)
|
||||
q.setSchemename(schemeName)
|
||||
q
|
||||
}
|
||||
|
||||
|
||||
def asField[T](value: T): Field[T] = {
|
||||
val tmp = new Field[T]
|
||||
tmp.setValue(value)
|
||||
tmp
|
||||
|
||||
|
||||
}
|
||||
|
||||
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = {
|
||||
val sp = new StructuredProperty
|
||||
sp.setQualifier(createQualifier(classId,className, schemeId, schemeName))
|
||||
sp.setValue(value)
|
||||
sp
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = {
|
||||
val sp = new StructuredProperty
|
||||
sp.setQualifier(createQualifier(classId,className, schemeId, schemeName))
|
||||
sp.setValue(value)
|
||||
sp.setDataInfo(dataInfo)
|
||||
sp
|
||||
|
||||
}
|
||||
|
||||
def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
|
||||
val sp = new StructuredProperty
|
||||
sp.setQualifier(createQualifier(classId, schemeId))
|
||||
sp.setValue(value)
|
||||
sp
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
|
||||
val sp = new StructuredProperty
|
||||
sp.setQualifier(createQualifier(classId, schemeId))
|
||||
sp.setValue(value)
|
||||
sp.setDataInfo(dataInfo)
|
||||
sp
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,112 @@
|
||||
|
||||
package eu.dnetlib.dhp.broker.oa;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.sql.TypedColumn;
|
||||
import org.apache.spark.sql.expressions.Aggregator;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.model.Event;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class CheckDuplictedIdsJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(CheckDuplictedIdsJob.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
CheckDuplictedIdsJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String countPath = parser.get("workingPath") + "/counts";
|
||||
log.info("countPath: {}", countPath);
|
||||
|
||||
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
||||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("invaild_event_id");
|
||||
|
||||
final TypedColumn<Tuple2<String, Long>, Tuple2<String, Long>> agg = new CountAggregator().toColumn();
|
||||
|
||||
ClusterUtils
|
||||
.readPath(spark, eventsPath, Event.class)
|
||||
.map(e -> new Tuple2<>(e.getEventId(), 1l), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
|
||||
.groupByKey(t -> t._1, Encoders.STRING())
|
||||
.agg(agg)
|
||||
.map(t -> t._2, Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
|
||||
.filter(t -> t._2 > 1)
|
||||
.map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(countPath);
|
||||
;
|
||||
|
||||
}
|
||||
|
||||
private static String eventAsJsonString(final Event f) throws JsonProcessingException {
|
||||
return new ObjectMapper().writeValueAsString(f);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
class CountAggregator extends Aggregator<Tuple2<String, Long>, Tuple2<String, Long>, Tuple2<String, Long>> {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 1395935985734672538L;
|
||||
|
||||
@Override
|
||||
public Encoder<Tuple2<String, Long>> bufferEncoder() {
|
||||
return Encoders.tuple(Encoders.STRING(), Encoders.LONG());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tuple2<String, Long> finish(final Tuple2<String, Long> arg0) {
|
||||
return arg0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tuple2<String, Long> merge(final Tuple2<String, Long> arg0, final Tuple2<String, Long> arg1) {
|
||||
final String s = StringUtils.defaultIfBlank(arg0._1, arg1._1);
|
||||
return new Tuple2<>(s, arg0._2 + arg1._2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<Tuple2<String, Long>> outputEncoder() {
|
||||
return Encoders.tuple(Encoders.STRING(), Encoders.LONG());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tuple2<String, Long> reduce(final Tuple2<String, Long> arg0, final Tuple2<String, Long> arg1) {
|
||||
final String s = StringUtils.defaultIfBlank(arg0._1, arg1._1);
|
||||
return new Tuple2<>(s, arg0._2 + arg1._2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tuple2<String, Long> zero() {
|
||||
return new Tuple2<>(null, 0l);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,63 @@
|
||||
|
||||
package eu.dnetlib.dhp.broker.oa;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.TypedColumn;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.model.Event;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.stats.DatasourceStats;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.stats.StatsAggregator;
|
||||
|
||||
public class GenerateStatsJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GenerateStatsJob.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
IndexOnESJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String statsPath = parser.get("workingPath") + "/stats";
|
||||
log.info("stats: {}", statsPath);
|
||||
|
||||
final TypedColumn<Event, DatasourceStats> aggr = new StatsAggregator().toColumn();
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
||||
final Dataset<DatasourceStats> stats = ClusterUtils
|
||||
.readPath(spark, eventsPath, Event.class)
|
||||
.groupByKey(e -> e.getMap().getTargetDatasourceId(), Encoders.STRING())
|
||||
.agg(aggr)
|
||||
.map(t -> t._2, Encoders.bean(DatasourceStats.class));
|
||||
|
||||
ClusterUtils.save(stats, statsPath, DatasourceStats.class, null);
|
||||
});
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,71 @@
|
||||
|
||||
package eu.dnetlib.dhp.broker.oa;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.model.Event;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
|
||||
public class IndexOnESJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(IndexOnESJob.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
IndexOnESJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/index_es.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
final String eventsPath = parser.get("workingPath") + "/events";
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String index = parser.get("index");
|
||||
log.info("index: {}", index);
|
||||
|
||||
final String indexHost = parser.get("esHost");
|
||||
log.info("indexHost: {}", indexHost);
|
||||
|
||||
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
||||
|
||||
final JavaRDD<String> inputRdd = ClusterUtils
|
||||
.readPath(spark, eventsPath, Event.class)
|
||||
// .limit(10000) // TODO REMOVE
|
||||
.map(IndexOnESJob::eventAsJsonString, Encoders.STRING())
|
||||
.javaRDD();
|
||||
|
||||
final Map<String, String> esCfg = new HashMap<>();
|
||||
// esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");
|
||||
esCfg.put("es.nodes", indexHost);
|
||||
esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
|
||||
esCfg.put("es.batch.write.retry.count", "8");
|
||||
esCfg.put("es.batch.write.retry.wait", "60s");
|
||||
esCfg.put("es.batch.size.entries", "200");
|
||||
esCfg.put("es.nodes.wan.only", "true");
|
||||
|
||||
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
|
||||
}
|
||||
|
||||
private static String eventAsJsonString(final Event f) throws JsonProcessingException {
|
||||
return new ObjectMapper().writeValueAsString(f);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,80 @@
|
||||
|
||||
package eu.dnetlib.dhp.broker.oa;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.TypedColumn;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDatasource;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDatasourceAggregator;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class JoinStep0Job {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(JoinStep0Job.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
JoinStep0Job.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
|
||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step0";
|
||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
||||
ClusterUtils.removeDir(spark, joinedEntitiesPath);
|
||||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||
|
||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||
.readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class);
|
||||
|
||||
final Dataset<RelatedDatasource> typedRels = ClusterUtils
|
||||
.readPath(spark, workingPath + "/relatedDatasources", RelatedDatasource.class);
|
||||
|
||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDatasource>, OaBrokerMainEntity> aggr = new RelatedDatasourceAggregator()
|
||||
.toColumn();
|
||||
|
||||
final Dataset<OaBrokerMainEntity> dataset = sources
|
||||
.joinWith(typedRels, sources.col("openaireId").equalTo(typedRels.col("source")), "left_outer")
|
||||
.groupByKey(t -> t._1.getOpenaireId(), Encoders.STRING())
|
||||
.agg(aggr)
|
||||
.map(t -> t._2, Encoders.bean(OaBrokerMainEntity.class));
|
||||
|
||||
ClusterUtils.save(dataset, joinedEntitiesPath, OaBrokerMainEntity.class, total);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,107 @@
|
||||
|
||||
package eu.dnetlib.dhp.broker.oa;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.DatasourceRelationsAccumulator;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDatasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import scala.Tuple3;
|
||||
|
||||
public class PrepareRelatedDatasourcesJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(PrepareRelatedDatasourcesJob.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
PrepareRelatedDatasourcesJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String graphPath = parser.get("graphPath");
|
||||
log.info("graphPath: {}", graphPath);
|
||||
|
||||
final String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
|
||||
final String relsPath = workingPath + "/relatedDatasources";
|
||||
log.info("relsPath: {}", relsPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
|
||||
ClusterUtils.removeDir(spark, relsPath);
|
||||
|
||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_datasources");
|
||||
|
||||
final Dataset<Tuple3<String, String, String>> rels = prepareResultTuples(
|
||||
spark, graphPath, Publication.class)
|
||||
.union(prepareResultTuples(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class))
|
||||
.union(prepareResultTuples(spark, graphPath, Software.class))
|
||||
.union(prepareResultTuples(spark, graphPath, OtherResearchProduct.class));
|
||||
|
||||
final Dataset<OaBrokerRelatedDatasource> datasources = ClusterUtils
|
||||
.readPath(spark, graphPath + "/datasource", Datasource.class)
|
||||
.map(ConversionUtils::oafDatasourceToBrokerDatasource, Encoders.bean(OaBrokerRelatedDatasource.class));
|
||||
|
||||
final Dataset<RelatedDatasource> dataset = rels
|
||||
.joinWith(datasources, datasources.col("openaireId").equalTo(rels.col("_2")), "inner")
|
||||
.map(t -> {
|
||||
final RelatedDatasource r = new RelatedDatasource();
|
||||
r.setSource(t._1._1());
|
||||
r.setRelDatasource(t._2);
|
||||
r.getRelDatasource().setRelType(t._1._3());
|
||||
return r;
|
||||
}, Encoders.bean(RelatedDatasource.class));
|
||||
|
||||
ClusterUtils.save(dataset, relsPath, RelatedDatasource.class, total);
|
||||
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static final Dataset<Tuple3<String, String, String>> prepareResultTuples(final SparkSession spark,
|
||||
final String graphPath,
|
||||
final Class<? extends Result> sourceClass) {
|
||||
|
||||
return ClusterUtils
|
||||
.readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
|
||||
.filter(r -> !ClusterUtils.isDedupRoot(r.getId()))
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||
.map(
|
||||
r -> DatasourceRelationsAccumulator.calculateTuples(r),
|
||||
Encoders.bean(DatasourceRelationsAccumulator.class))
|
||||
.flatMap(
|
||||
acc -> acc.getRels().iterator(),
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.STRING(), Encoders.STRING()));
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,68 @@
|
||||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import scala.Tuple3;
|
||||
|
||||
public class DatasourceRelationsAccumulator implements Serializable {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 3256220670651218957L;
|
||||
|
||||
private List<Tuple3<String, String, String>> rels = new ArrayList<>();
|
||||
|
||||
public List<Tuple3<String, String, String>> getRels() {
|
||||
return rels;
|
||||
}
|
||||
|
||||
public void setRels(final List<Tuple3<String, String, String>> rels) {
|
||||
this.rels = rels;
|
||||
}
|
||||
|
||||
protected void addTuple(final Tuple3<String, String, String> t) {
|
||||
rels.add(t);
|
||||
}
|
||||
|
||||
public static final DatasourceRelationsAccumulator calculateTuples(final Result r) {
|
||||
|
||||
final Set<String> collectedFromSet = r
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.map(kv -> kv.getKey())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.distinct()
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
final Set<String> hostedBySet = r
|
||||
.getInstance()
|
||||
.stream()
|
||||
.map(i -> i.getHostedby())
|
||||
.filter(Objects::nonNull)
|
||||
.filter(kv -> !StringUtils.equalsIgnoreCase(kv.getValue(), "Unknown Repository"))
|
||||
.map(kv -> kv.getKey())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.distinct()
|
||||
.filter(id -> !collectedFromSet.contains(id))
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
final DatasourceRelationsAccumulator res = new DatasourceRelationsAccumulator();
|
||||
collectedFromSet
|
||||
.stream()
|
||||
.map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.COLLECTED_FROM_REL))
|
||||
.forEach(res::addTuple);
|
||||
hostedBySet.stream().map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.HOSTED_BY_REL)).forEach(res::addTuple);
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,61 @@
|
||||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.stats;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class DatasourceStats implements Serializable {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = -282112564184047677L;
|
||||
|
||||
private String id;
|
||||
private String name;
|
||||
private String type;
|
||||
private Map<String, Long> topics = new HashMap<>();
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(final String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(final String type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public Map<String, Long> getTopics() {
|
||||
return topics;
|
||||
}
|
||||
|
||||
public void setTopics(final Map<String, Long> topics) {
|
||||
this.topics = topics;
|
||||
}
|
||||
|
||||
public void incrementTopic(final String topic, final long inc) {
|
||||
if (topics.containsKey(topic)) {
|
||||
topics.put(topic, topics.get(topic) + inc);
|
||||
} else {
|
||||
topics.put(topic, inc);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,59 @@
|
||||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.stats;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.expressions.Aggregator;
|
||||
|
||||
import eu.dnetlib.dhp.broker.model.Event;
|
||||
|
||||
public class StatsAggregator extends Aggregator<Event, DatasourceStats, DatasourceStats> {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 6652105853037330529L;
|
||||
|
||||
@Override
|
||||
public DatasourceStats zero() {
|
||||
return new DatasourceStats();
|
||||
}
|
||||
|
||||
@Override
|
||||
public DatasourceStats reduce(final DatasourceStats stats, final Event e) {
|
||||
stats.setId(e.getMap().getTargetDatasourceId());
|
||||
stats.setName(e.getMap().getTargetDatasourceName());
|
||||
stats.setType(e.getMap().getTargetDatasourceType());
|
||||
stats.incrementTopic(e.getTopic(), 1l);
|
||||
return stats;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DatasourceStats merge(final DatasourceStats stats0, final DatasourceStats stats1) {
|
||||
if (StringUtils.isBlank(stats0.getId())) {
|
||||
stats0.setId(stats1.getId());
|
||||
stats0.setName(stats1.getName());
|
||||
stats0.setType(stats1.getType());
|
||||
}
|
||||
stats1.getTopics().entrySet().forEach(e -> stats0.incrementTopic(e.getKey(), e.getValue()));
|
||||
return stats0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<DatasourceStats> bufferEncoder() {
|
||||
return Encoders.bean(DatasourceStats.class);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public DatasourceStats finish(final DatasourceStats stats) {
|
||||
return stats;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<DatasourceStats> outputEncoder() {
|
||||
return Encoders.bean(DatasourceStats.class);
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,42 @@
|
||||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
|
||||
|
||||
public class RelatedDatasource implements Serializable {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 3015550240920424010L;
|
||||
|
||||
private String source;
|
||||
private OaBrokerRelatedDatasource relDatasource;
|
||||
|
||||
public RelatedDatasource() {
|
||||
}
|
||||
|
||||
public RelatedDatasource(final String source, final OaBrokerRelatedDatasource relDatasource) {
|
||||
this.source = source;
|
||||
this.relDatasource = relDatasource;
|
||||
}
|
||||
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public void setSource(final String source) {
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
public OaBrokerRelatedDatasource getRelDatasource() {
|
||||
return relDatasource;
|
||||
}
|
||||
|
||||
public void setRelDatasource(final OaBrokerRelatedDatasource relDatasource) {
|
||||
this.relDatasource = relDatasource;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,69 @@
|
||||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.expressions.Aggregator;
|
||||
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class RelatedDatasourceAggregator
|
||||
extends Aggregator<Tuple2<OaBrokerMainEntity, RelatedDatasource>, OaBrokerMainEntity, OaBrokerMainEntity> {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = -7212121913834713672L;
|
||||
|
||||
@Override
|
||||
public OaBrokerMainEntity zero() {
|
||||
return new OaBrokerMainEntity();
|
||||
}
|
||||
|
||||
@Override
|
||||
public OaBrokerMainEntity finish(final OaBrokerMainEntity g) {
|
||||
return g;
|
||||
}
|
||||
|
||||
@Override
|
||||
public OaBrokerMainEntity reduce(final OaBrokerMainEntity g,
|
||||
final Tuple2<OaBrokerMainEntity, RelatedDatasource> t) {
|
||||
final OaBrokerMainEntity res = StringUtils.isNotBlank(g.getOpenaireId()) ? g : t._1;
|
||||
if (t._2 != null && res.getDatasources().size() < BrokerConstants.MAX_NUMBER_OF_RELS) {
|
||||
res.getDatasources().add(t._2.getRelDatasource());
|
||||
}
|
||||
return res;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public OaBrokerMainEntity merge(final OaBrokerMainEntity g1, final OaBrokerMainEntity g2) {
|
||||
if (StringUtils.isNotBlank(g1.getOpenaireId())) {
|
||||
final int availables = BrokerConstants.MAX_NUMBER_OF_RELS - g1.getDatasources().size();
|
||||
if (availables > 0) {
|
||||
if (g2.getDatasources().size() <= availables) {
|
||||
g1.getDatasources().addAll(g2.getDatasources());
|
||||
} else {
|
||||
g1.getDatasources().addAll(g2.getDatasources().subList(0, availables));
|
||||
}
|
||||
}
|
||||
return g1;
|
||||
} else {
|
||||
return g2;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<OaBrokerMainEntity> bufferEncoder() {
|
||||
return Encoders.bean(OaBrokerMainEntity.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<OaBrokerMainEntity> outputEncoder() {
|
||||
return Encoders.bean(OaBrokerMainEntity.class);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,122 @@
|
||||
{
|
||||
"wf": {
|
||||
|
||||
},
|
||||
"pace": {
|
||||
"clustering": [
|
||||
{
|
||||
"name": "wordssuffixprefix",
|
||||
"fields": [
|
||||
"title"
|
||||
],
|
||||
"params": {
|
||||
"max": "2",
|
||||
"len": "3"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "lowercase",
|
||||
"fields": [
|
||||
"doi"
|
||||
],
|
||||
"params": {
|
||||
|
||||
}
|
||||
}
|
||||
],
|
||||
"decisionTree": {
|
||||
"start": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "doi",
|
||||
"comparator": "exactMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 0.5,
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"negative": "layer1",
|
||||
"undefined": "layer1",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"layer1": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "titleVersionMatch",
|
||||
"weight": 0.9,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
|
||||
}
|
||||
},
|
||||
{
|
||||
"field": "authors",
|
||||
"comparator": "sizeMatch",
|
||||
"weight": 0.9,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 0.5,
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"negative": "layer2",
|
||||
"undefined": "layer2",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"layer2": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "title",
|
||||
"comparator": "levensteinTitle",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {
|
||||
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 0.99,
|
||||
"aggregation": "AVG",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
}
|
||||
},
|
||||
"model": [
|
||||
{
|
||||
"name": "doi",
|
||||
"type": "String",
|
||||
"path": "$.pids[?(@.type == 'doi')].value"
|
||||
},
|
||||
{
|
||||
"name": "title",
|
||||
"type": "String",
|
||||
"path": "$.titles",
|
||||
"length": 250,
|
||||
"size": 5
|
||||
},
|
||||
{
|
||||
"name": "authors",
|
||||
"type": "List",
|
||||
"path": "$.creators[*].fullname",
|
||||
"size": 200
|
||||
}
|
||||
],
|
||||
"blacklists": {
|
||||
|
||||
},
|
||||
"synonyms": {
|
||||
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
[
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "the workinh path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "idx",
|
||||
"paramLongName": "index",
|
||||
"paramDescription": "the ES index",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "es",
|
||||
"paramLongName": "esHost",
|
||||
"paramDescription": "the ES host",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -0,0 +1,133 @@
|
||||
|
||||
package eu.dnetlib.dhp.broker.oa.matchers;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerRelatedDatasource;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
class UpdateMatcherTest {
|
||||
|
||||
UpdateMatcher<String> matcher = new EnrichMissingPublicationDate();
|
||||
|
||||
@Mock
|
||||
private OaBrokerRelatedDatasource targetDs;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws Exception {
|
||||
}
|
||||
|
||||
@Test
|
||||
void testSearchUpdatesForRecord_1() {
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p1 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p2 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p3 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p4 = new OaBrokerMainEntity();
|
||||
|
||||
final Collection<UpdateInfo<String>> list = matcher
|
||||
.searchUpdatesForRecord(res, targetDs, Arrays.asList(p1, p2, p3, p4), null);
|
||||
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testSearchUpdatesForRecord_2() {
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p1 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p2 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p3 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p4 = new OaBrokerMainEntity();
|
||||
|
||||
res.setPublicationdate("2018");
|
||||
|
||||
final Collection<UpdateInfo<String>> list = matcher
|
||||
.searchUpdatesForRecord(res, targetDs, Arrays.asList(p1, p2, p3, p4), null);
|
||||
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testSearchUpdatesForRecord_3() {
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p1 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p2 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p3 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p4 = new OaBrokerMainEntity();
|
||||
|
||||
p2.setPublicationdate("2018");
|
||||
|
||||
final Collection<UpdateInfo<String>> list = matcher
|
||||
.searchUpdatesForRecord(res, targetDs, Arrays.asList(p1, p2, p3, p4), null);
|
||||
|
||||
assertTrue(list.size() == 1);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testSearchUpdatesForRecord_4() {
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p1 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p2 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p3 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p4 = new OaBrokerMainEntity();
|
||||
|
||||
res.setPublicationdate("2018");
|
||||
p2.setPublicationdate("2018");
|
||||
|
||||
final Collection<UpdateInfo<String>> list = matcher
|
||||
.searchUpdatesForRecord(res, targetDs, Arrays.asList(p1, p2, p3, p4), null);
|
||||
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testSearchUpdatesForRecord_5() {
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p1 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p2 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p3 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p4 = new OaBrokerMainEntity();
|
||||
res.setPublicationdate("2018");
|
||||
p1.setPublicationdate("2018");
|
||||
p2.setPublicationdate("2018");
|
||||
p3.setPublicationdate("2018");
|
||||
p4.setPublicationdate("2018");
|
||||
|
||||
final Collection<UpdateInfo<String>> list = matcher
|
||||
.searchUpdatesForRecord(res, targetDs, Arrays.asList(p1, p2, p3, p4), null);
|
||||
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testSearchUpdatesForRecord_6() {
|
||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p1 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p2 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p3 = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity p4 = new OaBrokerMainEntity();
|
||||
|
||||
p1.setPublicationdate("2018");
|
||||
p2.setPublicationdate("2018");
|
||||
p3.setPublicationdate("2018");
|
||||
p4.setPublicationdate("2018");
|
||||
|
||||
final Collection<UpdateInfo<String>> list = matcher
|
||||
.searchUpdatesForRecord(res, targetDs, Arrays.asList(p1, p2, p3, p4), null);
|
||||
|
||||
assertTrue(list.size() == 1);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,57 @@
|
||||
|
||||
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
|
||||
class EnrichMissingPublicationDateTest {
|
||||
|
||||
final EnrichMissingPublicationDate matcher = new EnrichMissingPublicationDate();
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws Exception {
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_1() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
final List<String> list = matcher.findDifferences(source, target);
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_2() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
source.setPublicationdate("2018");
|
||||
final List<String> list = matcher.findDifferences(source, target);
|
||||
assertTrue(list.size() == 1);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_3() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
target.setPublicationdate("2018");
|
||||
final List<String> list = matcher.findDifferences(source, target);
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_4() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
source.setPublicationdate("2018");
|
||||
target.setPublicationdate("2018");
|
||||
final List<String> list = matcher.findDifferences(source, target);
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,57 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.expressions.Aggregator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
public class RelationAggregator extends Aggregator<Relation, Relation, Relation> {
|
||||
|
||||
private static Relation ZERO = new Relation();
|
||||
|
||||
@Override
|
||||
public Relation zero() {
|
||||
return ZERO;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Relation reduce(Relation b, Relation a) {
|
||||
return mergeRel(b, a);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Relation merge(Relation b, Relation a) {
|
||||
return mergeRel(b, a);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Relation finish(Relation r) {
|
||||
return r;
|
||||
}
|
||||
|
||||
private Relation mergeRel(Relation b, Relation a) {
|
||||
if (Objects.equals(b, ZERO)) {
|
||||
return a;
|
||||
}
|
||||
if (Objects.equals(a, ZERO)) {
|
||||
return b;
|
||||
}
|
||||
|
||||
b.mergeFrom(a);
|
||||
return b;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<Relation> bufferEncoder() {
|
||||
return Encoders.kryo(Relation.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<Relation> outputEncoder() {
|
||||
return Encoders.kryo(Relation.class);
|
||||
}
|
||||
}
|
@ -0,0 +1,126 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.dedup.model.Block;
|
||||
import eu.dnetlib.dhp.oa.dedup.model.BlockStats;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkBlockStats extends AbstractSparkAction {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkBlockStats.class);
|
||||
|
||||
public SparkBlockStats(ArgumentApplicationParser parser, SparkSession spark) {
|
||||
super(parser, spark);
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkBlockStats.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
new SparkBlockStats(parser, getSparkSession(conf))
|
||||
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
|
||||
}
|
||||
|
||||
public Long computeComparisons(Long blockSize, Long slidingWindowSize) {
|
||||
|
||||
if (slidingWindowSize >= blockSize)
|
||||
return (slidingWindowSize * (slidingWindowSize - 1)) / 2;
|
||||
else {
|
||||
return (blockSize - slidingWindowSize + 1) * (slidingWindowSize * (slidingWindowSize - 1)) / 2;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run(ISLookUpService isLookUpService)
|
||||
throws DocumentException, IOException, ISLookUpException {
|
||||
|
||||
// read oozie parameters
|
||||
final String graphBasePath = parser.get("graphBasePath");
|
||||
final String isLookUpUrl = parser.get("isLookUpUrl");
|
||||
final String actionSetId = parser.get("actionSetId");
|
||||
final String workingPath = parser.get("workingPath");
|
||||
final int numPartitions = Optional
|
||||
.ofNullable(parser.get("numPartitions"))
|
||||
.map(Integer::valueOf)
|
||||
.orElse(NUM_PARTITIONS);
|
||||
|
||||
log.info("graphBasePath: '{}'", graphBasePath);
|
||||
log.info("isLookUpUrl: '{}'", isLookUpUrl);
|
||||
log.info("actionSetId: '{}'", actionSetId);
|
||||
log.info("workingPath: '{}'", workingPath);
|
||||
|
||||
// for each dedup configuration
|
||||
for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
|
||||
|
||||
final String subEntity = dedupConf.getWf().getSubEntityValue();
|
||||
log.info("Creating blockstats for: '{}'", subEntity);
|
||||
|
||||
final String outputPath = DedupUtility.createBlockStatsPath(workingPath, actionSetId, subEntity);
|
||||
removeOutputDir(spark, outputPath);
|
||||
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaPairRDD<String, MapDocument> mapDocuments = sc
|
||||
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
|
||||
.repartition(numPartitions)
|
||||
.mapToPair(
|
||||
(PairFunction<String, String, MapDocument>) s -> {
|
||||
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s);
|
||||
return new Tuple2<>(d.getIdentifier(), d);
|
||||
});
|
||||
|
||||
// create blocks for deduplication
|
||||
JavaRDD<BlockStats> blockStats = Deduper
|
||||
.createSortedBlocks(mapDocuments, dedupConf)
|
||||
.repartition(numPartitions)
|
||||
.map(b -> asBlockStats(dedupConf, b));
|
||||
|
||||
// save the blockstats in the workingdir
|
||||
spark
|
||||
.createDataset(blockStats.rdd(), Encoders.bean(BlockStats.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(outputPath);
|
||||
}
|
||||
}
|
||||
|
||||
private BlockStats asBlockStats(DedupConfig dedupConf, Tuple2<String, Block> b) {
|
||||
return new BlockStats(
|
||||
b._1(),
|
||||
(long) b._2().getDocuments().size(),
|
||||
computeComparisons(
|
||||
(long) b._2().getDocuments().size(), (long) dedupConf.getWf().getSlidingWindowSize()));
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,45 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.dedup.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class BlockStats implements Serializable {
|
||||
|
||||
private String key; // key of the block
|
||||
private Long size; // number of elements in the block
|
||||
private Long comparisons; // number of comparisons in the block
|
||||
|
||||
public BlockStats() {
|
||||
}
|
||||
|
||||
public BlockStats(String key, Long size, Long comparisons) {
|
||||
this.key = key;
|
||||
this.size = size;
|
||||
this.comparisons = comparisons;
|
||||
}
|
||||
|
||||
public String getKey() {
|
||||
return key;
|
||||
}
|
||||
|
||||
public void setKey(String key) {
|
||||
this.key = key;
|
||||
}
|
||||
|
||||
public Long getSize() {
|
||||
return size;
|
||||
}
|
||||
|
||||
public void setSize(Long size) {
|
||||
this.size = size;
|
||||
}
|
||||
|
||||
public Long getComparisons() {
|
||||
return comparisons;
|
||||
}
|
||||
|
||||
public void setComparisons(Long comparisons) {
|
||||
this.comparisons = comparisons;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,32 @@
|
||||
[
|
||||
{
|
||||
"paramName": "la",
|
||||
"paramLongName": "isLookUpUrl",
|
||||
"paramDescription": "address for the LookUp",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "asi",
|
||||
"paramLongName": "actionSetId",
|
||||
"paramDescription": "action set identifier (name of the orchestrator)",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "graphBasePath",
|
||||
"paramDescription": "the base path of the raw graph",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "w",
|
||||
"paramLongName": "workingPath",
|
||||
"paramDescription": "path of the working directory",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "np",
|
||||
"paramLongName": "numPartitions",
|
||||
"paramDescription": "number of partitions for the similarity relations intermediate phases",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
@ -0,0 +1,108 @@
|
||||
<workflow-app name="Create dedup blocks" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>graphBasePath</name>
|
||||
<description>the raw graph base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookUpUrl</name>
|
||||
<description>the address of the lookUp service</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>actionSetId</name>
|
||||
<description>id of the actionSet</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>numPartitions</name>
|
||||
<description>number of partitions for the similarity relations intermediate phases</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="CreateBlockStats"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="CreateBlockStats">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Create deduplication blocks</name>
|
||||
<class>eu.dnetlib.dhp.oa.dedup.SparkBlockStats</class>
|
||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}</arg>
|
||||
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -0,0 +1,177 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import static java.nio.file.Files.createTempDirectory;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.Mockito;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
public class SparkStatsTest implements Serializable {
|
||||
|
||||
@Mock(serializable = true)
|
||||
ISLookUpService isLookUpService;
|
||||
|
||||
private static SparkSession spark;
|
||||
private static JavaSparkContext jsc;
|
||||
|
||||
private static String testGraphBasePath;
|
||||
private static String testOutputBasePath;
|
||||
private static final String testActionSetId = "test-orchestrator";
|
||||
|
||||
@BeforeAll
|
||||
public static void cleanUp() throws IOException, URISyntaxException {
|
||||
|
||||
testGraphBasePath = Paths
|
||||
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/entities").toURI())
|
||||
.toFile()
|
||||
.getAbsolutePath();
|
||||
testOutputBasePath = createTempDirectory(SparkDedupTest.class.getSimpleName() + "-")
|
||||
.toAbsolutePath()
|
||||
.toString();
|
||||
|
||||
FileUtils.deleteDirectory(new File(testOutputBasePath));
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
conf.set("spark.sql.shuffle.partitions", "200");
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkDedupTest.class.getSimpleName())
|
||||
.master("local[*]")
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
|
||||
jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
}
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException, ISLookUpException {
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId)))
|
||||
.thenReturn(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml")));
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization")))
|
||||
.thenReturn(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication")))
|
||||
.thenReturn(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")));
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software")))
|
||||
.thenReturn(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json")));
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset")))
|
||||
.thenReturn(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json")));
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct")))
|
||||
.thenReturn(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json")));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void createBlockStatsTest() throws Exception {
|
||||
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkCreateSimRels.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json")));
|
||||
parser
|
||||
.parseArgument(
|
||||
new String[] {
|
||||
"-i", testGraphBasePath,
|
||||
"-asi", testActionSetId,
|
||||
"-la", "lookupurl",
|
||||
"-w", testOutputBasePath
|
||||
});
|
||||
|
||||
new SparkBlockStats(parser, spark).run(isLookUpService);
|
||||
|
||||
long orgs_blocks = spark
|
||||
.read()
|
||||
.textFile(testOutputBasePath + "/" + testActionSetId + "/organization_blockstats")
|
||||
.count();
|
||||
|
||||
long pubs_blocks = spark
|
||||
.read()
|
||||
.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_blockstats")
|
||||
.count();
|
||||
|
||||
long sw_blocks = spark
|
||||
.read()
|
||||
.textFile(testOutputBasePath + "/" + testActionSetId + "/software_blockstats")
|
||||
.count();
|
||||
|
||||
long ds_blocks = spark
|
||||
.read()
|
||||
.textFile(testOutputBasePath + "/" + testActionSetId + "/dataset_blockstats")
|
||||
.count();
|
||||
|
||||
long orp_blocks = spark
|
||||
.read()
|
||||
.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats")
|
||||
.count();
|
||||
|
||||
assertEquals(121, orgs_blocks);
|
||||
assertEquals(110, pubs_blocks);
|
||||
assertEquals(21, sw_blocks);
|
||||
assertEquals(67, ds_blocks);
|
||||
assertEquals(55, orp_blocks);
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,162 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.graph.merge;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* Combines the content from two aggregator graph tables of the same type, entities (or relationships) with the same ids
|
||||
* are picked preferring those from the BETA aggregator rather then from PROD. The identity of a relationship is defined
|
||||
* by eu.dnetlib.dhp.schema.common.ModelSupport#idFn()
|
||||
*/
|
||||
public class MergeGraphSparkJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static final String PRIORITY_DEFAULT = "BETA"; // BETA | PROD
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
CleanGraphSparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/merge_graphs_parameters.json"));
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
String priority = Optional
|
||||
.ofNullable(parser.get("priority"))
|
||||
.orElse(PRIORITY_DEFAULT);
|
||||
log.info("priority: {}", priority);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
String betaInputPath = parser.get("betaInputPath");
|
||||
log.info("betaInputPath: {}", betaInputPath);
|
||||
|
||||
String prodInputPath = parser.get("prodInputPath");
|
||||
log.info("prodInputPath: {}", prodInputPath);
|
||||
|
||||
String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
String graphTableClassName = parser.get("graphTableClassName");
|
||||
log.info("graphTableClassName: {}", graphTableClassName);
|
||||
|
||||
Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
conf.registerKryoClasses(ModelSupport.getOafModelClasses());
|
||||
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
removeOutputDir(spark, outputPath);
|
||||
mergeGraphTable(spark, priority, betaInputPath, prodInputPath, entityClazz, entityClazz, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static <P extends Oaf, B extends Oaf> void mergeGraphTable(
|
||||
SparkSession spark,
|
||||
String priority,
|
||||
String betaInputPath,
|
||||
String prodInputPath,
|
||||
Class<P> p_clazz,
|
||||
Class<B> b_clazz,
|
||||
String outputPath) {
|
||||
|
||||
Dataset<Tuple2<String, B>> beta = readTableFromPath(spark, betaInputPath, b_clazz);
|
||||
Dataset<Tuple2<String, P>> prod = readTableFromPath(spark, prodInputPath, p_clazz);
|
||||
|
||||
prod
|
||||
.joinWith(beta, prod.col("_1").equalTo(beta.col("_1")), "full_outer")
|
||||
.map((MapFunction<Tuple2<Tuple2<String, P>, Tuple2<String, B>>, P>) value -> {
|
||||
Optional<P> p = Optional.ofNullable(value._1()).map(Tuple2::_2);
|
||||
Optional<B> b = Optional.ofNullable(value._2()).map(Tuple2::_2);
|
||||
switch (priority) {
|
||||
default:
|
||||
case "BETA":
|
||||
return mergeWithPriorityToBETA(p, b);
|
||||
case "PROD":
|
||||
return mergeWithPriorityToPROD(p, b);
|
||||
}
|
||||
}, Encoders.bean(p_clazz))
|
||||
.filter((FilterFunction<P>) Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
private static <P extends Oaf, B extends Oaf> P mergeWithPriorityToPROD(Optional<P> p, Optional<B> b) {
|
||||
if (b.isPresent() & !p.isPresent()) {
|
||||
return (P) b.get();
|
||||
}
|
||||
if (p.isPresent()) {
|
||||
return p.get();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static <P extends Oaf, B extends Oaf> P mergeWithPriorityToBETA(Optional<P> p, Optional<B> b) {
|
||||
if (p.isPresent() & !b.isPresent()) {
|
||||
return p.get();
|
||||
}
|
||||
if (b.isPresent()) {
|
||||
return (P) b.get();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static <T extends Oaf> Dataset<Tuple2<String, T>> readTableFromPath(
|
||||
SparkSession spark, String inputEntityPath, Class<T> clazz) {
|
||||
|
||||
log.info("Reading Graph table from: {}", inputEntityPath);
|
||||
return spark
|
||||
.read()
|
||||
.textFile(inputEntityPath)
|
||||
.map(
|
||||
(MapFunction<String, Tuple2<String, T>>) value -> {
|
||||
final T t = OBJECT_MAPPER.readValue(value, clazz);
|
||||
final String id = ModelSupport.idFn().apply(t);
|
||||
return new Tuple2<>(id, t);
|
||||
},
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
|
||||
}
|
||||
|
||||
private static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,89 @@
|
||||
package eu.dnetlib.dhp.sx.ebi
|
||||
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
|
||||
import org.apache.spark.sql.{Encoder, Encoders}
|
||||
import org.apache.spark.sql.expressions.Aggregator
|
||||
|
||||
|
||||
|
||||
object EBIAggregator {
|
||||
|
||||
def getDatasetAggregator(): Aggregator[(String, OafDataset), OafDataset, OafDataset] = new Aggregator[(String, OafDataset), OafDataset, OafDataset]{
|
||||
|
||||
override def zero: OafDataset = new OafDataset()
|
||||
|
||||
override def reduce(b: OafDataset, a: (String, OafDataset)): OafDataset = {
|
||||
b.mergeFrom(a._2)
|
||||
if (b.getId == null)
|
||||
b.setId(a._2.getId)
|
||||
b
|
||||
}
|
||||
|
||||
|
||||
override def merge(wx: OafDataset, wy: OafDataset): OafDataset = {
|
||||
wx.mergeFrom(wy)
|
||||
if(wx.getId == null && wy.getId.nonEmpty)
|
||||
wx.setId(wy.getId)
|
||||
wx
|
||||
}
|
||||
override def finish(reduction: OafDataset): OafDataset = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[OafDataset] =
|
||||
Encoders.kryo(classOf[OafDataset])
|
||||
|
||||
override def outputEncoder: Encoder[OafDataset] =
|
||||
Encoders.kryo(classOf[OafDataset])
|
||||
}
|
||||
|
||||
|
||||
def getPublicationAggregator(): Aggregator[(String, Publication), Publication, Publication] = new Aggregator[(String, Publication), Publication, Publication]{
|
||||
|
||||
override def zero: Publication = new Publication()
|
||||
|
||||
override def reduce(b: Publication, a: (String, Publication)): Publication = {
|
||||
b.mergeFrom(a._2)
|
||||
if (b.getId == null)
|
||||
b.setId(a._2.getId)
|
||||
b
|
||||
}
|
||||
|
||||
|
||||
override def merge(wx: Publication, wy: Publication): Publication = {
|
||||
wx.mergeFrom(wy)
|
||||
if(wx.getId == null && wy.getId.nonEmpty)
|
||||
wx.setId(wy.getId)
|
||||
wx
|
||||
}
|
||||
override def finish(reduction: Publication): Publication = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[Publication] =
|
||||
Encoders.kryo(classOf[Publication])
|
||||
|
||||
override def outputEncoder: Encoder[Publication] =
|
||||
Encoders.kryo(classOf[Publication])
|
||||
}
|
||||
|
||||
|
||||
def getRelationAggregator(): Aggregator[(String, Relation), Relation, Relation] = new Aggregator[(String, Relation), Relation, Relation]{
|
||||
|
||||
override def zero: Relation = new Relation()
|
||||
|
||||
override def reduce(b: Relation, a: (String, Relation)): Relation = {
|
||||
a._2
|
||||
}
|
||||
|
||||
|
||||
override def merge(a: Relation, b: Relation): Relation = {
|
||||
if(b!= null) b else a
|
||||
}
|
||||
override def finish(reduction: Relation): Relation = reduction
|
||||
|
||||
override def bufferEncoder: Encoder[Relation] =
|
||||
Encoders.kryo(classOf[Relation])
|
||||
|
||||
override def outputEncoder: Encoder[Relation] =
|
||||
Encoders.kryo(classOf[Relation])
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,138 @@
|
||||
package eu.dnetlib.dhp.sx.ebi
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{Instance, KeyValue, Oaf}
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils.createQualifier
|
||||
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIRelation, OafUtils, ProvenaceInfo}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import eu.dnetlib.scholexplorer.relation.RelationMapper
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql._
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST.{JField, JObject, JString}
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object SparkAddLinkUpdates {
|
||||
|
||||
val relationMapper = RelationMapper.load
|
||||
|
||||
|
||||
case class EBILinks(relation:String, pubdate:String, tpid:String, tpidType:String, turl:String, title:String, publisher:String) {}
|
||||
|
||||
|
||||
def generatePubmedDLICollectedFrom(): KeyValue = {
|
||||
OafUtils.generateKeyValue("dli_________::europe_pmc__", "Europe PMC")
|
||||
}
|
||||
|
||||
|
||||
def ebiLinksToOaf(input:(String, String)):List[Oaf] = {
|
||||
val pmid :String = input._1
|
||||
val input_json :String = input._2
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
lazy val json: json4s.JValue = parse(input_json)
|
||||
|
||||
|
||||
val targets:List[EBILinks] = for {
|
||||
JObject(link) <- json \\ "Category" \\ "Link"
|
||||
JField("PublicationDate", JString(pubdate)) <- link
|
||||
JField("RelationshipType", JObject(relationshipType)) <- link
|
||||
JField("Name", JString(relname)) <- relationshipType
|
||||
JField("Target", JObject(target)) <- link
|
||||
JField("Identifier", JObject(identifier)) <- target
|
||||
JField("ID", JString(tpid)) <- identifier
|
||||
JField("IDScheme", JString(tpidtype)) <- identifier
|
||||
JField("IDURL", JString(turl)) <- identifier
|
||||
JField("Title", JString(title)) <- target
|
||||
JField("Publisher", JObject(pub)) <- target
|
||||
JField("Name", JString(publisher)) <- pub
|
||||
} yield EBILinks(relname, pubdate, tpid, tpidtype, turl,title, publisher)
|
||||
|
||||
|
||||
|
||||
val dnetPublicationId = s"50|${DHPUtils.md5(s"$pmid::pmid")}"
|
||||
|
||||
targets.flatMap(l => {
|
||||
val relation = new DLIRelation
|
||||
val inverseRelation = new DLIRelation
|
||||
val targetDnetId = s"50|${DHPUtils.md5(s"${l.tpid.toLowerCase.trim}::${l.tpidType.toLowerCase.trim}")}"
|
||||
val relInfo = relationMapper.get(l.relation.toLowerCase)
|
||||
val relationSemantic = relInfo.getOriginal
|
||||
val inverseRelationSemantic = relInfo.getInverse
|
||||
|
||||
relation.setSource(dnetPublicationId)
|
||||
relation.setTarget(targetDnetId)
|
||||
relation.setRelClass("datacite")
|
||||
relation.setRelType(relationSemantic)
|
||||
relation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
|
||||
|
||||
inverseRelation.setSource(targetDnetId)
|
||||
inverseRelation.setTarget(dnetPublicationId)
|
||||
inverseRelation.setRelClass("datacite")
|
||||
inverseRelation.setRelType(inverseRelationSemantic)
|
||||
inverseRelation.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
|
||||
|
||||
|
||||
|
||||
val d = new DLIDataset
|
||||
d.setId(targetDnetId)
|
||||
d.setDataInfo(OafUtils.generateDataInfo())
|
||||
d.setPid(List(OafUtils.createSP(l.tpid.toLowerCase.trim, l.tpidType.toLowerCase.trim, "dnet:pid_types")).asJava)
|
||||
d.setCompletionStatus("complete")
|
||||
val pi = new ProvenaceInfo
|
||||
pi.setId("dli_________::europe_pmc__")
|
||||
pi.setName( "Europe PMC")
|
||||
pi.setCompletionStatus("complete")
|
||||
pi.setCollectionMode("collected")
|
||||
d.setDlicollectedfrom(List(pi).asJava)
|
||||
d.setCollectedfrom(List(generatePubmedDLICollectedFrom()).asJava)
|
||||
d.setPublisher(OafUtils.asField(l.publisher))
|
||||
d.setTitle(List(OafUtils.createSP(l.title, "main title", "dnet:dataCite_title")).asJava)
|
||||
d.setDateofacceptance(OafUtils.asField(l.pubdate))
|
||||
val i = new Instance
|
||||
i.setCollectedfrom(generatePubmedDLICollectedFrom())
|
||||
i.setDateofacceptance(d.getDateofacceptance)
|
||||
i.setUrl(List(l.turl).asJava)
|
||||
i.setInstancetype(createQualifier("0021", "Dataset", "dnet:publication_resource", "dnet:publication_resource"))
|
||||
d.setInstance(List(i).asJava)
|
||||
List(relation, inverseRelation, d)
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
|
||||
val workingPath = parser.get("workingPath")
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
implicit val relEncoder: Encoder[DLIRelation] = Encoders.kryo(classOf[DLIRelation])
|
||||
implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo(classOf[DLIDataset])
|
||||
|
||||
val ds:Dataset[(String,String)] = spark.read.load(s"$workingPath/baseline_links_updates").as[(String,String)](Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||
|
||||
ds.flatMap(l =>ebiLinksToOaf(l)).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_oaf")
|
||||
|
||||
ds.filter(s => s.isInstanceOf)
|
||||
|
||||
|
||||
|
||||
val oDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/baseline_links_updates_oaf").as[Oaf]
|
||||
|
||||
oDataset.filter(p =>p.isInstanceOf[DLIRelation]).map(p => p.asInstanceOf[DLIRelation]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_relation")
|
||||
oDataset.filter(p =>p.isInstanceOf[DLIDataset]).map(p => p.asInstanceOf[DLIDataset]).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_links_updates_dataset")
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,49 @@
|
||||
package eu.dnetlib.dhp.sx.ebi
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import eu.dnetlib.dhp.sx.ebi.model.{PMArticle, PMAuthor, PMJournal, PMParser}
|
||||
|
||||
|
||||
import scala.io.Source
|
||||
import scala.xml.pull.XMLEventReader
|
||||
|
||||
object SparkCreateBaselineDataFrame {
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
val sc = spark.sparkContext
|
||||
|
||||
val workingPath = parser.get("workingPath")
|
||||
|
||||
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
|
||||
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
|
||||
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
|
||||
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline",2000)
|
||||
|
||||
val ds:Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i =>{
|
||||
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
|
||||
new PMParser(xml)
|
||||
|
||||
} ))
|
||||
|
||||
ds.write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,87 @@
|
||||
package eu.dnetlib.dhp.sx.ebi
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset}
|
||||
import eu.dnetlib.dhp.sx.graph.parser.{DatasetScholexplorerParser, PublicationScholexplorerParser}
|
||||
import eu.dnetlib.scholexplorer.relation.RelationMapper
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object SparkCreateEBIDataFrame {
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val logger: Logger = LoggerFactory.getLogger(SparkCreateEBIDataFrame.getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateEBIDataFrame.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/ebi/ebi_to_df_params.json")))
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkCreateEBIDataFrame.getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
val sc = spark.sparkContext
|
||||
|
||||
|
||||
val workingPath = parser.get("workingPath")
|
||||
val relationMapper = RelationMapper.load
|
||||
|
||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
|
||||
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
|
||||
implicit val pubEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
|
||||
implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
||||
|
||||
logger.info("Extract Publication and relation from publication_xml")
|
||||
val oafPubsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/publication_xml").map(s =>
|
||||
{
|
||||
new ObjectMapper().readValue(s, classOf[String])
|
||||
}).flatMap(s => {
|
||||
val d = new PublicationScholexplorerParser
|
||||
d.parseObject(s, relationMapper).asScala.iterator})
|
||||
|
||||
val mapper = new ObjectMapper()
|
||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||
spark.createDataset(oafPubsRDD).write.mode(SaveMode.Overwrite).save(s"$workingPath/oaf")
|
||||
|
||||
logger.info("Extract Publication and relation from dataset_xml")
|
||||
val oafDatsRDD:RDD[Oaf] = sc.textFile(s"$workingPath/dataset_xml").map(s =>
|
||||
{
|
||||
new ObjectMapper().readValue(s, classOf[String])
|
||||
}).flatMap(s => {
|
||||
val d = new DatasetScholexplorerParser
|
||||
d.parseObject(s, relationMapper).asScala.iterator})
|
||||
|
||||
spark.createDataset(oafDatsRDD).write.mode(SaveMode.Append).save(s"$workingPath/oaf")
|
||||
val dataset: Dataset[OafDataset] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[OafDataset]).map(d => d.asInstanceOf[OafDataset])
|
||||
val publication: Dataset[Publication] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[Publication]).map(d => d.asInstanceOf[Publication])
|
||||
val relations: Dataset[Relation] = spark.read.load(s"$workingPath/oaf").as[Oaf].filter(o => o.isInstanceOf[Relation]).map(d => d.asInstanceOf[Relation])
|
||||
publication.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, pubEncoder))
|
||||
.groupByKey(_._1)(Encoders.STRING)
|
||||
.agg(EBIAggregator.getPublicationAggregator().toColumn)
|
||||
.map(p => p._2)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/publication")
|
||||
|
||||
dataset.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datasetEncoder))
|
||||
.groupByKey(_._1)(Encoders.STRING)
|
||||
.agg(EBIAggregator.getDatasetAggregator().toColumn)
|
||||
.map(p => p._2)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/dataset")
|
||||
|
||||
relations.map(d => (s"${d.getSource}::${d.getRelType}::${d.getTarget}", d))(Encoders.tuple(Encoders.STRING, relEncoder))
|
||||
.groupByKey(_._1)(Encoders.STRING)
|
||||
.agg(EBIAggregator.getRelationAggregator().toColumn)
|
||||
.map(p => p._2)
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/relation")
|
||||
|
||||
|
||||
|
||||
relations.map(r => (r.getSource, r.getTarget))(Encoders.tuple(Encoders.STRING,Encoders.STRING))
|
||||
}
|
||||
}
|
@ -0,0 +1,64 @@
|
||||
|
||||
package eu.dnetlib.dhp.sx.ebi.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class PMArticle implements Serializable {
|
||||
|
||||
private String pmid;
|
||||
private String date;
|
||||
private PMJournal journal;
|
||||
private String title;
|
||||
private String description;
|
||||
private List<PMAuthor> authors = new ArrayList<>();
|
||||
|
||||
public String getPmid() {
|
||||
return pmid;
|
||||
}
|
||||
|
||||
public void setPmid(String pmid) {
|
||||
this.pmid = pmid;
|
||||
}
|
||||
|
||||
public String getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
public void setDate(String date) {
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
public PMJournal getJournal() {
|
||||
return journal;
|
||||
}
|
||||
|
||||
public void setJournal(PMJournal journal) {
|
||||
this.journal = journal;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
public void setDescription(String description) {
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
public List<PMAuthor> getAuthors() {
|
||||
return authors;
|
||||
}
|
||||
|
||||
public void setAuthors(List<PMAuthor> authors) {
|
||||
this.authors = authors;
|
||||
}
|
||||
}
|
@ -0,0 +1,31 @@
|
||||
|
||||
package eu.dnetlib.dhp.sx.ebi.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class PMAuthor implements Serializable {
|
||||
|
||||
private String lastName;
|
||||
private String foreName;
|
||||
|
||||
public String getLastName() {
|
||||
return lastName;
|
||||
}
|
||||
|
||||
public void setLastName(String lastName) {
|
||||
this.lastName = lastName;
|
||||
}
|
||||
|
||||
public String getForeName() {
|
||||
return foreName;
|
||||
}
|
||||
|
||||
public void setForeName(String foreName) {
|
||||
this.foreName = foreName;
|
||||
}
|
||||
|
||||
public String getFullName() {
|
||||
return String.format("%s, %s", this.foreName, this.lastName);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,53 @@
|
||||
|
||||
package eu.dnetlib.dhp.sx.ebi.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class PMJournal implements Serializable {
|
||||
|
||||
private String issn;
|
||||
private String volume;
|
||||
private String issue;
|
||||
private String date;
|
||||
private String title;
|
||||
|
||||
public String getIssn() {
|
||||
return issn;
|
||||
}
|
||||
|
||||
public void setIssn(String issn) {
|
||||
this.issn = issn;
|
||||
}
|
||||
|
||||
public String getVolume() {
|
||||
return volume;
|
||||
}
|
||||
|
||||
public void setVolume(String volume) {
|
||||
this.volume = volume;
|
||||
}
|
||||
|
||||
public String getIssue() {
|
||||
return issue;
|
||||
}
|
||||
|
||||
public void setIssue(String issue) {
|
||||
this.issue = issue;
|
||||
}
|
||||
|
||||
public String getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
public void setDate(String date) {
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
}
|
@ -0,0 +1,92 @@
|
||||
package eu.dnetlib.dhp.sx.ebi.model
|
||||
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
|
||||
class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
|
||||
|
||||
var currentArticle:PMArticle = generateNextArticle()
|
||||
|
||||
override def hasNext: Boolean = currentArticle!= null
|
||||
|
||||
override def next(): PMArticle = {
|
||||
val tmp = currentArticle
|
||||
currentArticle = generateNextArticle()
|
||||
tmp
|
||||
}
|
||||
|
||||
|
||||
def generateNextArticle():PMArticle = {
|
||||
|
||||
var currentAuthor: PMAuthor = null
|
||||
var currentJournal: PMJournal = null
|
||||
var currNode: String = null
|
||||
var currentYear = "0"
|
||||
var currentMonth = "01"
|
||||
var currentDay = "01"
|
||||
|
||||
while (xml.hasNext) {
|
||||
xml.next match {
|
||||
case EvElemStart(_, label, _, _) =>
|
||||
currNode = label
|
||||
label match {
|
||||
case "PubmedArticle" => currentArticle = new PMArticle
|
||||
case "Author" => currentAuthor = new PMAuthor
|
||||
case "Journal" => currentJournal = new PMJournal
|
||||
case _ =>
|
||||
}
|
||||
case EvElemEnd(_, label) =>
|
||||
label match {
|
||||
case "PubmedArticle" => return currentArticle
|
||||
case "Author" => currentArticle.getAuthors.add(currentAuthor)
|
||||
case "Journal" => currentArticle.setJournal(currentJournal)
|
||||
case "DateCompleted" => currentArticle.setDate(s"$currentYear-$currentMonth-$currentDay")
|
||||
case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay")
|
||||
case _ =>
|
||||
}
|
||||
case EvText(text) =>
|
||||
if (currNode!= null && text.trim.nonEmpty)
|
||||
currNode match {
|
||||
case "ArticleTitle" => {
|
||||
if (currentArticle.getTitle==null)
|
||||
currentArticle.setTitle(text.trim)
|
||||
else
|
||||
currentArticle.setTitle(currentArticle.getTitle + text.trim)
|
||||
}
|
||||
case "AbstractText" => {
|
||||
if (currentArticle.getDescription==null)
|
||||
currentArticle.setDescription(text.trim)
|
||||
else
|
||||
currentArticle.setDescription(currentArticle.getDescription + text.trim)
|
||||
}
|
||||
case "PMID" => currentArticle.setPmid(text.trim)
|
||||
case "ISSN" => currentJournal.setIssn(text.trim)
|
||||
case "Year" => currentYear = text.trim
|
||||
case "Month" => currentMonth = text.trim
|
||||
case "Day" => currentDay = text.trim
|
||||
case "Volume" => currentJournal.setVolume( text.trim)
|
||||
case "Issue" => currentJournal.setIssue (text.trim)
|
||||
case "LastName" => {
|
||||
if (currentAuthor != null)
|
||||
currentAuthor.setLastName(text.trim)
|
||||
|
||||
}
|
||||
case "ForeName" => if (currentAuthor != null)
|
||||
currentAuthor.setForeName(text.trim)
|
||||
case "Title" =>
|
||||
if (currentJournal.getTitle==null)
|
||||
currentJournal.setTitle(text.trim)
|
||||
else
|
||||
currentJournal.setTitle(currentJournal.getTitle + text.trim)
|
||||
case _ =>
|
||||
|
||||
}
|
||||
case _ =>
|
||||
}
|
||||
|
||||
}
|
||||
null
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -0,0 +1,18 @@
|
||||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue