Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop

This commit is contained in:
Sandro La Bruzzo 2020-06-19 15:11:53 +02:00
commit 464eeeec87
17 changed files with 464 additions and 216 deletions

View File

@ -24,11 +24,7 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
@ -45,10 +41,6 @@
<artifactId>dnet-pace-core</artifactId>
</dependency>
<dependency>
<groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
@ -61,7 +53,7 @@
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaire-broker-common</artifactId>
<version>[3.0.1,4.0.0)</version>
<version>[3.0.2,4.0.0)</version>
</dependency>
</dependencies>

View File

@ -31,12 +31,9 @@ import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OpenaireBrokerResultAg
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedEntityFactory;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig;
@ -75,25 +72,38 @@ public class GenerateEventsApplication {
log.info("dedupConfigProfileId: {}", dedupConfigProfileId);
final SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.registerKryoClasses(BrokerConstants.getModelClasses());
// conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
// conf.registerKryoClasses(BrokerConstants.getModelClasses());
final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId);
// TODO UNCOMMENT
// final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId);
final DedupConfig dedupConfig = null;
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
removeOutputDir(spark, eventsPath);
spark
.emptyDataset(Encoders.kryo(Event.class))
.union(generateEvents(spark, graphPath, Publication.class, dedupConfig))
.union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig))
.union(generateEvents(spark, graphPath, Software.class, dedupConfig))
.union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig))
// TODO REMOVE THIS
final Dataset<Project> projects = readPath(spark, graphPath + "/project", Project.class);
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
.cache();
relatedEntities(projects, rels, RelatedProject.class)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(eventsPath);
// TODO UNCOMMENT THIS
// spark
// .emptyDataset(Encoders.bean(Event.class))
// .union(generateEvents(spark, graphPath, Publication.class, dedupConfig))
// .union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig))
// .union(generateEvents(spark, graphPath, Software.class, dedupConfig))
// .union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig))
// .write()
// .mode(SaveMode.Overwrite)
// .option("compression", "gzip")
// .json(eventsPath);
});
}
@ -117,45 +127,48 @@ public class GenerateEventsApplication {
.toColumn();
return results
.joinWith(mergedRels, results.col("result.id").equalTo(mergedRels.col("source")), "inner")
.joinWith(mergedRels, results.col("openaireId").equalTo(mergedRels.col("source")), "inner")
.groupByKey(
(MapFunction<Tuple2<OpenaireBrokerResult, Relation>, String>) t -> t._2.getTarget(), Encoders.STRING())
.agg(aggr)
.map((MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class))
.filter(ResultGroup::isValid)
.map((MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.bean(ResultGroup.class))
.filter(rg -> rg.getData().size() > 1)
.map(
(MapFunction<ResultGroup, EventGroup>) g -> EventFinder.generateEvents(g, dedupConfig),
Encoders.kryo(EventGroup.class))
.flatMap(group -> group.getData().iterator(), Encoders.kryo(Event.class));
Encoders.bean(EventGroup.class))
.flatMap(group -> group.getData().iterator(), Encoders.bean(Event.class));
}
private static <SRC extends Result> Dataset<OpenaireBrokerResult> expandResultsWithRelations(
final SparkSession spark,
final String graphPath,
final Class<SRC> sourceClass) {
final Dataset<Project> projects = readPath(spark, graphPath + "/project", Project.class);
final Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> datasets = readPath(
spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
final Dataset<Software> softwares = readPath(spark, graphPath + "/software", Software.class);
final Dataset<Publication> publications = readPath(spark, graphPath + "/publication", Publication.class);
// final Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> datasets = readPath(
// spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
// final Dataset<Software> softwares = readPath(spark, graphPath + "/software", Software.class);
// final Dataset<Publication> publications = readPath(spark, graphPath + "/publication", Publication.class);
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
.cache();
final Dataset<OpenaireBrokerResult> r0 = readPath(
spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), Result.class)
spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
.filter(r -> r.getDataInfo().getDeletedbyinference())
.map(ConversionUtils::oafResultToBrokerResult, Encoders.kryo(OpenaireBrokerResult.class));
.map(ConversionUtils::oafResultToBrokerResult, Encoders.bean(OpenaireBrokerResult.class));
// TODO UNCOMMENT THIS
final Dataset<OpenaireBrokerResult> r1 = join(r0, rels, relatedEntities(projects, rels, RelatedProject.class));
final Dataset<OpenaireBrokerResult> r2 = join(r1, rels, relatedEntities(softwares, rels, RelatedProject.class));
final Dataset<OpenaireBrokerResult> r3 = join(r2, rels, relatedEntities(datasets, rels, RelatedProject.class));
final Dataset<OpenaireBrokerResult> r4 = join(
r3, rels, relatedEntities(publications, rels, RelatedProject.class));
;
// final Dataset<OpenaireBrokerResult> r2 = join(r1, rels, relatedEntities(softwares, rels,
// RelatedSoftware.class));
// final Dataset<OpenaireBrokerResult> r3 = join(r2, rels, relatedEntities(datasets, rels,
// RelatedDataset.class));
// final Dataset<OpenaireBrokerResult> r4 = join(r3, rels, relatedEntities(publications, rels,
// RelatedPublication.class));;
return r4;
return r0; // TODO it should be r4
}
private static <T, RT> Dataset<RT> relatedEntities(final Dataset<T> targets,
@ -165,7 +178,7 @@ public class GenerateEventsApplication {
.joinWith(targets, targets.col("id").equalTo(rels.col("target")), "inner")
.map(
t -> RelatedEntityFactory.newRelatedEntity(t._1.getSource(), t._1.getRelType(), t._2, clazz),
Encoders.kryo(clazz));
Encoders.bean(clazz));
}
private static <T> Dataset<OpenaireBrokerResult> join(final Dataset<OpenaireBrokerResult> sources,
@ -174,14 +187,14 @@ public class GenerateEventsApplication {
final TypedColumn<Tuple2<OpenaireBrokerResult, T>, OpenaireBrokerResult> aggr = new OpenaireBrokerResultAggregator<T>()
.toColumn();
;
return sources
.joinWith(typedRels, sources.col("result.id").equalTo(rels.col("source")), "left_outer")
.joinWith(typedRels, sources.col("openaireId").equalTo(rels.col("source")), "left_outer")
.groupByKey(
(MapFunction<Tuple2<OpenaireBrokerResult, T>, String>) t -> t._1.getOpenaireId(), Encoders.STRING())
.agg(aggr)
.map(t -> t._2, Encoders.kryo(OpenaireBrokerResult.class));
.map(t -> t._2, Encoders.bean(OpenaireBrokerResult.class));
}
public static <R> Dataset<R> readPath(
@ -195,6 +208,7 @@ public class GenerateEventsApplication {
}
private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception {
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
final String conf = isLookUpService

View File

@ -3,16 +3,18 @@ package eu.dnetlib.dhp.broker.oa.util;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Function;
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
import eu.dnetlib.broker.objects.TypedValue;
import eu.dnetlib.dhp.schema.oaf.Author;
@ -24,6 +26,7 @@ import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
@ -33,133 +36,133 @@ public class ConversionUtils {
private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class);
public static List<eu.dnetlib.broker.objects.Instance> oafInstanceToBrokerInstances(final Instance i) {
return i.getUrl().stream().map(url -> {
return new eu.dnetlib.broker.objects.Instance()
.setUrl(url)
.setInstancetype(i.getInstancetype().getClassid())
.setLicense(BrokerConstants.OPEN_ACCESS)
.setHostedby(i.getHostedby().getValue());
}).collect(Collectors.toList());
if (i == null) {
return new ArrayList<>();
}
return mappedList(i.getUrl(), url -> {
final eu.dnetlib.broker.objects.Instance res = new eu.dnetlib.broker.objects.Instance();
res.setUrl(url);
res.setInstancetype(classId(i.getInstancetype()));
res.setLicense(BrokerConstants.OPEN_ACCESS);
res.setHostedby(kvValue(i.getHostedby()));
return res;
});
}
public static TypedValue oafPidToBrokerPid(final StructuredProperty sp) {
return sp != null ? new TypedValue()
.setValue(sp.getValue())
.setType(sp.getQualifier().getClassid()) : null;
return oafStructPropToBrokerTypedValue(sp);
}
public static final Pair<String, String> oafSubjectToPair(final StructuredProperty sp) {
return sp != null ? Pair.of(sp.getQualifier().getClassid(), sp.getValue()) : null;
public static TypedValue oafStructPropToBrokerTypedValue(final StructuredProperty sp) {
return sp != null ? new TypedValue(classId(sp.getQualifier()), sp.getValue()) : null;
}
public static final eu.dnetlib.broker.objects.Dataset oafDatasetToBrokerDataset(final Dataset d) {
return d != null ? new eu.dnetlib.broker.objects.Dataset()
.setOriginalId(d.getOriginalId().get(0))
.setTitle(structPropValue(d.getTitle()))
.setPids(d.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList()))
.setInstances(
d
.getInstance()
.stream()
.map(ConversionUtils::oafInstanceToBrokerInstances)
.flatMap(List::stream)
.collect(Collectors.toList()))
.setCollectedFrom(d.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null))
: null;
if (d == null) {
return null;
}
final eu.dnetlib.broker.objects.Dataset res = new eu.dnetlib.broker.objects.Dataset();
res.setOriginalId(first(d.getOriginalId()));
res.setTitle(structPropValue(d.getTitle()));
res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid));
res.setInstances(flatMappedList(d.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
res.setCollectedFrom(mappedFirst(d.getCollectedfrom(), KeyValue::getValue));
return res;
}
public static eu.dnetlib.broker.objects.Publication oafPublicationToBrokerPublication(final Publication p) {
return p != null ? new eu.dnetlib.broker.objects.Publication()
.setOriginalId(p.getOriginalId().get(0))
.setTitle(structPropValue(p.getTitle()))
.setPids(p.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList()))
.setInstances(
p
.getInstance()
.stream()
.map(ConversionUtils::oafInstanceToBrokerInstances)
.flatMap(List::stream)
.collect(Collectors.toList()))
.setCollectedFrom(p.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null))
: null;
if (p == null) {
return null;
}
final eu.dnetlib.broker.objects.Publication res = new eu.dnetlib.broker.objects.Publication();
res.setOriginalId(first(p.getOriginalId()));
res.setTitle(structPropValue(p.getTitle()));
res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid));
res.setInstances(flatMappedList(p.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
res.setCollectedFrom(mappedFirst(p.getCollectedfrom(), KeyValue::getValue));
return res;
}
public static final OpenaireBrokerResult oafResultToBrokerResult(final Result result) {
if (result == null) {
return null;
}
return result != null ? new OpenaireBrokerResult()
.setOpenaireId(result.getId())
.setOriginalId(result.getOriginalId().get(0))
.setTypology(result.getResulttype().getClassid())
.setTitles(structPropList(result.getTitle()))
.setAbstracts(fieldList(result.getDescription()))
.setLanguage(result.getLanguage().getClassid())
.setSubjects(structPropTypedList(result.getSubject()))
.setCreators(
result.getAuthor().stream().map(ConversionUtils::oafAuthorToBrokerAuthor).collect(Collectors.toList()))
.setPublicationdate(result.getDateofacceptance().getValue())
.setPublisher(fieldValue(result.getPublisher()))
.setEmbargoenddate(fieldValue(result.getEmbargoenddate()))
.setContributor(fieldList(result.getContributor()))
final OpenaireBrokerResult res = new OpenaireBrokerResult();
res.setOpenaireId(result.getId());
res.setOriginalId(first(result.getOriginalId()));
res.setTypology(classId(result.getResulttype()));
res.setTitles(structPropList(result.getTitle()));
res.setAbstracts(fieldList(result.getDescription()));
res.setLanguage(classId(result.getLanguage()));
res.setSubjects(structPropTypedList(result.getSubject()));
res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor));
res.setPublicationdate(fieldValue(result.getDateofacceptance()));
res.setPublisher(fieldValue(result.getPublisher()));
res.setEmbargoenddate(fieldValue(result.getEmbargoenddate()));
res.setContributor(fieldList(result.getContributor()));
res
.setJournal(
result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null)
.setCollectedFromId(result.getCollectedfrom().stream().map(KeyValue::getKey).findFirst().orElse(null))
.setCollectedFromName(result.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null))
.setPids(result.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList()))
.setInstances(
result
.getInstance()
.stream()
.map(ConversionUtils::oafInstanceToBrokerInstances)
.flatMap(List::stream)
.collect(Collectors.toList()))
.setExternalReferences(
result
.getExternalReference()
.stream()
.map(ConversionUtils::oafExtRefToBrokerExtRef)
.collect(Collectors.toList()))
: null;
}
result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null);
res.setCollectedFromId(mappedFirst(result.getCollectedfrom(), KeyValue::getKey));
res.setCollectedFromName(mappedFirst(result.getCollectedfrom(), KeyValue::getValue));
res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid));
res.setInstances(flatMappedList(result.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
res.setExternalReferences(mappedList(result.getExternalReference(), ConversionUtils::oafExtRefToBrokerExtRef));
private static List<TypedValue> structPropTypedList(final List<StructuredProperty> list) {
return list
.stream()
.map(
p -> new TypedValue()
.setValue(p.getValue())
.setType(p.getQualifier().getClassid()))
.collect(Collectors.toList());
return res;
}
private static eu.dnetlib.broker.objects.Author oafAuthorToBrokerAuthor(final Author author) {
return author != null ? new eu.dnetlib.broker.objects.Author()
.setFullname(author.getFullname())
.setOrcid(
author
.getPid()
.stream()
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
.map(pid -> pid.getValue())
.findFirst()
.orElse(null))
: null;
if (author == null) {
return null;
}
final String pids = author.getPid() != null ? author
.getPid()
.stream()
.filter(pid -> pid != null)
.filter(pid -> pid.getQualifier() != null)
.filter(pid -> pid.getQualifier().getClassid() != null)
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
.map(pid -> pid.getValue())
.filter(StringUtils::isNotBlank)
.findFirst()
.orElse(null) : null;
return new eu.dnetlib.broker.objects.Author(author.getFullname(), pids);
}
private static eu.dnetlib.broker.objects.Journal oafJournalToBrokerJournal(final Journal journal) {
return journal != null ? new eu.dnetlib.broker.objects.Journal()
.setName(journal.getName())
.setIssn(journal.getIssnPrinted())
.setEissn(journal.getIssnOnline())
.setLissn(journal.getIssnLinking()) : null;
if (journal == null) {
return null;
}
final eu.dnetlib.broker.objects.Journal res = new eu.dnetlib.broker.objects.Journal();
res.setName(journal.getName());
res.setIssn(journal.getIssnPrinted());
res.setEissn(journal.getIssnOnline());
res.setLissn(journal.getIssnLinking());
return res;
}
private static eu.dnetlib.broker.objects.ExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) {
return ref != null ? new eu.dnetlib.broker.objects.ExternalReference()
.setRefidentifier(ref.getRefidentifier())
.setSitename(ref.getSitename())
.setType(ref.getQualifier().getClassid())
.setUrl(ref.getUrl())
: null;
if (ref == null) {
return null;
}
final eu.dnetlib.broker.objects.ExternalReference res = new eu.dnetlib.broker.objects.ExternalReference();
res.setRefidentifier(ref.getRefidentifier());
res.setSitename(ref.getSitename());
res.setType(classId(ref.getQualifier()));
res.setUrl(ref.getUrl());
return res;
}
public static final eu.dnetlib.broker.objects.Project oafProjectToBrokerProject(final Project p) {
@ -167,10 +170,10 @@ public class ConversionUtils {
return null;
}
final eu.dnetlib.broker.objects.Project res = new eu.dnetlib.broker.objects.Project()
.setTitle(fieldValue(p.getTitle()))
.setAcronym(fieldValue(p.getAcronym()))
.setCode(fieldValue(p.getCode()));
final eu.dnetlib.broker.objects.Project res = new eu.dnetlib.broker.objects.Project();
res.setTitle(fieldValue(p.getTitle()));
res.setAcronym(fieldValue(p.getAcronym()));
res.setCode(fieldValue(p.getCode()));
final String ftree = fieldValue(p.getFundingtree());
if (StringUtils.isNotBlank(ftree)) {
@ -188,12 +191,25 @@ public class ConversionUtils {
}
public static final eu.dnetlib.broker.objects.Software oafSoftwareToBrokerSoftware(final Software sw) {
return sw != null ? new eu.dnetlib.broker.objects.Software()
.setName(structPropValue(sw.getTitle()))
.setDescription(fieldValue(sw.getDescription()))
.setRepository(fieldValue(sw.getCodeRepositoryUrl()))
.setLandingPage(fieldValue(sw.getDocumentationUrl()))
: null;
if (sw == null) {
return null;
}
final eu.dnetlib.broker.objects.Software res = new eu.dnetlib.broker.objects.Software();
res.setName(structPropValue(sw.getTitle()));
res.setDescription(fieldValue(sw.getDescription()));
res.setRepository(fieldValue(sw.getCodeRepositoryUrl()));
res.setLandingPage(fieldValue(sw.getDocumentationUrl()));
return res;
}
/**
 * Returns the first element of a list of strings.
 *
 * @param list the list to read from, may be null
 * @return the element at index 0, or null when the list is null or empty
 */
private static String first(final List<String> list) {
    if (list == null || list.isEmpty()) {
        return null;
    }
    return list.get(0);
}
/**
 * Null-safe accessor for the value of a key/value pair.
 *
 * @param kv the pair to read, may be null
 * @return {@code kv.getValue()}, or null when {@code kv} itself is null
 */
private static String kvValue(final KeyValue kv) {
    if (kv == null) {
        return null;
    }
    return kv.getValue();
}
private static String fieldValue(final Field<String> f) {
@ -205,6 +221,10 @@ public class ConversionUtils {
: null;
}
/**
 * Null-safe accessor for the class id of a qualifier.
 *
 * @param q the qualifier to read, may be null
 * @return {@code q.getClassid()}, or null when {@code q} itself is null
 */
private static String classId(final Qualifier q) {
    if (q == null) {
        return null;
    }
    return q.getClassid();
}
private static String structPropValue(final List<StructuredProperty> props) {
return props != null
? props.stream().map(StructuredProperty::getValue).filter(StringUtils::isNotBlank).findFirst().orElse(null)
@ -226,4 +246,55 @@ public class ConversionUtils {
.collect(Collectors.toList())
: new ArrayList<>();
}
/**
 * Converts a list of structured properties into broker typed values.
 *
 * @param list the properties to convert, may be null
 * @return a new list holding the non-null conversion results in source order; empty when {@code list} is null
 */
private static List<TypedValue> structPropTypedList(final List<StructuredProperty> list) {
    final List<TypedValue> typedValues = new ArrayList<>();
    if (list == null) {
        return typedValues;
    }
    for (final StructuredProperty prop : list) {
        // the converter yields null for a null property: such entries are dropped
        final TypedValue converted = ConversionUtils.oafStructPropToBrokerTypedValue(prop);
        if (converted != null) {
            typedValues.add(converted);
        }
    }
    return typedValues;
}
/**
 * Applies {@code func} to every element of {@code list} and keeps the non-null results.
 *
 * @param list the source elements, may be null; null elements are still handed to {@code func}
 * @param func the mapping to apply
 * @return a new list with the non-null mapped values in source order; empty when {@code list} is null
 */
private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func) {
    final List<T> mapped = new ArrayList<>();
    if (list == null) {
        return mapped;
    }
    // fail fast on a null mapper even when the list is empty, as binding func::apply did
    Objects.requireNonNull(func);
    for (final F item : list) {
        final T value = func.apply(item);
        if (value != null) {
            mapped.add(value);
        }
    }
    return mapped;
}
/**
 * Applies {@code func} to every element of {@code list} and concatenates the returned lists.
 *
 * @param list the source elements, may be null; null elements are still handed to {@code func}
 * @param func the mapping to apply; it may return null for an element, which contributes nothing
 * @return a new list with all non-null items of the returned lists, in source order; empty when
 *         {@code list} is null
 */
private static <F, T> List<T> flatMappedList(final List<F> list, final Function<F, List<T>> func) {
    if (list == null) {
        return new ArrayList<>();
    }
    return list
        .stream()
        .map(func::apply)
        // List::stream would throw a NullPointerException on a null list, so drop those first
        .filter(Objects::nonNull)
        .flatMap(List::stream)
        .filter(Objects::nonNull)
        .collect(Collectors.toList());
}
/**
 * Returns the first non-null result of applying {@code func} to the elements of {@code list}.
 *
 * @param list the source elements, may be null; null elements are still handed to {@code func}
 * @param func the mapping to apply
 * @return the first non-null mapped value in source order, or null when {@code list} is null or no
 *         element maps to a non-null value
 */
private static <F, T> T mappedFirst(final List<F> list, final Function<F, T> func) {
    if (list == null) {
        return null;
    }
    // fail fast on a null mapper even when the list is empty, as binding func::apply did
    Objects.requireNonNull(func);
    for (final F item : list) {
        final T value = func.apply(item);
        if (value != null) {
            // stop at the first hit: later elements are never mapped
            return value;
        }
    }
    return null;
}
}

View File

@ -63,8 +63,14 @@ public final class UpdateInfo<T> {
return target;
}
private float calculateTrust(final DedupConfig dedupConfig, final OpenaireBrokerResult r1,
private float calculateTrust(final DedupConfig dedupConfig,
final OpenaireBrokerResult r1,
final OpenaireBrokerResult r2) {
if (dedupConfig == null) {
return BrokerConstants.MIN_TRUST;
}
try {
final ObjectMapper objectMapper = new ObjectMapper();
final MapDocument doc1 = MapDocumentUtil
@ -116,13 +122,15 @@ public final class UpdateInfo<T> {
.orElse(null);
;
final Provenance provenance = new Provenance().setId(provId).setRepositoryName(provRepo).setUrl(provUrl);
final Provenance provenance = new Provenance(provId, provRepo, provUrl);
return new OpenAireEventPayload()
.setPublication(target)
.setHighlight(hl)
.setTrust(trust)
.setProvenance(provenance);
final OpenAireEventPayload res = new OpenAireEventPayload();
res.setResult(target);
res.setHighlight(hl);
res.setTrust(trust);
res.setProvenance(provenance);
return res;
}
}

View File

@ -23,12 +23,14 @@ public class ResultAggregator extends Aggregator<Tuple2<OpenaireBrokerResult, Re
@Override
public ResultGroup reduce(final ResultGroup group, final Tuple2<OpenaireBrokerResult, Relation> t) {
return group.addElement(t._1);
group.getData().add(t._1);
return group;
}
@Override
public ResultGroup merge(final ResultGroup g1, final ResultGroup g2) {
return g1.addGroup(g2);
g1.getData().addAll(g2.getData());
return g1;
}
@Override
@ -38,13 +40,13 @@ public class ResultAggregator extends Aggregator<Tuple2<OpenaireBrokerResult, Re
@Override
public Encoder<ResultGroup> bufferEncoder() {
return Encoders.kryo(ResultGroup.class);
return Encoders.bean(ResultGroup.class);
}
@Override
public Encoder<ResultGroup> outputEncoder() {
return Encoders.kryo(ResultGroup.class);
return Encoders.bean(ResultGroup.class);
}

View File

@ -14,23 +14,14 @@ public class ResultGroup implements Serializable {
*/
private static final long serialVersionUID = -3360828477088669296L;
private final List<OpenaireBrokerResult> data = new ArrayList<>();
private List<OpenaireBrokerResult> data = new ArrayList<>();
public List<OpenaireBrokerResult> getData() {
return data;
}
public ResultGroup addElement(final OpenaireBrokerResult elem) {
data.add(elem);
return this;
public void setData(final List<OpenaireBrokerResult> data) {
this.data = data;
}
public ResultGroup addGroup(final ResultGroup group) {
data.addAll(group.getData());
return this;
}
public boolean isValid() {
return data.size() > 1;
}
}

View File

@ -58,12 +58,12 @@ public class OpenaireBrokerResultAggregator<T>
@Override
public Encoder<OpenaireBrokerResult> bufferEncoder() {
return Encoders.kryo(OpenaireBrokerResult.class);
return Encoders.bean(OpenaireBrokerResult.class);
}
@Override
public Encoder<OpenaireBrokerResult> outputEncoder() {
return Encoders.kryo(OpenaireBrokerResult.class);
return Encoders.bean(OpenaireBrokerResult.class);
}
}

View File

@ -11,9 +11,12 @@ public class RelatedDataset implements Serializable {
*
*/
private static final long serialVersionUID = 774487705184038324L;
private final String source;
private final String relType;
private final Dataset relDataset;
private String source;
private String relType;
private Dataset relDataset;
public RelatedDataset() {
}
public RelatedDataset(final String source, final String relType, final Dataset relDataset) {
this.source = source;
@ -25,12 +28,24 @@ public class RelatedDataset implements Serializable {
return source;
}
public void setSource(final String source) {
this.source = source;
}
public String getRelType() {
return relType;
}
public void setRelType(final String relType) {
this.relType = relType;
}
public Dataset getRelDataset() {
return relDataset;
}
public void setRelDataset(final Dataset relDataset) {
this.relDataset = relDataset;
}
}

View File

@ -12,9 +12,12 @@ public class RelatedProject implements Serializable {
*/
private static final long serialVersionUID = 4941437626549329870L;
private final String source;
private final String relType;
private final Project relProject;
private String source;
private String relType;
private Project relProject;
public RelatedProject() {
}
public RelatedProject(final String source, final String relType, final Project relProject) {
this.source = source;
@ -26,12 +29,24 @@ public class RelatedProject implements Serializable {
return source;
}
public void setSource(final String source) {
this.source = source;
}
public String getRelType() {
return relType;
}
public void setRelType(final String relType) {
this.relType = relType;
}
public Project getRelProject() {
return relProject;
}
public void setRelProject(final Project relProject) {
this.relProject = relProject;
}
}

View File

@ -12,9 +12,12 @@ public class RelatedPublication implements Serializable {
*/
private static final long serialVersionUID = 9021609640411395128L;
private final String source;
private final String relType;
private final Publication relPublication;
private String source;
private String relType;
private Publication relPublication;
public RelatedPublication() {
}
public RelatedPublication(final String source, final String relType, final Publication relPublication) {
this.source = source;
@ -26,12 +29,24 @@ public class RelatedPublication implements Serializable {
return source;
}
public void setSource(final String source) {
this.source = source;
}
public String getRelType() {
return relType;
}
public void setRelType(final String relType) {
this.relType = relType;
}
public Publication getRelPublication() {
return relPublication;
}
public void setRelPublication(final Publication relPublication) {
this.relPublication = relPublication;
}
}

View File

@ -11,9 +11,12 @@ public class RelatedSoftware implements Serializable {
*
*/
private static final long serialVersionUID = 7573383356943300157L;
private final String source;
private final String relType;
private final Software relSoftware;
private String source;
private String relType;
private Software relSoftware;
public RelatedSoftware() {
}
public RelatedSoftware(final String source, final String relType, final Software relSoftware) {
this.source = source;
@ -25,12 +28,24 @@ public class RelatedSoftware implements Serializable {
return source;
}
public void setSource(final String source) {
this.source = source;
}
public String getRelType() {
return relType;
}
public void setRelType(final String relType) {
this.relType = relType;
}
public Software getRelSoftware() {
return relSoftware;
}
public void setRelSoftware(final Software relSoftware) {
this.relSoftware = relSoftware;
}
}

View File

@ -78,21 +78,33 @@
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="generate_events">
<java>
<prepare>
<delete path="${eventsOutputPath}"/>
</prepare>
<main-class>eu.dnetlib.dhp.broker.oa.GenerateEventsApplication</main-class>
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>GenerateEvents</name>
<class>eu.dnetlib.dhp.broker.oa.GenerateEventsApplication</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--eventsPath</arg><arg>${eventsOutputPath}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--dedupConfProfile</arg><arg>${dedupConfProfId}</arg>
</java>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>

View File

@ -3,9 +3,13 @@ package eu.dnetlib.dhp.oa.graph.clean;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.BufferedInputStream;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
@ -19,7 +23,9 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -84,12 +90,83 @@ public class CleanGraphSparkJob {
readTableFromPath(spark, inputPath, clazz)
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
.map((MapFunction<T, T>) value -> fixDefaults(value), Encoders.bean(clazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
/**
 * Fills in default qualifiers where an entity lacks them. The entity is modified in place.
 *
 * <ul>
 * <li>{@code Organization}: a null country, or one with a blank classid, is replaced by UNKNOWN.</li>
 * <li>{@code Result}: a missing language becomes "und"; subjects that are null, have a blank value, or have no
 * qualifier classid are removed; a missing resource type, best access right, or per-instance access right
 * becomes UNKNOWN.</li>
 * </ul>
 * {@code Datasource}, {@code Project} and {@code Relation} values are returned untouched.
 *
 * @param value the entity to patch
 * @return the same {@code value} instance
 */
private static <T extends Oaf> T fixDefaults(T value) {
    if (value instanceof Datasource) {
        // nothing to clean here
    } else if (value instanceof Project) {
        // nothing to clean here
    } else if (value instanceof Organization) {
        Organization o = (Organization) value;
        if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
            o.setCountry(qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_COUNTRY_TYPE));
        }
    } else if (value instanceof Relation) {
        // nothing to clean here
    } else if (value instanceof Result) {
        Result r = (Result) value;
        if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
            r.setLanguage(qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
        }
        if (Objects.nonNull(r.getSubject())) {
            // keep only subjects that carry both a value and a classified qualifier
            r
                .setSubject(
                    r
                        .getSubject()
                        .stream()
                        .filter(Objects::nonNull)
                        .filter(sp -> StringUtils.isNotBlank(sp.getValue()))
                        .filter(sp -> Objects.nonNull(sp.getQualifier()))
                        .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
                        .collect(Collectors.toList()));
        }
        if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
            r.setResourcetype(qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
        }
        if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
            r.setBestaccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
        }
        if (Objects.nonNull(r.getInstance())) {
            for (Instance i : r.getInstance()) {
                if (Objects.isNull(i)) {
                    // tolerate null entries instead of failing the whole record, as done for subjects
                    continue;
                }
                if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
                    i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
                }
            }
        }
    }
    return value;
}
/**
 * Shorthand for building a Qualifier through OafMapperUtils where the same scheme
 * string is passed for both of the last two arguments.
 *
 * @param classid the class identifier
 * @param classname the class name
 * @param scheme the value used for both scheme arguments
 * @return the qualifier produced by OafMapperUtils
 */
private static Qualifier qualifier(String classid, String classname, String scheme) {
	final Qualifier built = OafMapperUtils.qualifier(classid, classname, scheme, scheme);
	return built;
}
private static <T extends Oaf> Dataset<T> readTableFromPath(
SparkSession spark, String inputEntityPath, Class<T> clazz) {

View File

@ -4,10 +4,13 @@ package eu.dnetlib.dhp.oa.graph.clean;
import java.io.Serializable;
import java.util.HashMap;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class CleaningRuleMap extends HashMap<Class, SerializableConsumer<Object>> implements Serializable {
@ -18,23 +21,24 @@ public class CleaningRuleMap extends HashMap<Class, SerializableConsumer<Object>
*/
public static CleaningRuleMap create(VocabularyGroup vocabularies) {
CleaningRuleMap mapping = new CleaningRuleMap();
mapping.put(Qualifier.class, o -> {
Qualifier q = (Qualifier) o;
if (vocabularies.vocabularyExists(q.getSchemeid())) {
Qualifier newValue = vocabularies.lookup(q.getSchemeid(), q.getClassid());
q.setClassid(newValue.getClassid());
q.setClassname(newValue.getClassname());
mapping.put(Qualifier.class, o -> cleanQualifier(vocabularies, (Qualifier) o));
mapping.put(Country.class, o -> {
final Country c = (Country) o;
if (StringUtils.isBlank(c.getSchemeid())) {
c.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE);
c.setSchemename(ModelConstants.DNET_COUNTRY_TYPE);
}
});
mapping.put(StructuredProperty.class, o -> {
StructuredProperty sp = (StructuredProperty) o;
// TODO implement a policy
/*
* if (StringUtils.isBlank(sp.getValue())) { sp.setValue(null); sp.setQualifier(null); sp.setDataInfo(null);
* }
*/
cleanQualifier(vocabularies, c);
});
return mapping;
}
/**
 * Replaces the classid and classname of the given qualifier with the ones returned by
 * the vocabulary lookup, provided a vocabulary exists for the qualifier's scheme id.
 * The qualifier is left unchanged otherwise.
 *
 * @param vocabularies the vocabularies used for the lookup
 * @param q the qualifier to update in place
 */
private static <Q extends Qualifier> void cleanQualifier(VocabularyGroup vocabularies, Q q) {
	final String schemeId = q.getSchemeid();
	if (!vocabularies.vocabularyExists(schemeId)) {
		// no vocabulary for this scheme
		return;
	}
	final Qualifier resolved = vocabularies.lookup(schemeId, q.getClassid());
	q.setClassid(resolved.getClassid());
	q.setClassname(resolved.getClassname());
}
}

View File

@ -122,7 +122,11 @@ public class VocabularyGroup implements Serializable {
}
/**
 * Tells whether a vocabulary is registered under the given identifier.
 * The identifier is lower-cased before the lookup.
 *
 * @param vocId the vocabulary identifier, may be null
 * @return true if a vocabulary is registered for the lower-cased identifier,
 *         false otherwise or when the identifier is null
 */
public boolean vocabularyExists(final String vocId) {
	// guard first: lower-casing a null identifier would throw a NullPointerException
	if (vocId == null) {
		return false;
	}
	return vocs.containsKey(vocId.toLowerCase());
}
private void addSynonyms(final String vocId, final String termId, final String syn) {

View File

@ -21,6 +21,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -56,6 +57,9 @@ public class CleaningFunctionTest {
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
Publication p_in = MAPPER.readValue(json, Publication.class);
assertTrue(p_in instanceof Result);
assertTrue(p_in instanceof Publication);
Publication p_out = OafCleaner.apply(p_in, mapping);
assertNotNull(p_out);
@ -63,6 +67,9 @@ public class CleaningFunctionTest {
assertEquals("und", p_out.getLanguage().getClassid());
assertEquals("Undetermined", p_out.getLanguage().getClassname());
assertEquals("DE", p_out.getCountry().get(0).getClassid());
assertEquals("Germany", p_out.getCountry().get(0).getClassname());
assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());

View File

@ -202,6 +202,12 @@
"contributor": [
],
"country": [
{
"classid": "DE",
"classname": "DE",
"schemeid": "dnet:countries",
"schemename": "dnet:countries"
}
],
"coverage": [
],