forked from antonis.lempesis/dnet-hadoop
merge DOIBoost
This commit is contained in:
parent
e46e2a4776
commit
e34e7d6728
|
@ -36,8 +36,9 @@ public class EventFactory {
|
|||
|
||||
final Map<String, Object> map = createMapFromResult(updateInfo);
|
||||
|
||||
final String eventId =
|
||||
calculateEventId(updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId().get(0), updateInfo.getHighlightValueAsString());
|
||||
final String eventId = calculateEventId(
|
||||
updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId().get(0),
|
||||
updateInfo.getHighlightValueAsString());
|
||||
|
||||
res.setEventId(eventId);
|
||||
res.setProducerId(PRODUCER_ID);
|
||||
|
@ -80,13 +81,17 @@ public class EventFactory {
|
|||
final List<StructuredProperty> subjects = target.getSubject();
|
||||
if (subjects.size() > 0) {
|
||||
map
|
||||
.put("target_publication_subject_list", subjects.stream().map(StructuredProperty::getValue).collect(Collectors.toList()));
|
||||
.put(
|
||||
"target_publication_subject_list",
|
||||
subjects.stream().map(StructuredProperty::getValue).collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
final List<Author> authors = target.getAuthor();
|
||||
if (authors.size() > 0) {
|
||||
map
|
||||
.put("target_publication_author_list", authors.stream().map(Author::getFullname).collect(Collectors.toList()));
|
||||
.put(
|
||||
"target_publication_author_list",
|
||||
authors.stream().map(Author::getFullname).collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
// PROVENANCE INFO
|
||||
|
@ -113,7 +118,9 @@ public class EventFactory {
|
|||
}
|
||||
|
||||
private static long parseDateTolong(final String date) {
|
||||
if (StringUtils.isBlank(date)) { return -1; }
|
||||
if (StringUtils.isBlank(date)) {
|
||||
return -1;
|
||||
}
|
||||
try {
|
||||
return DateUtils.parseDate(date, DATE_PATTERNS).getTime();
|
||||
} catch (final ParseException e) {
|
||||
|
|
|
@ -87,32 +87,25 @@ public class GenerateEventsApplication {
|
|||
private static final UpdateMatcher<Pair<Result, List<Software>>, ?> enrichMoreSoftware = new EnrichMoreSoftware();
|
||||
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMisissingPublicationIsRelatedTo = new EnrichMissingPublicationIsRelatedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsReferencedBy =
|
||||
new EnrichMissingPublicationIsReferencedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsReferencedBy = new EnrichMissingPublicationIsReferencedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationReferences = new EnrichMissingPublicationReferences();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedTo =
|
||||
new EnrichMissingPublicationIsSupplementedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedBy =
|
||||
new EnrichMissingPublicationIsSupplementedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedTo = new EnrichMissingPublicationIsSupplementedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedBy = new EnrichMissingPublicationIsSupplementedBy();
|
||||
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMisissingDatasetIsRelatedTo =
|
||||
new EnrichMissingDatasetIsRelatedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsReferencedBy =
|
||||
new EnrichMissingDatasetIsReferencedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetReferences =
|
||||
new EnrichMissingDatasetReferences();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedTo =
|
||||
new EnrichMissingDatasetIsSupplementedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedBy =
|
||||
new EnrichMissingDatasetIsSupplementedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMisissingDatasetIsRelatedTo = new EnrichMissingDatasetIsRelatedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsReferencedBy = new EnrichMissingDatasetIsReferencedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetReferences = new EnrichMissingDatasetReferences();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedTo = new EnrichMissingDatasetIsSupplementedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedBy = new EnrichMissingDatasetIsSupplementedBy();
|
||||
|
||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(GenerateEventsApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json")));
|
||||
.toString(
|
||||
GenerateEventsApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
|
@ -156,8 +149,9 @@ public class GenerateEventsApplication {
|
|||
final String graphPath,
|
||||
final Class<R> resultClazz) {
|
||||
|
||||
final Dataset<R> results = readPath(spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), resultClazz)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference());
|
||||
final Dataset<R> results = readPath(
|
||||
spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), resultClazz)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference());
|
||||
|
||||
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||
|
@ -196,15 +190,18 @@ public class GenerateEventsApplication {
|
|||
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static <SRC extends Result, TRG extends OafEntity> JavaRDD<Event> generateRelationEvents(final SparkSession spark,
|
||||
private static <SRC extends Result, TRG extends OafEntity> JavaRDD<Event> generateRelationEvents(
|
||||
final SparkSession spark,
|
||||
final String graphPath,
|
||||
final Class<SRC> sourceClass,
|
||||
final Class<TRG> targetClass) {
|
||||
|
||||
final Dataset<SRC> sources = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference());
|
||||
final Dataset<SRC> sources = readPath(
|
||||
spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference());
|
||||
|
||||
final Dataset<TRG> targets = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), targetClass);
|
||||
final Dataset<TRG> targets = readPath(
|
||||
spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), targetClass);
|
||||
|
||||
final Dataset<Relation> mergedRels = readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||
|
|
|
@ -28,7 +28,8 @@ public class EnrichMissingProject
|
|||
if (source.getRight().isEmpty()) {
|
||||
return Arrays.asList();
|
||||
} else {
|
||||
return target.getRight()
|
||||
return target
|
||||
.getRight()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafProjectToBrokerProject)
|
||||
.map(p -> generateUpdateInfo(p, source, target))
|
||||
|
|
|
@ -24,12 +24,14 @@ public class EnrichMoreProject extends UpdateMatcher<Pair<Result, List<Project>>
|
|||
protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final Pair<Result, List<Project>> source,
|
||||
final Pair<Result, List<Project>> target) {
|
||||
|
||||
final Set<String> existingProjects = source.getRight()
|
||||
final Set<String> existingProjects = source
|
||||
.getRight()
|
||||
.stream()
|
||||
.map(Project::getId)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return target.getRight()
|
||||
return target
|
||||
.getRight()
|
||||
.stream()
|
||||
.filter(p -> !existingProjects.contains(p.getId()))
|
||||
.map(ConversionUtils::oafProjectToBrokerProject)
|
||||
|
|
|
@ -29,7 +29,8 @@ public class EnrichMissingSoftware
|
|||
if (source.getRight().isEmpty()) {
|
||||
return Arrays.asList();
|
||||
} else {
|
||||
return target.getRight()
|
||||
return target
|
||||
.getRight()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafSoftwareToBrokerSoftware)
|
||||
.map(p -> generateUpdateInfo(p, source, target))
|
||||
|
|
|
@ -26,12 +26,14 @@ public class EnrichMoreSoftware
|
|||
final Pair<Result, List<Software>> source,
|
||||
final Pair<Result, List<Software>> target) {
|
||||
|
||||
final Set<String> existingSoftwares = source.getRight()
|
||||
final Set<String> existingSoftwares = source
|
||||
.getRight()
|
||||
.stream()
|
||||
.map(Software::getId)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return target.getRight()
|
||||
return target
|
||||
.getRight()
|
||||
.stream()
|
||||
.filter(p -> !existingSoftwares.contains(p.getId()))
|
||||
.map(ConversionUtils::oafSoftwareToBrokerSoftware)
|
||||
|
|
|
@ -28,7 +28,9 @@ public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> {
|
|||
.filter(right -> right.equals(BrokerConstants.OPEN_ACCESS))
|
||||
.count();
|
||||
|
||||
if (count > 0) { return Arrays.asList(); }
|
||||
if (count > 0) {
|
||||
return Arrays.asList();
|
||||
}
|
||||
|
||||
return source
|
||||
.getInstance()
|
||||
|
|
|
@ -15,7 +15,7 @@ public class BrokerConstants {
|
|||
public static final String OPEN_ACCESS = "OPEN";
|
||||
public static final String IS_MERGED_IN_CLASS = "isMergedIn";
|
||||
|
||||
public static final List<Class<? extends Result>> RESULT_CLASSES =
|
||||
Arrays.asList(Publication.class, Dataset.class, Software.class, OtherResearchProduct.class);
|
||||
public static final List<Class<? extends Result>> RESULT_CLASSES = Arrays
|
||||
.asList(Publication.class, Dataset.class, Software.class, OtherResearchProduct.class);
|
||||
|
||||
}
|
||||
|
|
|
@ -56,7 +56,13 @@ public class ConversionUtils {
|
|||
.setOriginalId(d.getOriginalId().get(0))
|
||||
.setTitles(structPropList(d.getTitle()))
|
||||
.setPids(d.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList()))
|
||||
.setInstances(d.getInstance().stream().map(ConversionUtils::oafInstanceToBrokerInstances).flatMap(List::stream).collect(Collectors.toList()))
|
||||
.setInstances(
|
||||
d
|
||||
.getInstance()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafInstanceToBrokerInstances)
|
||||
.flatMap(List::stream)
|
||||
.collect(Collectors.toList()))
|
||||
.setCollectedFrom(d.getCollectedfrom().stream().map(KeyValue::getValue).collect(Collectors.toList()))
|
||||
: null;
|
||||
}
|
||||
|
@ -74,11 +80,23 @@ public class ConversionUtils {
|
|||
.setPublisher(fieldValue(result.getPublisher()))
|
||||
.setEmbargoenddate(fieldValue(result.getEmbargoenddate()))
|
||||
.setContributor(fieldList(result.getContributor()))
|
||||
.setJournal(result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null)
|
||||
.setJournal(
|
||||
result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null)
|
||||
.setCollectedFrom(result.getCollectedfrom().stream().map(KeyValue::getValue).collect(Collectors.toList()))
|
||||
.setPids(result.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList()))
|
||||
.setInstances(result.getInstance().stream().map(ConversionUtils::oafInstanceToBrokerInstances).flatMap(List::stream).collect(Collectors.toList()))
|
||||
.setExternalReferences(result.getExternalReference().stream().map(ConversionUtils::oafExtRefToBrokerExtRef).collect(Collectors.toList()))
|
||||
.setInstances(
|
||||
result
|
||||
.getInstance()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafInstanceToBrokerInstances)
|
||||
.flatMap(List::stream)
|
||||
.collect(Collectors.toList()))
|
||||
.setExternalReferences(
|
||||
result
|
||||
.getExternalReference()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafExtRefToBrokerExtRef)
|
||||
.collect(Collectors.toList()))
|
||||
: null;
|
||||
}
|
||||
|
||||
|
@ -100,7 +118,9 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
public static final eu.dnetlib.broker.objects.Project oafProjectToBrokerProject(final Project p) {
|
||||
if (p == null) { return null; }
|
||||
if (p == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final eu.dnetlib.broker.objects.Project res = new eu.dnetlib.broker.objects.Project()
|
||||
.setTitle(fieldValue(p.getTitle()))
|
||||
|
@ -136,20 +156,29 @@ public class ConversionUtils {
|
|||
}
|
||||
|
||||
private static String fieldValue(final List<Field<String>> fl) {
|
||||
return fl != null ? fl.stream().map(Field::getValue).filter(StringUtils::isNotBlank).findFirst().orElse(null) : null;
|
||||
return fl != null ? fl.stream().map(Field::getValue).filter(StringUtils::isNotBlank).findFirst().orElse(null)
|
||||
: null;
|
||||
}
|
||||
|
||||
private static String structPropValue(final List<StructuredProperty> props) {
|
||||
return props != null ? props.stream().map(StructuredProperty::getValue).filter(StringUtils::isNotBlank).findFirst().orElse(null) : null;
|
||||
return props != null
|
||||
? props.stream().map(StructuredProperty::getValue).filter(StringUtils::isNotBlank).findFirst().orElse(null)
|
||||
: null;
|
||||
}
|
||||
|
||||
private static List<String> fieldList(final List<Field<String>> fl) {
|
||||
return fl != null ? fl.stream().map(Field::getValue).filter(StringUtils::isNotBlank).collect(Collectors.toList())
|
||||
return fl != null
|
||||
? fl.stream().map(Field::getValue).filter(StringUtils::isNotBlank).collect(Collectors.toList())
|
||||
: new ArrayList<>();
|
||||
}
|
||||
|
||||
private static List<String> structPropList(final List<StructuredProperty> props) {
|
||||
return props != null ? props.stream().map(StructuredProperty::getValue).filter(StringUtils::isNotBlank).collect(Collectors.toList())
|
||||
return props != null
|
||||
? props
|
||||
.stream()
|
||||
.map(StructuredProperty::getValue)
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toList())
|
||||
: new ArrayList<>();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -83,8 +83,20 @@ public final class UpdateInfo<T> {
|
|||
compileHighlight.accept(hl, getHighlightValue());
|
||||
|
||||
final String provId = getSource().getOriginalId().stream().findFirst().orElse(null);
|
||||
final String provRepo = getSource().getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null);
|
||||
final String provUrl = getSource().getInstance().stream().map(Instance::getUrl).flatMap(List::stream).findFirst().orElse(null);;
|
||||
final String provRepo = getSource()
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.map(KeyValue::getValue)
|
||||
.findFirst()
|
||||
.orElse(null);
|
||||
final String provUrl = getSource()
|
||||
.getInstance()
|
||||
.stream()
|
||||
.map(Instance::getUrl)
|
||||
.flatMap(List::stream)
|
||||
.findFirst()
|
||||
.orElse(null);
|
||||
;
|
||||
|
||||
final Provenance provenance = new Provenance().setId(provId).setRepositoryName(provRepo).setUrl(provUrl);
|
||||
|
||||
|
|
Loading…
Reference in New Issue