enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
40 changed files with 349 additions and 454 deletions
Showing only changes of commit 244f6e50cf - Show all commits

View File

@ -30,30 +30,30 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.broker.model.Event;
import eu.dnetlib.dhp.broker.model.EventFactory;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAbstract;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingAuthorOrcid;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingDatasetIsReferencedBy;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingDatasetIsRelatedTo;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingDatasetIsSupplementedBy;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingDatasetIsSupplementedTo;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingDatasetReferences;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingOpenAccess;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPid;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingProject;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationDate;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationIsReferencedBy;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationIsRelatedTo;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationIsSupplementedBy;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationIsSupplementedTo;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingPublicationReferences;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingSoftware;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMissingSubject;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreOpenAccess;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMorePid;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreProject;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreSoftware;
import eu.dnetlib.dhp.broker.oa.matchers.EnrichMoreSubject;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsRelatedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetReferences;
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject;
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMoreProject;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsReferencedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsRelatedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedBy;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedTo;
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationReferences;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSoftware;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSubject;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSoftware;
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.common.HdfsSupport;

View File

@ -1,38 +0,0 @@
package eu.dnetlib.dhp.broker.oa.matchers;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingDatasetIsReferencedBy
extends UpdateMatcher<Pair<Result, List<Dataset>>, eu.dnetlib.broker.objects.Dataset> {
public EnrichMissingDatasetIsReferencedBy() {
super(true);
}
@Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Dataset>> findUpdates(final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) {
// TODO Auto-generated method stub
return null;
}
@Override
protected UpdateInfo<eu.dnetlib.broker.objects.Dataset> generateUpdateInfo(
final eu.dnetlib.broker.objects.Dataset highlightValue,
final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_DATASET_IS_REFERENCED_BY,
highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> p.getDatasets().add(rel),
rel -> rel.getInstances().get(0).getUrl());
}
}

View File

@ -1,38 +0,0 @@
package eu.dnetlib.dhp.broker.oa.matchers;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingDatasetIsRelatedTo
extends UpdateMatcher<Pair<Result, List<Dataset>>, eu.dnetlib.broker.objects.Dataset> {
public EnrichMissingDatasetIsRelatedTo() {
super(true);
}
@Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Dataset>> findUpdates(final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) {
// TODO Auto-generated method stub
return null;
}
@Override
protected UpdateInfo<eu.dnetlib.broker.objects.Dataset> generateUpdateInfo(
final eu.dnetlib.broker.objects.Dataset highlightValue,
final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_DATASET_IS_RELATED_TO,
highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> p.getDatasets().add(rel),
rel -> rel.getInstances().get(0).getUrl());
}
}

View File

@ -1,38 +0,0 @@
package eu.dnetlib.dhp.broker.oa.matchers;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingDatasetIsSupplementedBy
extends UpdateMatcher<Pair<Result, List<Dataset>>, eu.dnetlib.broker.objects.Dataset> {
public EnrichMissingDatasetIsSupplementedBy() {
super(true);
}
@Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Dataset>> findUpdates(final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) {
// TODO Auto-generated method stub
return null;
}
@Override
protected UpdateInfo<eu.dnetlib.broker.objects.Dataset> generateUpdateInfo(
final eu.dnetlib.broker.objects.Dataset highlightValue,
final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY,
highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> p.getDatasets().add(rel),
rel -> rel.getInstances().get(0).getUrl());
}
}

View File

@ -1,38 +0,0 @@
package eu.dnetlib.dhp.broker.oa.matchers;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingDatasetIsSupplementedTo
extends UpdateMatcher<Pair<Result, List<Dataset>>, eu.dnetlib.broker.objects.Dataset> {
public EnrichMissingDatasetIsSupplementedTo() {
super(true);
}
@Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Dataset>> findUpdates(final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) {
// TODO Auto-generated method stub
return null;
}
@Override
protected UpdateInfo<eu.dnetlib.broker.objects.Dataset> generateUpdateInfo(
final eu.dnetlib.broker.objects.Dataset highlightValue,
final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO,
highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> p.getDatasets().add(rel),
rel -> rel.getInstances().get(0).getUrl());
}
}

View File

@ -1,38 +0,0 @@
package eu.dnetlib.dhp.broker.oa.matchers;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingDatasetReferences
extends UpdateMatcher<Pair<Result, List<Dataset>>, eu.dnetlib.broker.objects.Dataset> {
public EnrichMissingDatasetReferences() {
super(true);
}
@Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Dataset>> findUpdates(final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) {
// TODO Auto-generated method stub
return null;
}
@Override
protected UpdateInfo<eu.dnetlib.broker.objects.Dataset> generateUpdateInfo(
final eu.dnetlib.broker.objects.Dataset highlightValue,
final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_DATASET_REFERENCES,
highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> p.getDatasets().add(rel),
rel -> rel.getInstances().get(0).getUrl());
}
}

View File

@ -1,42 +0,0 @@
package eu.dnetlib.dhp.broker.oa.matchers;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingPublicationIsReferencedBy
extends UpdateMatcher<Pair<Result, List<Publication>>, eu.dnetlib.broker.objects.Publication> {
public EnrichMissingPublicationIsReferencedBy() {
super(true);
}
@Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Publication>> findUpdates(
final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) {
// TODO Auto-generated method stub
return Arrays.asList();
}
@Override
protected UpdateInfo<eu.dnetlib.broker.objects.Publication> generateUpdateInfo(
final eu.dnetlib.broker.objects.Publication highlightValue,
final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY,
highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> {
}, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common
rel -> rel.getOriginalId());
}
}

View File

@ -1,42 +0,0 @@
package eu.dnetlib.dhp.broker.oa.matchers;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingPublicationIsRelatedTo
extends UpdateMatcher<Pair<Result, List<Publication>>, eu.dnetlib.broker.objects.Publication> {
public EnrichMissingPublicationIsRelatedTo() {
super(true);
}
@Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Publication>> findUpdates(
final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) {
// TODO Auto-generated method stub
return Arrays.asList();
}
@Override
protected UpdateInfo<eu.dnetlib.broker.objects.Publication> generateUpdateInfo(
final eu.dnetlib.broker.objects.Publication highlightValue,
final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_PUBLICATION_IS_RELATED_TO,
highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> {
}, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common
rel -> rel.getOriginalId());
}
}

View File

@ -1,42 +0,0 @@
package eu.dnetlib.dhp.broker.oa.matchers;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingPublicationIsSupplementedBy
extends UpdateMatcher<Pair<Result, List<Publication>>, eu.dnetlib.broker.objects.Publication> {
public EnrichMissingPublicationIsSupplementedBy() {
super(true);
}
@Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Publication>> findUpdates(
final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) {
// TODO Auto-generated method stub
return Arrays.asList();
}
@Override
protected UpdateInfo<eu.dnetlib.broker.objects.Publication> generateUpdateInfo(
final eu.dnetlib.broker.objects.Publication highlightValue,
final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY,
highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> {
}, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common
rel -> rel.getOriginalId());
}
}

View File

@ -1,42 +0,0 @@
package eu.dnetlib.dhp.broker.oa.matchers;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingPublicationIsSupplementedTo
extends UpdateMatcher<Pair<Result, List<Publication>>, eu.dnetlib.broker.objects.Publication> {
public EnrichMissingPublicationIsSupplementedTo() {
super(true);
}
@Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Publication>> findUpdates(
final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) {
// TODO Auto-generated method stub
return Arrays.asList();
}
@Override
protected UpdateInfo<eu.dnetlib.broker.objects.Publication> generateUpdateInfo(
final eu.dnetlib.broker.objects.Publication highlightValue,
final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO,
highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> {
}, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common
rel -> rel.getOriginalId());
}
}

View File

@ -1,42 +0,0 @@
package eu.dnetlib.dhp.broker.oa.matchers;
import java.util.Arrays;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
public class EnrichMissingPublicationReferences
extends UpdateMatcher<Pair<Result, List<Publication>>, eu.dnetlib.broker.objects.Publication> {
public EnrichMissingPublicationReferences() {
super(true);
}
@Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Publication>> findUpdates(
final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) {
// TODO Auto-generated method stub
return Arrays.asList();
}
@Override
protected UpdateInfo<eu.dnetlib.broker.objects.Publication> generateUpdateInfo(
final eu.dnetlib.broker.objects.Publication highlightValue,
final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_PUBLICATION_REFERENCES,
highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> {
}, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common
rel -> rel.getOriginalId());
}
}

View File

@ -0,0 +1,63 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Result;
public abstract class AbstractEnrichMissingDataset
extends UpdateMatcher<Pair<Result, List<Dataset>>, eu.dnetlib.broker.objects.Dataset> {
private final Topic topic;
public AbstractEnrichMissingDataset(final Topic topic) {
super(true);
this.topic = topic;
}
@Override
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Dataset>> findUpdates(
final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) {
final Set<String> existingDatasets = target
.getRight()
.stream()
.map(Dataset::getId)
.collect(Collectors.toSet());
return source
.getRight()
.stream()
.filter(d -> !existingDatasets.contains(d.getId()))
.map(ConversionUtils::oafDatasetToBrokerDataset)
.map(i -> generateUpdateInfo(i, source, target))
.collect(Collectors.toList());
}
@Override
protected final UpdateInfo<eu.dnetlib.broker.objects.Dataset> generateUpdateInfo(
final eu.dnetlib.broker.objects.Dataset highlightValue,
final Pair<Result, List<Dataset>> source,
final Pair<Result, List<Dataset>> target) {
return new UpdateInfo<>(
getTopic(),
highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> p.getDatasets().add(rel),
rel -> rel.getInstances().get(0).getUrl());
}
public Topic getTopic() {
return topic;
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
import eu.dnetlib.dhp.broker.model.Topic;
public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDataset {
public EnrichMissingDatasetIsReferencedBy() {
super(Topic.ENRICH_MISSING_DATASET_IS_REFERENCED_BY);
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
import eu.dnetlib.dhp.broker.model.Topic;
public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDataset {
public EnrichMissingDatasetIsRelatedTo() {
super(Topic.ENRICH_MISSING_DATASET_IS_RELATED_TO);
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
import eu.dnetlib.dhp.broker.model.Topic;
public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingDataset {
public EnrichMissingDatasetIsSupplementedBy() {
super(Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY);
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
import eu.dnetlib.dhp.broker.model.Topic;
public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingDataset {
public EnrichMissingDatasetIsSupplementedTo() {
super(Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO);
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
import eu.dnetlib.dhp.broker.model.Topic;
public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset {
public EnrichMissingDatasetReferences() {
super(Topic.ENRICH_MISSING_DATASET_REFERENCES);
}
}

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.broker.oa.matchers;
package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects;
import java.util.Arrays;
import java.util.List;
@ -7,6 +7,7 @@ import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Result;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.broker.oa.matchers;
package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects;
import java.util.Arrays;
import java.util.List;
@ -7,6 +7,7 @@ import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Result;

View File

@ -0,0 +1,64 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
public abstract class AbstractEnrichMissingPublication
extends UpdateMatcher<Pair<Result, List<Publication>>, eu.dnetlib.broker.objects.Publication> {
private final Topic topic;
public AbstractEnrichMissingPublication(final Topic topic) {
super(true);
this.topic = topic;
}
@Override
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Publication>> findUpdates(
final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) {
final Set<String> existingPublications = target
.getRight()
.stream()
.map(Publication::getId)
.collect(Collectors.toSet());
return source
.getRight()
.stream()
.filter(d -> !existingPublications.contains(d.getId()))
.map(ConversionUtils::oafPublicationToBrokerPublication)
.map(i -> generateUpdateInfo(i, source, target))
.collect(Collectors.toList());
}
@Override
protected final UpdateInfo<eu.dnetlib.broker.objects.Publication> generateUpdateInfo(
final eu.dnetlib.broker.objects.Publication highlightValue,
final Pair<Result, List<Publication>> source,
final Pair<Result, List<Publication>> target) {
return new UpdateInfo<>(
getTopic(),
highlightValue, source.getLeft(), target.getLeft(),
(p, rel) -> {
}, // p.getPublications().add(rel), //TODO available in the future release of dnet-openaire-broker-common
rel -> rel.getInstances().get(0).getUrl());
}
public Topic getTopic() {
return topic;
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
import eu.dnetlib.dhp.broker.model.Topic;
public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissingPublication {
public EnrichMissingPublicationIsReferencedBy() {
super(Topic.ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY);
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
import eu.dnetlib.dhp.broker.model.Topic;
public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPublication {
public EnrichMissingPublicationIsRelatedTo() {
super(Topic.ENRICH_MISSING_PUBLICATION_IS_RELATED_TO);
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
import eu.dnetlib.dhp.broker.model.Topic;
public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMissingPublication {
public EnrichMissingPublicationIsSupplementedBy() {
super(Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY);
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
import eu.dnetlib.dhp.broker.model.Topic;
public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMissingPublication {
public EnrichMissingPublicationIsSupplementedTo() {
super(Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO);
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
import eu.dnetlib.dhp.broker.model.Topic;
public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPublication {
public EnrichMissingPublicationReferences() {
super(Topic.ENRICH_MISSING_PUBLICATION_REFERENCES);
}
}

View File

@ -1,11 +1,12 @@
package eu.dnetlib.dhp.broker.oa.matchers;
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.broker.oa.matchers;
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.Arrays;
import java.util.List;
@ -7,6 +7,7 @@ import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.broker.oa.matchers;
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.Arrays;
import java.util.List;
@ -7,6 +7,7 @@ import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.Instance;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.broker.oa.matchers;
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.Arrays;
import java.util.List;
@ -7,6 +7,7 @@ import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.Pid;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;

View File

@ -1,11 +1,12 @@
package eu.dnetlib.dhp.broker.oa.matchers;
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.broker.oa.matchers;
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.Arrays;
import java.util.List;
@ -7,6 +7,7 @@ import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.broker.oa.matchers;
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.List;
import java.util.Set;
@ -8,6 +8,7 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Qualifier;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.broker.oa.matchers;
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.List;
import java.util.Set;
@ -7,6 +7,7 @@ import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.Instance;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.broker.oa.matchers;
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.List;
import java.util.Set;
@ -7,6 +7,7 @@ import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.Pid;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.broker.oa.matchers;
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.Arrays;
import java.util.List;
@ -7,6 +7,7 @@ import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.broker.oa.matchers;
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.List;
import java.util.Set;
@ -8,6 +8,7 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.schema.oaf.Result;

View File

@ -7,6 +7,8 @@ import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.broker.objects.Instance;
import eu.dnetlib.broker.objects.Pid;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class ConversionUtils {
@ -33,4 +35,15 @@ public class ConversionUtils {
return Pair.of(sp.getQualifier().getClassid(), sp.getValue());
}
public static final eu.dnetlib.broker.objects.Dataset oafDatasetToBrokerDataset(final Dataset d) {
final eu.dnetlib.broker.objects.Dataset res = new eu.dnetlib.broker.objects.Dataset();
// TODO
return res;
}
public static final eu.dnetlib.broker.objects.Publication oafPublicationToBrokerPublication(final Publication d) {
final eu.dnetlib.broker.objects.Publication res = new eu.dnetlib.broker.objects.Publication();
// TODO
return res;
}
}

View File

@ -20,12 +20,7 @@ public class AuthorMerger {
public static List<Author> merge(List<List<Author>> authors) {
authors.sort(new Comparator<List<Author>>() {
@Override
public int compare(List<Author> o1, List<Author> o2) {
return -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2));
}
});
authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
List<Author> author = new ArrayList<>();
@ -86,20 +81,28 @@ public class AuthorMerger {
.stream()
.map(ba -> new Tuple2<>(sim(ba, a._2()), ba))
.max(Comparator.comparing(Tuple2::_1));
if (simAuthor.isPresent() && simAuthor.get()._1() > THRESHOLD) {
Author r = simAuthor.get()._2();
if (r.getPid() == null) {
r.setPid(new ArrayList<>());
if(simAuthor.isPresent()) {
double th = THRESHOLD;
//increase the threshold if the surname is too short
if (simAuthor.get()._2().getSurname() != null && simAuthor.get()._2().getSurname().length()<=3)
th = 0.99;
if (simAuthor.get()._1() > th) {
Author r = simAuthor.get()._2();
if (r.getPid() == null) {
r.setPid(new ArrayList<>());
}
r.getPid().add(a._1());
}
r.getPid().add(a._1());
}
});
}
public static String pidToComparableString(StructuredProperty pid) {
return (pid.getQualifier() != null
? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
: "") + (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
return (pid.getQualifier() != null ?
pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : "" : "")
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
}
public static int countAuthorsPids(List<Author> authors) {
@ -120,12 +123,14 @@ public class AuthorMerger {
final Person pa = parse(a);
final Person pb = parse(b);
//if both are accurate (e.g. they have name and surname)
if (pa.isAccurate() & pb.isAccurate()) {
return new JaroWinkler()
.score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString()));
return
new JaroWinkler().score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString()))*0.5
+ new JaroWinkler().score(normalize(pa.getNameString()), normalize(pb.getNameString()))*0.5;
} else {
return new JaroWinkler()
.score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
return
new JaroWinkler().score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname()));
}
}

View File

@ -21,6 +21,7 @@ import scala.Tuple2;
public class EntityMergerTest implements Serializable {
List<Tuple2<String, Publication>> publications;
List<Tuple2<String, Publication>> publications2;
String testEntityBasePath;
DataInfo dataInfo;
@ -36,6 +37,7 @@ public class EntityMergerTest implements Serializable {
.getAbsolutePath();
publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class);
publications2 = readSample(testEntityBasePath + "/publication_merge2.json", Publication.class);
pub_top = getTopPub(publications);
@ -90,6 +92,17 @@ public class EntityMergerTest implements Serializable {
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
}
@Test
public void publicationMergerTest2() throws InstantiationException, IllegalAccessException, IOException {
Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
assertEquals(pub_merged.getAuthor().size(), 27);
// insert assertions here
}
public DataInfo setDI() {
DataInfo dataInfo = new DataInfo();
dataInfo.setTrust("0.9");

File diff suppressed because one or more lines are too long