enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
1 changed file with 38 additions and 11 deletions
Showing only changes of commit 48959e9a17 - Show all commits

View File

@ -1,41 +1,68 @@
package eu.dnetlib.dhp.broker.oa.matchers.simple; package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.Arrays; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Set;
import org.apache.commons.lang3.tuple.Pair; import java.util.stream.Collectors;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingAuthorOrcid extends UpdateMatcher<Pair<String, String>> { public class EnrichMissingAuthorOrcid extends UpdateMatcher<String> {
public EnrichMissingAuthorOrcid() { public EnrichMissingAuthorOrcid() {
super(true); super(true);
} }
@Override @Override
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final ResultWithRelations source, protected List<UpdateInfo<String>> findUpdates(final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target,
final DedupConfig dedupConfig) { final DedupConfig dedupConfig) {
// TODO
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); final Set<String> existingOrcids = target
return Arrays.asList(); .getResult()
.getAuthor()
.stream()
.map(Author::getPid)
.flatMap(List::stream)
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
.map(pid -> pid.getValue())
.collect(Collectors.toSet());
final List<UpdateInfo<String>> list = new ArrayList<>();
for (final Author author : source.getResult().getAuthor()) {
final String name = author.getFullname();
for (final StructuredProperty pid : author.getPid()) {
if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid")
&& !existingOrcids.contains(pid.getValue())) {
list
.add(
generateUpdateInfo(name + " [ORCID: " + pid.getValue() + "]", source, target, dedupConfig));
;
}
}
}
return list;
} }
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue, public UpdateInfo<String> generateUpdateInfo(final String highlightValue,
final ResultWithRelations source, final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target,
final DedupConfig dedupConfig) { final DedupConfig dedupConfig) {
return new UpdateInfo<>( return new UpdateInfo<>(
Topic.ENRICH_MISSING_AUTHOR_ORCID, Topic.ENRICH_MISSING_AUTHOR_ORCID,
highlightValue, source, target, highlightValue, source, target,
(p, pair) -> p.getCreators().add(pair.getLeft() + " - ORCID: " + pair.getRight()), (p, aut) -> p.getCreators().add(aut),
pair -> pair.getLeft() + "::" + pair.getRight(), aut -> aut,
dedupConfig); dedupConfig);
} }
} }