enrichment steps #38

Merged
claudio.atzori merged 334 commits from miriam.baglioni/dnet-hadoop:master into enrichment_wfs 2020-08-11 16:40:26 +02:00
13 changed files with 88 additions and 25 deletions
Showing only changes of commit 3bcdfbabe9 - Show all commits

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
@ -9,6 +10,7 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerRelatedDataset;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
public abstract class AbstractEnrichMissingDataset extends UpdateMatcher<OaBrokerRelatedDataset> {
@ -25,6 +27,10 @@ public abstract class AbstractEnrichMissingDataset extends UpdateMatcher<OaBroke
protected final List<OaBrokerRelatedDataset> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
if (target.getDatasets().size() >= BrokerConstants.MAX_LIST_SIZE) {
return new ArrayList<>();
}
final Set<String> existingDatasets = target
.getDatasets()
.stream()

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
@ -9,6 +10,7 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerProject;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
public class EnrichMoreProject extends UpdateMatcher<OaBrokerProject> {
@ -27,6 +29,10 @@ public class EnrichMoreProject extends UpdateMatcher<OaBrokerProject> {
protected List<OaBrokerProject> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
if (target.getProjects().size() >= BrokerConstants.MAX_LIST_SIZE) {
return new ArrayList<>();
}
final Set<String> existingProjects = target
.getProjects()
.stream()

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedPublications;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
@ -9,6 +10,7 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerRelatedPublication;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
public abstract class AbstractEnrichMissingPublication extends UpdateMatcher<OaBrokerRelatedPublication> {
@ -27,6 +29,10 @@ public abstract class AbstractEnrichMissingPublication extends UpdateMatcher<OaB
final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
if (target.getPublications().size() >= BrokerConstants.MAX_LIST_SIZE) {
return new ArrayList<>();
}
final Set<String> existingPublications = target
.getPublications()
.stream()

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
@ -9,6 +10,7 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
public class EnrichMoreSoftware extends UpdateMatcher<OaBrokerRelatedSoftware> {
@ -24,6 +26,10 @@ public class EnrichMoreSoftware extends UpdateMatcher<OaBrokerRelatedSoftware> {
final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
if (target.getSoftwares().size() >= BrokerConstants.MAX_LIST_SIZE) {
return new ArrayList<>();
}
final Set<String> existingSoftwares = source
.getSoftwares()
.stream()

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
@ -11,6 +12,7 @@ import eu.dnetlib.broker.objects.OaBrokerAuthor;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
public class EnrichMissingAuthorOrcid extends UpdateMatcher<OaBrokerAuthor> {
@ -25,6 +27,10 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher<OaBrokerAuthor> {
protected List<OaBrokerAuthor> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
if (target.getCreators().size() >= BrokerConstants.MAX_LIST_SIZE) {
return new ArrayList<>();
}
final Set<String> existingOrcids = target
.getCreators()
.stream()

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
@ -23,6 +24,11 @@ public class EnrichMissingOpenAccess extends UpdateMatcher<OaBrokerInstance> {
@Override
protected List<OaBrokerInstance> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
if (target.getInstances().size() >= BrokerConstants.MAX_LIST_SIZE) {
return new ArrayList<>();
}
final long count = target
.getInstances()
.stream()

View File

@ -22,9 +22,8 @@ public class EnrichMissingPid extends UpdateMatcher<OaBrokerTypedValue> {
@Override
protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
final long count = target.getPids().size();
if (count > 0) {
if (target.getPids().size() > 0) {
return Arrays.asList();
}

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
@ -9,6 +10,7 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {
@ -22,6 +24,11 @@ public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {
@Override
protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
if (target.getSubjects().size() >= BrokerConstants.MAX_LIST_SIZE) {
return new ArrayList<>();
}
final Set<String> existingSubject = target
.getSubjects()
.stream()

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
@ -23,6 +24,11 @@ public class EnrichMoreOpenAccess extends UpdateMatcher<OaBrokerInstance> {
@Override
protected List<OaBrokerInstance> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
if (target.getInstances().size() >= BrokerConstants.MAX_LIST_SIZE) {
return new ArrayList<>();
}
final Set<String> urls = target
.getInstances()
.stream()

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
@ -9,6 +10,7 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
public class EnrichMorePid extends UpdateMatcher<OaBrokerTypedValue> {
@ -22,6 +24,11 @@ public class EnrichMorePid extends UpdateMatcher<OaBrokerTypedValue> {
@Override
protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
if (target.getPids().size() >= BrokerConstants.MAX_LIST_SIZE) {
return new ArrayList<>();
}
final Set<String> existingPids = target
.getPids()
.stream()

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
@ -9,6 +10,7 @@ import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
@ -23,6 +25,10 @@ public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
protected List<OaBrokerTypedValue> findDifferences(final OaBrokerMainEntity source,
final OaBrokerMainEntity target) {
if (target.getSubjects().size() >= BrokerConstants.MAX_LIST_SIZE) {
return new ArrayList<>();
}
final Set<String> existingSubjects = target
.getSubjects()
.stream()

View File

@ -19,6 +19,10 @@ public class BrokerConstants {
public static final int MAX_NUMBER_OF_RELS = 20;
public static final int MAX_STRING_SIZE = 3000;
public static final int MAX_LIST_SIZE = 50;
public static Class<?>[] getModelClasses() {
final Set<Class<?>> list = new HashSet<>();
list.addAll(Arrays.asList(ModelSupport.getOafModelClasses()));

View File

@ -55,7 +55,7 @@ public class ConversionUtils {
res.setLicense(BrokerConstants.OPEN_ACCESS);
res.setHostedby(kvValue(i.getHostedby()));
return res;
}, 20);
});
}
public static OaBrokerTypedValue oafPidToBrokerPid(final StructuredProperty sp) {
@ -75,8 +75,8 @@ public class ConversionUtils {
res.setOpenaireId(d.getId());
res.setOriginalId(first(d.getOriginalId()));
res.setTitle(structPropValue(d.getTitle()));
res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid, 20));
res.setInstances(flatMappedList(d.getInstance(), ConversionUtils::oafInstanceToBrokerInstances, 20));
res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid));
res.setInstances(flatMappedList(d.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
res.setCollectedFrom(mappedFirst(d.getCollectedfrom(), KeyValue::getValue));
return res;
}
@ -90,8 +90,8 @@ public class ConversionUtils {
res.setOpenaireId(p.getId());
res.setOriginalId(first(p.getOriginalId()));
res.setTitle(structPropValue(p.getTitle()));
res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid, 20));
res.setInstances(flatMappedList(p.getInstance(), ConversionUtils::oafInstanceToBrokerInstances, 20));
res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid));
res.setInstances(flatMappedList(p.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
res.setCollectedFrom(mappedFirst(p.getCollectedfrom(), KeyValue::getValue));
return res;
@ -107,25 +107,24 @@ public class ConversionUtils {
res.setOpenaireId(result.getId());
res.setOriginalId(first(result.getOriginalId()));
res.setTypology(classId(result.getResulttype()));
res.setTitles(structPropList(result.getTitle(), 10));
res.setAbstracts(fieldList(result.getDescription(), 10));
res.setTitles(structPropList(result.getTitle()));
res.setAbstracts(fieldList(result.getDescription()));
res.setLanguage(classId(result.getLanguage()));
res.setSubjects(structPropTypedList(result.getSubject()));
res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor, 30));
res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor));
res.setPublicationdate(fieldValue(result.getDateofacceptance()));
res.setPublisher(fieldValue(result.getPublisher()));
res.setEmbargoenddate(fieldValue(result.getEmbargoenddate()));
res.setContributor(fieldList(result.getContributor(), 20));
res.setContributor(fieldList(result.getContributor()));
res
.setJournal(
result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null);
res.setCollectedFromId(mappedFirst(result.getCollectedfrom(), KeyValue::getKey));
res.setCollectedFromName(mappedFirst(result.getCollectedfrom(), KeyValue::getValue));
res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid, 20));
res.setInstances(flatMappedList(result.getInstance(), ConversionUtils::oafInstanceToBrokerInstances, 20));
res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid));
res.setInstances(flatMappedList(result.getInstance(), ConversionUtils::oafInstanceToBrokerInstances));
res
.setExternalReferences(
mappedList(result.getExternalReference(), ConversionUtils::oafExtRefToBrokerExtRef, 20));
.setExternalReferences(mappedList(result.getExternalReference(), ConversionUtils::oafExtRefToBrokerExtRef));
return res;
}
@ -245,25 +244,25 @@ public class ConversionUtils {
: null;
}
private static List<String> fieldList(final List<Field<String>> fl, final long maxSize) {
private static List<String> fieldList(final List<Field<String>> fl) {
return fl != null
? fl
.stream()
.map(Field::getValue)
.map(s -> StringUtils.abbreviate(s, 3000)) // MAX 3000 CHARS
.map(s -> StringUtils.abbreviate(s, BrokerConstants.MAX_STRING_SIZE))
.filter(StringUtils::isNotBlank)
.limit(maxSize)
.limit(BrokerConstants.MAX_LIST_SIZE)
.collect(Collectors.toList())
: new ArrayList<>();
}
private static List<String> structPropList(final List<StructuredProperty> props, final long maxSize) {
private static List<String> structPropList(final List<StructuredProperty> props) {
return props != null
? props
.stream()
.map(StructuredProperty::getValue)
.filter(StringUtils::isNotBlank)
.limit(maxSize)
.limit(BrokerConstants.MAX_LIST_SIZE)
.collect(Collectors.toList())
: new ArrayList<>();
}
@ -280,7 +279,7 @@ public class ConversionUtils {
.collect(Collectors.toList());
}
private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func, final long maxSize) {
private static <F, T> List<T> mappedList(final List<F> list, final Function<F, T> func) {
if (list == null) {
return new ArrayList<>();
}
@ -289,12 +288,11 @@ public class ConversionUtils {
.stream()
.map(func::apply)
.filter(Objects::nonNull)
.limit(maxSize)
.limit(BrokerConstants.MAX_LIST_SIZE)
.collect(Collectors.toList());
}
private static <F, T> List<T> flatMappedList(final List<F> list, final Function<F, List<T>> func,
final long maxSize) {
private static <F, T> List<T> flatMappedList(final List<F> list, final Function<F, List<T>> func) {
if (list == null) {
return new ArrayList<>();
}
@ -304,7 +302,7 @@ public class ConversionUtils {
.map(func::apply)
.flatMap(List::stream)
.filter(Objects::nonNull)
.limit(maxSize)
.limit(BrokerConstants.MAX_LIST_SIZE)
.collect(Collectors.toList());
}