limiting the dimensions of outliers

Claudio Atzori 2020-05-28 17:36:37 +02:00
parent 821be1f8b6
commit a57965a3ea
1 changed file with 53 additions and 27 deletions


@@ -3,11 +3,9 @@ package eu.dnetlib.dhp.oa.provision;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.function.Predicate;
 import java.util.stream.Collectors;
 import org.apache.commons.io.IOUtils;
@@ -65,8 +63,10 @@ public class CreateRelatedEntitiesJob_phase2 {
     private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

     private static final int MAX_EXTERNAL_ENTITIES = 50;
     private static final int MAX_AUTHORS = 200;
+    private static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
+    private static final int MAX_TITLE_LENGTH = 5000;
+    private static final int MAX_ABSTRACT_LENGTH = 100000;

     public static void main(String[] args) throws Exception {
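The three new caps are enforced further down with StringUtils.left from commons-lang3, which returns the leftmost n characters of a string and is null-safe. A standalone sketch of that behaviour; the class name TruncationDemo is illustrative only, not part of this codebase:

import org.apache.commons.lang3.StringUtils;

public class TruncationDemo {

    private static final int MAX_TITLE_LENGTH = 5000;

    public static void main(String[] args) {
        String longTitle = StringUtils.repeat("t", 6000); // 6000 chars, over the cap

        // left() clips to the first n characters and never throws on short input
        System.out.println(StringUtils.left(longTitle, MAX_TITLE_LENGTH).length()); // 5000
        System.out.println(StringUtils.left("short title", MAX_TITLE_LENGTH));      // unchanged
        System.out.println(StringUtils.left(null, MAX_TITLE_LENGTH));               // null, no NPE
    }
}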
@@ -199,7 +199,14 @@ public class CreateRelatedEntitiesJob_phase2 {
                 (MapFunction<String, E>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
                 Encoders.bean(entityClazz))
             .filter("dataInfo.invisible == false")
-            .map((MapFunction<E, E>) e -> {
+            .map((MapFunction<E, E>) e -> pruneOutliers(entityClazz, e), Encoders.bean(entityClazz))
+            .map(
+                (MapFunction<E, TypedRow>) value -> getTypedRow(
+                    StringUtils.substringAfterLast(inputEntityPath, "/"), value),
+                Encoders.bean(TypedRow.class));
+    }
+
+    private static <E extends OafEntity> E pruneOutliers(Class<E> entityClazz, E e) {
         if (ModelSupport.isSubClass(entityClazz, Result.class)) {
             Result r = (Result) e;
             if (r.getExternalReference() != null) {
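Pulling the pruning logic out of the anonymous lambda into the static pruneOutliers method keeps the MapFunction free of captured state, which matters because Spark serializes the function when shipping it to executors. A minimal self-contained sketch of the same pattern, assuming a local SparkSession; the class PruneExample and helper prune are illustrative, not part of this codebase:

import java.util.Arrays;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class PruneExample {

    // A static helper captures no enclosing state, so Spark can serialize
    // the MapFunction wrapper without dragging the outer class along.
    private static String prune(String s) {
        return s == null ? null : s.substring(0, Math.min(s.length(), 5));
    }

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local").appName("prune").getOrCreate();

        Dataset<String> ds = spark.createDataset(Arrays.asList("abcdefgh", "xy"), Encoders.STRING());

        // Same shape as the patched code: a method reference cast to MapFunction,
        // paired with an explicit Encoder for the output type.
        ds.map((MapFunction<String, String>) PruneExample::prune, Encoders.STRING()).show();

        spark.stop();
    }
}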
@@ -210,23 +217,42 @@ public class CreateRelatedEntitiesJob_phase2 {
                     .collect(Collectors.toList());
                 r.setExternalReference(refs);
             }
-            if (r.getAuthor() != null && r.getAuthor().size() > MAX_AUTHORS) {
+            if (r.getAuthor() != null) {
                 List<Author> authors = Lists.newArrayList();
-                for (int i = 0; i < r.getAuthor().size(); i++) {
-                    final Author a = r.getAuthor().get(i);
+                for (Author a : r.getAuthor()) {
+                    a.setFullname(StringUtils.left(a.getFullname(), MAX_AUTHOR_FULLNAME_LENGTH));
                     if (authors.size() < MAX_AUTHORS || hasORCID(a)) {
                         authors.add(a);
                     }
                 }
                 r.setAuthor(authors);
             }
+            if (r.getDescription() != null) {
+                List<Field<String>> desc = r
+                    .getDescription()
+                    .stream()
+                    .filter(Objects::nonNull)
+                    .map(d -> {
+                        d.setValue(StringUtils.left(d.getValue(), MAX_ABSTRACT_LENGTH));
+                        return d;
+                    })
+                    .collect(Collectors.toList());
+                r.setDescription(desc);
+            }
+            if (r.getTitle() != null) {
+                List<StructuredProperty> titles = r
+                    .getTitle()
+                    .stream()
+                    .filter(Objects::nonNull)
+                    .map(t -> {
+                        t.setValue(StringUtils.left(t.getValue(), MAX_TITLE_LENGTH));
+                        return t;
+                    })
+                    .collect(Collectors.toList());
+                r.setTitle(titles);
+            }
         }
         return e;
-            }, Encoders.bean(entityClazz))
-            .map(
-                (MapFunction<E, TypedRow>) value -> getTypedRow(
-                    StringUtils.substringAfterLast(inputEntityPath, "/"), value),
-                Encoders.bean(TypedRow.class));
     }

     private static boolean hasORCID(Author a) {
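hasORCID is what lets an author survive past the MAX_AUTHORS cap; its body falls outside this hunk. A plausible reading, sketched with hypothetical stand-in types rather than the actual dhp-schemas Author and pid classes:

import java.util.Collections;
import java.util.List;
import java.util.Objects;

public class HasOrcidSketch {

    // Hypothetical stand-ins for the schema's identifier types.
    static class Pid {
        String scheme; // e.g. "orcid"
        String value;
    }

    static class Author {
        List<Pid> pid;
    }

    // Assumption: an author is kept beyond the cap when any of its
    // persistent identifiers uses the ORCID scheme.
    static boolean hasORCID(Author a) {
        return a.pid != null
            && a.pid.stream()
                .filter(Objects::nonNull)
                .anyMatch(p -> "orcid".equalsIgnoreCase(p.scheme));
    }

    public static void main(String[] args) {
        Author a = new Author();
        Pid p = new Pid();
        p.scheme = "ORCID";
        p.value = "0000-0002-1825-0097"; // the well-known ORCID documentation example
        a.pid = Collections.singletonList(p);
        System.out.println(hasORCID(a)); // true
    }
}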