From a57965a3ea5fdc0536444166d357a35197978610 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 28 May 2020 17:36:37 +0200 Subject: [PATCH] limiting the dimensions of outliers --- .../CreateRelatedEntitiesJob_phase2.java | 80 ++++++++++++------- 1 file changed, 53 insertions(+), 27 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java index cc9f17ee7..1de734ee4 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java @@ -3,11 +3,9 @@ package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import java.util.ArrayList; import java.util.List; import java.util.Objects; import java.util.Optional; -import java.util.function.Predicate; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; @@ -65,8 +63,10 @@ public class CreateRelatedEntitiesJob_phase2 { private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final int MAX_EXTERNAL_ENTITIES = 50; - private static final int MAX_AUTHORS = 200; + private static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000; + private static final int MAX_TITLE_LENGTH = 5000; + private static final int MAX_ABSTRACT_LENGTH = 100000; public static void main(String[] args) throws Exception { @@ -199,36 +199,62 @@ public class CreateRelatedEntitiesJob_phase2 { (MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), Encoders.bean(entityClazz)) .filter("dataInfo.invisible == false") - .map((MapFunction) e -> { - if (ModelSupport.isSubClass(entityClazz, Result.class)) { - Result r = (Result) e; - if (r.getExternalReference() != null) { - List refs = r - .getExternalReference() - .stream() - .limit(MAX_EXTERNAL_ENTITIES) - .collect(Collectors.toList()); - r.setExternalReference(refs); - } - if (r.getAuthor() != null && r.getAuthor().size() > MAX_AUTHORS) { - List authors = Lists.newArrayList(); - for (int i = 0; i < r.getAuthor().size(); i++) { - final Author a = r.getAuthor().get(i); - if (authors.size() < MAX_AUTHORS || hasORCID(a)) { - authors.add(a); - } - } - r.setAuthor(authors); - } - } - return e; - }, Encoders.bean(entityClazz)) + .map((MapFunction) e -> pruneOutliers(entityClazz, e), Encoders.bean(entityClazz)) .map( (MapFunction) value -> getTypedRow( StringUtils.substringAfterLast(inputEntityPath, "/"), value), Encoders.bean(TypedRow.class)); } + private static E pruneOutliers(Class entityClazz, E e) { + if (ModelSupport.isSubClass(entityClazz, Result.class)) { + Result r = (Result) e; + if (r.getExternalReference() != null) { + List refs = r + .getExternalReference() + .stream() + .limit(MAX_EXTERNAL_ENTITIES) + .collect(Collectors.toList()); + r.setExternalReference(refs); + } + if (r.getAuthor() != null) { + List authors = Lists.newArrayList(); + for (Author a : r.getAuthor()) { + a.setFullname(StringUtils.left(a.getFullname(), MAX_AUTHOR_FULLNAME_LENGTH)); + if (authors.size() < MAX_AUTHORS || hasORCID(a)) { + authors.add(a); + } + } + r.setAuthor(authors); + } + if (r.getDescription() != null) { + List> desc = r + .getDescription() + .stream() + .filter(Objects::nonNull) + .map(d -> { + d.setValue(StringUtils.left(d.getValue(), MAX_ABSTRACT_LENGTH)); + return d; + }) + .collect(Collectors.toList()); + r.setDescription(desc); + } + if (r.getTitle() != null) { + List titles = r + .getTitle() + .stream() + .filter(Objects::nonNull) + .map(t -> { + t.setValue(StringUtils.left(t.getValue(), MAX_TITLE_LENGTH)); + return t; + }) + .collect(Collectors.toList()); + r.setTitle(titles); + } + } + return e; + } + private static boolean hasORCID(Author a) { return a.getPid() != null && a .getPid()