limiting the dimensions of outliers

Claudio Atzori 2020-05-28 17:36:37 +02:00
parent 821be1f8b6
commit a57965a3ea
1 changed file with 53 additions and 27 deletions
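The commit replaces the inline pruning lambda in getEntities with a dedicated pruneOutliers helper and, on top of the existing caps on external references and authors, truncates author full names, titles, and abstracts via StringUtils.left. A minimal standalone sketch of that truncation behavior, assuming the null-safe org.apache.commons.lang3 flavor of StringUtils:

import org.apache.commons.lang3.StringUtils;

public class TruncationDemo {
    public static void main(String[] args) {
        // null-safe: a null input is returned as-is rather than throwing
        System.out.println(StringUtils.left(null, 1000));     // null
        // values already within the limit pass through unchanged
        System.out.println(StringUtils.left("short", 1000));  // short
        // oversized values are cut down to the first n characters
        System.out.println(StringUtils.left("abcdefgh", 3));  // abc
    }
}

With MAX_ABSTRACT_LENGTH at 100000 and MAX_TITLE_LENGTH at 5000, only pathologically long field values are affected; typical records pass through untouched.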

@@ -3,11 +3,9 @@ package eu.dnetlib.dhp.oa.provision;
 
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 
-import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
-import java.util.function.Predicate;
 import java.util.stream.Collectors;
 
 import org.apache.commons.io.IOUtils;
@@ -65,8 +63,10 @@ public class CreateRelatedEntitiesJob_phase2 {
 
     private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 
     private static final int MAX_EXTERNAL_ENTITIES = 50;
     private static final int MAX_AUTHORS = 200;
+    private static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
+    private static final int MAX_TITLE_LENGTH = 5000;
+    private static final int MAX_ABSTRACT_LENGTH = 100000;
 
     public static void main(String[] args) throws Exception {
@@ -199,7 +199,14 @@ public class CreateRelatedEntitiesJob_phase2 {
                 (MapFunction<String, E>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
                 Encoders.bean(entityClazz))
             .filter("dataInfo.invisible == false")
-            .map((MapFunction<E, E>) e -> {
+            .map((MapFunction<E, E>) e -> pruneOutliers(entityClazz, e), Encoders.bean(entityClazz))
+            .map(
+                (MapFunction<E, TypedRow>) value -> getTypedRow(
+                    StringUtils.substringAfterLast(inputEntityPath, "/"), value),
+                Encoders.bean(TypedRow.class));
+    }
+
+    private static <E extends OafEntity> E pruneOutliers(Class<E> entityClazz, E e) {
         if (ModelSupport.isSubClass(entityClazz, Result.class)) {
             Result r = (Result) e;
             if (r.getExternalReference() != null) {
@@ -210,23 +217,42 @@ public class CreateRelatedEntitiesJob_phase2 {
                     .collect(Collectors.toList());
                 r.setExternalReference(refs);
             }
-            if (r.getAuthor() != null && r.getAuthor().size() > MAX_AUTHORS) {
+            if (r.getAuthor() != null) {
                 List<Author> authors = Lists.newArrayList();
-                for (int i = 0; i < r.getAuthor().size(); i++) {
-                    final Author a = r.getAuthor().get(i);
+                for (Author a : r.getAuthor()) {
+                    a.setFullname(StringUtils.left(a.getFullname(), MAX_AUTHOR_FULLNAME_LENGTH));
                     if (authors.size() < MAX_AUTHORS || hasORCID(a)) {
                         authors.add(a);
                     }
                 }
                 r.setAuthor(authors);
             }
+            if (r.getDescription() != null) {
+                List<Field<String>> desc = r
+                    .getDescription()
+                    .stream()
+                    .filter(Objects::nonNull)
+                    .map(d -> {
+                        d.setValue(StringUtils.left(d.getValue(), MAX_ABSTRACT_LENGTH));
+                        return d;
+                    })
+                    .collect(Collectors.toList());
+                r.setDescription(desc);
+            }
+            if (r.getTitle() != null) {
+                List<StructuredProperty> titles = r
+                    .getTitle()
+                    .stream()
+                    .filter(Objects::nonNull)
+                    .map(t -> {
+                        t.setValue(StringUtils.left(t.getValue(), MAX_TITLE_LENGTH));
+                        return t;
+                    })
+                    .collect(Collectors.toList());
+                r.setTitle(titles);
+            }
         }
         return e;
-            }, Encoders.bean(entityClazz))
-            .map(
-                (MapFunction<E, TypedRow>) value -> getTypedRow(
-                    StringUtils.substringAfterLast(inputEntityPath, "/"), value),
-                Encoders.bean(TypedRow.class));
     }
 
     private static boolean hasORCID(Author a) {
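The hunk ends at the hasORCID signature, so its body is not part of this commit view. A plausible sketch of such a helper, written with hypothetical stand-ins for the dhp schema beans (the Qualifier/StructuredProperty/Author shapes and the "orcid" class id below are assumptions, not taken from the diff):

import java.util.List;
import java.util.Objects;

// Hypothetical stand-ins for the dhp schema beans; field and accessor names are assumptions.
class Qualifier { String classid; String getClassid() { return classid; } }
class StructuredProperty { Qualifier qualifier; Qualifier getQualifier() { return qualifier; } }
class Author { List<StructuredProperty> pid; List<StructuredProperty> getPid() { return pid; } }

public class HasOrcidSketch {
    // True when at least one of the author's pids is tagged with an ORCID class id.
    static boolean hasORCID(Author a) {
        return a.getPid() != null
            && a.getPid()
                .stream()
                .filter(Objects::nonNull)
                .anyMatch(p -> p.getQualifier() != null
                    && "orcid".equalsIgnoreCase(p.getQualifier().getClassid()));
    }
}

Under this reading, authors beyond the MAX_AUTHORS cap are kept only when they carry an ORCID, which matches the authors.size() < MAX_AUTHORS || hasORCID(a) guard in the diff.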