From d1ca025b0bdbdff02901b4dd6514dce397d16ee3 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 13 Apr 2021 14:32:41 +0200 Subject: [PATCH] [cleaning] removing authors without fullname or providing 'deactivated' keyword. Removing test test titles --- .../dhp/schema/oaf/CleaningFunctions.java | 79 ++++++++++++++++--- .../oa/graph/clean/CleanGraphSparkJob.java | 1 + .../oa/graph/clean/CleaningFunctionTest.java | 4 + .../eu/dnetlib/dhp/oa/graph/clean/result.json | 22 ++++++ 4 files changed, 96 insertions(+), 10 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java index 917733a14..673bee314 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.schema.oaf; import java.util.*; import java.util.function.Function; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; @@ -22,6 +23,9 @@ public class CleaningFunctions { public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; public static final Set PID_BLACKLIST = new HashSet<>(); + public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*"; + public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]"; + public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10; static { PID_BLACKLIST.add("none"); @@ -80,6 +84,36 @@ public class CleaningFunctions { return value; } + public static boolean filter(T value) { + if (value instanceof Datasource) { + // nothing to evaluate here + } else if (value instanceof Project) { + // nothing to evaluate here + } else if (value instanceof Organization) { + // nothing to evaluate here + } else if (value instanceof Relation) { + // nothing to clean here + } else if (value instanceof Result) { + + Result r = (Result) value; + + if 
(Objects.nonNull(r.getTitle()) && r.getTitle().isEmpty()) { + return false; + } + + if (value instanceof Publication) { + + } else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) { + + } else if (value instanceof OtherResearchProduct) { + + } else if (value instanceof Software) { + + } + } + return true; + } + public static T cleanup(T value) { if (value instanceof Datasource) { // nothing to clean here @@ -124,6 +158,12 @@ public class CleaningFunctions { .stream() .filter(Objects::nonNull) .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .filter( + sp -> sp + .getValue() + .toLowerCase() + .replaceAll(TITLE_FILTER_REGEX, "") + .length() > TITLE_FILTER_RESIDUAL_LENGTH) .map(CleaningFunctions::cleanValue) .collect(Collectors.toList())); } @@ -199,16 +239,7 @@ public class CleaningFunctions { } } if (Objects.nonNull(r.getAuthor())) { - boolean nullRank = r - .getAuthor() - .stream() - .anyMatch(a -> Objects.isNull(a.getRank())); - if (nullRank) { - int i = 1; - for (Author author : r.getAuthor()) { - author.setRank(i++); - } - } + final List authors = Lists.newArrayList(); for (Author a : r.getAuthor()) { if (Objects.isNull(a.getPid())) { a.setPid(Lists.newArrayList()); @@ -235,8 +266,27 @@ public class CleaningFunctions { .stream() .collect(Collectors.toList())); } + if (StringUtils.isBlank(a.getFullname())) { + if (StringUtils.isNotBlank(a.getName()) && StringUtils.isNotBlank(a.getSurname())) { + a.setFullname(a.getSurname() + ", " + a.getName()); + } + } + if (StringUtils.isNotBlank(a.getFullname()) && isValidAuthorName(a)) { + authors.add(a); + } } + boolean nullRank = authors + .stream() + .anyMatch(a -> Objects.isNull(a.getRank())); + if (nullRank) { + int i = 1; + for (Author author : authors) { + author.setRank(i++); + } + } + r.setAuthor(authors); + } if (value instanceof Publication) { @@ -252,6 +302,15 @@ public class CleaningFunctions { return value; } + private static boolean isValidAuthorName(Author a) { + return !Stream + 
.of(a.getFullname(), a.getName(), a.getSurname()) + .filter(s -> s != null && !s.isEmpty()) + .collect(Collectors.joining("")) + .toLowerCase() + .matches(INVALID_AUTHOR_REGEX); + } + private static List processPidCleaning(List pids) { return pids .stream() diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java index 86c453656..088539325 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java @@ -90,6 +90,7 @@ public class CleanGraphSparkJob { .map((MapFunction) value -> fixVocabularyNames(value), Encoders.bean(clazz)) .map((MapFunction) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz)) .map((MapFunction) value -> cleanup(value), Encoders.bean(clazz)) + .filter((FilterFunction) value -> filter(value)) .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java index fdbc58c17..15cb054ad 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java @@ -67,6 +67,7 @@ public class CleaningFunctionTest { assertNotNull(p_out.getPublisher()); assertNull(p_out.getPublisher().getValue()); + assertEquals("und", p_out.getLanguage().getClassid()); assertEquals("Undetermined", p_out.getLanguage().getClassname()); @@ -120,6 +121,9 @@ public class CleaningFunctionTest { .isPresent()); Publication p_cleaned = CleaningFunctions.cleanup(p_out); + + assertEquals(1, 
p_cleaned.getTitle().size()); + assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid()); assertNull(p_out.getPublisher()); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 23de2ef86..8670c10f1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -865,6 +865,28 @@ "schemename": "dnet:dataCite_title" }, "value": "Optical response of strained- and unstrained-silicon cold-electron bolometers" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "test test 123 test" } ] } \ No newline at end of file