From 97b1c4057c12d9624aadef55bbd13678ad9ba379 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 12 Jun 2020 10:45:18 +0200 Subject: [PATCH] WIP: graph cleaner implementation --- .../dhp/oa/graph/clean/CleaningRule.java | 90 +++++------ .../dhp/oa/graph/clean/OafNavigator.java | 147 ++++++++++++++++++ .../oa/graph/raw/common/VocabularyGroup.java | 5 +- .../dhp/oa/graph/clean/CleaningRuleTest.java | 7 + .../eu/dnetlib/dhp/oa/graph/clean/result.json | 22 +++ 5 files changed, 217 insertions(+), 54 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRule.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRule.java index 51b9309625..88fc612980 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRule.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRule.java @@ -1,86 +1,70 @@ package eu.dnetlib.dhp.oa.graph.clean; -import java.lang.reflect.Field; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; -import java.util.Objects; - +import com.google.common.collect.Maps; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.api.java.function.MapFunction; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.Qualifier; +import java.util.Map; +import java.util.Objects; +import java.util.function.Function; + public class CleaningRule implements MapFunction { private VocabularyGroup vocabularies; + private Map> mapping = Maps.newHashMap(); + + public CleaningRule(VocabularyGroup vocabularies) { this.vocabularies = vocabularies; + + mapping.put(Qualifier.class, o -> patchQualifier(o)); + mapping.put(StructuredProperty.class, o -> patchSp(o)); + mapping.put(Field.class, o -> patchStringField(o)); } @Override public T call(T value) throws Exception { - doClean(value); + OafNavigator.apply(value, mapping); return value; } - private void doClean(Object o) { - if (Objects.isNull(o)) { - return; - } - - if (o instanceof Iterable) { - for (Object oi : (Iterable) o) { - doClean(oi); - } - } else { - - Class clazz = o.getClass(); - - if (clazz.isPrimitive() - || o instanceof Integer - || o instanceof Double - || o instanceof Float - || o instanceof Long - || o instanceof Boolean - || o instanceof String) { - return; - } else { - try { - for (Field field : getAllFields(new LinkedList<>(), clazz)) { - field.setAccessible(true); - Object value = field.get(o); - if (value instanceof Qualifier) { - Qualifier q = (Qualifier) value; - if (vocabularies.vocabularyExists(q.getSchemeid())) { - field.set(o, vocabularies.lookup(q.getSchemeid(), q.getClassid())); - } - - } else { - doClean(value); - } - } - } catch (IllegalAccessException | IllegalArgumentException e) { - throw new RuntimeException(e); - } - } + private Object patchQualifier(Object o) { + Qualifier q = (Qualifier) o; + if (vocabularies.vocabularyExists(q.getSchemeid())) { + return vocabularies.lookup(q.getSchemeid(), q.getClassid()); } + return o; } - private static List getAllFields(List fields, Class clazz) { - fields.addAll(Arrays.asList(clazz.getDeclaredFields())); + private Object patchSp(Object o) { + StructuredProperty sp = (StructuredProperty) o; + if (StringUtils.isBlank(sp.getValue())) { + return null; + } + return o; + } - final Class superclass = clazz.getSuperclass(); - if (Objects.nonNull(superclass) && superclass.getPackage().equals(Oaf.class.getPackage())) { - getAllFields(fields, superclass); + private Object patchStringField(Object o) { + Field f = (Field) o; + try { + if (StringUtils.isBlank((String) f.getValue())) { + return null; + } + } catch (ClassCastException e) { + // ignored on purpose } - return fields; + return o; } public VocabularyGroup getVocabularies() { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/OafNavigator.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/OafNavigator.java index c329a31112..40facc110e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/OafNavigator.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/OafNavigator.java @@ -1,4 +1,151 @@ + package eu.dnetlib.dhp.oa.graph.clean; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import scala.Tuple2; + +import java.beans.BeanInfo; +import java.beans.IntrospectionException; +import java.beans.Introspector; +import java.beans.PropertyDescriptor; +import java.lang.reflect.Field; +import java.lang.reflect.InvocationTargetException; +import java.util.*; +import java.util.function.Function; + public class OafNavigator { + + public static E apply(E oaf, Map> mapping) { + reflect(oaf, mapping); + return oaf; + } + + public static void reflect(Object o, Map> mapping) { + visit(o, mapping); + } + + public static void visit(final Object thingy, Map> mapping) { + + try { + final Class clazz = thingy.getClass(); + + if (!isPrimitive(thingy) && clazz.getPackage().equals(Oaf.class.getPackage())) { + + final BeanInfo beanInfo = Introspector.getBeanInfo(clazz); + + for (final PropertyDescriptor descriptor : beanInfo.getPropertyDescriptors()) { + try { + final Object value = descriptor.getReadMethod().invoke(thingy); + + if (value != null && !isPrimitive(value)) { + + System.out.println("VISITING " + descriptor.getName() + " " + descriptor.getPropertyType()); + + if (Iterable.class.isAssignableFrom(descriptor.getPropertyType())) { + for(Object vi : (Iterable) value) { + + visit(vi, mapping); + } + } else { + + if (mapping.keySet().contains(value.getClass())) { + final Object newValue = mapping.get(value.getClass()).apply(value); + System.out.println("PATCHING " + descriptor.getName()+ " " + descriptor.getPropertyType()); + System.out.println("OLD VALUE " + getObjectMapper().writeValueAsString(value)); + System.out.println("NEW VALUE " + getObjectMapper().writeValueAsString(newValue)); + descriptor.getWriteMethod().invoke(newValue); + } + + visit(value, mapping); + } + } + + } catch (final IllegalArgumentException e) { + // handle this please + } catch (final IllegalAccessException e) { + // and this also + } catch (final InvocationTargetException e) { + // and this, too + } catch (JsonProcessingException e) { + e.printStackTrace(); + } + } + } + } catch (final IntrospectionException e) { + // do something sensible here + } + } + + private static ObjectMapper getObjectMapper() { + final ObjectMapper mapper = new ObjectMapper(); + return mapper; + } + + private static void navigate(Object o, Map> mapping) { + if (Objects.isNull(o) || isPrimitive(o)) { + return; + } else { + try { + for (Field field : getAllFields(o.getClass())) { + System.out.println(field.getName()); + field.setAccessible(true); + Object value = field.get(o); + + if (Objects.nonNull(value)) { + final Class fieldType = field.getType(); + if ((fieldType.isArray() && !fieldType.getComponentType().isPrimitive())) { + Object[] fs = (Object[]) value; + for (Object fi : fs) { + navigate(fi, mapping); + } + } if (Iterable.class.isAssignableFrom(fieldType)) { + Iterable fs = (Iterable) value; + for (Object fi : fs) { + navigate(fi, mapping); + } + } else { + if (mapping.keySet().contains(value.getClass())) { + System.out.println("PATCHING " + field.getName()); + field.set(o, mapping.get(value.getClass()).apply(value)); + } + } + } + } + + } catch (IllegalAccessException | IllegalArgumentException e) { + throw new RuntimeException(e); + } + } + } + + private static boolean isPrimitive(Object o) { + return o.getClass().isPrimitive() + || o instanceof Class + || o instanceof Integer + || o instanceof Double + || o instanceof Float + || o instanceof Long + || o instanceof Boolean + || o instanceof String + || o instanceof Byte; + } + + private static List getAllFields(Class clazz) { + return getAllFields(new LinkedList<>(), clazz); + } + + private static List getAllFields(List fields, Class clazz) { + fields.addAll(Arrays.asList(clazz.getDeclaredFields())); + + final Class superclass = clazz.getSuperclass(); + if (Objects.nonNull(superclass) && superclass.getPackage().equals(Oaf.class.getPackage())) { + getAllFields(fields, superclass); + } + + return fields; + } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java index ec95ade003..447228bf9f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/VocabularyGroup.java @@ -105,7 +105,10 @@ public class VocabularyGroup implements Serializable { } public Qualifier getTermAsQualifier(final String vocId, final String id) { - return vocs.get(vocId.toLowerCase()).getTermAsQualifier(id); + if (vocabularyExists(vocId)) { + return vocs.get(vocId.toLowerCase()).getTermAsQualifier(id); + } + return OafMapperUtils.qualifier(id, id, "", ""); } public Qualifier getSynonymAsQualifier(final String vocId, final String syn) { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleTest.java index 019285cc3d..dcc98b2006 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleTest.java @@ -7,6 +7,7 @@ import static org.mockito.Mockito.lenient; import java.io.IOException; import java.util.HashSet; import java.util.List; +import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -15,6 +16,7 @@ import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.platform.commons.util.StringUtils; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; @@ -82,6 +84,11 @@ public class CleaningRuleTest { // TODO add more assertions to verity the cleaned values System.out.println(MAPPER.writeValueAsString(p_out)); + assertTrue( + p_out + .getPid() + .stream() + .allMatch(sp -> StringUtils.isNotBlank(sp.getValue()))); } private Stream getAuthorPidTypes(Publication pub) { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 435b001b7c..89ebe5af5d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -380,6 +380,28 @@ "schemename": "dnet:pid_types" }, "value": "10.1007/s109090161569x" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "doi", + "classname": "doi", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "" } ], "relevantdate": [