WIP: graph cleaner implementation

2020-06-13 13:06:04 +02:00 · 2020-06-13 13:06:04 +02:00 · 0d52816244
parent bed65a1be6
commit 0d52816244
7 changed files with 76 additions and 107 deletions
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java
@ -2,6 +2,7 @@
 package eu.dnetlib.dhp.common;

 import java.io.Serializable;
+import java.util.function.Consumer;
 import java.util.function.Supplier;

 /** Provides serializable and throwing extensions to standard functional interfaces. */
@ -10,6 +11,16 @@ public class FunctionalInterfaceSupport {
 	private FunctionalInterfaceSupport() {
 	}

+	/**
+	 * Serializable consumer of any kind of objects. To be used withing spark processing pipelines when supplying
+	 * functions externally.
+	 *
+	 * @param <T>
+	 */
+	@FunctionalInterface
+	public interface SerializableConsumer<T> extends Consumer<T>, Serializable {
+	}
+
 	/**
 	 * Serializable supplier of any kind of objects. To be used withing spark processing pipelines when supplying
 	 * functions externally.
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphProperties.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphProperties.java
@ -3,19 +3,9 @@ package eu.dnetlib.dhp.oa.graph.clean;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

-import java.beans.BeanInfo;
-import java.beans.IntrospectionException;
-import java.beans.Introspector;
-import java.beans.PropertyDescriptor;
-import java.lang.reflect.InvocationTargetException;
-import java.util.Map;
-import java.util.Objects;
 import java.util.Optional;
-import java.util.TreeMap;
-import java.util.stream.Collectors;

 import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
@ -33,11 +23,10 @@ import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
-import scala.Predef;

-public class CleanGraphProperties {
+public class CleanGraphSparkJob {

-	private static final Logger log = LoggerFactory.getLogger(CleanGraphProperties.class);
+	private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class);

 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

@ -45,7 +34,7 @@ public class CleanGraphProperties {

 		String jsonConfiguration = IOUtils
 			.toString(
-				CleanGraphProperties.class
+				CleanGraphSparkJob.class
 					.getResourceAsStream(
 						"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json"));
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
@ -91,13 +80,14 @@ public class CleanGraphProperties {
 		Class<T> clazz,
 		String outputPath) {

-		CleaningRule<T> rule = new CleaningRule<>(vocs);
+		final CleaningRuleMap mapping = CleaningRuleMap.create(vocs);

 		readTableFromPath(spark, inputPath, clazz)
-			.map(rule, Encoders.bean(clazz))
+			.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
 			.write()
 			.mode(SaveMode.Overwrite)
-			.parquet(outputPath);
+			.option("compression", "gzip")
+			.json(outputPath);
 	}

 	private static <T extends Oaf> Dataset<T> readTableFromPath(
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java
@ -1,43 +1,23 @@

 package eu.dnetlib.dhp.oa.graph.clean;

-import java.util.Map;
-import java.util.function.Consumer;
-
-import org.apache.spark.api.java.function.MapFunction;
-
-import com.google.common.collect.Maps;
+import java.io.Serializable;
+import java.util.HashMap;

+import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
-import eu.dnetlib.dhp.schema.oaf.Oaf;
 import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;

-public class CleaningRule<T extends Oaf> implements MapFunction<T, T> {
-
-	private VocabularyGroup vocabularies;
-
-	private Map<Class, Consumer<Object>> mapping = Maps.newHashMap();
-
-	public CleaningRule(VocabularyGroup vocabularies) {
-		this.vocabularies = vocabularies;
-		setMappings(vocabularies);
-	}
-
-	@Override
-	public T call(T value) throws Exception {
-
-		OafNavigator.apply(value, mapping);
-
-		return value;
-	}
+public class CleaningRuleMap extends HashMap<Class, SerializableConsumer<Object>> implements Serializable {

 	/**
-	 * Populates the mapping for the Oaf types subject to cleaning
-	 * 
+	 * Creates the mapping for the Oaf types subject to cleaning
+	 *
 	 * @param vocabularies
 	 */
-	private void setMappings(VocabularyGroup vocabularies) {
+	public static CleaningRuleMap create(VocabularyGroup vocabularies) {
+		CleaningRuleMap mapping = new CleaningRuleMap();
 		mapping.put(Qualifier.class, o -> {
 			Qualifier q = (Qualifier) o;
 			if (vocabularies.vocabularyExists(q.getSchemeid())) {
@ -54,10 +34,7 @@ public class CleaningRule<T extends Oaf> implements MapFunction<T, T> {
 			 * }
 			 */
 		});
-	}
-
-	public VocabularyGroup getVocabularies() {
-		return vocabularies;
+		return mapping;
 	}

 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/OafNavigator.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/OafNavigator.java
@ -1,15 +1,18 @@

 package eu.dnetlib.dhp.oa.graph.clean;

+import java.io.Serializable;
 import java.lang.reflect.Field;
-import java.util.*;
-import java.util.function.Consumer;
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Objects;

 import eu.dnetlib.dhp.schema.oaf.Oaf;

-public class OafNavigator {
+public class OafCleaner implements Serializable {

-	public static <E extends Oaf> E apply(E oaf, Map<Class, Consumer<Object>> mapping) {
+	public static <E extends Oaf> E apply(E oaf, CleaningRuleMap mapping) {
 		try {
 			navigate(oaf, mapping);
 		} catch (IllegalAccessException e) {
@ -18,7 +21,7 @@ public class OafNavigator {
 		return oaf;
 	}

-	private static void navigate(Object o, Map<Class, Consumer<Object>> mapping) throws IllegalAccessException {
+	private static void navigate(Object o, CleaningRuleMap mapping) throws IllegalAccessException {
 		if (isPrimitive(o)) {
 			return;
 		} else if (isIterable(o.getClass())) {
@ -40,7 +43,7 @@ public class OafNavigator {
 		}
 	}

-	private static boolean hasMapping(Object o, Map<Class, Consumer<Object>> mapping) {
+	private static boolean hasMapping(Object o, CleaningRuleMap mapping) {
 		return mapping.containsKey(o.getClass());
 	}

--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/config-default.xml
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml
@ -72,18 +72,17 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean publications</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
@ -99,18 +98,17 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean datasets</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
@ -126,18 +124,17 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean otherresearchproducts</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
@ -153,18 +150,17 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean softwares</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
@ -180,18 +176,17 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean datasources</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
@ -207,18 +202,17 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean organizations</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
@ -234,18 +228,17 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean projects</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
@ -261,18 +254,17 @@
            <master>yarn</master>
            <mode>cluster</mode>
            <name>Clean relations</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
            </spark-opts>
            <arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
            <arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleaningFunctionTest.java
@ -5,32 +5,27 @@ import static org.junit.jupiter.api.Assertions.*;
 import static org.mockito.Mockito.lenient;

 import java.io.IOException;
-import java.util.HashSet;
 import java.util.List;
-import java.util.Objects;
 import java.util.Set;
-import java.util.stream.Collectors;
 import java.util.stream.Stream;

 import org.apache.commons.io.IOUtils;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.extension.ExtendWith;
-import org.junit.platform.commons.util.StringUtils;
 import org.mockito.Mock;
 import org.mockito.junit.jupiter.MockitoExtension;

 import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
-import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyTerm;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

@ExtendWith(MockitoExtension.class)
-public class CleaningRuleTest {
+public class CleaningFunctionTest {

 	public static final ObjectMapper MAPPER = new ObjectMapper();

@ -39,7 +34,7 @@ public class CleaningRuleTest {

 	private VocabularyGroup vocabularies;

-	private CleaningRule<Publication> cleaningRule;
+	private CleaningRuleMap mapping;

 	@BeforeEach
 	public void setUp() throws ISLookUpException, IOException {
@ -49,18 +44,19 @@ public class CleaningRuleTest {
 			.thenReturn(synonyms());

 		vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
-		cleaningRule = new CleaningRule(vocabularies);
+		mapping = CleaningRuleMap.create(vocabularies);
 	}

 	@Test
 	public void testCleaning() throws Exception {

-		assertNotNull(cleaningRule.getVocabularies());
+		assertNotNull(vocabularies);
+		assertNotNull(mapping);

 		String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
 		Publication p_in = MAPPER.readValue(json, Publication.class);

-		Publication p_out = cleaningRule.call(p_in);
+		Publication p_out = OafCleaner.apply(p_in, mapping);

 		assertNotNull(p_out);

@ -100,11 +96,11 @@ public class CleaningRuleTest {

 	private List<String> vocs() throws IOException {
 		return IOUtils
-			.readLines(CleaningRuleTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
+			.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
 	}

 	private List<String> synonyms() throws IOException {
 		return IOUtils
-			.readLines(CleaningRuleTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
+			.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
 	}
 }