forked from antonis.lempesis/dnet-hadoop
WIP: graph cleaner implementation
This commit is contained in:
parent
bed65a1be6
commit
0d52816244
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.dhp.common;
|
package eu.dnetlib.dhp.common;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.util.function.Consumer;
|
||||||
import java.util.function.Supplier;
|
import java.util.function.Supplier;
|
||||||
|
|
||||||
/** Provides serializable and throwing extensions to standard functional interfaces. */
|
/** Provides serializable and throwing extensions to standard functional interfaces. */
|
||||||
|
@ -10,6 +11,16 @@ public class FunctionalInterfaceSupport {
|
||||||
private FunctionalInterfaceSupport() {
|
private FunctionalInterfaceSupport() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Serializable consumer of any kind of objects. To be used withing spark processing pipelines when supplying
|
||||||
|
* functions externally.
|
||||||
|
*
|
||||||
|
* @param <T>
|
||||||
|
*/
|
||||||
|
@FunctionalInterface
|
||||||
|
public interface SerializableConsumer<T> extends Consumer<T>, Serializable {
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Serializable supplier of any kind of objects. To be used withing spark processing pipelines when supplying
|
* Serializable supplier of any kind of objects. To be used withing spark processing pipelines when supplying
|
||||||
* functions externally.
|
* functions externally.
|
||||||
|
|
|
@ -3,19 +3,9 @@ package eu.dnetlib.dhp.oa.graph.clean;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.beans.BeanInfo;
|
|
||||||
import java.beans.IntrospectionException;
|
|
||||||
import java.beans.Introspector;
|
|
||||||
import java.beans.PropertyDescriptor;
|
|
||||||
import java.lang.reflect.InvocationTargetException;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.TreeMap;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
|
@ -33,11 +23,10 @@ import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
import scala.Predef;
|
|
||||||
|
|
||||||
public class CleanGraphProperties {
|
public class CleanGraphSparkJob {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(CleanGraphProperties.class);
|
private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class);
|
||||||
|
|
||||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
|
@ -45,7 +34,7 @@ public class CleanGraphProperties {
|
||||||
|
|
||||||
String jsonConfiguration = IOUtils
|
String jsonConfiguration = IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
CleanGraphProperties.class
|
CleanGraphSparkJob.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json"));
|
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json"));
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
@ -91,13 +80,14 @@ public class CleanGraphProperties {
|
||||||
Class<T> clazz,
|
Class<T> clazz,
|
||||||
String outputPath) {
|
String outputPath) {
|
||||||
|
|
||||||
CleaningRule<T> rule = new CleaningRule<>(vocs);
|
final CleaningRuleMap mapping = CleaningRuleMap.create(vocs);
|
||||||
|
|
||||||
readTableFromPath(spark, inputPath, clazz)
|
readTableFromPath(spark, inputPath, clazz)
|
||||||
.map(rule, Encoders.bean(clazz))
|
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.parquet(outputPath);
|
.option("compression", "gzip")
|
||||||
|
.json(outputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <T extends Oaf> Dataset<T> readTableFromPath(
|
private static <T extends Oaf> Dataset<T> readTableFromPath(
|
|
@ -1,43 +1,23 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean;
|
package eu.dnetlib.dhp.oa.graph.clean;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.io.Serializable;
|
||||||
import java.util.function.Consumer;
|
import java.util.HashMap;
|
||||||
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
|
|
||||||
import com.google.common.collect.Maps;
|
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
public class CleaningRule<T extends Oaf> implements MapFunction<T, T> {
|
public class CleaningRuleMap extends HashMap<Class, SerializableConsumer<Object>> implements Serializable {
|
||||||
|
|
||||||
private VocabularyGroup vocabularies;
|
|
||||||
|
|
||||||
private Map<Class, Consumer<Object>> mapping = Maps.newHashMap();
|
|
||||||
|
|
||||||
public CleaningRule(VocabularyGroup vocabularies) {
|
|
||||||
this.vocabularies = vocabularies;
|
|
||||||
setMappings(vocabularies);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public T call(T value) throws Exception {
|
|
||||||
|
|
||||||
OafNavigator.apply(value, mapping);
|
|
||||||
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Populates the mapping for the Oaf types subject to cleaning
|
* Creates the mapping for the Oaf types subject to cleaning
|
||||||
*
|
*
|
||||||
* @param vocabularies
|
* @param vocabularies
|
||||||
*/
|
*/
|
||||||
private void setMappings(VocabularyGroup vocabularies) {
|
public static CleaningRuleMap create(VocabularyGroup vocabularies) {
|
||||||
|
CleaningRuleMap mapping = new CleaningRuleMap();
|
||||||
mapping.put(Qualifier.class, o -> {
|
mapping.put(Qualifier.class, o -> {
|
||||||
Qualifier q = (Qualifier) o;
|
Qualifier q = (Qualifier) o;
|
||||||
if (vocabularies.vocabularyExists(q.getSchemeid())) {
|
if (vocabularies.vocabularyExists(q.getSchemeid())) {
|
||||||
|
@ -54,10 +34,7 @@ public class CleaningRule<T extends Oaf> implements MapFunction<T, T> {
|
||||||
* }
|
* }
|
||||||
*/
|
*/
|
||||||
});
|
});
|
||||||
}
|
return mapping;
|
||||||
|
|
||||||
public VocabularyGroup getVocabularies() {
|
|
||||||
return vocabularies;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
|
@ -1,15 +1,18 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean;
|
package eu.dnetlib.dhp.oa.graph.clean;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
import java.lang.reflect.Field;
|
import java.lang.reflect.Field;
|
||||||
import java.util.*;
|
import java.util.Arrays;
|
||||||
import java.util.function.Consumer;
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
|
|
||||||
public class OafNavigator {
|
public class OafCleaner implements Serializable {
|
||||||
|
|
||||||
public static <E extends Oaf> E apply(E oaf, Map<Class, Consumer<Object>> mapping) {
|
public static <E extends Oaf> E apply(E oaf, CleaningRuleMap mapping) {
|
||||||
try {
|
try {
|
||||||
navigate(oaf, mapping);
|
navigate(oaf, mapping);
|
||||||
} catch (IllegalAccessException e) {
|
} catch (IllegalAccessException e) {
|
||||||
|
@ -18,7 +21,7 @@ public class OafNavigator {
|
||||||
return oaf;
|
return oaf;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void navigate(Object o, Map<Class, Consumer<Object>> mapping) throws IllegalAccessException {
|
private static void navigate(Object o, CleaningRuleMap mapping) throws IllegalAccessException {
|
||||||
if (isPrimitive(o)) {
|
if (isPrimitive(o)) {
|
||||||
return;
|
return;
|
||||||
} else if (isIterable(o.getClass())) {
|
} else if (isIterable(o.getClass())) {
|
||||||
|
@ -40,7 +43,7 @@ public class OafNavigator {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean hasMapping(Object o, Map<Class, Consumer<Object>> mapping) {
|
private static boolean hasMapping(Object o, CleaningRuleMap mapping) {
|
||||||
return mapping.containsKey(o.getClass());
|
return mapping.containsKey(o.getClass());
|
||||||
}
|
}
|
||||||
|
|
|
@ -72,18 +72,17 @@
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Clean publications</name>
|
<name>Clean publications</name>
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
|
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCoresForJoining}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemoryForJoining}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=7680
|
||||||
--conf spark.network.timeout=${sparkNetworkTimeout}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
|
||||||
|
@ -99,18 +98,17 @@
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Clean datasets</name>
|
<name>Clean datasets</name>
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
|
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCoresForJoining}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemoryForJoining}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=7680
|
||||||
--conf spark.network.timeout=${sparkNetworkTimeout}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
||||||
|
@ -126,18 +124,17 @@
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Clean otherresearchproducts</name>
|
<name>Clean otherresearchproducts</name>
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
|
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCoresForJoining}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemoryForJoining}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=7680
|
||||||
--conf spark.network.timeout=${sparkNetworkTimeout}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
||||||
|
@ -153,18 +150,17 @@
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Clean softwares</name>
|
<name>Clean softwares</name>
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
|
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCoresForJoining}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemoryForJoining}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=7680
|
||||||
--conf spark.network.timeout=${sparkNetworkTimeout}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
|
||||||
|
@ -180,18 +176,17 @@
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Clean datasources</name>
|
<name>Clean datasources</name>
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
|
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCoresForJoining}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemoryForJoining}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=7680
|
||||||
--conf spark.network.timeout=${sparkNetworkTimeout}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
|
||||||
|
@ -207,18 +202,17 @@
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Clean organizations</name>
|
<name>Clean organizations</name>
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
|
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCoresForJoining}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemoryForJoining}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=7680
|
||||||
--conf spark.network.timeout=${sparkNetworkTimeout}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
|
||||||
|
@ -234,18 +228,17 @@
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Clean projects</name>
|
<name>Clean projects</name>
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
|
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCoresForJoining}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemoryForJoining}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=7680
|
||||||
--conf spark.network.timeout=${sparkNetworkTimeout}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
|
||||||
|
@ -261,18 +254,17 @@
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Clean relations</name>
|
<name>Clean relations</name>
|
||||||
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
|
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=${sparkExecutorCoresForJoining}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemoryForJoining}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemoryForJoining}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=7680
|
--conf spark.sql.shuffle.partitions=7680
|
||||||
--conf spark.network.timeout=${sparkNetworkTimeout}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
|
<arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
|
||||||
<arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
|
<arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
|
|
@ -5,32 +5,27 @@ import static org.junit.jupiter.api.Assertions.*;
|
||||||
import static org.mockito.Mockito.lenient;
|
import static org.mockito.Mockito.lenient;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.junit.jupiter.api.extension.ExtendWith;
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
import org.junit.platform.commons.util.StringUtils;
|
|
||||||
import org.mockito.Mock;
|
import org.mockito.Mock;
|
||||||
import org.mockito.junit.jupiter.MockitoExtension;
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyTerm;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
|
||||||
@ExtendWith(MockitoExtension.class)
|
@ExtendWith(MockitoExtension.class)
|
||||||
public class CleaningRuleTest {
|
public class CleaningFunctionTest {
|
||||||
|
|
||||||
public static final ObjectMapper MAPPER = new ObjectMapper();
|
public static final ObjectMapper MAPPER = new ObjectMapper();
|
||||||
|
|
||||||
|
@ -39,7 +34,7 @@ public class CleaningRuleTest {
|
||||||
|
|
||||||
private VocabularyGroup vocabularies;
|
private VocabularyGroup vocabularies;
|
||||||
|
|
||||||
private CleaningRule<Publication> cleaningRule;
|
private CleaningRuleMap mapping;
|
||||||
|
|
||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void setUp() throws ISLookUpException, IOException {
|
public void setUp() throws ISLookUpException, IOException {
|
||||||
|
@ -49,18 +44,19 @@ public class CleaningRuleTest {
|
||||||
.thenReturn(synonyms());
|
.thenReturn(synonyms());
|
||||||
|
|
||||||
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
|
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
|
||||||
cleaningRule = new CleaningRule(vocabularies);
|
mapping = CleaningRuleMap.create(vocabularies);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testCleaning() throws Exception {
|
public void testCleaning() throws Exception {
|
||||||
|
|
||||||
assertNotNull(cleaningRule.getVocabularies());
|
assertNotNull(vocabularies);
|
||||||
|
assertNotNull(mapping);
|
||||||
|
|
||||||
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
|
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
|
||||||
Publication p_in = MAPPER.readValue(json, Publication.class);
|
Publication p_in = MAPPER.readValue(json, Publication.class);
|
||||||
|
|
||||||
Publication p_out = cleaningRule.call(p_in);
|
Publication p_out = OafCleaner.apply(p_in, mapping);
|
||||||
|
|
||||||
assertNotNull(p_out);
|
assertNotNull(p_out);
|
||||||
|
|
||||||
|
@ -100,11 +96,11 @@ public class CleaningRuleTest {
|
||||||
|
|
||||||
private List<String> vocs() throws IOException {
|
private List<String> vocs() throws IOException {
|
||||||
return IOUtils
|
return IOUtils
|
||||||
.readLines(CleaningRuleTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
|
.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<String> synonyms() throws IOException {
|
private List<String> synonyms() throws IOException {
|
||||||
return IOUtils
|
return IOUtils
|
||||||
.readLines(CleaningRuleTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
|
.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
|
||||||
}
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue