WIP: graph cleaner implementation

Branch: master
Author: Claudio Atzori, 4 years ago
Parent: bed65a1be6
Commit: 0d52816244

@@ -2,6 +2,7 @@
 package eu.dnetlib.dhp.common;
 
 import java.io.Serializable;
+import java.util.function.Consumer;
 import java.util.function.Supplier;
 
 /** Provides serializable and throwing extensions to standard functional interfaces. */
@@ -10,6 +11,16 @@ public class FunctionalInterfaceSupport {
 	private FunctionalInterfaceSupport() {
 	}
 
+	/**
+	 * Serializable consumer of any kind of objects. To be used within spark processing pipelines when supplying
+	 * functions externally.
+	 *
+	 * @param <T>
+	 */
+	@FunctionalInterface
+	public interface SerializableConsumer<T> extends Consumer<T>, Serializable {
+	}
+
 	/**
 	 * Serializable supplier of any kind of objects. To be used within spark processing pipelines when supplying
 	 * functions externally.
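
The new SerializableConsumer mirrors the existing SerializableSupplier: it only adds Serializable to java.util.function.Consumer, so a lambda supplied from outside can be captured by a Spark closure without a NotSerializableException. A minimal sketch of the intended usage; the forEach helper here is illustrative, not part of the codebase:

    import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;

    public class SerializableConsumerSketch {

        // Hypothetical helper accepting externally supplied, serializable logic.
        static <T> void forEach(Iterable<T> records, SerializableConsumer<T> action) {
            records.forEach(action);
        }

        public static void main(String[] args) {
            // The method reference is both a Consumer<String> and Serializable.
            forEach(java.util.Arrays.asList("a", "b"), System.out::println);
        }
    }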

@@ -3,19 +3,9 @@ package eu.dnetlib.dhp.oa.graph.clean;
 
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 
-import java.beans.BeanInfo;
-import java.beans.IntrospectionException;
-import java.beans.Introspector;
-import java.beans.PropertyDescriptor;
-import java.lang.reflect.InvocationTargetException;
-import java.util.Map;
-import java.util.Objects;
 import java.util.Optional;
-import java.util.TreeMap;
-import java.util.stream.Collectors;
 
 import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
@@ -33,11 +23,10 @@ import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
 import eu.dnetlib.dhp.schema.oaf.*;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
-import scala.Predef;
 
-public class CleanGraphProperties {
+public class CleanGraphSparkJob {
 
-	private static final Logger log = LoggerFactory.getLogger(CleanGraphProperties.class);
+	private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class);
 
 	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@@ -45,7 +34,7 @@ public class CleanGraphProperties {
 		String jsonConfiguration = IOUtils
 			.toString(
-				CleanGraphProperties.class
+				CleanGraphSparkJob.class
 					.getResourceAsStream(
 						"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json"));
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
@@ -91,13 +80,14 @@ public class CleanGraphProperties {
 			Class<T> clazz,
 			String outputPath) {
 
-		CleaningRule<T> rule = new CleaningRule<>(vocs);
+		final CleaningRuleMap mapping = CleaningRuleMap.create(vocs);
 
 		readTableFromPath(spark, inputPath, clazz)
-			.map(rule, Encoders.bean(clazz))
+			.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
 			.write()
 			.mode(SaveMode.Overwrite)
-			.parquet(outputPath);
+			.option("compression", "gzip")
+			.json(outputPath);
 	}
 
 	private static <T extends Oaf> Dataset<T> readTableFromPath(
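
Two details of the new write path are worth noting: the lambda must be cast to (MapFunction<T, T>) because Dataset.map is overloaded for both Scala's Function1 and Java's MapFunction, so an uncast lambda is ambiguous; and the output switches from parquet to gzip-compressed JSON text. A self-contained sketch of the same pattern, with illustrative data and output path:

    import org.apache.spark.api.java.function.MapFunction;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SaveMode;
    import org.apache.spark.sql.SparkSession;

    public class MapAndWriteSketch {
        public static void main(String[] args) {
            SparkSession spark = SparkSession.builder().master("local").getOrCreate();

            Dataset<String> ds = spark
                .createDataset(java.util.Arrays.asList("a", "b"), Encoders.STRING());

            ds
                // the cast selects the Java MapFunction overload of Dataset.map
                .map((MapFunction<String, String>) String::toUpperCase, Encoders.STRING())
                .write()
                .mode(SaveMode.Overwrite)
                // same options as the cleaning job: gzip-compressed JSON text files
                .option("compression", "gzip")
                .json("/tmp/map-and-write-sketch");

            spark.stop();
        }
    }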

@@ -1,43 +1,23 @@
 package eu.dnetlib.dhp.oa.graph.clean;
 
-import java.util.Map;
-import java.util.function.Consumer;
-
-import org.apache.spark.api.java.function.MapFunction;
-
-import com.google.common.collect.Maps;
+import java.io.Serializable;
+import java.util.HashMap;
 
+import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
-import eu.dnetlib.dhp.schema.oaf.Oaf;
 import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 
-public class CleaningRule<T extends Oaf> implements MapFunction<T, T> {
-
-	private VocabularyGroup vocabularies;
-
-	private Map<Class, Consumer<Object>> mapping = Maps.newHashMap();
-
-	public CleaningRule(VocabularyGroup vocabularies) {
-		this.vocabularies = vocabularies;
-		setMappings(vocabularies);
-	}
-
-	@Override
-	public T call(T value) throws Exception {
-		OafNavigator.apply(value, mapping);
-		return value;
-	}
+public class CleaningRuleMap extends HashMap<Class, SerializableConsumer<Object>> implements Serializable {
 
 	/**
-	 * Populates the mapping for the Oaf types subject to cleaning
+	 * Creates the mapping for the Oaf types subject to cleaning
 	 *
 	 * @param vocabularies
 	 */
-	private void setMappings(VocabularyGroup vocabularies) {
+	public static CleaningRuleMap create(VocabularyGroup vocabularies) {
+		CleaningRuleMap mapping = new CleaningRuleMap();
 		mapping.put(Qualifier.class, o -> {
 			Qualifier q = (Qualifier) o;
 			if (vocabularies.vocabularyExists(q.getSchemeid())) {
@@ -54,10 +34,7 @@ public class CleaningRule<T extends Oaf> implements MapFunction<T, T> {
 		 * }
 		 */
 		});
-	}
-
-	public VocabularyGroup getVocabularies() {
-		return vocabularies;
+		return mapping;
 	}
 }
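
CleaningRuleMap is now a plain, serializable, type-indexed registry: each entry maps a concrete Oaf model class to the consumer that cleans instances of it. The dispatch idea, sketched with JDK types only; RuleMap and the registered rule are illustrative stand-ins, not the project's API:

    import java.io.Serializable;
    import java.util.HashMap;
    import java.util.function.Consumer;

    // Illustrative stand-in for CleaningRuleMap: runtime class -> cleaning action.
    class RuleMap extends HashMap<Class<?>, Consumer<Object>> implements Serializable {

        static RuleMap create() {
            RuleMap mapping = new RuleMap();
            mapping.put(String.class, o -> System.out.println("clean string: " + o));
            return mapping;
        }

        public static void main(String[] args) {
            RuleMap rules = RuleMap.create();
            Object value = "  dirty  ";
            // dispatch on the runtime class, as the cleaner does while walking the graph
            Consumer<Object> rule = rules.get(value.getClass());
            if (rule != null) {
                rule.accept(value);
            }
        }
    }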

@@ -1,15 +1,18 @@
 package eu.dnetlib.dhp.oa.graph.clean;
 
+import java.io.Serializable;
 import java.lang.reflect.Field;
-import java.util.*;
-import java.util.function.Consumer;
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Objects;
 
 import eu.dnetlib.dhp.schema.oaf.Oaf;
 
-public class OafNavigator {
+public class OafCleaner implements Serializable {
 
-	public static <E extends Oaf> E apply(E oaf, Map<Class, Consumer<Object>> mapping) {
+	public static <E extends Oaf> E apply(E oaf, CleaningRuleMap mapping) {
 		try {
 			navigate(oaf, mapping);
 		} catch (IllegalAccessException e) {
@@ -18,7 +21,7 @@ public class OafNavigator {
 		return oaf;
 	}
 
-	private static void navigate(Object o, Map<Class, Consumer<Object>> mapping) throws IllegalAccessException {
+	private static void navigate(Object o, CleaningRuleMap mapping) throws IllegalAccessException {
 		if (isPrimitive(o)) {
 			return;
 		} else if (isIterable(o.getClass())) {
@@ -40,7 +43,7 @@ public class OafNavigator {
 		}
 	}
 
-	private static boolean hasMapping(Object o, Map<Class, Consumer<Object>> mapping) {
+	private static boolean hasMapping(Object o, CleaningRuleMap mapping) {
 		return mapping.containsKey(o.getClass());
 	}

@@ -72,18 +72,17 @@
             <master>yarn</master>
             <mode>cluster</mode>
             <name>Clean publications</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
             </spark-opts>
             <arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
             <arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>

@@ -99,18 +98,17 @@
             <master>yarn</master>
             <mode>cluster</mode>
             <name>Clean datasets</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
             </spark-opts>
             <arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
             <arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>

@@ -126,18 +124,17 @@
             <master>yarn</master>
             <mode>cluster</mode>
             <name>Clean otherresearchproducts</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
             </spark-opts>
             <arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
             <arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>

@@ -153,18 +150,17 @@
             <master>yarn</master>
             <mode>cluster</mode>
             <name>Clean softwares</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
             </spark-opts>
             <arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
             <arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>

@@ -180,18 +176,17 @@
             <master>yarn</master>
             <mode>cluster</mode>
             <name>Clean datasources</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
             </spark-opts>
             <arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
             <arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>

@@ -207,18 +202,17 @@
             <master>yarn</master>
             <mode>cluster</mode>
             <name>Clean organizations</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
             </spark-opts>
             <arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
             <arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>

@@ -234,18 +228,17 @@
             <master>yarn</master>
             <mode>cluster</mode>
             <name>Clean projects</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
             </spark-opts>
             <arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
             <arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>

@@ -261,18 +254,17 @@
             <master>yarn</master>
             <mode>cluster</mode>
             <name>Clean relations</name>
-            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphProperties</class>
+            <class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
             <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
             <spark-opts>
-                --executor-cores=${sparkExecutorCoresForJoining}
-                --executor-memory=${sparkExecutorMemoryForJoining}
-                --driver-memory=${sparkDriverMemoryForJoining}
+                --executor-cores=${sparkExecutorCores}
+                --executor-memory=${sparkExecutorMemory}
+                --driver-memory=${sparkDriverMemory}
                 --conf spark.extraListeners=${spark2ExtraListeners}
                 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                 --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                 --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
                 --conf spark.sql.shuffle.partitions=7680
-                --conf spark.network.timeout=${sparkNetworkTimeout}
             </spark-opts>
             <arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
             <arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>

@@ -5,32 +5,27 @@ import static org.junit.jupiter.api.Assertions.*;
 import static org.mockito.Mockito.lenient;
 
 import java.io.IOException;
-import java.util.HashSet;
 import java.util.List;
-import java.util.Objects;
 import java.util.Set;
-import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
 import org.apache.commons.io.IOUtils;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.extension.ExtendWith;
-import org.junit.platform.commons.util.StringUtils;
 import org.mockito.Mock;
 import org.mockito.junit.jupiter.MockitoExtension;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
 
 import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
-import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyTerm;
 import eu.dnetlib.dhp.schema.oaf.Publication;
 import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 
 @ExtendWith(MockitoExtension.class)
-public class CleaningRuleTest {
+public class CleaningFunctionTest {
 
 	public static final ObjectMapper MAPPER = new ObjectMapper();
@@ -39,7 +34,7 @@ public class CleaningRuleTest {
 
 	private VocabularyGroup vocabularies;
 
-	private CleaningRule<Publication> cleaningRule;
+	private CleaningRuleMap mapping;
 
 	@BeforeEach
 	public void setUp() throws ISLookUpException, IOException {
@@ -49,18 +44,19 @@ public class CleaningRuleTest {
 			.thenReturn(synonyms());
 
 		vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
-		cleaningRule = new CleaningRule(vocabularies);
+		mapping = CleaningRuleMap.create(vocabularies);
 	}
 
 	@Test
 	public void testCleaning() throws Exception {
 
-		assertNotNull(cleaningRule.getVocabularies());
+		assertNotNull(vocabularies);
+		assertNotNull(mapping);
 
 		String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
 		Publication p_in = MAPPER.readValue(json, Publication.class);
 
-		Publication p_out = cleaningRule.call(p_in);
+		Publication p_out = OafCleaner.apply(p_in, mapping);
 
 		assertNotNull(p_out);
@@ -100,11 +96,11 @@ public class CleaningRuleTest {
 	private List<String> vocs() throws IOException {
 		return IOUtils
-			.readLines(CleaningRuleTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
+			.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
 	}
 
 	private List<String> synonyms() throws IOException {
 		return IOUtils
-			.readLines(CleaningRuleTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
+			.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
 	}
 }