diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java
index 0099798f6..ca77be3c6 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJob.java
@@ -3,15 +3,18 @@ package eu.dnetlib.dhp.oa.graph.clean;
 
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 
-import java.util.List;
-import java.util.Optional;
-import java.util.Set;
+import java.util.*;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.FilterFunction;
+import org.apache.spark.api.java.function.FlatMapFunction;
 import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.api.java.function.MapGroupsFunction;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SaveMode;
@@ -24,15 +27,19 @@ import com.google.common.collect.Sets;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.common.action.model.MasterDuplicate;
 import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
+import eu.dnetlib.dhp.oa.graph.clean.cfhb.IdCfHbMapping;
 import eu.dnetlib.dhp.schema.common.ModelConstants;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.KeyValue;
 import eu.dnetlib.dhp.schema.oaf.Oaf;
 import eu.dnetlib.dhp.schema.oaf.OafEntity;
 import eu.dnetlib.dhp.schema.oaf.Result;
 import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import scala.Tuple2;
 
 public class CleanGraphSparkJob {
 
@@ -40,31 +47,43 @@ public class CleanGraphSparkJob {
 
     private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 
-    public static void main(String[] args) throws Exception {
+    private ArgumentApplicationParser parser;
+    public CleanGraphSparkJob(ArgumentApplicationParser parser) {
+        this.parser = parser;
+    }
+
+    public static void main(String[] args) throws Exception {
         String jsonConfiguration = IOUtils
-            .toString(
-                CleanGraphSparkJob.class
-                    .getResourceAsStream(
-                        "/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json"));
+                .toString(
+                    CleanGraphSparkJob.class
+                        .getResourceAsStream(
+                            "/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json"));
 
         final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
         parser.parseArgument(args);
 
         Boolean isSparkSessionManaged = Optional
-            .ofNullable(parser.get("isSparkSessionManaged"))
-            .map(Boolean::valueOf)
-            .orElse(Boolean.TRUE);
+                .ofNullable(parser.get("isSparkSessionManaged"))
+                .map(Boolean::valueOf)
+                .orElse(Boolean.TRUE);
         log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
 
+        String isLookupUrl = parser.get("isLookupUrl");
+        log.info("isLookupUrl: {}", isLookupUrl);
+
+        ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
+
+        new CleanGraphSparkJob(parser).run(isSparkSessionManaged, isLookup);
+    }
+
+    public void run(Boolean isSparkSessionManaged, ISLookUpService isLookUpService)
+        throws ISLookUpException, ClassNotFoundException {
+
         String inputPath = parser.get("inputPath");
         log.info("inputPath: {}", inputPath);
 
         String outputPath = parser.get("outputPath");
         log.info("outputPath: {}", outputPath);
 
-        String isLookupUrl = parser.get("isLookupUrl");
-        log.info("isLookupUrl: {}", isLookupUrl);
-
         String graphTableClassName = parser.get("graphTableClassName");
         log.info("graphTableClassName: {}", graphTableClassName);
 
@@ -80,27 +99,38 @@ public class CleanGraphSparkJob {
         String country = parser.get("country");
         log.info("country: {}", country);
 
-        String[] verifyCountryParam = parser.get("verifyCountryParam").split(";");
+        String[] verifyCountryParam = Optional.ofNullable(parser.get("verifyCountryParam"))
+            .map(s -> s.split(";"))
+            .orElse(new String[]{});
         log.info("verifyCountryParam: {}", verifyCountryParam);
 
         String collectedfrom = parser.get("collectedfrom");
         log.info("collectedfrom: {}", collectedfrom);
 
+        String dsMasterDuplicatePath = parser.get("masterDuplicatePath");
+        log.info("masterDuplicatePath: {}", dsMasterDuplicatePath);
+
+        Boolean deepClean = Optional
+            .ofNullable(parser.get("deepClean"))
+            .map(Boolean::valueOf)
+            .orElse(Boolean.FALSE);
+        log.info("deepClean: {}", deepClean);
+
         Class entityClazz = (Class) Class.forName(graphTableClassName);
 
-        final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
-        final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
+        final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookUpService);
 
         SparkConf conf = new SparkConf();
+        conf.setAppName(CleanGraphSparkJob.class.getSimpleName() + "#" + entityClazz.getSimpleName());
         runWithSparkSession(
-            conf,
-            isSparkSessionManaged,
-            spark -> {
-                HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
-                cleanGraphTable(
-                    spark, vocs, inputPath, entityClazz, outputPath, contextId, verifyParam, datasourcePath, country,
-                    verifyCountryParam, collectedfrom);
-            });
+                conf,
+                isSparkSessionManaged,
+                spark -> {
+                    HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
+                    cleanGraphTable(
+                        spark, vocs, inputPath, entityClazz, outputPath, contextId, verifyParam, datasourcePath, country,
+                        verifyCountryParam, collectedfrom, dsMasterDuplicatePath, deepClean);
+                });
     }
 
     private static void cleanGraphTable(
@@ -109,33 +139,74 @@ public class CleanGraphSparkJob {
         String inputPath, Class clazz,
         String outputPath, String contextId, String verifyParam, String datasourcePath, String country,
-        String[] verifyCountryParam, String collectedfrom) {
-
-        Set hostedBy = Sets
-            .newHashSet(
-                spark
-                    .read()
-                    .textFile(datasourcePath)
-                    .collectAsList());
+        String[] verifyCountryParam, String collectedfrom, String dsMasterDuplicatePath,
+        Boolean deepClean) {
 
         final CleaningRuleMap mapping = CleaningRuleMap.create(vocs);
 
-        readTableFromPath(spark, inputPath, clazz)
+        final Dataset cleaned_basic = readTableFromPath(spark, inputPath, clazz)
             .map((MapFunction) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
             .map((MapFunction) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
             .map((MapFunction) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz))
-            .filter((FilterFunction) GraphCleaningFunctions::filter)
-            .map(
-                (MapFunction) value -> GraphCleaningFunctions.cleanContext(value, contextId, verifyParam),
-                Encoders.bean(clazz))
-            .map(
-                (MapFunction) value -> GraphCleaningFunctions
-                    .cleanCountry(value, verifyCountryParam, hostedBy, collectedfrom, country),
-                Encoders.bean(clazz))
-            .write()
-            .mode(SaveMode.Overwrite)
-            .option("compression", "gzip")
-            .json(outputPath);
+            .filter((FilterFunction) GraphCleaningFunctions::filter);
+
+        if (Boolean.FALSE.equals(deepClean)) {
+            cleaned_basic
+                .write()
+                .mode(SaveMode.Overwrite)
+                .option("compression", "gzip")
+                .json(outputPath);
+
+        } else if (Boolean.TRUE.equals(ModelSupport.isSubClass(clazz, Result.class))) {
+
+            // read the master-duplicate tuples
+            Dataset md = spark
+                .read()
+                .textFile(dsMasterDuplicatePath)
+                .map(as(MasterDuplicate.class), Encoders.bean(MasterDuplicate.class));
+
+            // prepare the resolved CF|HB references with the corresponding EMPTY master ID
+            Dataset resolved = spark
+                .read()
+                .textFile(inputPath)
+                .map(as(clazz), Encoders.bean(clazz))
+                .flatMap(flattenCfHbFn(), Encoders.bean(IdCfHbMapping.class));
+
+            // set the EMPTY master ID/NAME and save it
+            resolved
+                .joinWith(md, resolved.col("cfhb").equalTo(md.col("duplicateId")))
+                .map(asIdCfHbMapping(), Encoders.bean(IdCfHbMapping.class))
+                .filter((FilterFunction) m -> Objects.nonNull(m.getMasterId()));
+
+            // load the hostedby mapping
+            Set hostedBy = Sets
+                .newHashSet(
+                    spark
+                        .read()
+                        .textFile(datasourcePath)
+                        .collectAsList());
+
+            // perform the deep cleaning steps
+            final Dataset cleaned_deep = cleaned_basic
+                .map(
+                    (MapFunction) value -> GraphCleaningFunctions.cleanContext(value, contextId, verifyParam),
+                    Encoders.bean(clazz))
+                .map(
+                    (MapFunction) value -> GraphCleaningFunctions
+                        .cleanCountry(value, verifyCountryParam, hostedBy, collectedfrom, country),
+                    Encoders.bean(clazz));
+
+            // Join the results with the resolved CF|HB mapping, apply the mapping and save it
+            cleaned_deep
+                .joinWith(resolved, cleaned_deep.col("id").equalTo(resolved.col("resultId")), "left")
+                .groupByKey(
+                    (MapFunction, String>) t -> ((Result) t._1()).getId(), Encoders.STRING())
+                .mapGroups(getMapGroupsFunction(), Encoders.bean(clazz))
+                .write()
+                .mode(SaveMode.Overwrite)
+                .option("compression", "gzip")
+                .json(outputPath);
+        }
     }
 
     private static Dataset readTableFromPath(
@@ -145,9 +216,98 @@ public class CleanGraphSparkJob {
         return spark
             .read()
             .textFile(inputEntityPath)
-            .map(
-                (MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz),
-                Encoders.bean(clazz));
+            .map(as(clazz), Encoders.bean(clazz));
+    }
+
+    private static MapFunction as(Class clazz) {
+        return s -> OBJECT_MAPPER.readValue(s, clazz);
+    }
+
+    private static FlatMapFunction flattenCfHbFn() {
+        return r -> Stream
+            .concat(
+                Optional
+                    .ofNullable(r.getCollectedfrom())
+                    .map(cf -> cf.stream().map(KeyValue::getKey))
+                    .orElse(Stream.empty()),
+                Stream
+                    .concat(
+                        Optional
+                            .ofNullable(((Result) r).getInstance())
+                            .map(
+                                instances -> instances
+                                    .stream()
+                                    .map(i -> Optional.ofNullable(i.getHostedby()).map(KeyValue::getKey).orElse("")))
+                            .orElse(Stream.empty())
+                            .filter(StringUtils::isNotBlank),
+                        Optional
+                            .ofNullable(((Result) r).getInstance())
+                            .map(
+                                instances -> instances
+                                    .stream()
+                                    .map(
+                                        i -> Optional
+                                            .ofNullable(i.getCollectedfrom())
+                                            .map(KeyValue::getKey)
+                                            .orElse("")))
+                            .orElse(Stream.empty())
+                            .filter(StringUtils::isNotBlank)))
+            .distinct()
+            .filter(StringUtils::isNotBlank)
+            .map(cfHb -> asIdCfHbMapping(((Result) r).getId(), cfHb))
+            .iterator();
+    }
+
+    private static MapFunction, IdCfHbMapping> asIdCfHbMapping() {
+        return t -> {
+            final IdCfHbMapping mapping = t._1();
+            Optional
+                .ofNullable(t._2())
+                .ifPresent(t2 -> {
+                    mapping.setMasterId(t2.getMasterId());
mapping.setMasterName(t2.getMasterName()); + + }); + return mapping; + }; + } + + private static IdCfHbMapping asIdCfHbMapping(String resultId, String cfHb) { + IdCfHbMapping m = new IdCfHbMapping(resultId); + m.setCfhb(cfHb); + return m; + } + + private static MapGroupsFunction, T> getMapGroupsFunction() { + return new MapGroupsFunction, T>() { + @Override + public T call(String key, Iterator> values) { + final Tuple2 first = values.next(); + final T res = first._1(); + + updateResult(res, first._2()); + values.forEachRemaining(t -> updateResult(res, t._2())); + return res; + } + + private void updateResult(T res, IdCfHbMapping m) { + if (Objects.nonNull(m)) { + res.getCollectedfrom().forEach(kv -> updateKeyValue(kv, m)); + ((Result) res).getInstance().forEach(i -> { + updateKeyValue(i.getHostedby(), m); + updateKeyValue(i.getCollectedfrom(), m); + }); + } + } + + private void updateKeyValue(final KeyValue kv, final IdCfHbMapping a) { + if (kv.getKey().equals(a.getCfhb())) { + kv.setKey(a.getMasterId()); + kv.setValue(a.getMasterName()); + } + } + + }; } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml index 2d6371a9b..b5179b1fc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml @@ -83,12 +83,17 @@ - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn @@ -110,10 +115,25 @@ --workingDir${workingDir}/working/hostedby --country${country} - + + + + eu.dnetlib.dhp.oa.graph.clean.MasterDuplicateAction + --postgresUrl${postgresURL} + --postgresUser${postgresUser} + --postgresPassword${postgresPassword} + --hdfsPath${workingDir}/masterduplicate + --hdfsNameNode${nameNode} + + + + + + + @@ -152,6 +172,8 @@ --verifyCountryParam${verifyCountryParam} --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} + --masterDuplicatePath${workingDir}/masterduplicate + --deepClean${shouldClean} @@ -184,6 +206,8 @@ --verifyCountryParam${verifyCountryParam} --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} + --masterDuplicatePath${workingDir}/masterduplicate + --deepClean${shouldClean} @@ -216,6 +240,8 @@ --verifyCountryParam${verifyCountryParam} --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} + --masterDuplicatePath${workingDir}/masterduplicate + --deepClean${shouldClean} @@ -248,6 +274,8 @@ --verifyCountryParam${verifyCountryParam} --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} + --masterDuplicatePath${workingDir}/masterduplicate + --deepClean${shouldClean} @@ -280,6 +308,8 @@ --verifyCountryParam${verifyCountryParam} --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} + --masterDuplicatePath${workingDir}/masterduplicate + --deepClean${shouldClean} @@ -312,6 +342,8 @@ --verifyCountryParam${verifyCountryParam} --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} + --masterDuplicatePath${workingDir}/masterduplicate + --deepClean${shouldClean} @@ -344,6 +376,8 @@ --verifyCountryParam${verifyCountryParam} --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} + --masterDuplicatePath${workingDir}/masterduplicate + --deepClean${shouldClean} @@ -376,206 +410,14 @@ 
--verifyCountryParam${verifyCountryParam} --hostedBy${workingDir}/working/hostedby --collectedfrom${collectedfrom} + --masterDuplicatePath${workingDir}/masterduplicate + --deepClean${shouldClean} - - - - - ${wf:conf('shouldClean') eq true} - - - - - - - eu.dnetlib.dhp.oa.graph.clean.MasterDuplicateAction - --postgresUrl${postgresURL} - --postgresUser${postgresUser} - --postgresPassword${postgresPassword} - --hdfsPath${workingDir}/masterduplicate - --hdfsNameNode${nameNode} - - - - - - - - - - - - - - - yarn - cluster - patch publication cfhb - eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --inputPath${graphOutputPath}/publication - --resolvedPath${workingDir}/cfHbResolved/publication - --outputPath${workingDir}/cfHbPatched/publication - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication - --masterDuplicatePath${workingDir}/masterduplicate - - - - - - - - yarn - cluster - patch dataset cfhb - eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --inputPath${graphOutputPath}/dataset - --resolvedPath${workingDir}/cfHbResolved/dataset - --outputPath${workingDir}/cfHbPatched/dataset - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset - --masterDuplicatePath${workingDir}/masterduplicate - - - - - - - - yarn - cluster - patch otherresearchproduct cfhb - eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --inputPath${graphOutputPath}/otherresearchproduct - --resolvedPath${workingDir}/cfHbResolved/otherresearchproduct - --outputPath${workingDir}/cfHbPatched/otherresearchproduct - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --masterDuplicatePath${workingDir}/masterduplicate - - - - - - - - yarn - cluster - patch software cfhb - eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf 
spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --inputPath${graphOutputPath}/software - --resolvedPath${workingDir}/cfHbResolved/software - --outputPath${workingDir}/cfHbPatched/software - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software - --masterDuplicatePath${workingDir}/masterduplicate - - - - - - - - - - - - - - - - - - - - ${workingDir}/cfHbPatched/publication - ${graphOutputPath}/publication - - - - - - - - - - - ${workingDir}/cfHbPatched/dataset - ${graphOutputPath}/dataset - - - - - - - - - - - ${workingDir}/cfHbPatched/otherresearchproduct - ${graphOutputPath}/otherresearchproduct - - - - - - - - - - - ${workingDir}/cfHbPatched/software - ${graphOutputPath}/software - - - - - - + diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json index 928215316..0a703763b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json @@ -33,30 +33,48 @@ "paramName": "ci", "paramLongName": "contextId", "paramDescription": "the id of the context to be removed", - "paramRequired": true + "paramRequired": false + }, + { + "paramName": "vf", + "paramLongName": "verifyParam", + "paramDescription": "the parameter to be verified to remove the context", + "paramRequired": false }, { "paramName": "c", "paramLongName": "country", "paramDescription": "the id of the context to be removed", - "paramRequired": true + "paramRequired": false }, { "paramName": "vfc", "paramLongName": "verifyCountryParam", "paramDescription": "the parameter to be verified to remove the country", - "paramRequired": true + "paramRequired": false }, { "paramName": "cf", "paramLongName": "collectedfrom", "paramDescription": "the collectedfrom value for which we should apply the cleaning", - "paramRequired": true + "paramRequired": false }, { "paramName": "hb", "paramLongName": "hostedBy", "paramDescription": "the set of datasources having the specified country in the graph searched for in the hostedby of the results", + "paramRequired": false + }, + { + "paramName": "md", + "paramLongName": "masterDuplicatePath", + "paramDescription": "path to the file on HDFS holding the datasource id tuples [master, duplicate]", + "paramRequired": false + }, + { + "paramName": "dc", + "paramLongName": "deepClean", + "paramDescription": "flag to activate further cleaning steps", "paramRequired": true } ] diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJobTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJobTest.java new file mode 100644 index 000000000..f01c53d5f --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleanGraphSparkJobTest.java @@ -0,0 +1,435 @@ +package eu.dnetlib.dhp.oa.graph.clean; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; 
+import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import org.apache.commons.cli.ParseException; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.IOUtils; +import org.apache.commons.io.filefilter.*; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.ForeachFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.junit.jupiter.MockitoExtension; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Collection; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.lenient; + +@ExtendWith(MockitoExtension.class) +public class CleanGraphSparkJobTest { + + private static final Logger log = LoggerFactory.getLogger(CleanContextTest.class); + + public static final ObjectMapper MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + + @Mock + private ISLookUpService isLookUpService; + + private VocabularyGroup vocabularies; + + private CleaningRuleMap mapping; + + private static SparkSession spark; + + private static Path workingDir; + + private static Path testBaseTmpPath; + + private static String graphInputPath; + + private static String graphOutputPath; + + private static String dsMasterDuplicatePath; + + @BeforeAll + public static void beforeAll() throws IOException, URISyntaxException { + testBaseTmpPath = Files.createTempDirectory(CleanGraphSparkJobTest.class.getSimpleName()); + log.info("using test base path {}", testBaseTmpPath); + + File basePath = Paths + .get(CleanGraphSparkJobTest.class.getResource("/eu/dnetlib/dhp/oa/graph/clean/graph").toURI()) + .toFile(); + + + List paths = FileUtils + .listFilesAndDirs(basePath, FalseFileFilter.FALSE, TrueFileFilter.TRUE) + .stream() + .filter(f -> !f.getAbsolutePath().endsWith("/graph")) + .collect(Collectors.toList()); + + for(File path : paths) { + String type = StringUtils.substringAfterLast(path.getAbsolutePath(), "/"); + FileUtils + .copyDirectory( + path, + testBaseTmpPath.resolve("input").resolve("graph").resolve(type).toFile()); + } + + FileUtils + .copyFileToDirectory( + Paths + .get( + CleanGraphSparkJobTest.class + .getResource("/eu/dnetlib/dhp/oa/graph/clean/cfhb/masterduplicate.json") + .toURI()) + .toFile(), + testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toFile()); + + graphInputPath = testBaseTmpPath.resolve("input").resolve("graph").toString(); + graphOutputPath = testBaseTmpPath.resolve("output").resolve("graph").toString(); + dsMasterDuplicatePath = testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toString(); + + 
+ + workingDir = Files.createTempDirectory(CleanGraphSparkJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(CleanGraphSparkJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .config(conf) + .getOrCreate(); + } + + @BeforeEach + public void setUp() throws ISLookUpException, IOException { + lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs()); + lenient() + .when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY)) + .thenReturn(synonyms()); + + vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService); + mapping = CleaningRuleMap.create(vocabularies); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + void testCleanRelations() throws Exception { + + spark.read() + .textFile(graphInputPath.toString() + "/relation") + .map(as(Relation.class), Encoders.bean(Relation.class)) + .collectAsList() + .forEach(r -> assertFalse(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r.getRelClass()))); + + new CleanGraphSparkJob( + args("/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json", + new String[] { + "--inputPath", graphInputPath.toString() + "/relation", + "--outputPath", graphOutputPath.toString() + "/relation", + "--isLookupUrl", "lookupurl", + "--graphTableClassName", Relation.class.getCanonicalName(), + "--deepClean", "false" + })).run(false, isLookUpService); + + spark.read() + .textFile(graphOutputPath.toString() + "/relation") + .map(as(Relation.class), Encoders.bean(Relation.class)) + .collectAsList() + .forEach(r -> { + + assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r.getRelClass())); + assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_SUBRELTYPE).contains(r.getSubRelType())); + + assertEquals("iis", r.getDataInfo().getProvenanceaction().getClassid()); + assertEquals("Inferred by OpenAIRE", r.getDataInfo().getProvenanceaction().getClassname()); + }); + } + + @Test + void testFilter_invisible_true() throws Exception { + + assertNotNull(vocabularies); + assertNotNull(mapping); + + String json = IOUtils + .toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_invisible.json")); + Publication p_in = MAPPER.readValue(json, Publication.class); + + assertTrue(p_in instanceof Result); + assertTrue(p_in instanceof Publication); + + assertEquals(true, GraphCleaningFunctions.filter(p_in)); + } + + @Test + void testFilter_true_nothing_to_filter() throws Exception { + + assertNotNull(vocabularies); + assertNotNull(mapping); + + String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json")); + Publication p_in = MAPPER.readValue(json, Publication.class); + + assertTrue(p_in instanceof Result); + assertTrue(p_in instanceof Publication); + + assertEquals(true, GraphCleaningFunctions.filter(p_in)); + } + + @Test + void testFilter_missing_invisible() throws Exception { + + assertNotNull(vocabularies); + assertNotNull(mapping); + + String json = IOUtils + 
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_missing_invisible.json")); + Publication p_in = MAPPER.readValue(json, Publication.class); + + assertTrue(p_in instanceof Result); + assertTrue(p_in instanceof Publication); + + assertEquals(true, GraphCleaningFunctions.filter(p_in)); + } + + @Test + void testCleaning_publication() throws Exception { + + spark.read() + .textFile(graphInputPath.toString() + "/publication") + .map(as(Publication.class), Encoders.bean(Publication.class)) + .collectAsList() + .forEach(p -> { + assertNull(p.getBestaccessright()); + assertTrue(p instanceof Result); + assertTrue(p instanceof Publication); + }); + + new CleanGraphSparkJob( + args("/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json", + new String[] { + "--inputPath", graphInputPath.toString() + "/publication", + "--outputPath", graphOutputPath.toString() + "/publication", + "--isLookupUrl", "lookupurl", + "--graphTableClassName", Publication.class.getCanonicalName(), + "--deepClean", "false" + })).run(false, isLookUpService); + + Publication p = spark.read() + .textFile(graphOutputPath.toString() + "/publication") + .map(as(Publication.class), Encoders.bean(Publication.class)) + .first(); + + assertNull(p.getPublisher()); + + assertEquals("und", p.getLanguage().getClassid()); + assertEquals("Undetermined", p.getLanguage().getClassname()); + + assertEquals("DE", p.getCountry().get(0).getClassid()); + assertEquals("Germany", p.getCountry().get(0).getClassname()); + + assertEquals("0018", p.getInstance().get(0).getInstancetype().getClassid()); + assertEquals("Annotation", p.getInstance().get(0).getInstancetype().getClassname()); + + assertEquals("0027", p.getInstance().get(1).getInstancetype().getClassid()); + assertEquals("Model", p.getInstance().get(1).getInstancetype().getClassname()); + + assertEquals("0038", p.getInstance().get(2).getInstancetype().getClassid()); + assertEquals("Other literature type", p.getInstance().get(2).getInstancetype().getClassname()); + + assertEquals("CLOSED", p.getInstance().get(0).getAccessright().getClassid()); + assertEquals("Closed Access", p.getInstance().get(0).getAccessright().getClassname()); + + Set pidTerms = vocabularies.getTerms(ModelConstants.DNET_PID_TYPES); + assertTrue( + p + .getPid() + .stream() + .map(StructuredProperty::getQualifier) + .allMatch(q -> pidTerms.contains(q.getClassid()))); + + List poi = p.getInstance(); + assertNotNull(poi); + assertEquals(3, poi.size()); + + final Instance poii = poi.get(0); + assertNotNull(poii); + assertNotNull(poii.getPid()); + + assertEquals(2, poii.getPid().size()); + + assertTrue( + poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x"))); + assertTrue(poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd"))); + + assertNotNull(poii.getAlternateIdentifier()); + assertEquals(1, poii.getAlternateIdentifier().size()); + + assertTrue( + poii + .getAlternateIdentifier() + .stream() + .anyMatch(s -> s.getValue().equals("10.1009/qwerty"))); + + assertEquals(3, p.getTitle().size()); + + + List titles = p + .getTitle() + .stream() + .map(StructuredProperty::getValue) + .collect(Collectors.toList()); + assertTrue(titles.contains("omic")); + assertTrue( + titles.contains("Optical response of strained- and unstrained-silicon cold-electron bolometers test")); + assertTrue(titles.contains("「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳")); + + assertEquals("CLOSED", p.getBestaccessright().getClassid()); + 
assertNull(p.getPublisher()); + + assertEquals("1970-10-07", p.getDateofacceptance().getValue()); + + assertEquals("0038", p.getInstance().get(2).getInstancetype().getClassid()); + assertEquals("Other literature type", p.getInstance().get(2).getInstancetype().getClassname()); + + final List pci = p.getInstance(); + assertNotNull(pci); + assertEquals(3, pci.size()); + + final Instance pcii = pci.get(0); + assertNotNull(pcii); + assertNotNull(pcii.getPid()); + + assertEquals(2, pcii.getPid().size()); + + assertTrue( + pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x"))); + assertTrue(pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd"))); + + assertNotNull(pcii.getAlternateIdentifier()); + assertEquals(1, pcii.getAlternateIdentifier().size()); + assertTrue( + pcii + .getAlternateIdentifier() + .stream() + .anyMatch(s -> s.getValue().equals("10.1009/qwerty"))); + + assertNotNull(p.getSubject()); + + List fos_subjects = p + .getSubject() + .stream() + .filter(s -> ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())) + .collect(Collectors.toList()); + + assertNotNull(fos_subjects); + assertEquals(2, fos_subjects.size()); + + assertTrue( + fos_subjects + .stream() + .anyMatch( + s -> "0101 mathematics".equals(s.getValue()) & + ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) & + "sysimport:crosswalk:datasetarchive" + .equals(s.getDataInfo().getProvenanceaction().getClassid()))); + + assertTrue( + fos_subjects + .stream() + .anyMatch( + s -> "0102 computer and information sciences".equals(s.getValue()) & + ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))); + + verify_keyword(p, "In Situ Hybridization"); + verify_keyword(p, "Avicennia"); + } + + private List vocs() throws IOException { + return IOUtils + .readLines( + GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt")); + } + + private List synonyms() throws IOException { + return IOUtils + .readLines( + GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt")); + } + + private static MapFunction as(Class clazz) { + return s -> MAPPER.readValue(s, clazz); + } + + private static String classPathResourceAsString(String path) throws IOException { + return IOUtils + .toString( + CleanGraphSparkJobTest.class + .getResourceAsStream(path)); + } + + private ArgumentApplicationParser args(String paramSpecs, String[] args) throws IOException, ParseException { + ArgumentApplicationParser parser = new ArgumentApplicationParser(classPathResourceAsString(paramSpecs)); + parser.parseArgument(args); + return parser; + } + + private static void verify_keyword(Publication p_cleaned, String subject) { + Optional s1 = p_cleaned + .getSubject() + .stream() + .filter(s -> s.getValue().equals(subject)) + .findFirst(); + + assertTrue(s1.isPresent()); + assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get().getQualifier().getClassid()); + assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get().getQualifier().getClassname()); + } + + private Stream getAuthorPids(Result pub) { + return pub + .getAuthor() + .stream() + .map(Author::getPid) + .flatMap(Collection::stream); + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index fc7c6e5f1..24b942f4d 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -13,7 +13,6 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.io.IOUtils; -import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.MappableBlock; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -59,7 +58,7 @@ public class GraphCleaningFunctionsTest { void testCleanRelations() throws Exception { List lines = IOUtils - .readLines(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/relation.json")); + .readLines(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/graph/relation/relation.json")); for (String json : lines) { Relation r_in = MAPPER.readValue(json, Relation.class); assertNotNull(r_in); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/cfhb/entities/dataset/dataset.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/graph/dataset/dataset.json similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/cfhb/entities/dataset/dataset.json rename to dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/graph/dataset/dataset.json diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/graph/publication/publication.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/graph/publication/publication.json new file mode 100644 index 000000000..5bac26fdc --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/graph/publication/publication.json @@ -0,0 +1 @@ +{"author":[{"affiliation":[],"fullname":"Brien, Tom","name":"Tom","pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"ORCID12","classname":"ORCID12","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"0000-0001-9613-6639"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"ORCID12","classname":"ORCID12","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"https://orcid.org/0000-0001-9613-6639"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"orcid","classname":"ORCID12","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"0000-0001-9613-6639"}],"rank":1,"surname":"Brien"},{"affiliation":[],"fullname":"Ade, 
Peter","name":"Peter","pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"xyz","classname":"XYZ","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"qwerty"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"ORCID","classname":"ORCID","schemeid":"","schemename":""},"value":"asdasd"}],"rank":2,"surname":"Ade"},{"affiliation":[],"fullname":"Barry, Peter S.","name":"Peter S.","pid":null,"rank":3,"surname":"Barry"},{"affiliation":[],"fullname":"Dunscombe, Chris J.","name":"Chris J.","pid":[],"rank":4,"surname":"Dunscombe"},{"affiliation":[],"fullname":"Leadley, David R.","name":"David R.","pid":[],"rank":5,"surname":"Leadley"},{"affiliation":[],"fullname":"Morozov, Dmitry V.","name":"Dmitry V.","pid":[],"rank":6,"surname":"Morozov"},{"affiliation":[],"fullname":"Myronov, Maksym","name":"Maksym","pid":[],"rank":7,"surname":"Myronov"},{"affiliation":[],"fullname":"Parker, Evan","name":"Evan","pid":[],"rank":8,"surname":"Parker"},{"affiliation":[],"fullname":"Prest, Martin J.","name":"Martin J.","pid":[],"rank":9,"surname":"Prest"},{"affiliation":[],"fullname":"Prunnila, Mika","name":"Mika","pid":[],"rank":10,"surname":"Prunnila"},{"affiliation":[],"fullname":"Sudiwala, Rashmi V.","name":"Rashmi V.","pid":[],"rank":11,"surname":"Sudiwala"},{"affiliation":[],"fullname":"Whall, Terry E.","name":"Terry E.","pid":[],"rank":12,"surname":"Whall"},{"affiliation":[],"fullname":"Mauskopf","name":"","pid":[],"rank":13,"surname":""},{"affiliation":[],"fullname":" P. D. 
","name":"","pid":[],"rank":14,"surname":""}],"bestaccessright":null,"publisher":{"value":null},"collectedfrom":[{"key":"10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747","value":"VIRTA"}],"context":[],"contributor":[],"country":[{"classid":"DE","classname":"DE","schemeid":"dnet:countries","schemename":"dnet:countries"}],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"7 oct 1970"},"dateofcollection":"","dateoftransformation":"2020-04-22T12:34:08.009Z","description":[],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|CSC_________::2250a70c903c6ac6e4c01438259e9375","instance":[{"pid":[{"dataInfo":null,"qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1007/s109090161569x"},{"dataInfo":null,"qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1008/abcd"}],"alternateIdentifier":[{"dataInfo":null,"qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1007/s109090161569x"},{"dataInfo":null,"qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1009/qwerty"}],"accessright":{"classid":"CLOSED","classname":"CLOSED","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"key":"10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747","value":"VIRTA"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2016-01-01"},"distributionlocation":"","hostedby":{"key":"10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747","value":"VIRTA"},"instancetype":{"classid":"Comment/debate","classname":"Comment/debate","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"url":["http://juuli.fi/Record/0275158616","http://dx.doi.org/10.1007/s109090161569x"]},{"pid":[{"dataInfo":null,"qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1002/s21010127267xy"},{"dataInfo":null,"qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1008/abcd"}],"alternateIdentifier":[{"dataInfo":null,"qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1007/s109090161569x"},{"dataInfo":null,"qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1009/qwerty"}],"accessright":{"classid":"CLOSED","classname":"CLOSED","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"key":"
10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747","value":"VIRTA"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2016-01-01"},"distributionlocation":"","hostedby":{"key":"10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747","value":"VIRTA"},"instancetype":{"classid":"Model","classname":"Model","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"url":["http://dx.doi.org/10.1002/s21010127267xy"]},{"pid":[{"dataInfo":null,"qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1002/s21010127267xy"},{"dataInfo":null,"qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1008/abcd"}],"alternateIdentifier":[{"dataInfo":null,"qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1007/s109090161569x"},{"dataInfo":null,"qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1009/qwerty"}],"accessright":{"classid":"CLOSED","classname":"CLOSED","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"key":"10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747","value":"VIRTA"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2016-01-01"},"distributionlocation":"","hostedby":{"key":"10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747","value":"VIRTA"},"instancetype":{"classid":"xyz","classname":"xyz","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"url":["http://dx.doi.org/10.1002/t32121238378t"]}],"journal":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"edition":"","ep":" 7","iss":"9 March","issnLinking":"","issnOnline":"","issnPrinted":"0022-2291","name":"Journal of Low Temperature Physics - Early Acces","sp":"1 
","vol":""},"language":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1591283286319,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fvirta-jtp.csc.fi%2Fapi%2Fcerif","datestamp":"2019-07-30","harvestDate":"2020-04-22T11:04:38.685Z","identifier":"oai:virta-jtp.csc.fi:Publications/0275158616","metadataNamespace":""}},"originalId":["CSC_________::2250a70c903c6ac6e4c01438259e9375"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1007/s109090161569x"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1007/s109090161569x"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"doi","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":""}],"relevantdate":[],"resourcetype":{"classid":"0001","classname":"0001","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"sysimport:crosswalk:repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"deletedbyinference":false,"inferred":false,"inferenceprovenance":"","invisible":false,"trust":"0.9"},"qualifier":{"classid":"FOS","classname":"Fields of Science and Technology classification","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"In Situ Hybridization"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"ta213"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"FOS: 
Mathematics"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"FOS: Computer and information sciences"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"0101 mathematics"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"subject:fos","classname":"subject:fos","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"0101 mathematics"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"slot antennas"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"strained silicon"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"cold electron bolometers"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:actionset","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"FOS","classname":"Fields of Science and Technology 
classification","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Avicennia"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"measure noise"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"noise equivalent power"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"optical characterisation"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"optical response"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"photon noise"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"","classname":"","schemeid":"","schemename":""},"value":"silicon absorbers"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Optical response of strained- and unstrained-silicon cold-electron bolometers test"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main 
title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"test test 123 test"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"omic"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"sysimport:crosswalk:datasetarchive","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"-"}]} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/relation.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/graph/relation/relation.json similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/relation.json rename to dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/graph/relation/relation.json diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/logback.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/logback.xml new file mode 100644 index 000000000..77a7627b5 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/logback.xml @@ -0,0 +1,11 @@ + + + + %d{HH:mm:ss.SSS} %-5level %logger{36} - %msg%n + + + + + + + \ No newline at end of file