diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java index 0b2cd571f..654fdd5ac 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java @@ -28,7 +28,7 @@ public class HdfsSupport { * @param configuration Configuration of hadoop env */ public static boolean exists(String path, Configuration configuration) { - logger.info("Removing path: {}", path); + logger.info("Checking existence for path: {}", path); return rethrowAsRuntimeException( () -> { Path f = new Path(path); diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index d8b1cded8..592580ab8 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -27,8 +27,11 @@ public class GraphCleaningFunctions extends CleaningFunctions { public static final int ORCID_LEN = 19; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*"; - public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]"; - public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10; + + public static final String TITLE_TEST = "test"; + public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST); + + public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5; public static T fixVocabularyNames(T value) { if (value instanceof Datasource) { @@ -195,10 +198,16 @@ public class GraphCleaningFunctions extends CleaningFunctions { final String title = sp .getValue() .toLowerCase(); - final String residual = Unidecode - .decode(title) - .replaceAll(TITLE_FILTER_REGEX, ""); - return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH; + final String decoded = Unidecode.decode(title); + + if (StringUtils.contains(decoded, TITLE_TEST)) { + return decoded + .replaceAll(TITLE_FILTER_REGEX, "") + .length() > TITLE_FILTER_RESIDUAL_LENGTH; + } + return !decoded + .replaceAll("\\W|\\d", "") + .isEmpty(); }) .map(GraphCleaningFunctions::cleanValue) .collect(Collectors.toList())); diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java index 6a86f30df..5a59bc0df 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java @@ -4,19 +4,19 @@ package eu.dnetlib.dhp.utils; import java.io.*; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; -import java.util.List; -import java.util.Map; -import java.util.Properties; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; +import java.util.*; +import java.util.stream.Collectors; -import org.apache.commons.codec.binary.Base64; -import org.apache.commons.codec.binary.Base64OutputStream; import org.apache.commons.codec.binary.Hex; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import 
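Note (not part of the patch): a minimal sketch of how the reworked title filter above behaves, with the Unidecode.decode step replaced by a plain toLowerCase() for illustration; the sample titles are assumptions.

public class TitleFilterSketch {
    static final String TITLE_TEST = "test";
    static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST);
    static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;

    static boolean keep(String title) {
        final String decoded = title.toLowerCase(); // stands in for Unidecode.decode(title)
        if (decoded.contains(TITLE_TEST)) {
            // titles mentioning "test" survive only if enough text is left once
            // "test", non-word characters and digits are stripped out
            return decoded.replaceAll(TITLE_FILTER_REGEX, "").length() > TITLE_FILTER_RESIDUAL_LENGTH;
        }
        return !decoded.replaceAll("\\W|\\d", "").isEmpty(); // otherwise any word character keeps it
    }

    public static void main(String[] args) {
        System.out.println(keep("Test test 123 test"));               // false: nothing left after stripping
        System.out.println(keep("Testing the frontiers of science")); // true: long residual
        System.out.println(keep("2021 -- 12345"));                    // false: no word characters at all
    }
}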
org.apache.http.impl.client.HttpClients; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.SaveMode; import org.slf4j.Logger; @@ -26,6 +26,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Maps; import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo; +import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import net.minidev.json.JSONArray; import scala.collection.JavaConverters; import scala.collection.Seq; @@ -52,10 +54,56 @@ public class DHPUtils { } } + /** + * Retrieves from the metadata store manager application the list of paths associated with mdstores characterized + * by he given format, layout, interpretation + * @param mdstoreManagerUrl the URL of the mdstore manager service + * @param format the mdstore format + * @param layout the mdstore layout + * @param interpretation the mdstore interpretation + * @param includeEmpty include Empty mdstores + * @return the set of hdfs paths + * @throws IOException in case of HTTP communication issues + */ + public static Set mdstorePaths(final String mdstoreManagerUrl, + final String format, + final String layout, + final String interpretation, + boolean includeEmpty) throws IOException { + final String url = mdstoreManagerUrl + "/mdstores/"; + final ObjectMapper objectMapper = new ObjectMapper(); + + final HttpGet req = new HttpGet(url); + + try (final CloseableHttpClient client = HttpClients.createDefault()) { + try (final CloseableHttpResponse response = client.execute(req)) { + final String json = IOUtils.toString(response.getEntity().getContent()); + final MDStoreWithInfo[] mdstores = objectMapper.readValue(json, MDStoreWithInfo[].class); + return Arrays + .stream(mdstores) + .filter(md -> md.getFormat().equalsIgnoreCase(format)) + .filter(md -> md.getLayout().equalsIgnoreCase(layout)) + .filter(md -> md.getInterpretation().equalsIgnoreCase(interpretation)) + .filter(md -> StringUtils.isNotBlank(md.getHdfsPath())) + .filter(md -> StringUtils.isNotBlank(md.getCurrentVersion())) + .filter(md -> includeEmpty || md.getSize() > 0) + .map(md -> md.getHdfsPath() + "/" + md.getCurrentVersion() + "/store") + .collect(Collectors.toSet()); + } + } + } + public static String generateIdentifier(final String originalId, final String nsPrefix) { return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId)); } + public static String generateUnresolvedIdentifier(final String pid, final String pidType) { + + final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid); + + return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim()); + } + public static String getJPathString(final String jsonPath, final String json) { try { Object o = JsonPath.read(json, jsonPath); diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml index 2450bdad7..b1c8e7c85 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml @@ -107,7 +107,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - 
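Note (not part of the patch): a hedged usage sketch of the two DHPUtils helpers introduced above. The mdstore manager URL and the format/layout/interpretation triple are illustrative assumptions, and mdstorePaths performs a live HTTP call against that service.

import java.util.Set;

import eu.dnetlib.dhp.utils.DHPUtils;

public class DHPUtilsUsageSketch {
    public static void main(String[] args) throws Exception {
        Set<String> paths = DHPUtils.mdstorePaths(
            "http://mdstore-manager.example.org", // assumed service URL
            "ODF", "store", "cleaned",            // format / layout / interpretation (illustrative values)
            false);                               // includeEmpty = false
        paths.forEach(System.out::println);       // each entry: <hdfsPath>/<currentVersion>/store

        // identifier format consumed later by the "unresolved entities" jobs
        String id = DHPUtils.generateUnresolvedIdentifier("10.1000/XYZ123", "doi");
        System.out.println(id);                   // roughly: unresolved::<normalised doi>::doi
    }
}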
--conf spark.sql.shuffle.partitions=2560 + --conf spark.sql.shuffle.partitions=5000 --inputGraphTablePath${inputGraphRootPath}/publication --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication @@ -159,7 +159,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=2560 + --conf spark.sql.shuffle.partitions=5000 --inputGraphTablePath${workingDir}/publication --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml index a7dce8f2f..20ffe26d3 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml @@ -99,7 +99,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=2560 + --conf spark.sql.shuffle.partitions=5000 --inputGraphTablePath${inputGraphRootPath}/relation --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Relation diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index 98e22d8a3..c89cc9d1d 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -29,6 +29,13 @@ testCompile + + scala-doc + process-resources + + doc + + ${scala.version} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/Constants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/Constants.java new file mode 100644 index 000000000..c508d4dbc --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/Constants.java @@ -0,0 +1,49 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities; + +import java.util.Optional; + +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +public class Constants { + + public static final String DOI = "doi"; + + public static final String UPDATE_DATA_INFO_TYPE = "update"; + public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos"; + public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE"; + public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip"; + + public static final String FOS_CLASS_ID = "FOS"; + public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification"; + + public static final String NULL = "NULL"; + + public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private Constants() { + } + + public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) { + return Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + 
.orElse(Boolean.TRUE); + } + + public static Dataset readPath( + SparkSession spark, String inputPath, Class clazz) { + return spark + .read() + .textFile(inputPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSData.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSData.java new file mode 100644 index 000000000..9dec3e862 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSData.java @@ -0,0 +1,77 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities; + +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Serializable; +import java.util.Objects; +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.collection.GetCSV; + +public class GetFOSData implements Serializable { + + private static final Logger log = LoggerFactory.getLogger(GetFOSData.class); + + public static final char DEFAULT_DELIMITER = '\t'; + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + Objects + .requireNonNull( + GetFOSData.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_fos_parameters.json")))); + + parser.parseArgument(args); + + // the path where the original fos csv file is stored + final String sourcePath = parser.get("sourcePath"); + log.info("sourcePath {}", sourcePath); + + // the path where to put the file as json + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}", outputPath); + + final String hdfsNameNode = parser.get("hdfsNameNode"); + log.info("hdfsNameNode {}", hdfsNameNode); + + final String classForName = parser.get("classForName"); + log.info("classForName {}", classForName); + + final char delimiter = Optional + .ofNullable(parser.get("delimiter")) + .map(s -> s.charAt(0)) + .orElse(DEFAULT_DELIMITER); + log.info("delimiter {}", delimiter); + + Configuration conf = new Configuration(); + conf.set("fs.defaultFS", hdfsNameNode); + + FileSystem fileSystem = FileSystem.get(conf); + + new GetFOSData().doRewrite(sourcePath, outputPath, classForName, delimiter, fileSystem); + + } + + public void doRewrite(String inputPath, String outputFile, String classForName, char delimiter, FileSystem fs) + throws IOException, ClassNotFoundException { + + // reads the csv and writes it as its json equivalent + try (InputStreamReader reader = new InputStreamReader(fs.open(new Path(inputPath)))) { + GetCSV.getCsv(fs, reader, outputFile, classForName, delimiter); + } + + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java new file mode 100644 index 000000000..3d68db27b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java @@ 
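Note (not part of the patch): a minimal sketch of running the FOS CSV-to-JSON rewrite above against a local Hadoop FileSystem; the input and output paths are assumptions.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import eu.dnetlib.dhp.actionmanager.createunresolvedentities.GetFOSData;

public class GetFOSDataLocalSketch {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.getLocal(new Configuration());
        new GetFOSData()
            .doRewrite(
                "/tmp/fos/h2020_fos.tsv",          // assumed tab-separated input
                "/tmp/fos/fos.json",               // output written as JSON lines by GetCSV
                "eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel",
                GetFOSData.DEFAULT_DELIMITER,      // '\t'
                fs);
    }
}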
-0,0 +1,145 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities; + +import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*; +import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.UPDATE_CLASS_NAME; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.hdfs.client.HdfsUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.BipDeserialize; +import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.BipScore; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Measure; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import eu.dnetlib.dhp.utils.DHPUtils; + +public class PrepareBipFinder implements Serializable { + + private static final Logger log = LoggerFactory.getLogger(PrepareBipFinder.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + PrepareBipFinder.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String sourcePath = parser.get("sourcePath"); + log.info("sourcePath {}: ", sourcePath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}: ", outputPath); + + SparkConf conf = new SparkConf(); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration()); + prepareResults(spark, sourcePath, outputPath); + }); + } + + private static void prepareResults(SparkSession spark, String inputPath, String outputPath) { + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD bipDeserializeJavaRDD = sc + .textFile(inputPath) + .map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class)); + + spark + .createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> { + BipScore bs = new BipScore(); + bs.setId(key); + bs.setScoreList(entry.get(key)); + return bs; + }).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class)) + .map((MapFunction) v -> { + Result r = new Result(); + + r.setId(DHPUtils.generateUnresolvedIdentifier(v.getId(), DOI)); + r.setMeasures(getMeasure(v)); + return r; + }, 
Encoders.bean(Result.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath + "/bip"); + } + + private static List getMeasure(BipScore value) { + return value + .getScoreList() + .stream() + .map(score -> { + Measure m = new Measure(); + m.setId(score.getId()); + m + .setUnit( + score + .getUnit() + .stream() + .map(unit -> { + KeyValue kv = new KeyValue(); + kv.setValue(unit.getValue()); + kv.setKey(unit.getKey()); + kv + .setDataInfo( + OafMapperUtils + .dataInfo( + false, + UPDATE_DATA_INFO_TYPE, + true, + false, + OafMapperUtils + .qualifier( + UPDATE_MEASURE_BIP_CLASS_ID, + UPDATE_CLASS_NAME, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + "")); + return kv; + }) + .collect(Collectors.toList())); + return m; + }) + .collect(Collectors.toList()); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java new file mode 100644 index 000000000..5ae2f8c88 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java @@ -0,0 +1,133 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities; + +import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.*; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import eu.dnetlib.dhp.utils.DHPUtils; + +public class PrepareFOSSparkJob implements Serializable { + private static final Logger log = LoggerFactory.getLogger(PrepareFOSSparkJob.class); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + PrepareFOSSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String sourcePath = parser.get("sourcePath"); + log.info("sourcePath: {}", sourcePath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + distributeFOSdois( + spark, + sourcePath, + + outputPath); + }); + } + + private static void distributeFOSdois(SparkSession spark, String sourcePath, String 
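Note (not part of the patch): a hedged sketch of the flattening done in PrepareBipFinder.prepareResults above; one deserialized bipFinder! line (a map from doi to a list of scores) becomes one BipScore per doi. The JSON sample in the comment is an assumption about the input shape.

import java.util.List;
import java.util.stream.Collectors;

import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.BipDeserialize;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.BipScore;

public class BipFlattenSketch {
    // assumed input line:
    // {"10.1000/xyz123": [{"id": "influence", "unit": [{"key": "score", "value": "3.2e-09"}]}]}
    static List<BipScore> flatten(BipDeserialize entry) {
        return entry
            .keySet()
            .stream()
            .map(key -> {
                BipScore bs = new BipScore();
                bs.setId(key);                   // the doi
                bs.setScoreList(entry.get(key)); // the scores attached to it
                return bs;
            })
            .collect(Collectors.toList());
    }
}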
outputPath) { + Dataset fosDataset = readPath(spark, sourcePath, FOSDataModel.class); + + fosDataset.flatMap((FlatMapFunction) v -> { + List fosList = new ArrayList<>(); + final String level1 = v.getLevel1(); + final String level2 = v.getLevel2(); + final String level3 = v.getLevel3(); + Arrays + .stream(v.getDoi().split("\u0002")) + .forEach(d -> fosList.add(FOSDataModel.newInstance(d, level1, level2, level3))); + return fosList.iterator(); + }, Encoders.bean(FOSDataModel.class)) + .map((MapFunction) value -> { + Result r = new Result(); + r.setId(DHPUtils.generateUnresolvedIdentifier(value.getDoi(), DOI)); + r.setSubject(getSubjects(value)); + return r; + }, Encoders.bean(Result.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath + "/fos"); + } + + private static List getSubjects(FOSDataModel fos) { + return Arrays + .asList(getSubject(fos.getLevel1()), getSubject(fos.getLevel2()), getSubject(fos.getLevel3())) + .stream() + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + + private static StructuredProperty getSubject(String sbj) { + if (sbj.equals(NULL)) + return null; + StructuredProperty sp = new StructuredProperty(); + sp.setValue(sbj); + sp + .setQualifier( + OafMapperUtils + .qualifier( + FOS_CLASS_ID, + FOS_CLASS_NAME, + ModelConstants.DNET_SUBJECT_TYPOLOGIES, + ModelConstants.DNET_SUBJECT_TYPOLOGIES)); + sp + .setDataInfo( + OafMapperUtils + .dataInfo( + false, + UPDATE_DATA_INFO_TYPE, + true, + false, + OafMapperUtils + .qualifier( + UPDATE_SUBJECT_FOS_CLASS_ID, + UPDATE_CLASS_NAME, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + "")); + + return sp; + + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/SparkSaveUnresolved.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/SparkSaveUnresolved.java new file mode 100644 index 000000000..62b813602 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/SparkSaveUnresolved.java @@ -0,0 +1,79 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities; + +import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.MapGroupsFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class SparkSaveUnresolved implements Serializable { + private static final Logger log = LoggerFactory.getLogger(PrepareFOSSparkJob.class); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + PrepareFOSSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/createunresolvedentities/produce_unresolved_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: 
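Note (not part of the patch): a sketch of the doi fan-out performed by distributeFOSdois above; the FOS dump packs several dois into a single field separated by the \u0002 character. Sample values are assumptions.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;

public class FosFanOutSketch {
    static List<FOSDataModel> fanOut(FOSDataModel v) {
        List<FOSDataModel> fosList = new ArrayList<>();
        Arrays
            .stream(v.getDoi().split("\u0002"))
            .forEach(d -> fosList.add(FOSDataModel.newInstance(d, v.getLevel1(), v.getLevel2(), v.getLevel3())));
        return fosList;
    }

    public static void main(String[] args) {
        FOSDataModel row = FOSDataModel.newInstance(
            "10.1000/a\u000210.1000/b",                                   // two dois in one field (assumed)
            "01 natural sciences", "0101 mathematics", "010101 pure mathematics");
        fanOut(row).forEach(f -> System.out.println(f.getDoi()));         // 10.1000/a, 10.1000/b
    }
}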
{}", isSparkSessionManaged); + + String sourcePath = parser.get("sourcePath"); + log.info("sourcePath: {}", sourcePath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + saveUnresolved( + spark, + sourcePath, + + outputPath); + }); + } + + private static void saveUnresolved(SparkSession spark, String sourcePath, String outputPath) { + + spark + .read() + .textFile(sourcePath + "/*") + .map( + (MapFunction) l -> OBJECT_MAPPER.readValue(l, Result.class), + Encoders.bean(Result.class)) + .groupByKey((MapFunction) r -> r.getId(), Encoders.STRING()) + .mapGroups((MapGroupsFunction) (k, it) -> { + Result ret = it.next(); + it.forEachRemaining(r -> ret.mergeFrom(r)); + return ret; + }, Encoders.bean(Result.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/BipDeserialize.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/BipDeserialize.java new file mode 100644 index 000000000..f950d9260 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/BipDeserialize.java @@ -0,0 +1,28 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +/** + * Class that maps the model of the bipFinder! input data. + * Only needed for deserialization purposes + */ + +public class BipDeserialize extends HashMap> implements Serializable { + + public BipDeserialize() { + super(); + } + + public List get(String key) { + + if (super.get(key) == null) { + return new ArrayList<>(); + } + return super.get(key); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/BipScore.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/BipScore.java new file mode 100644 index 000000000..c36856a5b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/BipScore.java @@ -0,0 +1,30 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model; + +import java.io.Serializable; +import java.util.List; + +/** + * Rewriting of the bipFinder input data by extracting the identifier of the result (doi) + */ + +public class BipScore implements Serializable { + private String id; // doi + private List scoreList; // unit as given in the inputfile + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public List getScoreList() { + return scoreList; + } + + public void setScoreList(List scoreList) { + this.scoreList = scoreList; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/FOSDataModel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/FOSDataModel.java new file mode 100644 index 000000000..befb230cb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/FOSDataModel.java @@ -0,0 +1,71 @@ + +package 
eu.dnetlib.dhp.actionmanager.createunresolvedentities.model; + +import java.io.Serializable; + +import com.opencsv.bean.CsvBindByPosition; + +public class FOSDataModel implements Serializable { + @CsvBindByPosition(position = 1) +// @CsvBindByName(column = "doi") + private String doi; + + @CsvBindByPosition(position = 2) +// @CsvBindByName(column = "level1") + private String level1; + + @CsvBindByPosition(position = 3) +// @CsvBindByName(column = "level2") + private String level2; + + @CsvBindByPosition(position = 4) +// @CsvBindByName(column = "level3") + private String level3; + + public FOSDataModel() { + + } + + public FOSDataModel(String doi, String level1, String level2, String level3) { + this.doi = doi; + this.level1 = level1; + this.level2 = level2; + this.level3 = level3; + } + + public static FOSDataModel newInstance(String d, String level1, String level2, String level3) { + return new FOSDataModel(d, level1, level2, level3); + } + + public String getDoi() { + return doi; + } + + public void setDoi(String doi) { + this.doi = doi; + } + + public String getLevel1() { + return level1; + } + + public void setLevel1(String level1) { + this.level1 = level1; + } + + public String getLevel2() { + return level2; + } + + public void setLevel2(String level2) { + this.level2 = level2; + } + + public String getLevel3() { + return level3; + } + + public void setLevel3(String level3) { + this.level3 = level3; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/KeyValue.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/KeyValue.java new file mode 100644 index 000000000..4384e4ba1 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/KeyValue.java @@ -0,0 +1,26 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model; + +import java.io.Serializable; + +public class KeyValue implements Serializable { + + private String key; + private String value; + + public String getKey() { + return key; + } + + public void setKey(String key) { + this.key = key; + } + + public String getValue() { + return value; + } + + public void setValue(String value) { + this.value = value; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/Score.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/Score.java new file mode 100644 index 000000000..3d1cca9a0 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/Score.java @@ -0,0 +1,30 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model; + +import java.io.Serializable; +import java.util.List; + +/** + * represents the score in the input file + */ +public class Score implements Serializable { + + private String id; + private List unit; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public List getUnit() { + return unit; + } + + public void setUnit(List unit) { + this.unit = unit; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala deleted file mode 100644 index 9f0d25735..000000000 --- 
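Note (not part of the patch): a hedged sketch of how the positional bindings on FOSDataModel could be exercised with opencsv; the sample line, and the assumption that column 0 holds a row identifier (the bound positions start at 1), are illustrative.

import java.io.StringReader;
import java.util.List;

import com.opencsv.bean.CsvToBeanBuilder;

import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;

public class FosCsvSketch {
    public static void main(String[] args) {
        String line = "row-1\t10.1000/xyz\t01 natural sciences\t0101 mathematics\t010101 pure mathematics";
        List<FOSDataModel> rows = new CsvToBeanBuilder<FOSDataModel>(new StringReader(line))
            .withType(FOSDataModel.class)
            .withSeparator('\t')
            .build()
            .parse();
        System.out.println(rows.get(0).getDoi());    // 10.1000/xyz (column 0 is not bound)
    }
}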
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ExportActionSetJobNode.scala +++ /dev/null @@ -1,41 +0,0 @@ -package eu.dnetlib.dhp.actionmanager.datacite - -import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.Oaf -import org.apache.hadoop.io.Text -import org.apache.hadoop.io.compress.GzipCodec -import org.apache.hadoop.mapred.SequenceFileOutputFormat -import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} -import org.slf4j.{Logger, LoggerFactory} - -import scala.io.Source - -object ExportActionSetJobNode { - - val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass) - - def main(args: Array[String]): Unit = { - val conf = new SparkConf - val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json")).mkString) - parser.parseArgument(args) - val master = parser.get("master") - val sourcePath = parser.get("sourcePath") - val targetPath = parser.get("targetPath") - - val spark: SparkSession = SparkSession.builder().config(conf) - .appName(ExportActionSetJobNode.getClass.getSimpleName) - .master(master) - .getOrCreate() - implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] - implicit val tEncoder:Encoder[(String,String)] = Encoders.tuple(Encoders.STRING,Encoders.STRING) - - spark.read.load(sourcePath).as[Oaf] - .map(o =>DataciteToOAFTransformation.toActionSet(o)) - .filter(o => o!= null) - .rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec]) - - - } - -} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/FilterCrossrefEntitiesSpark.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/FilterCrossrefEntitiesSpark.scala deleted file mode 100644 index 5860c50ac..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/FilterCrossrefEntitiesSpark.scala +++ /dev/null @@ -1,46 +0,0 @@ -package eu.dnetlib.dhp.actionmanager.datacite - -import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup -import eu.dnetlib.dhp.schema.mdstore.MetadataRecord -import eu.dnetlib.dhp.schema.oaf.{Oaf, Result} -import eu.dnetlib.dhp.utils.ISLookupClientFactory -import org.apache.spark.SparkConf -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} -import org.slf4j.{Logger, LoggerFactory} - -import scala.io.Source - -object FilterCrossrefEntitiesSpark { - - val log: Logger = LoggerFactory.getLogger(getClass.getClass) - - def main(args: Array[String]): Unit = { - val conf = new SparkConf - val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/filter_crossref_param.json")).mkString) - parser.parseArgument(args) - val master = parser.get("master") - val sourcePath = parser.get("sourcePath") - log.info("sourcePath: {}", sourcePath) - val targetPath = parser.get("targetPath") - log.info("targetPath: {}", targetPath) - - - - val spark: SparkSession = SparkSession.builder().config(conf) - .appName(getClass.getSimpleName) - .master(master) - .getOrCreate() - - - - implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] - 
implicit val resEncoder: Encoder[Result] = Encoders.kryo[Result] - - val d:Dataset[Oaf]= spark.read.load(sourcePath).as[Oaf] - - d.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result]).write.mode(SaveMode.Overwrite).save(targetPath) - - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java new file mode 100644 index 000000000..ea5fea96f --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java @@ -0,0 +1,181 @@ + +package eu.dnetlib.dhp.actionmanager.opencitations; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.IOException; +import java.io.Serializable; +import java.util.*; + +import org.apache.commons.cli.ParseException; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; +import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; +import scala.Tuple2; + +public class CreateActionSetSparkJob implements Serializable { + public static final String OPENCITATIONS_CLASSID = "sysimport:crosswalk:opencitations"; + public static final String OPENCITATIONS_CLASSNAME = "Imported from OpenCitations"; + private static final String ID_PREFIX = "50|doi_________::"; + private static final String TRUST = "0.91"; + + private static final Logger log = LoggerFactory.getLogger(CreateActionSetSparkJob.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(final String[] args) throws IOException, ParseException { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + Objects + .requireNonNull( + CreateActionSetSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json")))); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String inputPath = parser.get("inputPath"); + log.info("inputPath {}", inputPath.toString()); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}", outputPath); + + final boolean shouldDuplicateRels = Optional + .ofNullable(parser.get("shouldDuplicateRels")) + .map(Boolean::valueOf) + .orElse(Boolean.FALSE); + + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + extractContent(spark, inputPath, outputPath, shouldDuplicateRels); + }); + + } + + private static void extractContent(SparkSession spark, 
String inputPath, String outputPath, + boolean shouldDuplicateRels) { + spark + .sqlContext() + .createDataset(spark.sparkContext().textFile(inputPath + "/*", 6000), Encoders.STRING()) + .flatMap( + (FlatMapFunction) value -> createRelation(value, shouldDuplicateRels).iterator(), + Encoders.bean(Relation.class)) + .filter((FilterFunction) value -> value != null) + .toJavaRDD() + .map(p -> new AtomicAction(p.getClass(), p)) + .mapToPair( + aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + + } + + private static List createRelation(String value, boolean duplicate) { + String[] line = value.split(","); + if (!line[1].startsWith("10.")) { + return new ArrayList<>(); + } + List relationList = new ArrayList<>(); + + String citing = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[1])); + final String cited = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[2])); + + relationList + .addAll( + getRelations( + citing, + cited)); + + if (duplicate && line[1].endsWith(".refs")) { + citing = ID_PREFIX + IdentifierFactory + .md5(CleaningFunctions.normalizePidValue("doi", line[1].substring(0, line[1].indexOf(".refs")))); + relationList.addAll(getRelations(citing, cited)); + } + + return relationList; + } + + private static Collection getRelations(String citing, String cited) { + + return Arrays + .asList( + getRelation(citing, cited, ModelConstants.CITES), + getRelation(cited, citing, ModelConstants.IS_CITED_BY)); + } + + public static Relation getRelation( + String source, + String target, + String relclass) { + Relation r = new Relation(); + r.setCollectedfrom(getCollectedFrom()); + r.setSource(source); + r.setTarget(target); + r.setRelClass(relclass); + r.setRelType(ModelConstants.RESULT_RESULT); + r.setSubRelType(ModelConstants.CITATION); + r + .setDataInfo( + getDataInfo()); + return r; + } + + public static List getCollectedFrom() { + KeyValue kv = new KeyValue(); + kv.setKey(ModelConstants.OPENOCITATIONS_ID); + kv.setValue(ModelConstants.OPENOCITATIONS_NAME); + + return Arrays.asList(kv); + } + + public static DataInfo getDataInfo() { + DataInfo di = new DataInfo(); + di.setInferred(false); + di.setDeletedbyinference(false); + di.setTrust(TRUST); + + di + .setProvenanceaction( + getQualifier(OPENCITATIONS_CLASSID, OPENCITATIONS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS)); + return di; + } + + public static Qualifier getQualifier(String class_id, String class_name, + String qualifierSchema) { + Qualifier pa = new Qualifier(); + pa.setClassid(class_id); + pa.setClassname(class_name); + pa.setSchemeid(qualifierSchema); + pa.setSchemename(qualifierSchema); + return pa; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/GetOpenCitationsRefs.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/GetOpenCitationsRefs.java new file mode 100644 index 000000000..3530c9980 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/GetOpenCitationsRefs.java @@ -0,0 +1,93 @@ + +package eu.dnetlib.dhp.actionmanager.opencitations; + +import java.io.*; +import java.io.Serializable; +import java.util.Objects; +import java.util.zip.GZIPOutputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +import 
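Note (not part of the patch): a sketch of what createRelation above yields for one COCI line, built with the public getRelation helper; the sample dois are assumptions and "50|doi_________::" is the ID_PREFIX constant defined in the class.

import java.util.Arrays;
import java.util.List;

import eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;

public class CociRelationSketch {
    public static void main(String[] args) {
        String citing = "50|doi_________::"
            + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1000/citing"));
        String cited = "50|doi_________::"
            + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1000/cited"));
        List<Relation> rels = Arrays
            .asList(
                CreateActionSetSparkJob.getRelation(citing, cited, ModelConstants.CITES),
                CreateActionSetSparkJob.getRelation(cited, citing, ModelConstants.IS_CITED_BY));
        rels.forEach(r -> System.out.println(r.getSource() + " --" + r.getRelClass() + "--> " + r.getTarget()));
    }
}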
org.apache.commons.cli.ParseException; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +public class GetOpenCitationsRefs implements Serializable { + private static final Logger log = LoggerFactory.getLogger(GetOpenCitationsRefs.class); + + public static void main(final String[] args) throws IOException, ParseException { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + Objects + .requireNonNull( + GetOpenCitationsRefs.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json")))); + + parser.parseArgument(args); + + final String[] inputFile = parser.get("inputFile").split(";"); + log.info("inputFile {}", inputFile.toString()); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath {}", workingPath); + + final String hdfsNameNode = parser.get("hdfsNameNode"); + log.info("hdfsNameNode {}", hdfsNameNode); + + Configuration conf = new Configuration(); + conf.set("fs.defaultFS", hdfsNameNode); + + FileSystem fileSystem = FileSystem.get(conf); + + GetOpenCitationsRefs ocr = new GetOpenCitationsRefs(); + + for (String file : inputFile) { + ocr.doExtract(workingPath + "/Original/" + file, workingPath, fileSystem); + } + + } + + private void doExtract(String inputFile, String workingPath, FileSystem fileSystem) + throws IOException { + + final Path path = new Path(inputFile); + + FSDataInputStream oc_zip = fileSystem.open(path); + + int count = 1; + try (ZipInputStream zis = new ZipInputStream(oc_zip)) { + ZipEntry entry = null; + while ((entry = zis.getNextEntry()) != null) { + + if (!entry.isDirectory()) { + String fileName = entry.getName(); + fileName = fileName.substring(0, fileName.indexOf("T")) + "_" + count; + count++; + try ( + FSDataOutputStream out = fileSystem + .create(new Path(workingPath + "/COCI/" + fileName + ".gz")); + GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) { + + IOUtils.copy(zis, gzipOs); + + } + } + + } + + } + + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala new file mode 100644 index 000000000..11ecfd6cb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectionUtils.scala @@ -0,0 +1,49 @@ +package eu.dnetlib.dhp.collection + +import eu.dnetlib.dhp.schema.common.ModelSupport +import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation} + +object CollectionUtils { + + /** + * This method in pipeline to the transformation phase, + * generates relations in both verse, typically it should be a phase of flatMap + * + * @param i input OAF + * @return + * If the input OAF is an entity -> List(i) + * If the input OAF is a relation -> List(relation, inverseRelation) + * + */ + + def fixRelations(i: Oaf): List[Oaf] = { + if (i.isInstanceOf[OafEntity]) + return List(i) + else { + val r: Relation = i.asInstanceOf[Relation] + val currentRel = ModelSupport.findRelation(r.getRelClass) + if (currentRel != null) { + + // Cleaning relation + r.setRelType(currentRel.getRelType) + 
r.setSubRelType(currentRel.getSubReltype) + r.setRelClass(currentRel.getRelClass) + val inverse = new Relation + inverse.setSource(r.getTarget) + inverse.setTarget(r.getSource) + inverse.setRelType(currentRel.getRelType) + inverse.setSubRelType(currentRel.getSubReltype) + inverse.setRelClass(currentRel.getInverseRelClass) + inverse.setCollectedfrom(r.getCollectedfrom) + inverse.setDataInfo(r.getDataInfo) + inverse.setProperties(r.getProperties) + inverse.setLastupdatetimestamp(r.getLastupdatetimestamp) + inverse.setValidated(r.getValidated) + inverse.setValidationDate(r.getValidationDate) + return List(r, inverse) + } + } + List() + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/AbstractRestClient.scala similarity index 90% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/AbstractRestClient.scala index bae41b218..6a9b8e3e5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/AbstractRestClient.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/AbstractRestClient.scala @@ -1,12 +1,10 @@ -package eu.dnetlib.dhp.actionmanager.datacite +package eu.dnetlib.dhp.datacite import org.apache.commons.io.IOUtils import org.apache.http.client.config.RequestConfig -import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest} +import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest} import org.apache.http.entity.StringEntity -import org.apache.http.impl.client.{HttpClientBuilder, HttpClients} - -import java.io.IOException +import org.apache.http.impl.client.HttpClientBuilder abstract class AbstractRestClient extends Iterator[String] { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala similarity index 95% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala index 36ec9e8c3..7ec44a6ff 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteAPIImporter.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala @@ -1,7 +1,7 @@ -package eu.dnetlib.dhp.actionmanager.datacite +package eu.dnetlib.dhp.datacite -import org.json4s.{DefaultFormats, JValue} import org.json4s.jackson.JsonMethods.{compact, parse, render} +import org.json4s.{DefaultFormats, JValue} class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala similarity index 98% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala index cfdd98d30..6ce4920ed 
100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala @@ -1,4 +1,4 @@ -package eu.dnetlib.dhp.actionmanager.datacite +package eu.dnetlib.dhp.datacite import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup @@ -325,8 +325,9 @@ object DataciteToOAFTransformation { val grantId = m.matcher(awardUri).replaceAll("$2") val targetId = s"$p${DHPUtils.md5(grantId)}" List( - generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo), - generateRelation(targetId, sourceId, "produces", DATACITE_COLLECTED_FROM, dataInfo) + generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo) +// REMOVED INVERSE RELATION since there is a specific method that should generate later +// generateRelation(targetId, sourceId, "produces", DATACITE_COLLECTED_FROM, dataInfo) ) } else @@ -580,11 +581,11 @@ object DataciteToOAFTransformation { rel.setProperties(List(dateProps).asJava) rel.setSource(id) - rel.setTarget(s"unresolved::${r.relatedIdentifier}::${r.relatedIdentifierType}") + rel.setTarget(DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier,r.relatedIdentifierType)) rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) - rel.getCollectedfrom.asScala.map(c => c.getValue)(collection.breakOut) + rel.getCollectedfrom.asScala.map(c => c.getValue).toList rel - })(collection breakOut) + }).toList } def generateDataInfo(trust: String): DataInfo = { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala similarity index 61% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala index 2cabc7879..a63627d1c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/GenerateDataciteDatasetSpark.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala @@ -1,9 +1,14 @@ -package eu.dnetlib.dhp.actionmanager.datacite +package eu.dnetlib.dhp.datacite +import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations +import eu.dnetlib.dhp.common.Constants.MDSTORE_DATA_PATH +import eu.dnetlib.dhp.common.Constants.MDSTORE_SIZE_PATH import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup -import eu.dnetlib.dhp.schema.mdstore.MetadataRecord +import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord} import eu.dnetlib.dhp.schema.oaf.Oaf +import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile import eu.dnetlib.dhp.utils.ISLookupClientFactory import org.apache.spark.SparkConf import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} @@ -17,11 +22,10 @@ object GenerateDataciteDatasetSpark { def main(args: Array[String]): Unit = { val conf = new SparkConf - val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString) + val parser = new 
ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")).mkString) parser.parseArgument(args) val master = parser.get("master") val sourcePath = parser.get("sourcePath") - val targetPath = parser.get("targetPath") val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks")) val isLookupUrl: String = parser.get("isLookupUrl") log.info("isLookupUrl: {}", isLookupUrl) @@ -33,16 +37,28 @@ object GenerateDataciteDatasetSpark { .master(master) .getOrCreate() + import spark.implicits._ + implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord] implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] - import spark.implicits._ + val mdstoreOutputVersion = parser.get("mdstoreOutputVersion") + val mapper = new ObjectMapper() + val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion]) + val outputBasePath = cleanedMdStoreVersion.getHdfsPath + + log.info("outputBasePath: {}", outputBasePath) + val targetPath = s"$outputBasePath/$MDSTORE_DATA_PATH" spark.read.load(sourcePath).as[DataciteType] .filter(d => d.isActive) .flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)) .filter(d => d != null) + .flatMap(i => fixRelations(i)).filter(i => i != null) .write.mode(SaveMode.Overwrite).save(targetPath) + + val total_items = spark.read.load(targetPath).as[Oaf].count() + writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH) } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/ImportDatacite.scala similarity index 93% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/ImportDatacite.scala index 2b73d2955..018b4958a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/ImportDatacite.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/ImportDatacite.scala @@ -1,6 +1,5 @@ -package eu.dnetlib.dhp.actionmanager.datacite +package eu.dnetlib.dhp.datacite -import eu.dnetlib.dhp.actionmanager.datacite.DataciteToOAFTransformation.df_it import eu.dnetlib.dhp.application.ArgumentApplicationParser import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} @@ -9,14 +8,14 @@ import org.apache.hadoop.io.{IntWritable, SequenceFile, Text} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.expressions.Aggregator +import org.apache.spark.sql.functions.max import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession} import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods.parse -import org.apache.spark.sql.functions.max import org.slf4j.{Logger, LoggerFactory} -import java.time.format.DateTimeFormatter._ -import java.time.{LocalDate, LocalDateTime, ZoneOffset} +import java.time.format.DateTimeFormatter.ISO_DATE_TIME +import java.time.{LocalDateTime, ZoneOffset} import scala.io.Source object ImportDatacite { @@ -138,11 +137,11 @@ object ImportDatacite { } } - private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs:Int): Long = { - var from:Long = 
timestamp * 1000 - val delta:Long = 100000000L + private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs: Int): Long = { + var from: Long = timestamp * 1000 + val delta: Long = 100000000L var client: DataciteAPIImporter = null - val now :Long =System.currentTimeMillis() + val now: Long = System.currentTimeMillis() var i = 0 try { val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text])) @@ -168,7 +167,7 @@ object ImportDatacite { start = System.currentTimeMillis } } - println(s"updating from value: $from -> ${from+delta}") + println(s"updating from value: $from -> ${from + delta}") from = from + delta } } catch { @@ -183,4 +182,4 @@ object ImportDatacite { i } -} \ No newline at end of file +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/SparkDownloadUpdateDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala similarity index 68% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/SparkDownloadUpdateDatacite.scala rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala index 0b459fcf6..d46e5423d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/datacite/SparkDownloadUpdateDatacite.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala @@ -1,18 +1,14 @@ -package eu.dnetlib.dhp.actionmanager.datacite - +package eu.dnetlib.dhp.datacite import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.oaf.{Oaf, Result} -import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.LocalFileSystem -import org.apache.hadoop.hdfs.DistributedFileSystem import org.apache.spark.SparkConf -import org.apache.spark.sql.{Encoder, Encoders, SparkSession} import org.apache.spark.sql.functions.max +import org.apache.spark.sql.{Encoder, Encoders, SparkSession} import org.slf4j.{Logger, LoggerFactory} import java.text.SimpleDateFormat -import java.util.{Date, Locale} +import java.util.Locale import scala.io.Source object SparkDownloadUpdateDatacite { @@ -21,7 +17,7 @@ object SparkDownloadUpdateDatacite { def main(args: Array[String]): Unit = { val conf = new SparkConf - val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString) + val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")).mkString) parser.parseArgument(args) val master = parser.get("master") val sourcePath = parser.get("sourcePath") @@ -42,9 +38,9 @@ object SparkDownloadUpdateDatacite { import spark.implicits._ - val maxDate:String = spark.read.load(workingPath).as[Oaf].filter(s => s.isInstanceOf[Result]).map(r => r.asInstanceOf[Result].getDateofcollection).select(max("value")).first().getString(0) + val maxDate: String = spark.read.load(workingPath).as[Oaf].filter(s => s.isInstanceOf[Result]).map(r => r.asInstanceOf[Result].getDateofcollection).select(max("value")).first().getString(0) val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US) - val string_to_date =ISO8601FORMAT.parse(maxDate) + val string_to_date = 
ISO8601FORMAT.parse(maxDate) val ts = string_to_date.getTime diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala index 7a62437a3..8ae8285e3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.sx.bio import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.schema.oaf.Oaf import BioDBToOAF.ScholixResolved +import eu.dnetlib.dhp.collection.CollectionUtils import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} @@ -35,13 +36,13 @@ object SparkTransformBioDatabaseToOAF { import spark.implicits._ database.toUpperCase() match { case "UNIPROT" => - spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).write.mode(SaveMode.Overwrite).save(targetPath) + spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) case "PDB" => - spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).write.mode(SaveMode.Overwrite).save(targetPath) + spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) case "SCHOLIX" => - spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).write.mode(SaveMode.Overwrite).save(targetPath) + spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) case "CROSSREF_LINKS" => - spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).write.mode(SaveMode.Overwrite).save(targetPath) + spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath) } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala index b19bfc23a..8da617ca0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala @@ -5,6 +5,7 @@ import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.sx.bio.BioDBToOAF import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem import BioDBToOAF.EBILinkItem +import eu.dnetlib.dhp.collection.CollectionUtils import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf import org.apache.spark.sql._ @@ -37,6 +38,7 @@ object SparkEBILinksToOaf { ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links)) .filter(p => BioDBToOAF.EBITargetLinksFilter(p)) .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)) + .flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null) .write.mode(SaveMode.Overwrite).save(targetPath) } } diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java index 881528425..af0d5169d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java @@ -5,94 +5,249 @@ import java.io.Serializable; import java.util.ArrayList; import java.util.List; +/** + * This class represent an instance of Pubmed Article extracted from the native XML + * + * @author Sandro La Bruzzo + */ + public class PMArticle implements Serializable { + /** + * the Pubmed Identifier + */ private String pmid; + /** + * the DOI + */ private String doi; + /** + * the Pubmed Date extracted from Specifies a date significant to either the article's history or the citation's processing. + * All dates will have a , , and elements. Some may have an , , and element(s). + */ private String date; + /** + * This is an 'envelop' element that contains various elements describing the journal cited; i.e., ISSN, Volume, Issue, and PubDate and author name(s), however, it does not contain data itself. + */ private PMJournal journal; + /** + * The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. The NLM journal title abbreviation is exported in the element. + */ private String title; + /** + * English-language abstracts are taken directly from the published article. + * If the article does not have a published abstract, the National Library of Medicine does not create one, + * thus the record lacks the and elements. However, in the absence of a formally + * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used. + */ private String description; + /** + * the language in which an article was published is recorded in . + * All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single + * record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value. + * Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined. + */ private String language; + + /** + * NLM controlled vocabulary, Medical Subject Headings (MeSH®), is used to characterize the content of the articles represented by MEDLINE citations. * + */ private final List subjects = new ArrayList<>(); + /** + * This element is used to identify the type of article indexed for MEDLINE; + * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of + * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural). + */ private final List publicationTypes = new ArrayList<>(); + /** + * Personal and collective (corporate) author names published with the article are found in . 
+ */ private List authors = new ArrayList<>(); - public List getPublicationTypes() { - return publicationTypes; - } - + /** + * contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service + * or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations. + */ private final List grants = new ArrayList<>(); - public List getGrants() { - return grants; - } - + /** + * get the DOI + * @return a DOI + */ public String getDoi() { return doi; } + /** + * Set the DOI + * @param doi a DOI + */ public void setDoi(String doi) { this.doi = doi; } + /** + * get the Pubmed Identifier + * @return the PMID + */ public String getPmid() { return pmid; } + /** + * set the Pubmed Identifier + * @param pmid the Pubmed Identifier + */ public void setPmid(String pmid) { this.pmid = pmid; } + /** + * the Pubmed Date extracted from Specifies a date significant to either the article's history or the citation's processing. + * All dates will have a , , and elements. Some may have an , , and element(s). + * + * @return the Pubmed Date + */ public String getDate() { return date; } + /** + * Set the pubmed Date + * @param date + */ public void setDate(String date) { this.date = date; } + /** + * The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. + * Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. + * The NLM journal title abbreviation is exported in the element. + * + * @return the pubmed Journal Extracted + */ public PMJournal getJournal() { return journal; } + /** + * Set the mapped pubmed Journal + * @param journal + */ public void setJournal(PMJournal journal) { this.journal = journal; } + /** + * English-language abstracts are taken directly from the published article. + * If the article does not have a published abstract, the National Library of Medicine does not create one, + * thus the record lacks the and elements. However, in the absence of a formally + * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used. + * + * @return the extracted pubmed Title + */ public String getTitle() { return title; } + /** + * set the pubmed title + * @param title + */ public void setTitle(String title) { this.title = title; } + /** + * English-language abstracts are taken directly from the published article. + * If the article does not have a published abstract, the National Library of Medicine does not create one, + * thus the record lacks the and elements. However, in the absence of a formally + * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used. + * + * @return the Mapped Pubmed Article Abstracts + */ public String getDescription() { return description; } + /** + * Set the Mapped Pubmed Article Abstracts + * @param description + */ public void setDescription(String description) { this.description = description; } + /** + * Personal and collective (corporate) author names published with the article are found in . 
+ * + * @return get the Mapped Authors lists + */ public List getAuthors() { return authors; } + /** + * Set the Mapped Authors lists + * @param authors + */ public void setAuthors(List authors) { this.authors = authors; } + /** + * This element is used to identify the type of article indexed for MEDLINE; + * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of + * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural). + * + * @return the mapped Subjects + */ public List getSubjects() { return subjects; } + /** + * + * the language in which an article was published is recorded in . + * All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single + * record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value. + * Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined. + * + * @return The mapped Language + */ public String getLanguage() { return language; } + /** + * + * Set The mapped Language + * + * @param language the mapped Language + */ public void setLanguage(String language) { this.language = language; } + + /** + * This element is used to identify the type of article indexed for MEDLINE; + * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of + * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural). + * + * @return the mapped Publication Type + */ + public List getPublicationTypes() { + return publicationTypes; + } + + /** + * contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service + * or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations. + * @return the mapped grants + */ + + public List getGrants() { + return grants; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java index cef92d003..68ef6459e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java @@ -3,27 +3,57 @@ package eu.dnetlib.dhp.sx.bio.pubmed; import java.io.Serializable; +/** + * The type Pubmed author. + * + * @author Sandro La Bruzzo + */ public class PMAuthor implements Serializable { private String lastName; private String foreName; + /** + * Gets last name. + * + * @return the last name + */ public String getLastName() { return lastName; } + /** + * Sets last name. + * + * @param lastName the last name + */ public void setLastName(String lastName) { this.lastName = lastName; } + /** + * Gets fore name. + * + * @return the fore name + */ public String getForeName() { return foreName; } + /** + * Sets fore name. + * + * @param foreName the fore name + */ public void setForeName(String foreName) { this.foreName = foreName; } + /** + * Gets full name. 
+ * + * @return the full name + */ public String getFullName() { return String .format("%s, %s", this.foreName != null ? this.foreName : "", this.lastName != null ? this.lastName : ""); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMGrant.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMGrant.java index ce9420cc1..abb908483 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMGrant.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMGrant.java @@ -1,41 +1,86 @@ package eu.dnetlib.dhp.sx.bio.pubmed; +/** + * The type Pm grant. + * + * @author Sandro La Bruzzo + */ public class PMGrant { private String grantID; private String agency; private String country; + /** + * Instantiates a new Pm grant. + */ public PMGrant() { } + /** + * Instantiates a new Pm grant. + * + * @param grantID the grant id + * @param agency the agency + * @param country the country + */ public PMGrant(String grantID, String agency, String country) { this.grantID = grantID; this.agency = agency; this.country = country; } + /** + * Gets grant id. + * + * @return the grant id + */ public String getGrantID() { return grantID; } + /** + * Sets grant id. + * + * @param grantID the grant id + */ public void setGrantID(String grantID) { this.grantID = grantID; } + /** + * Gets agency. + * + * @return the agency + */ public String getAgency() { return agency; } + /** + * Sets agency. + * + * @param agency the agency + */ public void setAgency(String agency) { this.agency = agency; } + /** + * Gets country. + * + * @return the country + */ public String getCountry() { return country; } + /** + * Sets country. + * + * @param country the country + */ public void setCountry(String country) { this.country = country; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMJournal.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMJournal.java index 863a23bd5..731648839 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMJournal.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMJournal.java @@ -3,6 +3,11 @@ package eu.dnetlib.dhp.sx.bio.pubmed; import java.io.Serializable; +/** + * The type Pm journal. + * + * @author Sandro La Bruzzo + */ public class PMJournal implements Serializable { private String issn; @@ -11,42 +16,92 @@ public class PMJournal implements Serializable { private String date; private String title; + /** + * Gets issn. + * + * @return the issn + */ public String getIssn() { return issn; } + /** + * Sets issn. + * + * @param issn the issn + */ public void setIssn(String issn) { this.issn = issn; } + /** + * Gets volume. + * + * @return the volume + */ public String getVolume() { return volume; } + /** + * Sets volume. + * + * @param volume the volume + */ public void setVolume(String volume) { this.volume = volume; } + /** + * Gets issue. + * + * @return the issue + */ public String getIssue() { return issue; } + /** + * Sets issue. + * + * @param issue the issue + */ public void setIssue(String issue) { this.issue = issue; } + /** + * Gets date. + * + * @return the date + */ public String getDate() { return date; } + /** + * Sets date. + * + * @param date the date + */ public void setDate(String date) { this.date = date; } + /** + * Gets title. + * + * @return the title + */ public String getTitle() { return title; } + /** + * Sets title. 
+ * + * @param title the title + */ public void setTitle(String title) { this.title = title; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala index 80cb0667c..c6d5fdf74 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala @@ -2,6 +2,12 @@ package eu.dnetlib.dhp.sx.bio.pubmed import scala.xml.MetaData import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader} + + +/** + * + * @param xml + */ class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] { var currentArticle:PMArticle = generateNextArticle() diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMSubject.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMSubject.java index 862d39a94..e3829bb7b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMSubject.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMSubject.java @@ -1,40 +1,83 @@ package eu.dnetlib.dhp.sx.bio.pubmed; +/** + * The type Pubmed subject. + */ public class PMSubject { private String value; private String meshId; private String registryNumber; + /** + * Instantiates a new Pm subject. + */ public PMSubject() { } + /** + * Instantiates a new Pm subject. + * + * @param value the value + * @param meshId the mesh id + * @param registryNumber the registry number + */ public PMSubject(String value, String meshId, String registryNumber) { this.value = value; this.meshId = meshId; this.registryNumber = registryNumber; } + /** + * Gets value. + * + * @return the value + */ public String getValue() { return value; } + /** + * Sets value. + * + * @param value the value + */ public void setValue(String value) { this.value = value; } + /** + * Gets mesh id. + * + * @return the mesh id + */ public String getMeshId() { return meshId; } + /** + * Sets mesh id. + * + * @param meshId the mesh id + */ public void setMeshId(String meshId) { this.meshId = meshId; } + /** + * Gets registry number. + * + * @return the registry number + */ public String getRegistryNumber() { return registryNumber; } + /** + * Sets registry number. 
+ * + * @param registryNumber the registry number + */ public void setRegistryNumber(String registryNumber) { this.registryNumber = registryNumber; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index 13f38408e..ecef32202 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -8,6 +8,9 @@ import scala.collection.JavaConverters._ import java.util.regex.Pattern +/** + * + */ object PubMedToOaf { val SUBJ_CLASS = "keywords" @@ -15,7 +18,17 @@ object PubMedToOaf { "pmid" -> "https://pubmed.ncbi.nlm.nih.gov/", "doi" -> "https://dx.doi.org/" ) + val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9") + val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central") + + + /** + * Cleaning the DOI Applying regex in order to + * remove doi starting with URL + * @param doi input DOI + * @return cleaned DOI + */ def cleanDoi(doi: String): String = { val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$" @@ -30,6 +43,15 @@ object PubMedToOaf { null } + /** + * + * Create an instance of class extends Result + * starting from OAF instanceType value + * + * @param cobjQualifier OAF instance type + * @param vocabularies All dnet vocabularies + * @return the correct instance + */ def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = { val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid) result_typologies.getClassid match { @@ -42,6 +64,12 @@ object PubMedToOaf { } } + /** + * Mapping the Pubmedjournal info into the OAF Journale + * + * @param j the pubmedJournal + * @return the OAF Journal + */ def mapJournal(j: PMJournal): Journal = { if (j == null) return null @@ -49,6 +77,7 @@ object PubMedToOaf { journal.setDataInfo(dataInfo) journal.setName(j.getTitle) + journal.setConferencedate(j.getDate) journal.setVol(j.getVolume) journal.setIssnPrinted(j.getIssn) journal.setIss(j.getIssue) @@ -57,25 +86,43 @@ object PubMedToOaf { } - + /** + * + * Find vocabulary term into synonyms and term in the vocabulary + * + * @param vocabularyName the input vocabulary name + * @param vocabularies all the vocabularies + * @param term the term to find + * + * @return the cleaned term value + */ def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = { val a = vocabularies.getSynonymAsQualifier(vocabularyName, term) val b = vocabularies.getTermAsQualifier(vocabularyName, term) if (a == null) b else a } - val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9") - val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central") + /** + * Map the Pubmed Article into the OAF instance + * + * + * @param article the pubmed articles + * @param vocabularies the vocabularies + * @return The OAF instance if the mapping did not fail + */ def convert(article: PMArticle, vocabularies: VocabularyGroup): Result = { if (article.getPublicationTypes == null) return null - val i = new Instance + + + // MAP PMID into pid with classid = 
classname = pmid val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)) if (pidList == null) return null + // MAP //ArticleId[./@IdType="doi"] into alternateIdentifier with classid = classname = doi var alternateIdentifier: StructuredProperty = null if (article.getDoi != null) { val normalizedPid = cleanDoi(article.getDoi) @@ -83,43 +130,64 @@ object PubMedToOaf { alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo) } + // INSTANCE MAPPING + //-------------------------------------------------------------------------------------- + // If the article contains the typology Journal Article then we apply this type //else We have to find a terms that match the vocabulary otherwise we discard it val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue)) + val pubmedInstance = new Instance if (ja.isDefined) { val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue) - i.setInstancetype(cojbCategory) + pubmedInstance.setInstancetype(cojbCategory) } else { val i_type = article.getPublicationTypes.asScala .map(s => getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, s.getValue)) .find(q => q != null) if (i_type.isDefined) - i.setInstancetype(i_type.get) + pubmedInstance.setInstancetype(i_type.get) else return null } - val result = createResult(i.getInstancetype, vocabularies) + val result = createResult(pubmedInstance.getInstancetype, vocabularies) if (result == null) return result result.setDataInfo(dataInfo) - i.setPid(pidList.asJava) + pubmedInstance.setPid(pidList.asJava) if (alternateIdentifier != null) - i.setAlternateIdentifier(List(alternateIdentifier).asJava) - result.setInstance(List(i).asJava) - i.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut) + pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava) + result.setInstance(List(pubmedInstance).asJava) + pubmedInstance.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut) + //CREATE URL From pmid val urlLists: List[String] = pidList .map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue)) .filter(t => t._1.nonEmpty) .map(t => t._1 + t._2) if (urlLists != null) - i.setUrl(urlLists.asJava) - i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) - i.setCollectedfrom(collectedFrom) + pubmedInstance.setUrl(urlLists.asJava) + + //ASSIGN DateofAcceptance + pubmedInstance.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) + //ASSIGN COLLECTEDFROM + pubmedInstance.setCollectedfrom(collectedFrom) result.setPid(pidList.asJava) + + //END INSTANCE MAPPING + //-------------------------------------------------------------------------------------- + + + // JOURNAL MAPPING + //-------------------------------------------------------------------------------------- if (article.getJournal != null && result.isInstanceOf[Publication]) result.asInstanceOf[Publication].setJournal(mapJournal(article.getJournal)) result.setCollectedfrom(List(collectedFrom).asJava) + //END JOURNAL MAPPING + 
//-------------------------------------------------------------------------------------- + + + // RESULT MAPPING + //-------------------------------------------------------------------------------------- result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) if (article.getTitle == null || article.getTitle.isEmpty) @@ -159,6 +227,9 @@ object PubMedToOaf { result.setId(article.getPmid) + + // END RESULT MAPPING + //-------------------------------------------------------------------------------------- val id = IdentifierFactory.createIdentifier(result) if (article.getPmid.equalsIgnoreCase(id)) return null diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_fos_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_fos_parameters.json new file mode 100644 index 000000000..050a25677 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_fos_parameters.json @@ -0,0 +1,33 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName":"out", + "paramLongName":"outputPath", + "paramDescription": "the output path", + "paramRequired": true + }, + + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + }, + { + "paramName": "hnn", + "paramLongName": "hdfsNameNode", + "paramDescription": "the path used to store the HostedByMap", + "paramRequired": true + }, + { + "paramName": "cfn", + "paramLongName": "classForName", + "paramDescription": "the path used to store the HostedByMap", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/scholix/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/config-default.xml similarity index 56% rename from dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/scholix/oozie_app/config-default.xml rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/config-default.xml index dd3c32c62..d262cb6e0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/scholix/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/config-default.xml @@ -12,12 +12,19 @@ true - oozie.action.sharelib.for.spark - spark2 + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + + + hiveDbName + openaire - oozie.launcher.mapreduce.user.classpath.first true - \ No newline at end of file + diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/workflow.xml new file mode 100644 index 000000000..d53504fe6 --- /dev/null +++ 
b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/workflow.xml @@ -0,0 +1,174 @@ + + + + + fosPath + the input path of the resources to be extended + + + + bipScorePath + the path where to find the bipFinder scores + + + outputPath + the path where to store the actionset + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + yarn + cluster + Produces the unresolved from bip finder! + eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareBipFinder + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${bipScorePath} + --outputPath${workingDir}/prepared + + + + + + + + eu.dnetlib.dhp.actionmanager.createunresolvedentities.GetFOSData + --hdfsNameNode${nameNode} + --sourcePath${fosPath} + --outputPath${workingDir}/input/fos + --classForNameeu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel + + + + + + + + yarn + cluster + Produces the unresolved from FOS! 
+ eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareFOSSparkJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${workingDir}/input/fos + --outputPath${workingDir}/prepared + + + + + + + + + + + + + yarn + cluster + Saves the result produced for bip and fos by grouping results with the same id + eu.dnetlib.dhp.actionmanager.createunresolvedentities.SparkSaveUnresolved + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${workingDir}/prepared + --outputPath${outputPath} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json new file mode 100644 index 000000000..b7bad73e6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json @@ -0,0 +1,20 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "sp", + "paramLongName": "sourcePath", + "paramDescription": "the URL from where to get the programme file", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "the path of the new ActionSet", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/produce_unresolved_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/produce_unresolved_parameters.json new file mode 100644 index 000000000..b7bad73e6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/produce_unresolved_parameters.json @@ -0,0 +1,20 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "sp", + "paramLongName": "sourcePath", + "paramDescription": "the URL from where to get the programme file", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "the path of the new ActionSet", + "paramRequired": true + } +] \ No newline at end of file diff --git 
a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml deleted file mode 100644 index c6332ff7d..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/workflow.xml +++ /dev/null @@ -1,81 +0,0 @@ - - - - mainPath - the working path of Datacite stores - - - isLookupUrl - The IS lookUp service endopoint - - - blocksize - 100 - The request block size - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - yarn-cluster - cluster - ImportDatacite - eu.dnetlib.dhp.actionmanager.datacite.ImportDatacite - dhp-aggregation-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --targetPath${mainPath}/datacite_update - --dataciteDumpPath${mainPath}/datacite_dump - --namenode${nameNode} - --masteryarn-cluster - --blocksize${blocksize} - - - - - - - - - yarn-cluster - cluster - TransformJob - eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark - dhp-aggregation-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.sql.shuffle.partitions=3840 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --sourcePath${mainPath}/datacite_dump - --targetPath${mainPath}/datacite_oaf - --isLookupUrl${isLookupUrl} - --exportLinksfalse - --masteryarn-cluster - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/scholix/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/scholix/oozie_app/workflow.xml deleted file mode 100644 index 397288c69..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/scholix/oozie_app/workflow.xml +++ /dev/null @@ -1,84 +0,0 @@ - - - - datacitePath - the path of Datacite spark dataset - - - isLookupUrl - The IS lookUp service endopoint - - - crossrefPath - the path of Crossref spark dataset - - - - targetPath - the path of Crossref spark dataset - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - yarn-cluster - cluster - ImportDatacite - eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark - dhp-aggregation-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.sql.shuffle.partitions=3840 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - 
--sourcePath${datacitePath} - --targetPath${targetPath}/datacite_oaf - --isLookupUrl${isLookupUrl} - --exportLinkstrue - --masteryarn-cluster - - - - - - - - - yarn-cluster - cluster - FilterCrossrefEntities - eu.dnetlib.dhp.actionmanager.datacite.FilterCrossrefEntitiesSpark - dhp-aggregation-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.sql.shuffle.partitions=3840 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --sourcePath${crossrefPath} - --targetPath${targetPath}/crossref_oaf - --masteryarn-cluster - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json new file mode 100644 index 000000000..308e02026 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json @@ -0,0 +1,25 @@ +[ + { + "paramName": "ip", + "paramLongName": "inputPath", + "paramDescription": "the zipped opencitations file", + "paramRequired": true + }, + { + "paramName": "op", + "paramLongName": "outputPath", + "paramDescription": "the working path", + "paramRequired": true + }, + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "the hdfs name node", + "paramRequired": false + }, { + "paramName": "sdr", + "paramLongName": "shouldDuplicateRels", + "paramDescription": "the hdfs name node", + "paramRequired": false +} +] diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json new file mode 100644 index 000000000..4910ad11d --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json @@ -0,0 +1,20 @@ +[ + { + "paramName": "if", + "paramLongName": "inputFile", + "paramDescription": "the zipped opencitations file", + "paramRequired": true + }, + { + "paramName": "wp", + "paramLongName": "workingPath", + "paramDescription": "the working path", + "paramRequired": true + }, + { + "paramName": "hnn", + "paramLongName": "hdfsNameNode", + "paramDescription": "the hdfs name node", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/config-default.xml new file mode 100644 index 000000000..a1755f329 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/config-default.xml @@ -0,0 +1,58 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2ExtraListeners + 
com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + oozie.launcher.mapreduce.user.classpath.first + true + + + sparkExecutorNumber + 4 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 6G + + + sparkExecutorCores + 1 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/download.sh b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/download.sh new file mode 100644 index 000000000..7a34f3c4e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/download.sh @@ -0,0 +1,2 @@ +#!/bin/bash +for file in $(echo $1 | tr ";" "\n"); do curl -L $(echo $file | cut -d '@' -f 1 ) | hdfs dfs -put - $2/$(echo $file | cut -d '@' -f 2) ; done; \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml new file mode 100644 index 000000000..d052791a3 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml @@ -0,0 +1,91 @@ + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + + + ${wf:conf('resumeFrom') eq 'DownloadDump'} + ${wf:conf('resumeFrom') eq 'ExtractContent'} + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + ${jobTracker} + ${nameNode} + + + mapred.job.queue.name + ${queueName} + + + download.sh + ${filelist} + ${workingPath}/Original + HADOOP_USER_NAME=${wf:user()} + download.sh + + + + + + + + eu.dnetlib.dhp.actionmanager.opencitations.GetOpenCitationsRefs + --hdfsNameNode${nameNode} + --inputFile${inputFile} + --workingPath${workingPath} + + + + + + + + yarn + cluster + Produces the AS for OC + eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${workingPath}/COCI + --outputPath${outputPath} + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/opencitations_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/opencitations_parameters.json new file mode 100644 index 000000000..258d6816e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/opencitations_parameters.json @@ -0,0 +1,8 @@ +[ + {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true}, + 
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true}, + {"paramName":"f", "paramLongName":"opencitationFile", "paramDescription": "the name of the file", "paramRequired": true}, + {"paramName":"issm", "paramLongName":"isSparkSessionManaged", "paramDescription": "the name of the activities orcid file", "paramRequired": false}, + {"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the name of the activities orcid file", "paramRequired": true} + +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/collection/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/config-default.xml rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/collection/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/collection/oozie_app/workflow.xml similarity index 59% rename from dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/workflow.xml rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/collection/oozie_app/workflow.xml index 3c58ace7b..6989eed66 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/actionset/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/collection/oozie_app/workflow.xml @@ -1,46 +1,52 @@ - + - sourcePath + mainPath the working path of Datacite stores - outputPath - the path of Datacite ActionSet + isLookupUrl + The IS lookUp service endopoint + + blocksize + 100 + The request block size + + - + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + yarn-cluster cluster - ExportDataset - eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode + ImportDatacite + eu.dnetlib.dhp.datacite.ImportDatacite dhp-aggregation-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} - --conf spark.sql.shuffle.partitions=3840 --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --sourcePath${sourcePath} - --targetPath${outputPath} + --targetPath${mainPath}/datacite_update + --dataciteDumpPath${mainPath}/datacite_dump + --namenode${nameNode} --masteryarn-cluster + --blocksize${blocksize} - \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/datacite_filter similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/datacite_filter rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/datacite_filter diff --git 
a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/exportDataset_parameters.json similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/exportDataset_parameters.json diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/filter_crossref_param.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/filter_crossref_param.json similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/filter_crossref_param.json rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/filter_crossref_param.json diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/generate_dataset_params.json similarity index 90% rename from dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/generate_dataset_params.json index 67e7f37dc..04dc8b942 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/generate_dataset_params.json @@ -7,8 +7,8 @@ }, { - "paramName": "t", - "paramLongName": "targetPath", + "paramName": "mo", + "paramLongName": "mdstoreOutputVersion", "paramDescription": "the target mdstore path", "paramRequired": true }, diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/hostedBy_map.json rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/import_from_api.json similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/import_from_api.json diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/transformation/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/datacite/oozie_app/config-default.xml rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/transformation/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/transformation/oozie_app/workflow.xml 
b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/transformation/oozie_app/workflow.xml new file mode 100644 index 000000000..d5bae7305 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/transformation/oozie_app/workflow.xml @@ -0,0 +1,126 @@ + + + + mainPath + the working path of Datacite stores + + + isLookupUrl + The IS lookUp service endopoint + + + mdStoreOutputId + the identifier of the cleaned MDStore + + + mdStoreManagerURI + the path of the cleaned mdstore + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionNEW_VERSION + --mdStoreID${mdStoreOutputId} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + + + + yarn-cluster + cluster + TransformJob + eu.dnetlib.dhp.datacite.GenerateDataciteDatasetSpark + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${mainPath}/datacite_dump + --mdstoreOutputVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --isLookupUrl${isLookupUrl} + --exportLinkstrue + --masteryarn-cluster + + + + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionCOMMIT + --namenode${nameNode} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionREAD_UNLOCK + --mdStoreManagerURI${mdStoreManagerURI} + --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionROLLBACK + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml index 73b1a3b60..4b47ae38e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml @@ -52,7 +52,7 @@ yarn-cluster cluster Incremental Download EBI Links - eu.dnetllib.dhp.sx.bio.ebi.SparkDownloadEBILinks + eu.dnetlib.dhp.sx.bio.ebi.SparkDownloadEBILinks dhp-aggregation-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} @@ -85,7 +85,7 @@ yarn-cluster cluster Create OAF DataSet - eu.dnetllib.dhp.sx.bio.ebi.SparkEBILinksToOaf + eu.dnetlib.dhp.sx.bio.ebi.SparkEBILinksToOaf dhp-aggregation-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml 
b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml index 21fd2d153..8915a090b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml @@ -30,7 +30,7 @@ yarn cluster Convert Baseline to OAF Dataset - eu.dnetllib.dhp.sx.bio.ebi.SparkCreateBaselineDataFrame + eu.dnetlib.dhp.sx.bio.ebi.SparkCreateBaselineDataFrame dhp-aggregation-${projectVersion}.jar --executor-memory=${sparkExecutorMemory}
diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/datacite.md b/dhp-workflows/dhp-aggregation/src/site/markdown/datacite.md new file mode 100644 index 000000000..e69de29bb
diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/index.md b/dhp-workflows/dhp-aggregation/src/site/markdown/index.md new file mode 100644 index 000000000..c0c756082 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/site/markdown/index.md @@ -0,0 +1,9 @@ +## DHP-Aggregation + +This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records. + +Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure +the consistency of the read/write operations on the data, since the MdSM keeps track of the logical-physical mapping +of each MDStore. + +It defines [mappings](mappings.md) for the transformation of the different datasources (see the mappings section). \ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/introduction.md b/dhp-workflows/dhp-aggregation/src/site/markdown/introduction.md new file mode 100644 index 000000000..9da46a27e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/site/markdown/introduction.md @@ -0,0 +1,7 @@ +## DHP-Aggregation + +This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records. + +Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure +the consistency of the read/write operations on the data, since the MdSM keeps track of the logical-physical mapping +of each MDStore. \ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/mappings.md b/dhp-workflows/dhp-aggregation/src/site/markdown/mappings.md new file mode 100644 index 000000000..576c4b6be --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/site/markdown/mappings.md @@ -0,0 +1,18 @@ +DHP Aggregation +=============== + +DHP-Aggregation contains the different mappings from the original data formats into the OAF data format, +which converge into the graph in different ways: + +- Via the Action Manager +- Directly in the MDStore on Hadoop + +Below is the list of the implemented mappings. + + +Mappings +======== + +1. [PubMed](pubmed.md) +2. [Datacite](datacite.md) +
diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md b/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md new file mode 100644 index 000000000..f6327a51b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md @@ -0,0 +1,62 @@ +# PubMed Mapping +This section describes the mapping implemented for [MEDLINE/PubMed](https://pubmed.ncbi.nlm.nih.gov/).
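As a minimal, self-contained sketch of a few of the field-level rules detailed in the table below: the snippet is plain Java, independent of PMParser and of the dhp-* model classes, and the helper names (pmidToInstanceUrl, fullName, rankAuthors) are illustrative only, not project API.

```java
// Minimal illustrative sketch only (plain Java, not the classes used by the workflow).
// It mirrors three of the rules from the mapping table below; the helper names are
// hypothetical and are not part of the dhp-aggregation code base.
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class PubmedMappingSketch {

	// //PMID -> instance/URL: prepend the base URL https://pubmed.ncbi.nlm.nih.gov/ to the PMID
	static String pmidToInstanceUrl(String pmid) {
		return "https://pubmed.ncbi.nlm.nih.gov/" + pmid;
	}

	// //Author/FullName: concatenation of ForeName + LastName, when both exist
	static String fullName(String foreName, String lastName) {
		return (foreName == null || foreName.isEmpty()) ? lastName : foreName + " " + lastName;
	}

	// FOR ALL AUTHOR -> author.rank: sequential number starting from 1
	static List<String> rankAuthors(List<String> authors) {
		final List<String> ranked = new ArrayList<>();
		for (int i = 0; i < authors.size(); i++) {
			ranked.add((i + 1) + " - " + authors.get(i));
		}
		return ranked;
	}

	public static void main(String[] args) {
		System.out.println(pmidToInstanceUrl("12345678")); // https://pubmed.ncbi.nlm.nih.gov/12345678
		System.out.println(fullName("Jane", "Doe")); // Jane Doe
		System.out.println(rankAuthors(Arrays.asList("Jane Doe", "John Roe"))); // [1 - Jane Doe, 2 - John Roe]
	}
}
```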
+ +Collection +--------- +The native data is collected from the [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) containing XML with +the following [schema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html) + + +Parsing +------- +The class responsible for parsing is [PMParser](./scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser), which generates +an intermediate mapping of the PubMed Article defined [here](/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html) + + +Mapping +------- + +The table below describes the mapping from the native XML to the OAF model + + + + + +| Xpath Source | Oaf Field | Notes | +| ----------- | ----------- | ----------- | +| //PMID | pid | classid = classname = pmid +| | **Instance Mapping** | | +|//PublicationType | InstanceType | If the article contains the typology **Journal Article** we apply this type; otherwise we look for a term matching the vocabulary and discard the record if none is found +|//PMID | instance/PID | map the pmid also as pid in the instance | +| //ArticleId[./@IdType="doi"] | instance/alternateIdentifier |classid = classname = doi +|//PMID | instance/URL | prepend the base URL https://pubmed.ncbi.nlm.nih.gov/ to the PMID +| //PubmedPubDate | instance/Dateofacceptance | apply the function GraphCleaningFunctions.cleanDate before assigning it +| FOR ALL INSTANCE | CollectedFrom | datasourceName: *Europe PubMed Central* DatasourceId: +| | **Journal Mapping** | | +|//Journal/PubDate| Journal/Conferencedate | map the date of the Journal +|//Journal/Title| Journal/Name | | +|//Journal/Volume| Journal/Vol | | +|//Journal/ISSN| Journal/issPrinted | | +|//Journal/Issue| Journal/Iss | | +| | **Publication Mapping** | | +| //PubmedPubDate | Dateofacceptance | apply the function GraphCleaningFunctions.cleanDate before assigning it +| //Title | title | with qualifier ModelConstants.MAIN_TITLE_QUALIFIER +| //AbstractText | Description || +|//Language| Language| cleaning vocabulary -> dnet:languages +|//DescriptorName| Subject | classId, className = keyword +| | **Author Mapping** | | +|//Author/LastName| author.Surname| | +|//Author/ForeName| author.Forename| | +|//Author/FullName| author.Fullname| concatenation of ForeName + LastName, when both exist | +|FOR ALL AUTHOR | author.rank| sequential number starting from 1| + + + + + + + + + + +
diff --git a/dhp-workflows/dhp-aggregation/src/site/resources/images/openaire.png b/dhp-workflows/dhp-aggregation/src/site/resources/images/openaire.png new file mode 100644 index 000000000..00d320c39 Binary files /dev/null and b/dhp-workflows/dhp-aggregation/src/site/resources/images/openaire.png differ
diff --git a/dhp-workflows/dhp-aggregation/src/site/site.xml b/dhp-workflows/dhp-aggregation/src/site/site.xml new file mode 100644 index 000000000..da5da0f1e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/site/site.xml @@ -0,0 +1,32 @@ + + + + org.apache.maven.skins + maven-fluido-skin + 1.8 + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java new file mode 100644 index 000000000..c48ccc8c2 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareTest.java @@ -0,0 +1,250 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities; + +import static
org.junit.jupiter.api.Assertions.*; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.stream.Collectors; + +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel; +import eu.dnetlib.dhp.common.collection.CollectorException; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class PrepareTest { + + private static final Logger log = LoggerFactory.getLogger(ProduceTest.class); + + private static Path workingDir; + private static SparkSession spark; + private static LocalFileSystem fs; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(PrepareTest.class.getSimpleName()); + + fs = FileSystem.getLocal(new Configuration()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(ProduceTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(PrepareTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + void bipPrepareTest() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json") + .getPath(); + + PrepareBipFinder + .main( + new String[] { + "--isSparkSessionManaged", Boolean.FALSE.toString(), + "--sourcePath", sourcePath, + "--outputPath", workingDir.toString() + "/work" + + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/work/bip") + .map(item -> OBJECT_MAPPER.readValue(item, Result.class)); + + Assertions.assertEquals(86, tmp.count()); + + String doi1 = "unresolved::10.0000/096020199389707::doi"; + + Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi1)).count()); + Assertions.assertEquals(3, tmp.filter(r -> r.getId().equals(doi1)).collect().get(0).getMeasures().size()); + Assertions + .assertEquals( + "6.34596412687e-09", tmp + .filter(r -> r.getId().equals(doi1)) + .collect() + .get(0) + .getMeasures() + .stream() + .filter(sl -> sl.getId().equals("influence")) + .collect(Collectors.toList()) + .get(0) + .getUnit() + .get(0) + .getValue()); + Assertions + .assertEquals( + "0.641151896994", tmp + .filter(r -> r.getId().equals(doi1)) + .collect() + .get(0) + .getMeasures() + 
.stream() + .filter(sl -> sl.getId().equals("popularity_alt")) + .collect(Collectors.toList()) + .get(0) + .getUnit() + .get(0) + .getValue()); + Assertions + .assertEquals( + "2.33375102921e-09", tmp + .filter(r -> r.getId().equals(doi1)) + .collect() + .get(0) + .getMeasures() + .stream() + .filter(sl -> sl.getId().equals("popularity")) + .collect(Collectors.toList()) + .get(0) + .getUnit() + .get(0) + .getValue()); + + } + + @Test + void getFOSFileTest() throws IOException, ClassNotFoundException { + + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/h2020_fos_sbs.csv") + .getPath(); + final String outputPath = workingDir.toString() + "/fos.json"; + + new GetFOSData() + .doRewrite( + sourcePath, outputPath, "eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel", + '\t', fs); + + BufferedReader in = new BufferedReader( + new InputStreamReader(fs.open(new org.apache.hadoop.fs.Path(outputPath)))); + + String line; + int count = 0; + while ((line = in.readLine()) != null) { + FOSDataModel fos = new ObjectMapper().readValue(line, FOSDataModel.class); + + System.out.println(new ObjectMapper().writeValueAsString(fos)); + count += 1; + } + + assertEquals(38, count); + + } + + @Test + void fosPrepareTest() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json") + .getPath(); + + PrepareFOSSparkJob + .main( + new String[] { + "--isSparkSessionManaged", Boolean.FALSE.toString(), + "--sourcePath", sourcePath, + + "-outputPath", workingDir.toString() + "/work" + + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/work/fos") + .map(item -> OBJECT_MAPPER.readValue(item, Result.class)); + + String doi1 = "unresolved::10.3390/s18072310::doi"; + + assertEquals(50, tmp.count()); + assertEquals(1, tmp.filter(row -> row.getId().equals(doi1)).count()); + assertTrue( + tmp + .filter(r -> r.getId().equals(doi1)) + .flatMap(r -> r.getSubject().iterator()) + .map(sbj -> sbj.getValue()) + .collect() + .contains("engineering and technology")); + + assertTrue( + tmp + .filter(r -> r.getId().equals(doi1)) + .flatMap(r -> r.getSubject().iterator()) + .map(sbj -> sbj.getValue()) + .collect() + .contains("nano-technology")); + assertTrue( + tmp + .filter(r -> r.getId().equals(doi1)) + .flatMap(r -> r.getSubject().iterator()) + .map(sbj -> sbj.getValue()) + .collect() + .contains("nanoscience & nanotechnology")); + + String doi = "unresolved::10.1111/1365-2656.12831::doi"; + assertEquals(1, tmp.filter(row -> row.getId().equals(doi)).count()); + assertTrue( + tmp + .filter(r -> r.getId().equals(doi)) + .flatMap(r -> r.getSubject().iterator()) + .map(sbj -> sbj.getValue()) + .collect() + .contains("psychology and cognitive sciences")); + + assertTrue( + tmp + .filter(r -> r.getId().equals(doi)) + .flatMap(r -> r.getSubject().iterator()) + .map(sbj -> sbj.getValue()) + .collect() + .contains("social sciences")); + assertFalse( + tmp + .filter(r -> r.getId().equals(doi)) + .flatMap(r -> r.getSubject().iterator()) + .map(sbj -> sbj.getValue()) + .collect() + .contains("NULL")); + + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java new file mode 100644 index 
000000000..b77b5bb36 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java @@ -0,0 +1,234 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; + +public class ProduceTest { + private static final Logger log = LoggerFactory.getLogger(ProduceTest.class); + + private static Path workingDir; + private static SparkSession spark; + private static LocalFileSystem fs; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final String ID_PREFIX = "50|doi_________"; + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(ProduceTest.class.getSimpleName()); + + fs = FileSystem.getLocal(new Configuration()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(ProduceTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(ProduceTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + void produceTest() throws Exception { + + final String bipPath = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json") + .getPath(); + + PrepareBipFinder + .main( + new String[] { + "--isSparkSessionManaged", Boolean.FALSE.toString(), + "--sourcePath", bipPath, + "--outputPath", workingDir.toString() + "/work" + + }); + final String fosPath = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json") + .getPath(); + + PrepareFOSSparkJob + .main( + new String[] { + "--isSparkSessionManaged", Boolean.FALSE.toString(), + "--sourcePath", fosPath, + "-outputPath", workingDir.toString() + "/work" + }); + + SparkSaveUnresolved.main(new String[] { + "--isSparkSessionManaged", Boolean.FALSE.toString(), + "--sourcePath", workingDir.toString() + "/work", + + "-outputPath", workingDir.toString() + "/unresolved" + + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/unresolved") + .map(item -> OBJECT_MAPPER.readValue(item, Result.class)); + + Assertions.assertEquals(135, tmp.count()); + + 
Assertions.assertEquals(1, tmp.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi")).count()); + + Assertions + .assertEquals( + 3, tmp + .filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi")) + .collect() + .get(0) + .getSubject() + .size()); + + Assertions + .assertEquals( + 3, tmp + .filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi")) + .collect() + .get(0) + .getMeasures() + .size()); + + List sbjs = tmp + .filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi")) + .flatMap(row -> row.getSubject().iterator()) + .collect(); + + sbjs.forEach(sbj -> Assertions.assertEquals("FOS", sbj.getQualifier().getClassid())); + sbjs + .forEach( + sbj -> Assertions + .assertEquals( + "Fields of Science and Technology classification", sbj.getQualifier().getClassname())); + sbjs + .forEach( + sbj -> Assertions + .assertEquals(ModelConstants.DNET_SUBJECT_TYPOLOGIES, sbj.getQualifier().getSchemeid())); + sbjs + .forEach( + sbj -> Assertions + .assertEquals(ModelConstants.DNET_SUBJECT_TYPOLOGIES, sbj.getQualifier().getSchemename())); + + sbjs.forEach(sbj -> Assertions.assertEquals(false, sbj.getDataInfo().getDeletedbyinference())); + sbjs.forEach(sbj -> Assertions.assertEquals(true, sbj.getDataInfo().getInferred())); + sbjs.forEach(sbj -> Assertions.assertEquals(false, sbj.getDataInfo().getInvisible())); + sbjs.forEach(sbj -> Assertions.assertEquals("", sbj.getDataInfo().getTrust())); + sbjs.forEach(sbj -> Assertions.assertEquals("update", sbj.getDataInfo().getInferenceprovenance())); + sbjs + .forEach( + sbj -> Assertions.assertEquals("subject:fos", sbj.getDataInfo().getProvenanceaction().getClassid())); + sbjs + .forEach( + sbj -> Assertions + .assertEquals("Inferred by OpenAIRE", sbj.getDataInfo().getProvenanceaction().getClassname())); + sbjs + .forEach( + sbj -> Assertions + .assertEquals( + ModelConstants.DNET_PROVENANCE_ACTIONS, sbj.getDataInfo().getProvenanceaction().getSchemeid())); + sbjs + .forEach( + sbj -> Assertions + .assertEquals( + ModelConstants.DNET_PROVENANCE_ACTIONS, + sbj.getDataInfo().getProvenanceaction().getSchemename())); + + sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("engineering and technology")); + sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("nano-technology")); + sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("nanoscience & nanotechnology")); + + List measures = tmp + .filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi")) + .flatMap(row -> row.getMeasures().iterator()) + .collect(); + Assertions + .assertEquals( + "7.5597134689e-09", measures + .stream() + .filter(mes -> mes.getId().equals("influence")) + .collect(Collectors.toList()) + .get(0) + .getUnit() + .get(0) + .getValue()); + + Assertions + .assertEquals( + "4.903880192", measures + .stream() + .filter(mes -> mes.getId().equals("popularity_alt")) + .collect(Collectors.toList()) + .get(0) + .getUnit() + .get(0) + .getValue()); + + Assertions + .assertEquals( + "1.17977512835e-08", measures + .stream() + .filter(mes -> mes.getId().equals("popularity")) + .collect(Collectors.toList()) + .get(0) + .getUnit() + .get(0) + .getValue()); + + Assertions + .assertEquals( + 49, tmp + .filter(row -> !row.getId().equals("unresolved::10.3390/s18072310::doi")) + .filter(row -> row.getSubject() != null) + .count()); + + Assertions + .assertEquals( + 85, + tmp + .filter(row -> !row.getId().equals("unresolved::10.3390/s18072310::doi")) + .filter(r -> r.getMeasures() != null) + .count()); + + } + +} diff --git 
a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java new file mode 100644 index 000000000..5a04dcefe --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateOpenCitationsASTest.java @@ -0,0 +1,335 @@ + +package eu.dnetlib.dhp.actionmanager.opencitations; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.io.Text; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; +import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; + +public class CreateOpenCitationsASTest { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static SparkSession spark; + + private static Path workingDir; + private static final Logger log = LoggerFactory + .getLogger(CreateOpenCitationsASTest.class); + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files + .createTempDirectory(CreateOpenCitationsASTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(CreateOpenCitationsASTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(CreateOpenCitationsASTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + void testNumberofRelations() throws Exception { + + String inputPath = getClass() + .getResource( + "/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles") + .getPath(); + + CreateActionSetSparkJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-shouldDuplicateRels", + Boolean.TRUE.toString(), + "-inputPath", + inputPath, + "-outputPath", + workingDir.toString() + "/actionSet" + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) + .map(aa -> ((Relation) aa.getPayload())); + + assertEquals(60, 
tmp.count()); + + // tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r))); + + } + + @Test + void testNumberofRelations2() throws Exception { + + String inputPath = getClass() + .getResource( + "/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles") + .getPath(); + + CreateActionSetSparkJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-inputPath", + inputPath, + "-outputPath", + workingDir.toString() + "/actionSet" + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) + .map(aa -> ((Relation) aa.getPayload())); + + assertEquals(44, tmp.count()); + + // tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r))); + + } + + @Test + void testRelationsCollectedFrom() throws Exception { + + String inputPath = getClass() + .getResource( + "/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles") + .getPath(); + + CreateActionSetSparkJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-inputPath", + inputPath, + "-outputPath", + workingDir.toString() + "/actionSet" + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) + .map(aa -> ((Relation) aa.getPayload())); + + tmp.foreach(r -> { + assertEquals(ModelConstants.OPENOCITATIONS_NAME, r.getCollectedfrom().get(0).getValue()); + assertEquals(ModelConstants.OPENOCITATIONS_ID, r.getCollectedfrom().get(0).getKey()); + }); + + } + + @Test + void testRelationsDataInfo() throws Exception { + + String inputPath = getClass() + .getResource( + "/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles") + .getPath(); + + CreateActionSetSparkJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-inputPath", + inputPath, + "-outputPath", + workingDir.toString() + "/actionSet" + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) + .map(aa -> ((Relation) aa.getPayload())); + + tmp.foreach(r -> { + assertEquals(false, r.getDataInfo().getInferred()); + assertEquals(false, r.getDataInfo().getDeletedbyinference()); + assertEquals("0.91", r.getDataInfo().getTrust()); + assertEquals( + CreateActionSetSparkJob.OPENCITATIONS_CLASSID, r.getDataInfo().getProvenanceaction().getClassid()); + assertEquals( + CreateActionSetSparkJob.OPENCITATIONS_CLASSNAME, r.getDataInfo().getProvenanceaction().getClassname()); + assertEquals(ModelConstants.DNET_PROVENANCE_ACTIONS, r.getDataInfo().getProvenanceaction().getSchemeid()); + assertEquals(ModelConstants.DNET_PROVENANCE_ACTIONS, r.getDataInfo().getProvenanceaction().getSchemename()); + }); + + } + + @Test + void testRelationsSemantics() throws Exception { + + String inputPath = getClass() + .getResource( + "/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles") + .getPath(); + + CreateActionSetSparkJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-inputPath", + inputPath, + "-outputPath", + workingDir.toString() + "/actionSet" 
+ }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) + .map(aa -> ((Relation) aa.getPayload())); + + tmp.foreach(r -> { + assertEquals("citation", r.getSubRelType()); + assertEquals("resultResult", r.getRelType()); + }); + assertEquals(22, tmp.filter(r -> r.getRelClass().equals("Cites")).count()); + assertEquals(22, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count()); + + } + + @Test + void testRelationsSourceTargetPrefix() throws Exception { + + String inputPath = getClass() + .getResource( + "/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles") + .getPath(); + + CreateActionSetSparkJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-inputPath", + inputPath, + "-outputPath", + workingDir.toString() + "/actionSet" + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) + .map(aa -> ((Relation) aa.getPayload())); + + tmp.foreach(r -> { + assertEquals("50|doi_________::", r.getSource().substring(0, 17)); + assertEquals("50|doi_________::", r.getTarget().substring(0, 17)); + }); + + } + + @Test + void testRelationsSourceTargetCouple() throws Exception { + final String doi1 = "50|doi_________::" + + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x")); + final String doi2 = "50|doi_________::" + + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x")); + final String doi3 = "50|doi_________::" + + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-014-2114-9")); + final String doi4 = "50|doi_________::" + + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069")); + final String doi5 = "50|doi_________::" + + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-009-9913-4")); + final String doi6 = "50|doi_________::" + + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5")); + + String inputPath = getClass() + .getResource( + "/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles") + .getPath(); + + CreateActionSetSparkJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-inputPath", + inputPath, + "-outputPath", + workingDir.toString() + "/actionSet" + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) + .map(aa -> ((Relation) aa.getPayload())); + + JavaRDD check = tmp.filter(r -> r.getSource().equals(doi1) || r.getTarget().equals(doi1)); + + assertEquals(10, check.count()); + + check.foreach(r -> { + if (r.getSource().equals(doi2) || r.getSource().equals(doi3) || r.getSource().equals(doi4) || + r.getSource().equals(doi5) || r.getSource().equals(doi6)) { + assertEquals(ModelConstants.IS_CITED_BY, r.getRelClass()); + assertEquals(doi1, r.getTarget()); + } + }); + + assertEquals(5, check.filter(r -> r.getSource().equals(doi1)).count()); + check.filter(r -> 
r.getSource().equals(doi1)).foreach(r -> assertEquals(ModelConstants.CITES, r.getRelClass())); + + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala similarity index 88% rename from dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala rename to dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala index a795a910d..f21e9eab1 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/datacite/DataciteToOAFTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/datacite/DataciteToOAFTest.scala @@ -1,8 +1,7 @@ -package eu.dnetlib.dhp.actionmanager.datacite +package eu.dnetlib.dhp.datacite -import com.fasterxml.jackson.databind.ObjectMapper -import com.fasterxml.jackson.databind.SerializationFeature +import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature} import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest import eu.dnetlib.dhp.schema.oaf.Oaf import org.junit.jupiter.api.extension.ExtendWith diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json new file mode 100644 index 000000000..03cef4be1 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json @@ -0,0 +1,86 @@ +{"10.3390/s18072310": [{"id": "influence", "unit": [{"value": "7.5597134689e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "4.903880192", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "1.17977512835e-08", "key": "score"}]}]} +{"10.0000/096020199389707": [{"id": "influence", "unit": [{"value": "6.34596412687e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.641151896994", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "2.33375102921e-09", "key": "score"}]}]} +{"10.00000/jpmc.2017.106": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]} +{"10.0000/9781845416881": [{"id": "influence", "unit": [{"value": "5.96492048955e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "1.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "1.12641925838e-08", "key": "score"}]}]} +{"10.0000/anziamj.v0i0.266": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.76260934675e-10", "key": "score"}]}]} +{"10.0000/anziamj.v48i0.79": [{"id": "influence", "unit": [{"value": "6.93311506443e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.002176782336", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "1.7668105708e-09", "key": "score"}]}]} +{"10.0000/anziamj.v50i0.1472": [{"id": "influence", "unit": [{"value": "6.26777280882e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.406656", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.39745193285e-09", "key": "score"}]}]} +{"10.0000/cja5553": 
[{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} +{"10.0000/czastest.16": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]} +{"10.0000/czastest.17": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.47956715615e-09", "key": "score"}]}]} +{"10.0000/czastest.18": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.47956715615e-09", "key": "score"}]}]} +{"10.0000/czastest.20": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]} +{"10.0000/czastest.21": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.47956715615e-09", "key": "score"}]}]} +{"10.0000/czastest.28": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.47956715615e-09", "key": "score"}]}]} +{"10.0000/czastest.60": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]} +{"10.0000/czt.2019.1.2.15": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v4i02.36": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v4i02.37": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v4i02.38": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v5i01.32": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v6i01.24": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": 
[{"value": "4.01810569717e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v6i01.27": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v6i02.41": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v6i02.44": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v7i01.40": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v7i01.42": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v7i01.47": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v7i01.51": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v7i01.52": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v7i02.86": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v7i02.88": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v7i02.91": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v8i01.129": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v8i01.180": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]} +{"10.0000/geoekonomi.v8i01.87": [{"id": "influence", "unit": 
[{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]} +{"10.0000/hbv2004w010": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} +{"10.0000/hbv2101w001": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]} +{"10.0000/hbv2101w002": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]} +{"10.0000/hbv2101w003": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]} +{"10.0000/hbv2101w004": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]} +{"10.0000/hbv2101w005": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]} +{"10.0000/hbv2101w006": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]} +{"10.0000/hbv2101w007": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]} +{"10.0000/hbv2102w001": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]} +{"10.0000/hbv2102w010": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]} +{"10.0000/hoplos.v1i1.13207": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]} +{"10.0000/hoplos.v1i1.13208": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]} +{"10.0000/hoplos.v1i1.13209": [{"id": "influence", "unit": [{"value": "6.32078461509e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "1.6", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.3168486939e-09", "key": "score"}]}]} 
+{"10.0000/hoplos.v1i1.13210": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]} +{"10.0000/hoplos.v1i1.13211": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]} +{"10.0000/hoplos.v1i1.13212": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]} +{"10.0000/hoplos.v1i2.13231": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]} +{"10.0000/hoplos.v2i2.28782": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]} +{"10.0000/hoplos.v2i2.28783": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]} +{"10.0000/hoplos.v2i2.28784": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]} +{"10.0000/hoplos.v2i2.28786": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]} +{"10.0000/hoplos.v2i2.28787": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]} +{"10.0000/hoplos.v2i2.28788": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]} +{"10.0000/hoplos.v2i3.28234": [{"id": "influence", "unit": [{"value": "6.40470414877e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.6", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.89465099068e-09", "key": "score"}]}]} +{"10.0000/hoplos.v2i3.28236": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]} +{"10.0000/hoplos.v2i3.28238": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]} +{"10.0000/hoplos.v2i3.28239": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": 
[{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]} +{"10.0000/hoplos.v2i3.28242": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]} +{"10.0000/hoplos.v2i3.28243": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]} +{"10.0000/hoplos.v3i4.38186": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]} +{"10.0000/hoplos.v3i4.38187": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]} +{"10.0000/hoplos.v3i4.38190": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]} +{"10.0000/hoplos.v3i4.38207": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]} +{"10.0000/hoplos.v3i4.38209": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]} +{"10.0000/hoplos.v3i5.41163": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]} +{"10.0000/hoplos.v3i5.41166": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]} +{"10.0000/hoplos.v3i5.41167": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]} +{"10.0000/hoplos.v3i5.41168": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]} +{"10.0000/hoplos.v3i5.41229": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} +{"10.0000/hoplos.v4i6.36360": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} +{"10.0000/hoplos.v4i6.40796": 
[{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} +{"10.0000/hoplos.v4i6.41153": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} +{"10.0000/hoplos.v4i6.42511": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} +{"10.0000/hoplos.v4i6.42555": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} +{"10.0000/hoplos.v4i6.42752": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} +{"10.0000/hoplos.v4i6.42768": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} +{"10.0000/hoplos.v4i6.42795": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} +{"10.0000/hoplos.v4i7.41295": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} +{"10.0000/hoplos.v4i7.42830": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} +{"10.0000/hoplos.v4i7.42861": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} +{"10.0000/hoplos.v4i7.43096": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json new file mode 100644 index 000000000..1b46a3d25 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json @@ -0,0 +1,38 @@ +{"doi":"10.3390/s18072310","level1":"engineering and technology","level2":"nano-technology","level3":"nanoscience & nanotechnology"} +{"doi":"10.1111/1365-2656.12831\u000210.17863/cam.24369","level1":"social 
sciences","level2":"psychology and cognitive sciences","level3":"NULL"} +{"doi":"10.3929/ethz-b-000187584\u000210.1002/chem.201701644","level1":"natural sciences","level2":"NULL","level3":"NULL"} +{"doi":"10.1080/01913123.2017.1367361","level1":"medical and health sciences","level2":"clinical medicine","level3":"oncology & carcinogenesis"} +{"doi":"10.1051/e3sconf/20199207011","level1":"natural sciences","level2":"earth and related environmental sciences","level3":"environmental sciences"} +{"doi":"10.1038/onc.2015.333","level1":"medical and health sciences","level2":"clinical medicine","level3":"oncology & carcinogenesis"} +{"doi":"10.1093/mnras/staa256","level1":"natural sciences","level2":"physical sciences","level3":"NULL"} +{"doi":"10.1016/j.jclepro.2018.07.166","level1":"engineering and technology","level2":"other engineering and technologies","level3":"building & construction"} +{"doi":"10.1103/physrevlett.125.037403","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"} +{"doi":"10.1080/03602532.2017.1316285","level1":"natural sciences","level2":"NULL","level3":"NULL"} +{"doi":"10.1001/jamanetworkopen.2019.1868","level1":"medical and health sciences","level2":"other medical science","level3":"health policy & services"} +{"doi":"10.1128/mra.00874-18","level1":"natural sciences","level2":"biological sciences","level3":"plant biology & botany"} +{"doi":"10.1016/j.nancom.2018.03.001","level1":"engineering and technology","level2":"NULL","level3":"NULL"} +{"doi":"10.1112/topo.12174","level1":"natural sciences","level2":"NULL","level3":"NULL"} +{"doi":"10.12688/wellcomeopenres.15846.1","level1":"medical and health sciences","level2":"health sciences","level3":"NULL"} +{"doi":"10.21468/scipostphys.3.1.001","level1":"natural sciences","level2":"physical sciences","level3":"NULL"} +{"doi":"10.1088/1741-4326/ab6c77","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"} +{"doi":"10.1109/tpwrs.2019.2944747","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"electrical & electronic engineering"} +{"doi":"10.1016/j.expthermflusci.2019.109994\u000210.17863/cam.46212","level1":"engineering and technology","level2":"mechanical engineering","level3":"mechanical engineering & transports"} +{"doi":"10.1109/tc.2018.2860012","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"computer hardware & architecture"} +{"doi":"10.1002/mma.6622","level1":"natural sciences","level2":"mathematics","level3":"numerical & computational mathematics"} +{"doi":"10.1051/radiopro/2020020","level1":"natural sciences","level2":"chemical sciences","level3":"NULL"} +{"doi":"10.1007/s12268-019-1003-4","level1":"medical and health sciences","level2":"basic medicine","level3":"NULL"} +{"doi":"10.3390/cancers12010236","level1":"medical and health sciences","level2":"health sciences","level3":"biochemistry & molecular biology"} +{"doi":"10.6084/m9.figshare.9912614\u000210.6084/m9.figshare.9912614.v1\u000210.1080/00268976.2019.1665199","level1":"natural sciences","level2":"chemical sciences","level3":"physical chemistry"} +{"doi":"10.1175/jpo-d-17-0239.1","level1":"natural sciences","level2":"biological sciences","level3":"marine biology & hydrobiology"} +{"doi":"10.1007/s13218-020-00674-7","level1":"engineering and technology","level2":"industrial biotechnology","level3":"industrial 
engineering & automation"} +{"doi":"10.1016/j.psyneuen.2016.02.003\u000210.1016/j.psyneuen.2016.02.00310.7892/boris.78886\u000210.7892/boris.78886","level1":"medical and health sciences","level2":"basic medicine","level3":"NULL"} +{"doi":"10.1109/ted.2018.2813542","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"electrical & electronic engineering"} +{"doi":"10.3989/scimar.04739.25a","level1":"natural sciences","level2":"biological sciences","level3":"NULL"} +{"doi":"10.3390/su12187503","level1":"natural sciences","level2":"earth and related environmental sciences","level3":"NULL"} +{"doi":"10.1016/j.ccell.2018.08.017","level1":"medical and health sciences","level2":"basic medicine","level3":"biochemistry & molecular biology"} +{"doi":"10.1103/physrevresearch.2.023322","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"} +{"doi":"10.1039/c8cp03234c","level1":"natural sciences","level2":"NULL","level3":"NULL"} +{"doi":"10.5281/zenodo.3696557\u000210.5281/zenodo.3696556\u000210.1109/jsac.2016.2545384","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"networking & telecommunications"} +{"doi":"10.1038/ng.3667\u000210.1038/ng.3667.\u000210.17615/tct6-4m26\u000210.17863/cam.15649","level1":"medical and health sciences","level2":"health sciences","level3":"genetics & heredity"} +{"doi":"10.1016/j.jclepro.2019.119065","level1":"engineering and technology","level2":"other engineering and technologies","level3":"building & construction"} +{"doi":"10.1111/pce.13392","level1":"agricultural and veterinary sciences","level2":"agriculture, forestry, and fisheries","level3":"agronomy & agriculture"} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/h2020_fos_sbs.csv b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/h2020_fos_sbs.csv new file mode 100644 index 000000000..e874353e8 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/h2020_fos_sbs.csv @@ -0,0 +1,38 @@ +dedup_wf_001::ddcc7a56fa13e49bcc59c6bdd19ad26c 10.3390/s18072310 engineering and technology nano-technology nanoscience & nanotechnology +dedup_wf_001::b76062d56e28224eac56111a4e1e5ecf 10.1111/1365-2656.1283110.17863/cam.24369 social sciences psychology and cognitive sciences NULL +dedup_wf_001::bb752acb8f403a25fa7851a302f7b7ac 10.3929/ethz-b-00018758410.1002/chem.201701644 natural sciences NULL NULL +dedup_wf_001::2f1435a9201ecf5cbbcb12c9b2d971cd 10.1080/01913123.2017.1367361 medical and health sciences clinical medicine oncology & carcinogenesis +dedup_wf_001::fc9e47ec16c67b101724320d4b030514 10.1051/e3sconf/20199207011 natural sciences earth and related environmental sciences environmental sciences +dedup_wf_001::caa1e5b4de387cb31751552f4f0f5d72 10.1038/onc.2015.333 medical and health sciences clinical medicine oncology & carcinogenesis +dedup_wf_001::c2a98df5637d69bf0524eaf40fe6bf11 10.1093/mnras/staa256 natural sciences physical sciences NULL +dedup_wf_001::c221262bdc77cbfd59859a402f0e3991 10.1016/j.jclepro.2018.07.166 engineering and technology other engineering and technologies building & construction +doiboost____::d56d9dc21f317b3e009d5b6c8ea87212 10.1103/physrevlett.125.037403 natural sciences physical 
sciences nuclear & particles physics +dedup_wf_001::8a7269c8ee6470b2fb4fd384bc389e08 10.1080/03602532.2017.1316285 natural sciences NULL NULL +dedup_wf_001::28342ebbc19833e4e1f4a2b23cf5ee20 10.1001/jamanetworkopen.2019.1868 medical and health sciences other medical science health policy & services +dedup_wf_001::c1e1daf2b55dd9ec8e1c7c7458bbc7bc 10.1128/mra.00874-18 natural sciences biological sciences plant biology & botany +dedup_wf_001::a2ef4a2720c71907180750e5871298ef 10.1016/j.nancom.2018.03.001 engineering and technology NULL NULL +dedup_wf_001::676f46a31519e83a89efcb1c626286fb 10.1112/topo.12174 natural sciences NULL NULL +dedup_wf_001::6f2761642f1e39313388e2c4060657dd 10.12688/wellcomeopenres.15846.1 medical and health sciences health sciences NULL +dedup_wf_001::e414c1dec599521a9635a60de0f6755b 10.21468/scipostphys.3.1.001 natural sciences physical sciences NULL +dedup_wf_001::f3395fe0f330164ea424dc61c86c9a3d 10.1088/1741-4326/ab6c77 natural sciences physical sciences nuclear & particles physics +dedup_wf_001::a4f32a97a783117012f1de11797e73f2 10.1109/tpwrs.2019.2944747 engineering and technology electrical engineering, electronic engineering, information engineering electrical & electronic engineering +dedup_wf_001::313ae1cd083ae1696d12dd1909f97df8 10.1016/j.expthermflusci.2019.10999410.17863/cam.46212 engineering and technology mechanical engineering mechanical engineering & transports +dedup_wf_001::2a300a7d3ca7347791ebcef986bc0682 10.1109/tc.2018.2860012 engineering and technology electrical engineering, electronic engineering, information engineering computer hardware & architecture +doiboost____::5b79bd7bd9f87361b4a4abc3cbb2df75 10.1002/mma.6622 natural sciences mathematics numerical & computational mathematics +dedup_wf_001::6a3f61f217a2519fbaddea1094e3bfc2 10.1051/radiopro/2020020 natural sciences chemical sciences NULL +dedup_wf_001::a3f0430309a639f4234a0e57b10f2dee 10.1007/s12268-019-1003-4 medical and health sciences basic medicine NULL +dedup_wf_001::b6b8a3a1cccbee459cf3343485efdb12 10.3390/cancers12010236 medical and health sciences health sciences biochemistry & molecular biology +dedup_wf_001::dd06ee7974730e7b09a4f03c83b3f9bd 10.6084/m9.figshare.991261410.6084/m9.figshare.9912614.v110.1080/00268976.2019.1665199 natural sciences chemical sciences physical chemistry +dedup_wf_001::027c78bef6f972b5e26dfea55d30fbe3 10.1175/jpo-d-17-0239.1 natural sciences biological sciences marine biology & hydrobiology +dedup_wf_001::43edc179aa9e1fbaf582c5203b18b519 10.1007/s13218-020-00674-7 engineering and technology industrial biotechnology industrial engineering & automation +dedup_wf_001::e7770e11cd6eb514bb52c07b5a8a80f0 10.1016/j.psyneuen.2016.02.00310.1016/j.psyneuen.2016.02.00310.7892/boris.7888610.7892/boris.78886 medical and health sciences basic medicine NULL +dedup_wf_001::80bc15d69bdc589149631f3439dde5aa 10.1109/ted.2018.2813542 engineering and technology electrical engineering, electronic engineering, information engineering electrical & electronic engineering +dedup_wf_001::42c1cfa33e7872944b920cff90f4d99e 10.3989/scimar.04739.25a natural sciences biological sciences NULL +dedup_wf_001::9bacdbbaa9da3658b7243d5de8e3ce14 10.3390/su12187503 natural sciences earth and related environmental sciences NULL +dedup_wf_001::59e43d3527dcfecb6097fbd5740c8950 10.1016/j.ccell.2018.08.017 medical and health sciences basic medicine biochemistry & molecular biology +doiboost____::e024d1b738df3b24bc58fa0228542571 10.1103/physrevresearch.2.023322 natural sciences physical sciences nuclear & 
particles physics +dedup_wf_001::66e9a3237fa8178886d26d3c2d5b9e66 10.1039/c8cp03234c natural sciences NULL NULL +dedup_wf_001::83737ab4205bae751571bb3b166efa18 10.5281/zenodo.369655710.5281/zenodo.369655610.1109/jsac.2016.2545384 engineering and technology electrical engineering, electronic engineering, information engineering networking & telecommunications +dedup_wf_001::e3f892db413a689e572dd256acad55fe 10.1038/ng.366710.1038/ng.3667.10.17615/tct6-4m2610.17863/cam.15649 medical and health sciences health sciences genetics & heredity +dedup_wf_001::14ba594e8fd081847bc3f50f56335003 10.1016/j.jclepro.2019.119065 engineering and technology other engineering and technologies building & construction +dedup_wf_001::08ac7b33a41bcea2d055ecd8585d632e 10.1111/pce.13392 agricultural and veterinary sciences agriculture, forestry, and fisheries agronomy & agriculture \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input1 b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input1 new file mode 100644 index 000000000..d93d6fd99 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input1 @@ -0,0 +1,8 @@ +oci,citing,cited,creation,timespan,journal_sc,author_sc +02001000007362801000805046300010563030608046333-0200101010136193701050501630209010637020000083700020400083733,10.1007/s10854-015-3684-x,10.1111/j.1551-2916.2008.02408.x,2015-09-01,P7Y2M,no,no +02001000007362801000805046300010563030608046333-02001000007362801000805046300010463020101046309,10.1007/s10854-015-3684-x,10.1007/s10854-014-2114-9,2015-09-01,P1Y2M4D,yes,no +02001000007362801000805046300010563030608046333-020010001063619371214271022182329370200010337000937000609,10.1007/s10854-015-3684-x,10.1016/j.ceramint.2013.09.069,2015-09-01,P1Y6M,no,no +02001000007362801000805046300010563030608046333-02001000007362801000805046300000963090901036304,10.1007/s10854-015-3684-x,10.1007/s10854-009-9913-4,2015-09-01,P6Y3M10D,yes,no +02001000007362801000805046300010563030608046333-02001000106360000030863010009085807025909000307006305,10.1007/s10854-015-3684-x,10.1016/0038-1098(72)90370-5,2015-09-01,P43Y8M,no,no +02001000007362801000805046300010563030608056309-02001000106361937281010370200010437000937000308,10.1007/s10854-015-3685-9,10.1016/j.saa.2014.09.038,2015-09-03,P0Y7M,no,no +02001000007362801000805046300010563030608056309-0200100010636193722102912171027370200010537000437000106,10.1007/s10854-015-3685-9,10.1016/j.matchar.2015.04.016,2015-09-03,P0Y2M,no,no \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input2 b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input2 new file mode 100644 index 000000000..14ee8b354 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input2 @@ -0,0 +1,8 @@ +oci,citing,cited,creation,timespan,journal_sc,author_sc +02001000308362804010509076300010963000003086301-0200100020936020001003227000009010004,10.1038/s41597-019-0038-1,10.1029/2010wr009104,2019-04-15,P8Y1M,no,no +02001000308362804010509076300010963000003086301-0200100010636280103060463080105025800015900000006006303,10.1038/s41597-019-0038-1,10.1016/s1364-8152(01)00060-3,2019-04-15,P17Y3M,no,no 
+02001000308362804010509076300010963000003086301-02001000007362800000407076300010063000401066333,10.1038/s41597-019-0038-1,10.1007/s00477-010-0416-x,2019-04-15,P8Y9M6D,no,no +02001000308362804010509076300010963000003086301-02001000007362800000700046300010363000905016308,10.1038/s41597-019-0038-1,10.1007/s00704-013-0951-8,2019-04-15,P5Y9M23D,no,no +02001000308362804010509076300010963000003086301-02001000002361924123705070707,10.1038/s41597-019-0038-1,10.1002/joc.5777,2019-04-15,P0Y8M1D,no,no +02001000308362804010509076300010963000003086301-02005010904361714282863020263040504076302000108,10.1038/s41597-019-0038-1,10.5194/hess-22-4547-2018,2019-04-15,P0Y7M18D,no,no +02001000308362804010509076300010963000003086301-02001000002361924123703050404,10.1038/s41597-019-0038-1,10.1002/joc.3544,2019-04-15,P6Y9M6D,no,no \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input3 b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input3 new file mode 100644 index 000000000..0611929d5 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles/input3 @@ -0,0 +1,9 @@ +oci,citing,cited,creation,timespan,journal_sc,author_sc +0200100000236090708010101090307000202023727141528-020050302063600040000010307,10.1002/9781119370222.refs,10.5326/0400137,2020-06-22,P16Y3M,no,no +0200100000236090708010101090307000202023727141528-0200101010136193701050302630905003337020000073700000301093733,10.1002/9781119370222.refs,10.1111/j.1532-950x.2007.00319.x,2020-06-22,P12Y8M,no,no +0200100000236090708010101090307000202023727141528-0200101010136312830370102030509,10.1002/9781119370222.refs,10.1111/vsu.12359,2020-06-22,P4Y10M29D,no,no +0200100000236090708010101090307000202023727141528-020050302063600030900020904,10.1002/9781119370222.refs,10.5326/0390294,2020-06-22,P17Y1M,no,no +0200100000236090708010101090307000202023727141528-020050302063600040200030701,10.1002/9781119370222.refs,10.5326/0420371,2020-06-22,P13Y9M,no,no +0200100000236090708010101090307000202023727141528-0200101010136193701050302630905003337020001033701020000003733,10.1002/9781119370222.refs,10.1111/j.1532-950x.2013.12000.x,2020-06-22,P7Y2M,no,no +0200100000236090708010101090307000202023727141528-020010008003600000408000106093702000006370306070200,10.1002/9781119370222.refs,10.1080/00480169.2006.36720,2020-06-22,P13Y6M,no,no +0200100000236090708010101090307000202023727141528-0200101010136193701070501630008010337020000063700000003033733,10.1002/9781119370222.refs,10.1111/j.1751-0813.2006.00033.x,2020-06-22,P13Y8M,no,no \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/consistency/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/consistency/oozie_app/workflow.xml index 83a47ea6c..4ea003926 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/consistency/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/consistency/oozie_app/workflow.xml @@ -89,7 +89,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 + --conf spark.sql.shuffle.partitions=15000 
--graphBasePath${graphBasePath} --o${graphOutputPath} @@ -114,7 +114,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 + --conf spark.sql.shuffle.partitions=15000 --graphInputPath${graphBasePath} --outputPath${workingPath}/grouped_entities diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 25f0ff381..1b1c850ba 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -70,7 +70,7 @@ case object Crossref2Oaf { "reference-book" -> "0002 Book", "monograph" -> "0002 Book", "journal-article" -> "0001 Article", - "dissertation" -> "0006 Doctoral thesis", + "dissertation" -> "0044 Thesis", "other" -> "0038 Other literature type", "peer-review" -> "0015 Review", "proceedings" -> "0004 Conference object", @@ -206,11 +206,16 @@ case object Crossref2Oaf { else { instance.setDateofacceptance(asField(createdDate.getValue)) } - val s: String = (json \ "URL").extract[String] - val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null).distinct - if (links.nonEmpty) { - instance.setUrl(links.asJava) - } + val s: List[String] = List("https://doi.org/" + doi) +// val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null && p.toLowerCase().contains(doi.toLowerCase())).distinct +// if (links.nonEmpty) { +// instance.setUrl(links.asJava) +// } + if(s.nonEmpty) + { + instance.setUrl(s.asJava) + } + result.setInstance(List(instance).asJava) //IMPORTANT diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala index ecb389af8..016279787 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/SparkProcessMAG.scala @@ -111,26 +111,9 @@ object SparkProcessMAG { .map(item => ConversionUtil.updatePubsWithConferenceInfo(item)) .write .mode(SaveMode.Overwrite) - .save(s"$workingPath/merge_step_2_conference") - - - magPubs= spark.read.load(s"$workingPath/merge_step_2_conference").as[Publication] - .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)] - - val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl] - - - logger.info("Phase 5) enrich publication with URL and Instances") - magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left") - .map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) } - .write.mode(SaveMode.Overwrite) .save(s"$workingPath/merge_step_3") -// logger.info("Phase 6) Enrich Publication with description") -// val pa = spark.read.load(s"${parser.get("sourcePath")}/PaperAbstractsInvertedIndex").as[MagPaperAbstract] -// 
pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"${parser.get("targetPath")}/PaperAbstract") - val paperAbstract = spark.read.load((s"$workingPath/PaperAbstract")).as[MagPaperAbstract] @@ -162,12 +145,14 @@ object SparkProcessMAG { .write.mode(SaveMode.Overwrite) .save(s"$workingPath/mag_publication") + spark.read.load(s"$workingPath/mag_publication").as[Publication] + .filter(p => p.getId == null) + .groupByKey(p => p.getId) + .reduceGroups((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b)) + .map(_._2) + .write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") - val s:RDD[Publication] = spark.read.load(s"$workingPath/mag_publication").as[Publication] - .map(p=>Tuple2(p.getId, p)).rdd.reduceByKey((a:Publication, b:Publication) => ConversionUtil.mergePublication(a,b)) - .map(_._2) - spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/magPublication") } } diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala index 75fb3f787..5ef92cfa4 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/crossref/CrossrefMappingTest.scala @@ -612,4 +612,26 @@ class CrossrefMappingTest { } + @Test + def testMultipleURLs() :Unit = { + val json = Source.fromInputStream(getClass.getResourceAsStream("multiple_urls.json")).mkString + + + assertNotNull(json) + assertFalse(json.isEmpty); + + val resultList: List[Oaf] = Crossref2Oaf.convert(json) + + assertTrue(resultList.nonEmpty) + + + val item : Result = resultList.filter(p => p.isInstanceOf[Result]).head.asInstanceOf[Result] + + assertEquals(1, item.getInstance().size()) + assertEquals(1, item.getInstance().get(0).getUrl().size()) + assertEquals("https://doi.org/10.1016/j.jas.2019.105013", item.getInstance().get(0).getUrl().get(0)) + //println(mapper.writeValueAsString(item)) + + } + } diff --git a/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/multiple_urls.json b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/multiple_urls.json new file mode 100644 index 000000000..5f90feac4 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/test/resources/eu/dnetlib/doiboost/crossref/multiple_urls.json @@ -0,0 +1,614 @@ + +{ +"indexed": { +"date-parts": [ +[ +2021, +10, +31 +] +], +"date-time": "2021-10-31T15:48:01Z", +"timestamp": 1635695281393 +}, +"reference-count": 39, +"publisher": "Elsevier BV", +"license": [ +{ +"start": { +"date-parts": [ +[ +2019, +12, +1 +] +], +"date-time": "2019-12-01T00:00:00Z", +"timestamp": 1575158400000 +}, +"content-version": "tdm", +"delay-in-days": 0, +"URL": "https://www.elsevier.com/tdm/userlicense/1.0/" +}, +{ +"start": { +"date-parts": [ +[ +2019, +9, +13 +] +], +"date-time": "2019-09-13T00:00:00Z", +"timestamp": 1568332800000 +}, +"content-version": "vor", +"delay-in-days": 0, +"URL": "http://creativecommons.org/licenses/by/4.0/" +} +], +"funder": [ +{ +"DOI": "10.13039/100001182", +"name": "INSTAP", +"doi-asserted-by": "publisher" +}, +{ +"DOI": "10.13039/100014440", +"name": "Ministry of Science, Innovation and Universities", +"doi-asserted-by": "publisher", +"award": [ +"RYC-2016-19637" +] +}, +{ +"DOI": "10.13039/100010661", +"name": "European Union’s Horizon 2020", +"doi-asserted-by": "publisher", 
+"award": [ +"746446" +] +} +], +"content-domain": { +"domain": [ +"elsevier.com", +"sciencedirect.com" +], +"crossmark-restriction": true +}, +"short-container-title": [ +"Journal of Archaeological Science" +], +"published-print": { +"date-parts": [ +[ +2019, +12 +] +] +}, +"DOI": "10.1016/j.jas.2019.105013", +"type": "journal-article", +"created": { +"date-parts": [ +[ +2019, +9, +25 +] +], +"date-time": "2019-09-25T20:05:08Z", +"timestamp": 1569441908000 +}, +"page": "105013", +"update-policy": "http://dx.doi.org/10.1016/elsevier_cm_policy", +"source": "Crossref", +"is-referenced-by-count": 21, +"title": [ +"A brave new world for archaeological survey: Automated machine learning-based potsherd detection using high-resolution drone imagery" +], +"prefix": "10.1016", +"volume": "112", +"author": [ +{ +"given": "H.A.", +"family": "Orengo", +"sequence": "first", +"affiliation": [ + +] +}, +{ +"given": "A.", +"family": "Garcia-Molsosa", +"sequence": "additional", +"affiliation": [ + +] +} +], +"member": "78", +"reference": [ +{ +"key": "10.1016/j.jas.2019.105013_bib1", +"doi-asserted-by": "crossref", +"first-page": "85", +"DOI": "10.1080/17538947.2016.1250829", +"article-title": "Remote sensing heritage in a petabyte-scale: satellite data and heritage Earth Engine© applications", +"volume": "10", +"author": "Agapiou", +"year": "2017", +"journal-title": "Int. J. Digit. Earth" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib2", +"series-title": "Extracting Meaning from Ploughsoil Assemblages", +"first-page": "1", +"article-title": "Extracting meaning from ploughsoil assemblages: assessments of the past, strategies for the future", +"author": "Alcock", +"year": "2000" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib3", +"series-title": "Side-by-Side Survey. Comparative Regional Studies in the Mediterranean World", +"first-page": "1", +"article-title": "Introduction", +"author": "Alcock", +"year": "2004" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib4", +"doi-asserted-by": "crossref", +"first-page": "93", +"DOI": "10.1111/j.1538-4632.1995.tb00338.x", +"article-title": "Local indicators of spatial association—LISA", +"volume": "27", +"author": "Anselin", +"year": "1995", +"journal-title": "Geogr. Anal." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib5", +"series-title": "Archaeological Survey", +"author": "Banning", +"year": "2002" +}, +{ +"issue": "1/2", +"key": "10.1016/j.jas.2019.105013_bib6", +"doi-asserted-by": "crossref", +"first-page": "123", +"DOI": "10.2307/3181488", +"article-title": "GIS, archaeological survey and landscape archaeology on the island of Kythera, Greece", +"volume": "29", +"author": "Bevan", +"year": "2004", +"journal-title": "J. Field Archaeol." +}, +{ +"issue": "1", +"key": "10.1016/j.jas.2019.105013_bib8", +"doi-asserted-by": "crossref", +"first-page": "5", +"DOI": "10.1023/A:1010933404324", +"article-title": "Random forests", +"volume": "45", +"author": "Breiman", +"year": "2001", +"journal-title": "Mach. Learn." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib9", +"series-title": "Sampling in Contemporary British Archaeology", +"author": "Cherry", +"year": "1978" +}, +{ +"issue": "3", +"key": "10.1016/j.jas.2019.105013_bib10", +"doi-asserted-by": "crossref", +"first-page": "273", +"DOI": "10.1016/0734-189X(84)90197-X", +"article-title": "Segmentation of a high-resolution urban scene using texture operators", +"volume": "25", +"author": "Conners", +"year": "1984", +"journal-title": "Comput. Vis. 
Graph Image Process" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib11", +"first-page": "31", +"article-title": "Old land surfaces and modern ploughsoil: implications of recent work at Maxey, Cambridgeshire", +"volume": "2", +"author": "Crowther", +"year": "1983", +"journal-title": "Scott. Archaeol. Rev." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib12", +"series-title": "Settlement Pattern Studies in the Americas: Fifty Years since Virú", +"first-page": "203", +"article-title": "Conclusions: the settlement pattern concept from an Americanist perspective", +"author": "Fish", +"year": "1999" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib13", +"doi-asserted-by": "crossref", +"first-page": "21", +"DOI": "10.3390/geosciences9010021", +"article-title": "Remote sensing and historical morphodynamics of alluvial plains. The 1909 indus flood and the city of Dera Gazhi Khan (province of Punjab, Pakistan)", +"volume": "9", +"author": "Garcia", +"year": "2019", +"journal-title": "Geosciences" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib14", +"unstructured": "Georgiadis, M.; Garcia-Molsosa, A.; Orengo, H.A.; Kefalidou, E. and Kallintzi, K. In Preparation. APAX Project 2015-2018: A Preliminary Report. (Hesperia)." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib15", +"series-title": "Geographical Information Systems and Landscape Archaeology", +"first-page": "35", +"article-title": "Regional survey and GIS: the boeotia project", +"author": "Gillings", +"year": "1999" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib16", +"doi-asserted-by": "crossref", +"first-page": "18", +"DOI": "10.1016/j.rse.2017.06.031", +"article-title": "Google Earth engine: planetary-scale geospatial analysis for everyone", +"volume": "202", +"author": "Gorelick", +"year": "2017", +"journal-title": "Remote Sens. Environ." +}, +{ +"issue": "107", +"key": "10.1016/j.jas.2019.105013_bib17", +"doi-asserted-by": "crossref", +"first-page": "177", +"DOI": "10.1111/j.0031-868X.2004.00278.x", +"article-title": "Photogrammetric reconstruction of the great buddha of Bamiyan, Afghanistan", +"volume": "19", +"author": "Grün", +"year": "2004", +"journal-title": "Photogramm. Rec." +}, +{ +"issue": "6", +"key": "10.1016/j.jas.2019.105013_bib18", +"doi-asserted-by": "crossref", +"first-page": "610", +"DOI": "10.1109/TSMC.1973.4309314", +"article-title": "Textural features for image classification", +"author": "Haralick", +"year": "1973", +"journal-title": "IEEE Trans. Syst., Man, Cybernet., SMC-3" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib19", +"doi-asserted-by": "crossref", +"first-page": "76", +"DOI": "10.1558/jmea.v14i1.76", +"article-title": "Excavating to excess? Implications of the last decade of archaeology in Israel", +"volume": "14", +"author": "Kletter", +"year": "2001", +"journal-title": "J. Mediterr. Archaeol." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib20", +"first-page": "299", +"article-title": "Testing Google Earth Engine for the automatic identification and vectorization of archaeological features: a case study from Faynan, Jordan", +"volume": "15", +"author": "Liss", +"year": "2017", +"journal-title": "J. Archaeol. 
Sci.: Report" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib21", +"series-title": "Geographical Information Systems and Landscape Archaeology", +"first-page": "55", +"article-title": "Towards a methodology for modelling surface survey data: the sangro valley project", +"author": "Lock", +"year": "1999" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib22", +"series-title": "Extracting Meaning from Ploughsoil Assemblages", +"first-page": "5", +"article-title": "Methods of collection recording and quantification", +"author": "Mattingly", +"year": "2000" +}, +{ +"issue": "14", +"key": "10.1016/j.jas.2019.105013_bib23", +"doi-asserted-by": "crossref", +"first-page": "E778", +"DOI": "10.1073/pnas.1115472109", +"article-title": "Mapping patterns of long-term settlement in Northern Mesopotamia at a large scale", +"volume": "109", +"author": "Menze", +"year": "2012", +"journal-title": "Proc. Natl. Acad. Sci." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib24", +"doi-asserted-by": "crossref", +"first-page": "80", +"DOI": "10.1016/j.jas.2015.04.002", +"article-title": "A supervised machine-learning approach towards geochemical predictive modelling in archaeology", +"volume": "59", +"author": "Oonk", +"year": "2015", +"journal-title": "J. Archaeol. Sci." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib25", +"doi-asserted-by": "crossref", +"first-page": "49", +"DOI": "10.1016/j.isprsjprs.2012.07.005", +"article-title": "Combining terrestrial stereophotogrammetry, DGPS and GIS-based 3D voxel modelling in the volumetric recording of archaeological features", +"volume": "76", +"author": "Orengo", +"year": "2013", +"journal-title": "ISPRS J. Photogrammetry Remote Sens." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib26", +"doi-asserted-by": "crossref", +"first-page": "100", +"DOI": "10.1016/j.jas.2015.10.008", +"article-title": "Photogrammetric re-discovery of the Eastern Thessalian hidden long-term landscapes", +"volume": "64", +"author": "Orengo", +"year": "2015", +"journal-title": "J. Archaeol. Sci." +}, +{ +"issue": "3", +"key": "10.1016/j.jas.2019.105013_bib27", +"doi-asserted-by": "crossref", +"first-page": "479", +"DOI": "10.3764/aja.122.3.0479", +"article-title": "Towards a definition of Minoan agro-pastoral landscapes: results of the survey at Palaikastro (Crete)", +"volume": "122", +"author": "Orengo", +"year": "2018", +"journal-title": "Am. J. Archaeol." +}, +{ +"issue": "7", +"key": "10.1016/j.jas.2019.105013_bib28", +"doi-asserted-by": "crossref", +"first-page": "735", +"DOI": "10.3390/rs9070735", +"article-title": "Large-scale, multi-temporal remote sensing of palaeo-river networks: a case study from Northwest India and its implications for the Indus civilisation", +"volume": "9", +"author": "Orengo", +"year": "2017", +"journal-title": "Remote Sens." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib29", +"doi-asserted-by": "crossref", +"first-page": "1361", +"DOI": "10.1002/esp.4317", +"article-title": "Multi-scale relief model (MSRM): a new algorithm for the visualization of subtle topographic change of variable size in digital elevation models", +"volume": "43", +"author": "Orengo", +"year": "2018", +"journal-title": "Earth Surf. Process. 
Landforms" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib30", +"series-title": "Submitted to Proceedings of the National Academy of Sciences", +"article-title": "Living on the edge of the desert: automated detection of archaeological mounds in Cholistan (Pakistan) using machine learning classification of multi-sensor multi-temporal satellite data", +"author": "Orengo", +"year": "2019" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib31", +"first-page": "154", +"article-title": "How many trees in a random forest?", +"volume": "vol. 7376", +"author": "Oshiro", +"year": "2012" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib32", +"article-title": "Decision-making in modern surveys", +"volume": "ume 1", +"author": "Plog", +"year": "1978" +}, +{ +"issue": "4", +"key": "10.1016/j.jas.2019.105013_bib33", +"doi-asserted-by": "crossref", +"first-page": "100", +"DOI": "10.3390/geosciences7040100", +"article-title": "From above and on the ground: geospatial methods for recording endangered archaeology in the Middle East and north africa", +"volume": "7", +"author": "Rayne", +"year": "2017", +"journal-title": "Geosciences" +}, +{ +"issue": "1", +"key": "10.1016/j.jas.2019.105013_bib34", +"doi-asserted-by": "crossref", +"first-page": "1", +"DOI": "10.1080/00438243.1978.9979712", +"article-title": "The design of archaeological surveys", +"volume": "10", +"author": "Schiffer", +"year": "1978", +"journal-title": "World Archaeol." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib35", +"series-title": "Experiments in the Collection and Analysis of Archaeological Survey Data: the East Hampshire Survey", +"author": "Shennan", +"year": "1985" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib36", +"doi-asserted-by": "crossref", +"first-page": "1066", +"DOI": "10.1016/j.culher.2016.06.006", +"article-title": "Drones over Mediterranean landscapes. The potential of small UAV's (drones) for site detection and heritage management in archaeological survey projects: a case study from Le Pianelle in the Tappino Valley, Molise (Italy)", +"volume": "22", +"author": "Stek", +"year": "2016", +"journal-title": "J. Cult. Herit." +}, +{ +"key": "10.1016/j.jas.2019.105013_bib37", +"series-title": "Side-by-Side Survey. Comparative Regional Studies in the Mediterranean World", +"first-page": "65", +"article-title": "Side-by-side and back to front: exploring intra-regional latitudinal and longitudinal comparability in survey data. Three case studies from Metaponto, southern Italy", +"author": "Thomson", +"year": "2004" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib38", +"series-title": "Digital Discovery. Exploring New Frontiers in Human Heritage. 
Computer Applications and Quantitative Methods in Archaeology", +"article-title": "Computer vision and machine learning for archaeology", +"author": "van der Maaten", +"year": "2007" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib39", +"doi-asserted-by": "crossref", +"first-page": "1114", +"DOI": "10.1111/j.1475-4754.2012.00667.x", +"article-title": "Computer vision-based orthophoto mapping of complex archaeological sites: the ancient quarry of Pitaranha (Portugal-Spain)", +"volume": "54", +"author": "Verhoeven", +"year": "2012", +"journal-title": "Archaeometry" +}, +{ +"key": "10.1016/j.jas.2019.105013_bib40", +"series-title": "A Guide for Salvage Archeology", +"author": "Wendorf", +"year": "1962" +} +], +"container-title": [ +"Journal of Archaeological Science" +], +"original-title": [ + +], +"language": "en", +"link": [ +{ +"URL": "https://api.elsevier.com/content/article/PII:S0305440319301001?httpAccept=text/xml", +"content-type": "text/xml", +"content-version": "vor", +"intended-application": "text-mining" +}, +{ +"URL": "https://api.elsevier.com/content/article/PII:S0305440319301001?httpAccept=text/plain", +"content-type": "text/plain", +"content-version": "vor", +"intended-application": "text-mining" +} +], +"deposited": { +"date-parts": [ +[ +2019, +11, +25 +] +], +"date-time": "2019-11-25T06:46:34Z", +"timestamp": 1574664394000 +}, +"score": 1, +"subtitle": [ + +], +"short-title": [ + +], +"issued": { +"date-parts": [ +[ +2019, +12 +] +] +}, +"references-count": 39, +"alternative-id": [ +"S0305440319301001" +], +"URL": "http://dx.doi.org/10.1016/j.jas.2019.105013", +"relation": { + +}, +"ISSN": [ +"0305-4403" +], +"issn-type": [ +{ +"value": "0305-4403", +"type": "print" +} +], +"subject": [ +"Archaeology", +"Archaeology" +], +"published": { +"date-parts": [ +[ +2019, +12 +] +] +}, +"assertion": [ +{ +"value": "Elsevier", +"name": "publisher", +"label": "This article is maintained by" +}, +{ +"value": "A brave new world for archaeological survey: Automated machine learning-based potsherd detection using high-resolution drone imagery", +"name": "articletitle", +"label": "Article Title" +}, +{ +"value": "Journal of Archaeological Science", +"name": "journaltitle", +"label": "Journal Title" +}, +{ +"value": "https://doi.org/10.1016/j.jas.2019.105013", +"name": "articlelink", +"label": "CrossRef DOI link to publisher maintained version" +}, +{ +"value": "article", +"name": "content_type", +"label": "Content Type" +}, +{ +"value": "© 2019 The Authors. 
Published by Elsevier Ltd.", +"name": "copyright", +"label": "Copyright" +} +], +"article-number": "105013" +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java index 23e97a97a..392a5ab44 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/PropagationConstant.java @@ -25,6 +25,24 @@ public class PropagationConstant { private PropagationConstant() { } + public static final String DOI = "doi"; + public static final String REF_DOI = ".refs"; + + public static final String UPDATE_DATA_INFO_TYPE = "update"; + public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos"; + public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE"; + public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip"; + + public static final String FOS_CLASS_ID = "FOS"; + public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification"; + + public static final String OPENCITATIONS_CLASSID = "sysimport:crosswalk:opencitations"; + public static final String OPENCITATIONS_CLASSNAME = "Imported from OpenCitations"; + public static final String ID_PREFIX = "50|doi_________::"; + public static final String OC_TRUST = "0.91"; + + public final static String NULL = "NULL"; + public static final String INSTITUTIONAL_REPO_TYPE = "pubsrepository::institutional"; public static final String PROPAGATION_DATA_INFO_TYPE = "propagation"; @@ -75,10 +93,25 @@ public class PropagationConstant { public static DataInfo getDataInfo( String inference_provenance, String inference_class_id, String inference_class_name, String qualifierSchema) { + + return getDataInfo(inference_provenance, inference_class_id, inference_class_name, qualifierSchema, "0.85"); + } + + public static DataInfo getDataInfo( + String inference_provenance, String inference_class_id, String inference_class_name, String qualifierSchema, + String trust) { + return getDataInfo( + inference_provenance, inference_class_id, inference_class_name, qualifierSchema, trust, true); + + } + + public static DataInfo getDataInfo( + String inference_provenance, String inference_class_id, String inference_class_name, String qualifierSchema, + String trust, boolean inferred) { DataInfo di = new DataInfo(); - di.setInferred(true); + di.setInferred(inferred); di.setDeletedbyinference(false); - di.setTrust("0.85"); + di.setTrust(trust); di.setInferenceprovenance(inference_provenance); di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name, qualifierSchema)); return di; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJob.java index ef419a042..474944260 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/merge/MergeGraphTableSparkJob.java @@ -127,13 +127,6 @@ public class MergeGraphTableSparkJob { } }, Encoders.bean(p_clazz)) .filter((FilterFunction
<P>) Objects::nonNull) - .filter((FilterFunction<P>
) o -> { - HashSet collectedFromNames = Optional - .ofNullable(o.getCollectedfrom()) - .map(c -> c.stream().map(KeyValue::getValue).collect(Collectors.toCollection(HashSet::new))) - .orElse(new HashSet<>()); - return !collectedFromNames.contains("Datacite"); - }) .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafApplication.java deleted file mode 100644 index 4cff88d82..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafApplication.java +++ /dev/null @@ -1,136 +0,0 @@ - -package eu.dnetlib.dhp.oa.graph.raw; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -import java.util.*; - -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.FilterFunction; -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.clearspring.analytics.util.Lists; -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; -import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; -import eu.dnetlib.dhp.schema.common.EntityType; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; - -public class CopyHdfsOafApplication extends AbstractMigrationApplication { - - private static final Logger log = LoggerFactory.getLogger(CopyHdfsOafApplication.class); - - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - CopyHdfsOafApplication.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json"))); - parser.parseArgument(args); - - final Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - - final String mdstoreManagerUrl = parser.get("mdstoreManagerUrl"); - log.info("mdstoreManagerUrl: {}", mdstoreManagerUrl); - - final String mdFormat = parser.get("mdFormat"); - log.info("mdFormat: {}", mdFormat); - - final String mdLayout = parser.get("mdLayout"); - log.info("mdLayout: {}", mdLayout); - - final String mdInterpretation = parser.get("mdInterpretation"); - log.info("mdInterpretation: {}", mdInterpretation); - - final String hdfsPath = parser.get("hdfsPath"); - log.info("hdfsPath: {}", hdfsPath); - - final String isLookupUrl = parser.get("isLookupUrl"); - log.info("isLookupUrl: {}", isLookupUrl); - - final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl); - final VocabularyGroup vocs = 
VocabularyGroup.loadVocsFromIS(isLookupService); - - final Set paths = mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation); - - final SparkConf conf = new SparkConf(); - runWithSparkSession(conf, isSparkSessionManaged, spark -> processPaths(spark, vocs, hdfsPath, paths)); - } - - public static void processPaths(final SparkSession spark, - final VocabularyGroup vocs, - final String outputPath, - final Set paths) { - - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - - log.info("Found {} mdstores", paths.size()); - paths.forEach(log::info); - - final String[] validPaths = paths - .stream() - .filter(p -> HdfsSupport.exists(p, sc.hadoopConfiguration())) - .toArray(String[]::new); - log.info("Non empty mdstores {}", validPaths.length); - - if (validPaths.length > 0) { - // load the dataset - Dataset oaf = spark - .read() - .load(validPaths) - .as(Encoders.kryo(Oaf.class)); - - // dispatch each entity type individually in the respective graph subdirectory in append mode - for (Map.Entry e : ModelSupport.oafTypes.entrySet()) { - oaf - .filter((FilterFunction) o -> o.getClass().getSimpleName().toLowerCase().equals(e.getKey())) - .map((MapFunction) OBJECT_MAPPER::writeValueAsString, Encoders.bean(e.getValue())) - .write() - .option("compression", "gzip") - .mode(SaveMode.Append) - .text(outputPath + "/" + e.getKey()); - } - } - } - - private static Relation getInverse(Relation rel, VocabularyGroup vocs) { - final Relation inverse = new Relation(); - - inverse.setProperties(rel.getProperties()); - inverse.setValidated(rel.getValidated()); - inverse.setValidationDate(rel.getValidationDate()); - inverse.setCollectedfrom(rel.getCollectedfrom()); - inverse.setDataInfo(rel.getDataInfo()); - inverse.setLastupdatetimestamp(rel.getLastupdatetimestamp()); - - inverse.setSource(rel.getTarget()); - inverse.setTarget(rel.getSource()); - inverse.setRelType(rel.getRelType()); - inverse.setSubRelType(rel.getSubRelType()); - - return inverse; - } - -} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala new file mode 100644 index 000000000..c7ad1890d --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/CopyHdfsOafSparkApplication.scala @@ -0,0 +1,74 @@ +package eu.dnetlib.dhp.oa.graph.raw + +import com.fasterxml.jackson.databind.ObjectMapper +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.common.HdfsSupport +import eu.dnetlib.dhp.schema.common.ModelSupport +import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo +import eu.dnetlib.dhp.schema.oaf.Oaf +import eu.dnetlib.dhp.utils.DHPUtils +import org.apache.commons.io.IOUtils +import org.apache.commons.lang3.StringUtils +import org.apache.http.client.methods.HttpGet +import org.apache.http.impl.client.HttpClients +import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} +import org.apache.spark.{SparkConf, SparkContext} +import org.slf4j.LoggerFactory + +import scala.collection.JavaConverters._ +import scala.io.Source + +object CopyHdfsOafSparkApplication { + + def main(args: Array[String]): Unit = { + val log = LoggerFactory.getLogger(getClass) + val conf = new SparkConf() + val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json")).mkString) + 
parser.parseArgument(args) + + val spark = + SparkSession + .builder() + .config(conf) + .appName(getClass.getSimpleName) + .master(parser.get("master")).getOrCreate() + + val sc: SparkContext = spark.sparkContext + + val mdstoreManagerUrl = parser.get("mdstoreManagerUrl") + log.info("mdstoreManagerUrl: {}", mdstoreManagerUrl) + + val mdFormat = parser.get("mdFormat") + log.info("mdFormat: {}", mdFormat) + + val mdLayout = parser.get("mdLayout") + log.info("mdLayout: {}", mdLayout) + + val mdInterpretation = parser.get("mdInterpretation") + log.info("mdInterpretation: {}", mdInterpretation) + + val hdfsPath = parser.get("hdfsPath") + log.info("hdfsPath: {}", hdfsPath) + + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + + val paths = DHPUtils.mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation, true).asScala + + val validPaths: List[String] = paths.filter(p => HdfsSupport.exists(p, sc.hadoopConfiguration)).toList + + if (validPaths.nonEmpty) { + val oaf = spark.read.load(validPaths: _*).as[Oaf] + val mapper = new ObjectMapper() + val l =ModelSupport.oafTypes.entrySet.asScala.map(e => e.getKey).toList + l.foreach( + e => + oaf.filter(o => o.getClass.getSimpleName.equalsIgnoreCase(e)) + .map(s => mapper.writeValueAsString(s))(Encoders.STRING) + .write + .option("compression", "gzip") + .mode(SaveMode.Append) + .text(s"$hdfsPath/${e}") + ) + } + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index d78732f9b..e453f7918 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -186,6 +186,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i log.info("Processing Openorgs Merge Rels..."); smdbe.execute("queryOpenOrgsSimilarityForProvision.sql", smdbe::processOrgOrgMergeRels); + + log.info("Processing Openorgs Parent/Child Rels..."); + smdbe.execute("queryParentChildRelsOpenOrgs.sql", smdbe::processOrgOrgParentChildRels); break; case openaire_organizations: @@ -689,6 +692,35 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i } } + public List processOrgOrgParentChildRels(final ResultSet rs) { + try { + final DataInfo info = prepareDataInfo(rs); // TODO + + final String orgId1 = createOpenaireId(20, rs.getString("source"), true); + final String orgId2 = createOpenaireId(20, rs.getString("target"), true); + + final List collectedFrom = listKeyValues( + createOpenaireId(10, rs.getString("collectedfromid"), true), rs.getString("collectedfromname")); + + final Relation r = new Relation(); + r.setRelType(ORG_ORG_RELTYPE); + r.setSubRelType(ModelConstants.RELATIONSHIP); + r + .setRelClass( + rs.getString("type").equalsIgnoreCase("parent") ? 
ModelConstants.IS_PARENT_OF + : ModelConstants.IS_CHILD_OF); + r.setSource(orgId1); + r.setTarget(orgId2); + r.setCollectedfrom(collectedFrom); + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + + return Arrays.asList(r); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } + public List processOrgOrgSimRels(final ResultSet rs) { try { final DataInfo info = prepareDataInfo(rs); // TODO diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java index 8cd495e08..cba64899b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java @@ -26,6 +26,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo; import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.utils.DHPUtils; public class AbstractMigrationApplication implements Closeable { @@ -71,27 +72,7 @@ public class AbstractMigrationApplication implements Closeable { final String format, final String layout, final String interpretation) throws IOException { - final String url = mdstoreManagerUrl + "/mdstores/"; - final ObjectMapper objectMapper = new ObjectMapper(); - - final HttpGet req = new HttpGet(url); - - try (final CloseableHttpClient client = HttpClients.createDefault()) { - try (final CloseableHttpResponse response = client.execute(req)) { - final String json = IOUtils.toString(response.getEntity().getContent()); - final MDStoreWithInfo[] mdstores = objectMapper.readValue(json, MDStoreWithInfo[].class); - return Arrays - .stream(mdstores) - .filter(md -> md.getFormat().equalsIgnoreCase(format)) - .filter(md -> md.getLayout().equalsIgnoreCase(layout)) - .filter(md -> md.getInterpretation().equalsIgnoreCase(interpretation)) - .filter(md -> StringUtils.isNotBlank(md.getHdfsPath())) - .filter(md -> StringUtils.isNotBlank(md.getCurrentVersion())) - .filter(md -> md.getSize() > 0) - .map(md -> md.getHdfsPath() + "/" + md.getCurrentVersion() + "/store") - .collect(Collectors.toSet()); - } - } + return DHPUtils.mdstorePaths(mdstoreManagerUrl, format, layout, interpretation, false); } private Configuration getConf() { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala new file mode 100644 index 000000000..316b8afed --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveEntities.scala @@ -0,0 +1,107 @@ +package eu.dnetlib.dhp.oa.graph.resolution + +import com.fasterxml.jackson.databind.ObjectMapper +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.common.HdfsSupport +import eu.dnetlib.dhp.schema.common.EntityType +import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset} +import org.apache.commons.io.IOUtils +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.SparkConf +import org.apache.spark.sql._ +import org.slf4j.{Logger, LoggerFactory} + +object SparkResolveEntities { + + val mapper = new ObjectMapper() + val entities = 
List(EntityType.dataset,EntityType.publication, EntityType.software, EntityType.otherresearchproduct) + + def main(args: Array[String]): Unit = { + val log: Logger = LoggerFactory.getLogger(getClass) + val conf: SparkConf = new SparkConf() + val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json"))) + parser.parseArgument(args) + val spark: SparkSession = + SparkSession + .builder() + .config(conf) + .appName(getClass.getSimpleName) + .master(parser.get("master")).getOrCreate() + + + val graphBasePath = parser.get("graphBasePath") + log.info(s"graphBasePath -> $graphBasePath") + val workingPath = parser.get("workingPath") + log.info(s"workingPath -> $workingPath") + val unresolvedPath = parser.get("unresolvedPath") + log.info(s"unresolvedPath -> $unresolvedPath") + + val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) + fs.mkdirs(new Path(workingPath)) + + resolveEntities(spark, workingPath, unresolvedPath) + generateResolvedEntities(spark, workingPath, graphBasePath) + + // TO BE conservative we keep the original entities in the working dir + // and save the resolved entities on the graphBasePath + //In future these lines of code should be removed + entities.foreach { + e => + fs.rename(new Path(s"$graphBasePath/$e"), new Path(s"$workingPath/${e}_old")) + fs.rename(new Path(s"$workingPath/resolvedGraph/$e"), new Path(s"$graphBasePath/$e")) + } + +} + + +def resolveEntities(spark: SparkSession, workingPath: String, unresolvedPath: String) = { + implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) + import spark.implicits._ + + val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)] + val up: Dataset[(String, Result)] = spark.read.text(unresolvedPath).as[String].map(s => mapper.readValue(s, classOf[Result])).map(r => (r.getId, r))(Encoders.tuple(Encoders.STRING, resEncoder)) + + rPid.joinWith(up, rPid("_2").equalTo(up("_1")), "inner").map { + r => + val result = r._2._2 + val dnetId = r._1._1 + result.setId(dnetId) + result + }.write.mode(SaveMode.Overwrite).save(s"$workingPath/resolvedEntities") + } + + + def deserializeObject(input:String, entity:EntityType ) :Result = { + + entity match { + case EntityType.publication => mapper.readValue(input, classOf[Publication]) + case EntityType.dataset => mapper.readValue(input, classOf[OafDataset]) + case EntityType.software=> mapper.readValue(input, classOf[Software]) + case EntityType.otherresearchproduct=> mapper.readValue(input, classOf[OtherResearchProduct]) + } + } + + def generateResolvedEntities(spark:SparkSession, workingPath: String, graphBasePath:String) = { + + implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) + import spark.implicits._ + + val re:Dataset[Result] = spark.read.load(s"$workingPath/resolvedEntities").as[Result] + entities.foreach { + e => + + spark.read.text(s"$graphBasePath/$e").as[String] + .map(s => deserializeObject(s, e)) + .union(re) + .groupByKey(_.getId) + .reduceGroups { + (x, y) => + x.mergeFrom(y) + x + }.map(_._2) + .filter(r => r.getClass.getSimpleName.toLowerCase != "result") + .map(r => mapper.writeValueAsString(r))(Encoders.STRING) + .write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$workingPath/resolvedGraph/$e") + } + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala new file mode 100644 index 000000000..cd517dd5e --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/resolution/SparkResolveRelation.scala @@ -0,0 +1,161 @@ +package eu.dnetlib.dhp.oa.graph.resolution + +import com.fasterxml.jackson.databind.ObjectMapper +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.common.HdfsSupport +import eu.dnetlib.dhp.schema.oaf.{Relation, Result} +import eu.dnetlib.dhp.utils.DHPUtils +import org.apache.commons.io.IOUtils +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.SparkConf +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.json4s +import org.json4s.DefaultFormats +import org.json4s.JsonAST.{JField, JObject, JString} +import org.json4s.jackson.JsonMethods.parse +import org.slf4j.{Logger, LoggerFactory} + +object SparkResolveRelation { + def main(args: Array[String]): Unit = { + val log: Logger = LoggerFactory.getLogger(getClass) + val conf: SparkConf = new SparkConf() + val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json"))) + parser.parseArgument(args) + val spark: SparkSession = + SparkSession + .builder() + .config(conf) + .appName(getClass.getSimpleName) + .master(parser.get("master")).getOrCreate() + + + val graphBasePath = parser.get("graphBasePath") + log.info(s"graphBasePath -> $graphBasePath") + val workingPath = parser.get("workingPath") + log.info(s"workingPath -> $workingPath") + + implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) + import spark.implicits._ + + + //CLEANING TEMPORARY FOLDER + HdfsSupport.remove(workingPath, spark.sparkContext.hadoopConfiguration) + val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) + fs.mkdirs(new Path(workingPath)) + + extractPidResolvedTableFromJsonRDD(spark, graphBasePath, workingPath) + + val mapper: ObjectMapper = new ObjectMapper() + + val rPid: Dataset[(String, String)] = spark.read.load(s"$workingPath/relationResolvedPid").as[(String, String)] + + val relationDs: Dataset[(String, Relation)] = spark.read.text(s"$graphBasePath/relation").as[String] + .map(s => mapper.readValue(s, classOf[Relation])).as[Relation] + .map(r => (r.getSource.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder)) + + relationDs.joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left").map { + m => + val sourceResolved = m._2 + val currentRelation = m._1._2 + if (sourceResolved != null && sourceResolved._1 != null && sourceResolved._1.nonEmpty) + currentRelation.setSource(sourceResolved._1) + currentRelation + }.write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/relationResolvedSource") + + + val relationSourceResolved: Dataset[(String, Relation)] = spark.read.load(s"$workingPath/relationResolvedSource").as[Relation] + .map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder)) + relationSourceResolved.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left").map { + m => + val targetResolved = m._2 + val currentRelation = m._1._2 + if (targetResolved != null && targetResolved._1.nonEmpty) + currentRelation.setTarget(targetResolved._1) + currentRelation + } + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/relation_resolved") + + + // TO BE conservative we keep the original relation in the working 
dir + // and save the relation resolved on the graphBasePath + //In future this two line of code should be removed + + fs.rename(new Path(s"$graphBasePath/relation"), new Path(s"$workingPath/relation")) + + spark.read.load(s"$workingPath/relation_resolved").as[Relation] + .filter(r => !r.getSource.startsWith("unresolved") && !r.getTarget.startsWith("unresolved")) + .map(r => mapper.writeValueAsString(r)) + .write + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .text(s"$graphBasePath/relation") + } + + def extractInstanceCF(input: String): List[(String, String)] = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: json4s.JValue = parse(input) + val result: List[(String, String)] = for { + JObject(iObj) <- json \ "instance" + JField("collectedfrom", JObject(cf)) <- iObj + JField("instancetype", JObject(instancetype)) <- iObj + JField("value", JString(collectedFrom)) <- cf + JField("classname", JString(classname)) <- instancetype + } yield (classname, collectedFrom) + + result + + } + + + def extractPidsFromRecord(input: String): (String, List[(String, String)]) = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: json4s.JValue = parse(input) + val id: String = (json \ "id").extract[String] + val result: List[(String, String)] = for { + JObject(pids) <- json \\ "instance" \ "pid" + JField("value", JString(pidValue)) <- pids + JField("qualifier", JObject(qualifier)) <- pids + JField("classid", JString(pidType)) <- qualifier + } yield (pidValue, pidType) + + (id, result) + } + + + private def isRelation(input: String): Boolean = { + + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: json4s.JValue = parse(input) + val source = (json \ "source").extractOrElse[String](null) + + source != null + } + + def extractPidResolvedTableFromJsonRDD(spark: SparkSession, graphPath: String, workingPath: String) = { + import spark.implicits._ + + val d: RDD[(String, String)] = spark.sparkContext.textFile(s"$graphPath/*") + .filter(i => !isRelation(i)) + .map(i => extractPidsFromRecord(i)) + .filter(s => s != null && s._1 != null && s._2 != null && s._2.nonEmpty) + .flatMap { p => + p._2.map(pid => + (p._1, DHPUtils.generateUnresolvedIdentifier(pid._1, pid._2)) + ) + }.filter(r => r._1 != null || r._2 != null) + + spark.createDataset(d) + .groupByKey(_._2) + .reduceGroups((x, y) => if (x._1.startsWith("50|doi") || x._1.startsWith("50|pmid")) x else y) + .map(s => s._2) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/relationResolvedPid") + } + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala index cb41d6134..4b82fe645 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkConvertRDDtoDataset.scala @@ -59,7 +59,12 @@ object SparkConvertRDDtoDataset { log.info("Converting Relation") - val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation").map(s => mapper.readValue(s, classOf[Relation])) + val relationSemanticFilter = List("cites", "iscitedby","merges", "ismergedin") + + val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation") + .map(s => mapper.readValue(s, classOf[Relation])) + .filter(r=> r.getSource.startsWith("50") && 
r.getTarget.startsWith("50")) + .filter(r => !relationSemanticFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))) spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath") diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala index 0a7fc18fb..e4fcd2782 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkCreateScholix.scala @@ -51,10 +51,14 @@ object SparkCreateScholix { relationDS.joinWith(summaryDS, relationDS("_1").equalTo(summaryDS("_1")), "left") .map { input: ((String, Relation), (String, ScholixSummary)) => - val rel: Relation = input._1._2 - val source: ScholixSummary = input._2._2 - (rel.getTarget, ScholixUtils.scholixFromSource(rel, source)) + if (input._1!= null && input._2!= null) { + val rel: Relation = input._1._2 + val source: ScholixSummary = input._2._2 + (rel.getTarget, ScholixUtils.scholixFromSource(rel, source)) + } + else null }(Encoders.tuple(Encoders.STRING, scholixEncoder)) + .filter(r => r!= null) .write.mode(SaveMode.Overwrite).save(s"$targetPath/scholix_from_source") val scholixSource: Dataset[(String, Scholix)] = spark.read.load(s"$targetPath/scholix_from_source").as[(String, Scholix)](Encoders.tuple(Encoders.STRING, scholixEncoder)) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkResolveRelation.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkResolveRelation.scala deleted file mode 100644 index 1b13b81c7..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkResolveRelation.scala +++ /dev/null @@ -1,154 +0,0 @@ -package eu.dnetlib.dhp.sx.graph - -import com.fasterxml.jackson.databind.ObjectMapper -import eu.dnetlib.dhp.application.ArgumentApplicationParser -import eu.dnetlib.dhp.schema.oaf.{Relation, Result} -import org.apache.commons.io.IOUtils -import org.apache.hadoop.io.compress.GzipCodec -import org.apache.spark.SparkConf -import org.apache.spark.rdd.RDD -import org.apache.spark.sql._ -import org.json4s -import org.json4s.DefaultFormats -import org.json4s.JsonAST.{JField, JObject, JString} -import org.json4s.jackson.JsonMethods.parse -import org.slf4j.{Logger, LoggerFactory} - -import scala.collection.JavaConverters._ -object SparkResolveRelation { - def main(args: Array[String]): Unit = { - val log: Logger = LoggerFactory.getLogger(getClass) - val conf: SparkConf = new SparkConf() - val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/resolve_relations_params.json"))) - parser.parseArgument(args) - val spark: SparkSession = - SparkSession - .builder() - .config(conf) - .appName(getClass.getSimpleName) - .master(parser.get("master")).getOrCreate() - - - val relationPath = parser.get("relationPath") - log.info(s"sourcePath -> $relationPath") - val entityPath = parser.get("entityPath") - log.info(s"entityPath -> $entityPath") - val workingPath = parser.get("workingPath") - log.info(s"workingPath -> $workingPath") - - implicit val relEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation]) - import spark.implicits._ - - - extractPidResolvedTableFromJsonRDD(spark, entityPath, workingPath) - - val mappper = new ObjectMapper() - - val rPid:Dataset[(String,String)] = 
spark.read.load(s"$workingPath/relationResolvedPid").as[(String,String)] - - val relationDs:Dataset[(String,Relation)] = spark.read.load(relationPath).as[Relation].map(r => (r.getSource.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder)) - - relationDs.joinWith(rPid, relationDs("_1").equalTo(rPid("_2")), "left").map{ - m => - val sourceResolved = m._2 - val currentRelation = m._1._2 - if (sourceResolved!=null && sourceResolved._1!=null && sourceResolved._1.nonEmpty) - currentRelation.setSource(sourceResolved._1) - currentRelation - }.write - .mode(SaveMode.Overwrite) - .save(s"$workingPath/relationResolvedSource") - - - val relationSourceResolved:Dataset[(String,Relation)] = spark.read.load(s"$workingPath/relationResolvedSource").as[Relation].map(r => (r.getTarget.toLowerCase, r))(Encoders.tuple(Encoders.STRING, relEncoder)) - relationSourceResolved.joinWith(rPid, relationSourceResolved("_1").equalTo(rPid("_2")), "left").map{ - m => - val targetResolved = m._2 - val currentRelation = m._1._2 - if (targetResolved!=null && targetResolved._1.nonEmpty) - currentRelation.setTarget(targetResolved._1) - currentRelation - }.filter(r => r.getSource.startsWith("50")&& r.getTarget.startsWith("50")) - .write - .mode(SaveMode.Overwrite) - .save(s"$workingPath/relation_resolved") - - spark.read.load(s"$workingPath/relation_resolved").as[Relation] - .map(r => mappper.writeValueAsString(r)) - .rdd.saveAsTextFile(s"$workingPath/relation", classOf[GzipCodec]) - - } - - - def extractPidsFromRecord(input:String):(String,List[(String,String)]) = { - implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats - lazy val json: json4s.JValue = parse(input) - val id:String = (json \ "id").extract[String] - val result: List[(String,String)] = for { - JObject(pids) <- json \ "pid" - JField("value", JString(pidValue)) <- pids - JField("qualifier", JObject(qualifier)) <- pids - JField("classname", JString(pidType)) <- qualifier - } yield (pidValue, pidType) - - val alternateIds: List[(String,String)] = for { - JObject(pids) <- json \\ "alternateIdentifier" - JField("value", JString(pidValue)) <- pids - JField("qualifier", JObject(qualifier)) <- pids - JField("classname", JString(pidType)) <- qualifier - } yield (pidValue, pidType) - - (id,result:::alternateIds) - } - - private def extractPidResolvedTableFromJsonRDD(spark: SparkSession, entityPath: String, workingPath: String) = { - import spark.implicits._ - - val d: RDD[(String,String)] = spark.sparkContext.textFile(s"$entityPath/*") - .map(i => extractPidsFromRecord(i)) - .filter(s => s != null && s._1!= null && s._2!=null && s._2.nonEmpty) - .flatMap{ p => - p._2.map(pid => - (p._1, convertPidToDNETIdentifier(pid._1, pid._2)) - ) - }.filter(r =>r._1 != null || r._2 != null) - - spark.createDataset(d) - .groupByKey(_._2) - .reduceGroups((x, y) => if (x._1.startsWith("50|doi") || x._1.startsWith("50|pmid")) x else y) - .map(s => s._2) - .write - .mode(SaveMode.Overwrite) - .save(s"$workingPath/relationResolvedPid") - } - - - /* - This method should be used once we finally convert everythings in Kryo dataset - instead of using rdd of json - */ - private def extractPidResolvedTableFromKryo(spark: SparkSession, entityPath: String, workingPath: String) = { - import spark.implicits._ - implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) - val entities: Dataset[Result] = spark.read.load(s"$entityPath/*").as[Result] - entities.flatMap(e => e.getPid.asScala - .map(p => - convertPidToDNETIdentifier(p.getValue, 
p.getQualifier.getClassid)) - .filter(s => s != null) - .map(s => (s, e.getId)) - ).groupByKey(_._1) - .reduceGroups((x, y) => if (x._2.startsWith("50|doi") || x._2.startsWith("50|pmid")) x else y) - .map(s => s._2) - .write - .mode(SaveMode.Overwrite) - .save(s"$workingPath/relationResolvedPid") - } - - def convertPidToDNETIdentifier(pid:String, pidType: String):String = { - if (pid==null || pid.isEmpty || pidType== null || pidType.isEmpty) - null - else - s"unresolved::${pid.toLowerCase}::${pidType.toLowerCase}" - } - -} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala index 4dafd4fa3..93c554e04 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala @@ -289,7 +289,7 @@ object ScholixUtils { if (r.getInstance() == null || r.getInstance().isEmpty) return List() r.getInstance().asScala.filter(i => i.getUrl!= null && !i.getUrl.isEmpty) - + .filter(i => i.getPid!= null && i.getUrl != null) .flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList)) .map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)).distinct.toList } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json index 1e862198f..d1b16b09a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/copy_hdfs_oaf_parameters.json @@ -23,16 +23,16 @@ "paramDescription": "metadata layout", "paramRequired": true }, + { + "paramName": "m", + "paramLongName": "master", + "paramDescription": "should be yarn or local", + "paramRequired": true + }, { "paramName": "i", "paramLongName": "mdInterpretation", "paramDescription": "metadata interpretation", "paramRequired": true - }, - { - "paramName": "isu", - "paramLongName": "isLookupUrl", - "paramDescription": "the url of the ISLookupService", - "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index 563923a5a..307e26267 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -258,7 +258,7 @@ ${wf:conf('reuseDB') eq false} ${wf:conf('reuseDB') eq true} - + @@ -553,7 +553,7 @@ yarn cluster ImportOAF_hdfs_graph - eu.dnetlib.dhp.oa.graph.raw.CopyHdfsOafApplication + eu.dnetlib.dhp.oa.graph.raw.CopyHdfsOafSparkApplication dhp-graph-mapper-${projectVersion}.jar --executor-memory ${sparkExecutorMemory} @@ -568,8 +568,8 @@ --mdstoreManagerUrl${mdstoreManagerUrl} --mdFormatOAF --mdLayoutstore + --masteryarn --mdInterpretationgraph - --isLookupUrl${isLookupUrl} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/resolverelation/oozie_app/config-default.xml 
b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/resolverelation/oozie_app/config-default.xml rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml new file mode 100644 index 000000000..ceb13c5e8 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/oozie_app/workflow.xml @@ -0,0 +1,72 @@ + + + + graphBasePath + the path of the graph + + + unresolvedPath + the path of the unresolved Entities + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn + cluster + Resolve Relations in raw graph + eu.dnetlib.dhp.oa.graph.resolution.SparkResolveRelation + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.shuffle.partitions=15000 + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --masteryarn + --graphBasePath${graphBasePath} + --workingPath${workingDir} + + + + + + + + yarn + cluster + Resolve Entities in raw graph + eu.dnetlib.dhp.oa.graph.resolution.SparkResolveEntities + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.shuffle.partitions=10000 + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --masteryarn + --graphBasePath${graphBasePath} + --unresolvedPath${unresolvedPath} + --workingPath${workingDir} + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json new file mode 100644 index 000000000..f38cc1291 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/resolve_entities_params.json @@ -0,0 +1,6 @@ +[ + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the source Path", "paramRequired": true}, + {"paramName":"u", "paramLongName":"unresolvedPath", "paramDescription": "the source Path", "paramRequired": true}, + {"paramName":"g", "paramLongName":"graphBasePath", "paramDescription": "the path of the raw graph", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/resolve_relations_params.json 
b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json similarity index 50% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/resolve_relations_params.json rename to dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json index f211adb9a..1fbe20648 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/resolve_relations_params.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/resolution/resolve_relations_params.json @@ -1,6 +1,5 @@ [ {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, - {"paramName":"r", "paramLongName":"relationPath", "paramDescription": "the source Path", "paramRequired": true}, {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the source Path", "paramRequired": true}, - {"paramName":"e", "paramLongName":"entityPath", "paramDescription": "the path of the raw graph", "paramRequired": true} + {"paramName":"g", "paramLongName":"graphBasePath", "paramDescription": "the path of the raw graph", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryParentChildRelsOpenOrgs.sql b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryParentChildRelsOpenOrgs.sql new file mode 100644 index 000000000..388fee3f5 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/sql/queryParentChildRelsOpenOrgs.sql @@ -0,0 +1,13 @@ +SELECT + id1 AS source, + id2 AS target, + reltype AS type, + false AS inferred, + false AS deletedbyinference, + 0.95 AS trust, + '' AS inferenceprovenance, + 'openaire____::openorgs' AS collectedfromid, + 'OpenOrgs Database' AS collectedfromname, + 'sysimport:crosswalk:entityregistry@@@dnet:provenance_actions' AS provenanceaction +FROM relationships +WHERE reltype = 'Child' OR reltype = 'Parent' \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/finalGraph/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/finalGraph/oozie_app/workflow.xml index d8eb1fc80..17996c82c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/finalGraph/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/finalGraph/oozie_app/workflow.xml @@ -54,7 +54,7 @@ --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.shuffle.partitions=5000 + --conf spark.sql.shuffle.partitions=20000 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} @@ -79,7 +79,7 @@ --executor-cores=${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.shuffle.partitions=6000 + --conf spark.sql.shuffle.partitions=20000 --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} diff --git 
a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/resolverelation/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/resolverelation/oozie_app/workflow.xml deleted file mode 100644 index 7683ff94c..000000000 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/graph/resolverelation/oozie_app/workflow.xml +++ /dev/null @@ -1,62 +0,0 @@ - - - - entityPath - the path of deduplicate Entities - - - relationPath - the path of relation unresolved - - - targetPath - the path of relation unresolved - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - - - - yarn - cluster - Resolve Relations in raw graph - eu.dnetlib.dhp.sx.graph.SparkResolveRelation - dhp-graph-mapper-${projectVersion}.jar - - --executor-memory=${sparkExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.shuffle.partitions=3000 - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --masteryarn - --relationPath${relationPath} - --workingPath${targetPath} - --entityPath${entityPath} - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index 42d9f226c..aa9535ef7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -8,6 +8,7 @@ import java.io.IOException; import java.util.Collection; import java.util.List; import java.util.Set; +import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.io.IOUtils; @@ -137,9 +138,21 @@ public class GraphCleaningFunctionsTest { .stream() .anyMatch(s -> s.getValue().equals("10.1009/qwerty"))); + assertEquals(5, p_out.getTitle().size()); + Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out); - assertEquals(1, p_cleaned.getTitle().size()); + assertEquals(3, p_cleaned.getTitle().size()); + + List titles = p_cleaned + .getTitle() + .stream() + .map(StructuredProperty::getValue) + .collect(Collectors.toList()); + assertTrue(titles.contains("omic")); + assertTrue( + titles.contains("Optical response of strained- and unstrained-silicon cold-electron bolometers test")); + assertTrue(titles.contains("「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳")); assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid()); assertNull(p_out.getPublisher()); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java index bf6301ec4..602aaf6e6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/dump/DumpJobTest.java @@ -331,7 +331,6 @@ public class DumpJobTest { Assertions .assertEquals( Constants.accessRightsCoarMap.get(ModelConstants.ACCESS_RIGHT_OPEN), 
gr.getBestaccessright().getCode()); - Assertions.assertEquals(null, gr.getBestaccessright().getOpenAccessRoute()); Assertions.assertEquals("One Ecosystem", gr.getContainer().getName()); Assertions.assertEquals("2367-8194", gr.getContainer().getIssnOnline()); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala new file mode 100644 index 000000000..46bf48974 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala @@ -0,0 +1,187 @@ +package eu.dnetlib.dhp.oa.graph.resolution + + +import com.fasterxml.jackson.databind.ObjectMapper +import eu.dnetlib.dhp.schema.common.EntityType +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils +import eu.dnetlib.dhp.schema.oaf.{Result, StructuredProperty} +import org.apache.commons.io.FileUtils +import org.apache.spark.SparkConf +import org.apache.spark.sql._ +import org.junit.jupiter.api.Assertions._ +import org.junit.jupiter.api.TestInstance.Lifecycle +import org.junit.jupiter.api.{AfterAll, BeforeAll, Test, TestInstance} + +import java.nio.file.{Files, Path} +import scala.collection.JavaConverters._ +import scala.io.Source + +@TestInstance(Lifecycle.PER_CLASS) +class ResolveEntitiesTest extends Serializable { + + var workingDir:Path = null + + val FAKE_TITLE = "FAKETITLE" + val FAKE_SUBJECT = "FAKESUBJECT" + + var sparkSession:Option[SparkSession] = None + + + @BeforeAll + def setUp() :Unit = { + workingDir = Files.createTempDirectory(getClass.getSimpleName) + + val conf = new SparkConf() + sparkSession = Some(SparkSession + .builder() + .config(conf) + .appName(getClass.getSimpleName) + .master("local[*]").getOrCreate()) + populateDatasets(sparkSession.get) + generateUpdates(sparkSession.get) + + } + + + @AfterAll + def tearDown():Unit = { + FileUtils.deleteDirectory(workingDir.toFile) + sparkSession.get.stop() + + + } + + + def generateUpdates(spark:SparkSession):Unit = { + val template = Source.fromInputStream(this.getClass.getResourceAsStream("updates")).mkString + + + val pids:List[String] = template.lines.map{id => + val r = new Result + r.setId(id.toLowerCase.trim) + r.setSubject(List(OafMapperUtils.structuredProperty(FAKE_SUBJECT, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava) + r.setTitle(List(OafMapperUtils.structuredProperty(FAKE_TITLE, OafMapperUtils.qualifier("fos","fosCS", "fossSchema", "fossiFIgo"), null)).asJava) + r + }.map{r => + val mapper = new ObjectMapper() + + mapper.writeValueAsString(r)}.toList + + + val sc =spark.sparkContext + + println(sc.parallelize(pids).count()) + + spark.createDataset(sc.parallelize(pids))(Encoders.STRING).write.mode(SaveMode.Overwrite).option("compression", "gzip").text(s"$workingDir/updates") + + + + + + import spark.implicits._ + implicit val resEncoder: Encoder[Result] = Encoders.bean(classOf[Result]) + val ds = spark.read.text(s"$workingDir/updates").as[String].map{s => val mapper = new ObjectMapper() + mapper.readValue(s, classOf[Result])}.collect() + + + + + assertEquals(4, ds.length) + ds.foreach{r => assertNotNull(r.getSubject)} + ds.foreach{r => assertEquals(1,r.getSubject.size())} + ds.foreach{r => assertNotNull(r.getTitle)} + ds.foreach{r => assertEquals(1,r.getTitle.size())} + + + + ds.flatMap(r => r.getTitle.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_TITLE,t)) + ds.flatMap(r => 
r.getSubject.asScala.map(t => t.getValue)).foreach(t => assertEquals(FAKE_SUBJECT,t)) + + println("generated Updates") + } + + + def populateDatasets(spark:SparkSession):Unit = { + import spark.implicits._ + val entities =SparkResolveEntities.entities + + entities.foreach{ + e => + val template = Source.fromInputStream(this.getClass.getResourceAsStream(s"$e")).mkString + spark.createDataset(spark.sparkContext.parallelize(template.lines.toList)).as[String].write.option("compression", "gzip").text(s"$workingDir/graph/$e") + println(s"Created Dataset $e") + } + SparkResolveRelation.extractPidResolvedTableFromJsonRDD(spark, s"$workingDir/graph", s"$workingDir/work") + + } + + + @Test + def testResolution():Unit = { + val spark:SparkSession = sparkSession.get + implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) + SparkResolveEntities.resolveEntities(spark,s"$workingDir/work", s"$workingDir/updates" ) + + val ds = spark.read.load(s"$workingDir/work/resolvedEntities").as[Result] + + assertEquals(3, ds.count()) + + ds.collect().foreach{ + r => + assertTrue(r.getId.startsWith("50")) + } + } + + + + + private def structuredPContainsValue(l:java.util.List[StructuredProperty], exptectedValue:String):Boolean = { + l.asScala.exists(p =>p.getValue!= null && p.getValue.equalsIgnoreCase(exptectedValue)) + } + + @Test + def testUpdate():Unit = { + val spark:SparkSession = sparkSession.get + import spark.implicits._ + implicit val resEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) + val m = new ObjectMapper() + SparkResolveEntities.resolveEntities(spark,s"$workingDir/work", s"$workingDir/updates" ) + SparkResolveEntities.generateResolvedEntities(spark,s"$workingDir/work",s"$workingDir/graph" ) + + + + val pubDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/publication").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.publication)) + val t = pubDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count() + + + + val datDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/dataset").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.dataset)) + val td = datDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count() + + + val softDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/software").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.software)) + val ts = softDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count() + + + val orpDS:Dataset[Result] = spark.read.text(s"$workingDir/work/resolvedGraph/otherresearchproduct").as[String].map(s => SparkResolveEntities.deserializeObject(s, EntityType.otherresearchproduct)) + val to = orpDS.filter(p => p.getTitle!=null && p.getSubject!=null).filter(p => p.getTitle.asScala.exists(t => t.getValue.equalsIgnoreCase("FAKETITLE"))).count() + + + assertEquals(0, t) + assertEquals(2, td) + assertEquals(1, ts) + assertEquals(0, to) + + } + + + + + + + + + + + +} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala index 5b7fbe1cf..bd7e4fd09 100644 --- 
a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala @@ -1,10 +1,10 @@ package eu.dnetlib.dhp.sx.graph.scholix import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature} +import eu.dnetlib.dhp.oa.graph.resolution.SparkResolveRelation import eu.dnetlib.dhp.schema.oaf.{Relation, Result} import eu.dnetlib.dhp.schema.sx.scholix.Scholix import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary -import eu.dnetlib.dhp.sx.graph.SparkResolveRelation import eu.dnetlib.dhp.sx.graph.bio.pubmed.AbstractVocabularyTest import org.json4s import org.json4s.DefaultFormats diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 6795ccf1b..b3e302474 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -864,7 +864,7 @@ "schemeid": "dnet:dataCite_title", "schemename": "dnet:dataCite_title" }, - "value": "Optical response of strained- and unstrained-silicon cold-electron bolometers" + "value": "Optical response of strained- and unstrained-silicon cold-electron bolometers test" }, { "dataInfo": { @@ -887,6 +887,72 @@ "schemename": "dnet:dataCite_title" }, "value": "test test 123 test" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "omic" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "sysimport:crosswalk:datasetarchive", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title", + "schemename": "dnet:dataCite_title" + }, + "value": "-" } ] } \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/dataset b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/dataset new file mode 100644 index 000000000..c22dc94e3 --- /dev/null +++ 
b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/dataset @@ -0,0 +1,3 @@ +{"author":[{"affiliation":[],"fullname":"Greenough, B","name":"B","pid":[],"rank":1,"surname":"Greenough"}],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:55:00.639Z","dateoftransformation":"2021-09-25T11:00:04.201Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Heritage Education"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::09821844208a5cd6300b2bfb13bca1b9","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-59-cjhf"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17632/96bpgw5j9d.1"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"alternateIdentifier":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17632/96bpgw5j9d.1"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434801681,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T15:29:45Z","harvestDate":"2021-09-25T10:55:00.639Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:211323","metadataNamespace":""}},"originalId":["50|DansKnawCris::09821844208a5cd6300b2bfb13bca1b9","oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:211323"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Interdisciplinary sciences"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Interdisciplinary sciences"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Heritage Education"}]} +{"author":[{"affiliation":[],"fullname":"Keijers, D.M.G.","name":"D.M.G.","pid":[],"rank":1,"surname":"Keijers"}],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked 
Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:41:59.767Z","dateoftransformation":"2021-09-25T11:00:19.238Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"onderzoeksrapport"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-das-fkq"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17026/dans-xsw-qtnx"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"alternateIdentifier":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17026/dans-xsw-qtnx"]}],"language":{"classid":"dut/nld","classname":"Dutch; 
Flemish","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434847381,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T13:53:29Z","harvestDate":"2021-09-25T10:41:59.767Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:20759","metadataNamespace":""}},"originalId":["oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:20759","50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"PROSPECTIE"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Plangebied Lange Ekker te Vessem, gemeente Eersel"}]} +{"author":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofcollection":"2021-09-25T10:43:13.768Z","dateoftransformation":"2021-09-25T11:01:22.863Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"This find is registered at Portable Antiquities of the Netherlands with number 
PAN-00054604"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"geolocation":[],"id":"50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c","instance":[{"accessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"urn","classname":"urn","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"urn:nbn:nl:ui:13-a7-hwgy"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.17026/dans-x3z-fsq5"}],"collectedfrom":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"hostedby":{"key":"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f","value":"DANS (Data Archiving and Networked Services)"},"instancetype":{"classid":"0021","classname":"Dataset","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"alternateIdentifier":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["","http://dx.doi.org/10.17026/dans-x3z-fsq5"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635434508886,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"http%3A%2F%2Fservices.nod.dans.knaw.nl%2Foa-cerif","datestamp":"2021-08-16T14:01:37Z","harvestDate":"2021-09-25T10:43:13.768Z","identifier":"oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:129566","metadataNamespace":""}},"originalId":["oai:services.nod.dans.knaw.nl:Products/dans:oai:easy.dans.knaw.nl:easy-dataset:129566","50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"0021","classname":"0021","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"dataset","classname":"dataset","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"early medieval enamelled disc brooch variant 
A9"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Broader Match: disc brooches"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Broader Match: schijffibula - geemailleerd"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"metal"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"copper alloy"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: Early Middle Ages C"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: Early Middle Ages D"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Temporal coverage: 800 until 
1000"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Archaeology"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"PAN-00054604 - early medieval enamelled disc brooch variant A9"}]} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/otherresearchproduct b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/otherresearchproduct new file mode 100644 index 000000000..f3693848d --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/otherresearchproduct @@ -0,0 +1,2 @@ +{"author":[{"affiliation":[],"fullname":"Martinec, Nikola","name":"Nikola","pid":[],"rank":1,"surname":"Martinec"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::345c9d171ef3c5d706d08041d506428c","value":"Croatian Scientific Bibliography - CROSBI"}],"contactgroup":[],"contactperson":[],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2013-01-01"},"dateofcollection":"2021-10-25T05:50:25+0000","dateoftransformation":"2021-10-26T05:23:26.742Z","description":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Simulacije mekih tijela neizostavan su dio realističnog prikaza stvarnog svijeta u različitim virtualnim okruženjima. U ovom radu obrađene su dvije metode simulacije mekih tijela – metoda s oprugama i metoda usklađivanja oblika. Napravljene su programske implementacije obiju metoda te je prikazana usporedba u obliku brzine izvođenja i vizualne kvalitete simulacija. Metoda uspoređivanja oblika napravljena je po uzoru na rad Meshless Deformations Based on Shape Matching [2] dok su kod metode s oprugama, ponajviše za algoritme raspoređivanja opruga, korištena autorova rješenja. 
U radu su opisani karakteristični dijelovi potrebni za implementaciju kao i problemi koji su se javljali tijekom izrade ovoga rada."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|57a035e5b1ae::14cc6e339ce130a10dbc5d9269dee6a3","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::345c9d171ef3c5d706d08041d506428c","value":"Croatian Scientific Bibliography - CROSBI"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2013-01-01"},"hostedby":{"key":"10|openaire____::345c9d171ef3c5d706d08041d506428c","value":"Croatian Scientific Bibliography - CROSBI"},"instancetype":{"classid":"master thesis","classname":"master thesis","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://www.bib.irb.hr/636614"]}],"language":{"classid":"hrv","classname":"Croatian","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635433838126,"originalId":["636614","50|57a035e5b1ae::14cc6e339ce130a10dbc5d9269dee6a3"],"pid":[],"relevantdate":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"issued","classname":"issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"value":"2013-01-01"}],"resourcetype":{"classid":"master thesis","classname":"master thesis","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"other","classname":"other","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"meka tijela; simulacija; usklađivanje oblika; opruge i mase"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Simulacija elastičnih objekata"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Soft bodies 
simulation"}],"tool":[]} +{"author":[{"affiliation":[],"fullname":"Sobočanec, Sandra","name":"Sandra","pid":[],"rank":1,"surname":"Sobočanec"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::345c9d171ef3c5d706d08041d506428c","value":"Croatian Scientific Bibliography - CROSBI"}],"contactgroup":[],"contactperson":[],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2006-01-01"},"dateofcollection":"2021-10-25T05:50:25+0000","dateoftransformation":"2021-10-26T05:22:35.748Z","description":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Propolis je tijekom stoljeća, a u posljednje vrijeme i u suvremenoj medicinskoj praksi ispitivan i korišten kao protuvirusno, protubakterijsko i protugljivično sredstvo. Biološki učinci propolisa vezani su uz antibiotska, protuupalna i antioksidacijska svojstva. Jedan od sastojaka propolisa su flavonoidi, čija je karakteristika visoki antioksidacijski potencijal zbog kojeg im se pripisuje sposobnost prevencije različitih bolesti kao što su različite vrste tumora, osteoporoza, kardiovaskularne i neurodegenerativne bolesti. U našim ispitivanjima koristili smo nativni propolis u dvije doze (100 mg kg-1, 300 mg kg-1), kojem smo istražili sastav, udio i vrstu flavonoida odgovornih za njegov oksidacijski/antioksidacijski učinak. Krizin je bio prisutan u najvećoj koncentraciji, dok je izoramnetin bio prisutan u najmanjoj. Antioksidacijski potencijal nativnog propolisa u in vitro uvjetima je bio relativno nizak. Međutim, propolis značajno povisuje antioksidacijski učinak in vivo uvjetima u jetri, plućima i mozgu miševa. Učinak nativnog propolisa pokazao ovisio je o dozi i to tako da je u manjoj dozi djelovao antioksidacijski (zaštitno), a u većoj dozi prooksidacijski (štetno). Oksidacijski status nakon obradaa nativnim propolisom nije pokazao tkivnu, ali je pokazao doznu ovisnost. Antioksidacijski status pokazao je tkivnu i doznu ovisnost ; u mozgu nije pokazao promjenu, dok je najveća promjena zabilježena u plućima i u manjoj mjeri u jetri. Promjena aktivnosti antioksidacijskih enzima u jetri nije korelirala s količinom njihove mRNA, dok je u plućima promjena u aktivnosti istih uglavnom bila proporcionalna promjeni u mRNA. Nije zapažena značajna promjena u ekpresiji proteina za sve ispitivane antioksidacijske enzime, niti obzirom na dozu, niti obzirom na ispitivano tkivo. Analizom DNA čipova utvrđeno je da propolis u dozi od 100 mg kg-1 u jetri ima zaštitni učinak, dok je u dozi od 300 mg kg-1 taj učinak prooksidacijski. U plućima je propolis u obje doze imao zaštitni učinak, dok u mozgu nije imao učinka na ispitivane gene. 
U hiperbaričkim uvjetima propolis je smanjio oksidacijski stres u sva tri ispitivana tkiva u dozi od 100 mg kg-1. U plućima je taj učinak propolis ostvario povećanjem aktivnosti antioksidacijskih enzima (MnSOD, KAT). Antioksidacijski učinak propolisa porijeklom iz Hrvatske do sada nije istraživan u in vivo sistemima. Isto tako po prvi puta se ispituje nativni propolis (tehnologijska inovacija) za razliku od do sada ispitivanih vodenih ili alkoholnih ekstrakata. Rezultati ove studije također pokazuju da sposobnost zaštitnog odgovora u plućima na oksidacijski stres koji postoji u novorođenih, ali se gubi u odraslih jedinki, može biti obnovljena uporabom nativnog propolisa tijekom hiperoksije. Ta spoznaja može pomoći u zaštiti od hiperoksije u populaciji odraslih tijekom terapije kisikom."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|57a035e5b1ae::15786c1f0f2930ea867a234fbcb8ef08","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::345c9d171ef3c5d706d08041d506428c","value":"Croatian Scientific Bibliography - CROSBI"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2006-01-01"},"hostedby":{"key":"10|openaire____::345c9d171ef3c5d706d08041d506428c","value":"Croatian Scientific Bibliography - CROSBI"},"instancetype":{"classid":"doctoral thesis","classname":"doctoral thesis","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://www.bib.irb.hr/286858"]}],"language":{"classid":"hrv","classname":"Croatian","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635433372341,"originalId":["286858","50|57a035e5b1ae::15786c1f0f2930ea867a234fbcb8ef08"],"pid":[],"relevantdate":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"issued","classname":"issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"value":"2006-01-01"}],"resourcetype":{"classid":"doctoral thesis","classname":"doctoral thesis","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"other","classname":"other","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"propolis; flavonoidi; miševi; oksidacijski/antioksidacijski status; hiperoksija; normobarički 
uvjeti"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Utjecaj propolisa na oksidacisjki/antioksidacijski status u CBA miša"},{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Propolis effect on oxidative/antioxidative status in CBA mice"}],"tool":[]} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/publication b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/publication new file mode 100644 index 000000000..562a54760 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/publication @@ -0,0 +1,3 @@ +{"author":[{"fullname":"Makkonen, Lasse","name":"Lasse","pid":[],"rank":1,"surname":"Makkonen"}],"bestaccessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[],"contributor":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Watson, Rick"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1997-01-01"},"dateofcollection":"2021-10-04T12:42:57.502Z","dateoftransformation":"2021-10-04T15:32:51.877Z","description":[],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|355e65625b88::0b579a3501cf87921448e0a1c7fc8353","instance":[{"accessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information 
System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1997-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0005","classname":"Contribution for newspaper or weekly magazine","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/5a6fbe30-2096-4106-96f4-ed36620d3f73"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635433424020,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2020-10-26T12:07:44Z","harvestDate":"2021-10-04T12:42:57.502Z","identifier":"oai:cris.vtt.fi:publications/5a6fbe30-2096-4106-96f4-ed36620d3f73","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::0b579a3501cf87921448e0a1c7fc8353","oai:cris.vtt.fi:publications/5a6fbe30-2096-4106-96f4-ed36620d3f73"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Irish Wind Energy Association"},"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Makkonen , L 1997 , Calculation of ice accretion on wind turbine blades . in R Watson (ed.) , EWEC '97: European Wind Energy Conference : Proceedings of the international conference . 
Irish Wind Energy Association , Slane , European Wind Energy Conference EWEC '97 , Dublin , Ireland , 6/10/97 ."}],"subject":[],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Calculation of ice accretion on wind turbine blades"}]} +{"author":[{"fullname":"Peltola, Juho","name":"Juho","pid":[],"rank":1,"surname":"Peltola"},{"fullname":"Karvonen, Lassi","name":"Lassi","pid":[],"rank":2,"surname":"Karvonen"},{"fullname":"Elfvengren, J","name":"J.","pid":[],"rank":3,"surname":"Elfvengren"},{"fullname":"Kolehmainen, J","name":"J.","pid":[],"rank":4,"surname":"Kolehmainen"},{"fullname":"Kallio, Sirpa","name":"Sirpa","pid":[],"rank":5,"surname":"Kallio"},{"fullname":"Pallares, D","name":"D.","pid":[],"rank":6,"surname":"Pallares"},{"fullname":"Johnsson, F","name":"F.","pid":[],"rank":7,"surname":"Johnsson"}],"bestaccessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2014-01-01"},"dateofcollection":"2021-10-04T12:31:44.326Z","dateoftransformation":"2021-10-04T17:18:21.611Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"This work applies optical probe and particle image velocimetry (PIV) measurements for determining the local characteristics of the solids flow in a CFB riser and discusses benefits and limitations of the two methods. Measurements were carried out in an 8.5 m tall CFB riser with a cross-section of 0.7 m x 0.12 m. The optical probe was used to measure both solids volume fraction and velocity whereas the PIV was limited to solids velocity measurements. In addition, the vertical pressure profile and solids circulation rate were measured by means of pressure transducers. The work demonstrates the challenge of applying detailed solids flow measurements in CFB reactors. Both methods applied are associated with limitations and inaccuracies and it is concluded that a combination of both local measurement methods and the global pressure measurements will improve the interpretation of the solids flow field in CFB risers

"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|355e65625b88::2a13bce11d3eb09889706369b114a476","instance":[{"accessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2014-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0005","classname":"Contribution for newspaper or weekly magazine","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/c7012d64-5947-4f40-b62f-6c3f232af0b0"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635433455504,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2021-01-26T12:04:31Z","harvestDate":"2021-10-04T12:31:44.326Z","identifier":"oai:cris.vtt.fi:publications/c7012d64-5947-4f40-b62f-6c3f232af0b0","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["oai:cris.vtt.fi:publications/c7012d64-5947-4f40-b62f-6c3f232af0b0","50|355e65625b88::2a13bce11d3eb09889706369b114a476"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Institute of Process Engineering"},"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Peltola , J , Karvonen , L , Elfvengren , J , Kolehmainen , J , Kallio , S , Pallares , D & Johnsson , F 2014 , Measurement of solids velocity and concentration in a large cold CFB model . in CFB-11: Proceedings of the 11th International Conference on Fluidized Bed Technology . Institute of Process Engineering , Beijing, China , pp. 
589-594 , 11th International Conference on Fluidized Bed Technology, CFB 2014 , Beijing , China , 14/05/14 ."}],"subject":[],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Measurement of solids velocity and concentration in a large cold CFB model"}]} +{"author":[{"fullname":"Marszalec, Janusz","name":"Janusz","pid":[],"rank":1,"surname":"Marszalec"},{"fullname":"Järviluoma, Markku","name":"Markku","pid":[],"rank":2,"surname":"Järviluoma"},{"fullname":"Heikkilä, Tapio","name":"Tapio","pid":[],"rank":3,"surname":"Heikkilä"}],"bestaccessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1991-01-01"},"dateofcollection":"2021-10-04T12:42:38.338Z","dateoftransformation":"2021-10-05T01:02:19.804Z","description":[],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|355e65625b88::b58939ea13408430070558e913a80bdf","instance":[{"accessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1117/12.135094"}],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1991-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0005","classname":"Contribution for newspaper or weekly 
magazine","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/a5d16720-e901-4605-8c46-2d8d920af7e2"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635433591966,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2021-01-01T04:23:59Z","harvestDate":"2021-10-04T12:42:38.338Z","identifier":"oai:cris.vtt.fi:publications/a5d16720-e901-4605-8c46-2d8d920af7e2","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::b58939ea13408430070558e913a80bdf","oai:cris.vtt.fi:publications/a5d16720-e901-4605-8c46-2d8d920af7e2"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"International Society for Optics and Photonics SPIE"},"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Marszalec , J , Järviluoma , M & Heikkilä , T 1991 , Planning of an active range sensor structure for pose estimation of 3-D regular objects . in Intelligent Robots and Computer Vision X: Neural, Biological, and 3-D Methods . International Society for Optics and Photonics SPIE , Proceedings of SPIE , vol. 1608 , Intelligent Robots and Computer Vision X , Boston , Massachusetts , United States , 10/11/91 . 
https://doi.org/10.1117/12.135094"}],"subject":[],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Planning of an active range sensor structure for pose estimation of 3-D regular objects"}]} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/software b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/software new file mode 100644 index 000000000..da4dd7e9f --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/software @@ -0,0 +1 @@ +{"author":[{"affiliation":[],"fullname":"Franz, Max","name":"Max","pid":[],"rank":1,"surname":"Franz"},{"affiliation":[],"fullname":"Cheung, Manfred","name":"Manfred","pid":[],"rank":2,"surname":"Cheung"},{"affiliation":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"@mskcc"}],"fullname":"Sumer, Onur","name":"Onur","pid":[],"rank":3,"surname":"Sumer"},{"affiliation":[],"fullname":"Huck, Gerardo","name":"Gerardo","pid":[],"rank":4,"surname":"Huck"},{"affiliation":[],"fullname":"Fong, Dylan","name":"Dylan","pid":[],"rank":5,"surname":"Fong"},{"affiliation":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Northeastern University"}],"fullname":"Mullen, Tony","name":"Tony","pid":[],"rank":6,"surname":"Mullen"},{"affiliation":[],"fullname":"Ayhun","name":"","pid":[],"rank":7,"surname":""},{"affiliation":[],"fullname":"Chadkin, Bogdan","name":"Bogdan","pid":[],"rank":8,"surname":"Chadkin"},{"affiliation":[],"fullname":"Metincansiper","name":"","pid":[],"rank":9,"surname":""},{"affiliation":[],"fullname":"Stahl, Joseph","name":"Joseph","pid":[],"rank":10,"surname":"Stahl"},{"affiliation":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Trisotech"}],"fullname":"Gauthier, Mélanie","name":"Mélanie","pid":[],"rank":11,"surname":"Gauthier"},{"affiliation":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"@payoneer"}],"fullname":"Sherer, Eli","name":"Eli","pid":[],"rank":12,"surname":"Sherer"},{"affiliation":[],"fullname":"Bumbu","name":"","pid":[],"rank":13,"surname":""},{"affiliation":[],"fullname":"Sidlovsky, 
Yaroslav","name":"Yaroslav","pid":[],"rank":14,"surname":"Sidlovsky"},{"affiliation":[],"fullname":"Trott, Rich","name":"Rich","pid":[],"rank":15,"surname":"Trott"},{"affiliation":[],"fullname":"Beynon, Mike","name":"Mike","pid":[],"rank":16,"surname":"Beynon"},{"affiliation":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"University of Toronto"}],"fullname":"Lopes, Christian","name":"Christian","pid":[],"rank":17,"surname":"Lopes"},{"affiliation":[],"fullname":"Li, Alexander","name":"Alexander","pid":[],"rank":18,"surname":"Li"},{"affiliation":[],"fullname":"Cheng, Rui","name":"Rui","pid":[],"rank":19,"surname":"Cheng"},{"affiliation":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Bio + CS ⇝ ❤"}],"fullname":"Wilzbach, Sebastian","name":"Sebastian","pid":[],"rank":20,"surname":"Wilzbach"},{"affiliation":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"@atlassian"}],"fullname":"Dias, Mike","name":"Mike","pid":[],"rank":21,"surname":"Dias"},{"affiliation":[],"fullname":"Janit Mehta","name":"","pid":[],"rank":22,"surname":""},{"affiliation":[],"fullname":"Meira, Gui","name":"Gui","pid":[],"rank":23,"surname":"Meira"},{"affiliation":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"InfoTrack"}],"fullname":"Ryding, Daniel","name":"Daniel","pid":[],"rank":24,"surname":"Ryding"},{"affiliation":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Clarivate Analytics"}],"fullname":"Mosca, Roberto","name":"Roberto","pid":[],"rank":25,"surname":"Mosca"},{"affiliation":[],"fullname":"Jalil, F.","name":"F.","pid":[],"rank":26,"surname":"Jalil"},{"affiliation":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"@testdouble"}],"fullname":"Greenwood, Josh","name":"Josh","pid":[],"rank":27,"surname":"Greenwood"},{"affiliation":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Adpearance"}],"fullname":"Buchmann, 
Eric","name":"Eric","pid":[],"rank":28,"surname":"Buchmann"},{"affiliation":[],"fullname":", Dominic","name":"","pid":[],"rank":29,"surname":""},{"affiliation":[],"fullname":"Osborn, Connor","name":"Connor","pid":[],"rank":30,"surname":"Osborn"}],"bestaccessright":{"classid":"OPEN SOURCE","classname":"Open Source","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::9e3be59865b2c1c335d32dae2fe7b254","value":"Datacite"},{"key":"10|opendoar____::358aee4cc897452c00244351e4d91f69","value":"ZENODO"},{"key":"10|re3data_____::7b0ad08687b2c960d5aeef06f811d5e6","value":"Zenodo"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2018-10-29"},"dateofcollection":"2018-10-28T00:39:04.337Z","dateoftransformation":"","description":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Graph theory (network) library for visualisation and analysis"}],"documentationUrl":[],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|doi_________::0682c24009ab5c1624d1c032783feff1","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::9e3be59865b2c1c335d32dae2fe7b254","value":"Datacite"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2018-10-29"},"hostedby":{"key":"10|re3data_____::7b0ad08687b2c960d5aeef06f811d5e6","value":"Zenodo"},"instancetype":{"classid":"0029","classname":"Software","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.5281/zenodo.1473694"}],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["http://dx.doi.org/10.5281/zenodo.1473694","https://zenodo.org/record/1473694"]},{"accessright":{"classid":"OPEN","classname":"Open 
Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|opendoar____::358aee4cc897452c00244351e4d91f69","value":"ZENODO"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2018-01-01"},"hostedby":{"key":"10|opendoar____::358aee4cc897452c00244351e4d91f69","value":"ZENODO"},"instancetype":{"classid":"0029","classname":"Software","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.5281/zenodo.1473694"}],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["http://dx.doi.org/10.5281/zenodo.1473694"]},{"accessright":{"classid":"OPEN SOURCE","classname":"Open Source","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|re3data_____::7b0ad08687b2c960d5aeef06f811d5e6","value":"Zenodo"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2018-01-01"},"hostedby":{"key":"10|re3data_____::7b0ad08687b2c960d5aeef06f811d5e6","value":"Zenodo"},"instancetype":{"classid":"0029","classname":"Software","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.5281/zenodo.1473694"}],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["http://dx.doi.org/10.5281/zenodo.1473694"]},{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":{"key":"10|openaire____::9e3be59865b2c1c335d32dae2fe7b254","value":"Datacite"},"dateofacceptance":{"value":"2018-10-29"},"hostedby":{"key":"10|re3data_____::7b0ad08687b2c960d5aeef06f811d5e6","value":"Zenodo"},"instancetype":{"classid":"0029","classname":"Software","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:actionset","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital 
Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.5281/zenodo.1473694"}],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://dx.doi.org/10.5281/zenodo.1473694"]},{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|opendoar____::358aee4cc897452c00244351e4d91f69","value":"ZENODO"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2018-10-29"},"hostedby":{"key":"10|opendoar____::358aee4cc897452c00244351e4d91f69","value":"ZENODO"},"instancetype":{"classid":"0000","classname":"Unknown","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[{"dataInfo":{"deletedbyinference":false,"inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.5281/zenodo.1473694"}],"refereed":{"classid":"0000","classname":"Unknown","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["http://dx.doi.org/10.5281/zenodo.1473694"]}],"language":{"classid":"und","classname":"Undetermined","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1635435250840,"license":[],"measures":[],"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fdatacite-api.dnet.d4science.org","datestamp":"","harvestDate":"2020-04-10T11:21:15.546Z","identifier":"10.5281/zenodo.1473694","metadataNamespace":""}},"originalId":["50|datacite____::0682c24009ab5c1624d1c032783feff1","10.5281/zenodo.1473694","oai:zenodo.org:1473694","50|od______2659::098cabdf6ac7d52b3d969c89a1ae95ba","50|r37b0ad08687::098cabdf6ac7d52b3d969c89a1ae95ba"],"pid":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object 
Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.5281/zenodo.1473694"}],"programmingLanguage":{"classid":"","classname":"","schemeid":"","schemename":""},"publisher":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Zenodo"},"relevantdate":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"issued","classname":"issued","schemeid":"dnet:dataCite_date","schemename":"dnet:dataCite_date"},"value":"2018-10-29"}],"resourcetype":{"classid":"UNKNOWN","classname":"Unknown","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"software","classname":"software","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[],"subject":[],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:datasetarchive","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Cytoscape/Cytoscape.Js V3.2.19"}]} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/updates b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/updates new file mode 100644 index 000000000..04ad7a79b --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/resolution/updates @@ -0,0 +1,4 @@ +unresolved::10.17026/dans-x3z-fsq5::doi +unresolved::10.17026/dans-xsw-qtnx::doi +unresolved::10.5281/zenodo.1473694::doi +unresolved::10.17632/fake::doi diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh index dc19f84b4..03aa535e1 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/updateCache.sh @@ -1,4 +1,4 @@ #!/usr/bin/env bash curl --request GET $1/cache/updateCache -sleep 20h \ No newline at end of file +sleep 6h \ No newline at end of file diff --git a/pom.xml b/pom.xml index 02bc5d8d4..71c55d1f0 100644 --- a/pom.xml +++ b/pom.xml @@ -550,7 +550,7 @@ org.apache.maven.plugins maven-site-plugin - 3.7.1 + 3.9.1 @@ -753,7 +753,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [2.8.21] + [2.8.22] [4.0.3] [6.0.5] [3.1.6]