1
0
Fork 0

resolved conflicts for #165

This commit is contained in:
Claudio Atzori 2021-11-26 16:15:11 +01:00
commit 1de881b796
304 changed files with 18805 additions and 2237 deletions

View File

@ -22,9 +22,20 @@
<id>dnet45-releases</id> <id>dnet45-releases</id>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url> <url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
</repository> </repository>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-build/dhp-code-style</url>
</site>
</distributionManagement> </distributionManagement>
<build> <build>
<extensions>
<extension>
<groupId>org.apache.maven.wagon</groupId>
<artifactId>wagon-ssh</artifactId>
<version>2.10</version>
</extension>
</extensions>
<pluginManagement> <pluginManagement>
<plugins> <plugins>
<plugin> <plugin>
@ -35,7 +46,7 @@
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId> <artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version> <version>3.9.1</version>
</plugin> </plugin>
</plugins> </plugins>
</pluginManagement> </pluginManagement>
@ -43,6 +54,7 @@
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
</properties> </properties>
</project> </project>

View File

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu ref="modules" />
<menu ref="reports"/>
</body>
</project>

View File

@ -10,6 +10,9 @@
<packaging>pom</packaging> <packaging>pom</packaging>
<description>This module is a container for the build tools used in dnet-hadoop</description> <description>This module is a container for the build tools used in dnet-hadoop</description>
<properties>
<maven.javadoc.skip>true</maven.javadoc.skip>
</properties>
<modules> <modules>
<module>dhp-code-style</module> <module>dhp-code-style</module>
@ -17,4 +20,12 @@
<module>dhp-build-properties-maven-plugin</module> <module>dhp-build-properties-maven-plugin</module>
</modules> </modules>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-build/</url>
</site>
</distributionManagement>
</project> </project>

View File

@ -0,0 +1,22 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu ref="modules" />
<menu ref="reports"/>
</body>
</project>

View File

@ -13,7 +13,51 @@
<artifactId>dhp-common</artifactId> <artifactId>dhp-common</artifactId>
<packaging>jar</packaging> <packaging>jar</packaging>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-common</url>
</site>
</distributionManagement>
<description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description> <description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>${net.alchim31.maven.version}</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
<execution>
<id>scala-doc</id>
<phase>process-resources</phase> <!-- or wherever -->
<goals>
<goal>doc</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies> <dependencies>

View File

@ -0,0 +1,72 @@
package eu.dnetlib.dhp.application
import scala.io.Source
/**
* This is the main Interface SparkApplication
* where all the Spark Scala class should inherit
*
*/
trait SparkScalaApplication {
/**
* This is the path in the classpath of the json
* describes all the argument needed to run
*/
val propertyPath: String
/**
* Utility to parse the arguments using the
* property json in the classpath identified from
* the variable propertyPath
*
* @param args the list of arguments
*/
def parseArguments(args: Array[String]): ArgumentApplicationParser = {
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString)
parser.parseArgument(args)
parser
}
/**
* Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
def run(): Unit
}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.slf4j.Logger
abstract class AbstractScalaApplication (val propertyPath:String, val args:Array[String], log:Logger) extends SparkScalaApplication {
var parser: ArgumentApplicationParser = null
var spark:SparkSession = null
def initialize():SparkScalaApplication = {
parser = parseArguments(args)
spark = createSparkSession()
this
}
/**
* Utility for creating a spark session starting from parser
*
* @return a spark Session
*/
private def createSparkSession():SparkSession = {
require(parser!= null)
val conf:SparkConf = new SparkConf()
val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master")
SparkSession.builder().config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
}
}

View File

@ -28,7 +28,7 @@ public class HdfsSupport {
* @param configuration Configuration of hadoop env * @param configuration Configuration of hadoop env
*/ */
public static boolean exists(String path, Configuration configuration) { public static boolean exists(String path, Configuration configuration) {
logger.info("Removing path: {}", path); logger.info("Checking existence for path: {}", path);
return rethrowAsRuntimeException( return rethrowAsRuntimeException(
() -> { () -> {
Path f = new Path(path); Path f = new Path(path);

View File

@ -85,6 +85,13 @@ public class MakeTarArchive implements Serializable {
String p_string = p.toString(); String p_string = p.toString();
if (!p_string.endsWith("_SUCCESS")) { if (!p_string.endsWith("_SUCCESS")) {
String name = p_string.substring(p_string.lastIndexOf("/") + 1); String name = p_string.substring(p_string.lastIndexOf("/") + 1);
if (name.startsWith("part-") & name.length() > 10) {
String tmp = name.substring(0, 10);
if (name.contains(".")) {
tmp += name.substring(name.indexOf("."));
}
name = tmp;
}
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name); TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
entry.setSize(fileStatus.getLen()); entry.setSize(fileStatus.getLen());
current_size += fileStatus.getLen(); current_size += fileStatus.getLen();

View File

@ -27,8 +27,11 @@ public class GraphCleaningFunctions extends CleaningFunctions {
public static final int ORCID_LEN = 19; public static final int ORCID_LEN = 19;
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*"; public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]";
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10; public static final String TITLE_TEST = "test";
public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST);
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
public static <T extends Oaf> T fixVocabularyNames(T value) { public static <T extends Oaf> T fixVocabularyNames(T value) {
if (value instanceof Datasource) { if (value instanceof Datasource) {
@ -195,10 +198,16 @@ public class GraphCleaningFunctions extends CleaningFunctions {
final String title = sp final String title = sp
.getValue() .getValue()
.toLowerCase(); .toLowerCase();
final String residual = Unidecode final String decoded = Unidecode.decode(title);
.decode(title)
.replaceAll(TITLE_FILTER_REGEX, ""); if (StringUtils.contains(decoded, TITLE_TEST)) {
return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH; return decoded
.replaceAll(TITLE_FILTER_REGEX, "")
.length() > TITLE_FILTER_RESIDUAL_LENGTH;
}
return !decoded
.replaceAll("\\W|\\d", "")
.isEmpty();
}) })
.map(GraphCleaningFunctions::cleanValue) .map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList())); .collect(Collectors.toList()));

View File

@ -4,19 +4,19 @@ package eu.dnetlib.dhp.utils;
import java.io.*; import java.io.*;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.security.MessageDigest; import java.security.MessageDigest;
import java.util.List; import java.util.*;
import java.util.Map; import java.util.stream.Collectors;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.binary.Base64OutputStream;
import org.apache.commons.codec.binary.Hex; import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -26,6 +26,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import com.jayway.jsonpath.JsonPath; import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import net.minidev.json.JSONArray; import net.minidev.json.JSONArray;
import scala.collection.JavaConverters; import scala.collection.JavaConverters;
import scala.collection.Seq; import scala.collection.Seq;
@ -52,10 +54,56 @@ public class DHPUtils {
} }
} }
/**
* Retrieves from the metadata store manager application the list of paths associated with mdstores characterized
* by he given format, layout, interpretation
* @param mdstoreManagerUrl the URL of the mdstore manager service
* @param format the mdstore format
* @param layout the mdstore layout
* @param interpretation the mdstore interpretation
* @param includeEmpty include Empty mdstores
* @return the set of hdfs paths
* @throws IOException in case of HTTP communication issues
*/
public static Set<String> mdstorePaths(final String mdstoreManagerUrl,
final String format,
final String layout,
final String interpretation,
boolean includeEmpty) throws IOException {
final String url = mdstoreManagerUrl + "/mdstores/";
final ObjectMapper objectMapper = new ObjectMapper();
final HttpGet req = new HttpGet(url);
try (final CloseableHttpClient client = HttpClients.createDefault()) {
try (final CloseableHttpResponse response = client.execute(req)) {
final String json = IOUtils.toString(response.getEntity().getContent());
final MDStoreWithInfo[] mdstores = objectMapper.readValue(json, MDStoreWithInfo[].class);
return Arrays
.stream(mdstores)
.filter(md -> md.getFormat().equalsIgnoreCase(format))
.filter(md -> md.getLayout().equalsIgnoreCase(layout))
.filter(md -> md.getInterpretation().equalsIgnoreCase(interpretation))
.filter(md -> StringUtils.isNotBlank(md.getHdfsPath()))
.filter(md -> StringUtils.isNotBlank(md.getCurrentVersion()))
.filter(md -> includeEmpty || md.getSize() > 0)
.map(md -> md.getHdfsPath() + "/" + md.getCurrentVersion() + "/store")
.collect(Collectors.toSet());
}
}
}
public static String generateIdentifier(final String originalId, final String nsPrefix) { public static String generateIdentifier(final String originalId, final String nsPrefix) {
return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId)); return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId));
} }
public static String generateUnresolvedIdentifier(final String pid, final String pidType) {
final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid);
return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim());
}
public static String getJPathString(final String jsonPath, final String json) { public static String getJPathString(final String jsonPath, final String json) {
try { try {
Object o = JsonPath.read(json, jsonPath); Object o = JsonPath.read(json, jsonPath);

View File

@ -107,7 +107,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=2560 --conf spark.sql.shuffle.partitions=5000
</spark-opts> </spark-opts>
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg> <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
@ -159,7 +159,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=2560 --conf spark.sql.shuffle.partitions=5000
</spark-opts> </spark-opts>
<arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg> <arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>

View File

@ -99,7 +99,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=2560 --conf spark.sql.shuffle.partitions=5000
</spark-opts> </spark-opts>
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg> <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg> <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>

View File

@ -29,6 +29,13 @@
<goal>testCompile</goal> <goal>testCompile</goal>
</goals> </goals>
</execution> </execution>
<execution>
<id>scala-doc</id>
<phase>process-resources</phase> <!-- or wherever -->
<goals>
<goal>doc</goal>
</goals>
</execution>
</executions> </executions>
<configuration> <configuration>
<scalaVersion>${scala.version}</scalaVersion> <scalaVersion>${scala.version}</scalaVersion>

View File

@ -0,0 +1,49 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import java.util.Optional;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class Constants {
public static final String DOI = "doi";
public static final String UPDATE_DATA_INFO_TYPE = "update";
public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos";
public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";
public static final String FOS_CLASS_ID = "FOS";
public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";
public static final String NULL = "NULL";
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private Constants() {
}
public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) {
return Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
}

View File

@ -0,0 +1,77 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.collection.GetCSV;
public class GetFOSData implements Serializable {
private static final Logger log = LoggerFactory.getLogger(GetFOSData.class);
public static final char DEFAULT_DELIMITER = '\t';
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Objects
.requireNonNull(
GetFOSData.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_fos_parameters.json"))));
parser.parseArgument(args);
// the path where the original fos csv file is stored
final String sourcePath = parser.get("sourcePath");
log.info("sourcePath {}", sourcePath);
// the path where to put the file as json
final String outputPath = parser.get("outputPath");
log.info("outputPath {}", outputPath);
final String hdfsNameNode = parser.get("hdfsNameNode");
log.info("hdfsNameNode {}", hdfsNameNode);
final String classForName = parser.get("classForName");
log.info("classForName {}", classForName);
final char delimiter = Optional
.ofNullable(parser.get("delimiter"))
.map(s -> s.charAt(0))
.orElse(DEFAULT_DELIMITER);
log.info("delimiter {}", delimiter);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
new GetFOSData().doRewrite(sourcePath, outputPath, classForName, delimiter, fileSystem);
}
public void doRewrite(String inputPath, String outputFile, String classForName, char delimiter, FileSystem fs)
throws IOException, ClassNotFoundException {
// reads the csv and writes it as its json equivalent
try (InputStreamReader reader = new InputStreamReader(fs.open(new Path(inputPath)))) {
GetCSV.getCsv(fs, reader, outputFile, classForName, delimiter);
}
}
}

View File

@ -0,0 +1,145 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*;
import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.UPDATE_CLASS_NAME;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hdfs.client.HdfsUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.BipDeserialize;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.BipScore;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Measure;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
public class PrepareBipFinder implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareBipFinder.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static <I extends Result> void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareBipFinder.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String sourcePath = parser.get("sourcePath");
log.info("sourcePath {}: ", sourcePath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
prepareResults(spark, sourcePath, outputPath);
});
}
private static <I extends Result> void prepareResults(SparkSession spark, String inputPath, String outputPath) {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
.textFile(inputPath)
.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
spark
.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
BipScore bs = new BipScore();
bs.setId(key);
bs.setScoreList(entry.get(key));
return bs;
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class))
.map((MapFunction<BipScore, Result>) v -> {
Result r = new Result();
r.setId(DHPUtils.generateUnresolvedIdentifier(v.getId(), DOI));
r.setMeasures(getMeasure(v));
return r;
}, Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "/bip");
}
private static List<Measure> getMeasure(BipScore value) {
return value
.getScoreList()
.stream()
.map(score -> {
Measure m = new Measure();
m.setId(score.getId());
m
.setUnit(
score
.getUnit()
.stream()
.map(unit -> {
KeyValue kv = new KeyValue();
kv.setValue(unit.getValue());
kv.setKey(unit.getKey());
kv
.setDataInfo(
OafMapperUtils
.dataInfo(
false,
UPDATE_DATA_INFO_TYPE,
true,
false,
OafMapperUtils
.qualifier(
UPDATE_MEASURE_BIP_CLASS_ID,
UPDATE_CLASS_NAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
""));
return kv;
})
.collect(Collectors.toList()));
return m;
})
.collect(Collectors.toList());
}
}

View File

@ -0,0 +1,133 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
public class PrepareFOSSparkJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareFOSSparkJob.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareFOSSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String sourcePath = parser.get("sourcePath");
log.info("sourcePath: {}", sourcePath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
distributeFOSdois(
spark,
sourcePath,
outputPath);
});
}
private static void distributeFOSdois(SparkSession spark, String sourcePath, String outputPath) {
Dataset<FOSDataModel> fosDataset = readPath(spark, sourcePath, FOSDataModel.class);
fosDataset.flatMap((FlatMapFunction<FOSDataModel, FOSDataModel>) v -> {
List<FOSDataModel> fosList = new ArrayList<>();
final String level1 = v.getLevel1();
final String level2 = v.getLevel2();
final String level3 = v.getLevel3();
Arrays
.stream(v.getDoi().split("\u0002"))
.forEach(d -> fosList.add(FOSDataModel.newInstance(d, level1, level2, level3)));
return fosList.iterator();
}, Encoders.bean(FOSDataModel.class))
.map((MapFunction<FOSDataModel, Result>) value -> {
Result r = new Result();
r.setId(DHPUtils.generateUnresolvedIdentifier(value.getDoi(), DOI));
r.setSubject(getSubjects(value));
return r;
}, Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "/fos");
}
private static List<StructuredProperty> getSubjects(FOSDataModel fos) {
return Arrays
.asList(getSubject(fos.getLevel1()), getSubject(fos.getLevel2()), getSubject(fos.getLevel3()))
.stream()
.filter(Objects::nonNull)
.collect(Collectors.toList());
}
private static StructuredProperty getSubject(String sbj) {
if (sbj.equals(NULL))
return null;
StructuredProperty sp = new StructuredProperty();
sp.setValue(sbj);
sp
.setQualifier(
OafMapperUtils
.qualifier(
FOS_CLASS_ID,
FOS_CLASS_NAME,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES));
sp
.setDataInfo(
OafMapperUtils
.dataInfo(
false,
UPDATE_DATA_INFO_TYPE,
true,
false,
OafMapperUtils
.qualifier(
UPDATE_SUBJECT_FOS_CLASS_ID,
UPDATE_CLASS_NAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
""));
return sp;
}
}

View File

@ -0,0 +1,79 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static eu.dnetlib.dhp.actionmanager.createunresolvedentities.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Result;
public class SparkSaveUnresolved implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareFOSSparkJob.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareFOSSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/produce_unresolved_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String sourcePath = parser.get("sourcePath");
log.info("sourcePath: {}", sourcePath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
saveUnresolved(
spark,
sourcePath,
outputPath);
});
}
private static void saveUnresolved(SparkSession spark, String sourcePath, String outputPath) {
spark
.read()
.textFile(sourcePath + "/*")
.map(
(MapFunction<String, Result>) l -> OBJECT_MAPPER.readValue(l, Result.class),
Encoders.bean(Result.class))
.groupByKey((MapFunction<Result, String>) r -> r.getId(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Result, Result>) (k, it) -> {
Result ret = it.next();
it.forEachRemaining(r -> ret.mergeFrom(r));
return ret;
}, Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
}

View File

@ -0,0 +1,28 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
/**
* Class that maps the model of the bipFinder! input data.
* Only needed for deserialization purposes
*/
public class BipDeserialize extends HashMap<String, List<Score>> implements Serializable {
public BipDeserialize() {
super();
}
public List<Score> get(String key) {
if (super.get(key) == null) {
return new ArrayList<>();
}
return super.get(key);
}
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
import java.io.Serializable;
import java.util.List;
/**
* Rewriting of the bipFinder input data by extracting the identifier of the result (doi)
*/
public class BipScore implements Serializable {
private String id; // doi
private List<Score> scoreList; // unit as given in the inputfile
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<Score> getScoreList() {
return scoreList;
}
public void setScoreList(List<Score> scoreList) {
this.scoreList = scoreList;
}
}

View File

@ -0,0 +1,71 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
import java.io.Serializable;
import com.opencsv.bean.CsvBindByPosition;
public class FOSDataModel implements Serializable {
@CsvBindByPosition(position = 1)
// @CsvBindByName(column = "doi")
private String doi;
@CsvBindByPosition(position = 2)
// @CsvBindByName(column = "level1")
private String level1;
@CsvBindByPosition(position = 3)
// @CsvBindByName(column = "level2")
private String level2;
@CsvBindByPosition(position = 4)
// @CsvBindByName(column = "level3")
private String level3;
public FOSDataModel() {
}
public FOSDataModel(String doi, String level1, String level2, String level3) {
this.doi = doi;
this.level1 = level1;
this.level2 = level2;
this.level3 = level3;
}
public static FOSDataModel newInstance(String d, String level1, String level2, String level3) {
return new FOSDataModel(d, level1, level2, level3);
}
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public String getLevel1() {
return level1;
}
public void setLevel1(String level1) {
this.level1 = level1;
}
public String getLevel2() {
return level2;
}
public void setLevel2(String level2) {
this.level2 = level2;
}
public String getLevel3() {
return level3;
}
public void setLevel3(String level3) {
this.level3 = level3;
}
}

View File

@ -0,0 +1,26 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
import java.io.Serializable;
public class KeyValue implements Serializable {
private String key;
private String value;
public String getKey() {
return key;
}
public void setKey(String key) {
this.key = key;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
import java.io.Serializable;
import java.util.List;
/**
* represents the score in the input file
*/
public class Score implements Serializable {
private String id;
private List<KeyValue> unit;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public List<KeyValue> getUnit() {
return unit;
}
public void setUnit(List<KeyValue> unit) {
this.unit = unit;
}
}

View File

@ -1,41 +0,0 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object ExportActionSetJobNode {
val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val spark: SparkSession = SparkSession.builder().config(conf)
.appName(ExportActionSetJobNode.getClass.getSimpleName)
.master(master)
.getOrCreate()
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val tEncoder:Encoder[(String,String)] = Encoders.tuple(Encoders.STRING,Encoders.STRING)
spark.read.load(sourcePath).as[Oaf]
.map(o =>DataciteToOAFTransformation.toActionSet(o))
.filter(o => o!= null)
.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
}
}

View File

@ -1,46 +0,0 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object FilterCrossrefEntitiesSpark {
val log: Logger = LoggerFactory.getLogger(getClass.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/filter_crossref_param.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
log.info("sourcePath: {}", sourcePath)
val targetPath = parser.get("targetPath")
log.info("targetPath: {}", targetPath)
val spark: SparkSession = SparkSession.builder().config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val resEncoder: Encoder[Result] = Encoders.kryo[Result]
val d:Dataset[Oaf]= spark.read.load(sourcePath).as[Oaf]
d.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result]).write.mode(SaveMode.Overwrite).save(targetPath)
}
}

View File

@ -1,48 +0,0 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object GenerateDataciteDatasetSpark {
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
val isLookupUrl: String = parser.get("isLookupUrl")
log.info("isLookupUrl: {}", isLookupUrl)
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
val spark: SparkSession = SparkSession.builder().config(conf)
.appName(GenerateDataciteDatasetSpark.getClass.getSimpleName)
.master(master)
.getOrCreate()
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
import spark.implicits._
spark.read.load(sourcePath).as[DataciteType]
.filter(d => d.isActive)
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
.filter(d => d != null)
.write.mode(SaveMode.Overwrite).save(targetPath)
}
}

View File

@ -0,0 +1,181 @@
package eu.dnetlib.dhp.actionmanager.opencitations;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.io.Serializable;
import java.util.*;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import scala.Tuple2;
public class CreateActionSetSparkJob implements Serializable {
public static final String OPENCITATIONS_CLASSID = "sysimport:crosswalk:opencitations";
public static final String OPENCITATIONS_CLASSNAME = "Imported from OpenCitations";
private static final String ID_PREFIX = "50|doi_________::";
private static final String TRUST = "0.91";
private static final Logger log = LoggerFactory.getLogger(CreateActionSetSparkJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(final String[] args) throws IOException, ParseException {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Objects
.requireNonNull(
CreateActionSetSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json"))));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath {}", inputPath.toString());
final String outputPath = parser.get("outputPath");
log.info("outputPath {}", outputPath);
final boolean shouldDuplicateRels = Optional
.ofNullable(parser.get("shouldDuplicateRels"))
.map(Boolean::valueOf)
.orElse(Boolean.FALSE);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
extractContent(spark, inputPath, outputPath, shouldDuplicateRels);
});
}
private static void extractContent(SparkSession spark, String inputPath, String outputPath,
boolean shouldDuplicateRels) {
spark
.sqlContext()
.createDataset(spark.sparkContext().textFile(inputPath + "/*", 6000), Encoders.STRING())
.flatMap(
(FlatMapFunction<String, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
Encoders.bean(Relation.class))
.filter((FilterFunction<Relation>) value -> value != null)
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}
private static List<Relation> createRelation(String value, boolean duplicate) {
String[] line = value.split(",");
if (!line[1].startsWith("10.")) {
return new ArrayList<>();
}
List<Relation> relationList = new ArrayList<>();
String citing = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[1]));
final String cited = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[2]));
relationList
.addAll(
getRelations(
citing,
cited));
if (duplicate && line[1].endsWith(".refs")) {
citing = ID_PREFIX + IdentifierFactory
.md5(CleaningFunctions.normalizePidValue("doi", line[1].substring(0, line[1].indexOf(".refs"))));
relationList.addAll(getRelations(citing, cited));
}
return relationList;
}
private static Collection<Relation> getRelations(String citing, String cited) {
return Arrays
.asList(
getRelation(citing, cited, ModelConstants.CITES),
getRelation(cited, citing, ModelConstants.IS_CITED_BY));
}
public static Relation getRelation(
String source,
String target,
String relclass) {
Relation r = new Relation();
r.setCollectedfrom(getCollectedFrom());
r.setSource(source);
r.setTarget(target);
r.setRelClass(relclass);
r.setRelType(ModelConstants.RESULT_RESULT);
r.setSubRelType(ModelConstants.CITATION);
r
.setDataInfo(
getDataInfo());
return r;
}
public static List<KeyValue> getCollectedFrom() {
KeyValue kv = new KeyValue();
kv.setKey(ModelConstants.OPENOCITATIONS_ID);
kv.setValue(ModelConstants.OPENOCITATIONS_NAME);
return Arrays.asList(kv);
}
public static DataInfo getDataInfo() {
DataInfo di = new DataInfo();
di.setInferred(false);
di.setDeletedbyinference(false);
di.setTrust(TRUST);
di
.setProvenanceaction(
getQualifier(OPENCITATIONS_CLASSID, OPENCITATIONS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS));
return di;
}
public static Qualifier getQualifier(String class_id, String class_name,
String qualifierSchema) {
Qualifier pa = new Qualifier();
pa.setClassid(class_id);
pa.setClassname(class_name);
pa.setSchemeid(qualifierSchema);
pa.setSchemename(qualifierSchema);
return pa;
}
}

View File

@ -0,0 +1,93 @@
package eu.dnetlib.dhp.actionmanager.opencitations;
import java.io.*;
import java.io.Serializable;
import java.util.Objects;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class GetOpenCitationsRefs implements Serializable {
private static final Logger log = LoggerFactory.getLogger(GetOpenCitationsRefs.class);
public static void main(final String[] args) throws IOException, ParseException {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Objects
.requireNonNull(
GetOpenCitationsRefs.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json"))));
parser.parseArgument(args);
final String[] inputFile = parser.get("inputFile").split(";");
log.info("inputFile {}", inputFile.toString());
final String workingPath = parser.get("workingPath");
log.info("workingPath {}", workingPath);
final String hdfsNameNode = parser.get("hdfsNameNode");
log.info("hdfsNameNode {}", hdfsNameNode);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
GetOpenCitationsRefs ocr = new GetOpenCitationsRefs();
for (String file : inputFile) {
ocr.doExtract(workingPath + "/Original/" + file, workingPath, fileSystem);
}
}
private void doExtract(String inputFile, String workingPath, FileSystem fileSystem)
throws IOException {
final Path path = new Path(inputFile);
FSDataInputStream oc_zip = fileSystem.open(path);
int count = 1;
try (ZipInputStream zis = new ZipInputStream(oc_zip)) {
ZipEntry entry = null;
while ((entry = zis.getNextEntry()) != null) {
if (!entry.isDirectory()) {
String fileName = entry.getName();
fileName = fileName.substring(0, fileName.indexOf("T")) + "_" + count;
count++;
try (
FSDataOutputStream out = fileSystem
.create(new Path(workingPath + "/COCI/" + fileName + ".gz"));
GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
IOUtils.copy(zis, gzipOs);
}
}
}
}
}
}

View File

@ -60,14 +60,10 @@ object SparkCreateActionset {
val entities: Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders)) val entities: Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders))
entities.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result])
entities entities
.joinWith(idRelation, entities("_1").equalTo(idRelation("value"))) .joinWith(idRelation, entities("_1").equalTo(idRelation("value")))
.map(p => p._1._2) .map(p => p._1._2)
.write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf") .write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
} }
} }

View File

@ -0,0 +1,49 @@
package eu.dnetlib.dhp.collection
import eu.dnetlib.dhp.schema.common.ModelSupport
import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation}
object CollectionUtils {
/**
* This method in pipeline to the transformation phase,
* generates relations in both verse, typically it should be a phase of flatMap
*
* @param i input OAF
* @return
* If the input OAF is an entity -> List(i)
* If the input OAF is a relation -> List(relation, inverseRelation)
*
*/
def fixRelations(i: Oaf): List[Oaf] = {
if (i.isInstanceOf[OafEntity])
return List(i)
else {
val r: Relation = i.asInstanceOf[Relation]
val currentRel = ModelSupport.findRelation(r.getRelClass)
if (currentRel != null) {
// Cleaning relation
r.setRelType(currentRel.getRelType)
r.setSubRelType(currentRel.getSubReltype)
r.setRelClass(currentRel.getRelClass)
val inverse = new Relation
inverse.setSource(r.getTarget)
inverse.setTarget(r.getSource)
inverse.setRelType(currentRel.getRelType)
inverse.setSubRelType(currentRel.getSubReltype)
inverse.setRelClass(currentRel.getInverseRelClass)
inverse.setCollectedfrom(r.getCollectedfrom)
inverse.setDataInfo(r.getDataInfo)
inverse.setProperties(r.getProperties)
inverse.setLastupdatetimestamp(r.getLastupdatetimestamp)
inverse.setValidated(r.getValidated)
inverse.setValidationDate(r.getValidationDate)
return List(r, inverse)
}
}
List()
}
}

View File

@ -1,12 +1,10 @@
package eu.dnetlib.dhp.actionmanager.datacite package eu.dnetlib.dhp.datacite
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.http.client.config.RequestConfig import org.apache.http.client.config.RequestConfig
import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest} import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest}
import org.apache.http.entity.StringEntity import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.{HttpClientBuilder, HttpClients} import org.apache.http.impl.client.HttpClientBuilder
import java.io.IOException
abstract class AbstractRestClient extends Iterator[String] { abstract class AbstractRestClient extends Iterator[String] {

View File

@ -1,7 +1,7 @@
package eu.dnetlib.dhp.actionmanager.datacite package eu.dnetlib.dhp.datacite
import org.json4s.{DefaultFormats, JValue}
import org.json4s.jackson.JsonMethods.{compact, parse, render} import org.json4s.jackson.JsonMethods.{compact, parse, render}
import org.json4s.{DefaultFormats, JValue}
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient { class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient {

View File

@ -0,0 +1,134 @@
package eu.dnetlib.dhp.datacite
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue}
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils
import java.io.InputStream
import java.time.format.DateTimeFormatter
import java.util.Locale
import java.util.regex.Pattern
import scala.io.Source
/**
* This class represent the dataModel of the input Dataset of Datacite
* @param doi THE DOI
* @param timestamp timestamp of last update date
* @param isActive the record is active or deleted
* @param json the json native records
*/
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
/*
The following class are utility class used for the mapping from
json datacite to OAF Shema
*/
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
case class DateType(date: Option[String], dateType: Option[String]) {}
case class OAFRelations(relation:String, inverse:String, relType:String)
class DataciteModelConstants extends Serializable {
}
object DataciteModelConstants {
val REL_TYPE_VALUE:String = "resultResult"
val DATE_RELATION_KEY = "RelationDate"
val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter"
val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords"
val DATACITE_NAME = "Datacite"
val dataInfo: DataInfo = dataciteDataInfo("0.9")
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME)
val subRelTypeMapping: Map[String,OAFRelations] = Map(
ModelConstants.REFERENCES -> OAFRelations(ModelConstants.REFERENCES, ModelConstants.IS_REFERENCED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_REFERENCED_BY -> OAFRelations(ModelConstants.IS_REFERENCED_BY,ModelConstants.REFERENCES, ModelConstants.RELATIONSHIP),
ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations(ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.SUPPLEMENT),
ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations(ModelConstants.IS_SUPPLEMENT_TO,ModelConstants.IS_SUPPLEMENTED_BY,ModelConstants.SUPPLEMENT),
ModelConstants.HAS_PART -> OAFRelations(ModelConstants.HAS_PART,ModelConstants.IS_PART_OF, ModelConstants.PART),
ModelConstants.IS_PART_OF -> OAFRelations(ModelConstants.IS_PART_OF,ModelConstants.HAS_PART, ModelConstants.PART),
ModelConstants.IS_VERSION_OF-> OAFRelations(ModelConstants.IS_VERSION_OF,ModelConstants.HAS_VERSION,ModelConstants.VERSION),
ModelConstants.HAS_VERSION-> OAFRelations(ModelConstants.HAS_VERSION,ModelConstants.IS_VERSION_OF,ModelConstants.VERSION),
ModelConstants.IS_IDENTICAL_TO -> OAFRelations(ModelConstants.IS_IDENTICAL_TO,ModelConstants.IS_IDENTICAL_TO, ModelConstants.RELATIONSHIP),
ModelConstants.IS_CONTINUED_BY -> OAFRelations(ModelConstants.IS_CONTINUED_BY,ModelConstants.CONTINUES, ModelConstants.RELATIONSHIP),
ModelConstants.CONTINUES -> OAFRelations(ModelConstants.CONTINUES,ModelConstants.IS_CONTINUED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_NEW_VERSION_OF-> OAFRelations(ModelConstants.IS_NEW_VERSION_OF,ModelConstants.IS_PREVIOUS_VERSION_OF, ModelConstants.VERSION),
ModelConstants.IS_PREVIOUS_VERSION_OF ->OAFRelations(ModelConstants.IS_PREVIOUS_VERSION_OF,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_SOURCE_OF -> OAFRelations(ModelConstants.IS_SOURCE_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
ModelConstants.IS_DERIVED_FROM -> OAFRelations(ModelConstants.IS_DERIVED_FROM,ModelConstants.IS_SOURCE_OF, ModelConstants.VERSION),
ModelConstants.CITES -> OAFRelations(ModelConstants.CITES,ModelConstants.IS_CITED_BY, ModelConstants.CITATION),
ModelConstants.IS_CITED_BY -> OAFRelations(ModelConstants.IS_CITED_BY,ModelConstants.CITES, ModelConstants.CITATION),
ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations(ModelConstants.IS_VARIANT_FORM_OF,ModelConstants.IS_DERIVED_FROM, ModelConstants.VERSION),
ModelConstants.IS_OBSOLETED_BY -> OAFRelations(ModelConstants.IS_OBSOLETED_BY,ModelConstants.IS_NEW_VERSION_OF, ModelConstants.VERSION),
ModelConstants.REVIEWS -> OAFRelations(ModelConstants.REVIEWS,ModelConstants.IS_REVIEWED_BY, ModelConstants.REVIEW),
ModelConstants.IS_REVIEWED_BY -> OAFRelations(ModelConstants.IS_REVIEWED_BY,ModelConstants.REVIEWS, ModelConstants.REVIEW),
ModelConstants.DOCUMENTS -> OAFRelations(ModelConstants.DOCUMENTS,ModelConstants.IS_DOCUMENTED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_DOCUMENTED_BY -> OAFRelations(ModelConstants.IS_DOCUMENTED_BY,ModelConstants.DOCUMENTS, ModelConstants.RELATIONSHIP),
ModelConstants.COMPILES -> OAFRelations(ModelConstants.COMPILES,ModelConstants.IS_COMPILED_BY, ModelConstants.RELATIONSHIP),
ModelConstants.IS_COMPILED_BY -> OAFRelations(ModelConstants.IS_COMPILED_BY,ModelConstants.COMPILES, ModelConstants.RELATIONSHIP)
)
val datacite_filter: List[String] = {
val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH)
require(stream!= null)
Source.fromInputStream(stream).getLines().toList
}
def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo(false,null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, trust)
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val funder_regex: List[(Pattern, String)] = List(
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
)
val Date_regex: List[Pattern] = List(
//Y-M-D
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
//M-D-Y
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
//D-M-Y
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
//Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
)
}

View File

@ -1,7 +1,8 @@
package eu.dnetlib.dhp.actionmanager.datacite package eu.dnetlib.dhp.datacite
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.datacite.DataciteModelConstants._
import eu.dnetlib.dhp.schema.action.AtomicAction import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils} import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
@ -12,121 +13,30 @@ import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
import java.nio.charset.CodingErrorAction
import java.text.SimpleDateFormat import java.text.SimpleDateFormat
import java.time.LocalDate import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter import java.time.format.DateTimeFormatter
import java.util.regex.Pattern
import java.util.{Date, Locale} import java.util.{Date, Locale}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.io.{Codec, Source}
import scala.language.postfixOps
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
case class DateType(date: Option[String], dateType: Option[String]) {}
case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {}
object DataciteToOAFTransformation { object DataciteToOAFTransformation {
val REL_TYPE_VALUE:String = "resultResult"
val DATE_RELATION_KEY = "RelationDate"
val subRelTypeMapping: Map[String,(String,String)] = Map(
"References" ->("IsReferencedBy","relationship"),
"IsSupplementTo" ->("IsSupplementedBy","supplement"),
"IsPartOf" ->("HasPart","part"),
"HasPart" ->("IsPartOf","part"),
"IsVersionOf" ->("HasVersion","version"),
"HasVersion" ->("IsVersionOf","version"),
"IsIdenticalTo" ->("IsIdenticalTo","relationship"),
"IsPreviousVersionOf" ->("IsNewVersionOf","version"),
"IsContinuedBy" ->("Continues","relationship"),
"Continues" ->("IsContinuedBy","relationship"),
"IsNewVersionOf" ->("IsPreviousVersionOf","version"),
"IsSupplementedBy" ->("IsSupplementTo","supplement"),
"IsDocumentedBy" ->("Documents","relationship"),
"IsSourceOf" ->("IsDerivedFrom","relationship"),
"Cites" ->("IsCitedBy","citation"),
"IsCitedBy" ->("Cites","citation"),
"IsDerivedFrom" ->("IsSourceOf","relationship"),
"IsVariantFormOf" ->("IsDerivedFrom","version"),
"IsReferencedBy" ->("References","relationship"),
"IsObsoletedBy" ->("IsNewVersionOf","version"),
"Reviews" ->("IsReviewedBy","review"),
"Documents" ->("IsDocumentedBy","relationship"),
"IsCompiledBy" ->("Compiles","relationship"),
"Compiles" ->("IsCompiledBy","relationship"),
"IsReviewedBy" ->("Reviews","review")
)
implicit val codec: Codec = Codec("UTF-8")
codec.onMalformedInput(CodingErrorAction.REPLACE)
codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords"
val j_filter: List[String] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString
s.lines.toList
}
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
val unknown_repository: HostedByMapType = HostedByMapType(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID, ModelConstants.UNKNOWN_REPOSITORY.getValue, ModelConstants.UNKNOWN_REPOSITORY.getValue, Some(1.0F))
val dataInfo: DataInfo = generateDataInfo("0.9")
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, "Datacite")
val hostedByMap: Map[String, HostedByMapType] = { /**
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString * This method should skip record if json contains invalid text
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats * defined in gile datacite_filter
lazy val json: org.json4s.JValue = parse(s) * @param json
json.extract[Map[String, HostedByMapType]] * @return True if the record should be skipped
} */
def skip_record(json: String): Boolean = {
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH) datacite_filter.exists(f => json.contains(f))
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val funder_regex: List[(Pattern, String)] = List(
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
)
val Date_regex: List[Pattern] = List(
//Y-M-D
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
//M-D-Y
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
//D-M-Y
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
//Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
)
def filter_json(json: String): Boolean = {
j_filter.exists(f => json.contains(f))
} }
@deprecated("this method will be removed", "dhp")
def toActionSet(item: Oaf): (String, String) = { def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
@ -206,6 +116,8 @@ object DataciteToOAFTransformation {
case _: Throwable => "" case _: Throwable => ""
} }
} }
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = { def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) { if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType) val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
@ -324,10 +236,7 @@ object DataciteToOAFTransformation {
val p = match_pattern.get._2 val p = match_pattern.get._2
val grantId = m.matcher(awardUri).replaceAll("$2") val grantId = m.matcher(awardUri).replaceAll("$2")
val targetId = s"$p${DHPUtils.md5(grantId)}" val targetId = s"$p${DHPUtils.md5(grantId)}"
List( List( generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo) )
generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo),
generateRelation(targetId, sourceId, "produces", DATACITE_COLLECTED_FROM, dataInfo)
)
} }
else else
List() List()
@ -336,7 +245,7 @@ object DataciteToOAFTransformation {
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = { def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
if (filter_json(input)) if (skip_record(input))
return List() return List()
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
@ -515,8 +424,8 @@ object DataciteToOAFTransformation {
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES) val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
if (client.isDefined) { if (client.isDefined) {
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name)) instance.setHostedby(OafMapperUtils.keyValue(generateDSId(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID), ModelConstants.UNKNOWN_REPOSITORY.getValue))
instance.setCollectedfrom(DATACITE_COLLECTED_FROM) instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava) instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
instance.setAccessright(access_rights_qualifier) instance.setAccessright(access_rights_qualifier)
@ -570,7 +479,7 @@ object DataciteToOAFTransformation {
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.setDataInfo(dataInfo) rel.setDataInfo(dataInfo)
val subRelType = subRelTypeMapping(r.relationType)._2 val subRelType = subRelTypeMapping(r.relationType).relType
rel.setRelType(REL_TYPE_VALUE) rel.setRelType(REL_TYPE_VALUE)
rel.setSubRelType(subRelType) rel.setSubRelType(subRelType)
rel.setRelClass(r.relationType) rel.setRelClass(r.relationType)
@ -580,22 +489,13 @@ object DataciteToOAFTransformation {
rel.setProperties(List(dateProps).asJava) rel.setProperties(List(dateProps).asJava)
rel.setSource(id) rel.setSource(id)
rel.setTarget(s"unresolved::${r.relatedIdentifier}::${r.relatedIdentifierType}") rel.setTarget(DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier,r.relatedIdentifierType))
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.getCollectedfrom.asScala.map(c => c.getValue)(collection.breakOut) rel.getCollectedfrom.asScala.map(c => c.getValue).toList
rel rel
})(collection breakOut) })
} }
def generateDataInfo(trust: String): DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust(trust)
di.setProvenanceaction(ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER)
di
}
def generateDSId(input: String): String = { def generateDSId(input: String): String = {
val b = StringUtils.substringBefore(input, "::") val b = StringUtils.substringBefore(input, "::")

View File

@ -0,0 +1,94 @@
package eu.dnetlib.dhp.datacite
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.application.AbstractScalaApplication
import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations
import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH}
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord}
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
class GenerateDataciteDatasetSpark (propertyPath:String, args:Array[String], log:Logger) extends AbstractScalaApplication(propertyPath, args, log:Logger) {
/**
* Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
override def run(): Unit = {
val sourcePath = parser.get("sourcePath")
log.info(s"SourcePath is '$sourcePath'")
val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
log.info(s"exportLinks is '$exportLinks'")
val isLookupUrl: String = parser.get("isLookupUrl")
log.info("isLookupUrl: {}", isLookupUrl)
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
require(vocabularies != null)
val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'")
val mapper = new ObjectMapper()
val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
val outputBasePath = cleanedMdStoreVersion.getHdfsPath
log.info(s"outputBasePath is '$outputBasePath'")
val targetPath = s"$outputBasePath/$MDSTORE_DATA_PATH"
log.info(s"targetPath is '$targetPath'")
generateDataciteDataset(sourcePath, exportLinks, vocabularies, targetPath, spark)
reportTotalSize(targetPath, outputBasePath)
}
/**
* For working with MDStore we need to store in a file on hdfs the size of
* the current dataset
* @param targetPath
* @param outputBasePath
*/
def reportTotalSize( targetPath: String, outputBasePath: String ):Unit = {
val total_items = spark.read.load(targetPath).count()
writeHdfsFile(spark.sparkContext.hadoopConfiguration, s"$total_items", outputBasePath + MDSTORE_SIZE_PATH)
}
/**
* Generate the transformed and cleaned OAF Dataset from the native one
* @param sourcePath sourcePath of the native Dataset in format JSON/Datacite
* @param exportLinks If true it generates unresolved links
* @param vocabularies vocabularies for cleaning
* @param targetPath the targetPath of the result Dataset
*/
def generateDataciteDataset(sourcePath: String, exportLinks: Boolean, vocabularies: VocabularyGroup, targetPath: String, spark:SparkSession):Unit = {
require(spark!= null)
import spark.implicits._
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
spark.read.load(sourcePath).as[DataciteType]
.filter(d => d.isActive)
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
.filter(d => d != null)
.flatMap(i => fixRelations(i)).filter(i => i != null)
.write.mode(SaveMode.Overwrite).save(targetPath)
}
}
object GenerateDataciteDatasetSpark {
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
def main(args: Array[String]): Unit = {
new GenerateDataciteDatasetSpark("/eu/dnetlib/dhp/datacite/generate_dataset_params.json", args, log).initialize().run()
}
}

View File

@ -1,6 +1,5 @@
package eu.dnetlib.dhp.actionmanager.datacite package eu.dnetlib.dhp.datacite
import eu.dnetlib.dhp.actionmanager.datacite.DataciteToOAFTransformation.df_it
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.hadoop.conf.Configuration import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path} import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path}
@ -9,14 +8,14 @@ import org.apache.hadoop.io.{IntWritable, SequenceFile, Text}
import org.apache.spark.SparkContext import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Aggregator import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.functions.max
import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession} import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession}
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
import org.apache.spark.sql.functions.max
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import java.time.format.DateTimeFormatter._ import java.time.format.DateTimeFormatter.ISO_DATE_TIME
import java.time.{LocalDate, LocalDateTime, ZoneOffset} import java.time.{LocalDateTime, ZoneOffset}
import scala.io.Source import scala.io.Source
object ImportDatacite { object ImportDatacite {
@ -138,11 +137,11 @@ object ImportDatacite {
} }
} }
private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs:Int): Long = { private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs: Int): Long = {
var from:Long = timestamp * 1000 var from: Long = timestamp * 1000
val delta:Long = 100000000L val delta: Long = 100000000L
var client: DataciteAPIImporter = null var client: DataciteAPIImporter = null
val now :Long =System.currentTimeMillis() val now: Long = System.currentTimeMillis()
var i = 0 var i = 0
try { try {
val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text])) val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
@ -168,7 +167,7 @@ object ImportDatacite {
start = System.currentTimeMillis start = System.currentTimeMillis
} }
} }
println(s"updating from value: $from -> ${from+delta}") println(s"updating from value: $from -> ${from + delta}")
from = from + delta from = from + delta
} }
} catch { } catch {
@ -183,4 +182,4 @@ object ImportDatacite {
i i
} }
} }

View File

@ -1,18 +1,14 @@
package eu.dnetlib.dhp.actionmanager.datacite package eu.dnetlib.dhp.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result} import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.LocalFileSystem
import org.apache.hadoop.hdfs.DistributedFileSystem
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.functions.max import org.apache.spark.sql.functions.max
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import java.text.SimpleDateFormat import java.text.SimpleDateFormat
import java.util.{Date, Locale} import java.util.Locale
import scala.io.Source import scala.io.Source
object SparkDownloadUpdateDatacite { object SparkDownloadUpdateDatacite {
@ -21,7 +17,7 @@ object SparkDownloadUpdateDatacite {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf = new SparkConf val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString) val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json")).mkString)
parser.parseArgument(args) parser.parseArgument(args)
val master = parser.get("master") val master = parser.get("master")
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
@ -42,9 +38,9 @@ object SparkDownloadUpdateDatacite {
import spark.implicits._ import spark.implicits._
val maxDate:String = spark.read.load(workingPath).as[Oaf].filter(s => s.isInstanceOf[Result]).map(r => r.asInstanceOf[Result].getDateofcollection).select(max("value")).first().getString(0) val maxDate: String = spark.read.load(workingPath).as[Oaf].filter(s => s.isInstanceOf[Result]).map(r => r.asInstanceOf[Result].getDateofcollection).select(max("value")).first().getString(0)
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US) val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
val string_to_date =ISO8601FORMAT.parse(maxDate) val string_to_date = ISO8601FORMAT.parse(maxDate)
val ts = string_to_date.getTime val ts = string_to_date.getTime

View File

@ -1,14 +1,12 @@
package eu.dnetlib.dhp.sx.graph.bio package eu.dnetlib.dhp.sx.bio
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils} import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Dataset, Instance, KeyValue, Oaf, Relation, StructuredProperty} import eu.dnetlib.dhp.schema.oaf._
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.{compact, parse, render} import org.json4s.jackson.JsonMethods.{compact, parse, render}
import collection.JavaConverters._
import scala.collection.JavaConverters._
object BioDBToOAF { object BioDBToOAF {
case class EBILinkItem(id: Long, links: String) {} case class EBILinkItem(id: Long, links: String) {}
@ -17,23 +15,23 @@ object BioDBToOAF {
case class UniprotDate(date: String, date_info: String) {} case class UniprotDate(date: String, date_info: String) {}
case class ScholixResolved(pid:String, pidType:String, typology:String, tilte:List[String], datasource:List[String], date:List[String], authors:List[String]){} case class ScholixResolved(pid: String, pidType: String, typology: String, tilte: List[String], datasource: List[String], date: List[String], authors: List[String]) {}
val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9") val DATA_INFO: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
val SUBJ_CLASS = "Keywords" val SUBJ_CLASS = "Keywords"
val DATE_RELATION_KEY = "RelationDate" val DATE_RELATION_KEY = "RelationDate"
val resolvedURL:Map[String,String] = Map( val resolvedURL: Map[String, String] = Map(
"genbank"-> "https://www.ncbi.nlm.nih.gov/nuccore/", "genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
"ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/", "ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
"ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/", "ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
"ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/", "ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/",
"ena" -> "https://www.ebi.ac.uk/ena/browser/view/", "ena" -> "https://www.ebi.ac.uk/ena/browser/view/",
"clinicaltrials.gov"-> "https://clinicaltrials.gov/ct2/show/", "clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/",
"onim"-> "https://omim.org/entry/", "onim" -> "https://omim.org/entry/",
"refseq"-> "https://www.ncbi.nlm.nih.gov/nuccore/", "refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/",
"geo"-> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" "geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc="
) )
@ -45,7 +43,7 @@ object BioDBToOAF {
val ElsevierCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier") val ElsevierCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier")
val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature") val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue("10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", "Springer Nature")
val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)") val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue("10|opendoar____::83e60e09c222f206c725385f53d7e567c", "EMBL-EBIs Protein Data Bank in Europe (PDBe)")
val pubmedCollectedFrom:KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central") val pubmedCollectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
UNIPROTCollectedFrom.setDataInfo(DATA_INFO) UNIPROTCollectedFrom.setDataInfo(DATA_INFO)
PDBCollectedFrom.setDataInfo(DATA_INFO) PDBCollectedFrom.setDataInfo(DATA_INFO)
@ -58,9 +56,9 @@ object BioDBToOAF {
Map( Map(
"uniprot" -> UNIPROTCollectedFrom, "uniprot" -> UNIPROTCollectedFrom,
"pdb"-> PDBCollectedFrom, "pdb" -> PDBCollectedFrom,
"elsevier" ->ElsevierCollectedFrom, "elsevier" -> ElsevierCollectedFrom,
"ebi" ->EBICollectedFrom, "ebi" -> EBICollectedFrom,
"Springer Nature" -> springerNatureCollectedFrom, "Springer Nature" -> springerNatureCollectedFrom,
"NCBI Nucleotide" -> ncbiCollectedFrom, "NCBI Nucleotide" -> ncbiCollectedFrom,
"European Nucleotide Archive" -> enaCollectedFrom, "European Nucleotide Archive" -> enaCollectedFrom,
@ -68,7 +66,7 @@ object BioDBToOAF {
) )
} }
def crossrefLinksToOaf(input:String):Oaf = { def crossrefLinksToOaf(input: String): Oaf = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input) lazy val json = parse(input)
val source_pid = (json \ "Source" \ "Identifier" \ "ID").extract[String].toLowerCase val source_pid = (json \ "Source" \ "Identifier" \ "ID").extract[String].toLowerCase
@ -77,16 +75,16 @@ object BioDBToOAF {
val target_pid = (json \ "Target" \ "Identifier" \ "ID").extract[String].toLowerCase val target_pid = (json \ "Target" \ "Identifier" \ "ID").extract[String].toLowerCase
val target_pid_type = (json \ "Target" \ "Identifier" \ "IDScheme").extract[String].toLowerCase val target_pid_type = (json \ "Target" \ "Identifier" \ "IDScheme").extract[String].toLowerCase
val relation_semantic= (json \ "RelationshipType" \ "Name").extract[String] val relation_semantic = (json \ "RelationshipType" \ "Name").extract[String]
val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String]) val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String])
createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type),collectedFromMap("elsevier"),"relationship", relation_semantic, date) createRelation(target_pid, target_pid_type, generate_unresolved_id(source_pid, source_pid_type), collectedFromMap("elsevier"), "relationship", relation_semantic, date)
} }
def scholixResolvedToOAF(input:ScholixResolved):Oaf = { def scholixResolvedToOAF(input: ScholixResolved): Oaf = {
val d = new Dataset val d = new Dataset
@ -127,18 +125,18 @@ object BioDBToOAF {
d.setInstance(List(i).asJava) d.setInstance(List(i).asJava)
if (input.authors != null && input.authors.nonEmpty) { if (input.authors != null && input.authors.nonEmpty) {
val authors = input.authors.map(a =>{ val authors = input.authors.map(a => {
val authorOAF = new Author val authorOAF = new Author
authorOAF.setFullname(a) authorOAF.setFullname(a)
authorOAF authorOAF
}) })
d.setAuthor(authors.asJava) d.setAuthor(authors.asJava)
} }
if (input.date!= null && input.date.nonEmpty) { if (input.date != null && input.date.nonEmpty) {
val dt = input.date.head val dt = input.date.head
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO)) i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO)) d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO))
} }
d d
} }
@ -190,7 +188,7 @@ object BioDBToOAF {
OafMapperUtils.structuredProperty(s, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null) OafMapperUtils.structuredProperty(s, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
).asJava) ).asJava)
} }
var i_date:Option[UniprotDate] = None var i_date: Option[UniprotDate] = None
if (dates.nonEmpty) { if (dates.nonEmpty) {
i_date = dates.find(d => d.date_info.contains("entry version")) i_date = dates.find(d => d.date_info.contains("entry version"))
@ -218,12 +216,12 @@ object BioDBToOAF {
if (references_pmid != null && references_pmid.nonEmpty) { if (references_pmid != null && references_pmid.nonEmpty) {
val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null) val rel = createRelation(references_pmid.head, "pmid", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
rel.getCollectedfrom rel.getCollectedfrom
List(d, rel) List(d, rel)
} }
else if (references_doi != null && references_doi.nonEmpty) { else if (references_doi != null && references_doi.nonEmpty) {
val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null) val rel = createRelation(references_doi.head, "doi", d.getId, collectedFromMap("uniprot"), ModelConstants.RELATIONSHIP, ModelConstants.IS_RELATED_TO, if (i_date.isDefined) i_date.get.date else null)
List(d, rel) List(d, rel)
} }
else else
@ -231,13 +229,12 @@ object BioDBToOAF {
} }
def generate_unresolved_id(pid: String, pidType: String): String = {
def generate_unresolved_id(pid:String, pidType:String) :String = {
s"unresolved::$pid::$pidType" s"unresolved::$pid::$pidType"
} }
def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType:String, relClass:String, date:String):Relation = { def createRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, subRelType: String, relClass: String, date: String): Relation = {
val rel = new Relation val rel = new Relation
rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava) rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava)
@ -251,7 +248,7 @@ object BioDBToOAF {
rel.setTarget(s"unresolved::$pid::$pidType") rel.setTarget(s"unresolved::$pid::$pidType")
val dateProps:KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date) val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava) rel.setProperties(List(dateProps).asJava)
@ -262,8 +259,8 @@ object BioDBToOAF {
} }
def createSupplementaryRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, date:String): Relation = { def createSupplementaryRelation(pid: String, pidType: String, sourceId: String, collectedFrom: KeyValue, date: String): Relation = {
createRelation(pid,pidType,sourceId,collectedFrom, ModelConstants.SUPPLEMENT, ModelConstants.IS_SUPPLEMENT_TO, date) createRelation(pid, pidType, sourceId, collectedFrom, ModelConstants.SUPPLEMENT, ModelConstants.IS_SUPPLEMENT_TO, date)
} }
@ -338,7 +335,7 @@ object BioDBToOAF {
def EBITargetLinksFilter(input: EBILinks): Boolean = { def EBITargetLinksFilter(input: EBILinks): Boolean = {
input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase("pdb") || input.targetPidType.equalsIgnoreCase("uniprot") input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase("pdb") || input.targetPidType.equalsIgnoreCase("uniprot")
} }

View File

@ -1,8 +1,9 @@
package eu.dnetlib.dhp.sx.graph.bio package eu.dnetlib.dhp.sx.bio
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result} import eu.dnetlib.dhp.schema.oaf.Oaf
import BioDBToOAF.ScholixResolved import BioDBToOAF.ScholixResolved
import eu.dnetlib.dhp.collection.CollectionUtils
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
@ -13,7 +14,7 @@ object SparkTransformBioDatabaseToOAF {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/bio_to_oaf_params.json"))) val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json")))
parser.parseArgument(args) parser.parseArgument(args)
val database: String = parser.get("database") val database: String = parser.get("database")
log.info("database: {}", database) log.info("database: {}", database)
@ -31,18 +32,17 @@ object SparkTransformBioDatabaseToOAF {
.master(parser.get("master")).getOrCreate() .master(parser.get("master")).getOrCreate()
val sc = spark.sparkContext val sc = spark.sparkContext
implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
import spark.implicits._ import spark.implicits._
database.toUpperCase() match { database.toUpperCase() match {
case "UNIPROT" => case "UNIPROT" =>
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).write.mode(SaveMode.Overwrite).save(targetPath) spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
case "PDB"=> case "PDB" =>
spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).write.mode(SaveMode.Overwrite).save(targetPath) spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
case "SCHOLIX" => case "SCHOLIX" =>
spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).write.mode(SaveMode.Overwrite).save(targetPath) spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
case "CROSSREF_LINKS"=> case "CROSSREF_LINKS" =>
spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).write.mode(SaveMode.Overwrite).save(targetPath) spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))).flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null).write.mode(SaveMode.Overwrite).save(targetPath)
} }
} }

View File

@ -1,9 +1,9 @@
package eu.dnetlib.dhp.sx.graph.ebi package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.oaf.Result import eu.dnetlib.dhp.schema.oaf.Result
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf} import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PubMedToOaf}
import eu.dnetlib.dhp.utils.ISLookupClientFactory import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration import org.apache.hadoop.conf.Configuration
@ -24,24 +24,24 @@ import scala.xml.pull.XMLEventReader
object SparkCreateBaselineDataFrame { object SparkCreateBaselineDataFrame {
def requestBaseLineUpdatePage(maxFile:String):List[(String,String)] = { def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = {
val data =requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/") val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
val result =data.lines.filter(l => l.startsWith("<a href=")).map{l => val result = data.lines.filter(l => l.startsWith("<a href=")).map { l =>
val end = l.lastIndexOf("\">") val end = l.lastIndexOf("\">")
val start = l.indexOf("<a href=\"") val start = l.indexOf("<a href=\"")
if (start>= 0 && end >start) if (start >= 0 && end > start)
l.substring(start+9, (end-start)) l.substring(start + 9, end - start)
else else
"" ""
}.filter(s =>s.endsWith(".gz") ).filter(s => s > maxFile).map(s => (s,s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s")).toList }.filter(s => s.endsWith(".gz")).filter(s => s > maxFile).map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s")).toList
result result
} }
def downloadBaselinePart(url:String):InputStream = { def downloadBaselinePart(url: String): InputStream = {
val r = new HttpGet(url) val r = new HttpGet(url)
val timeout = 60; // seconds val timeout = 60; // seconds
val config = RequestConfig.custom() val config = RequestConfig.custom()
@ -55,7 +55,7 @@ object SparkCreateBaselineDataFrame {
} }
def requestPage(url:String):String = { def requestPage(url: String): String = {
val r = new HttpGet(url) val r = new HttpGet(url)
val timeout = 60; // seconds val timeout = 60; // seconds
val config = RequestConfig.custom() val config = RequestConfig.custom()
@ -90,25 +90,21 @@ object SparkCreateBaselineDataFrame {
} }
def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = {
def downloadBaseLineUpdate(baselinePath:String, hdfsServerUri:String ):Unit = {
val conf = new Configuration val conf = new Configuration
conf.set("fs.defaultFS", hdfsServerUri) conf.set("fs.defaultFS", hdfsServerUri)
val fs = FileSystem.get(conf) val fs = FileSystem.get(conf)
val p = new Path(baselinePath) val p = new Path(baselinePath)
val files = fs.listFiles(p,false) val files = fs.listFiles(p, false)
var max_file = "" var max_file = ""
while (files.hasNext) { while (files.hasNext) {
val c = files.next() val c = files.next()
val data = c.getPath.toString val data = c.getPath.toString
val fileName = data.substring(data.lastIndexOf("/")+1) val fileName = data.substring(data.lastIndexOf("/") + 1)
if (fileName> max_file) if (fileName > max_file)
max_file = fileName max_file = fileName
} }
@ -118,11 +114,7 @@ object SparkCreateBaselineDataFrame {
val hdfsWritePath: Path = new Path(s"$baselinePath/${u._1}") val hdfsWritePath: Path = new Path(s"$baselinePath/${u._1}")
val fsDataOutputStream: FSDataOutputStream = fs.create(hdfsWritePath, true) val fsDataOutputStream: FSDataOutputStream = fs.create(hdfsWritePath, true)
val i = downloadBaselinePart(u._2) val i = downloadBaselinePart(u._2)
val buffer = Array.fill[Byte](1024)(0) IOUtils.copy(i, fsDataOutputStream)
while(i.read(buffer)>0) {
fsDataOutputStream.write(buffer)
}
i.close()
println(s"Downloaded ${u._2} into $baselinePath/${u._1}") println(s"Downloaded ${u._2} into $baselinePath/${u._1}")
fsDataOutputStream.close() fsDataOutputStream.close()
} }
@ -134,11 +126,11 @@ object SparkCreateBaselineDataFrame {
override def zero: PMArticle = new PMArticle override def zero: PMArticle = new PMArticle
override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = { override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
if (b != null && b.getPmid!= null) b else a._2 if (b != null && b.getPmid != null) b else a._2
} }
override def merge(b1: PMArticle, b2: PMArticle): PMArticle = { override def merge(b1: PMArticle, b2: PMArticle): PMArticle = {
if (b1 != null && b1.getPmid!= null) b1 else b2 if (b1 != null && b1.getPmid != null) b1 else b2
} }
@ -153,7 +145,7 @@ object SparkCreateBaselineDataFrame {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/baseline_to_oaf_params.json"))) val parser = new ArgumentApplicationParser(IOUtils.toString(SparkEBILinksToOaf.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json")))
parser.parseArgument(args) parser.parseArgument(args)
val isLookupUrl: String = parser.get("isLookupUrl") val isLookupUrl: String = parser.get("isLookupUrl")
log.info("isLookupUrl: {}", isLookupUrl) log.info("isLookupUrl: {}", isLookupUrl)
@ -166,6 +158,9 @@ object SparkCreateBaselineDataFrame {
val hdfsServerUri = parser.get("hdfsServerUri") val hdfsServerUri = parser.get("hdfsServerUri")
log.info("hdfsServerUri: {}", targetPath) log.info("hdfsServerUri: {}", targetPath)
val skipUpdate = parser.get("skipUpdate")
log.info("skipUpdate: {}", skipUpdate)
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl) val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
@ -175,32 +170,31 @@ object SparkCreateBaselineDataFrame {
.config(conf) .config(conf)
.appName(SparkEBILinksToOaf.getClass.getSimpleName) .appName(SparkEBILinksToOaf.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master")).getOrCreate()
import spark.implicits._
val sc = spark.sparkContext val sc = spark.sparkContext
import spark.implicits._
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle]) implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal]) implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor]) implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
implicit val resultEncoder: Encoder[Result] = Encoders.kryo(classOf[Result]) implicit val resultEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri) if (!"true".equalsIgnoreCase(skipUpdate)) {
downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri)
val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline",2000) val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000)
val ds:Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i =>{ val ds: Dataset[PMArticle] = spark.createDataset(k.filter(i => i._1.endsWith(".gz")).flatMap(i => {
val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes())) val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes()))
new PMParser(xml) new PMParser(xml)
}))
} )) ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1)
.agg(pmArticleAggregator.toColumn)
ds.map(p => (p.getPmid,p))(Encoders.tuple(Encoders.STRING, PMEncoder)).groupByKey(_._1) .map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
.agg(pmArticleAggregator.toColumn) }
.map(p => p._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/baseline_dataset")
val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle] val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle]
exported_dataset exported_dataset
.map(a => PubMedToOaf.convert(a, vocabularies)).as[Result] .map(a => PubMedToOaf.convert(a, vocabularies)).as[Result]
.filter(p => p!= null) .filter(p => p != null)
.write.mode(SaveMode.Overwrite).save(targetPath) .write.mode(SaveMode.Overwrite).save(targetPath)
} }
} }

View File

@ -1,8 +1,9 @@
package eu.dnetlib.dhp.sx.graph.ebi package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal}
import eu.dnetlib.dhp.sx.graph.bio.pubmed.{PMArticle, PMAuthor, PMJournal} import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
import eu.dnetlib.dhp.sx.bio.pubmed.PMJournal
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.http.client.config.RequestConfig import org.apache.http.client.config.RequestConfig
import org.apache.http.client.methods.HttpGet import org.apache.http.client.methods.HttpGet
@ -14,15 +15,15 @@ import org.slf4j.{Logger, LoggerFactory}
object SparkDownloadEBILinks { object SparkDownloadEBILinks {
def createEBILinks(pmid:Long):EBILinkItem = { def createEBILinks(pmid: Long): EBILinkItem = {
val res = requestLinks(pmid) val res = requestLinks(pmid)
if (res!=null) if (res != null)
return EBILinkItem(pmid, res) return EBILinkItem(pmid, res)
null null
} }
def requestPage(url:String):String = { def requestPage(url: String): String = {
val r = new HttpGet(url) val r = new HttpGet(url)
val timeout = 60; // seconds val timeout = 60; // seconds
val config = RequestConfig.custom() val config = RequestConfig.custom()
@ -56,16 +57,17 @@ object SparkDownloadEBILinks {
} }
} }
def requestLinks(PMID:Long):String = { def requestLinks(PMID: Long): String = {
requestPage(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json") requestPage(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json")
} }
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val MAX_ITEM_PER_PARTITION = 20000 val MAX_ITEM_PER_PARTITION = 20000
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_download_update.json"))) val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json")))
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
@ -76,39 +78,39 @@ object SparkDownloadEBILinks {
import spark.implicits._ import spark.implicits._
implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle]) implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle])
implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal]) implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal])
implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor]) implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor])
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath") log.info(s"sourcePath -> $sourcePath")
val workingPath = parser.get("workingPath") val workingPath = parser.get("workingPath")
log.info(s"workingPath -> $workingPath") log.info(s"workingPath -> $workingPath")
log.info("Getting max pubmedId where the links have been requested") log.info("Getting max pubmedId where the links have already requested")
val links:Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem] val links: Dataset[EBILinkItem] = spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem]
val lastPMIDRequested =links.map(l => l.id).select(max("value")).first.getLong(0) val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0)
log.info("Retrieving PMID to request links") log.info("Retrieving PMID to request links")
val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle] val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle]
pubmed.map(p => p.getPmid.toLong).where(s"value > $lastPMIDRequested").write.mode(SaveMode.Overwrite).save(s"$workingPath/id_to_request") pubmed.map(p => p.getPmid.toLong).where(s"value > $lastPMIDRequested").write.mode(SaveMode.Overwrite).save(s"$workingPath/id_to_request")
val pmidToReq:Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long] val pmidToReq: Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long]
val total = pmidToReq.count() val total = pmidToReq.count()
spark.createDataset(pmidToReq.rdd.repartition((total/MAX_ITEM_PER_PARTITION).toInt).map(pmid =>createEBILinks(pmid)).filter(l => l!= null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update") spark.createDataset(pmidToReq.rdd.repartition((total / MAX_ITEM_PER_PARTITION).toInt).map(pmid => createEBILinks(pmid)).filter(l => l != null)).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_update")
val updates:Dataset[EBILinkItem] =spark.read.load(s"$workingPath/links_update").as[EBILinkItem] val updates: Dataset[EBILinkItem] = spark.read.load(s"$workingPath/links_update").as[EBILinkItem]
links.union(updates).groupByKey(_.id) links.union(updates).groupByKey(_.id)
.reduceGroups{(x,y) => .reduceGroups { (x, y) =>
if (x == null || x.links ==null) if (x == null || x.links == null)
y y
if (y ==null || y.links ==null) if (y == null || y.links == null)
x x
if (x.links.length > y.links.length) if (x.links.length > y.links.length)
x x
else else
y y
}.map(_._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_final") }.map(_._2).write.mode(SaveMode.Overwrite).save(s"$workingPath/links_final")

View File

@ -1,21 +1,22 @@
package eu.dnetlib.dhp.sx.graph.ebi package eu.dnetlib.dhp.sx.bio.ebi
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Oaf import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.sx.graph.bio import eu.dnetlib.dhp.sx.bio.BioDBToOAF
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.EBILinkItem import BioDBToOAF.EBILinkItem
import eu.dnetlib.dhp.collection.CollectionUtils
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD import org.apache.spark.sql._
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
object SparkEBILinksToOaf { object SparkEBILinksToOaf {
def main(args: Array[String]): Unit = { def main(args: Array[String]): Unit = {
val log: Logger = LoggerFactory.getLogger(getClass) val log: Logger = LoggerFactory.getLogger(getClass)
val conf: SparkConf = new SparkConf() val conf: SparkConf = new SparkConf()
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/ebi/ebi_to_df_params.json"))) val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json")))
parser.parseArgument(args) parser.parseArgument(args)
val spark: SparkSession = val spark: SparkSession =
SparkSession SparkSession
@ -24,19 +25,20 @@ object SparkEBILinksToOaf {
.appName(SparkEBILinksToOaf.getClass.getSimpleName) .appName(SparkEBILinksToOaf.getClass.getSimpleName)
.master(parser.get("master")).getOrCreate() .master(parser.get("master")).getOrCreate()
import spark.implicits._
val sourcePath = parser.get("sourcePath") val sourcePath = parser.get("sourcePath")
log.info(s"sourcePath -> $sourcePath") log.info(s"sourcePath -> $sourcePath")
val targetPath = parser.get("targetPath") val targetPath = parser.get("targetPath")
log.info(s"targetPath -> $targetPath") log.info(s"targetPath -> $targetPath")
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
import spark.implicits._ val ebLinks: Dataset[EBILinkItem] = spark.read.load(sourcePath).as[EBILinkItem].filter(l => l.links != null && l.links.startsWith("{"))
implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
val ebLinks:Dataset[EBILinkItem] = spark.read.load(s"${sourcePath}_dataset").as[EBILinkItem].filter(l => l.links!= null) ebLinks.flatMap(j => BioDBToOAF.parse_ebi_links(j.links))
ebLinks.flatMap(j =>BioDBToOAF.parse_ebi_links(j.links))
.filter(p => BioDBToOAF.EBITargetLinksFilter(p)) .filter(p => BioDBToOAF.EBITargetLinksFilter(p))
.flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)) .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p))
.flatMap(i=> CollectionUtils.fixRelations(i)).filter(i => i != null)
.write.mode(SaveMode.Overwrite).save(targetPath) .write.mode(SaveMode.Overwrite).save(targetPath)
} }
} }

View File

@ -0,0 +1,253 @@
package eu.dnetlib.dhp.sx.bio.pubmed;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
/**
* This class represent an instance of Pubmed Article extracted from the native XML
*
* @author Sandro La Bruzzo
*/
public class PMArticle implements Serializable {
/**
* the Pubmed Identifier
*/
private String pmid;
/**
* the DOI
*/
private String doi;
/**
* the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
* All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
*/
private String date;
/**
* This is an 'envelop' element that contains various elements describing the journal cited; i.e., ISSN, Volume, Issue, and PubDate and author name(s), however, it does not contain data itself.
*/
private PMJournal journal;
/**
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. The NLM journal title abbreviation is exported in the <MedlineTA> element.
*/
private String title;
/**
* English-language abstracts are taken directly from the published article.
* If the article does not have a published abstract, the National Library of Medicine does not create one,
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
*/
private String description;
/**
* the language in which an article was published is recorded in <Language>.
* All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
* record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
* Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
*/
private String language;
/**
* NLM controlled vocabulary, Medical Subject Headings (MeSH®), is used to characterize the content of the articles represented by MEDLINE citations. *
*/
private final List<PMSubject> subjects = new ArrayList<>();
/**
* This element is used to identify the type of article indexed for MEDLINE;
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
*/
private final List<PMSubject> publicationTypes = new ArrayList<>();
/**
* Personal and collective (corporate) author names published with the article are found in <AuthorList>.
*/
private List<PMAuthor> authors = new ArrayList<>();
/**
* <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
* or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
*/
private final List<PMGrant> grants = new ArrayList<>();
/**
* get the DOI
* @return a DOI
*/
public String getDoi() {
return doi;
}
/**
* Set the DOI
* @param doi a DOI
*/
public void setDoi(String doi) {
this.doi = doi;
}
/**
* get the Pubmed Identifier
* @return the PMID
*/
public String getPmid() {
return pmid;
}
/**
* set the Pubmed Identifier
* @param pmid the Pubmed Identifier
*/
public void setPmid(String pmid) {
this.pmid = pmid;
}
/**
* the Pubmed Date extracted from <PubmedPubDate> Specifies a date significant to either the article's history or the citation's processing.
* All <History> dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s).
*
* @return the Pubmed Date
*/
public String getDate() {
return date;
}
/**
* Set the pubmed Date
* @param date
*/
public void setDate(String date) {
this.date = date;
}
/**
* The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element.
* Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles.
* The NLM journal title abbreviation is exported in the <MedlineTA> element.
*
* @return the pubmed Journal Extracted
*/
public PMJournal getJournal() {
return journal;
}
/**
* Set the mapped pubmed Journal
* @param journal
*/
public void setJournal(PMJournal journal) {
this.journal = journal;
}
/**
* English-language abstracts are taken directly from the published article.
* If the article does not have a published abstract, the National Library of Medicine does not create one,
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
*
* @return the extracted pubmed Title
*/
public String getTitle() {
return title;
}
/**
* set the pubmed title
* @param title
*/
public void setTitle(String title) {
this.title = title;
}
/**
* English-language abstracts are taken directly from the published article.
* If the article does not have a published abstract, the National Library of Medicine does not create one,
* thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally
* labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used.
*
* @return the Mapped Pubmed Article Abstracts
*/
public String getDescription() {
return description;
}
/**
* Set the Mapped Pubmed Article Abstracts
* @param description
*/
public void setDescription(String description) {
this.description = description;
}
/**
* Personal and collective (corporate) author names published with the article are found in <AuthorList>.
*
* @return get the Mapped Authors lists
*/
public List<PMAuthor> getAuthors() {
return authors;
}
/**
* Set the Mapped Authors lists
* @param authors
*/
public void setAuthors(List<PMAuthor> authors) {
this.authors = authors;
}
/**
* This element is used to identify the type of article indexed for MEDLINE;
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
*
* @return the mapped Subjects
*/
public List<PMSubject> getSubjects() {
return subjects;
}
/**
*
* the language in which an article was published is recorded in <Language>.
* All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single
* record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value.
* Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
*
* @return The mapped Language
*/
public String getLanguage() {
return language;
}
/**
*
* Set The mapped Language
*
* @param language the mapped Language
*/
public void setLanguage(String language) {
this.language = language;
}
/**
* This element is used to identify the type of article indexed for MEDLINE;
* it characterizes the nature of the information or the manner in which it is conveyed as well as the type of
* research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural).
*
* @return the mapped Publication Type
*/
public List<PMSubject> getPublicationTypes() {
return publicationTypes;
}
/**
* <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service
* or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations.
* @return the mapped grants
*/
public List<PMGrant> getGrants() {
return grants;
}
}

View File

@ -1,29 +1,59 @@
package eu.dnetlib.dhp.sx.graph.bio.pubmed; package eu.dnetlib.dhp.sx.bio.pubmed;
import java.io.Serializable; import java.io.Serializable;
/**
* The type Pubmed author.
*
* @author Sandro La Bruzzo
*/
public class PMAuthor implements Serializable { public class PMAuthor implements Serializable {
private String lastName; private String lastName;
private String foreName; private String foreName;
/**
* Gets last name.
*
* @return the last name
*/
public String getLastName() { public String getLastName() {
return lastName; return lastName;
} }
/**
* Sets last name.
*
* @param lastName the last name
*/
public void setLastName(String lastName) { public void setLastName(String lastName) {
this.lastName = lastName; this.lastName = lastName;
} }
/**
* Gets fore name.
*
* @return the fore name
*/
public String getForeName() { public String getForeName() {
return foreName; return foreName;
} }
/**
* Sets fore name.
*
* @param foreName the fore name
*/
public void setForeName(String foreName) { public void setForeName(String foreName) {
this.foreName = foreName; this.foreName = foreName;
} }
/**
* Gets full name.
*
* @return the full name
*/
public String getFullName() { public String getFullName() {
return String return String
.format("%s, %s", this.foreName != null ? this.foreName : "", this.lastName != null ? this.lastName : ""); .format("%s, %s", this.foreName != null ? this.foreName : "", this.lastName != null ? this.lastName : "");

View File

@ -0,0 +1,87 @@
package eu.dnetlib.dhp.sx.bio.pubmed;
/**
* The type Pm grant.
*
* @author Sandro La Bruzzo
*/
public class PMGrant {
private String grantID;
private String agency;
private String country;
/**
* Instantiates a new Pm grant.
*/
public PMGrant() {
}
/**
* Instantiates a new Pm grant.
*
* @param grantID the grant id
* @param agency the agency
* @param country the country
*/
public PMGrant(String grantID, String agency, String country) {
this.grantID = grantID;
this.agency = agency;
this.country = country;
}
/**
* Gets grant id.
*
* @return the grant id
*/
public String getGrantID() {
return grantID;
}
/**
* Sets grant id.
*
* @param grantID the grant id
*/
public void setGrantID(String grantID) {
this.grantID = grantID;
}
/**
* Gets agency.
*
* @return the agency
*/
public String getAgency() {
return agency;
}
/**
* Sets agency.
*
* @param agency the agency
*/
public void setAgency(String agency) {
this.agency = agency;
}
/**
* Gets country.
*
* @return the country
*/
public String getCountry() {
return country;
}
/**
* Sets country.
*
* @param country the country
*/
public void setCountry(String country) {
this.country = country;
}
}

View File

@ -1,8 +1,13 @@
package eu.dnetlib.dhp.sx.graph.bio.pubmed; package eu.dnetlib.dhp.sx.bio.pubmed;
import java.io.Serializable; import java.io.Serializable;
/**
* The type Pm journal.
*
* @author Sandro La Bruzzo
*/
public class PMJournal implements Serializable { public class PMJournal implements Serializable {
private String issn; private String issn;
@ -11,42 +16,92 @@ public class PMJournal implements Serializable {
private String date; private String date;
private String title; private String title;
/**
* Gets issn.
*
* @return the issn
*/
public String getIssn() { public String getIssn() {
return issn; return issn;
} }
/**
* Sets issn.
*
* @param issn the issn
*/
public void setIssn(String issn) { public void setIssn(String issn) {
this.issn = issn; this.issn = issn;
} }
/**
* Gets volume.
*
* @return the volume
*/
public String getVolume() { public String getVolume() {
return volume; return volume;
} }
/**
* Sets volume.
*
* @param volume the volume
*/
public void setVolume(String volume) { public void setVolume(String volume) {
this.volume = volume; this.volume = volume;
} }
/**
* Gets issue.
*
* @return the issue
*/
public String getIssue() { public String getIssue() {
return issue; return issue;
} }
/**
* Sets issue.
*
* @param issue the issue
*/
public void setIssue(String issue) { public void setIssue(String issue) {
this.issue = issue; this.issue = issue;
} }
/**
* Gets date.
*
* @return the date
*/
public String getDate() { public String getDate() {
return date; return date;
} }
/**
* Sets date.
*
* @param date the date
*/
public void setDate(String date) { public void setDate(String date) {
this.date = date; this.date = date;
} }
/**
* Gets title.
*
* @return the title
*/
public String getTitle() { public String getTitle() {
return title; return title;
} }
/**
* Sets title.
*
* @param title the title
*/
public void setTitle(String title) { public void setTitle(String title) {
this.title = title; this.title = title;
} }

View File

@ -1,7 +1,13 @@
package eu.dnetlib.dhp.sx.graph.bio.pubmed package eu.dnetlib.dhp.sx.bio.pubmed
import scala.xml.MetaData import scala.xml.MetaData
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader} import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}
/**
*
* @param xml
*/
class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] { class PMParser(xml:XMLEventReader) extends Iterator[PMArticle] {
var currentArticle:PMArticle = generateNextArticle() var currentArticle:PMArticle = generateNextArticle()

View File

@ -1,40 +1,83 @@
package eu.dnetlib.dhp.sx.graph.bio.pubmed; package eu.dnetlib.dhp.sx.bio.pubmed;
/**
* The type Pubmed subject.
*/
public class PMSubject { public class PMSubject {
private String value; private String value;
private String meshId; private String meshId;
private String registryNumber; private String registryNumber;
/**
* Instantiates a new Pm subject.
*/
public PMSubject() { public PMSubject() {
} }
/**
* Instantiates a new Pm subject.
*
* @param value the value
* @param meshId the mesh id
* @param registryNumber the registry number
*/
public PMSubject(String value, String meshId, String registryNumber) { public PMSubject(String value, String meshId, String registryNumber) {
this.value = value; this.value = value;
this.meshId = meshId; this.meshId = meshId;
this.registryNumber = registryNumber; this.registryNumber = registryNumber;
} }
/**
* Gets value.
*
* @return the value
*/
public String getValue() { public String getValue() {
return value; return value;
} }
/**
* Sets value.
*
* @param value the value
*/
public void setValue(String value) { public void setValue(String value) {
this.value = value; this.value = value;
} }
/**
* Gets mesh id.
*
* @return the mesh id
*/
public String getMeshId() { public String getMeshId() {
return meshId; return meshId;
} }
/**
* Sets mesh id.
*
* @param meshId the mesh id
*/
public void setMeshId(String meshId) { public void setMeshId(String meshId) {
this.meshId = meshId; this.meshId = meshId;
} }
/**
* Gets registry number.
*
* @return the registry number
*/
public String getRegistryNumber() { public String getRegistryNumber() {
return registryNumber; return registryNumber;
} }
/**
* Sets registry number.
*
* @param registryNumber the registry number
*/
public void setRegistryNumber(String registryNumber) { public void setRegistryNumber(String registryNumber) {
this.registryNumber = registryNumber; this.registryNumber = registryNumber;
} }

View File

@ -1,12 +1,16 @@
package eu.dnetlib.dhp.sx.graph.bio.pubmed package eu.dnetlib.dhp.sx.bio.pubmed
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf._
import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType} import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf._
import java.util.regex.Pattern
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import java.util.regex.Pattern
/**
*
*/
object PubMedToOaf { object PubMedToOaf {
val SUBJ_CLASS = "keywords" val SUBJ_CLASS = "keywords"
@ -14,8 +18,18 @@ object PubMedToOaf {
"pmid" -> "https://pubmed.ncbi.nlm.nih.gov/", "pmid" -> "https://pubmed.ncbi.nlm.nih.gov/",
"doi" -> "https://dx.doi.org/" "doi" -> "https://dx.doi.org/"
) )
val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
def cleanDoi(doi:String):String = {
/**
* Cleaning the DOI Applying regex in order to
* remove doi starting with URL
* @param doi input DOI
* @return cleaned DOI
*/
def cleanDoi(doi: String): String = {
val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$" val regex = "^10.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$"
@ -29,6 +43,15 @@ object PubMedToOaf {
null null
} }
/**
*
* Create an instance of class extends Result
* starting from OAF instanceType value
*
* @param cobjQualifier OAF instance type
* @param vocabularies All dnet vocabularies
* @return the correct instance
*/
def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = { def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid) val result_typologies = getVocabularyTerm(ModelConstants.DNET_RESULT_TYPOLOGIES, vocabularies, cobjQualifier.getClassid)
result_typologies.getClassid match { result_typologies.getClassid match {
@ -41,6 +64,12 @@ object PubMedToOaf {
} }
} }
/**
* Mapping the Pubmedjournal info into the OAF Journale
*
* @param j the pubmedJournal
* @return the OAF Journal
*/
def mapJournal(j: PMJournal): Journal = { def mapJournal(j: PMJournal): Journal = {
if (j == null) if (j == null)
return null return null
@ -48,6 +77,7 @@ object PubMedToOaf {
journal.setDataInfo(dataInfo) journal.setDataInfo(dataInfo)
journal.setName(j.getTitle) journal.setName(j.getTitle)
journal.setConferencedate(j.getDate)
journal.setVol(j.getVolume) journal.setVol(j.getVolume)
journal.setIssnPrinted(j.getIssn) journal.setIssnPrinted(j.getIssn)
journal.setIss(j.getIssue) journal.setIss(j.getIssue)
@ -56,69 +86,108 @@ object PubMedToOaf {
} }
/**
*
* Find vocabulary term into synonyms and term in the vocabulary
*
* @param vocabularyName the input vocabulary name
* @param vocabularies all the vocabularies
* @param term the term to find
*
* @return the cleaned term value
*/
def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = { def getVocabularyTerm(vocabularyName: String, vocabularies: VocabularyGroup, term: String): Qualifier = {
val a = vocabularies.getSynonymAsQualifier(vocabularyName, term) val a = vocabularies.getSynonymAsQualifier(vocabularyName, term)
val b = vocabularies.getTermAsQualifier(vocabularyName, term) val b = vocabularies.getTermAsQualifier(vocabularyName, term)
if (a == null) b else a if (a == null) b else a
} }
val dataInfo: DataInfo = OafMapperUtils.dataInfo(false, null, false, false, ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, "0.9")
val collectedFrom: KeyValue = OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
/**
* Map the Pubmed Article into the OAF instance
*
*
* @param article the pubmed articles
* @param vocabularies the vocabularies
* @return The OAF instance if the mapping did not fail
*/
def convert(article: PMArticle, vocabularies: VocabularyGroup): Result = { def convert(article: PMArticle, vocabularies: VocabularyGroup): Result = {
if (article.getPublicationTypes == null) if (article.getPublicationTypes == null)
return null return null
val i = new Instance
var pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
// MAP PMID into pid with classid = classname = pmid
val pidList: List[StructuredProperty] = List(OafMapperUtils.structuredProperty(article.getPmid, PidType.pmid.toString, PidType.pmid.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo))
if (pidList == null) if (pidList == null)
return null return null
var alternateIdentifier :StructuredProperty = null // MAP //ArticleId[./@IdType="doi"] into alternateIdentifier with classid = classname = doi
var alternateIdentifier: StructuredProperty = null
if (article.getDoi != null) { if (article.getDoi != null) {
val normalizedPid = cleanDoi(article.getDoi) val normalizedPid = cleanDoi(article.getDoi)
if (normalizedPid!= null) if (normalizedPid != null)
alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo) alternateIdentifier = OafMapperUtils.structuredProperty(normalizedPid, PidType.doi.toString, PidType.doi.toString, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, dataInfo)
} }
// INSTANCE MAPPING
//--------------------------------------------------------------------------------------
// If the article contains the typology Journal Article then we apply this type // If the article contains the typology Journal Article then we apply this type
//else We have to find a terms that match the vocabulary otherwise we discard it //else We have to find a terms that match the vocabulary otherwise we discard it
val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue)) val ja = article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
val pubmedInstance = new Instance
if (ja.isDefined) { if (ja.isDefined) {
val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue) val cojbCategory = getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
i.setInstancetype(cojbCategory) pubmedInstance.setInstancetype(cojbCategory)
} else { } else {
val i_type = article.getPublicationTypes.asScala val i_type = article.getPublicationTypes.asScala
.map(s => getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, s.getValue)) .map(s => getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, s.getValue))
.find(q => q != null) .find(q => q != null)
if (i_type.isDefined) if (i_type.isDefined)
i.setInstancetype(i_type.get) pubmedInstance.setInstancetype(i_type.get)
else else
return null return null
} }
val result = createResult(i.getInstancetype, vocabularies) val result = createResult(pubmedInstance.getInstancetype, vocabularies)
if (result == null) if (result == null)
return result return result
result.setDataInfo(dataInfo) result.setDataInfo(dataInfo)
i.setPid(pidList.asJava) pubmedInstance.setPid(pidList.asJava)
if (alternateIdentifier!= null) if (alternateIdentifier != null)
i.setAlternateIdentifier(List(alternateIdentifier).asJava) pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava)
result.setInstance(List(i).asJava) result.setInstance(List(pubmedInstance).asJava)
i.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection breakOut) pubmedInstance.getPid.asScala.filter(p => "pmid".equalsIgnoreCase(p.getQualifier.getClassid)).map(p => p.getValue)(collection.breakOut)
//CREATE URL From pmid
val urlLists: List[String] = pidList val urlLists: List[String] = pidList
.map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue)) .map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
.filter(t => t._1.nonEmpty) .filter(t => t._1.nonEmpty)
.map(t => t._1 + t._2) .map(t => t._1 + t._2)
if (urlLists != null) if (urlLists != null)
i.setUrl(urlLists.asJava) pubmedInstance.setUrl(urlLists.asJava)
i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
i.setCollectedfrom(collectedFrom) //ASSIGN DateofAcceptance
pubmedInstance.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
//ASSIGN COLLECTEDFROM
pubmedInstance.setCollectedfrom(collectedFrom)
result.setPid(pidList.asJava) result.setPid(pidList.asJava)
//END INSTANCE MAPPING
//--------------------------------------------------------------------------------------
// JOURNAL MAPPING
//--------------------------------------------------------------------------------------
if (article.getJournal != null && result.isInstanceOf[Publication]) if (article.getJournal != null && result.isInstanceOf[Publication])
result.asInstanceOf[Publication].setJournal(mapJournal(article.getJournal)) result.asInstanceOf[Publication].setJournal(mapJournal(article.getJournal))
result.setCollectedfrom(List(collectedFrom).asJava) result.setCollectedfrom(List(collectedFrom).asJava)
//END JOURNAL MAPPING
//--------------------------------------------------------------------------------------
// RESULT MAPPING
//--------------------------------------------------------------------------------------
result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)) result.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo))
if (article.getTitle == null || article.getTitle.isEmpty) if (article.getTitle == null || article.getTitle.isEmpty)
@ -136,7 +205,7 @@ object PubMedToOaf {
} }
val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection breakOut) val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => OafMapperUtils.structuredProperty(s.getValue, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, dataInfo))(collection.breakOut)
if (subjects != null) if (subjects != null)
result.setSubject(subjects.asJava) result.setSubject(subjects.asJava)
@ -148,7 +217,7 @@ object PubMedToOaf {
author.setFullname(a.getFullName) author.setFullname(a.getFullName)
author.setRank(index + 1) author.setRank(index + 1)
author author
}(collection breakOut) }(collection.breakOut)
if (authors != null && authors.nonEmpty) if (authors != null && authors.nonEmpty)
@ -158,6 +227,9 @@ object PubMedToOaf {
result.setId(article.getPmid) result.setId(article.getPmid)
// END RESULT MAPPING
//--------------------------------------------------------------------------------------
val id = IdentifierFactory.createIdentifier(result) val id = IdentifierFactory.createIdentifier(result)
if (article.getPmid.equalsIgnoreCase(id)) if (article.getPmid.equalsIgnoreCase(id))
return null return null

View File

@ -0,0 +1,33 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName":"out",
"paramLongName":"outputPath",
"paramDescription": "the output path",
"paramRequired": true
},
{
"paramName": "ssm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramRequired": false
},
{
"paramName": "hnn",
"paramLongName": "hdfsNameNode",
"paramDescription": "the path used to store the HostedByMap",
"paramRequired": true
},
{
"paramName": "cfn",
"paramLongName": "classForName",
"paramDescription": "the path used to store the HostedByMap",
"paramRequired": true
}
]

View File

@ -0,0 +1,30 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hiveJdbcUrl</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
</property>
<property>
<name>hiveDbName</name>
<value>openaire</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -0,0 +1,174 @@
<workflow-app name="UnresolvedEntities" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>fosPath</name>
<description>the input path of the resources to be extended</description>
</property>
<property>
<name>bipScorePath</name>
<description>the path where to find the bipFinder scores</description>
</property>
<property>
<name>outputPath</name>
<description>the path where to store the actionset</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="prepareInfo"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<fork name="prepareInfo">
<path start="prepareBip"/>
<path start="getFOS"/>
</fork>
<action name="prepareBip">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the unresolved from bip finder!</name>
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareBipFinder</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${bipScorePath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/prepared</arg>
</spark>
<ok to="join"/>
<error to="Kill"/>
</action>
<action name="getFOS">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.GetFOSData</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--sourcePath</arg><arg>${fosPath}</arg>
<arg>--outputPath</arg><arg>${workingDir}/input/fos</arg>
<arg>--classForName</arg><arg>eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel</arg>
</java>
<ok to="prepareFos"/>
<error to="Kill"/>
</action>
<action name="prepareFos">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the unresolved from FOS!</name>
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareFOSSparkJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/input/fos</arg>
<arg>--outputPath</arg><arg>${workingDir}/prepared</arg>
</spark>
<ok to="join"/>
<error to="Kill"/>
</action>
<join name="join" to="produceUnresolved"/>
<action name="produceUnresolved">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Saves the result produced for bip and fos by grouping results with the same id</name>
<class>eu.dnetlib.dhp.actionmanager.createunresolvedentities.SparkSaveUnresolved</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/prepared</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,20 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "sp",
"paramLongName": "sourcePath",
"paramDescription": "the URL from where to get the programme file",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
]

View File

@ -0,0 +1,20 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "sp",
"paramLongName": "sourcePath",
"paramDescription": "the URL from where to get the programme file",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "the path of the new ActionSet",
"paramRequired": true
}
]

View File

@ -1,95 +0,0 @@
<workflow-app name="Import_Datacite_and_transform_to_OAF" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>mainPath</name>
<description>the working path of Datacite stores</description>
</property>
<property>
<name>oafTargetPath</name>
<description>the target path where the OAF records are stored</description>
</property>
<property>
<name>isLookupUrl</name>
<description>The IS lookUp service endopoint</description>
</property>
<property>
<name>blocksize</name>
<value>100</value>
<description>The request block size</description>
</property>
<property>
<name>exportLinks</name>
<value>false</value>
<description>instructs the transformation phase to produce the links or not</description>
</property>
</parameters>
<start to="resume_from"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<decision name="resume_from">
<switch>
<case to="TransformDatacite">${wf:conf('resumeFrom') eq 'TransformDatacite'}</case>
<default to="ImportDatacite"/>
</switch>
</decision>
<action name="ImportDatacite">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ImportDatacite</name>
<class>eu.dnetlib.dhp.actionmanager.datacite.ImportDatacite</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--targetPath</arg><arg>${mainPath}/datacite_update</arg>
<arg>--dataciteDumpPath</arg><arg>${mainPath}/datacite_dump</arg>
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
<arg>--blocksize</arg><arg>${blocksize}</arg>
</spark>
<ok to="TransformDatacite"/>
<error to="Kill"/>
</action>
<action name="TransformDatacite">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>TransformJob</name>
<class>eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${mainPath}/datacite_dump</arg>
<arg>--targetPath</arg><arg>${oafTargetPath}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--exportLinks</arg><arg>${exportLinks}</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,84 +0,0 @@
<workflow-app name="Generate_Datacite_and_Crossref_dump_for_Scholexplorer" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>datacitePath</name>
<description>the path of Datacite spark dataset</description>
</property>
<property>
<name>isLookupUrl</name>
<description>The IS lookUp service endopoint</description>
</property>
<property>
<name>crossrefPath</name>
<description>the path of Crossref spark dataset</description>
</property>
<property>
<name>targetPath</name>
<description>the path of Crossref spark dataset</description>
</property>
</parameters>
<start to="ImportDatacite"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ImportDatacite">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>ImportDatacite</name>
<class>eu.dnetlib.dhp.actionmanager.datacite.GenerateDataciteDatasetSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${datacitePath}</arg>
<arg>--targetPath</arg><arg>${targetPath}/datacite_oaf</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--exportLinks</arg><arg>true</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="FilterCrossrefEntities"/>
<error to="Kill"/>
</action>
<action name="FilterCrossrefEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>FilterCrossrefEntities</name>
<class>eu.dnetlib.dhp.actionmanager.datacite.FilterCrossrefEntitiesSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${crossrefPath}</arg>
<arg>--targetPath</arg><arg>${targetPath}/crossref_oaf</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,25 @@
[
{
"paramName": "ip",
"paramLongName": "inputPath",
"paramDescription": "the zipped opencitations file",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "the working path",
"paramRequired": true
},
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "the hdfs name node",
"paramRequired": false
}, {
"paramName": "sdr",
"paramLongName": "shouldDuplicateRels",
"paramDescription": "the hdfs name node",
"paramRequired": false
}
]

View File

@ -0,0 +1,20 @@
[
{
"paramName": "if",
"paramLongName": "inputFile",
"paramDescription": "the zipped opencitations file",
"paramRequired": true
},
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramDescription": "the working path",
"paramRequired": true
},
{
"paramName": "hnn",
"paramLongName": "hdfsNameNode",
"paramDescription": "the hdfs name node",
"paramRequired": true
}
]

View File

@ -1,7 +1,4 @@
<configuration> <configuration>
<!-- OCEAN -->
<property> <property>
<name>jobTracker</name> <name>jobTracker</name>
<value>yarnRM</value> <value>yarnRM</value>
@ -10,41 +7,6 @@
<name>nameNode</name> <name>nameNode</name>
<value>hdfs://nameservice1</value> <value>hdfs://nameservice1</value>
</property> </property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<!-- GARR -->
<!-- <property>-->
<!-- <name>jobTracker</name>-->
<!-- <value>yarn</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>nameNode</name>-->
<!-- <value>hdfs://hadoop-rm1.garr-pa1.d4science.org:8020</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>hive_metastore_uris</name>-->
<!-- <value>thrift://hadoop-edge3.garr-pa1.d4science.org:9083</value>-->
<!-- </property>-->
<!-- <property>-->
<!-- <name>spark2YarnHistoryServerAddress</name>-->
<!-- <value>http://hadoop-rm2.garr-pa1.d4science.org:19888</value>-->
<!-- </property>-->
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property> <property>
<name>oozie.use.system.libpath</name> <name>oozie.use.system.libpath</name>
<value>true</value> <value>true</value>
@ -53,16 +15,44 @@
<name>oozie.action.sharelib.for.spark</name> <name>oozie.action.sharelib.for.spark</name>
<value>spark2</value> <value>spark2</value>
</property> </property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
<property>
<name>sparkExecutorNumber</name>
<value>4</value>
</property>
<property> <property>
<name>spark2EventLogDir</name> <name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value> <value>/user/spark/spark2ApplicationHistory</value>
</property> </property>
<property> <property>
<name>spark2ExtraListeners</name> <name>sparkDriverMemory</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value> <value>15G</value>
</property> </property>
<property> <property>
<name>spark2SqlQueryExecutionListeners</name> <name>sparkExecutorMemory</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value> <value>6G</value>
</property>
<property>
<name>sparkExecutorCores</name>
<value>1</value>
</property> </property>
</configuration> </configuration>

View File

@ -0,0 +1,2 @@
#!/bin/bash
for file in $(echo $1 | tr ";" "\n"); do curl -L $(echo $file | cut -d '@' -f 1 ) | hdfs dfs -put - $2/$(echo $file | cut -d '@' -f 2) ; done;

View File

@ -0,0 +1,91 @@
<workflow-app name="OpenCitations Integration" xmlns="uri:oozie:workflow:0.5">
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="download">${wf:conf('resumeFrom') eq 'DownloadDump'}</case>
<case to="extract">${wf:conf('resumeFrom') eq 'ExtractContent'}</case>
<default to="create_actionset"/> <!-- first action to be done when downloadDump is to be performed -->
</switch>
</decision>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="download">
<shell xmlns="uri:oozie:shell-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapred.job.queue.name</name>
<value>${queueName}</value>
</property>
</configuration>
<exec>download.sh</exec>
<argument>${filelist}</argument>
<argument>${workingPath}/Original</argument>
<env-var>HADOOP_USER_NAME=${wf:user()}</env-var>
<file>download.sh</file>
<capture-output/>
</shell>
<ok to="extract"/>
<error to="Kill"/>
</action>
<action name="extract">
<java>
<main-class>eu.dnetlib.dhp.actionmanager.opencitations.GetOpenCitationsRefs</main-class>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
<arg>--inputFile</arg><arg>${inputFile}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
</java>
<ok to="create_actionset"/>
<error to="Kill"/>
</action>
<action name="create_actionset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Produces the AS for OC</name>
<class>eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>--inputPath</arg><arg>${workingPath}/COCI</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,8 @@
[
{"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true},
{"paramName":"f", "paramLongName":"opencitationFile", "paramDescription": "the name of the file", "paramRequired": true},
{"paramName":"issm", "paramLongName":"isSparkSessionManaged", "paramDescription": "the name of the activities orcid file", "paramRequired": false},
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the name of the activities orcid file", "paramRequired": true}
]

View File

@ -1,46 +1,52 @@
<workflow-app name="Datacite_to_ActionSet_Workflow" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Collect_Datacite" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>sourcePath</name> <name>mainPath</name>
<description>the working path of Datacite stores</description> <description>the working path of Datacite stores</description>
</property> </property>
<property> <property>
<name>outputPath</name> <name>isLookupUrl</name>
<description>the path of Datacite ActionSet</description> <description>The IS lookUp service endopoint</description>
</property> </property>
<property>
<name>blocksize</name>
<value>100</value>
<description>The request block size</description>
</property>
</parameters> </parameters>
<start to="ExportDataset"/> <start to="ImportDatacite"/>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill> </kill>
<action name="ExportDataset"> <action name="ImportDatacite">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>ExportDataset</name> <name>ImportDatacite</name>
<class>eu.dnetlib.dhp.actionmanager.datacite.ExportActionSetJobNode</class> <class>eu.dnetlib.dhp.datacite.ImportDatacite</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar> <jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--targetPath</arg><arg>${mainPath}/datacite_update</arg>
<arg>--targetPath</arg><arg>${outputPath}</arg> <arg>--dataciteDumpPath</arg><arg>${mainPath}/datacite_dump</arg>
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--master</arg><arg>yarn-cluster</arg> <arg>--master</arg><arg>yarn-cluster</arg>
<arg>--blocksize</arg><arg>${blocksize}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -7,8 +7,8 @@
}, },
{ {
"paramName": "t", "paramName": "mo",
"paramLongName": "targetPath", "paramLongName": "mdstoreOutputVersion",
"paramDescription": "the target mdstore path", "paramDescription": "the target mdstore path",
"paramRequired": true "paramRequired": true
}, },

View File

@ -0,0 +1,126 @@
<workflow-app name="transform_Datacite" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>mainPath</name>
<description>the working path of Datacite stores</description>
</property>
<property>
<name>isLookupUrl</name>
<description>The IS lookUp service endopoint</description>
</property>
<property>
<name>mdStoreOutputId</name>
<description>the identifier of the cleaned MDStore</description>
</property>
<property>
<name>mdStoreManagerURI</name>
<description>the path of the cleaned mdstore</description>
</property>
</parameters>
<start to="StartTransaction"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="StartTransaction">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>NEW_VERSION</arg>
<arg>--mdStoreID</arg><arg>${mdStoreOutputId}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
<capture-output/>
</java>
<ok to="TransformJob"/>
<error to="EndReadRollBack"/>
</action>
<action name="TransformJob">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>TransformJob</name>
<class>eu.dnetlib.dhp.datacite.GenerateDataciteDatasetSpark</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${mainPath}/datacite_dump</arg>
<arg>--mdstoreOutputVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--exportLinks</arg><arg>true</arg>
<arg>--master</arg><arg>yarn-cluster</arg>
</spark>
<ok to="CommitVersion"/>
<error to="Kill"/>
</action>
<action name="CommitVersion">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>COMMIT</arg>
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="EndReadRollBack">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>READ_UNLOCK</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
<arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
<capture-output/>
</java>
<ok to="RollBack"/>
<error to="Kill"/>
</action>
<action name="RollBack">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>ROLLBACK</arg>
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
</java>
<ok to="Kill"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,51 @@
<workflow-app name="Transform_BioEntity_Workflow" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the PDB Database Working Path</description>
</property>
<property>
<name>database</name>
<description>the PDB Database Working Path</description>
</property>
<property>
<name>targetPath</name>
<description>the Target Working dir path</description>
</property>
</parameters>
<start to="ConvertDB"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="ConvertDB">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Convert Bio DB to OAF Dataset</name>
<class>eu.dnetlib.dhp.sx.bio.SparkTransformBioDatabaseToOAF</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.shuffle.partitions=2000
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--master</arg><arg>yarn</arg>
<arg>--dbPath</arg><arg>${sourcePath}</arg>
<arg>--database</arg><arg>${database}</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -3,5 +3,6 @@
{"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true}, {"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true}, {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the oaf path ", "paramRequired": true}, {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the oaf path ", "paramRequired": true},
{"paramName":"s", "paramLongName":"skipUpdate", "paramDescription": "skip update ", "paramRequired": false},
{"paramName":"h", "paramLongName":"hdfsServerUri", "paramDescription": "the working path ", "paramRequired": true} {"paramName":"h", "paramLongName":"hdfsServerUri", "paramDescription": "the working path ", "paramRequired": true}
] ]

View File

@ -0,0 +1,105 @@
<workflow-app name="Create EBI Dataset" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the Working Path</description>
</property>
<property>
<name>workingPath</name>
<description>the Working Path</description>
</property>
<property>
<name>targetPath</name>
<description>the OAF MDStore Path</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>resumeFrom</name>
<value>DownloadEBILinks</value>
<description>node to start</description>
</property>
</parameters>
<start to="resume_from"/>
<decision name="resume_from">
<switch>
<case to="DownloadEBILinks">${wf:conf('resumeFrom') eq 'DownloadEBILinks'}</case>
<case to="CreateEBIDataSet">${wf:conf('resumeFrom') eq 'CreateEBIDataSet'}</case>
<default to="DownloadEBILinks"/>
</switch>
</decision>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="DownloadEBILinks">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Incremental Download EBI Links</name>
<class>eu.dnetlib.dhp.sx.bio.ebi.SparkDownloadEBILinks</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.shuffle.partitions=2000
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="OverrideFolders"/>
<error to="Kill"/>
</action>
<action name="OverrideFolders">
<fs>
<delete path="${sourcePath}/ebi_links_dataset_old"/>
<move source="${sourcePath}/ebi_links_dataset" target="${sourcePath}/ebi_links_dataset_old"/>
<move source="${workingPath}/links_final" target="${sourcePath}/ebi_links_dataset"/>
</fs>
<ok to="CreateEBIDataSet"/>
<error to="Kill"/>
</action>
<action name="CreateEBIDataSet">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Create OAF DataSet</name>
<class>eu.dnetlib.dhp.sx.bio.ebi.SparkEBILinksToOaf</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=2000
${sparkExtraOPT}
</spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/ebi_links_dataset</arg>
<arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--master</arg><arg>yarn</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -1,17 +1,22 @@
<workflow-app name="Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Download_Transform_Pubmed_Workflow" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>baselineWorkingPath</name> <name>baselineWorkingPath</name>
<description>the Baseline Working Path</description> <description>the Baseline Working Path</description>
</property> </property>
<property>
<name>targetPath</name>
<description>the Target Path</description>
</property>
<property> <property>
<name>isLookupUrl</name> <name>isLookupUrl</name>
<description>The IS lookUp service endopoint</description> <description>The IS lookUp service endopoint</description>
</property> </property>
<property>
<name>targetPath</name>
<description>The target path</description>
</property>
<property>
<name>skipUpdate</name>
<value>false</value>
<description>The request block size</description>
</property>
</parameters> </parameters>
<start to="ConvertDataset"/> <start to="ConvertDataset"/>
@ -24,9 +29,9 @@
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Convert Baseline to Dataset</name> <name>Convert Baseline to OAF Dataset</name>
<class>eu.dnetlib.dhp.sx.graph.ebi.SparkCreateBaselineDataFrame</class> <class>eu.dnetlib.dhp.sx.bio.ebi.SparkCreateBaselineDataFrame</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-aggregation-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
@ -41,6 +46,8 @@
<arg>--targetPath</arg><arg>${targetPath}</arg> <arg>--targetPath</arg><arg>${targetPath}</arg>
<arg>--master</arg><arg>yarn</arg> <arg>--master</arg><arg>yarn</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg> <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--hdfsServerUri</arg><arg>${nameNode}</arg>
<arg>--skipUpdate</arg><arg>${skipUpdate}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -7,3 +7,6 @@ log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout. # A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
log4j.logger.org.apache.spark=FATAL
log4j.logger.org.spark_project=FATAL

View File

@ -0,0 +1,20 @@
##DHP-Aggregation
This module defines a set of oozie workflows for
1. the **collection** and **transformation** of metadata records.
2. the **integration** of new external information in the result
### Collection and Transformation
The workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
of each MDStore.
It defines [mappings](mappings.md) for transformation of different datasource (See mapping section).
### Integration of external information in the result
The workflows create new entity in the OpenAIRE format (OAF) which aim is to enrich the result already contained in the graph.
See integration section for more insight

View File

@ -0,0 +1,36 @@
DHP Aggregation - Integration method
=====================================
The integration method can be applied every time new information, which is not aggregated from the repositories
nor computed directly by OpenAIRE, should be added to the results of the graph.
The information integrated so far is:
1. Article impact measures
1. [Bip!Finder](https://dl.acm.org/doi/10.1145/3357384.3357850) scores
2. Result Subjects
1. Integration of Fields od Science and Techonology ([FOS](https://www.qnrf.org/en-us/FOS)) classification in
results subjects.
The method always consists in the creation of a new entity in the OpenAIRE format (OAF entity) containing only the id
and the element in the OAF model that should be used to map the information we want to integrate.
The id is set by using a particular encoding of the given PID
*unresolved:[pid]:[pidtype]*
where
1. *unresolved* is a constant value
2. *pid* is the persistent id value, e.g. 10.5281/zenodo.4707307
3. *pidtype* is the persistent id type, e.g. doi
Such entities are matched against those available in the graph using the result.instance.pid values.
This mechanism can be used to integrate enrichments produced as associated by a given PID.
If a match will be found with one of the results already in the graph that said result will be enriched with the information
present in the new OAF.
All the objects for which a match is not found are discarded.

View File

@ -0,0 +1,7 @@
##DHP-Aggregation
This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records.
Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
of each MDStore.

View File

@ -0,0 +1,18 @@
DHP Aggregation
===============
DHP-Aggregations contains different mappings from original data format into OAF Data Format,
which converge in the graph in different ways:
- Via Action Manager
- Direct in the MdStore on Hadoop
Below the list of the implemented mapping
Mappings
=======
1. [PubMed](pubmed.md)
2. [Datacite](datacite.md)

View File

@ -0,0 +1,66 @@
#Pubmed Mapping
This section describes the mapping implemented for [MEDLINE/PubMed](https://pubmed.ncbi.nlm.nih.gov/).
Collection
---------
The native data is collected from [ftp baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/) containing XML with
the following [schema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html)
Parsing
-------
The resposible class of parsing is [PMParser](/dnet-hadoop/scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser) that generates
an intermediate mapping of PubMed Article defined [here](/dnet-hadoop/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html)
Mapping
-------
The table below describes the mapping from the XML Native to the OAF mapping
| Xpath Source | Oaf Field | Notes |
| ----------- | ----------- | ----------- |
| //PMID | pid | classid = classname = pmid
| | **Instance Mapping** | |
|//PublicationType | InstanceType | If the article contains the typology **Journal Article** then we apply this type else We have to find a terms that match the vocabulary otherwise we discard it
|//PMID | instance/PID | Map the pmid also in the pid in the instance |
| //ArticleId[./@IdType="doi" | instance/alternateIdentifier |classid = classname = doi
|//PMID | instance/URL | prepend to the PMId the base url https://pubmed.ncbi.nlm.nih.gov/
| //PubmedPubDate | instance/Dateofacceptance | apply the function GraphCleaningFunctions.cleanDate before assign it
| FOR ALL INSTANCE | CollectedFrom | datasourceName: *Europe PubMed Central* DatasourceId:
| | **Journal Mapping** | |
|//Journal/PubDate| Journal/Conferencedate | map the date of the Journal
|//Journal/Title| Journal/Name | |
|//Journal/Volume| Journal/Vol | |
|//Journal/ISSN| Journal/issPrinted | |
|//Journal/Issue| Journal/Iss | |
| | **Publication Mapping** | |
| //PubmedPubDate | Dateofacceptance | apply the function GraphCleaningFunctions.cleanDate before assign it
| //Title | title | with qualifier ModelConstants.MAIN_TITLE_QUALIFIER
| //AbstractText | Description ||
|//Language| Language| cleaning vocabulary -> dnet:languages
|//DescriptorName| Subject | classId, className = keyword
| | **Author Mapping** | |
|//Author/LastName| author.Surname| |
|//Author/ForeName| author.Forename| |
|//Author/FullName| author.Forename| Concatenation of forname + lastName if exist |
|FOR ALL AUTHOR | author.rank| sequential number starting from 1|
#TODO
Missing item mapped

Binary file not shown.

After

Width:  |  Height:  |  Size: 21 KiB

View File

@ -0,0 +1,34 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu name="Documentation">
<item name="Mappings" href="mappings.html" collapse="true">
<item name="Pubmed" href="pubmed.html"/>
<item name="Datacite" href="datacite.html"/>
</item>
<item name="Integration" href="integration.html" collapse="true">
</item>
<item name="General Information" href="about.html"/>
<item name="JavaDoc" href="apidocs/" />
<item name="ScalaDoc" href="scaladocs/" />
</menu>
<menu ref="reports"/>
</body>
</project>

View File

@ -0,0 +1,250 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static org.junit.jupiter.api.Assertions.*;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.schema.oaf.Result;
public class PrepareTest {
private static final Logger log = LoggerFactory.getLogger(ProduceTest.class);
private static Path workingDir;
private static SparkSession spark;
private static LocalFileSystem fs;
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(PrepareTest.class.getSimpleName());
fs = FileSystem.getLocal(new Configuration());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(ProduceTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(PrepareTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
void bipPrepareTest() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json")
.getPath();
PrepareBipFinder
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"--outputPath", workingDir.toString() + "/work"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Result> tmp = sc
.textFile(workingDir.toString() + "/work/bip")
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
Assertions.assertEquals(86, tmp.count());
String doi1 = "unresolved::10.0000/096020199389707::doi";
Assertions.assertEquals(1, tmp.filter(r -> r.getId().equals(doi1)).count());
Assertions.assertEquals(3, tmp.filter(r -> r.getId().equals(doi1)).collect().get(0).getMeasures().size());
Assertions
.assertEquals(
"6.34596412687e-09", tmp
.filter(r -> r.getId().equals(doi1))
.collect()
.get(0)
.getMeasures()
.stream()
.filter(sl -> sl.getId().equals("influence"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
Assertions
.assertEquals(
"0.641151896994", tmp
.filter(r -> r.getId().equals(doi1))
.collect()
.get(0)
.getMeasures()
.stream()
.filter(sl -> sl.getId().equals("popularity_alt"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
Assertions
.assertEquals(
"2.33375102921e-09", tmp
.filter(r -> r.getId().equals(doi1))
.collect()
.get(0)
.getMeasures()
.stream()
.filter(sl -> sl.getId().equals("popularity"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
}
@Test
void getFOSFileTest() throws IOException, ClassNotFoundException {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/h2020_fos_sbs.csv")
.getPath();
final String outputPath = workingDir.toString() + "/fos.json";
new GetFOSData()
.doRewrite(
sourcePath, outputPath, "eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel",
'\t', fs);
BufferedReader in = new BufferedReader(
new InputStreamReader(fs.open(new org.apache.hadoop.fs.Path(outputPath))));
String line;
int count = 0;
while ((line = in.readLine()) != null) {
FOSDataModel fos = new ObjectMapper().readValue(line, FOSDataModel.class);
System.out.println(new ObjectMapper().writeValueAsString(fos));
count += 1;
}
assertEquals(38, count);
}
@Test
void fosPrepareTest() throws Exception {
final String sourcePath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json")
.getPath();
PrepareFOSSparkJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", sourcePath,
"-outputPath", workingDir.toString() + "/work"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Result> tmp = sc
.textFile(workingDir.toString() + "/work/fos")
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
String doi1 = "unresolved::10.3390/s18072310::doi";
assertEquals(50, tmp.count());
assertEquals(1, tmp.filter(row -> row.getId().equals(doi1)).count());
assertTrue(
tmp
.filter(r -> r.getId().equals(doi1))
.flatMap(r -> r.getSubject().iterator())
.map(sbj -> sbj.getValue())
.collect()
.contains("engineering and technology"));
assertTrue(
tmp
.filter(r -> r.getId().equals(doi1))
.flatMap(r -> r.getSubject().iterator())
.map(sbj -> sbj.getValue())
.collect()
.contains("nano-technology"));
assertTrue(
tmp
.filter(r -> r.getId().equals(doi1))
.flatMap(r -> r.getSubject().iterator())
.map(sbj -> sbj.getValue())
.collect()
.contains("nanoscience & nanotechnology"));
String doi = "unresolved::10.1111/1365-2656.12831::doi";
assertEquals(1, tmp.filter(row -> row.getId().equals(doi)).count());
assertTrue(
tmp
.filter(r -> r.getId().equals(doi))
.flatMap(r -> r.getSubject().iterator())
.map(sbj -> sbj.getValue())
.collect()
.contains("psychology and cognitive sciences"));
assertTrue(
tmp
.filter(r -> r.getId().equals(doi))
.flatMap(r -> r.getSubject().iterator())
.map(sbj -> sbj.getValue())
.collect()
.contains("social sciences"));
assertFalse(
tmp
.filter(r -> r.getId().equals(doi))
.flatMap(r -> r.getSubject().iterator())
.map(sbj -> sbj.getValue())
.collect()
.contains("NULL"));
}
}

View File

@ -0,0 +1,234 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
public class ProduceTest {
private static final Logger log = LoggerFactory.getLogger(ProduceTest.class);
private static Path workingDir;
private static SparkSession spark;
private static LocalFileSystem fs;
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String ID_PREFIX = "50|doi_________";
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(ProduceTest.class.getSimpleName());
fs = FileSystem.getLocal(new Configuration());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(ProduceTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(ProduceTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
void produceTest() throws Exception {
final String bipPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json")
.getPath();
PrepareBipFinder
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", bipPath,
"--outputPath", workingDir.toString() + "/work"
});
final String fosPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json")
.getPath();
PrepareFOSSparkJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", fosPath,
"-outputPath", workingDir.toString() + "/work"
});
SparkSaveUnresolved.main(new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", workingDir.toString() + "/work",
"-outputPath", workingDir.toString() + "/unresolved"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Result> tmp = sc
.textFile(workingDir.toString() + "/unresolved")
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
Assertions.assertEquals(135, tmp.count());
Assertions.assertEquals(1, tmp.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi")).count());
Assertions
.assertEquals(
3, tmp
.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertEquals(
3, tmp
.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi"))
.collect()
.get(0)
.getMeasures()
.size());
List<StructuredProperty> sbjs = tmp
.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi"))
.flatMap(row -> row.getSubject().iterator())
.collect();
sbjs.forEach(sbj -> Assertions.assertEquals("FOS", sbj.getQualifier().getClassid()));
sbjs
.forEach(
sbj -> Assertions
.assertEquals(
"Fields of Science and Technology classification", sbj.getQualifier().getClassname()));
sbjs
.forEach(
sbj -> Assertions
.assertEquals(ModelConstants.DNET_SUBJECT_TYPOLOGIES, sbj.getQualifier().getSchemeid()));
sbjs
.forEach(
sbj -> Assertions
.assertEquals(ModelConstants.DNET_SUBJECT_TYPOLOGIES, sbj.getQualifier().getSchemename()));
sbjs.forEach(sbj -> Assertions.assertEquals(false, sbj.getDataInfo().getDeletedbyinference()));
sbjs.forEach(sbj -> Assertions.assertEquals(true, sbj.getDataInfo().getInferred()));
sbjs.forEach(sbj -> Assertions.assertEquals(false, sbj.getDataInfo().getInvisible()));
sbjs.forEach(sbj -> Assertions.assertEquals("", sbj.getDataInfo().getTrust()));
sbjs.forEach(sbj -> Assertions.assertEquals("update", sbj.getDataInfo().getInferenceprovenance()));
sbjs
.forEach(
sbj -> Assertions.assertEquals("subject:fos", sbj.getDataInfo().getProvenanceaction().getClassid()));
sbjs
.forEach(
sbj -> Assertions
.assertEquals("Inferred by OpenAIRE", sbj.getDataInfo().getProvenanceaction().getClassname()));
sbjs
.forEach(
sbj -> Assertions
.assertEquals(
ModelConstants.DNET_PROVENANCE_ACTIONS, sbj.getDataInfo().getProvenanceaction().getSchemeid()));
sbjs
.forEach(
sbj -> Assertions
.assertEquals(
ModelConstants.DNET_PROVENANCE_ACTIONS,
sbj.getDataInfo().getProvenanceaction().getSchemename()));
sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("engineering and technology"));
sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("nano-technology"));
sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("nanoscience & nanotechnology"));
List<Measure> measures = tmp
.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi"))
.flatMap(row -> row.getMeasures().iterator())
.collect();
Assertions
.assertEquals(
"7.5597134689e-09", measures
.stream()
.filter(mes -> mes.getId().equals("influence"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
Assertions
.assertEquals(
"4.903880192", measures
.stream()
.filter(mes -> mes.getId().equals("popularity_alt"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
Assertions
.assertEquals(
"1.17977512835e-08", measures
.stream()
.filter(mes -> mes.getId().equals("popularity"))
.collect(Collectors.toList())
.get(0)
.getUnit()
.get(0)
.getValue());
Assertions
.assertEquals(
49, tmp
.filter(row -> !row.getId().equals("unresolved::10.3390/s18072310::doi"))
.filter(row -> row.getSubject() != null)
.count());
Assertions
.assertEquals(
85,
tmp
.filter(row -> !row.getId().equals("unresolved::10.3390/s18072310::doi"))
.filter(r -> r.getMeasures() != null)
.count());
}
}

View File

@ -1,57 +0,0 @@
package eu.dnetlib.dhp.actionmanager.datacite
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.databind.SerializationFeature
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{BeforeEach, Test}
import org.mockito.junit.jupiter.MockitoExtension
import java.text.SimpleDateFormat
import java.util.Locale
import scala.io.Source
@ExtendWith(Array(classOf[MockitoExtension]))
class DataciteToOAFTest extends AbstractVocabularyTest{
@BeforeEach
def setUp() :Unit = {
super.setUpVocabulary()
}
@Test
def testDateMapping:Unit = {
val inputDate = "2021-07-14T11:52:54+0000"
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
val dt = ISO8601FORMAT.parse(inputDate)
println(dt.getTime)
}
@Test
def testMapping() :Unit = {
val record =Source.fromInputStream(getClass.getResourceAsStream("record.json")).mkString
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies, true )
res.foreach(r => {
println (mapper.writeValueAsString(r))
println("----------------------------")
})
}
}

View File

@ -0,0 +1,335 @@
package eu.dnetlib.dhp.actionmanager.opencitations;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
public class CreateOpenCitationsASTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory
.getLogger(CreateOpenCitationsASTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files
.createTempDirectory(CreateOpenCitationsASTest.class.getSimpleName());
log.info("using work dir {}", workingDir);
SparkConf conf = new SparkConf();
conf.setAppName(CreateOpenCitationsASTest.class.getSimpleName());
conf.setMaster("local[*]");
conf.set("spark.driver.host", "localhost");
conf.set("hive.metastore.local", "true");
conf.set("spark.ui.enabled", "false");
conf.set("spark.sql.warehouse.dir", workingDir.toString());
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
spark = SparkSession
.builder()
.appName(CreateOpenCitationsASTest.class.getSimpleName())
.config(conf)
.getOrCreate();
}
@AfterAll
public static void afterAll() throws IOException {
FileUtils.deleteDirectory(workingDir.toFile());
spark.stop();
}
@Test
void testNumberofRelations() throws Exception {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
.getPath();
CreateActionSetSparkJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-shouldDuplicateRels",
Boolean.TRUE.toString(),
"-inputPath",
inputPath,
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
assertEquals(60, tmp.count());
// tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
}
@Test
void testNumberofRelations2() throws Exception {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
.getPath();
CreateActionSetSparkJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
assertEquals(44, tmp.count());
// tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
}
@Test
void testRelationsCollectedFrom() throws Exception {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
.getPath();
CreateActionSetSparkJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
tmp.foreach(r -> {
assertEquals(ModelConstants.OPENOCITATIONS_NAME, r.getCollectedfrom().get(0).getValue());
assertEquals(ModelConstants.OPENOCITATIONS_ID, r.getCollectedfrom().get(0).getKey());
});
}
@Test
void testRelationsDataInfo() throws Exception {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
.getPath();
CreateActionSetSparkJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
tmp.foreach(r -> {
assertEquals(false, r.getDataInfo().getInferred());
assertEquals(false, r.getDataInfo().getDeletedbyinference());
assertEquals("0.91", r.getDataInfo().getTrust());
assertEquals(
CreateActionSetSparkJob.OPENCITATIONS_CLASSID, r.getDataInfo().getProvenanceaction().getClassid());
assertEquals(
CreateActionSetSparkJob.OPENCITATIONS_CLASSNAME, r.getDataInfo().getProvenanceaction().getClassname());
assertEquals(ModelConstants.DNET_PROVENANCE_ACTIONS, r.getDataInfo().getProvenanceaction().getSchemeid());
assertEquals(ModelConstants.DNET_PROVENANCE_ACTIONS, r.getDataInfo().getProvenanceaction().getSchemename());
});
}
@Test
void testRelationsSemantics() throws Exception {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
.getPath();
CreateActionSetSparkJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
tmp.foreach(r -> {
assertEquals("citation", r.getSubRelType());
assertEquals("resultResult", r.getRelType());
});
assertEquals(22, tmp.filter(r -> r.getRelClass().equals("Cites")).count());
assertEquals(22, tmp.filter(r -> r.getRelClass().equals("IsCitedBy")).count());
}
@Test
void testRelationsSourceTargetPrefix() throws Exception {
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
.getPath();
CreateActionSetSparkJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
tmp.foreach(r -> {
assertEquals("50|doi_________::", r.getSource().substring(0, 17));
assertEquals("50|doi_________::", r.getTarget().substring(0, 17));
});
}
@Test
void testRelationsSourceTargetCouple() throws Exception {
final String doi1 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
final String doi2 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
final String doi3 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
final String doi4 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
final String doi5 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
final String doi6 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
String inputPath = getClass()
.getResource(
"/eu/dnetlib/dhp/actionmanager/opencitations/inputFiles")
.getPath();
CreateActionSetSparkJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
inputPath,
"-outputPath",
workingDir.toString() + "/actionSet"
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<Relation> tmp = sc
.sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
JavaRDD<Relation> check = tmp.filter(r -> r.getSource().equals(doi1) || r.getTarget().equals(doi1));
assertEquals(10, check.count());
check.foreach(r -> {
if (r.getSource().equals(doi2) || r.getSource().equals(doi3) || r.getSource().equals(doi4) ||
r.getSource().equals(doi5) || r.getSource().equals(doi6)) {
assertEquals(ModelConstants.IS_CITED_BY, r.getRelClass());
assertEquals(doi1, r.getTarget());
}
});
assertEquals(5, check.filter(r -> r.getSource().equals(doi1)).count());
check.filter(r -> r.getSource().equals(doi1)).foreach(r -> assertEquals(ModelConstants.CITES, r.getRelClass()));
}
}

View File

@ -0,0 +1,113 @@
package eu.dnetlib.dhp.datacite
import com.fasterxml.jackson.databind.{ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.apache.commons.io.FileUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.{col, count}
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.extension.ExtendWith
import org.junit.jupiter.api.{AfterEach, BeforeEach, Test}
import org.mockito.junit.jupiter.MockitoExtension
import org.slf4j.{Logger, LoggerFactory}
import java.nio.file.{Files, Path}
import java.text.SimpleDateFormat
import java.util.Locale
import scala.io.Source
import org.junit.jupiter.api.Assertions._
@ExtendWith(Array(classOf[MockitoExtension]))
class DataciteToOAFTest extends AbstractVocabularyTest{
private var workingDir:Path = null
val log: Logger = LoggerFactory.getLogger(getClass)
@BeforeEach
def setUp() :Unit = {
workingDir= Files.createTempDirectory(getClass.getSimpleName)
super.setUpVocabulary()
}
@AfterEach
def tearDown() :Unit = {
FileUtils.deleteDirectory(workingDir.toFile)
}
@Test
def testDateMapping:Unit = {
val inputDate = "2021-07-14T11:52:54+0000"
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
val dt = ISO8601FORMAT.parse(inputDate)
println(dt.getTime)
}
@Test
def testConvert(): Unit = {
val path = getClass.getResource("/eu/dnetlib/dhp/actionmanager/datacite/dataset").getPath
val conf = new SparkConf()
val spark:SparkSession = SparkSession.builder().config(conf)
.appName(getClass.getSimpleName)
.master("local[*]")
.getOrCreate()
implicit val oafEncoder:Encoder[Oaf] = Encoders.kryo[Oaf]
val instance = new GenerateDataciteDatasetSpark(null, null, log)
val targetPath = s"$workingDir/result"
instance.generateDataciteDataset(path, exportLinks = true, vocabularies,targetPath, spark)
import spark.implicits._
val nativeSize =spark.read.load(path).count()
assertEquals(100, nativeSize)
val result:Dataset[Oaf] = spark.read.load(targetPath).as[Oaf]
result.map(s => s.getClass.getSimpleName).groupBy(col("value").alias("class")).agg(count("value").alias("Total")).show(false)
val t = spark.read.load(targetPath).count()
assertTrue(t >0)
spark.stop()
}
@Test
def testMapping() :Unit = {
val record =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/record.json")).mkString
val mapper = new ObjectMapper().enable(SerializationFeature.INDENT_OUTPUT)
val res:List[Oaf] =DataciteToOAFTransformation.generateOAF(record, 0L,0L, vocabularies, true )
res.foreach(r => {
println (mapper.writeValueAsString(r))
println("----------------------------")
})
}
}

View File

@ -1,13 +1,10 @@
package eu.dnetlib.dhp.sx.graph.bio.pubmed package eu.dnetlib.dhp.sx.bio
import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature} import com.fasterxml.jackson.databind.{DeserializationFeature, ObjectMapper, SerializationFeature}
import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
import eu.dnetlib.dhp.schema.oaf.utils.{CleaningFunctions, OafMapperUtils, PidType}
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result} import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF.ScholixResolved import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
import eu.dnetlib.dhp.sx.graph.bio.BioDBToOAF import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PubMedToOaf}
import eu.dnetlib.dhp.sx.graph.bio.pubmed.PubMedToOaf.dataInfo
import eu.dnetlib.dhp.sx.graph.ebi.SparkDownloadEBILinks
import org.json4s.DefaultFormats import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse import org.json4s.jackson.JsonMethods.parse
@ -55,7 +52,7 @@ class BioScholixTest extends AbstractVocabularyTest{
@Test @Test
def testEBIData() = { def testEBIData() = {
val inputXML = Source.fromInputStream(getClass.getResourceAsStream("pubmed.xml")).mkString val inputXML = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed.xml")).mkString
val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes())) val xml = new XMLEventReader(Source.fromBytes(inputXML.getBytes()))
new PMParser(xml).foreach(s =>println(mapper.writeValueAsString(s))) new PMParser(xml).foreach(s =>println(mapper.writeValueAsString(s)))
} }
@ -65,7 +62,7 @@ class BioScholixTest extends AbstractVocabularyTest{
def testPubmedToOaf(): Unit = { def testPubmedToOaf(): Unit = {
assertNotNull(vocabularies) assertNotNull(vocabularies)
assertTrue(vocabularies.vocabularyExists("dnet:publication_resource")) assertTrue(vocabularies.vocabularyExists("dnet:publication_resource"))
val records:String =Source.fromInputStream(getClass.getResourceAsStream("pubmed_dump")).mkString val records:String =Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump")).mkString
val r:List[Oaf] = records.lines.toList.map(s=>mapper.readValue(s, classOf[PMArticle])).map(a => PubMedToOaf.convert(a, vocabularies)) val r:List[Oaf] = records.lines.toList.map(s=>mapper.readValue(s, classOf[PMArticle])).map(a => PubMedToOaf.convert(a, vocabularies))
assertEquals(10, r.size) assertEquals(10, r.size)
assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p))) assertTrue(r.map(p => p.asInstanceOf[Result]).flatMap(p => p.getInstance().asScala.map(i => i.getInstancetype.getClassid)).exists(p => "0037".equalsIgnoreCase(p)))

View File

@ -0,0 +1,86 @@
{"10.3390/s18072310": [{"id": "influence", "unit": [{"value": "7.5597134689e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "4.903880192", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "1.17977512835e-08", "key": "score"}]}]}
{"10.0000/096020199389707": [{"id": "influence", "unit": [{"value": "6.34596412687e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.641151896994", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "2.33375102921e-09", "key": "score"}]}]}
{"10.00000/jpmc.2017.106": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]}
{"10.0000/9781845416881": [{"id": "influence", "unit": [{"value": "5.96492048955e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "1.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "1.12641925838e-08", "key": "score"}]}]}
{"10.0000/anziamj.v0i0.266": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.76260934675e-10", "key": "score"}]}]}
{"10.0000/anziamj.v48i0.79": [{"id": "influence", "unit": [{"value": "6.93311506443e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.002176782336", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "1.7668105708e-09", "key": "score"}]}]}
{"10.0000/anziamj.v50i0.1472": [{"id": "influence", "unit": [{"value": "6.26777280882e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.406656", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.39745193285e-09", "key": "score"}]}]}
{"10.0000/cja5553": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/czastest.16": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
{"10.0000/czastest.17": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.47956715615e-09", "key": "score"}]}]}
{"10.0000/czastest.18": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.47956715615e-09", "key": "score"}]}]}
{"10.0000/czastest.20": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
{"10.0000/czastest.21": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.47956715615e-09", "key": "score"}]}]}
{"10.0000/czastest.28": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "3.47956715615e-09", "key": "score"}]}]}
{"10.0000/czastest.60": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
{"10.0000/czt.2019.1.2.15": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v4i02.36": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v4i02.37": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v4i02.38": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v5i01.32": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v6i01.24": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v6i01.27": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v6i02.41": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v6i02.44": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v7i01.40": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.01810569717e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v7i01.42": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v7i01.47": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v7i01.51": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v7i01.52": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v7i02.86": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v7i02.88": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v7i02.91": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v8i01.129": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v8i01.180": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]}
{"10.0000/geoekonomi.v8i01.87": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "4.65008652949e-09", "key": "score"}]}]}
{"10.0000/hbv2004w010": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hbv2101w001": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
{"10.0000/hbv2101w002": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
{"10.0000/hbv2101w003": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
{"10.0000/hbv2101w004": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
{"10.0000/hbv2101w005": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
{"10.0000/hbv2101w006": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
{"10.0000/hbv2101w007": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
{"10.0000/hbv2102w001": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
{"10.0000/hbv2102w010": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "9.88840807598e-09", "key": "score"}]}]}
{"10.0000/hoplos.v1i1.13207": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
{"10.0000/hoplos.v1i1.13208": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]}
{"10.0000/hoplos.v1i1.13209": [{"id": "influence", "unit": [{"value": "6.32078461509e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "1.6", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.3168486939e-09", "key": "score"}]}]}
{"10.0000/hoplos.v1i1.13210": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
{"10.0000/hoplos.v1i1.13211": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]}
{"10.0000/hoplos.v1i1.13212": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "5.39172290649e-09", "key": "score"}]}]}
{"10.0000/hoplos.v1i2.13231": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
{"10.0000/hoplos.v2i2.28782": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
{"10.0000/hoplos.v2i2.28783": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
{"10.0000/hoplos.v2i2.28784": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
{"10.0000/hoplos.v2i2.28786": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
{"10.0000/hoplos.v2i2.28787": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
{"10.0000/hoplos.v2i2.28788": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
{"10.0000/hoplos.v2i3.28234": [{"id": "influence", "unit": [{"value": "6.40470414877e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.6", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.89465099068e-09", "key": "score"}]}]}
{"10.0000/hoplos.v2i3.28236": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
{"10.0000/hoplos.v2i3.28238": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
{"10.0000/hoplos.v2i3.28239": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
{"10.0000/hoplos.v2i3.28242": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
{"10.0000/hoplos.v2i3.28243": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "6.26204125721e-09", "key": "score"}]}]}
{"10.0000/hoplos.v3i4.38186": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
{"10.0000/hoplos.v3i4.38187": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
{"10.0000/hoplos.v3i4.38190": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
{"10.0000/hoplos.v3i4.38207": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
{"10.0000/hoplos.v3i4.38209": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
{"10.0000/hoplos.v3i5.41163": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
{"10.0000/hoplos.v3i5.41166": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
{"10.0000/hoplos.v3i5.41167": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
{"10.0000/hoplos.v3i5.41168": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "7.28336930301e-09", "key": "score"}]}]}
{"10.0000/hoplos.v3i5.41229": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hoplos.v4i6.36360": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hoplos.v4i6.40796": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hoplos.v4i6.41153": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hoplos.v4i6.42511": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hoplos.v4i6.42555": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hoplos.v4i6.42752": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hoplos.v4i6.42768": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hoplos.v4i6.42795": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hoplos.v4i7.41295": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hoplos.v4i7.42830": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hoplos.v4i7.42861": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}
{"10.0000/hoplos.v4i7.43096": [{"id": "influence", "unit": [{"value": "5.91019644836e-09", "key": "score"}]}, {"id": "popularity_alt", "unit": [{"value": "0.0", "key": "score"}]}, {"id": "popularity", "unit": [{"value": "8.48190886761e-09", "key": "score"}]}]}

View File

@ -0,0 +1,38 @@
{"doi":"10.3390/s18072310","level1":"engineering and technology","level2":"nano-technology","level3":"nanoscience & nanotechnology"}
{"doi":"10.1111/1365-2656.12831\u000210.17863/cam.24369","level1":"social sciences","level2":"psychology and cognitive sciences","level3":"NULL"}
{"doi":"10.3929/ethz-b-000187584\u000210.1002/chem.201701644","level1":"natural sciences","level2":"NULL","level3":"NULL"}
{"doi":"10.1080/01913123.2017.1367361","level1":"medical and health sciences","level2":"clinical medicine","level3":"oncology & carcinogenesis"}
{"doi":"10.1051/e3sconf/20199207011","level1":"natural sciences","level2":"earth and related environmental sciences","level3":"environmental sciences"}
{"doi":"10.1038/onc.2015.333","level1":"medical and health sciences","level2":"clinical medicine","level3":"oncology & carcinogenesis"}
{"doi":"10.1093/mnras/staa256","level1":"natural sciences","level2":"physical sciences","level3":"NULL"}
{"doi":"10.1016/j.jclepro.2018.07.166","level1":"engineering and technology","level2":"other engineering and technologies","level3":"building & construction"}
{"doi":"10.1103/physrevlett.125.037403","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"}
{"doi":"10.1080/03602532.2017.1316285","level1":"natural sciences","level2":"NULL","level3":"NULL"}
{"doi":"10.1001/jamanetworkopen.2019.1868","level1":"medical and health sciences","level2":"other medical science","level3":"health policy & services"}
{"doi":"10.1128/mra.00874-18","level1":"natural sciences","level2":"biological sciences","level3":"plant biology & botany"}
{"doi":"10.1016/j.nancom.2018.03.001","level1":"engineering and technology","level2":"NULL","level3":"NULL"}
{"doi":"10.1112/topo.12174","level1":"natural sciences","level2":"NULL","level3":"NULL"}
{"doi":"10.12688/wellcomeopenres.15846.1","level1":"medical and health sciences","level2":"health sciences","level3":"NULL"}
{"doi":"10.21468/scipostphys.3.1.001","level1":"natural sciences","level2":"physical sciences","level3":"NULL"}
{"doi":"10.1088/1741-4326/ab6c77","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"}
{"doi":"10.1109/tpwrs.2019.2944747","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"electrical & electronic engineering"}
{"doi":"10.1016/j.expthermflusci.2019.109994\u000210.17863/cam.46212","level1":"engineering and technology","level2":"mechanical engineering","level3":"mechanical engineering & transports"}
{"doi":"10.1109/tc.2018.2860012","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"computer hardware & architecture"}
{"doi":"10.1002/mma.6622","level1":"natural sciences","level2":"mathematics","level3":"numerical & computational mathematics"}
{"doi":"10.1051/radiopro/2020020","level1":"natural sciences","level2":"chemical sciences","level3":"NULL"}
{"doi":"10.1007/s12268-019-1003-4","level1":"medical and health sciences","level2":"basic medicine","level3":"NULL"}
{"doi":"10.3390/cancers12010236","level1":"medical and health sciences","level2":"health sciences","level3":"biochemistry & molecular biology"}
{"doi":"10.6084/m9.figshare.9912614\u000210.6084/m9.figshare.9912614.v1\u000210.1080/00268976.2019.1665199","level1":"natural sciences","level2":"chemical sciences","level3":"physical chemistry"}
{"doi":"10.1175/jpo-d-17-0239.1","level1":"natural sciences","level2":"biological sciences","level3":"marine biology & hydrobiology"}
{"doi":"10.1007/s13218-020-00674-7","level1":"engineering and technology","level2":"industrial biotechnology","level3":"industrial engineering & automation"}
{"doi":"10.1016/j.psyneuen.2016.02.003\u000210.1016/j.psyneuen.2016.02.00310.7892/boris.78886\u000210.7892/boris.78886","level1":"medical and health sciences","level2":"basic medicine","level3":"NULL"}
{"doi":"10.1109/ted.2018.2813542","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"electrical & electronic engineering"}
{"doi":"10.3989/scimar.04739.25a","level1":"natural sciences","level2":"biological sciences","level3":"NULL"}
{"doi":"10.3390/su12187503","level1":"natural sciences","level2":"earth and related environmental sciences","level3":"NULL"}
{"doi":"10.1016/j.ccell.2018.08.017","level1":"medical and health sciences","level2":"basic medicine","level3":"biochemistry & molecular biology"}
{"doi":"10.1103/physrevresearch.2.023322","level1":"natural sciences","level2":"physical sciences","level3":"nuclear & particles physics"}
{"doi":"10.1039/c8cp03234c","level1":"natural sciences","level2":"NULL","level3":"NULL"}
{"doi":"10.5281/zenodo.3696557\u000210.5281/zenodo.3696556\u000210.1109/jsac.2016.2545384","level1":"engineering and technology","level2":"electrical engineering, electronic engineering, information engineering","level3":"networking & telecommunications"}
{"doi":"10.1038/ng.3667\u000210.1038/ng.3667.\u000210.17615/tct6-4m26\u000210.17863/cam.15649","level1":"medical and health sciences","level2":"health sciences","level3":"genetics & heredity"}
{"doi":"10.1016/j.jclepro.2019.119065","level1":"engineering and technology","level2":"other engineering and technologies","level3":"building & construction"}
{"doi":"10.1111/pce.13392","level1":"agricultural and veterinary sciences","level2":"agriculture, forestry, and fisheries","level3":"agronomy & agriculture"}

View File

@ -0,0 +1,38 @@
dedup_wf_001::ddcc7a56fa13e49bcc59c6bdd19ad26c 10.3390/s18072310 engineering and technology nano-technology nanoscience & nanotechnology
dedup_wf_001::b76062d56e28224eac56111a4e1e5ecf 10.1111/1365-2656.1283110.17863/cam.24369 social sciences psychology and cognitive sciences NULL
dedup_wf_001::bb752acb8f403a25fa7851a302f7b7ac 10.3929/ethz-b-00018758410.1002/chem.201701644 natural sciences NULL NULL
dedup_wf_001::2f1435a9201ecf5cbbcb12c9b2d971cd 10.1080/01913123.2017.1367361 medical and health sciences clinical medicine oncology & carcinogenesis
dedup_wf_001::fc9e47ec16c67b101724320d4b030514 10.1051/e3sconf/20199207011 natural sciences earth and related environmental sciences environmental sciences
dedup_wf_001::caa1e5b4de387cb31751552f4f0f5d72 10.1038/onc.2015.333 medical and health sciences clinical medicine oncology & carcinogenesis
dedup_wf_001::c2a98df5637d69bf0524eaf40fe6bf11 10.1093/mnras/staa256 natural sciences physical sciences NULL
dedup_wf_001::c221262bdc77cbfd59859a402f0e3991 10.1016/j.jclepro.2018.07.166 engineering and technology other engineering and technologies building & construction
doiboost____::d56d9dc21f317b3e009d5b6c8ea87212 10.1103/physrevlett.125.037403 natural sciences physical sciences nuclear & particles physics
dedup_wf_001::8a7269c8ee6470b2fb4fd384bc389e08 10.1080/03602532.2017.1316285 natural sciences NULL NULL
dedup_wf_001::28342ebbc19833e4e1f4a2b23cf5ee20 10.1001/jamanetworkopen.2019.1868 medical and health sciences other medical science health policy & services
dedup_wf_001::c1e1daf2b55dd9ec8e1c7c7458bbc7bc 10.1128/mra.00874-18 natural sciences biological sciences plant biology & botany
dedup_wf_001::a2ef4a2720c71907180750e5871298ef 10.1016/j.nancom.2018.03.001 engineering and technology NULL NULL
dedup_wf_001::676f46a31519e83a89efcb1c626286fb 10.1112/topo.12174 natural sciences NULL NULL
dedup_wf_001::6f2761642f1e39313388e2c4060657dd 10.12688/wellcomeopenres.15846.1 medical and health sciences health sciences NULL
dedup_wf_001::e414c1dec599521a9635a60de0f6755b 10.21468/scipostphys.3.1.001 natural sciences physical sciences NULL
dedup_wf_001::f3395fe0f330164ea424dc61c86c9a3d 10.1088/1741-4326/ab6c77 natural sciences physical sciences nuclear & particles physics
dedup_wf_001::a4f32a97a783117012f1de11797e73f2 10.1109/tpwrs.2019.2944747 engineering and technology electrical engineering, electronic engineering, information engineering electrical & electronic engineering
dedup_wf_001::313ae1cd083ae1696d12dd1909f97df8 10.1016/j.expthermflusci.2019.10999410.17863/cam.46212 engineering and technology mechanical engineering mechanical engineering & transports
dedup_wf_001::2a300a7d3ca7347791ebcef986bc0682 10.1109/tc.2018.2860012 engineering and technology electrical engineering, electronic engineering, information engineering computer hardware & architecture
doiboost____::5b79bd7bd9f87361b4a4abc3cbb2df75 10.1002/mma.6622 natural sciences mathematics numerical & computational mathematics
dedup_wf_001::6a3f61f217a2519fbaddea1094e3bfc2 10.1051/radiopro/2020020 natural sciences chemical sciences NULL
dedup_wf_001::a3f0430309a639f4234a0e57b10f2dee 10.1007/s12268-019-1003-4 medical and health sciences basic medicine NULL
dedup_wf_001::b6b8a3a1cccbee459cf3343485efdb12 10.3390/cancers12010236 medical and health sciences health sciences biochemistry & molecular biology
dedup_wf_001::dd06ee7974730e7b09a4f03c83b3f9bd 10.6084/m9.figshare.991261410.6084/m9.figshare.9912614.v110.1080/00268976.2019.1665199 natural sciences chemical sciences physical chemistry
dedup_wf_001::027c78bef6f972b5e26dfea55d30fbe3 10.1175/jpo-d-17-0239.1 natural sciences biological sciences marine biology & hydrobiology
dedup_wf_001::43edc179aa9e1fbaf582c5203b18b519 10.1007/s13218-020-00674-7 engineering and technology industrial biotechnology industrial engineering & automation
dedup_wf_001::e7770e11cd6eb514bb52c07b5a8a80f0 10.1016/j.psyneuen.2016.02.00310.1016/j.psyneuen.2016.02.00310.7892/boris.7888610.7892/boris.78886 medical and health sciences basic medicine NULL
dedup_wf_001::80bc15d69bdc589149631f3439dde5aa 10.1109/ted.2018.2813542 engineering and technology electrical engineering, electronic engineering, information engineering electrical & electronic engineering
dedup_wf_001::42c1cfa33e7872944b920cff90f4d99e 10.3989/scimar.04739.25a natural sciences biological sciences NULL
dedup_wf_001::9bacdbbaa9da3658b7243d5de8e3ce14 10.3390/su12187503 natural sciences earth and related environmental sciences NULL
dedup_wf_001::59e43d3527dcfecb6097fbd5740c8950 10.1016/j.ccell.2018.08.017 medical and health sciences basic medicine biochemistry & molecular biology
doiboost____::e024d1b738df3b24bc58fa0228542571 10.1103/physrevresearch.2.023322 natural sciences physical sciences nuclear & particles physics
dedup_wf_001::66e9a3237fa8178886d26d3c2d5b9e66 10.1039/c8cp03234c natural sciences NULL NULL
dedup_wf_001::83737ab4205bae751571bb3b166efa18 10.5281/zenodo.369655710.5281/zenodo.369655610.1109/jsac.2016.2545384 engineering and technology electrical engineering, electronic engineering, information engineering networking & telecommunications
dedup_wf_001::e3f892db413a689e572dd256acad55fe 10.1038/ng.366710.1038/ng.3667.10.17615/tct6-4m2610.17863/cam.15649 medical and health sciences health sciences genetics & heredity
dedup_wf_001::14ba594e8fd081847bc3f50f56335003 10.1016/j.jclepro.2019.119065 engineering and technology other engineering and technologies building & construction
dedup_wf_001::08ac7b33a41bcea2d055ecd8585d632e 10.1111/pce.13392 agricultural and veterinary sciences agriculture, forestry, and fisheries agronomy & agriculture
1 dedup_wf_001::ddcc7a56fa13e49bcc59c6bdd19ad26c 10.3390/s18072310 engineering and technology nano-technology nanoscience & nanotechnology
2 dedup_wf_001::b76062d56e28224eac56111a4e1e5ecf 10.1111/1365-2656.1283110.17863/cam.24369 social sciences psychology and cognitive sciences NULL
3 dedup_wf_001::bb752acb8f403a25fa7851a302f7b7ac 10.3929/ethz-b-00018758410.1002/chem.201701644 natural sciences NULL NULL
4 dedup_wf_001::2f1435a9201ecf5cbbcb12c9b2d971cd 10.1080/01913123.2017.1367361 medical and health sciences clinical medicine oncology & carcinogenesis
5 dedup_wf_001::fc9e47ec16c67b101724320d4b030514 10.1051/e3sconf/20199207011 natural sciences earth and related environmental sciences environmental sciences
6 dedup_wf_001::caa1e5b4de387cb31751552f4f0f5d72 10.1038/onc.2015.333 medical and health sciences clinical medicine oncology & carcinogenesis
7 dedup_wf_001::c2a98df5637d69bf0524eaf40fe6bf11 10.1093/mnras/staa256 natural sciences physical sciences NULL
8 dedup_wf_001::c221262bdc77cbfd59859a402f0e3991 10.1016/j.jclepro.2018.07.166 engineering and technology other engineering and technologies building & construction
9 doiboost____::d56d9dc21f317b3e009d5b6c8ea87212 10.1103/physrevlett.125.037403 natural sciences physical sciences nuclear & particles physics
10 dedup_wf_001::8a7269c8ee6470b2fb4fd384bc389e08 10.1080/03602532.2017.1316285 natural sciences NULL NULL
11 dedup_wf_001::28342ebbc19833e4e1f4a2b23cf5ee20 10.1001/jamanetworkopen.2019.1868 medical and health sciences other medical science health policy & services
12 dedup_wf_001::c1e1daf2b55dd9ec8e1c7c7458bbc7bc 10.1128/mra.00874-18 natural sciences biological sciences plant biology & botany
13 dedup_wf_001::a2ef4a2720c71907180750e5871298ef 10.1016/j.nancom.2018.03.001 engineering and technology NULL NULL
14 dedup_wf_001::676f46a31519e83a89efcb1c626286fb 10.1112/topo.12174 natural sciences NULL NULL
15 dedup_wf_001::6f2761642f1e39313388e2c4060657dd 10.12688/wellcomeopenres.15846.1 medical and health sciences health sciences NULL
16 dedup_wf_001::e414c1dec599521a9635a60de0f6755b 10.21468/scipostphys.3.1.001 natural sciences physical sciences NULL
17 dedup_wf_001::f3395fe0f330164ea424dc61c86c9a3d 10.1088/1741-4326/ab6c77 natural sciences physical sciences nuclear & particles physics
18 dedup_wf_001::a4f32a97a783117012f1de11797e73f2 10.1109/tpwrs.2019.2944747 engineering and technology electrical engineering, electronic engineering, information engineering electrical & electronic engineering
19 dedup_wf_001::313ae1cd083ae1696d12dd1909f97df8 10.1016/j.expthermflusci.2019.10999410.17863/cam.46212 engineering and technology mechanical engineering mechanical engineering & transports
20 dedup_wf_001::2a300a7d3ca7347791ebcef986bc0682 10.1109/tc.2018.2860012 engineering and technology electrical engineering, electronic engineering, information engineering computer hardware & architecture
21 doiboost____::5b79bd7bd9f87361b4a4abc3cbb2df75 10.1002/mma.6622 natural sciences mathematics numerical & computational mathematics
22 dedup_wf_001::6a3f61f217a2519fbaddea1094e3bfc2 10.1051/radiopro/2020020 natural sciences chemical sciences NULL
23 dedup_wf_001::a3f0430309a639f4234a0e57b10f2dee 10.1007/s12268-019-1003-4 medical and health sciences basic medicine NULL
24 dedup_wf_001::b6b8a3a1cccbee459cf3343485efdb12 10.3390/cancers12010236 medical and health sciences health sciences biochemistry & molecular biology
25 dedup_wf_001::dd06ee7974730e7b09a4f03c83b3f9bd 10.6084/m9.figshare.991261410.6084/m9.figshare.9912614.v110.1080/00268976.2019.1665199 natural sciences chemical sciences physical chemistry
26 dedup_wf_001::027c78bef6f972b5e26dfea55d30fbe3 10.1175/jpo-d-17-0239.1 natural sciences biological sciences marine biology & hydrobiology
27 dedup_wf_001::43edc179aa9e1fbaf582c5203b18b519 10.1007/s13218-020-00674-7 engineering and technology industrial biotechnology industrial engineering & automation
28 dedup_wf_001::e7770e11cd6eb514bb52c07b5a8a80f0 10.1016/j.psyneuen.2016.02.00310.1016/j.psyneuen.2016.02.00310.7892/boris.7888610.7892/boris.78886 medical and health sciences basic medicine NULL
29 dedup_wf_001::80bc15d69bdc589149631f3439dde5aa 10.1109/ted.2018.2813542 engineering and technology electrical engineering, electronic engineering, information engineering electrical & electronic engineering
30 dedup_wf_001::42c1cfa33e7872944b920cff90f4d99e 10.3989/scimar.04739.25a natural sciences biological sciences NULL
31 dedup_wf_001::9bacdbbaa9da3658b7243d5de8e3ce14 10.3390/su12187503 natural sciences earth and related environmental sciences NULL
32 dedup_wf_001::59e43d3527dcfecb6097fbd5740c8950 10.1016/j.ccell.2018.08.017 medical and health sciences basic medicine biochemistry & molecular biology
33 doiboost____::e024d1b738df3b24bc58fa0228542571 10.1103/physrevresearch.2.023322 natural sciences physical sciences nuclear & particles physics
34 dedup_wf_001::66e9a3237fa8178886d26d3c2d5b9e66 10.1039/c8cp03234c natural sciences NULL NULL
35 dedup_wf_001::83737ab4205bae751571bb3b166efa18 10.5281/zenodo.369655710.5281/zenodo.369655610.1109/jsac.2016.2545384 engineering and technology electrical engineering, electronic engineering, information engineering networking & telecommunications
36 dedup_wf_001::e3f892db413a689e572dd256acad55fe 10.1038/ng.366710.1038/ng.3667.10.17615/tct6-4m2610.17863/cam.15649 medical and health sciences health sciences genetics & heredity
37 dedup_wf_001::14ba594e8fd081847bc3f50f56335003 10.1016/j.jclepro.2019.119065 engineering and technology other engineering and technologies building & construction
38 dedup_wf_001::08ac7b33a41bcea2d055ecd8585d632e 10.1111/pce.13392 agricultural and veterinary sciences agriculture, forestry, and fisheries agronomy & agriculture

Some files were not shown because too many files have changed in this diff Show More