forked from antonis.lempesis/dnet-hadoop
Merge branch 'master' of code-repo.d4science.org:D-Net/dnet-hadoop
This commit is contained in:
commit
9a2d74ac82
|
@ -20,5 +20,5 @@
|
|||
/*/build
|
||||
/build
|
||||
spark-warehouse
|
||||
/*/*/job-override.properties
|
||||
/**/job-override.properties
|
||||
|
||||
|
|
|
@ -42,6 +42,10 @@
|
|||
<groupId>com.rabbitmq</groupId>
|
||||
<artifactId>amqp-client</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
package eu.dnetlib.dhp.utils.saxon;
|
||||
|
||||
import net.sf.saxon.expr.XPathContext;
|
||||
import net.sf.saxon.lib.ExtensionFunctionCall;
|
||||
import net.sf.saxon.lib.ExtensionFunctionDefinition;
|
||||
import net.sf.saxon.om.Sequence;
|
||||
import net.sf.saxon.om.StructuredQName;
|
||||
import net.sf.saxon.trans.XPathException;
|
||||
|
||||
public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition {
|
||||
|
||||
public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension";
|
||||
|
||||
public abstract String getName();
|
||||
public abstract Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException;
|
||||
|
||||
@Override
|
||||
public StructuredQName getFunctionQName() {
|
||||
return new StructuredQName("dnet", DEFAULT_SAXON_EXT_NS_URI, getName());
|
||||
}
|
||||
|
||||
@Override
|
||||
public ExtensionFunctionCall makeCallExpression() {
|
||||
return new ExtensionFunctionCall() {
|
||||
@Override
|
||||
public Sequence call(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||
return doCall(context, arguments);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,67 @@
|
|||
package eu.dnetlib.dhp.utils.saxon;
|
||||
|
||||
import net.sf.saxon.expr.XPathContext;
|
||||
import net.sf.saxon.om.Item;
|
||||
import net.sf.saxon.om.Sequence;
|
||||
import net.sf.saxon.trans.XPathException;
|
||||
import net.sf.saxon.value.SequenceType;
|
||||
import net.sf.saxon.value.StringValue;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Calendar;
|
||||
import java.util.GregorianCalendar;
|
||||
|
||||
public class ExtractYear extends AbstractExtensionFunction {
|
||||
|
||||
private static final String[] dateFormats = { "yyyy-MM-dd", "yyyy/MM/dd" };
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "extractYear";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||
if (arguments == null | arguments.length == 0) {
|
||||
return new StringValue("");
|
||||
}
|
||||
final Item item = arguments[0].head();
|
||||
if (item == null) {
|
||||
return new StringValue("");
|
||||
}
|
||||
return new StringValue(_year(item.getStringValue()));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMinimumNumberOfArguments() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMaximumNumberOfArguments() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType[] getArgumentTypes() {
|
||||
return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
|
||||
return SequenceType.SINGLE_STRING;
|
||||
}
|
||||
|
||||
private String _year(String s) {
|
||||
Calendar c = new GregorianCalendar();
|
||||
for (String format : dateFormats) {
|
||||
try {
|
||||
c.setTime(new SimpleDateFormat(format).parse(s));
|
||||
String year = String.valueOf(c.get(Calendar.YEAR));
|
||||
return year;
|
||||
} catch (ParseException e) {}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
package eu.dnetlib.dhp.utils.saxon;
|
||||
|
||||
import net.sf.saxon.expr.XPathContext;
|
||||
import net.sf.saxon.om.Sequence;
|
||||
import net.sf.saxon.trans.XPathException;
|
||||
import net.sf.saxon.value.SequenceType;
|
||||
import net.sf.saxon.value.StringValue;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
|
||||
public class NormalizeDate extends AbstractExtensionFunction {
|
||||
|
||||
private static final String[] normalizeDateFormats = { "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy" };
|
||||
|
||||
private static final String normalizeOutFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'");
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "normalizeDate";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||
if (arguments == null | arguments.length == 0) {
|
||||
return new StringValue("");
|
||||
}
|
||||
String s = arguments[0].head().getStringValue();
|
||||
return new StringValue(_year(s));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMinimumNumberOfArguments() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMaximumNumberOfArguments() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType[] getArgumentTypes() {
|
||||
return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
|
||||
return SequenceType.SINGLE_STRING;
|
||||
}
|
||||
|
||||
private String _year(String s) {
|
||||
final String date = s != null ? s.trim() : "";
|
||||
|
||||
for (String format : normalizeDateFormats) {
|
||||
try {
|
||||
Date parse = new SimpleDateFormat(format).parse(date);
|
||||
String res = new SimpleDateFormat(normalizeOutFormat).format(parse);
|
||||
return res;
|
||||
} catch (ParseException e) {}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
package eu.dnetlib.dhp.utils.saxon;
|
||||
|
||||
import net.sf.saxon.expr.XPathContext;
|
||||
import net.sf.saxon.om.Item;
|
||||
import net.sf.saxon.om.Sequence;
|
||||
import net.sf.saxon.trans.XPathException;
|
||||
import net.sf.saxon.value.SequenceType;
|
||||
import net.sf.saxon.value.StringValue;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
public class PickFirst extends AbstractExtensionFunction {
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return "pickFirst";
|
||||
}
|
||||
|
||||
@Override
|
||||
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
|
||||
if (arguments == null | arguments.length == 0) {
|
||||
return new StringValue("");
|
||||
}
|
||||
|
||||
final String s1 = getValue(arguments[0]);
|
||||
final String s2 = getValue(arguments[1]);
|
||||
|
||||
return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : "");
|
||||
}
|
||||
|
||||
private String getValue(final Sequence arg) throws XPathException {
|
||||
if (arg != null) {
|
||||
final Item item = arg.head();
|
||||
if (item != null) {
|
||||
return item.getStringValue();
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMinimumNumberOfArguments() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getMaximumNumberOfArguments() {
|
||||
return 2;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType[] getArgumentTypes() {
|
||||
return new SequenceType[] { SequenceType.OPTIONAL_ITEM };
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) {
|
||||
return SequenceType.SINGLE_STRING;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
package eu.dnetlib.dhp.utils.saxon;
|
||||
|
||||
import net.sf.saxon.Configuration;
|
||||
import net.sf.saxon.TransformerFactoryImpl;
|
||||
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerException;
|
||||
import javax.xml.transform.stream.StreamSource;
|
||||
import java.io.StringReader;
|
||||
|
||||
public class SaxonTransformerFactory {
|
||||
|
||||
/**
|
||||
* Creates the index record transformer from the given XSLT
|
||||
* @param xslt
|
||||
* @return
|
||||
* @throws TransformerException
|
||||
*/
|
||||
public static Transformer newInstance(final String xslt) throws TransformerException {
|
||||
|
||||
final TransformerFactoryImpl factory = new TransformerFactoryImpl();
|
||||
final Configuration conf = factory.getConfiguration();
|
||||
conf.registerExtensionFunction(new ExtractYear());
|
||||
conf.registerExtensionFunction(new NormalizeDate());
|
||||
conf.registerExtensionFunction(new PickFirst());
|
||||
|
||||
return factory.newTransformer(new StreamSource(new StringReader(xslt)));
|
||||
}
|
||||
|
||||
}
|
|
@ -1,3 +1,11 @@
|
|||
Description of the project
|
||||
--------------------------
|
||||
This project defines **serialization schemas** of Avro data store files that are used to pass data between workflow nodes in the system.
|
||||
This project defines **object schemas** of the OpenAIRE main entities and the relationships that intercur among them.
|
||||
Namely it defines the model for
|
||||
|
||||
- **research product (result)** which subclasses in publication, dataset, other research product, software
|
||||
- **data source** object describing the data provider (institutional repository, aggregators, cris systems)
|
||||
- **organization** research bodies managing a data source or participating to a research project
|
||||
- **project** research project
|
||||
|
||||
Te serialization of such objects (data store files) are used to pass data between workflow nodes in the processing pipeline.
|
||||
|
|
|
@ -26,6 +26,11 @@
|
|||
<artifactId>commons-lang3</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
|
|
|
@ -40,9 +40,9 @@ public class Datasource extends OafEntity implements Serializable {
|
|||
|
||||
private List<Field<String>> odlanguages;
|
||||
|
||||
private List< Field<String>> odcontenttypes;
|
||||
private List<Field<String>> odcontenttypes;
|
||||
|
||||
private List< Field<String>> accessinfopackage;
|
||||
private List<Field<String>> accessinfopackage;
|
||||
|
||||
// re3data fields
|
||||
private Field<String> releasestartdate;
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -36,7 +37,7 @@ public class GeoLocation implements Serializable {
|
|||
this.place = place;
|
||||
}
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
public boolean isBlank() {
|
||||
return StringUtils.isBlank(point) &&
|
||||
StringUtils.isBlank(box) &&
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -40,6 +41,7 @@ public class KeyValue implements Serializable {
|
|||
return isBlank()?"":String.format("%s::%s", key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : "");
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public boolean isBlank() {
|
||||
return StringUtils.isBlank(key) && StringUtils.isBlank(value);
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -50,6 +51,8 @@ public class Qualifier implements Serializable {
|
|||
schemeid != null ? schemeid : "",
|
||||
schemename != null ? schemename : "");
|
||||
}
|
||||
|
||||
@JsonIgnore
|
||||
public boolean isBlank() {
|
||||
return StringUtils.isBlank(classid) &&
|
||||
StringUtils.isBlank(classname) &&
|
||||
|
|
|
@ -113,7 +113,7 @@
|
|||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<ok to="ExtractPublication"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
|
|
@ -1,53 +1,43 @@
|
|||
package eu.dnetlib.dhp.graph;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkGraphImporterJob {
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
final SparkSession spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkGraphImporterJob.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.config("hive.metastore.uris", parser.get("hive_metastore_uris"))
|
||||
.enableHiveSupport()
|
||||
.getOrCreate();
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkGraphImporterJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_graph_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
final SparkSession spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkGraphImporterJob.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.config("hive.metastore.uris", parser.get("hive_metastore_uris"))
|
||||
.enableHiveSupport()
|
||||
.getOrCreate();
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
final String hiveDbName = parser.get("hive_db_name");
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
final String hiveDbName = parser.get("hive_db_name");
|
||||
|
||||
spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName));
|
||||
spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName));
|
||||
|
||||
// Read the input file and convert it into RDD of serializable object
|
||||
GraphMappingUtils.types.forEach((name, clazz) -> {
|
||||
final JavaRDD<Tuple2<String, String>> inputRDD = sc.sequenceFile(inputPath + "/" + name, Text.class, Text.class)
|
||||
.map(item -> new Tuple2<>(item._1.toString(), item._2.toString()));
|
||||
// Read the input file and convert it into RDD of serializable object
|
||||
GraphMappingUtils.types.forEach((name, clazz) -> {
|
||||
spark.createDataset(sc.sequenceFile(inputPath + "/" + name, Text.class, Text.class)
|
||||
.map(s -> new ObjectMapper().readValue(s._2().toString(), clazz))
|
||||
.rdd(), Encoders.bean(clazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.saveAsTable(hiveDbName + "." + name);
|
||||
});
|
||||
|
||||
spark.createDataset(inputRDD
|
||||
.filter(s -> s._1().equals(clazz.getName()))
|
||||
.map(Tuple2::_2)
|
||||
.map(s -> new ObjectMapper().readValue(s, clazz))
|
||||
.rdd(), Encoders.bean(clazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.saveAsTable(hiveDbName + "." + name);
|
||||
});
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
sparkDriverMemory=8G
|
||||
sparkExecutorMemory=8G
|
||||
#isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp
|
||||
isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl
|
||||
sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03
|
||||
outputPath=/tmp/openaire_provision
|
||||
format=TMF
|
||||
batchSize=2000
|
||||
sparkExecutorCoresForIndexing=64
|
||||
reuseRecords=true
|
|
@ -0,0 +1,92 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.1.6-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>dhp-graph-provision</artifactId>
|
||||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>dom4j</groupId>
|
||||
<artifactId>dom4j</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>jaxen</groupId>
|
||||
<artifactId>jaxen</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.mycila.xmltool</groupId>
|
||||
<artifactId>xmltool</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.antlr</groupId>
|
||||
<artifactId>stringtemplate</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.solr</groupId>
|
||||
<artifactId>solr-solrj</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.lucidworks.spark</groupId>
|
||||
<artifactId>spark-solr</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpmime</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.noggit</groupId>
|
||||
<artifactId>noggit</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.zookeeper</groupId>
|
||||
<artifactId>zookeeper</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.cxf</groupId>
|
||||
<artifactId>cxf-rt-transports-http</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>cnr-rmi-api</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
|
@ -0,0 +1,257 @@
|
|||
package eu.dnetlib.dhp.graph;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.jayway.jsonpath.DocumentContext;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
import eu.dnetlib.dhp.graph.model.*;
|
||||
import eu.dnetlib.dhp.graph.utils.ContextMapper;
|
||||
import eu.dnetlib.dhp.graph.utils.GraphMappingUtils;
|
||||
import eu.dnetlib.dhp.graph.utils.XmlRecordFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.PairFunction;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.asRelatedEntity;
|
||||
|
||||
/**
|
||||
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects.
|
||||
* The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization,
|
||||
* and all the possible relationships (similarity links produced by the Dedup process are excluded).
|
||||
*
|
||||
* The operation is implemented creating the union between the entity types (E), joined by the relationships (R), and again
|
||||
* by E, finally grouped by E.id;
|
||||
*
|
||||
* Different manipulations of the E and R sets are introduced to reduce the complexity of the operation
|
||||
* 1) treat the object payload as string, extracting only the necessary information beforehand using json path,
|
||||
* it seems that deserializing it with jackson's object mapper has higher memory footprint.
|
||||
*
|
||||
* 2) only consider rels that are not virtually deleted ($.dataInfo.deletedbyinference == false)
|
||||
* 3) we only need a subset of fields from the related entities, so we introduce a distinction between E_source = S
|
||||
* and E_target = T. Objects in T are heavily pruned by all the unnecessary information
|
||||
*
|
||||
* 4) perform the join as (((T join R) union S) groupby S.id) yield S -> [ <T, R> ]
|
||||
*/
|
||||
public class GraphJoiner implements Serializable {
|
||||
|
||||
public static final int MAX_RELS = 100;
|
||||
|
||||
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
|
||||
|
||||
private SparkSession spark;
|
||||
|
||||
private ContextMapper contextMapper;
|
||||
|
||||
private String inputPath;
|
||||
|
||||
private String outPath;
|
||||
|
||||
public GraphJoiner(SparkSession spark, ContextMapper contextMapper, String inputPath, String outPath) {
|
||||
this.spark = spark;
|
||||
this.contextMapper = contextMapper;
|
||||
this.inputPath = inputPath;
|
||||
this.outPath = outPath;
|
||||
}
|
||||
|
||||
public GraphJoiner adjacencyLists() {
|
||||
final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext());
|
||||
|
||||
// read each entity
|
||||
JavaPairRDD<String, TypedRow> datasource = readPathEntity(sc, getInputPath(), "datasource");
|
||||
JavaPairRDD<String, TypedRow> organization = readPathEntity(sc, getInputPath(), "organization");
|
||||
JavaPairRDD<String, TypedRow> project = readPathEntity(sc, getInputPath(), "project");
|
||||
JavaPairRDD<String, TypedRow> dataset = readPathEntity(sc, getInputPath(), "dataset");
|
||||
JavaPairRDD<String, TypedRow> otherresearchproduct = readPathEntity(sc, getInputPath(), "otherresearchproduct");
|
||||
JavaPairRDD<String, TypedRow> software = readPathEntity(sc, getInputPath(), "software");
|
||||
JavaPairRDD<String, TypedRow> publication = readPathEntity(sc, getInputPath(), "publication");
|
||||
|
||||
// create the union between all the entities
|
||||
final String entitiesPath = getOutPath() + "/entities";
|
||||
datasource
|
||||
.union(organization)
|
||||
.union(project)
|
||||
.union(dataset)
|
||||
.union(otherresearchproduct)
|
||||
.union(software)
|
||||
.union(publication)
|
||||
.map(e -> new EntityRelEntity().setSource(e._2()))
|
||||
.map(GraphMappingUtils::serialize)
|
||||
.saveAsTextFile(entitiesPath, GzipCodec.class);
|
||||
|
||||
JavaPairRDD<String, EntityRelEntity> entities = sc.textFile(entitiesPath)
|
||||
.map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class))
|
||||
.mapToPair(t -> new Tuple2<>(t.getSource().getSourceId(), t));
|
||||
|
||||
// reads the relationships
|
||||
final JavaPairRDD<String, EntityRelEntity> relation = readPathRelation(sc, getInputPath())
|
||||
.filter(r -> !r.getDeleted()) //only consider those that are not virtually deleted
|
||||
.map(p -> new EntityRelEntity().setRelation(p))
|
||||
.mapToPair(p -> new Tuple2<>(p.getRelation().getSourceId(), p))
|
||||
.groupByKey()
|
||||
.map(p -> Iterables.limit(p._2(), MAX_RELS))
|
||||
.flatMap(p -> p.iterator())
|
||||
.mapToPair(p -> new Tuple2<>(p.getRelation().getTargetId(), p));
|
||||
|
||||
//final String bySource = getOutPath() + "/1_join_by_target";
|
||||
JavaPairRDD<String, EntityRelEntity> bySource = relation
|
||||
.join(entities
|
||||
.filter(e -> !e._2().getSource().getDeleted())
|
||||
.mapToPair(e -> new Tuple2<>(e._1(), asRelatedEntity(e._2()))))
|
||||
.map(s -> new EntityRelEntity()
|
||||
.setRelation(s._2()._1().getRelation())
|
||||
.setTarget(s._2()._2().getSource()))
|
||||
.mapToPair(t -> new Tuple2<>(t.getRelation().getSourceId(), t));
|
||||
|
||||
final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, false, schemaLocation, new HashSet<>());
|
||||
entities
|
||||
.union(bySource)
|
||||
.groupByKey() // by source id
|
||||
.map(l -> toJoinedEntity(l))
|
||||
.mapToPair(je -> new Tuple2<>(
|
||||
new Text(je.getEntity().getId()),
|
||||
new Text(recordFactory.build(je))))
|
||||
.saveAsHadoopFile(getOutPath() + "/xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public GraphJoiner asXML() {
|
||||
final JavaSparkContext sc = new JavaSparkContext(getSpark().sparkContext());
|
||||
final XmlRecordFactory recordFactory = new XmlRecordFactory(contextMapper, true, "", new HashSet<>());
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
final String joinedEntitiesPath = getOutPath() + "/1_joined_entities";
|
||||
sc.textFile(joinedEntitiesPath)
|
||||
.map(s -> mapper.readValue(s, JoinedEntity.class))
|
||||
.mapToPair(je -> new Tuple2<>(new Text(je.getEntity().getId()), new Text(recordFactory.build(je))))
|
||||
.saveAsHadoopFile(getOutPath() + "/2_xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
public SparkSession getSpark() {
|
||||
return spark;
|
||||
}
|
||||
|
||||
public String getInputPath() {
|
||||
return inputPath;
|
||||
}
|
||||
|
||||
public String getOutPath() {
|
||||
return outPath;
|
||||
}
|
||||
|
||||
// HELPERS
|
||||
|
||||
private OafEntity parseOaf(final String json, final String type) {
|
||||
final ObjectMapper o = new ObjectMapper();
|
||||
try {
|
||||
switch (GraphMappingUtils.EntityType.valueOf(type)) {
|
||||
case publication:
|
||||
return o.readValue(json, Publication.class);
|
||||
case dataset:
|
||||
return o.readValue(json, Dataset.class);
|
||||
case otherresearchproduct:
|
||||
return o.readValue(json, OtherResearchProduct.class);
|
||||
case software:
|
||||
return o.readValue(json, Software.class);
|
||||
case datasource:
|
||||
return o.readValue(json, Datasource.class);
|
||||
case organization:
|
||||
return o.readValue(json, Organization.class);
|
||||
case project:
|
||||
return o.readValue(json, Project.class);
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid type: " + type);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private JoinedEntity toJoinedEntity(Tuple2<String, Iterable<EntityRelEntity>> p) {
|
||||
final ObjectMapper o = new ObjectMapper();
|
||||
final JoinedEntity j = new JoinedEntity();
|
||||
final Links links2 = new Links();
|
||||
for(EntityRelEntity rel : p._2()) {
|
||||
if (rel.hasMainEntity() & j.getEntity() == null) {
|
||||
j.setType(rel.getSource().getType());
|
||||
j.setEntity(parseOaf(rel.getSource().getOaf(), rel.getSource().getType()));
|
||||
}
|
||||
if (rel.hasRelatedEntity()) {
|
||||
try {
|
||||
links2.add(
|
||||
new eu.dnetlib.dhp.graph.model.Tuple2()
|
||||
.setRelation(o.readValue(rel.getRelation().getOaf(), Relation.class))
|
||||
.setRelatedEntity(o.readValue(rel.getTarget().getOaf(), RelatedEntity.class)));
|
||||
} catch (IOException e) {
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
j.setLinks(links2);
|
||||
if (j.getEntity() == null) {
|
||||
throw new IllegalStateException("missing main entity on '" + p._1() + "'");
|
||||
}
|
||||
return j;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a set of eu.dnetlib.dhp.schema.oaf.OafEntity objects from a sequence file <className, entity json serialization>,
|
||||
* extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow
|
||||
* @param sc
|
||||
* @param inputPath
|
||||
* @param type
|
||||
* @return the JavaPairRDD<String, TypedRow> indexed by entity identifier
|
||||
*/
|
||||
private JavaPairRDD<String, TypedRow> readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) {
|
||||
return sc.sequenceFile(inputPath + "/" + type, Text.class, Text.class)
|
||||
.mapToPair((PairFunction<Tuple2<Text, Text>, String, TypedRow>) item -> {
|
||||
final String s = item._2().toString();
|
||||
final DocumentContext json = JsonPath.parse(s);
|
||||
final String id = json.read("$.id");
|
||||
return new Tuple2<>(id, new TypedRow()
|
||||
.setSourceId(id)
|
||||
.setDeleted(json.read("$.dataInfo.deletedbyinference"))
|
||||
.setType(type)
|
||||
.setOaf(s));
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a set of eu.dnetlib.dhp.schema.oaf.Relation objects from a sequence file <className, relation json serialization>,
|
||||
* extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow
|
||||
* @param sc
|
||||
* @param inputPath
|
||||
* @return the JavaRDD<TypedRow> containing all the relationships
|
||||
*/
|
||||
private JavaRDD<TypedRow> readPathRelation(final JavaSparkContext sc, final String inputPath) {
|
||||
return sc.sequenceFile(inputPath + "/relation", Text.class, Text.class)
|
||||
.map(item -> {
|
||||
final String s = item._2().toString();
|
||||
final DocumentContext json = JsonPath.parse(s);
|
||||
return new TypedRow()
|
||||
.setSourceId(json.read("$.source"))
|
||||
.setTargetId(json.read("$.target"))
|
||||
.setDeleted(json.read("$.dataInfo.deletedbyinference"))
|
||||
.setType("relation")
|
||||
.setOaf(s);
|
||||
});
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,188 @@
|
|||
package eu.dnetlib.dhp.graph;
|
||||
|
||||
import com.lucidworks.spark.util.SolrSupport;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.graph.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.dhp.graph.utils.StreamingInputDocumentFactory;
|
||||
import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.rdd.RDD;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerException;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import javax.xml.transform.stream.StreamSource;
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.io.StringWriter;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
|
||||
public class SparkXmlIndexingJob {
|
||||
|
||||
private static final Log log = LogFactory.getLog(SparkXmlIndexingJob.class);
|
||||
|
||||
private static final Integer DEFAULT_BATCH_SIZE = 1000;
|
||||
|
||||
private static final String LAYOUT = "index";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkXmlIndexingJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_params_update_index.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
final String format = parser.get("format");
|
||||
final Integer batchSize = parser.getObjectMap().containsKey("batchSize") ? Integer.valueOf(parser.get("batchSize")) : DEFAULT_BATCH_SIZE;
|
||||
|
||||
final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
final String fields = getLayoutSource(isLookup, format);
|
||||
final String xslt = getLayoutTransformer(isLookup);
|
||||
|
||||
final String dsId = getDsId(format, isLookup);
|
||||
final String zkHost = getZkHost(isLookup);
|
||||
final String version = getRecordDatestamp();
|
||||
|
||||
final String indexRecordXslt = getLayoutTransformer(format, fields, xslt);
|
||||
|
||||
log.info("indexRecordTransformer: " + indexRecordXslt);
|
||||
|
||||
final String master = parser.get("master");
|
||||
final SparkConf conf = new SparkConf()
|
||||
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
|
||||
try(SparkSession spark = getSession(conf, master)) {
|
||||
|
||||
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
|
||||
|
||||
RDD<SolrInputDocument> docs = sc.sequenceFile(inputPath, Text.class, Text.class)
|
||||
.map(t -> t._2().toString())
|
||||
.map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s))
|
||||
.map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s))
|
||||
.rdd();
|
||||
|
||||
SolrSupport.indexDocs(zkHost, format + "-" + LAYOUT + "-openaire", batchSize, docs);
|
||||
}
|
||||
}
|
||||
|
||||
private static SparkSession getSession(SparkConf conf, String master) {
|
||||
return SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkXmlRecordBuilderJob.class.getSimpleName())
|
||||
.master(master)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
private static String toIndexRecord(Transformer tr, final String record) {
|
||||
final StreamResult res = new StreamResult(new StringWriter());
|
||||
try {
|
||||
tr.transform(new StreamSource(new StringReader(record)), res);
|
||||
return res.getWriter().toString();
|
||||
} catch (Throwable e) {
|
||||
System.out.println("XPathException on record:\n" + record);
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates the XSLT responsible for building the index xml records.
|
||||
*
|
||||
* @param format Metadata format name (DMF|TMF)
|
||||
* @param xslt xslt for building the index record transformer
|
||||
* @param fields the list of fields
|
||||
* @return the javax.xml.transform.Transformer
|
||||
* @throws ISLookUpException could happen
|
||||
* @throws IOException could happen
|
||||
* @throws TransformerException could happen
|
||||
*/
|
||||
private static String getLayoutTransformer(String format, String fields, String xslt) throws TransformerException {
|
||||
|
||||
final Transformer layoutTransformer = SaxonTransformerFactory.newInstance(xslt);
|
||||
final StreamResult layoutToXsltXslt = new StreamResult(new StringWriter());
|
||||
|
||||
layoutTransformer.setParameter("format", format);
|
||||
layoutTransformer.transform(new StreamSource(new StringReader(fields)), layoutToXsltXslt);
|
||||
|
||||
return layoutToXsltXslt.getWriter().toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* method return a solr-compatible string representation of a date, used to mark all records as indexed today
|
||||
* @return the parsed date
|
||||
*/
|
||||
public static String getRecordDatestamp() {
|
||||
return new SimpleDateFormat("yyyy-MM-dd'T'hh:mm:ss'Z'").format(new Date());
|
||||
}
|
||||
|
||||
/**
|
||||
* Method retrieves from the information system the list of fields associated to the given MDFormat name
|
||||
*
|
||||
* @param isLookup the ISLookup service stub
|
||||
* @param format the Metadata format name
|
||||
* @return the string representation of the list of fields to be indexed
|
||||
*
|
||||
* @throws ISLookUpDocumentNotFoundException
|
||||
* @throws ISLookUpException
|
||||
*/
|
||||
private static String getLayoutSource(final ISLookUpService isLookup, final String format) throws ISLookUpDocumentNotFoundException, ISLookUpException {
|
||||
return doLookup(isLookup, String.format(
|
||||
"collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']", format, LAYOUT));
|
||||
}
|
||||
|
||||
/**
|
||||
* Method retrieves from the information system the openaireLayoutToRecordStylesheet
|
||||
*
|
||||
* @param isLookup the ISLookup service stub
|
||||
* @return the string representation of the XSLT contained in the transformation rule profile
|
||||
*
|
||||
* @throws ISLookUpDocumentNotFoundException
|
||||
* @throws ISLookUpException
|
||||
*/
|
||||
private static String getLayoutTransformer(ISLookUpService isLookup) throws ISLookUpException {
|
||||
return doLookup(isLookup, "collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')" +
|
||||
"//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()");
|
||||
}
|
||||
|
||||
/**
|
||||
* Method retrieves from the information system the IndexDS profile ID associated to the given MDFormat name
|
||||
* @param format
|
||||
* @param isLookup
|
||||
* @return the IndexDS identifier
|
||||
* @throws ISLookUpException
|
||||
*/
|
||||
private static String getDsId(String format, ISLookUpService isLookup) throws ISLookUpException {
|
||||
return doLookup(isLookup, String.format("collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')" +
|
||||
"//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()", format));
|
||||
}
|
||||
|
||||
/**
|
||||
* Method retrieves from the information system the zookeeper quorum of the Solr server
|
||||
* @param isLookup
|
||||
* @return the zookeeper quorum of the Solr server
|
||||
* @throws ISLookUpException
|
||||
*/
|
||||
private static String getZkHost(ISLookUpService isLookup) throws ISLookUpException {
|
||||
return doLookup(isLookup, "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()");
|
||||
}
|
||||
|
||||
private static String doLookup(ISLookUpService isLookup, String xquery) throws ISLookUpException {
|
||||
log.info(String.format("running xquery: %s", xquery));
|
||||
final String res = isLookup.getResourceProfileByQuery(xquery);
|
||||
log.info(String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ..."));
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
package eu.dnetlib.dhp.graph;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.graph.utils.ContextMapper;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
public class SparkXmlRecordBuilderJob {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkXmlRecordBuilderJob.class.getResourceAsStream("/eu/dnetlib/dhp/graph/input_params_build_adjacency_lists.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String master = parser.get("master");
|
||||
final SparkConf conf = new SparkConf()
|
||||
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
|
||||
try(SparkSession spark = getSession(conf, master)) {
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
final String outputPath = parser.get("outputPath");
|
||||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
|
||||
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
|
||||
if (fs.exists(new Path(outputPath))) {
|
||||
fs.delete(new Path(outputPath), true);
|
||||
fs.mkdirs(new Path(outputPath));
|
||||
}
|
||||
|
||||
new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), inputPath, outputPath)
|
||||
.adjacencyLists();
|
||||
}
|
||||
}
|
||||
|
||||
private static SparkSession getSession(SparkConf conf, String master) {
|
||||
return SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(SparkXmlRecordBuilderJob.class.getSimpleName())
|
||||
.master(master)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
package eu.dnetlib.dhp.graph.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class EntityRelEntity implements Serializable {
|
||||
|
||||
private TypedRow source;
|
||||
private TypedRow relation;
|
||||
private TypedRow target;
|
||||
|
||||
public EntityRelEntity() {
|
||||
}
|
||||
|
||||
public EntityRelEntity(TypedRow source) {
|
||||
this.source = source;
|
||||
}
|
||||
|
||||
//helpers
|
||||
public Boolean hasMainEntity() {
|
||||
return getSource() != null & getRelation() == null & getTarget() == null;
|
||||
}
|
||||
|
||||
public Boolean hasRelatedEntity() {
|
||||
return getSource() == null & getRelation() != null & getTarget() != null;
|
||||
}
|
||||
|
||||
|
||||
public TypedRow getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public EntityRelEntity setSource(TypedRow source) {
|
||||
this.source = source;
|
||||
return this;
|
||||
}
|
||||
|
||||
public TypedRow getRelation() {
|
||||
return relation;
|
||||
}
|
||||
|
||||
public EntityRelEntity setRelation(TypedRow relation) {
|
||||
this.relation = relation;
|
||||
return this;
|
||||
}
|
||||
|
||||
public TypedRow getTarget() {
|
||||
return target;
|
||||
}
|
||||
|
||||
public EntityRelEntity setTarget(TypedRow target) {
|
||||
this.target = target;
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
package eu.dnetlib.dhp.graph.model;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class JoinedEntity implements Serializable {
|
||||
|
||||
private String type;
|
||||
|
||||
private OafEntity entity;
|
||||
|
||||
private Links links;
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public JoinedEntity setType(String type) {
|
||||
this.type = type;
|
||||
return this;
|
||||
}
|
||||
|
||||
public OafEntity getEntity() {
|
||||
return entity;
|
||||
}
|
||||
|
||||
public JoinedEntity setEntity(OafEntity entity) {
|
||||
this.entity = entity;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Links getLinks() {
|
||||
return links;
|
||||
}
|
||||
|
||||
public JoinedEntity setLinks(Links links) {
|
||||
this.links = links;
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,6 @@
|
|||
package eu.dnetlib.dhp.graph.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class Links extends ArrayList<Tuple2> {
|
||||
}
|
|
@ -0,0 +1,257 @@
|
|||
package eu.dnetlib.dhp.graph.model;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
public class RelatedEntity implements Serializable {
|
||||
|
||||
private String id;
|
||||
private String type;
|
||||
|
||||
// common fields
|
||||
private StructuredProperty title;
|
||||
private String websiteurl; // datasource, organizations, projects
|
||||
|
||||
// results
|
||||
private String dateofacceptance;
|
||||
private String publisher;
|
||||
private List<StructuredProperty> pid;
|
||||
private String codeRepositoryUrl;
|
||||
private Qualifier resulttype;
|
||||
private List<KeyValue> collectedfrom;
|
||||
private List<Instance> instances;
|
||||
|
||||
// datasource
|
||||
private String officialname;
|
||||
private Qualifier datasourcetype;
|
||||
private Qualifier datasourcetypeui;
|
||||
private Qualifier openairecompatibility;
|
||||
//private String aggregatortype;
|
||||
|
||||
// organization
|
||||
private String legalname;
|
||||
private String legalshortname;
|
||||
private Qualifier country;
|
||||
|
||||
// project
|
||||
private String projectTitle;
|
||||
private String code;
|
||||
private String acronym;
|
||||
private Qualifier contracttype;
|
||||
private List<String> fundingtree;
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public RelatedEntity setId(String id) {
|
||||
this.id = id;
|
||||
return this;
|
||||
}
|
||||
|
||||
public StructuredProperty getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public RelatedEntity setTitle(StructuredProperty title) {
|
||||
this.title = title;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getDateofacceptance() {
|
||||
return dateofacceptance;
|
||||
}
|
||||
|
||||
public RelatedEntity setDateofacceptance(String dateofacceptance) {
|
||||
this.dateofacceptance = dateofacceptance;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getPublisher() {
|
||||
return publisher;
|
||||
}
|
||||
|
||||
public RelatedEntity setPublisher(String publisher) {
|
||||
this.publisher = publisher;
|
||||
return this;
|
||||
}
|
||||
|
||||
public List<StructuredProperty> getPid() {
|
||||
return pid;
|
||||
}
|
||||
|
||||
public RelatedEntity setPid(List<StructuredProperty> pid) {
|
||||
this.pid = pid;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getCodeRepositoryUrl() {
|
||||
return codeRepositoryUrl;
|
||||
}
|
||||
|
||||
public RelatedEntity setCodeRepositoryUrl(String codeRepositoryUrl) {
|
||||
this.codeRepositoryUrl = codeRepositoryUrl;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Qualifier getResulttype() {
|
||||
return resulttype;
|
||||
}
|
||||
|
||||
public RelatedEntity setResulttype(Qualifier resulttype) {
|
||||
this.resulttype = resulttype;
|
||||
return this;
|
||||
}
|
||||
|
||||
public List<KeyValue> getCollectedfrom() {
|
||||
return collectedfrom;
|
||||
}
|
||||
|
||||
public RelatedEntity setCollectedfrom(List<KeyValue> collectedfrom) {
|
||||
this.collectedfrom = collectedfrom;
|
||||
return this;
|
||||
}
|
||||
|
||||
public List<Instance> getInstances() {
|
||||
return instances;
|
||||
}
|
||||
|
||||
public RelatedEntity setInstances(List<Instance> instances) {
|
||||
this.instances = instances;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getOfficialname() {
|
||||
return officialname;
|
||||
}
|
||||
|
||||
public RelatedEntity setOfficialname(String officialname) {
|
||||
this.officialname = officialname;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getWebsiteurl() {
|
||||
return websiteurl;
|
||||
}
|
||||
|
||||
public RelatedEntity setWebsiteurl(String websiteurl) {
|
||||
this.websiteurl = websiteurl;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Qualifier getDatasourcetype() {
|
||||
return datasourcetype;
|
||||
}
|
||||
|
||||
public RelatedEntity setDatasourcetype(Qualifier datasourcetype) {
|
||||
this.datasourcetype = datasourcetype;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Qualifier getDatasourcetypeui() {
|
||||
return datasourcetypeui;
|
||||
}
|
||||
|
||||
public RelatedEntity setDatasourcetypeui(Qualifier datasourcetypeui) {
|
||||
this.datasourcetypeui = datasourcetypeui;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Qualifier getOpenairecompatibility() {
|
||||
return openairecompatibility;
|
||||
}
|
||||
|
||||
public RelatedEntity setOpenairecompatibility(Qualifier openairecompatibility) {
|
||||
this.openairecompatibility = openairecompatibility;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getLegalname() {
|
||||
return legalname;
|
||||
}
|
||||
|
||||
public RelatedEntity setLegalname(String legalname) {
|
||||
this.legalname = legalname;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getLegalshortname() {
|
||||
return legalshortname;
|
||||
}
|
||||
|
||||
public RelatedEntity setLegalshortname(String legalshortname) {
|
||||
this.legalshortname = legalshortname;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Qualifier getCountry() {
|
||||
return country;
|
||||
}
|
||||
|
||||
public RelatedEntity setCountry(Qualifier country) {
|
||||
this.country = country;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getCode() {
|
||||
return code;
|
||||
}
|
||||
|
||||
public RelatedEntity setCode(String code) {
|
||||
this.code = code;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getAcronym() {
|
||||
return acronym;
|
||||
}
|
||||
|
||||
public RelatedEntity setAcronym(String acronym) {
|
||||
this.acronym = acronym;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Qualifier getContracttype() {
|
||||
return contracttype;
|
||||
}
|
||||
|
||||
public RelatedEntity setContracttype(Qualifier contracttype) {
|
||||
this.contracttype = contracttype;
|
||||
return this;
|
||||
}
|
||||
|
||||
public List<String> getFundingtree() {
|
||||
return fundingtree;
|
||||
}
|
||||
|
||||
public RelatedEntity setFundingtree(List<String> fundingtree) {
|
||||
this.fundingtree = fundingtree;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getProjectTitle() {
|
||||
return projectTitle;
|
||||
}
|
||||
|
||||
public RelatedEntity setProjectTitle(String projectTitle) {
|
||||
this.projectTitle = projectTitle;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public RelatedEntity setType(String type) {
|
||||
this.type = type;
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
package eu.dnetlib.dhp.graph.model;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
public class Tuple2 {
|
||||
|
||||
private Relation relation;
|
||||
|
||||
private RelatedEntity relatedEntity;
|
||||
|
||||
public Relation getRelation() {
|
||||
return relation;
|
||||
}
|
||||
|
||||
public Tuple2 setRelation(Relation relation) {
|
||||
this.relation = relation;
|
||||
return this;
|
||||
}
|
||||
|
||||
public RelatedEntity getRelatedEntity() {
|
||||
return relatedEntity;
|
||||
}
|
||||
|
||||
public Tuple2 setRelatedEntity(RelatedEntity relatedEntity) {
|
||||
this.relatedEntity = relatedEntity;
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
package eu.dnetlib.dhp.graph.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class TypedRow implements Serializable {
|
||||
|
||||
private String sourceId;
|
||||
|
||||
private String targetId;
|
||||
|
||||
private Boolean deleted;
|
||||
|
||||
private String type;
|
||||
|
||||
private String oaf;
|
||||
|
||||
public String getSourceId() {
|
||||
return sourceId;
|
||||
}
|
||||
|
||||
public TypedRow setSourceId(String sourceId) {
|
||||
this.sourceId = sourceId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getTargetId() {
|
||||
return targetId;
|
||||
}
|
||||
|
||||
public TypedRow setTargetId(String targetId) {
|
||||
this.targetId = targetId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Boolean getDeleted() {
|
||||
return deleted;
|
||||
}
|
||||
|
||||
public TypedRow setDeleted(Boolean deleted) {
|
||||
this.deleted = deleted;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public TypedRow setType(String type) {
|
||||
this.type = type;
|
||||
return this;
|
||||
}
|
||||
|
||||
public String getOaf() {
|
||||
return oaf;
|
||||
}
|
||||
|
||||
public TypedRow setOaf(String oaf) {
|
||||
this.oaf = oaf;
|
||||
return this;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class ContextDef implements Serializable {
|
||||
|
||||
private String id;
|
||||
private String label;
|
||||
private String name;
|
||||
private String type;
|
||||
|
||||
public ContextDef(final String id, final String label, final String name, final String type) {
|
||||
super();
|
||||
this.setId(id);
|
||||
this.setLabel(label);
|
||||
this.setName(name);
|
||||
this.setType(type);
|
||||
}
|
||||
|
||||
public String getLabel() {
|
||||
return label;
|
||||
}
|
||||
|
||||
public void setLabel(final String label) {
|
||||
this.label = label;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public void setId(final String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(final String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(final String type) {
|
||||
this.type = type;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.SAXReader;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
|
||||
public class ContextMapper extends HashMap<String, ContextDef> implements Serializable {
|
||||
|
||||
private static final long serialVersionUID = 2159682308502487305L;
|
||||
|
||||
private final static String XQUERY = "for $x in //RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ContextDSResourceType']//*[name()='context' or name()='category' or name()='concept'] return <entry id=\"{$x/@id}\" label=\"{$x/@label|$x/@name}\" name=\"{$x/name()}\" type=\"{$x/@type}\"/>";
|
||||
|
||||
public static ContextMapper fromIS(final String isLookupUrl) throws DocumentException, ISLookUpException {
|
||||
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
StringBuilder sb = new StringBuilder("<ContextDSResources>");
|
||||
Joiner.on("").appendTo(sb, isLookUp.quickSearchProfile(XQUERY));
|
||||
sb.append("</ContextDSResources>");
|
||||
return fromXml(sb.toString());
|
||||
}
|
||||
|
||||
public static ContextMapper fromXml(final String xml) throws DocumentException {
|
||||
final ContextMapper contextMapper = new ContextMapper();
|
||||
|
||||
final Document doc = new SAXReader().read(new StringReader(xml));
|
||||
for (Object o : doc.selectNodes("//entry")) {
|
||||
Node node = (Node) o;
|
||||
String id = node.valueOf("./@id");
|
||||
String label = node.valueOf("./@label");
|
||||
String name = node.valueOf("./@name");
|
||||
String type = node.valueOf("./@type") + "";
|
||||
|
||||
contextMapper.put(id, new ContextDef(id, label, name, type));
|
||||
}
|
||||
return contextMapper;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,254 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonInclude;
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.base.Predicate;
|
||||
import com.google.common.collect.BiMap;
|
||||
import com.google.common.collect.HashBiMap;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.jayway.jsonpath.DocumentContext;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
import eu.dnetlib.dhp.graph.model.EntityRelEntity;
|
||||
import eu.dnetlib.dhp.graph.model.RelatedEntity;
|
||||
import eu.dnetlib.dhp.graph.model.TypedRow;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import net.minidev.json.JSONArray;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.commons.lang3.StringUtils.*;
|
||||
|
||||
public class GraphMappingUtils {
|
||||
|
||||
public enum EntityType {
|
||||
publication, dataset, otherresearchproduct, software, datasource, organization, project
|
||||
}
|
||||
|
||||
public enum MainEntityType {
|
||||
result, datasource, organization, project
|
||||
}
|
||||
|
||||
public static Set<String> authorPidTypes = Sets.newHashSet("orcid", "magidentifier");
|
||||
|
||||
public static Set<String> instanceFieldFilter = Sets.newHashSet("instancetype", "hostedby", "license", "accessright", "collectedfrom", "dateofacceptance", "distributionlocation");
|
||||
|
||||
private static BiMap<String, String> relClassMapping = HashBiMap.create();
|
||||
|
||||
static {
|
||||
relClassMapping.put("isAuthorInstitutionOf", "hasAuthorInstitution");
|
||||
relClassMapping.put("isMergedIn", "merges");
|
||||
relClassMapping.put("isProducedBy", "produces");
|
||||
relClassMapping.put("hasParticipant", "isParticipant");
|
||||
relClassMapping.put("isProvidedBy", "provides");
|
||||
relClassMapping.put("isRelatedTo", "isRelatedTo");
|
||||
relClassMapping.put("isAmongTopNSimilarDocuments", "hasAmongTopNSimilarDocuments");
|
||||
relClassMapping.put("isRelatedTo", "isRelatedTo");
|
||||
relClassMapping.put("isSupplementTo", "isSupplementedBy");
|
||||
}
|
||||
|
||||
public static String getInverseRelClass(final String relClass) {
|
||||
String res = relClassMapping.get(relClass);
|
||||
if (isNotBlank(res)) {
|
||||
return res;
|
||||
}
|
||||
res = relClassMapping.inverse().get(relClass);
|
||||
|
||||
if (isNotBlank(res)) {
|
||||
return res;
|
||||
}
|
||||
|
||||
throw new IllegalArgumentException("unable to find an inverse relationship class for term: " + relClass);
|
||||
}
|
||||
|
||||
private static final String schemeTemplate = "dnet:%s_%s_relations";
|
||||
|
||||
private static Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
entityMapping.put(EntityType.publication, MainEntityType.result);
|
||||
entityMapping.put(EntityType.dataset, MainEntityType.result);
|
||||
entityMapping.put(EntityType.otherresearchproduct, MainEntityType.result);
|
||||
entityMapping.put(EntityType.software, MainEntityType.result);
|
||||
entityMapping.put(EntityType.datasource, MainEntityType.datasource);
|
||||
entityMapping.put(EntityType.organization, MainEntityType.organization);
|
||||
entityMapping.put(EntityType.project, MainEntityType.project);
|
||||
}
|
||||
|
||||
public static String getScheme(final String sourceType, final String targetType) {
|
||||
return String.format(schemeTemplate,
|
||||
entityMapping.get(EntityType.valueOf(sourceType)).name(),
|
||||
entityMapping.get(EntityType.valueOf(targetType)).name());
|
||||
}
|
||||
|
||||
public static String getMainType(final String type) {
|
||||
return entityMapping.get(EntityType.valueOf(type)).name();
|
||||
}
|
||||
|
||||
public static boolean isResult(String type) {
|
||||
return MainEntityType.result.name().equals(getMainType(type));
|
||||
}
|
||||
|
||||
public static Predicate<String> instanceFilter = s -> instanceFieldFilter.contains(s);
|
||||
|
||||
public static EntityRelEntity asRelatedEntity(EntityRelEntity e) {
|
||||
|
||||
final DocumentContext j = JsonPath.parse(e.getSource().getOaf());
|
||||
final RelatedEntity re = new RelatedEntity().setId(j.read("$.id")).setType(e.getSource().getType());
|
||||
|
||||
switch (EntityType.valueOf(e.getSource().getType())) {
|
||||
case publication:
|
||||
case dataset:
|
||||
case otherresearchproduct:
|
||||
case software:
|
||||
mapTitle(j, re);
|
||||
re.setDateofacceptance(j.read("$.dateofacceptance.value"));
|
||||
re.setPublisher(j.read("$.publisher.value"));
|
||||
|
||||
JSONArray pids = j.read("$.pid");
|
||||
re.setPid(pids.stream()
|
||||
.map(p -> asStructuredProperty((LinkedHashMap<String, Object>) p))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
re.setResulttype(asQualifier(j.read("$.resulttype")));
|
||||
|
||||
JSONArray collfrom = j.read("$.collectedfrom");
|
||||
re.setCollectedfrom(collfrom.stream()
|
||||
.map(c -> asKV((LinkedHashMap<String, Object>) c))
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
// will throw exception when the instance is not found
|
||||
JSONArray instances = j.read("$.instance");
|
||||
re.setInstances(instances.stream()
|
||||
.map(i -> {
|
||||
final LinkedHashMap<String, Object> p = (LinkedHashMap<String, Object>) i;
|
||||
final Field<String> license = new Field<String>();
|
||||
license.setValue((String) ((LinkedHashMap<String, Object>) p.get("license")).get("value"));
|
||||
final Instance instance = new Instance();
|
||||
instance.setLicense(license);
|
||||
instance.setAccessright(asQualifier((LinkedHashMap<String, String>) p.get("accessright")));
|
||||
instance.setInstancetype(asQualifier((LinkedHashMap<String, String>) p.get("instancetype")));
|
||||
instance.setHostedby(asKV((LinkedHashMap<String, Object>) p.get("hostedby")));
|
||||
//TODO mapping of distributionlocation
|
||||
instance.setCollectedfrom(asKV((LinkedHashMap<String, Object>) p.get("collectedfrom")));
|
||||
|
||||
Field<String> dateofacceptance = new Field<String>();
|
||||
dateofacceptance.setValue((String) ((LinkedHashMap<String, Object>) p.get("dateofacceptance")).get("value"));
|
||||
instance.setDateofacceptance(dateofacceptance);
|
||||
return instance;
|
||||
}).collect(Collectors.toList()));
|
||||
|
||||
//TODO still to be mapped
|
||||
//re.setCodeRepositoryUrl(j.read("$.coderepositoryurl"));
|
||||
|
||||
break;
|
||||
case datasource:
|
||||
re.setOfficialname(j.read("$.officialname.value"));
|
||||
re.setWebsiteurl(j.read("$.websiteurl.value"));
|
||||
re.setDatasourcetype(asQualifier(j.read("$.datasourcetype")));
|
||||
re.setOpenairecompatibility(asQualifier(j.read("$.openairecompatibility")));
|
||||
|
||||
break;
|
||||
case organization:
|
||||
re.setLegalname(j.read("$.legalname.value"));
|
||||
re.setLegalshortname(j.read("$.legalshortname.value"));
|
||||
re.setCountry(asQualifier(j.read("$.country")));
|
||||
|
||||
break;
|
||||
case project:
|
||||
re.setProjectTitle(j.read("$.title.value"));
|
||||
re.setCode(j.read("$.code.value"));
|
||||
re.setAcronym(j.read("$.acronym.value"));
|
||||
re.setContracttype(asQualifier(j.read("$.contracttype")));
|
||||
|
||||
JSONArray f = j.read("$.fundingtree");
|
||||
if (!f.isEmpty()) {
|
||||
re.setFundingtree(f.stream()
|
||||
.map(s -> ((LinkedHashMap<String, String>) s).get("value"))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
return new EntityRelEntity().setSource(
|
||||
new TypedRow()
|
||||
.setSourceId(e.getSource().getSourceId())
|
||||
.setDeleted(e.getSource().getDeleted())
|
||||
.setType(e.getSource().getType())
|
||||
.setOaf(serialize(re)));
|
||||
}
|
||||
|
||||
private static KeyValue asKV(LinkedHashMap<String, Object> j) {
|
||||
final KeyValue kv = new KeyValue();
|
||||
kv.setKey((String) j.get("key"));
|
||||
kv.setValue((String) j.get("value"));
|
||||
return kv;
|
||||
}
|
||||
|
||||
private static void mapTitle(DocumentContext j, RelatedEntity re) {
|
||||
final JSONArray a = j.read("$.title");
|
||||
if (!a.isEmpty()) {
|
||||
final StructuredProperty sp = asStructuredProperty((LinkedHashMap<String, Object>) a.get(0));
|
||||
if (StringUtils.isNotBlank(sp.getValue())) {
|
||||
re.setTitle(sp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static StructuredProperty asStructuredProperty(LinkedHashMap<String, Object> j) {
|
||||
final StructuredProperty sp = new StructuredProperty();
|
||||
final String value = (String) j.get("value");
|
||||
if (StringUtils.isNotBlank(value)) {
|
||||
sp.setValue((String) j.get("value"));
|
||||
sp.setQualifier(asQualifier((LinkedHashMap<String, String>) j.get("qualifier")));
|
||||
}
|
||||
return sp;
|
||||
}
|
||||
|
||||
public static Qualifier asQualifier(LinkedHashMap<String, String> j) {
|
||||
final Qualifier q = new Qualifier();
|
||||
|
||||
final String classid = j.get("classid");
|
||||
if (StringUtils.isNotBlank(classid)) {
|
||||
q.setClassid(classid);
|
||||
}
|
||||
|
||||
final String classname = j.get("classname");
|
||||
if (StringUtils.isNotBlank(classname)) {
|
||||
q.setClassname(classname);
|
||||
}
|
||||
|
||||
final String schemeid = j.get("schemeid");
|
||||
if (StringUtils.isNotBlank(schemeid)) {
|
||||
q.setSchemeid(schemeid);
|
||||
}
|
||||
|
||||
final String schemename = j.get("schemename");
|
||||
if (StringUtils.isNotBlank(schemename)) {
|
||||
q.setSchemename(schemename);
|
||||
}
|
||||
return q;
|
||||
}
|
||||
|
||||
public static String serialize(final Object o) {
|
||||
try {
|
||||
return new ObjectMapper()
|
||||
.setSerializationInclusion(JsonInclude.Include.NON_NULL)
|
||||
.writeValueAsString(o);
|
||||
} catch (JsonProcessingException e) {
|
||||
throw new IllegalArgumentException("unable to serialize: " + o.toString(), e);
|
||||
}
|
||||
}
|
||||
|
||||
public static String removePrefix(final String s) {
|
||||
if (s.contains("|")) return substringAfter(s, "|");
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;
|
||||
|
||||
public class ISLookupClientFactory {
|
||||
|
||||
private static final Log log = LogFactory.getLog(ISLookupClientFactory.class);
|
||||
|
||||
public static ISLookUpService getLookUpService(final String isLookupUrl) {
|
||||
return getServiceStub(ISLookUpService.class, isLookupUrl);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private static <T> T getServiceStub(final Class<T> clazz, final String endpoint) {
|
||||
log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint));
|
||||
final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean();
|
||||
jaxWsProxyFactory.setServiceClass(clazz);
|
||||
jaxWsProxyFactory.setAddress(endpoint);
|
||||
return (T) jaxWsProxyFactory.create();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
public class LicenseComparator implements Comparator<Qualifier> {
|
||||
|
||||
@Override
|
||||
public int compare(Qualifier left, Qualifier right) {
|
||||
|
||||
if (left == null && right == null) return 0;
|
||||
if (left == null) return 1;
|
||||
if (right == null) return -1;
|
||||
|
||||
String lClass = left.getClassid();
|
||||
String rClass = right.getClassid();
|
||||
|
||||
if (lClass.equals(rClass)) return 0;
|
||||
|
||||
if (lClass.equals("OPEN SOURCE")) return -1;
|
||||
if (rClass.equals("OPEN SOURCE")) return 1;
|
||||
|
||||
if (lClass.equals("OPEN")) return -1;
|
||||
if (rClass.equals("OPEN")) return 1;
|
||||
|
||||
if (lClass.equals("6MONTHS")) return -1;
|
||||
if (rClass.equals("6MONTHS")) return 1;
|
||||
|
||||
if (lClass.equals("12MONTHS")) return -1;
|
||||
if (rClass.equals("12MONTHS")) return 1;
|
||||
|
||||
if (lClass.equals("EMBARGO")) return -1;
|
||||
if (rClass.equals("EMBARGO")) return 1;
|
||||
|
||||
if (lClass.equals("RESTRICTED")) return -1;
|
||||
if (rClass.equals("RESTRICTED")) return 1;
|
||||
|
||||
if (lClass.equals("CLOSED")) return -1;
|
||||
if (rClass.equals("CLOSED")) return 1;
|
||||
|
||||
if (lClass.equals("UNKNOWN")) return -1;
|
||||
if (rClass.equals("UNKNOWN")) return 1;
|
||||
|
||||
// Else (but unlikely), lexicographical ordering will do.
|
||||
return lClass.compareTo(rClass);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,253 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.io.StringWriter;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import javax.xml.stream.*;
|
||||
import javax.xml.stream.events.Namespace;
|
||||
import javax.xml.stream.events.StartElement;
|
||||
import javax.xml.stream.events.XMLEvent;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
|
||||
/**
|
||||
* Optimized version of the document parser, drop in replacement of InputDocumentFactory.
|
||||
*
|
||||
* <p>
|
||||
* Faster because:
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>Doesn't create a DOM for the full document</li>
|
||||
* <li>Doesn't execute xpaths agains the DOM</li>
|
||||
* <li>Quickly serialize the 'result' element directly in a string.</li>
|
||||
* <li>Uses less memory: less pressure on GC and allows more threads to process this in parallel</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* This class is fully reentrant and can be invoked in parallel.
|
||||
* </p>
|
||||
*
|
||||
* @author claudio
|
||||
*
|
||||
*/
|
||||
public class StreamingInputDocumentFactory {
|
||||
|
||||
private static final String INDEX_FIELD_PREFIX = "__";
|
||||
|
||||
private static final String DS_VERSION = INDEX_FIELD_PREFIX + "dsversion";
|
||||
|
||||
private static final String DS_ID = INDEX_FIELD_PREFIX + "dsid";
|
||||
|
||||
private static final String RESULT = "result";
|
||||
|
||||
private static final String INDEX_RESULT = INDEX_FIELD_PREFIX + RESULT;
|
||||
|
||||
private static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier";
|
||||
|
||||
private static final String outFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'");
|
||||
|
||||
private final static List<String> dateFormats = Arrays.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy");
|
||||
|
||||
private static final String DEFAULTDNETRESULT = "dnetResult";
|
||||
|
||||
private static final String TARGETFIELDS = "targetFields";
|
||||
|
||||
private static final String INDEX_RECORD_ID_ELEMENT = "indexRecordIdentifier";
|
||||
|
||||
private static final String ROOT_ELEMENT = "indexRecord";
|
||||
|
||||
private static final int MAX_FIELD_LENGTH = 25000;
|
||||
|
||||
private ThreadLocal<XMLInputFactory> inputFactory = ThreadLocal.withInitial(() -> XMLInputFactory.newInstance());
|
||||
|
||||
private ThreadLocal<XMLOutputFactory> outputFactory = ThreadLocal.withInitial(() -> XMLOutputFactory.newInstance());
|
||||
|
||||
private ThreadLocal<XMLEventFactory> eventFactory = ThreadLocal.withInitial(() -> XMLEventFactory.newInstance());
|
||||
|
||||
private String version;
|
||||
|
||||
private String dsId;
|
||||
|
||||
private String resultName = DEFAULTDNETRESULT;
|
||||
|
||||
public StreamingInputDocumentFactory(final String version, final String dsId) {
|
||||
this(version, dsId, DEFAULTDNETRESULT);
|
||||
}
|
||||
|
||||
public StreamingInputDocumentFactory(final String version, final String dsId, final String resultName) {
|
||||
this.version = version;
|
||||
this.dsId = dsId;
|
||||
this.resultName = resultName;
|
||||
}
|
||||
|
||||
public SolrInputDocument parseDocument(final String inputDocument) {
|
||||
|
||||
final StringWriter results = new StringWriter();
|
||||
final List<Namespace> nsList = Lists.newLinkedList();
|
||||
try {
|
||||
|
||||
XMLEventReader parser = inputFactory.get().createXMLEventReader(new StringReader(inputDocument));
|
||||
|
||||
final SolrInputDocument indexDocument = new SolrInputDocument(new HashMap<>());
|
||||
|
||||
while (parser.hasNext()) {
|
||||
final XMLEvent event = parser.nextEvent();
|
||||
if ((event != null) && event.isStartElement()) {
|
||||
final String localName = event.asStartElement().getName().getLocalPart();
|
||||
|
||||
if (ROOT_ELEMENT.equals(localName)) {
|
||||
nsList.addAll(getNamespaces(event));
|
||||
} else if (INDEX_RECORD_ID_ELEMENT.equals(localName)) {
|
||||
final XMLEvent text = parser.nextEvent();
|
||||
String recordId = getText(text);
|
||||
indexDocument.addField(INDEX_RECORD_ID, recordId);
|
||||
} else if (TARGETFIELDS.equals(localName)) {
|
||||
parseTargetFields(indexDocument, parser);
|
||||
} else if (resultName.equals(localName)) {
|
||||
copyResult(indexDocument, results, parser, nsList, resultName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (version != null) {
|
||||
indexDocument.addField(DS_VERSION, version);
|
||||
}
|
||||
|
||||
if (dsId != null) {
|
||||
indexDocument.addField(DS_ID, dsId);
|
||||
}
|
||||
|
||||
if (!indexDocument.containsKey(INDEX_RECORD_ID)) {
|
||||
indexDocument.clear();
|
||||
System.err.println("missing indexrecord id:\n" + inputDocument);
|
||||
}
|
||||
|
||||
return indexDocument;
|
||||
} catch (XMLStreamException e) {
|
||||
return new SolrInputDocument();
|
||||
}
|
||||
}
|
||||
|
||||
private List<Namespace> getNamespaces(final XMLEvent event) {
|
||||
final List<Namespace> res = Lists.newLinkedList();
|
||||
@SuppressWarnings("unchecked")
|
||||
Iterator<Namespace> nsIter = event.asStartElement().getNamespaces();
|
||||
while (nsIter.hasNext()) {
|
||||
Namespace ns = nsIter.next();
|
||||
res.add(ns);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse the targetFields block and add fields to the solr document.
|
||||
*
|
||||
* @param indexDocument
|
||||
* @param parser
|
||||
* @throws XMLStreamException
|
||||
*/
|
||||
protected void parseTargetFields(final SolrInputDocument indexDocument, final XMLEventReader parser) throws XMLStreamException {
|
||||
|
||||
boolean hasFields = false;
|
||||
|
||||
while (parser.hasNext()) {
|
||||
final XMLEvent targetEvent = parser.nextEvent();
|
||||
if (targetEvent.isEndElement() && targetEvent.asEndElement().getName().getLocalPart().equals(TARGETFIELDS)) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (targetEvent.isStartElement()) {
|
||||
final String fieldName = targetEvent.asStartElement().getName().getLocalPart();
|
||||
final XMLEvent text = parser.nextEvent();
|
||||
|
||||
String data = getText(text);
|
||||
|
||||
addField(indexDocument, fieldName, data);
|
||||
hasFields = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!hasFields) {
|
||||
indexDocument.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy the /indexRecord/result element and children, preserving namespace declarations etc.
|
||||
*
|
||||
* @param indexDocument
|
||||
* @param results
|
||||
* @param parser
|
||||
* @param nsList
|
||||
* @throws XMLStreamException
|
||||
*/
|
||||
protected void copyResult(final SolrInputDocument indexDocument,
|
||||
final StringWriter results,
|
||||
final XMLEventReader parser,
|
||||
final List<Namespace> nsList,
|
||||
final String dnetResult) throws XMLStreamException {
|
||||
final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(results);
|
||||
|
||||
for (Namespace ns : nsList) {
|
||||
eventFactory.get().createNamespace(ns.getPrefix(), ns.getNamespaceURI());
|
||||
}
|
||||
|
||||
StartElement newRecord = eventFactory.get().createStartElement("", null, RESULT, null, nsList.iterator());
|
||||
|
||||
// new root record
|
||||
writer.add(newRecord);
|
||||
|
||||
// copy the rest as it is
|
||||
while (parser.hasNext()) {
|
||||
final XMLEvent resultEvent = parser.nextEvent();
|
||||
|
||||
// TODO: replace with depth tracking instead of close tag tracking.
|
||||
if (resultEvent.isEndElement() && resultEvent.asEndElement().getName().getLocalPart().equals(dnetResult)) {
|
||||
writer.add(eventFactory.get().createEndElement("", null, RESULT));
|
||||
break;
|
||||
}
|
||||
|
||||
writer.add(resultEvent);
|
||||
}
|
||||
writer.close();
|
||||
indexDocument.addField(INDEX_RESULT, results.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper used to add a field to a solr doc. It avoids to add empy fields
|
||||
*
|
||||
* @param indexDocument
|
||||
* @param field
|
||||
* @param value
|
||||
*/
|
||||
private final void addField(final SolrInputDocument indexDocument, final String field, final String value) {
|
||||
String cleaned = value.trim();
|
||||
if (!cleaned.isEmpty()) {
|
||||
// log.info("\n\n adding field " + field.toLowerCase() + " value: " + cleaned + "\n");
|
||||
indexDocument.addField(field.toLowerCase(), cleaned);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper used to get the string from a text element.
|
||||
*
|
||||
* @param text
|
||||
* @return the
|
||||
*/
|
||||
protected final String getText(final XMLEvent text) {
|
||||
if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + text.asEndElement().getName().getLocalPart());
|
||||
return "";
|
||||
|
||||
final String data = text.asCharacters().getData();
|
||||
if (data != null && data.length() > MAX_FIELD_LENGTH) {
|
||||
return data.substring(0, MAX_FIELD_LENGTH);
|
||||
}
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,107 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.stringtemplate.v4.ST;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.removePrefix;
|
||||
import static eu.dnetlib.dhp.graph.utils.XmlSerializationUtils.escapeXml;
|
||||
|
||||
public class TemplateFactory {
|
||||
|
||||
private TemplateResources resources;
|
||||
|
||||
private final static char DELIMITER = '$';
|
||||
|
||||
public TemplateFactory() {
|
||||
try {
|
||||
resources = new TemplateResources();
|
||||
} catch (IOException e) {
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public String buildBody(final String type, final List<String> metadata, final List<String> rels, final List<String> children, final List<String> extraInfo) {
|
||||
ST body = getTemplate(resources.getEntity());
|
||||
|
||||
body.add("name", type);
|
||||
body.add("metadata", metadata);
|
||||
body.add("rels", rels);
|
||||
body.add("children", children);
|
||||
body.add("extrainfo", extraInfo);
|
||||
|
||||
return body.render();
|
||||
}
|
||||
|
||||
public String getChild(final String name, final String id, final List<String> metadata) {
|
||||
return getTemplate(resources.getChild())
|
||||
.add("name", name)
|
||||
.add("hasId", !(id == null))
|
||||
.add("id", id != null ? escapeXml(removePrefix(id)) : "")
|
||||
.add("metadata", metadata)
|
||||
.render();
|
||||
}
|
||||
|
||||
public String buildRecord(
|
||||
final OafEntity entity,
|
||||
final String schemaLocation,
|
||||
final String body) {
|
||||
return getTemplate(resources.getRecord())
|
||||
.add("id", escapeXml(removePrefix(entity.getId())))
|
||||
.add("dateofcollection", entity.getDateofcollection())
|
||||
.add("dateoftransformation", entity.getDateoftransformation())
|
||||
.add("schemaLocation", schemaLocation)
|
||||
.add("it", body)
|
||||
.render();
|
||||
}
|
||||
|
||||
public String getRel(final String type,
|
||||
final String objIdentifier,
|
||||
final Collection<String> fields,
|
||||
final String semanticclass,
|
||||
final String semantischeme,
|
||||
final DataInfo info) {
|
||||
return getTemplate(resources.getRel())
|
||||
.add("type", type)
|
||||
.add("objIdentifier", escapeXml(removePrefix(objIdentifier)))
|
||||
.add("class", semanticclass)
|
||||
.add("scheme", semantischeme)
|
||||
.add("metadata", fields)
|
||||
.add("inferred", info.getInferred())
|
||||
.add("trust", info.getTrust())
|
||||
.add("inferenceprovenance", info.getInferenceprovenance())
|
||||
.add("provenanceaction", info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : "")
|
||||
.render();
|
||||
}
|
||||
|
||||
public String getInstance(final String resultId, final List<String> instancemetadata, final List<String> webresources) {
|
||||
return getTemplate(resources.getInstance())
|
||||
.add("instanceId", escapeXml(removePrefix(resultId)))
|
||||
.add("metadata", instancemetadata)
|
||||
.add("webresources", webresources
|
||||
.stream()
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.map(w -> getWebResource(w))
|
||||
.collect(Collectors.toList()))
|
||||
.render();
|
||||
}
|
||||
|
||||
private String getWebResource(final String identifier) {
|
||||
return getTemplate(resources.getWebresource())
|
||||
.add("identifier", escapeXml(identifier))
|
||||
.render();
|
||||
}
|
||||
|
||||
// HELPERS
|
||||
|
||||
private ST getTemplate(final String res) {
|
||||
return new ST(res, DELIMITER, DELIMITER);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import com.google.common.io.Resources;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
public class TemplateResources {
|
||||
|
||||
private String record = read("eu/dnetlib/dhp/graph/template/record.st");
|
||||
|
||||
private String instance = read("eu/dnetlib/dhp/graph/template/instance.st");
|
||||
|
||||
private String rel = read("eu/dnetlib/dhp/graph/template/rel.st");
|
||||
|
||||
private String webresource = read("eu/dnetlib/dhp/graph/template/webresource.st");
|
||||
|
||||
private String child = read("eu/dnetlib/dhp/graph/template/child.st");
|
||||
|
||||
private String entity = read("eu/dnetlib/dhp/graph/template/entity.st");
|
||||
|
||||
private static String read(final String classpathResource) throws IOException {
|
||||
return Resources.toString(Resources.getResource(classpathResource), StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
public TemplateResources() throws IOException {
|
||||
|
||||
}
|
||||
|
||||
public String getEntity() {
|
||||
return entity;
|
||||
}
|
||||
|
||||
public String getRecord() {
|
||||
return record;
|
||||
}
|
||||
|
||||
public String getInstance() {
|
||||
return instance;
|
||||
}
|
||||
|
||||
public String getRel() {
|
||||
return rel;
|
||||
}
|
||||
|
||||
public String getWebresource() {
|
||||
return webresource;
|
||||
}
|
||||
|
||||
public String getChild() {
|
||||
return child;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,962 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.mycila.xmltool.XMLDoc;
|
||||
import com.mycila.xmltool.XMLTag;
|
||||
import eu.dnetlib.dhp.graph.model.JoinedEntity;
|
||||
import eu.dnetlib.dhp.graph.model.RelatedEntity;
|
||||
import eu.dnetlib.dhp.graph.model.Tuple2;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.Element;
|
||||
import org.dom4j.Node;
|
||||
import org.dom4j.io.OutputFormat;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.dom4j.io.XMLWriter;
|
||||
|
||||
import javax.xml.transform.*;
|
||||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.io.StringReader;
|
||||
import java.io.StringWriter;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.*;
|
||||
import static eu.dnetlib.dhp.graph.utils.XmlSerializationUtils.*;
|
||||
import static org.apache.commons.lang3.StringUtils.isNotBlank;
|
||||
import static org.apache.commons.lang3.StringUtils.substringBefore;
|
||||
|
||||
public class XmlRecordFactory implements Serializable {
|
||||
|
||||
private Set<String> specialDatasourceTypes;
|
||||
|
||||
private ContextMapper contextMapper;
|
||||
|
||||
private String schemaLocation;
|
||||
|
||||
private Set<String> contextes = Sets.newHashSet();
|
||||
|
||||
private boolean indent = false;
|
||||
|
||||
public XmlRecordFactory(
|
||||
final ContextMapper contextMapper, final boolean indent,
|
||||
final String schemaLocation, final Set<String> otherDatasourceTypesUForUI) {
|
||||
|
||||
this.contextMapper = contextMapper;
|
||||
this.schemaLocation = schemaLocation;
|
||||
this.specialDatasourceTypes = otherDatasourceTypesUForUI;
|
||||
|
||||
this.indent = indent;
|
||||
}
|
||||
|
||||
public String build(final JoinedEntity je) {
|
||||
final OafEntity entity = je.getEntity();
|
||||
TemplateFactory templateFactory = new TemplateFactory();
|
||||
try {
|
||||
final List<String> metadata = metadata(je.getType(), entity);
|
||||
|
||||
// rels has to be processed before the contexts because they enrich the contextMap with the funding info.
|
||||
final List<String> relations = listRelations(je, templateFactory);
|
||||
|
||||
metadata.addAll(buildContexts(getMainType(je.getType())));
|
||||
metadata.add(parseDataInfo(entity.getDataInfo()));
|
||||
|
||||
final String body = templateFactory.buildBody(
|
||||
getMainType(je.getType()),
|
||||
metadata,
|
||||
relations,
|
||||
listChildren(je, templateFactory), listExtraInfo(je));
|
||||
|
||||
return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent);
|
||||
} catch (final Throwable e) {
|
||||
throw new RuntimeException(String.format("error building record '%s'", entity.getId()), e);
|
||||
}
|
||||
}
|
||||
|
||||
private String printXML(String xml, boolean indent) {
|
||||
try {
|
||||
final Document doc = new SAXReader().read(new StringReader(xml));
|
||||
OutputFormat format = indent ? OutputFormat.createPrettyPrint() : OutputFormat.createCompactFormat();
|
||||
format.setExpandEmptyElements(false);
|
||||
format.setSuppressDeclaration(true);
|
||||
StringWriter sw = new StringWriter();
|
||||
XMLWriter writer = new XMLWriter(sw, format);
|
||||
writer.write(doc);
|
||||
return sw.toString();
|
||||
} catch (IOException | DocumentException e) {
|
||||
throw new IllegalArgumentException("Unable to indent XML. Invalid record:\n" + xml, e);
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> metadata(final String type, final OafEntity entity) {
|
||||
|
||||
final List<String> metadata = Lists.newArrayList();
|
||||
|
||||
if (entity.getCollectedfrom() != null) {
|
||||
metadata.addAll(entity.getCollectedfrom()
|
||||
.stream()
|
||||
.map(kv -> mapKeyValue("collectedfrom", kv))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (entity.getOriginalId() != null) {
|
||||
metadata.addAll(entity.getOriginalId()
|
||||
.stream()
|
||||
.map(s -> asXmlElement("originalId", s))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (entity.getPid() != null) {
|
||||
metadata.addAll(entity.getPid()
|
||||
.stream()
|
||||
.map(p -> mapStructuredProperty("pid", p))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
if (GraphMappingUtils.isResult(type)) {
|
||||
final Result r = (Result) entity;
|
||||
|
||||
if (r.getTitle() != null) {
|
||||
metadata.addAll(r.getTitle()
|
||||
.stream()
|
||||
.map(t -> mapStructuredProperty("title", t))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getBestaccessright() != null) {
|
||||
metadata.add(mapQualifier("bestaccessright", r.getBestaccessright()));
|
||||
}
|
||||
if (r.getAuthor() != null) {
|
||||
metadata.addAll(r.getAuthor()
|
||||
.stream()
|
||||
.map(a -> {
|
||||
final StringBuilder sb = new StringBuilder("<creator rank=\"" + a.getRank() + "\"");
|
||||
if (isNotBlank(a.getName())) {
|
||||
sb.append(" name=\"" + escapeXml(a.getName()) + "\"");
|
||||
}
|
||||
if (isNotBlank(a.getSurname())) {
|
||||
sb.append(" surname=\"" + escapeXml(a.getSurname()) + "\"");
|
||||
}
|
||||
if (a.getPid() != null) {
|
||||
a.getPid().stream()
|
||||
.filter(sp -> isNotBlank(sp.getQualifier().getClassid()) && isNotBlank(sp.getValue()))
|
||||
.forEach(sp -> {
|
||||
String pidType = escapeXml(sp.getQualifier().getClassid()).replaceAll("\\W", "");
|
||||
String pidValue = escapeXml(sp.getValue());
|
||||
|
||||
// ugly hack: some records provide swapped pidtype and pidvalue
|
||||
if (authorPidTypes.contains(pidValue.toLowerCase().trim())) {
|
||||
sb.append(String.format(" %s=\"%s\"", pidValue, pidType));
|
||||
} else {
|
||||
pidType = pidType.replaceAll("\\W", "").replaceAll("\\d", "");
|
||||
if (isNotBlank(pidType)) {
|
||||
sb.append(String.format(" %s=\"%s\"",
|
||||
pidType,
|
||||
pidValue.toLowerCase().replaceAll("orcid", "")));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
sb.append(">" + escapeXml(a.getFullname()) + "</creator>");
|
||||
return sb.toString();
|
||||
}).collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getContributor() != null) {
|
||||
metadata.addAll(r.getContributor()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("contributor", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getCountry() != null) {
|
||||
metadata.addAll(r.getCountry()
|
||||
.stream()
|
||||
.map(c -> mapQualifier("country", c))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getCoverage() != null) {
|
||||
metadata.addAll(r.getCoverage()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("coverage", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getDateofacceptance() != null) {
|
||||
metadata.add(asXmlElement("dateofacceptance", r.getDateofacceptance().getValue()));
|
||||
}
|
||||
if (r.getDescription() != null) {
|
||||
metadata.addAll(r.getDescription()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("description", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getEmbargoenddate() != null) {
|
||||
metadata.add(asXmlElement("embargoenddate", r.getEmbargoenddate().getValue()));
|
||||
}
|
||||
if (r.getSubject() != null) {
|
||||
metadata.addAll(r.getSubject()
|
||||
.stream()
|
||||
.map(s -> mapStructuredProperty("subject", s))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getLanguage() != null) {
|
||||
metadata.add(mapQualifier("language", r.getLanguage()));
|
||||
}
|
||||
if (r.getRelevantdate() != null) {
|
||||
metadata.addAll(r.getRelevantdate()
|
||||
.stream()
|
||||
.map(s -> mapStructuredProperty("relevantdate", s))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getPublisher() != null) {
|
||||
metadata.add(asXmlElement("publisher", r.getPublisher().getValue()));
|
||||
}
|
||||
if (r.getSource() != null) {
|
||||
metadata.addAll(r.getSource()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("source", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getFormat() != null) {
|
||||
metadata.addAll(r.getFormat()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("format", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (r.getResulttype() != null) {
|
||||
metadata.add(mapQualifier("resulttype", r.getResulttype()));
|
||||
}
|
||||
if (r.getResourcetype() != null) {
|
||||
metadata.add(mapQualifier("resourcetype", r.getResourcetype()));
|
||||
}
|
||||
|
||||
metadata.add(mapQualifier("bestaccessright", getBestAccessright(r)));
|
||||
|
||||
if (r.getContext() != null) {
|
||||
contextes.addAll(r.getContext()
|
||||
.stream()
|
||||
.map(c -> c.getId())
|
||||
.collect(Collectors.toList()));
|
||||
if (contextes.contains("dh-ch::subcommunity::2")) {
|
||||
contextes.add("clarin");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch (EntityType.valueOf(type)) {
|
||||
case publication:
|
||||
final Publication pub = (Publication) entity;
|
||||
|
||||
if (pub.getJournal() != null) {
|
||||
final Journal j = pub.getJournal();
|
||||
metadata.add(mapJournal(j));
|
||||
}
|
||||
|
||||
break;
|
||||
case dataset:
|
||||
final Dataset d = (Dataset) entity;
|
||||
if (d.getDevice() != null) {
|
||||
metadata.add(asXmlElement("device", d.getDevice().getValue()));
|
||||
}
|
||||
if (d.getLastmetadataupdate() != null) {
|
||||
metadata.add(asXmlElement("lastmetadataupdate", d.getLastmetadataupdate().getValue()));
|
||||
}
|
||||
if (d.getMetadataversionnumber() != null) {
|
||||
metadata.add(asXmlElement("metadataversionnumber", d.getMetadataversionnumber().getValue()));
|
||||
}
|
||||
if (d.getSize() != null) {
|
||||
metadata.add(asXmlElement("size", d.getSize().getValue()));
|
||||
}
|
||||
if (d.getStoragedate() != null) {
|
||||
metadata.add(asXmlElement("storagedate", d.getStoragedate().getValue()));
|
||||
}
|
||||
if (d.getVersion() != null) {
|
||||
metadata.add(asXmlElement("version", d.getVersion().getValue()));
|
||||
}
|
||||
//TODO d.getGeolocation()
|
||||
|
||||
break;
|
||||
case otherresearchproduct:
|
||||
final OtherResearchProduct orp = (OtherResearchProduct) entity;
|
||||
|
||||
if (orp.getContactperson() != null) {
|
||||
metadata.addAll(orp.getContactperson()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("contactperson", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
if (orp.getContactgroup() != null) {
|
||||
metadata.addAll(orp.getContactgroup()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("contactgroup", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (orp.getTool() != null) {
|
||||
metadata.addAll(orp.getTool()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("tool", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
break;
|
||||
case software:
|
||||
final Software s = (Software) entity;
|
||||
|
||||
if (s.getDocumentationUrl() != null) {
|
||||
metadata.addAll(s.getDocumentationUrl()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("documentationUrl", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (s.getLicense() != null) {
|
||||
metadata.addAll(s.getLicense()
|
||||
.stream()
|
||||
.map(l -> mapStructuredProperty("license", l))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (s.getCodeRepositoryUrl() != null) {
|
||||
metadata.add(asXmlElement("codeRepositoryUrl", s.getCodeRepositoryUrl().getValue()));
|
||||
}
|
||||
if (s.getProgrammingLanguage() != null) {
|
||||
metadata.add(mapQualifier("programmingLanguage", s.getProgrammingLanguage()));
|
||||
}
|
||||
break;
|
||||
case datasource:
|
||||
final Datasource ds = (Datasource) entity;
|
||||
|
||||
if (ds.getDatasourcetype() != null) {
|
||||
mapDatasourceType(metadata, ds.getDatasourcetype());
|
||||
}
|
||||
if (ds.getOpenairecompatibility() != null) {
|
||||
metadata.add(mapQualifier("openairecompatibility", ds.getOpenairecompatibility()));
|
||||
}
|
||||
if (ds.getOfficialname() != null) {
|
||||
metadata.add(asXmlElement("officialname", ds.getOfficialname().getValue()));
|
||||
}
|
||||
if (ds.getEnglishname() != null) {
|
||||
metadata.add(asXmlElement("englishname", ds.getEnglishname().getValue()));
|
||||
}
|
||||
if (ds.getWebsiteurl() != null) {
|
||||
metadata.add(asXmlElement("websiteurl", ds.getWebsiteurl().getValue()));
|
||||
}
|
||||
if (ds.getLogourl() != null) {
|
||||
metadata.add(asXmlElement("logourl", ds.getLogourl().getValue()));
|
||||
}
|
||||
if (ds.getContactemail() != null) {
|
||||
metadata.add(asXmlElement("contactemail", ds.getContactemail().getValue()));
|
||||
}
|
||||
if (ds.getNamespaceprefix() != null) {
|
||||
metadata.add(asXmlElement("namespaceprefix", ds.getNamespaceprefix().getValue()));
|
||||
}
|
||||
if (ds.getLatitude() != null) {
|
||||
metadata.add(asXmlElement("latitude", ds.getLatitude().getValue()));
|
||||
}
|
||||
if (ds.getLongitude() != null) {
|
||||
metadata.add(asXmlElement("longitude", ds.getLongitude().getValue()));
|
||||
}
|
||||
if (ds.getDateofvalidation() != null) {
|
||||
metadata.add(asXmlElement("dateofvalidation", ds.getDateofvalidation().getValue()));
|
||||
}
|
||||
if (ds.getDescription() != null) {
|
||||
metadata.add(asXmlElement("description", ds.getDescription().getValue()));
|
||||
}
|
||||
if (ds.getOdnumberofitems() != null) {
|
||||
metadata.add(asXmlElement("odnumberofitems", ds.getOdnumberofitems().getValue()));
|
||||
}
|
||||
if (ds.getOdnumberofitemsdate() != null) {
|
||||
metadata.add(asXmlElement("odnumberofitemsdate", ds.getOdnumberofitemsdate().getValue()));
|
||||
}
|
||||
if (ds.getOdpolicies() != null) {
|
||||
metadata.add(asXmlElement("odpolicies", ds.getOdpolicies().getValue()));
|
||||
}
|
||||
if (ds.getOdlanguages() != null) {
|
||||
metadata.addAll(ds.getOdlanguages()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("odlanguages", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (ds.getOdcontenttypes() != null) {
|
||||
metadata.addAll(ds.getOdcontenttypes()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("odcontenttypes", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (ds.getAccessinfopackage() != null) {
|
||||
metadata.addAll(ds.getAccessinfopackage()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("accessinfopackage", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (ds.getReleaseenddate() != null) {
|
||||
metadata.add(asXmlElement("releasestartdate", ds.getReleaseenddate().getValue()));
|
||||
}
|
||||
if (ds.getReleaseenddate() != null) {
|
||||
metadata.add(asXmlElement("releaseenddate", ds.getReleaseenddate().getValue()));
|
||||
}
|
||||
if (ds.getMissionstatementurl() != null) {
|
||||
metadata.add(asXmlElement("missionstatementurl", ds.getMissionstatementurl().getValue()));
|
||||
}
|
||||
if (ds.getDataprovider() != null) {
|
||||
metadata.add(asXmlElement("dataprovider", ds.getDataprovider().getValue().toString()));
|
||||
}
|
||||
if (ds.getServiceprovider() != null) {
|
||||
metadata.add(asXmlElement("serviceprovider", ds.getServiceprovider().getValue().toString()));
|
||||
}
|
||||
if (ds.getDatabaseaccesstype() != null) {
|
||||
metadata.add(asXmlElement("databaseaccesstype", ds.getDatabaseaccesstype().getValue()));
|
||||
}
|
||||
if (ds.getDatauploadtype() != null) {
|
||||
metadata.add(asXmlElement("datauploadtype", ds.getDatauploadtype().getValue()));
|
||||
}
|
||||
if (ds.getDatabaseaccessrestriction() != null) {
|
||||
metadata.add(asXmlElement("databaseaccessrestriction", ds.getDatabaseaccessrestriction().getValue()));
|
||||
}
|
||||
if (ds.getDatauploadrestriction() != null) {
|
||||
metadata.add(asXmlElement("datauploadrestriction", ds.getDatauploadrestriction().getValue()));
|
||||
}
|
||||
if (ds.getVersioning() != null) {
|
||||
metadata.add(asXmlElement("versioning", ds.getVersioning().getValue().toString()));
|
||||
}
|
||||
if (ds.getCitationguidelineurl() != null) {
|
||||
metadata.add(asXmlElement("citationguidelineurl", ds.getCitationguidelineurl().getValue()));
|
||||
}
|
||||
if (ds.getQualitymanagementkind() != null) {
|
||||
metadata.add(asXmlElement("qualitymanagementkind", ds.getQualitymanagementkind().getValue()));
|
||||
}
|
||||
if (ds.getPidsystems() != null) {
|
||||
metadata.add(asXmlElement("pidsystems", ds.getPidsystems().getValue()));
|
||||
}
|
||||
if (ds.getCertificates() != null) {
|
||||
metadata.add(asXmlElement("certificates", ds.getCertificates().getValue()));
|
||||
}
|
||||
if (ds.getPolicies() != null) {
|
||||
metadata.addAll(ds.getPolicies()
|
||||
.stream()
|
||||
.map(kv -> mapKeyValue("policies", kv))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (ds.getJournal() != null) {
|
||||
metadata.add(mapJournal(ds.getJournal()));
|
||||
}
|
||||
if (ds.getSubjects() != null) {
|
||||
metadata.addAll(ds.getSubjects()
|
||||
.stream()
|
||||
.map(sp -> mapStructuredProperty("subject", sp))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
break;
|
||||
case organization:
|
||||
final Organization o = (Organization) entity;
|
||||
|
||||
if (o.getLegalshortname() != null) {
|
||||
metadata.add(asXmlElement("legalshortname", o.getLegalshortname().getValue()));
|
||||
}
|
||||
if (o.getLegalname() != null) {
|
||||
metadata.add(asXmlElement("legalname", o.getLegalname().getValue()));
|
||||
}
|
||||
if (o.getAlternativeNames() != null) {
|
||||
metadata.addAll(o.getAlternativeNames()
|
||||
.stream()
|
||||
.map(c -> asXmlElement("alternativeNames", c.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (o.getWebsiteurl() != null) {
|
||||
metadata.add(asXmlElement("websiteurl", o.getWebsiteurl().getValue()));
|
||||
}
|
||||
if (o.getLogourl() != null) {
|
||||
metadata.add(asXmlElement("websiteurl", o.getLogourl().getValue()));
|
||||
}
|
||||
|
||||
if (o.getEclegalbody() != null) {
|
||||
metadata.add(asXmlElement("eclegalbody", o.getEclegalbody().getValue()));
|
||||
}
|
||||
if (o.getEclegalperson() != null) {
|
||||
metadata.add(asXmlElement("eclegalperson", o.getEclegalperson().getValue()));
|
||||
}
|
||||
if (o.getEcnonprofit() != null) {
|
||||
metadata.add(asXmlElement("ecnonprofit", o.getEcnonprofit().getValue()));
|
||||
}
|
||||
if (o.getEcresearchorganization() != null) {
|
||||
metadata.add(asXmlElement("ecresearchorganization", o.getEcresearchorganization().getValue()));
|
||||
}
|
||||
if (o.getEchighereducation() != null) {
|
||||
metadata.add(asXmlElement("echighereducation", o.getEchighereducation().getValue()));
|
||||
}
|
||||
if (o.getEcinternationalorganization() != null) {
|
||||
metadata.add(asXmlElement("ecinternationalorganizationeurinterests", o.getEcinternationalorganization().getValue()));
|
||||
}
|
||||
if (o.getEcinternationalorganization() != null) {
|
||||
metadata.add(asXmlElement("ecinternationalorganization", o.getEcinternationalorganization().getValue()));
|
||||
}
|
||||
if (o.getEcenterprise() != null) {
|
||||
metadata.add(asXmlElement("ecenterprise", o.getEcenterprise().getValue()));
|
||||
}
|
||||
if (o.getEcsmevalidated() != null) {
|
||||
metadata.add(asXmlElement("ecsmevalidated", o.getEcsmevalidated().getValue()));
|
||||
}
|
||||
if (o.getEcnutscode() != null) {
|
||||
metadata.add(asXmlElement("ecnutscode", o.getEcnutscode().getValue()));
|
||||
}
|
||||
if (o.getCountry() != null) {
|
||||
metadata.add(mapQualifier("country", o.getCountry()));
|
||||
}
|
||||
|
||||
break;
|
||||
case project:
|
||||
|
||||
final Project p = (Project) entity;
|
||||
|
||||
if (p.getWebsiteurl() != null) {
|
||||
metadata.add(asXmlElement("websiteurl", p.getWebsiteurl().getValue()));
|
||||
}
|
||||
if (p.getCode() != null) {
|
||||
metadata.add(asXmlElement("code", p.getCode().getValue()));
|
||||
}
|
||||
if (p.getAcronym() != null) {
|
||||
metadata.add(asXmlElement("acronym", p.getAcronym().getValue()));
|
||||
}
|
||||
if (p.getTitle() != null) {
|
||||
metadata.add(asXmlElement("title", p.getTitle().getValue()));
|
||||
}
|
||||
if (p.getStartdate() != null) {
|
||||
metadata.add(asXmlElement("startdate", p.getStartdate().getValue()));
|
||||
}
|
||||
if (p.getEnddate() != null) {
|
||||
metadata.add(asXmlElement("enddate", p.getEnddate().getValue()));
|
||||
}
|
||||
if (p.getCallidentifier() != null) {
|
||||
metadata.add(asXmlElement("callidentifier", p.getCallidentifier().getValue()));
|
||||
}
|
||||
if (p.getKeywords() != null) {
|
||||
metadata.add(asXmlElement("keywords", p.getKeywords().getValue()));
|
||||
}
|
||||
if (p.getDuration() != null) {
|
||||
metadata.add(asXmlElement("duration", p.getDuration().getValue()));
|
||||
}
|
||||
if (p.getEcarticle29_3() != null) {
|
||||
metadata.add(asXmlElement("ecarticle29_3", p.getEcarticle29_3().getValue()));
|
||||
}
|
||||
if (p.getSubjects() != null) {
|
||||
metadata.addAll(p.getSubjects()
|
||||
.stream()
|
||||
.map(sp -> mapStructuredProperty("subject", sp))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (p.getContracttype() != null) {
|
||||
metadata.add(mapQualifier("contracttype", p.getContracttype()));
|
||||
}
|
||||
if (p.getEcsc39() != null) {
|
||||
metadata.add(asXmlElement("ecsc39", p.getEcsc39().getValue()));
|
||||
}
|
||||
if (p.getContactfullname() != null) {
|
||||
metadata.add(asXmlElement("contactfullname", p.getContactfullname().getValue()));
|
||||
}
|
||||
if (p.getContactfax() != null) {
|
||||
metadata.add(asXmlElement("contactfax", p.getContactfax().getValue()));
|
||||
}
|
||||
if (p.getContactphone() != null) {
|
||||
metadata.add(asXmlElement("contactphone", p.getContactphone().getValue()));
|
||||
}
|
||||
if (p.getContactemail() != null) {
|
||||
metadata.add(asXmlElement("contactemail", p.getContactemail().getValue()));
|
||||
}
|
||||
if (p.getSummary() != null) {
|
||||
metadata.add(asXmlElement("summary", p.getSummary().getValue()));
|
||||
}
|
||||
if (p.getCurrency() != null) {
|
||||
metadata.add(asXmlElement("currency", p.getCurrency().getValue()));
|
||||
}
|
||||
if (p.getTotalcost() != null) {
|
||||
metadata.add(asXmlElement("totalcost", p.getTotalcost().toString()));
|
||||
}
|
||||
if (p.getFundedamount() != null) {
|
||||
metadata.add(asXmlElement("fundedamount", p.getFundedamount().toString()));
|
||||
}
|
||||
if (p.getFundingtree() != null) {
|
||||
metadata.addAll(p.getFundingtree()
|
||||
.stream()
|
||||
.map(ft -> asXmlElement("fundingtree", ft.getValue()))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid entity type: " + type);
|
||||
}
|
||||
|
||||
return metadata;
|
||||
}
|
||||
|
||||
private void mapDatasourceType(List<String> metadata, final Qualifier dsType) {
|
||||
metadata.add(mapQualifier("datasourcetype", dsType));
|
||||
|
||||
if (specialDatasourceTypes.contains(dsType.getClassid())) {
|
||||
dsType.setClassid("other");
|
||||
dsType.setClassname("other");
|
||||
}
|
||||
metadata.add(mapQualifier("datasourcetypeui", dsType));
|
||||
}
|
||||
|
||||
private Qualifier getBestAccessright(final Result r) {
|
||||
Qualifier bestAccessRight = new Qualifier();
|
||||
bestAccessRight.setClassid("UNKNOWN");
|
||||
bestAccessRight.setClassname("not available");
|
||||
bestAccessRight.setSchemeid("dnet:access_modes");
|
||||
bestAccessRight.setSchemename("dnet:access_modes");
|
||||
|
||||
final LicenseComparator lc = new LicenseComparator();
|
||||
for (final Instance instance : r.getInstance()) {
|
||||
if (lc.compare(bestAccessRight, instance.getAccessright()) > 0) {
|
||||
bestAccessRight = instance.getAccessright();
|
||||
}
|
||||
}
|
||||
return bestAccessRight;
|
||||
}
|
||||
|
||||
private List<String> listRelations(final JoinedEntity je, TemplateFactory templateFactory) {
|
||||
final List<String> rels = Lists.newArrayList();
|
||||
|
||||
for (final Tuple2 link : je.getLinks()) {
|
||||
|
||||
final Relation rel = link.getRelation();
|
||||
final RelatedEntity re = link.getRelatedEntity();
|
||||
final String targetType = link.getRelatedEntity().getType();
|
||||
|
||||
final List<String> metadata = Lists.newArrayList();
|
||||
switch (EntityType.valueOf(targetType)) {
|
||||
case publication:
|
||||
case dataset:
|
||||
case otherresearchproduct:
|
||||
case software:
|
||||
if (re.getTitle() != null && isNotBlank(re.getTitle().getValue())) {
|
||||
metadata.add(mapStructuredProperty("title", re.getTitle()));
|
||||
}
|
||||
if (isNotBlank(re.getDateofacceptance())) {
|
||||
metadata.add(asXmlElement("dateofacceptance", re.getDateofacceptance()));
|
||||
}
|
||||
if (isNotBlank(re.getPublisher())) {
|
||||
metadata.add(asXmlElement("publisher", re.getPublisher()));
|
||||
}
|
||||
if (isNotBlank(re.getCodeRepositoryUrl())) {
|
||||
metadata.add(asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl()));
|
||||
}
|
||||
if (re.getResulttype() != null & !re.getResulttype().isBlank()) {
|
||||
metadata.add(mapQualifier("resulttype", re.getResulttype()));
|
||||
}
|
||||
if (re.getCollectedfrom() != null) {
|
||||
metadata.addAll(re.getCollectedfrom()
|
||||
.stream()
|
||||
.map(kv -> mapKeyValue("collectedfrom", kv))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
if (re.getPid() != null) {
|
||||
metadata.addAll(re.getPid()
|
||||
.stream()
|
||||
.map(p -> mapStructuredProperty("pid", p))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
break;
|
||||
case datasource:
|
||||
if (isNotBlank(re.getOfficialname())) {
|
||||
metadata.add(asXmlElement("officialname", re.getOfficialname()));
|
||||
}
|
||||
if (re.getDatasourcetype() != null & !re.getDatasourcetype().isBlank()) {
|
||||
mapDatasourceType(metadata, re.getDatasourcetype());
|
||||
}
|
||||
if (re.getOpenairecompatibility() != null & !re.getOpenairecompatibility().isBlank()) {
|
||||
metadata.add(mapQualifier("openairecompatibility", re.getOpenairecompatibility()));
|
||||
}
|
||||
break;
|
||||
case organization:
|
||||
if (isNotBlank(re.getLegalname())) {
|
||||
metadata.add(asXmlElement("legalname", re.getLegalname()));
|
||||
}
|
||||
if (isNotBlank(re.getLegalshortname())) {
|
||||
metadata.add(asXmlElement("legalshortname", re.getLegalshortname()));
|
||||
}
|
||||
if (re.getCountry() != null & !re.getCountry().isBlank()) {
|
||||
metadata.add(mapQualifier("country", re.getCountry()));
|
||||
}
|
||||
break;
|
||||
case project:
|
||||
if (isNotBlank(re.getProjectTitle())) {
|
||||
metadata.add(asXmlElement("title", re.getProjectTitle()));
|
||||
}
|
||||
if (isNotBlank(re.getCode())) {
|
||||
metadata.add(asXmlElement("code", re.getCode()));
|
||||
}
|
||||
if (isNotBlank(re.getAcronym())) {
|
||||
metadata.add(asXmlElement("acronym", re.getAcronym()));
|
||||
}
|
||||
if (re.getContracttype() != null & !re.getContracttype().isBlank()) {
|
||||
metadata.add(mapQualifier("contracttype", re.getContracttype()));
|
||||
}
|
||||
if (re.getFundingtree() != null) {
|
||||
metadata.addAll(re.getFundingtree()
|
||||
.stream()
|
||||
.peek(ft -> fillContextMap(ft))
|
||||
.map(ft -> getRelFundingTree(ft))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
break;
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid target type: " + targetType);
|
||||
|
||||
}
|
||||
final DataInfo info = rel.getDataInfo();
|
||||
|
||||
rels.add(templateFactory.getRel(
|
||||
targetType,
|
||||
rel.getTarget(),
|
||||
Sets.newHashSet(metadata),
|
||||
getInverseRelClass(rel.getRelClass()),
|
||||
getScheme(targetType, re.getType()),
|
||||
info));
|
||||
}
|
||||
return rels;
|
||||
}
|
||||
|
||||
private List<String> listChildren(final JoinedEntity je, TemplateFactory templateFactory) {
|
||||
|
||||
final List<String> children = Lists.newArrayList();
|
||||
|
||||
if (MainEntityType.result.toString().equals(getMainType(je.getType()))) {
|
||||
final List<Instance> instances = ((Result) je.getEntity()).getInstance();
|
||||
if (instances != null) {
|
||||
for (final Instance instance : ((Result) je.getEntity()).getInstance()) {
|
||||
|
||||
final List<String> fields = Lists.newArrayList();
|
||||
|
||||
if (instance.getAccessright() != null && !instance.getAccessright().isBlank()) {
|
||||
fields.add(mapQualifier("accessright", instance.getAccessright()));
|
||||
}
|
||||
if (instance.getCollectedfrom() != null) {
|
||||
fields.add(mapKeyValue("collectedfrom", instance.getCollectedfrom()));
|
||||
}
|
||||
if (instance.getHostedby() != null) {
|
||||
fields.add(mapKeyValue("hostedby", instance.getHostedby()));
|
||||
}
|
||||
if (instance.getDateofacceptance() != null && isNotBlank(instance.getDateofacceptance().getValue())) {
|
||||
fields.add(asXmlElement("dateofacceptance", instance.getDateofacceptance().getValue()));
|
||||
}
|
||||
if (instance.getInstancetype() != null && !instance.getInstancetype().isBlank()) {
|
||||
fields.add(mapQualifier("instancetype", instance.getInstancetype()));
|
||||
}
|
||||
if (isNotBlank(instance.getDistributionlocation())) {
|
||||
fields.add(asXmlElement("distributionlocation", instance.getDistributionlocation()));
|
||||
}
|
||||
if (instance.getRefereed() != null && isNotBlank(instance.getRefereed().getValue())) {
|
||||
fields.add(asXmlElement("refereed", instance.getRefereed().getValue()));
|
||||
}
|
||||
if (instance.getProcessingchargeamount() != null && isNotBlank(instance.getProcessingchargeamount().getValue())) {
|
||||
fields.add(asXmlElement("processingchargeamount", instance.getProcessingchargeamount().getValue()));
|
||||
}
|
||||
if (instance.getProcessingchargecurrency() != null && isNotBlank(instance.getProcessingchargecurrency().getValue())) {
|
||||
fields.add(asXmlElement("processingchargecurrency", instance.getProcessingchargecurrency().getValue()));
|
||||
}
|
||||
|
||||
children.add(templateFactory.getInstance(instance.getHostedby().getKey(), fields, instance.getUrl()));
|
||||
}
|
||||
}
|
||||
final List<ExternalReference> ext = ((Result) je.getEntity()).getExternalReference();
|
||||
if (ext != null) {
|
||||
for (final ExternalReference er : ((Result) je.getEntity()).getExternalReference()) {
|
||||
|
||||
final List<String> fields = Lists.newArrayList();
|
||||
|
||||
if (isNotBlank(er.getSitename())) {
|
||||
fields.add(asXmlElement("sitename", er.getSitename()));
|
||||
}
|
||||
if (isNotBlank(er.getLabel())) {
|
||||
fields.add(asXmlElement("label", er.getLabel()));
|
||||
}
|
||||
if (isNotBlank(er.getUrl())) {
|
||||
fields.add(asXmlElement("url", er.getUrl()));
|
||||
}
|
||||
if (isNotBlank(er.getDescription())) {
|
||||
fields.add(asXmlElement("description", er.getDescription()));
|
||||
}
|
||||
if (isNotBlank(er.getUrl())) {
|
||||
fields.add(mapQualifier("qualifier", er.getQualifier()));
|
||||
}
|
||||
if (isNotBlank(er.getRefidentifier())) {
|
||||
fields.add(asXmlElement("refidentifier", er.getRefidentifier()));
|
||||
}
|
||||
if (isNotBlank(er.getQuery())) {
|
||||
fields.add(asXmlElement("query", er.getQuery()));
|
||||
}
|
||||
|
||||
children.add(templateFactory.getChild("externalreference", null, fields));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return children;
|
||||
}
|
||||
|
||||
private List<String> listExtraInfo(JoinedEntity je) {
|
||||
final List<ExtraInfo> extraInfo = je.getEntity().getExtraInfo();
|
||||
return extraInfo != null ? extraInfo
|
||||
.stream()
|
||||
.map(e -> mapExtraInfo(e))
|
||||
.collect(Collectors.toList()) : Lists.newArrayList();
|
||||
}
|
||||
|
||||
private List<String> buildContexts(final String type) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
if ((contextMapper != null) && !contextMapper.isEmpty() && MainEntityType.result.toString().equals(type)) {
|
||||
|
||||
XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot");
|
||||
|
||||
for (final String context : contextes) {
|
||||
|
||||
String id = "";
|
||||
for (final String token : Splitter.on("::").split(context)) {
|
||||
id += token;
|
||||
|
||||
final ContextDef def = contextMapper.get(id);
|
||||
|
||||
if (def == null) {
|
||||
continue;
|
||||
// throw new IllegalStateException(String.format("cannot find context for id '%s'", id));
|
||||
}
|
||||
|
||||
if (def.getName().equals("context")) {
|
||||
final String xpath = "//context/@id='" + def.getId() + "'";
|
||||
if (!document.gotoRoot().rawXpathBoolean(xpath, new Object())) {
|
||||
document = addContextDef(document.gotoRoot(), def);
|
||||
}
|
||||
}
|
||||
|
||||
if (def.getName().equals("category")) {
|
||||
final String rootId = substringBefore(def.getId(), "::");
|
||||
document = addContextDef(document.gotoRoot().gotoTag("//context[./@id='" + rootId + "']", new Object()), def);
|
||||
}
|
||||
|
||||
if (def.getName().equals("concept")) {
|
||||
document = addContextDef(document, def).gotoParent();
|
||||
}
|
||||
id += "::";
|
||||
}
|
||||
}
|
||||
final Transformer transformer = getTransformer();
|
||||
for (final org.w3c.dom.Element x : document.gotoRoot().getChildElement()) {
|
||||
try {
|
||||
res.add(asStringElement(x, transformer));
|
||||
} catch (final TransformerException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
private Transformer getTransformer() {
|
||||
try {
|
||||
Transformer transformer = TransformerFactory.newInstance().newTransformer();
|
||||
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
|
||||
return transformer;
|
||||
} catch (TransformerConfigurationException e) {
|
||||
throw new IllegalStateException("unable to create javax.xml.transform.Transformer", e);
|
||||
}
|
||||
}
|
||||
|
||||
private XMLTag addContextDef(final XMLTag tag, final ContextDef def) {
|
||||
tag.addTag(def.getName()).addAttribute("id", def.getId()).addAttribute("label", def.getLabel());
|
||||
if ((def.getType() != null) && !def.getType().isEmpty()) {
|
||||
tag.addAttribute("type", def.getType());
|
||||
}
|
||||
return tag;
|
||||
}
|
||||
|
||||
private String asStringElement(final org.w3c.dom.Element element, final Transformer transformer) throws TransformerException {
|
||||
final StringWriter buffer = new StringWriter();
|
||||
transformer.transform(new DOMSource(element), new StreamResult(buffer));
|
||||
return buffer.toString();
|
||||
}
|
||||
|
||||
private void fillContextMap(final String xmlTree) {
|
||||
|
||||
Document fundingPath;
|
||||
try {
|
||||
fundingPath = new SAXReader().read(new StringReader(xmlTree));
|
||||
} catch (final DocumentException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
try {
|
||||
final Node funder = fundingPath.selectSingleNode("//funder");
|
||||
|
||||
if (funder != null) {
|
||||
|
||||
final String funderShortName = funder.valueOf("./shortname");
|
||||
contextes.add(funderShortName);
|
||||
|
||||
contextMapper.put(funderShortName, new ContextDef(funderShortName, funder.valueOf("./name"), "context", "funding"));
|
||||
final Node level0 = fundingPath.selectSingleNode("//funding_level_0");
|
||||
if (level0 != null) {
|
||||
final String level0Id = Joiner.on("::").join(funderShortName, level0.valueOf("./name"));
|
||||
contextMapper.put(level0Id, new ContextDef(level0Id, level0.valueOf("./description"), "category", ""));
|
||||
final Node level1 = fundingPath.selectSingleNode("//funding_level_1");
|
||||
if (level1 == null) {
|
||||
contextes.add(level0Id);
|
||||
} else {
|
||||
final String level1Id = Joiner.on("::").join(level0Id, level1.valueOf("./name"));
|
||||
contextMapper.put(level1Id, new ContextDef(level1Id, level1.valueOf("./description"), "concept", ""));
|
||||
final Node level2 = fundingPath.selectSingleNode("//funding_level_2");
|
||||
if (level2 == null) {
|
||||
contextes.add(level1Id);
|
||||
} else {
|
||||
final String level2Id = Joiner.on("::").join(level1Id, level2.valueOf("./name"));
|
||||
contextMapper.put(level2Id, new ContextDef(level2Id, level2.valueOf("./description"), "concept", ""));
|
||||
contextes.add(level2Id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (final NullPointerException e) {
|
||||
throw new IllegalArgumentException("malformed funding path: " + xmlTree, e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private String getRelFundingTree(final String xmlTree) {
|
||||
String funding = "<funding>";
|
||||
try {
|
||||
final Document ftree = new SAXReader().read(new StringReader(xmlTree));
|
||||
funding = "<funding>";
|
||||
|
||||
funding += getFunderElement(ftree);
|
||||
|
||||
for (final Object o : Lists.reverse(ftree.selectNodes("//fundingtree//*[starts-with(local-name(),'funding_level_')]"))) {
|
||||
final Element e = (Element) o;
|
||||
final String _id = e.valueOf("./id");
|
||||
funding += "<" + e.getName() + " name=\"" + escapeXml(e.valueOf("./name")) + "\">" + escapeXml(_id) + "</" + e.getName() + ">";
|
||||
}
|
||||
} catch (final DocumentException e) {
|
||||
throw new IllegalArgumentException("unable to parse funding tree: " + xmlTree + "\n" + e.getMessage());
|
||||
} finally {
|
||||
funding += "</funding>";
|
||||
}
|
||||
return funding;
|
||||
}
|
||||
|
||||
private String getFunderElement(final Document ftree) {
|
||||
final String funderId = ftree.valueOf("//fundingtree/funder/id/text()");
|
||||
final String funderShortName = ftree.valueOf("//fundingtree/funder/shortname/text()");
|
||||
final String funderName = ftree.valueOf("//fundingtree/funder/name/text()");
|
||||
final String funderJurisdiction = ftree.valueOf("//fundingtree/funder/jurisdiction/text()");
|
||||
|
||||
return "<funder id=\"" + escapeXml(funderId) + "\" shortname=\"" + escapeXml(funderShortName) + "\" name=\"" + escapeXml(funderName)
|
||||
+ "\" jurisdiction=\"" + escapeXml(funderJurisdiction) + "\" />";
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,151 @@
|
|||
package eu.dnetlib.dhp.graph.utils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
import static eu.dnetlib.dhp.graph.utils.GraphMappingUtils.removePrefix;
|
||||
import static org.apache.commons.lang3.StringUtils.isBlank;
|
||||
import static org.apache.commons.lang3.StringUtils.isNotBlank;
|
||||
|
||||
public class XmlSerializationUtils {
|
||||
|
||||
// XML 1.0
|
||||
// #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
|
||||
private final static String xml10pattern = "[^"
|
||||
+ "\u0009\r\n"
|
||||
+ "\u0020-\uD7FF"
|
||||
+ "\uE000-\uFFFD"
|
||||
+ "\ud800\udc00-\udbff\udfff"
|
||||
+ "]";
|
||||
|
||||
public static String mapJournal(Journal j) {
|
||||
final String attrs = new StringBuilder()
|
||||
.append(attr("issn", j.getIssnPrinted()))
|
||||
.append(attr("eissn", j.getIssnOnline()))
|
||||
.append(attr("lissn", j.getIssnLinking()))
|
||||
.append(attr("ep", j.getEp()))
|
||||
.append(attr("iss", j.getIss()))
|
||||
.append(attr("sp", j.getSp()))
|
||||
.append(attr("vol", j.getVol()))
|
||||
.toString()
|
||||
.trim();
|
||||
|
||||
return new StringBuilder()
|
||||
.append("<journal")
|
||||
.append(isNotBlank(attrs) ? (" " + attrs) : "")
|
||||
.append(">")
|
||||
.append(escapeXml(j.getName()))
|
||||
.append("</journal>")
|
||||
.toString();
|
||||
}
|
||||
|
||||
private static String attr(final String name, final String value) {
|
||||
return isNotBlank(value) ? name + "=\"" + escapeXml(value) + "\" " : "";
|
||||
}
|
||||
|
||||
public static String mapStructuredProperty(String name, StructuredProperty t) {
|
||||
return asXmlElement(name, t.getValue(), t.getQualifier(), t.getDataInfo() != null ? t.getDataInfo() : null);
|
||||
}
|
||||
|
||||
public static String mapQualifier(String name, Qualifier q) {
|
||||
return asXmlElement(name, "", q, null);
|
||||
}
|
||||
|
||||
public static String escapeXml(final String value) {
|
||||
return value
|
||||
.replaceAll("&", "&")
|
||||
.replaceAll("<", "<")
|
||||
.replaceAll(">", ">")
|
||||
.replaceAll("\"", """)
|
||||
.replaceAll("'", "'")
|
||||
.replaceAll(xml10pattern, "");
|
||||
}
|
||||
|
||||
public static String parseDataInfo(final DataInfo dataInfo) {
|
||||
return new StringBuilder()
|
||||
.append("<datainfo>")
|
||||
.append(asXmlElement("inferred", dataInfo.getInferred() + ""))
|
||||
.append(asXmlElement("deletedbyinference", dataInfo.getDeletedbyinference() + ""))
|
||||
.append(asXmlElement("trust", dataInfo.getTrust() + ""))
|
||||
.append(asXmlElement("inferenceprovenance", dataInfo.getInferenceprovenance() + ""))
|
||||
.append(asXmlElement("provenanceaction", null, dataInfo.getProvenanceaction(), null))
|
||||
.append("</datainfo>")
|
||||
.toString();
|
||||
}
|
||||
|
||||
private static StringBuilder dataInfoAsAttributes(final StringBuilder sb, final DataInfo info) {
|
||||
return sb
|
||||
.append(attr("inferred", info.getInferred() != null ? info.getInferred().toString() : ""))
|
||||
.append(attr("inferenceprovenance", info.getInferenceprovenance()))
|
||||
.append(attr("provenanceaction", info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : ""))
|
||||
.append(attr("trust", info.getTrust()));
|
||||
}
|
||||
|
||||
public static String mapKeyValue(final String name, final KeyValue kv) {
|
||||
return new StringBuilder()
|
||||
.append("<")
|
||||
.append(name)
|
||||
.append(" name=\"")
|
||||
.append(escapeXml(kv.getValue()))
|
||||
.append("\" id=\"")
|
||||
.append(escapeXml(removePrefix(kv.getKey())))
|
||||
.append("\"/>")
|
||||
.toString();
|
||||
}
|
||||
|
||||
public static String mapExtraInfo(final ExtraInfo e) {
|
||||
return new StringBuilder("<extraInfo ")
|
||||
.append("name=\"" + e.getName() + "\" ")
|
||||
.append("typology=\"" + e.getTypology() + "\" ")
|
||||
.append("provenance=\"" + e.getProvenance() + "\" ")
|
||||
.append("trust=\"" + e.getTrust() + "\"")
|
||||
.append(">")
|
||||
.append(e.getValue())
|
||||
.append("</extraInfo>")
|
||||
.toString();
|
||||
}
|
||||
|
||||
public static String asXmlElement(final String name, final String value) {
|
||||
return asXmlElement(name, value, null, null);
|
||||
}
|
||||
|
||||
public static String asXmlElement(final String name, final String value, final Qualifier q, final DataInfo info) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("<");
|
||||
sb.append(name);
|
||||
if (q != null) {
|
||||
sb.append(getAttributes(q));
|
||||
}
|
||||
if (info != null) {
|
||||
sb
|
||||
.append(" ")
|
||||
.append(attr("inferred", info.getInferred() != null ? info.getInferred().toString() : ""))
|
||||
.append(attr("inferenceprovenance", info.getInferenceprovenance()))
|
||||
.append(attr("provenanceaction", info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : ""))
|
||||
.append(attr("trust", info.getTrust()));
|
||||
}
|
||||
if (isBlank(value)) {
|
||||
sb.append("/>");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
sb.append(">");
|
||||
sb.append(escapeXml(value));
|
||||
sb.append("</");
|
||||
sb.append(name);
|
||||
sb.append(">");
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public static String getAttributes(final Qualifier q) {
|
||||
if (q == null || q.isBlank()) return "";
|
||||
|
||||
return new StringBuilder(" ")
|
||||
.append(attr("classid", q.getClassid()))
|
||||
.append(attr("classname", q.getClassname()))
|
||||
.append(attr("schemeid", q.getSchemeid()))
|
||||
.append(attr("schemename", q.getSchemename()))
|
||||
.toString();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
net.sf.saxon.TransformerFactoryImpl
|
|
@ -0,0 +1,6 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
|
||||
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true}
|
||||
]
|
|
@ -0,0 +1,7 @@
|
|||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read the XML records", "paramRequired": true},
|
||||
{"paramName":"f", "paramLongName":"format", "paramDescription": "MDFormat name found in the IS profile", "paramRequired": true},
|
||||
{"paramName":"b", "paramLongName":"batchSize", "paramDescription": "size of the batch of documents sent to solr", "paramRequired": false}
|
||||
]
|
|
@ -0,0 +1,34 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_db_name</name>
|
||||
<value>openaire</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/applicationHistory</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,99 @@
|
|||
<workflow-app name="index_infospace_graph" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>hive_db_name</name>
|
||||
<description>the target hive database name</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="reuse_records"/>
|
||||
|
||||
<decision name="reuse_records">
|
||||
<switch>
|
||||
<case to="adjancency_lists">${wf:conf('reuseRecords') eq false}</case>
|
||||
<case to="to_solr_index">${wf:conf('reuseRecords') eq true}</case>
|
||||
<default to="adjancency_lists"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="adjancency_lists">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>build_adjacency_lists</name>
|
||||
<class>eu.dnetlib.dhp.graph.SparkXmlRecordBuilderJob</class>
|
||||
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
|
||||
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn</arg>
|
||||
<arg>-is</arg> <arg>${isLookupUrl}</arg>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
</spark>
|
||||
<ok to="to_solr_index"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="to_solr_index">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>to_solr_index</name>
|
||||
<class>eu.dnetlib.dhp.graph.SparkXmlIndexingJob</class>
|
||||
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing}
|
||||
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
|
||||
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn</arg>
|
||||
<arg>-is</arg> <arg>${isLookupUrl}</arg>
|
||||
<arg>--sourcePath</arg><arg>${outputPath}/xml</arg>
|
||||
<arg>--format</arg><arg>${format}</arg>
|
||||
<arg>--batchSize</arg><arg>${batchSize}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,3 @@
|
|||
<name$if(hasId)$ objidentifier="$id$"$else$$endif>>
|
||||
$metadata:{ it | $it$ }$
|
||||
</name>
|
|
@ -0,0 +1,10 @@
|
|||
<oaf:$name$>
|
||||
$metadata:{ it | $it$ }$
|
||||
<rels>
|
||||
$rels:{ it | $it$ }$
|
||||
</rels>
|
||||
<children>
|
||||
$children:{ it | $it$ }$
|
||||
</children>
|
||||
</oaf:$name$>
|
||||
$extrainfo:{ it | $it$ }$
|
|
@ -0,0 +1,4 @@
|
|||
<instance id="$instanceId$">
|
||||
$metadata:{ it | $it$ }$
|
||||
$webresources:{ it | $it$ }$
|
||||
</instance>
|
|
@ -0,0 +1,17 @@
|
|||
<?xml version="1.0"?>
|
||||
<record>
|
||||
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header>
|
||||
<dri:objIdentifier>$id$</dri:objIdentifier>
|
||||
<dri:dateOfCollection>$dateofcollection$</dri:dateOfCollection>
|
||||
<dri:dateOfTransformation>$dateoftransformation$</dri:dateOfTransformation>
|
||||
</header>
|
||||
<metadata>
|
||||
<oaf:entity xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xsi:schemaLocation="http://namespace.openaire.eu/oaf $schemaLocation$">
|
||||
$it$
|
||||
</oaf:entity>
|
||||
</metadata>
|
||||
</result>
|
||||
</record>
|
|
@ -0,0 +1,4 @@
|
|||
<rel inferred="$inferred$" trust="$trust$" inferenceprovenance="$inferenceprovenance$" provenanceaction="$provenanceaction$">
|
||||
<to class="$class$" scheme="$scheme$" type="$type$">$objIdentifier$</to>
|
||||
$metadata:{ it | $it$ }$
|
||||
</rel>
|
|
@ -0,0 +1,3 @@
|
|||
<webresource>
|
||||
<url>$identifier$</url>
|
||||
</webresource>
|
|
@ -18,6 +18,7 @@
|
|||
<module>dhp-distcp</module>
|
||||
<module>dhp-graph-mapper</module>
|
||||
<module>dhp-dedup</module>
|
||||
<module>dhp-graph-provision</module>
|
||||
</modules>
|
||||
|
||||
<pluginRepositories>
|
||||
|
|
135
pom.xml
135
pom.xml
|
@ -100,7 +100,13 @@
|
|||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-client</artifactId>
|
||||
<artifactId>hadoop-common</artifactId>
|
||||
<version>${dhp.hadoop.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-client</artifactId>
|
||||
<version>${dhp.hadoop.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
@ -148,11 +154,11 @@
|
|||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
<version>9.5.1-5</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sf.saxon</groupId>
|
||||
<artifactId>Saxon-HE</artifactId>
|
||||
<version>9.9.1-6</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>dom4j</groupId>
|
||||
|
@ -173,6 +179,56 @@
|
|||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.mycila.xmltool</groupId>
|
||||
<artifactId>xmltool</artifactId>
|
||||
<version>3.3</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.solr</groupId>
|
||||
<artifactId>solr-solrj</artifactId>
|
||||
<version>7.5.0</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<artifactId>*</artifactId>
|
||||
<groupId>*</groupId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.lucidworks.spark</groupId>
|
||||
<artifactId>spark-solr</artifactId>
|
||||
<version>3.6.0</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<artifactId>*</artifactId>
|
||||
<groupId>*</groupId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
<version>4.5.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpmime</artifactId>
|
||||
<version>4.5.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.noggit</groupId>
|
||||
<artifactId>noggit</artifactId>
|
||||
<version>0.8</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.zookeeper</groupId>
|
||||
<artifactId>zookeeper</artifactId>
|
||||
<version>3.4.11</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>net.schmizz</groupId>
|
||||
<artifactId>sshj</artifactId>
|
||||
<version>0.10.0</version>
|
||||
|
@ -204,8 +260,17 @@
|
|||
<artifactId>dnet-pace-core</artifactId>
|
||||
<version>4.0.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>cnr-rmi-api</artifactId>
|
||||
<version>[2.0.0,3.0.0)</version>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.cxf</groupId>
|
||||
<artifactId>cxf-rt-transports-http</artifactId>
|
||||
<version>3.1.5</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>javax.persistence</groupId>
|
||||
<artifactId>javax.persistence-api</artifactId>
|
||||
|
@ -213,32 +278,36 @@
|
|||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.rabbitmq</groupId>
|
||||
<artifactId>amqp-client</artifactId>
|
||||
<version>5.6.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
<version>2.4.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.arakelian</groupId>
|
||||
<artifactId>java-jq</artifactId>
|
||||
<version>0.10.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>edu.cmu</groupId>
|
||||
<artifactId>secondstring</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.mongodb</groupId>
|
||||
<artifactId>mongo-java-driver</artifactId>
|
||||
<version>${mongodb.driver.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.rabbitmq</groupId>
|
||||
<artifactId>amqp-client</artifactId>
|
||||
<version>5.6.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
<version>2.4.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.arakelian</groupId>
|
||||
<artifactId>java-jq</artifactId>
|
||||
<version>0.10.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>edu.cmu</groupId>
|
||||
<artifactId>secondstring</artifactId>
|
||||
<version>1.0.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.mongodb</groupId>
|
||||
<artifactId>mongo-java-driver</artifactId>
|
||||
<version>${mongodb.driver.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.antlr</groupId>
|
||||
<artifactId>stringtemplate</artifactId>
|
||||
<version>4.0</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.oozie</groupId>
|
||||
|
|
Loading…
Reference in New Issue