mergin with branch beta
commit
b4eb026c8b
@ -1,2 +1,2 @@
|
||||
# dnet-hadoop
|
||||
Dnet-hadoop is a tool for
|
||||
Dnet-hadoop is the project that defined all the OOZIE workflows for the OpenAIRE Graph construction, processing, provisioning.
|
@ -0,0 +1,73 @@
|
||||
package eu.dnetlib.dhp.actionmanager.scholix
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
object SparkCreateActionset {
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/generate_actionset.json")).mkString)
|
||||
parser.parseArgument(args)
|
||||
|
||||
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
|
||||
val targetPath = parser.get("targetPath")
|
||||
log.info(s"targetPath -> $targetPath")
|
||||
|
||||
val workingDirFolder = parser.get("workingDirFolder")
|
||||
log.info(s"workingDirFolder -> $workingDirFolder")
|
||||
|
||||
implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val resultEncoders: Encoder[Result] = Encoders.kryo[Result]
|
||||
implicit val relationEncoders: Encoder[Relation] = Encoders.kryo[Relation]
|
||||
|
||||
import spark.implicits._
|
||||
|
||||
val relation = spark.read.load(s"$sourcePath/relation").as[Relation]
|
||||
|
||||
relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
|
||||
.flatMap(r => List(r.getSource, r.getTarget)).distinct().write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/id_relation")
|
||||
|
||||
|
||||
val idRelation = spark.read.load(s"$workingDirFolder/id_relation").as[String]
|
||||
|
||||
log.info("extract source and target Identifier involved in relations")
|
||||
|
||||
|
||||
log.info("save relation filtered")
|
||||
|
||||
relation.filter(r => (r.getDataInfo == null || r.getDataInfo.getDeletedbyinference == false) && !r.getRelClass.toLowerCase.contains("merge"))
|
||||
.write.mode(SaveMode.Overwrite).save(s"$workingDirFolder/actionSetOaf")
|
||||
|
||||
log.info("saving entities")
|
||||
|
||||
val entities: Dataset[(String, Result)] = spark.read.load(s"$sourcePath/entities/*").as[Result].map(p => (p.getId, p))(Encoders.tuple(Encoders.STRING, resultEncoders))
|
||||
|
||||
|
||||
entities.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result])
|
||||
entities
|
||||
.joinWith(idRelation, entities("_1").equalTo(idRelation("value")))
|
||||
.map(p => p._1._2)
|
||||
.write.mode(SaveMode.Append).save(s"$workingDirFolder/actionSetOaf")
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,86 @@
|
||||
package eu.dnetlib.dhp.actionmanager.scholix
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Dataset => OafDataset,Publication, Software, OtherResearchProduct, Relation}
|
||||
import org.apache.hadoop.io.Text
|
||||
import org.apache.hadoop.io.compress.GzipCodec
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.io.Source
|
||||
|
||||
object SparkSaveActionSet {
|
||||
|
||||
|
||||
def toActionSet(item: Oaf): (String, String) = {
|
||||
val mapper = new ObjectMapper()
|
||||
|
||||
item match {
|
||||
case dataset: OafDataset =>
|
||||
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
|
||||
a.setClazz(classOf[OafDataset])
|
||||
a.setPayload(dataset)
|
||||
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case publication: Publication =>
|
||||
val a: AtomicAction[Publication] = new AtomicAction[Publication]
|
||||
a.setClazz(classOf[Publication])
|
||||
a.setPayload(publication)
|
||||
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case software: Software =>
|
||||
val a: AtomicAction[Software] = new AtomicAction[Software]
|
||||
a.setClazz(classOf[Software])
|
||||
a.setPayload(software)
|
||||
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case orp: OtherResearchProduct =>
|
||||
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
|
||||
a.setClazz(classOf[OtherResearchProduct])
|
||||
a.setPayload(orp)
|
||||
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
|
||||
case relation: Relation =>
|
||||
val a: AtomicAction[Relation] = new AtomicAction[Relation]
|
||||
a.setClazz(classOf[Relation])
|
||||
a.setPayload(relation)
|
||||
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
|
||||
case _ =>
|
||||
null
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/actionset/save_actionset.json")).mkString)
|
||||
parser.parseArgument(args)
|
||||
|
||||
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
|
||||
val targetPath = parser.get("targetPath")
|
||||
log.info(s"targetPath -> $targetPath")
|
||||
|
||||
implicit val oafEncoders: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||
implicit val tEncoder: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)
|
||||
|
||||
spark.read.load(sourcePath).as[Oaf]
|
||||
.map(o => toActionSet(o))
|
||||
.filter(o => o != null)
|
||||
.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,6 @@
|
||||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath","paramDescription": "source path", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingDirFolder","paramDescription": "the working Dir Folder", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath","paramDescription": "the target path ", "paramRequired": true}
|
||||
]
|
@ -0,0 +1,23 @@
|
||||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
@ -0,0 +1,76 @@
|
||||
<workflow-app name="Scholexplorer_to_ActionSet_Workflow" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the path of the consistent graph</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>workingDirFolder</name>
|
||||
<description>the path of working dir ActionSet</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the path of Scholexplorer ActionSet</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="createActionSet"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="createActionSet">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Create Action Set</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.scholix.SparkCreateActionset</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--workingDirFolder</arg><arg>${workingDirFolder}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="SaveActionSet"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="SaveActionSet">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Save Action Set</name>
|
||||
<class>eu.dnetlib.dhp.actionmanager.scholix.SparkSaveActionSet</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--sourcePath</arg><arg>${workingDirFolder}/actionSetOaf</arg>
|
||||
<arg>--targetPath</arg><arg>${outputPath}</arg>
|
||||
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -0,0 +1,5 @@
|
||||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath","paramDescription": "source path", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath","paramDescription": "the target path ", "paramRequired": true}
|
||||
]
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,127 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.dump.Utils;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.RelationIdMapping;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class PatchRelationsApplication {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(PatchRelationsApplication.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
Optional
|
||||
.ofNullable(
|
||||
PatchRelationsApplication.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/patch_relations_parameters.json"))
|
||||
.orElseThrow(FileNotFoundException::new)));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String graphBasePath = parser.get("graphBasePath");
|
||||
log.info("graphBasePath: {}", graphBasePath);
|
||||
|
||||
final String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
final String idMappingPath = parser.get("idMappingPath");
|
||||
log.info("idMappingPath: {}", idMappingPath);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> patchRelations(spark, graphBasePath, workingDir, idMappingPath));
|
||||
}
|
||||
|
||||
/**
|
||||
* Substitutes the identifiers (source/target) from the set of relations part of the graphBasePath included in the
|
||||
* mapping provided by the dataset stored on idMappingPath, using workingDir as intermediate storage location.
|
||||
*
|
||||
* @param spark the SparkSession
|
||||
* @param graphBasePath base graph path providing the set of relations to patch
|
||||
* @param workingDir intermediate storage location
|
||||
* @param idMappingPath dataset providing the old -> new identifier mapping
|
||||
*/
|
||||
private static void patchRelations(final SparkSession spark, final String graphBasePath, final String workingDir,
|
||||
final String idMappingPath) {
|
||||
|
||||
final String relationPath = graphBasePath + "/relation";
|
||||
|
||||
final Dataset<Relation> rels = Utils.readPath(spark, relationPath, Relation.class);
|
||||
final Dataset<RelationIdMapping> idMapping = Utils.readPath(spark, idMappingPath, RelationIdMapping.class);
|
||||
|
||||
log.info("relations: {}", rels.count());
|
||||
log.info("idMapping: {}", idMapping.count());
|
||||
|
||||
final Dataset<Relation> bySource = rels
|
||||
.joinWith(idMapping, rels.col("source").equalTo(idMapping.col("oldId")), "left")
|
||||
.map((MapFunction<Tuple2<Relation, RelationIdMapping>, Relation>) t -> {
|
||||
final Relation r = t._1();
|
||||
Optional
|
||||
.ofNullable(t._2())
|
||||
.map(RelationIdMapping::getNewId)
|
||||
.ifPresent(r::setSource);
|
||||
return r;
|
||||
}, Encoders.bean(Relation.class));
|
||||
|
||||
bySource
|
||||
.joinWith(idMapping, bySource.col("target").equalTo(idMapping.col("oldId")), "left")
|
||||
.map((MapFunction<Tuple2<Relation, RelationIdMapping>, Relation>) t -> {
|
||||
final Relation r = t._1();
|
||||
Optional
|
||||
.ofNullable(t._2())
|
||||
.map(RelationIdMapping::getNewId)
|
||||
.ifPresent(r::setTarget);
|
||||
return r;
|
||||
}, Encoders.bean(Relation.class))
|
||||
.map(
|
||||
(MapFunction<Relation, String>) OBJECT_MAPPER::writeValueAsString,
|
||||
Encoders.STRING())
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(workingDir);
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(workingDir)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.text(relationPath);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,25 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||
|
||||
public class RelationIdMapping {
|
||||
|
||||
private String oldId;
|
||||
|
||||
private String newId;
|
||||
|
||||
public String getOldId() {
|
||||
return oldId;
|
||||
}
|
||||
|
||||
public void setOldId(final String oldId) {
|
||||
this.oldId = oldId;
|
||||
}
|
||||
|
||||
public String getNewId() {
|
||||
return newId;
|
||||
}
|
||||
|
||||
public void setNewId(final String newId) {
|
||||
this.newId = newId;
|
||||
}
|
||||
}
|
@ -0,0 +1,42 @@
|
||||
package eu.dnetlib.dhp.sx.graph
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, OtherResearchProduct, Publication, Result, Software, Dataset => OafDataset}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.io.compress.GzipCodec
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
object SparkConvertDatasetToJsonRDD {
|
||||
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")))
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
val targetPath = parser.get("targetPath")
|
||||
log.info(s"targetPath -> $targetPath")
|
||||
|
||||
val resultObject = List("publication","dataset","software", "otherResearchProduct")
|
||||
val mapper = new ObjectMapper()
|
||||
implicit val oafEncoder: Encoder[Result] = Encoders.kryo(classOf[Result])
|
||||
|
||||
|
||||
resultObject.foreach{item =>
|
||||
spark.read.load(s"$sourcePath/$item").as[Result].map(r=> mapper.writeValueAsString(r))(Encoders.STRING).rdd.saveAsTextFile(s"$targetPath/${item.toLowerCase}", classOf[GzipCodec])
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,67 @@
|
||||
package eu.dnetlib.dhp.sx.graph
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{OtherResearchProduct, Publication, Relation, Result, Software, Dataset => OafDataset}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
object SparkConvertRDDtoDataset {
|
||||
|
||||
def main(args: Array[String]): Unit = {
|
||||
|
||||
|
||||
val log: Logger = LoggerFactory.getLogger(getClass)
|
||||
val conf: SparkConf = new SparkConf()
|
||||
val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/convert_dataset_json_params.json")))
|
||||
parser.parseArgument(args)
|
||||
val spark: SparkSession =
|
||||
SparkSession
|
||||
.builder()
|
||||
.config(conf)
|
||||
.appName(getClass.getSimpleName)
|
||||
.master(parser.get("master")).getOrCreate()
|
||||
|
||||
val sourcePath = parser.get("sourcePath")
|
||||
log.info(s"sourcePath -> $sourcePath")
|
||||
val t = parser.get("targetPath")
|
||||
log.info(s"targetPath -> $t")
|
||||
|
||||
val entityPath = s"$t/entities"
|
||||
val relPath = s"$t/relation"
|
||||
val mapper = new ObjectMapper()
|
||||
implicit val datasetEncoder: Encoder[OafDataset] = Encoders.kryo(classOf[OafDataset])
|
||||
implicit val publicationEncoder: Encoder[Publication] = Encoders.kryo(classOf[Publication])
|
||||
implicit val relationEncoder: Encoder[Relation] = Encoders.kryo(classOf[Relation])
|
||||
implicit val orpEncoder: Encoder[OtherResearchProduct] = Encoders.kryo(classOf[OtherResearchProduct])
|
||||
implicit val softwareEncoder: Encoder[Software] = Encoders.kryo(classOf[Software])
|
||||
|
||||
|
||||
log.info("Converting dataset")
|
||||
val rddDataset =spark.sparkContext.textFile(s"$sourcePath/dataset").map(s => mapper.readValue(s, classOf[OafDataset]))
|
||||
spark.createDataset(rddDataset).as[OafDataset].write.mode(SaveMode.Overwrite).save(s"$entityPath/dataset")
|
||||
|
||||
|
||||
log.info("Converting publication")
|
||||
val rddPublication =spark.sparkContext.textFile(s"$sourcePath/publication").map(s => mapper.readValue(s, classOf[Publication]))
|
||||
spark.createDataset(rddPublication).as[Publication].write.mode(SaveMode.Overwrite).save(s"$entityPath/publication")
|
||||
|
||||
log.info("Converting software")
|
||||
val rddSoftware =spark.sparkContext.textFile(s"$sourcePath/software").map(s => mapper.readValue(s, classOf[Software]))
|
||||
spark.createDataset(rddSoftware).as[Software].write.mode(SaveMode.Overwrite).save(s"$entityPath/software")
|
||||
|
||||
log.info("Converting otherresearchproduct")
|
||||
val rddOtherResearchProduct =spark.sparkContext.textFile(s"$sourcePath/otherresearchproduct").map(s => mapper.readValue(s, classOf[OtherResearchProduct]))
|
||||
spark.createDataset(rddOtherResearchProduct).as[OtherResearchProduct].write.mode(SaveMode.Overwrite).save(s"$entityPath/otherresearchproduct")
|
||||
|
||||
|
||||
log.info("Converting Relation")
|
||||
|
||||
|
||||
val rddRelation =spark.sparkContext.textFile(s"$sourcePath/relation").map(s => mapper.readValue(s, classOf[Relation]))
|
||||
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
|
||||
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,26 @@
|
||||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "g",
|
||||
"paramLongName": "graphBasePath",
|
||||
"paramDescription": "base graph path providing the set of relations to patch",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "w",
|
||||
"paramLongName": "workingDir",
|
||||
"paramDescription": "intermediate storage location",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "idMappingPath",
|
||||
"paramDescription": "dataset providing the old -> new identifier mapping",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -0,0 +1,5 @@
|
||||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source Path", "paramRequired": true},
|
||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the raw graph", "paramRequired": true}
|
||||
]
|
@ -0,0 +1,85 @@
|
||||
<workflow-app name="Create Raw Graph Step 1: extract Entities in raw graph" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the working dir base path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the graph Raw base path</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ExtractEntities"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ExtractEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Extract entities in raw graph</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.SparkCreateInputGraph</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}</arg>
|
||||
</spark>
|
||||
<ok to="DropDedupPath"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="DropDedupPath">
|
||||
<fs>
|
||||
<delete path='${targetPath}/dedup'/>
|
||||
<mkdir path='${targetPath}/dedup/'/>
|
||||
</fs>
|
||||
<ok to="GenerateInputGraphForDedup"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateInputGraphForDedup">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Generate Input Graph for deduplication</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.SparkConvertDatasetToJsonRDD</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=3000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--sourcePath</arg><arg>${targetPath}/preprocess</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/dedup</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -0,0 +1,62 @@
|
||||
<workflow-app name="Resolve Relation" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>entityPath</name>
|
||||
<description>the path of deduplicate Entities</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>relationPath</name>
|
||||
<description>the path of relation unresolved</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the path of relation unresolved</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<start to="DropRelFolder"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
|
||||
<action name="DropRelFolder">
|
||||
<fs>
|
||||
<delete path='${targetPath}/relation'/>
|
||||
<delete path='${targetPath}/relation_resolved'/>
|
||||
<delete path='${targetPath}/resolvedSource'/>
|
||||
<delete path='${targetPath}/resolvedPid'/>
|
||||
|
||||
</fs>
|
||||
<ok to="ResolveRelations"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<action name="ResolveRelations">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Resolve Relations in raw graph</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.SparkResolveRelation</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.shuffle.partitions=3000
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>--master</arg><arg>yarn</arg>
|
||||
<arg>--relationPath</arg><arg>${relationPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${targetPath}</arg>
|
||||
<arg>--entityPath</arg><arg>${entityPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -1,120 +0,0 @@
|
||||
<workflow-app name="Create Raw Graph Step 2: Map XML to OAF Entities" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<description>the working path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="ExtractDLIPublication"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="ExtractDLIPublication">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Extract DLI Entities (Publication)</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=5000
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>-e</arg><arg>publication</arg>
|
||||
</spark>
|
||||
<ok to="ExtractDLIDataset"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ExtractDLIDataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Extract DLI Entities (Dataset)</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=5000
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>-e</arg><arg>dataset</arg>
|
||||
</spark>
|
||||
<ok to="ExtractDLIUnknown"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ExtractDLIUnknown">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Extract DLI Entities (Unknown)</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=5000
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>-e</arg><arg>unknown</arg>
|
||||
</spark>
|
||||
<ok to="ExtractDLIRelation"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ExtractDLIRelation">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Extract DLI Entities (Relation)</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.sql.shuffle.partitions=5000
|
||||
${sparkExtraOPT}
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||
<arg>-e</arg><arg>relation</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -1,61 +0,0 @@
|
||||
<workflow-app name="Create Raw Graph Final Step: Construct the Scholexplorer Raw Graph" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>entity</name>
|
||||
<description>the entity to be merged</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="DeleteTargetPath"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="DeleteTargetPath">
|
||||
<fs>
|
||||
<mkdir path="${targetPath}"/>
|
||||
|
||||
<delete path='${targetPath}/${entity}'/>
|
||||
</fs>
|
||||
<ok to="MergeDLIEntities"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="MergeDLIEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<master>yarn-cluster</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Merge ${entity}</name>
|
||||
<class>eu.dnetlib.dhp.sx.graph.SparkScholexplorerCreateRawGraphJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts> --executor-memory ${sparkExecutorMemory} --driver-memory=${sparkDriverMemory} ${sparkExtraOPT}</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>--sourcePath</arg><arg>${sourcePath}/${entity}</arg>
|
||||
<arg>--targetPath</arg><arg>${targetPath}/${entity}</arg>
|
||||
<arg>--entity</arg><arg>${entity}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
@ -0,0 +1,115 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
|
||||
public class PatchRelationApplicationTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
public static final String ID_MAPPING_PATH = "map/id_mapping.json";
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private static Path workingDir;
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(PatchRelationApplicationTest.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files
|
||||
.createTempDirectory(PatchRelationApplicationTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(PatchRelationApplicationTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(PatchRelationApplicationTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
|
||||
FileUtils
|
||||
.copyInputStreamToFile(
|
||||
PatchRelationApplicationTest.class.getResourceAsStream("id_mapping.json"),
|
||||
workingDir.resolve(ID_MAPPING_PATH).toFile());
|
||||
|
||||
FileUtils
|
||||
.copyInputStreamToFile(
|
||||
PatchRelationApplicationTest.class.getResourceAsStream("relations_to_patch.json"),
|
||||
workingDir.resolve("graphBasePath/relation/rels.json").toFile());
|
||||
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPatchRelationApplication() throws Exception {
|
||||
|
||||
final String graphBasePath = workingDir.toString() + "/graphBasePath";
|
||||
PatchRelationsApplication.main(new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-graphBasePath", graphBasePath,
|
||||
"-workingDir", workingDir.toString() + "/workingDir",
|
||||
"-idMappingPath", workingDir.toString() + "/" + ID_MAPPING_PATH
|
||||
});
|
||||
|
||||
final List<Relation> rels = spark
|
||||
.read()
|
||||
.textFile(graphBasePath + "/relation")
|
||||
.map(
|
||||
(MapFunction<String, Relation>) s -> OBJECT_MAPPER.readValue(s, Relation.class),
|
||||
Encoders.bean(Relation.class))
|
||||
.collectAsList();
|
||||
|
||||
assertEquals(6, rels.size());
|
||||
|
||||
assertEquals(0, getCount(rels, "1a"), "should be patched to 1b");
|
||||
assertEquals(0, getCount(rels, "2a"), "should be patched to 2b");
|
||||
|
||||
assertEquals(2, getCount(rels, "10a"), "not included in patching");
|
||||
assertEquals(2, getCount(rels, "20a"), "not included in patching");
|
||||
|
||||
assertEquals(2, getCount(rels, "15a"), "not included in patching");
|
||||
assertEquals(2, getCount(rels, "25a"), "not included in patching");
|
||||
|
||||
assertEquals(2, getCount(rels, "1b"), "patched from 1a");
|
||||
assertEquals(2, getCount(rels, "2b"), "patched from 2a");
|
||||
}
|
||||
|
||||
private long getCount(List<Relation> rels, final String id) {
|
||||
return rels.stream().filter(r -> r.getSource().equals(id) || r.getTarget().equals(id)).count();
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,72 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<record xmlns="http://datacite.org/schema/kernel-4"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr" xmlns:oaf="http://namespace.openaire.eu/oaf">
|
||||
<oai:header xmlns="http://namespace.openaire.eu/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<dri:objIdentifier>enermaps____::04149ee428d07360314c2cb3ba95d41e</dri:objIdentifier>
|
||||
<dri:recordIdentifier>tgs00004</dri:recordIdentifier>
|
||||
<dri:dateOfCollection>2021-07-20T18:43:12.096+02:00</dri:dateOfCollection>
|
||||
<oaf:datasourceprefix>enermaps____</oaf:datasourceprefix>
|
||||
</oai:header>
|
||||
<metadata>
|
||||
<resource>
|
||||
<identifier identifierType="URL">https://ec.europa.eu/eurostat/web/products-datasets/-/tgs00004</identifier>
|
||||
<creators>
|
||||
<creator>
|
||||
<creatorName>Statistical Office of the European Union (Eurostat)</creatorName>
|
||||
</creator>
|
||||
</creators>
|
||||
<titles>
|
||||
<title>
|
||||
Regional GDP
|
||||
</title>
|
||||
</titles>
|
||||
<publisher>Statistical Office of the European Union (Eurostat)</publisher>
|
||||
<publicationYear>2020</publicationYear>
|
||||
<dates>
|
||||
<date dateType="Issued">2020-10-07</date>
|
||||
</dates>
|
||||
<resourceType resourceTypeGeneral="Dataset"/>
|
||||
<rightsList>
|
||||
<rights rightsURI="info:eu-repo/semantics/openAccess">OPEN</rights>
|
||||
<rights rightsURI="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International</rights>
|
||||
</rightsList>
|
||||
<descriptions>
|
||||
<description descriptionType="Abstract" xml:lang="EN">GDP expressed in PPS (purchasing power standards) eliminates differences in price levels between countries. Calculations on a per inhabitant basis allow for the comparison of economies and regions significantly different in absolute size. GDP per inhabitant in PPS is the key variable for determining the eligibility of NUTS 2 regions in the framework of the European Unions structural policy.</description>
|
||||
</descriptions>
|
||||
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
|
||||
<oaf:dateAccepted>2020-10-07</oaf:dateAccepted>
|
||||
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||
<oaf:license>Creative Commons Attribution 4.0 International</oaf:license>
|
||||
<oaf:hostedBy
|
||||
id="openaire____::1256f046-bf1f-4afc-8b47-d0b147148b18" name="Unknown Repository"/>
|
||||
<oaf:collectedFrom id="enermaps____::db" name="Enermaps"/>
|
||||
<oaf:concept id="enermaps::selection::tgs00004"/>
|
||||
</resource>
|
||||
</metadata>
|
||||
<about xmlns="" xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||
<originDescription altered="true" harvestDate="2021-07-20T18:43:12.096+02:00">
|
||||
<baseURL>https%3A%2F%2Flab.idiap.ch%2Fenermaps%2Fapi%2Fdatacite</baseURL>
|
||||
<identifier/>
|
||||
<datestamp/>
|
||||
<metadataNamespace/>
|
||||
</originDescription>
|
||||
</provenance>
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.9</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction classid="sysimport:crosswalk"
|
||||
classname="sysimport:crosswalk"
|
||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
</about>
|
||||
</record>
|
@ -0,0 +1,5 @@
|
||||
{"oldId": "1a", "newId": "1b"}
|
||||
{"oldId": "2a", "newId": "2b"}
|
||||
{"oldId": "3a", "newId": "3b"}
|
||||
{"oldId": "4a", "newId": "4b"}
|
||||
{"oldId": "5a", "newId": "5b"}
|
@ -0,0 +1,70 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header xmlns="http://namespace.openaire.eu/">
|
||||
<dri:objIdentifier>jairo_______::000012e58ed836576ef2a0d38b0f726f</dri:objIdentifier>
|
||||
<dri:recordIdentifier>oai:irdb.nii.ac.jp:01221:0000010198</dri:recordIdentifier>
|
||||
<dri:dateOfCollection/>
|
||||
<dri:mdFormat/>
|
||||
<dri:mdFormatInterpretation/>
|
||||
<dri:repositoryId/>
|
||||
<dr:objectIdentifier/>
|
||||
<dr:dateOfCollection>2021-05-10T11:31:09.424Z</dr:dateOfCollection>
|
||||
<dr:dateOfTransformation>2021-06-03T01:45:42.536Z</dr:dateOfTransformation>
|
||||
<oaf:datasourceprefix>jairo_______</oaf:datasourceprefix>
|
||||
</header>
|
||||
<metadata xmlns="http://namespace.openaire.eu/">
|
||||
<dc:title>多項式GCDを用いた復号法に関する研究</dc:title>
|
||||
<dc:creator>上原, 剛</dc:creator>
|
||||
<dc:creator>甲斐, 博</dc:creator>
|
||||
<dc:creator>野田, 松太郎</dc:creator>
|
||||
<dc:format>application/pdf</dc:format>
|
||||
<dc:identifier>http://hdl.handle.net/2433/25934</dc:identifier>
|
||||
<dc:language>jpn</dc:language>
|
||||
<dc:publisher>京都大学数理解析研究所</dc:publisher>
|
||||
<dc:subject classid="ndc" classname="ndc"
|
||||
schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">410</dc:subject>
|
||||
<dc:type>Departmental Bulletin Paper</dc:type>
|
||||
<dr:CobjCategory type="publication">0014</dr:CobjCategory>
|
||||
<oaf:dateAccepted>2004-10-01</oaf:dateAccepted>
|
||||
<oaf:projectid/>
|
||||
<oaf:collectedDatasourceid>openaire____::554c7c2873</oaf:collectedDatasourceid>
|
||||
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||
<oaf:hostedBy id="openaire____::554c7c2873" name="JAIRO"/>
|
||||
<oaf:collectedFrom id="openaire____::554c7c2873" name="JAIRO"/>
|
||||
<oaf:identifier identifierType="handle">2433/25934</oaf:identifier>
|
||||
<oaf:identifier identifierType="ncid">AN00061013</oaf:identifier>
|
||||
<oaf:identifier identifierType="LandingPage">http://hdl.handle.net/2433/25934</oaf:identifier>
|
||||
<oaf:fulltext>http://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/25934/1/1395-16.pdf</oaf:fulltext>
|
||||
<oaf:journal ep="110" iss="" issn="1880-2818" sp="104" vol="1395">数理解析研究所講究録</oaf:journal>
|
||||
</metadata>
|
||||
<about>
|
||||
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||
<originDescription altered="true" harvestDate="2021-05-10T11:31:09.424Z">
|
||||
<baseURL>https%3A%2F%2Firdb.nii.ac.jp%2Foai</baseURL>
|
||||
<identifier>oai:irdb.nii.ac.jp:01221:0000010198</identifier>
|
||||
<datestamp>2021-04-13T13:36:29Z</datestamp>
|
||||
<metadataNamespace/>
|
||||
<originDescription altered="true" harvestDate="2021-04-13T13:36:29Z">
|
||||
<baseURL>http://repository.kulib.kyoto-u.ac.jp/dspace-oai/request</baseURL>
|
||||
<identifier>oai:repository.kulib.kyoto-u.ac.jp:2433/25934</identifier>
|
||||
<datestamp>2012-07-12T14:15:41Z</datestamp>
|
||||
<metadataNamespace>http://irdb.nii.ac.jp/oai</metadataNamespace>
|
||||
</originDescription>
|
||||
</originDescription>
|
||||
</provenance>
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.9</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
|
||||
classname="sysimport:crosswalk:repository"
|
||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
</about>
|
||||
</record>
|
@ -0,0 +1,6 @@
|
||||
{"source":"1a","target":"10a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
|
||||
{"source":"10a","target":"1a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
|
||||
{"source":"2a","target":"20a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
|
||||
{"source":"20a","target":"2a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
|
||||
{"source":"15a","target":"25a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
|
||||
{"source":"25a","target":"15a","collectedfrom":[{"key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824","value":"DOAJ-Articles","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.900","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:entityregistry","classname":"sysimport:crosswalk:entityregistry","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":1626336932282,"relType":"datasourceOrganization","subRelType":"provision","relClass":"provides","validated":false,"validationDate":null,"properties":[]}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue