added global properties in wf definitions to avoid repeating name-node and job-tracker in the (many) distcp actions; reintroduced output directory removal at the beginning of each spark action

This commit is contained in:
Claudio Atzori 2020-05-14 10:25:41 +02:00
parent 12bfa6702e
commit ab37953332
17 changed files with 213 additions and 212 deletions

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.bulktag; package eu.dnetlib.dhp.bulktag;
import static eu.dnetlib.dhp.PropagationConstant.removeOutputDir;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional; import java.util.Optional;
@ -84,6 +85,7 @@ public class SparkBulkTagJob {
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
removeOutputDir(spark, outputPath);
execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc); execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc);
}); });
} }

View File

@ -69,13 +69,16 @@ public class SparkCountryPropagationJob {
runWithSparkSession( runWithSparkSession(
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> execPropagation( spark -> {
spark, removeOutputDir(spark, outputPath);
sourcePath, execPropagation(
preparedInfoPath, spark,
outputPath, sourcePath,
resultClazz, preparedInfoPath,
saveGraph)); outputPath,
resultClazz,
saveGraph);
});
} }
private static <R extends Result> void execPropagation( private static <R extends Result> void execPropagation(

View File

@ -74,9 +74,7 @@ public class PrepareResultOrcidAssociationStep1 {
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
if (isTest(parser)) { removeOutputDir(spark, outputPath);
removeOutputDir(spark, outputPath);
}
prepareInfo( prepareInfo(
spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel); spark, inputRelationPath, inputResultPath, outputResultPath, resultClazz, allowedsemrel);
}); });

View File

@ -50,9 +50,7 @@ public class PrepareResultOrcidAssociationStep2 {
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
if (isTest(parser)) { removeOutputDir(spark, outputPath);
removeOutputDir(spark, outputPath);
}
mergeInfo(spark, inputPath, outputPath); mergeInfo(spark, inputPath, outputPath);
}); });
} }

View File

@ -70,11 +70,10 @@ public class SparkOrcidToResultFromSemRelJob {
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
if (isTest(parser)) { removeOutputDir(spark, outputPath);
removeOutputDir(spark, outputPath); if (saveGraph) {
}
if (saveGraph)
execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz); execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz);
}
}); });
} }

View File

@ -60,6 +60,8 @@ public class PrepareProjectResultsAssociation {
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
removeOutputDir(spark, potentialUpdatePath);
removeOutputDir(spark, alreadyLinkedPath);
prepareResultProjProjectResults( prepareResultProjProjectResults(
spark, spark,
inputPath, inputPath,

View File

@ -55,9 +55,7 @@ public class PrepareResultCommunitySet {
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
if (isTest(parser)) { removeOutputDir(spark, outputPath);
removeOutputDir(spark, outputPath);
}
prepareInfo(spark, inputPath, outputPath, organizationMap); prepareInfo(spark, inputPath, outputPath, organizationMap);
}); });
} }

View File

@ -68,11 +68,10 @@ public class SparkResultToCommunityFromOrganizationJob {
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
if (isTest(parser)) { removeOutputDir(spark, outputPath);
removeOutputDir(spark, outputPath); if (saveGraph) {
}
if (saveGraph)
execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath); execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath);
}
}); });
} }

View File

@ -58,30 +58,15 @@ public class PrepareResultInstRepoAssociation {
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
readNeededResources(spark, inputPath); readNeededResources(spark, inputPath);
removeOutputDir(spark, datasourceOrganizationPath);
prepareDatasourceOrganization(spark, datasourceOrganizationPath); prepareDatasourceOrganization(spark, datasourceOrganizationPath);
removeOutputDir(spark, alreadyLinkedPath);
prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath); prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath);
}); });
} }
private static void prepareAlreadyLinkedAssociation(
SparkSession spark, String alreadyLinkedPath) {
String query = "Select source resultId, collect_set(target) organizationSet "
+ "from relation "
+ "where datainfo.deletedbyinference = false "
+ "and relClass = '"
+ RELATION_RESULT_ORGANIZATION_REL_CLASS
+ "' "
+ "group by source";
spark
.sql(query)
.as(Encoders.bean(ResultOrganizationSet.class))
// TODO retry to stick with datasets
.toJavaRDD()
.map(r -> OBJECT_MAPPER.writeValueAsString(r))
.saveAsTextFile(alreadyLinkedPath, GzipCodec.class);
}
private static void readNeededResources(SparkSession spark, String inputPath) { private static void readNeededResources(SparkSession spark, String inputPath) {
Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class); Dataset<Datasource> datasource = readPath(spark, inputPath + "/datasource", Datasource.class);
datasource.createOrReplaceTempView("datasource"); datasource.createOrReplaceTempView("datasource");
@ -119,4 +104,24 @@ public class PrepareResultInstRepoAssociation {
.option("compression", "gzip") .option("compression", "gzip")
.json(datasourceOrganizationPath); .json(datasourceOrganizationPath);
} }
private static void prepareAlreadyLinkedAssociation(
SparkSession spark, String alreadyLinkedPath) {
String query = "Select source resultId, collect_set(target) organizationSet "
+ "from relation "
+ "where datainfo.deletedbyinference = false "
+ "and relClass = '"
+ RELATION_RESULT_ORGANIZATION_REL_CLASS
+ "' "
+ "group by source";
spark
.sql(query)
.as(Encoders.bean(ResultOrganizationSet.class))
// TODO retry to stick with datasets
.toJavaRDD()
.map(r -> OBJECT_MAPPER.writeValueAsString(r))
.saveAsTextFile(alreadyLinkedPath, GzipCodec.class);
}
} }

View File

@ -83,10 +83,8 @@ public class SparkResultToOrganizationFromIstRepoJob {
conf, conf,
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
if (isTest(parser)) { removeOutputDir(spark, outputPath);
removeOutputDir(spark, outputPath); if (saveGraph) {
}
if (saveGraph)
execPropagation( execPropagation(
spark, spark,
datasourceorganization, datasourceorganization,
@ -94,6 +92,7 @@ public class SparkResultToOrganizationFromIstRepoJob {
inputPath, inputPath,
outputPath, outputPath,
resultClazz); resultClazz);
}
}); });
} }

View File

@ -18,6 +18,17 @@
</property> </property>
</parameters> </parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="reset_outputpath"/> <start to="reset_outputpath"/>
<kill name="Kill"> <kill name="Kill">
@ -42,8 +53,6 @@
<action name="copy_relation"> <action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/relation</arg> <arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg> <arg>${nameNode}/${outputPath}/relation</arg>
</distcp> </distcp>
@ -53,8 +62,6 @@
<action name="copy_organization"> <action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/organization</arg> <arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg> <arg>${nameNode}/${outputPath}/organization</arg>
</distcp> </distcp>
@ -64,8 +71,6 @@
<action name="copy_projects"> <action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/project</arg> <arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg> <arg>${nameNode}/${outputPath}/project</arg>
</distcp> </distcp>
@ -75,8 +80,6 @@
<action name="copy_datasources"> <action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/datasource</arg> <arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg> <arg>${nameNode}/${outputPath}/datasource</arg>
</distcp> </distcp>
@ -95,8 +98,6 @@
<action name="join_bulktag_publication"> <action name="join_bulktag_publication">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>bulkTagging-publication</name> <name>bulkTagging-publication</name>
@ -124,8 +125,6 @@
<action name="join_bulktag_dataset"> <action name="join_bulktag_dataset">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>bulkTagging-dataset</name> <name>bulkTagging-dataset</name>
@ -153,8 +152,6 @@
<action name="join_bulktag_otherresearchproduct"> <action name="join_bulktag_otherresearchproduct">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>bulkTagging-orp</name> <name>bulkTagging-orp</name>
@ -182,8 +179,6 @@
<action name="join_bulktag_software"> <action name="join_bulktag_software">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>bulkTagging-software</name> <name>bulkTagging-software</name>

View File

@ -19,6 +19,17 @@
</parameters> </parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="reset_outputpath"/> <start to="reset_outputpath"/>
<kill name="Kill"> <kill name="Kill">
@ -43,8 +54,6 @@
<action name="copy_relation"> <action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/relation</arg> <arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg> <arg>${nameNode}/${outputPath}/relation</arg>
</distcp> </distcp>
@ -54,18 +63,15 @@
<action name="copy_organization"> <action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/organization</arg> <arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg> <arg>${nameNode}/${outputPath}/organization</arg>
</distcp> </distcp>
<ok to="copy_wait"/> <ok to="copy_wait"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="copy_projects"> <action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/project</arg> <arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg> <arg>${nameNode}/${outputPath}/project</arg>
</distcp> </distcp>
@ -75,8 +81,6 @@
<action name="copy_datasources"> <action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/datasource</arg> <arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg> <arg>${nameNode}/${outputPath}/datasource</arg>
</distcp> </distcp>

View File

@ -57,6 +57,7 @@
<ok to="copy_wait"/> <ok to="copy_wait"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="copy_projects"> <action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
@ -81,7 +82,6 @@
<join name="copy_wait" to="fork_prepare_assoc_step1"/> <join name="copy_wait" to="fork_prepare_assoc_step1"/>
<fork name="fork_prepare_assoc_step1"> <fork name="fork_prepare_assoc_step1">
<path start="join_prepare_publication"/> <path start="join_prepare_publication"/>
<path start="join_prepare_dataset"/> <path start="join_prepare_dataset"/>
@ -230,8 +230,8 @@
</spark> </spark>
<ok to="fork-join-exec-propagation"/> <ok to="fork-join-exec-propagation"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<fork name="fork-join-exec-propagation"> <fork name="fork-join-exec-propagation">
<path start="join_propagate_publication"/> <path start="join_propagate_publication"/>
<path start="join_propagate_dataset"/> <path start="join_propagate_dataset"/>
@ -271,6 +271,7 @@
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="join_propagate_dataset"> <action name="join_propagate_dataset">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
@ -302,6 +303,7 @@
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="join_propagate_otherresearchproduct"> <action name="join_propagate_otherresearchproduct">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
@ -333,6 +335,7 @@
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="join_propagate_software"> <action name="join_propagate_software">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>

View File

@ -14,6 +14,17 @@
</property> </property>
</parameters> </parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="reset_outputpath"/> <start to="reset_outputpath"/>
<kill name="Kill"> <kill name="Kill">
@ -42,8 +53,6 @@
<action name="copy_relation"> <action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/relation</arg> <arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg> <arg>${nameNode}/${outputPath}/relation</arg>
</distcp> </distcp>
@ -53,8 +62,6 @@
<action name="copy_publication"> <action name="copy_publication">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/publication</arg> <arg>${nameNode}/${sourcePath}/publication</arg>
<arg>${nameNode}/${outputPath}/publication</arg> <arg>${nameNode}/${outputPath}/publication</arg>
</distcp> </distcp>
@ -64,8 +71,6 @@
<action name="copy_dataset"> <action name="copy_dataset">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/dataset</arg> <arg>${nameNode}/${sourcePath}/dataset</arg>
<arg>${nameNode}/${outputPath}/dataset</arg> <arg>${nameNode}/${outputPath}/dataset</arg>
</distcp> </distcp>
@ -75,8 +80,6 @@
<action name="copy_orp"> <action name="copy_orp">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg> <arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg> <arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
</distcp> </distcp>
@ -86,28 +89,24 @@
<action name="copy_software"> <action name="copy_software">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/software</arg> <arg>${nameNode}/${sourcePath}/software</arg>
<arg>${nameNode}/${outputPath}/software</arg> <arg>${nameNode}/${outputPath}/software</arg>
</distcp> </distcp>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="copy_organization"> <action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/organization</arg> <arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg> <arg>${nameNode}/${outputPath}/organization</arg>
</distcp> </distcp>
<ok to="wait"/> <ok to="wait"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="copy_projects"> <action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/project</arg> <arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg> <arg>${nameNode}/${outputPath}/project</arg>
</distcp> </distcp>
@ -117,8 +116,6 @@
<action name="copy_datasources"> <action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/datasource</arg> <arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg> <arg>${nameNode}/${outputPath}/datasource</arg>
</distcp> </distcp>

View File

@ -14,6 +14,17 @@
</property> </property>
</parameters> </parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="reset_outputpath"/> <start to="reset_outputpath"/>
<kill name="Kill"> <kill name="Kill">
@ -38,8 +49,6 @@
<action name="copy_relation"> <action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/relation</arg> <arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg> <arg>${nameNode}/${outputPath}/relation</arg>
</distcp> </distcp>
@ -49,8 +58,6 @@
<action name="copy_organization"> <action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/organization</arg> <arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg> <arg>${nameNode}/${outputPath}/organization</arg>
</distcp> </distcp>
@ -60,8 +67,6 @@
<action name="copy_projects"> <action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/project</arg> <arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg> <arg>${nameNode}/${outputPath}/project</arg>
</distcp> </distcp>
@ -71,8 +76,6 @@
<action name="copy_datasources"> <action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/datasource</arg> <arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg> <arg>${nameNode}/${outputPath}/datasource</arg>
</distcp> </distcp>
@ -101,8 +104,8 @@
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/relation</arg> <arg>--sourcePath</arg><arg>${sourcePath}/relation</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg> <arg>--outputPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--organizationtoresultcommunitymap</arg><arg>${organizationtoresultcommunitymap}</arg> <arg>--organizationtoresultcommunitymap</arg><arg>${organizationtoresultcommunitymap}</arg>
</spark> </spark>
<ok to="fork-join-exec-propagation"/> <ok to="fork-join-exec-propagation"/>
@ -136,9 +139,9 @@
</spark-opts> </spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg> <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg> <arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
@ -165,9 +168,9 @@
</spark-opts> </spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg> <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg> <arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
@ -194,9 +197,9 @@
</spark-opts> </spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg> <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg> <arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
@ -223,9 +226,9 @@
</spark-opts> </spark-opts>
<arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg> <arg>--preparedInfoPath</arg><arg>${workingDir}/preparedInfo/resultCommunityList</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg> <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg> <arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg> <arg>--saveGraph</arg><arg>${saveGraph}</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>

View File

@ -10,6 +10,17 @@
</property> </property>
</parameters> </parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="reset_outputpath"/> <start to="reset_outputpath"/>
<kill name="Kill"> <kill name="Kill">
@ -38,8 +49,6 @@
<action name="copy_relation"> <action name="copy_relation">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/relation</arg> <arg>${nameNode}/${sourcePath}/relation</arg>
<arg>${nameNode}/${outputPath}/relation</arg> <arg>${nameNode}/${outputPath}/relation</arg>
</distcp> </distcp>
@ -49,8 +58,6 @@
<action name="copy_publication"> <action name="copy_publication">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/publication</arg> <arg>${nameNode}/${sourcePath}/publication</arg>
<arg>${nameNode}/${outputPath}/publication</arg> <arg>${nameNode}/${outputPath}/publication</arg>
</distcp> </distcp>
@ -60,8 +67,6 @@
<action name="copy_dataset"> <action name="copy_dataset">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/dataset</arg> <arg>${nameNode}/${sourcePath}/dataset</arg>
<arg>${nameNode}/${outputPath}/dataset</arg> <arg>${nameNode}/${outputPath}/dataset</arg>
</distcp> </distcp>
@ -71,8 +76,6 @@
<action name="copy_orp"> <action name="copy_orp">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg> <arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg> <arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
</distcp> </distcp>
@ -82,8 +85,6 @@
<action name="copy_software"> <action name="copy_software">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/software</arg> <arg>${nameNode}/${sourcePath}/software</arg>
<arg>${nameNode}/${outputPath}/software</arg> <arg>${nameNode}/${outputPath}/software</arg>
</distcp> </distcp>
@ -93,8 +94,6 @@
<action name="copy_organization"> <action name="copy_organization">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/organization</arg> <arg>${nameNode}/${sourcePath}/organization</arg>
<arg>${nameNode}/${outputPath}/organization</arg> <arg>${nameNode}/${outputPath}/organization</arg>
</distcp> </distcp>
@ -104,8 +103,6 @@
<action name="copy_projects"> <action name="copy_projects">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/project</arg> <arg>${nameNode}/${sourcePath}/project</arg>
<arg>${nameNode}/${outputPath}/project</arg> <arg>${nameNode}/${outputPath}/project</arg>
</distcp> </distcp>
@ -115,8 +112,6 @@
<action name="copy_datasources"> <action name="copy_datasources">
<distcp xmlns="uri:oozie:distcp-action:0.2"> <distcp xmlns="uri:oozie:distcp-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<arg>${nameNode}/${sourcePath}/datasource</arg> <arg>${nameNode}/${sourcePath}/datasource</arg>
<arg>${nameNode}/${outputPath}/datasource</arg> <arg>${nameNode}/${outputPath}/datasource</arg>
</distcp> </distcp>
@ -125,6 +120,7 @@
</action> </action>
<join name="wait" to="prepare_result_organization_association"/> <join name="wait" to="prepare_result_organization_association"/>
<action name="prepare_result_organization_association"> <action name="prepare_result_organization_association">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
@ -176,12 +172,12 @@
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg> <arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg> <arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg> <arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg> <arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
@ -206,12 +202,12 @@
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg> <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg> <arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg> <arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg> <arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
@ -236,12 +232,12 @@
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg> <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg> <arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg> <arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg> <arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>
@ -266,12 +262,12 @@
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg> <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${outputPath}/relation</arg> <arg>--outputPath</arg><arg>${outputPath}/relation</arg>
<arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg> <arg>--datasourceOrganizationPath</arg><arg>${workingDir}/preparedInfo/datasourceOrganization</arg>
<arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg> <arg>--alreadyLinkedPath</arg><arg>${workingDir}/preparedInfo/alreadyLinked</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
<arg>--saveGraph</arg><arg>${saveGraph}</arg>
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
</spark> </spark>
<ok to="wait2"/> <ok to="wait2"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -64,7 +64,7 @@ public abstract class AbstractMdRecordToOafMapper {
} }
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier( protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier(
"main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); "main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
protected AbstractMdRecordToOafMapper(final Map<String, String> code2name) { protected AbstractMdRecordToOafMapper(final Map<String, String> code2name) {
this.code2name = code2name; this.code2name = code2name;
@ -75,20 +75,20 @@ public abstract class AbstractMdRecordToOafMapper {
DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
final Document doc = DocumentHelper final Document doc = DocumentHelper
.parseText( .parseText(
xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3));
final String type = doc.valueOf("//dr:CobjCategory/@type"); final String type = doc.valueOf("//dr:CobjCategory/@type");
final KeyValue collectedFrom = getProvenanceDatasource( final KeyValue collectedFrom = getProvenanceDatasource(
doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name");
if (collectedFrom == null) { if (collectedFrom == null) {
return null; return null;
} }
final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id"))
? collectedFrom ? collectedFrom
: getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name");
if (hostedBy == null) { if (hostedBy == null) {
return null; return null;
@ -112,17 +112,17 @@ public abstract class AbstractMdRecordToOafMapper {
} }
return keyValue( return keyValue(
createOpenaireId(10, dsId, true), createOpenaireId(10, dsId, true),
dsName); dsName);
} }
protected List<Oaf> createOafs( protected List<Oaf> createOafs(
final Document doc, final Document doc,
final String type, final String type,
final KeyValue collectedFrom, final KeyValue collectedFrom,
final KeyValue hostedBy, final KeyValue hostedBy,
final DataInfo info, final DataInfo info,
final long lastUpdateTimestamp) { final long lastUpdateTimestamp) {
final List<Oaf> oafs = new ArrayList<>(); final List<Oaf> oafs = new ArrayList<>();
@ -179,10 +179,10 @@ public abstract class AbstractMdRecordToOafMapper {
} }
private List<Oaf> addProjectRels( private List<Oaf> addProjectRels(
final Document doc, final Document doc,
final KeyValue collectedFrom, final KeyValue collectedFrom,
final DataInfo info, final DataInfo info,
final long lastUpdateTimestamp) { final long lastUpdateTimestamp) {
final List<Oaf> res = new ArrayList<>(); final List<Oaf> res = new ArrayList<>();
@ -196,15 +196,15 @@ public abstract class AbstractMdRecordToOafMapper {
final String projectId = createOpenaireId(40, originalId, true); final String projectId = createOpenaireId(40, originalId, true);
res res
.add( .add(
getRelation( getRelation(
docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info,
lastUpdateTimestamp)); lastUpdateTimestamp));
res res
.add( .add(
getRelation( getRelation(
projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info,
lastUpdateTimestamp)); lastUpdateTimestamp));
} }
} }
@ -212,7 +212,7 @@ public abstract class AbstractMdRecordToOafMapper {
} }
protected Relation getRelation(String source, String target, String relType, String subRelType, String relClass, protected Relation getRelation(String source, String target, String relType, String subRelType, String relClass,
KeyValue collectedFrom, DataInfo info, long lastUpdateTimestamp) { KeyValue collectedFrom, DataInfo info, long lastUpdateTimestamp) {
final Relation rel = new Relation(); final Relation rel = new Relation();
rel.setRelType(relType); rel.setRelType(relType);
rel.setSubRelType(subRelType); rel.setSubRelType(subRelType);
@ -226,27 +226,27 @@ public abstract class AbstractMdRecordToOafMapper {
} }
protected abstract List<Oaf> addOtherResultRels( protected abstract List<Oaf> addOtherResultRels(
final Document doc, final Document doc,
final KeyValue collectedFrom, final KeyValue collectedFrom,
final DataInfo info, final DataInfo info,
final long lastUpdateTimestamp); final long lastUpdateTimestamp);
private void populateResultFields( private void populateResultFields(
final Result r, final Result r,
final Document doc, final Document doc,
final KeyValue collectedFrom, final KeyValue collectedFrom,
final KeyValue hostedBy, final KeyValue hostedBy,
final DataInfo info, final DataInfo info,
final long lastUpdateTimestamp) { final long lastUpdateTimestamp) {
r.setDataInfo(info); r.setDataInfo(info);
r.setLastupdatetimestamp(lastUpdateTimestamp); r.setLastupdatetimestamp(lastUpdateTimestamp);
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
r.setCollectedfrom(Arrays.asList(collectedFrom)); r.setCollectedfrom(Arrays.asList(collectedFrom));
r r
.setPid( .setPid(
prepareListStructProps( prepareListStructProps(
doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
@ -289,7 +289,7 @@ public abstract class AbstractMdRecordToOafMapper {
protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
protected abstract List<Instance> prepareInstances( protected abstract List<Instance> prepareInstances(
Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby);
protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info); protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);
@ -314,13 +314,13 @@ public abstract class AbstractMdRecordToOafMapper {
protected abstract List<Author> prepareAuthors(Document doc, DataInfo info); protected abstract List<Author> prepareAuthors(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareOtherResearchProductTools( protected abstract List<Field<String>> prepareOtherResearchProductTools(
Document doc, DataInfo info); Document doc, DataInfo info);
protected abstract List<Field<String>> prepareOtherResearchProductContactGroups( protected abstract List<Field<String>> prepareOtherResearchProductContactGroups(
Document doc, DataInfo info); Document doc, DataInfo info);
protected abstract List<Field<String>> prepareOtherResearchProductContactPersons( protected abstract List<Field<String>> prepareOtherResearchProductContactPersons(
Document doc, DataInfo info); Document doc, DataInfo info);
protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info);
@ -329,7 +329,7 @@ public abstract class AbstractMdRecordToOafMapper {
protected abstract List<StructuredProperty> prepareSoftwareLicenses(Document doc, DataInfo info); protected abstract List<StructuredProperty> prepareSoftwareLicenses(Document doc, DataInfo info);
protected abstract List<Field<String>> prepareSoftwareDocumentationUrls( protected abstract List<Field<String>> prepareSoftwareDocumentationUrls(
Document doc, DataInfo info); Document doc, DataInfo info);
protected abstract List<GeoLocation> prepareDatasetGeoLocations(Document doc, DataInfo info); protected abstract List<GeoLocation> prepareDatasetGeoLocations(Document doc, DataInfo info);
@ -359,37 +359,37 @@ public abstract class AbstractMdRecordToOafMapper {
final String edition = n.valueOf("@edition"); final String edition = n.valueOf("@edition");
if (StringUtils.isNotBlank(name)) { if (StringUtils.isNotBlank(name)) {
return journal( return journal(
name, name,
issnPrinted, issnPrinted,
issnOnline, issnOnline,
issnLinking, issnLinking,
ep, ep,
iss, iss,
sp, sp,
vol, vol,
edition, edition,
null, null,
null, null,
info); info);
} }
} }
return null; return null;
} }
protected Qualifier prepareQualifier( protected Qualifier prepareQualifier(
final Node node, final String xpath, final String schemeId, final String schemeName) { final Node node, final String xpath, final String schemeId, final String schemeName) {
final String classId = node.valueOf(xpath); final String classId = node.valueOf(xpath);
final String className = code2name.get(classId); final String className = code2name.get(classId);
return qualifier(classId, className, schemeId, schemeName); return qualifier(classId, className, schemeId, schemeName);
} }
protected List<StructuredProperty> prepareListStructProps( protected List<StructuredProperty> prepareListStructProps(
final Node node, final Node node,
final String xpath, final String xpath,
final String xpathClassId, final String xpathClassId,
final String schemeId, final String schemeId,
final String schemeName, final String schemeName,
final DataInfo info) { final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>(); final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) { for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o; final Node n = (Node) o;
@ -401,7 +401,7 @@ public abstract class AbstractMdRecordToOafMapper {
} }
protected List<StructuredProperty> prepareListStructProps( protected List<StructuredProperty> prepareListStructProps(
final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) { final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>(); final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) { for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o; final Node n = (Node) o;
@ -411,19 +411,19 @@ public abstract class AbstractMdRecordToOafMapper {
} }
protected List<StructuredProperty> prepareListStructProps( protected List<StructuredProperty> prepareListStructProps(
final Node node, final String xpath, final DataInfo info) { final Node node, final String xpath, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>(); final List<StructuredProperty> res = new ArrayList<>();
for (final Object o : node.selectNodes(xpath)) { for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o; final Node n = (Node) o;
res res
.add( .add(
structuredProperty( structuredProperty(
n.getText(), n.getText(),
n.valueOf("@classid"), n.valueOf("@classid"),
n.valueOf("@classname"), n.valueOf("@classname"),
n.valueOf("@schemeid"), n.valueOf("@schemeid"),
n.valueOf("@schemename"), n.valueOf("@schemename"),
info)); info));
} }
return res; return res;
} }
@ -450,7 +450,7 @@ public abstract class AbstractMdRecordToOafMapper {
if (n == null) { if (n == null) {
return dataInfo( return dataInfo(
false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9");
} }
final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
@ -464,12 +464,12 @@ public abstract class AbstractMdRecordToOafMapper {
final String trust = n.valueOf("./oaf:trust"); final String trust = n.valueOf("./oaf:trust");
return dataInfo( return dataInfo(
deletedbyinference, deletedbyinference,
inferenceprovenance, inferenceprovenance,
inferred, inferred,
false, false,
qualifier(paClassId, paClassName, paSchemeId, paSchemeName), qualifier(paClassId, paClassName, paSchemeId, paSchemeName),
trust); trust);
} }
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) { protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
@ -477,7 +477,7 @@ public abstract class AbstractMdRecordToOafMapper {
} }
protected List<Field<String>> prepareListFields( protected List<Field<String>> prepareListFields(
final Node node, final String xpath, final DataInfo info) { final Node node, final String xpath, final DataInfo info) {
return listFields(info, prepareListString(node, xpath)); return listFields(info, prepareListString(node, xpath));
} }