Add action set creation for Datacite affiliations

This commit is contained in:
Serafeim Chatzopoulos 2024-04-01 17:23:26 +03:00
parent 24227ab598
commit 0eb0701b26
5 changed files with 27 additions and 3 deletions

View File

@ -67,6 +67,9 @@ public class PrepareAffiliationRelations implements Serializable {
final String openapcInputPath = parser.get("openapcInputPath"); final String openapcInputPath = parser.get("openapcInputPath");
log.info("openapcInputPath: {}", openapcInputPath); log.info("openapcInputPath: {}", openapcInputPath);
final String dataciteInputPath = parser.get("dataciteInputPath");
log.info("dataciteInputPath: {}", dataciteInputPath);
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
@ -93,9 +96,15 @@ public class PrepareAffiliationRelations implements Serializable {
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations( JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
spark, openapcInputPath, collectedFromOpenAPC); spark, openapcInputPath, collectedFromOpenAPC);
List<KeyValue> collectedFromDatacite = OafMapperUtils
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
spark, dataciteInputPath, collectedFromDatacite);
crossrefRelations crossrefRelations
.union(pubmedRelations) .union(pubmedRelations)
.union(openAPCRelations) .union(openAPCRelations)
.union(dataciteRelations)
.saveAsHadoopFile( .saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);

View File

@ -23,6 +23,12 @@
"paramDescription": "the path to get the input data from OpenAPC", "paramDescription": "the path to get the input data from OpenAPC",
"paramRequired": true "paramRequired": true
}, },
{
"paramName": "dip",
"paramLongName": "dataciteInputPath",
"paramDescription": "the path to get the input data from Datacite",
"paramRequired": true
},
{ {
"paramName": "o", "paramName": "o",
"paramLongName": "outputPath", "paramLongName": "outputPath",

View File

@ -34,4 +34,6 @@ oozie.wf.application.path=${oozieTopWfApplicationPath}
crossrefInputPath=/data/bip-affiliations/crossref-data.json crossrefInputPath=/data/bip-affiliations/crossref-data.json
pubmedInputPath=/data/bip-affiliations/pubmed-data.json pubmedInputPath=/data/bip-affiliations/pubmed-data.json
openapcInputPath=/data/bip-affiliations/openapc-data.json openapcInputPath=/data/bip-affiliations/openapc-data.json
dataciteInputPath=/data/bip-affiliations/openapc-data.json
outputPath=/tmp/crossref-affiliations-output-v5 outputPath=/tmp/crossref-affiliations-output-v5

View File

@ -13,6 +13,10 @@
<name>openapcInputPath</name> <name>openapcInputPath</name>
<description>the path where to find the inferred affiliation relations from OpenAPC</description> <description>the path where to find the inferred affiliation relations from OpenAPC</description>
</property> </property>
<property>
<name>dataciteInputPath</name>
<description>the path where to find the inferred affiliation relations from Datacite</description>
</property>
<property> <property>
<name>outputPath</name> <name>outputPath</name>
<description>the path where to store the actionset</description> <description>the path where to store the actionset</description>
@ -107,6 +111,8 @@
<arg>--crossrefInputPath</arg><arg>${crossrefInputPath}</arg> <arg>--crossrefInputPath</arg><arg>${crossrefInputPath}</arg>
<arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg> <arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg>
<arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg> <arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
<arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg> <arg>--outputPath</arg><arg>${outputPath}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>

View File

@ -87,6 +87,7 @@ public class PrepareAffiliationRelationsTest {
"-crossrefInputPath", crossrefAffiliationRelationPath, "-crossrefInputPath", crossrefAffiliationRelationPath,
"-pubmedInputPath", crossrefAffiliationRelationPath, "-pubmedInputPath", crossrefAffiliationRelationPath,
"-openapcInputPath", crossrefAffiliationRelationPath, "-openapcInputPath", crossrefAffiliationRelationPath,
"-dataciteInputPath", crossrefAffiliationRelationPath,
"-outputPath", outputPath "-outputPath", outputPath
}); });
@ -103,7 +104,7 @@ public class PrepareAffiliationRelationsTest {
// ); // );
// } // }
// count the number of relations // count the number of relations
assertEquals(60, tmp.count()); assertEquals(80, tmp.count());
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
dataset.createOrReplaceTempView("result"); dataset.createOrReplaceTempView("result");
@ -114,7 +115,7 @@ public class PrepareAffiliationRelationsTest {
// verify that we have equal number of bi-directional relations // verify that we have equal number of bi-directional relations
Assertions Assertions
.assertEquals( .assertEquals(
30, execVerification 40, execVerification
.filter( .filter(
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'") "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
.collectAsList() .collectAsList()
@ -122,7 +123,7 @@ public class PrepareAffiliationRelationsTest {
Assertions Assertions
.assertEquals( .assertEquals(
30, execVerification 40, execVerification
.filter( .filter(
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'") "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
.collectAsList() .collectAsList()