Add action set creation for Datacite affiliations

This commit is contained in:
Serafeim Chatzopoulos 2024-04-01 17:23:26 +03:00
parent 24227ab598
commit 0eb0701b26
5 changed files with 27 additions and 3 deletions

View File

@ -67,6 +67,9 @@ public class PrepareAffiliationRelations implements Serializable {
final String openapcInputPath = parser.get("openapcInputPath");
log.info("openapcInputPath: {}", openapcInputPath);
final String dataciteInputPath = parser.get("dataciteInputPath");
log.info("dataciteInputPath: {}", dataciteInputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
@ -93,9 +96,15 @@ public class PrepareAffiliationRelations implements Serializable {
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
spark, openapcInputPath, collectedFromOpenAPC);
List<KeyValue> collectedFromDatacite = OafMapperUtils
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
spark, dataciteInputPath, collectedFromDatacite);
crossrefRelations
.union(pubmedRelations)
.union(openAPCRelations)
.union(dataciteRelations)
.saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);

View File

@ -23,6 +23,12 @@
"paramDescription": "the path to get the input data from OpenAPC",
"paramRequired": true
},
{
"paramName": "dip",
"paramLongName": "dataciteInputPath",
"paramDescription": "the path to get the input data from Datacite",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",

View File

@ -34,4 +34,6 @@ oozie.wf.application.path=${oozieTopWfApplicationPath}
crossrefInputPath=/data/bip-affiliations/crossref-data.json
pubmedInputPath=/data/bip-affiliations/pubmed-data.json
openapcInputPath=/data/bip-affiliations/openapc-data.json
dataciteInputPath=/data/bip-affiliations/datacite-data.json
outputPath=/tmp/crossref-affiliations-output-v5

View File

@ -13,6 +13,10 @@
<name>openapcInputPath</name>
<description>the path where to find the inferred affiliation relations from OpenAPC</description>
</property>
<property>
<name>dataciteInputPath</name>
<description>the path where to find the inferred affiliation relations from Datacite</description>
</property>
<property>
<name>outputPath</name>
<description>the path where to store the actionset</description>
@ -107,6 +111,8 @@
<arg>--crossrefInputPath</arg><arg>${crossrefInputPath}</arg>
<arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg>
<arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
<arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="End"/>

View File

@ -87,6 +87,7 @@ public class PrepareAffiliationRelationsTest {
"-crossrefInputPath", crossrefAffiliationRelationPath,
"-pubmedInputPath", crossrefAffiliationRelationPath,
"-openapcInputPath", crossrefAffiliationRelationPath,
"-dataciteInputPath", crossrefAffiliationRelationPath,
"-outputPath", outputPath
});
@ -103,7 +104,7 @@ public class PrepareAffiliationRelationsTest {
// );
// }
// count the number of relations
assertEquals(60, tmp.count());
assertEquals(80, tmp.count());
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
dataset.createOrReplaceTempView("result");
@ -114,7 +115,7 @@ public class PrepareAffiliationRelationsTest {
// verify that we have equal number of bi-directional relations
Assertions
.assertEquals(
30, execVerification
40, execVerification
.filter(
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
.collectAsList()
@ -122,7 +123,7 @@ public class PrepareAffiliationRelationsTest {
Assertions
.assertEquals(
30, execVerification
40, execVerification
.filter(
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
.collectAsList()