From 7de0164c269fcc1b4b15a37ca006e3383709aad6 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Mon, 4 Sep 2023 16:04:41 +0300 Subject: [PATCH 1/2] Fix import of affiliations relations from Crossref --- .../PrepareAffiliationRelations.java | 5 ++--- .../actionmanager/bipaffiliations/job.properties | 4 ++-- .../PrepareAffiliationRelationsTest.java | 8 ++++---- .../actionmanager/bipaffiliations/doi_to_ror.json | 13 +++++++------ 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java index a9c610de7b..9ac610240c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java @@ -80,16 +80,15 @@ public class PrepareAffiliationRelations implements Serializable { // load and parse affiliation relations from HDFS Dataset df = spark .read() - .schema("`DOI` STRING, `Matchings` ARRAY,`Confidence`:DOUBLE>>") + .schema("`DOI` STRING, `Matchings` ARRAY>") .json(inputPath); // unroll nested arrays df = df .withColumn("matching", functions.explode(new Column("Matchings"))) - .withColumn("rorid", functions.explode(new Column("matching.RORid"))) .select( new Column("DOI").as("doi"), - new Column("rorid"), + new Column("matching.RORid").as("rorid"), new Column("matching.Confidence").as("confidence")); // prepare action sets for affiliation relations diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties index 43d86ee09f..d942e67723 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties @@ -31,5 +31,5 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen # The following is needed as a property of a workflow oozie.wf.application.path=${oozieTopWfApplicationPath} -inputPath=/user/schatz/affiliations/data-v3.1.json -outputPath=/tmp/crossref-affiliations-output-v3.1 +inputPath=/data/bip-affiliations/data.json +outputPath=/tmp/crossref-affiliations-output-v5 diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java index 72aabde7f8..ed8e5fe0df 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java @@ -101,7 +101,7 @@ public class PrepareAffiliationRelationsTest { // ); // } // count the number of relations - assertEquals(16, tmp.count()); + assertEquals(20, tmp.count()); Dataset dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); dataset.createOrReplaceTempView("result"); @@ -112,7 +112,7 @@ public class PrepareAffiliationRelationsTest { // verify that we have equal number of bi-directional relations Assertions .assertEquals( - 8, execVerification + 10, execVerification .filter( "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'") .collectAsList() @@ -120,14 +120,14 @@ public class PrepareAffiliationRelationsTest { Assertions .assertEquals( - 8, execVerification + 10, execVerification .filter( "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'") .collectAsList() .size()); // check confidence value of a specific relation - String sourceDOI = "10.1105/tpc.8.3.343"; + String sourceDOI = "10.1061/(asce)0733-9399(2002)128:7(759)"; final String sourceOpenaireId = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", sourceDOI)); diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json index 3b067dcc83..985a8d14b4 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json @@ -1,6 +1,7 @@ -{"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":["https:\/\/ror.org\/01teme464"],"Confidence":0.73},{"RORid":["https:\/\/ror.org\/03yxnpp24"],"Confidence":0.7071067812}]} -{"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":["https:\/\/ror.org\/02k40bc56"],"Confidence":0.7071067812}]} -{"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":["https:\/\/ror.org\/00qjgza05"],"Confidence":1}]} -{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":["https:\/\/ror.org\/05apxxy63"],"Confidence":1},{"RORid":["https:\/\/ror.org\/035xkbk20"],"Confidence":1}]} -{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":["https:\/\/ror.org\/04j198w64"],"Confidence":0.58}]} -{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":["https:\/\/ror.org\/057xtrt18"],"Confidence":0.7071067812}]} \ No newline at end of file +{"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":"https:\/\/ror.org\/03yxnpp24","Confidence":0.7071067812},{"RORid":"https:\/\/ror.org\/01teme464","Confidence":0.89}]} +{"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":"https:\/\/ror.org\/02k40bc56","Confidence":0.7071067812}]} +{"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":"https:\/\/ror.org\/00qjgza05","Confidence":1}]} +{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]} +{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]} +{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]} +{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]} \ No newline at end of file From 15666e86a8448f724e3c7876f480afac36b4a766 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 4 Sep 2023 15:56:06 +0200 Subject: [PATCH 2/2] added collectedfrom to the affiliation relations imported from Crossref --- .../bipaffiliations/PrepareAffiliationRelations.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java index 9ac610240c..603ad6339f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java @@ -120,8 +120,10 @@ public class PrepareAffiliationRelations implements Serializable { qualifier, Double.toString(row.getAs("confidence"))); + List collectedfrom = OafMapperUtils.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref"); + // return bi-directional relations - return getAffiliationRelationPair(paperId, affId, dataInfo).iterator(); + return getAffiliationRelationPair(paperId, affId, collectedfrom, dataInfo).iterator(); }) .map(p -> new AtomicAction(Relation.class, p)) @@ -132,7 +134,8 @@ public class PrepareAffiliationRelations implements Serializable { } - private static List getAffiliationRelationPair(String paperId, String affId, DataInfo dataInfo) { + private static List getAffiliationRelationPair(String paperId, String affId, List collectedfrom, + DataInfo dataInfo) { return Arrays .asList( OafMapperUtils @@ -142,7 +145,7 @@ public class PrepareAffiliationRelations implements Serializable { ModelConstants.RESULT_ORGANIZATION, ModelConstants.AFFILIATION, ModelConstants.HAS_AUTHOR_INSTITUTION, - null, + collectedfrom, dataInfo, null), OafMapperUtils @@ -152,7 +155,7 @@ public class PrepareAffiliationRelations implements Serializable { ModelConstants.RESULT_ORGANIZATION, ModelConstants.AFFILIATION, ModelConstants.IS_AUTHOR_INSTITUTION_OF, - null, + collectedfrom, dataInfo, null)); }