From 3ee8a7d18a2339a7e19b1675e79755f3ecd85def Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Sat, 29 Jun 2024 18:47:23 +0200 Subject: [PATCH 1/2] [WebCrawl]moved to Constants web crawl name and id --- .../java/eu/dnetlib/dhp/actionmanager/Constants.java | 3 +++ .../webcrawl/CreateActionSetFromWebEntries.java | 9 +++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java index 006d3af76..73b4b77cb 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java @@ -42,6 +42,9 @@ public class Constants { public static final String NULL = "NULL"; public static final String NA = "N/A"; + public static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b"; + public static final String WEB_CRAWL_NAME = "Web Crawl"; + public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private Constants() { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java index 263179d6f..e263f1694 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/webcrawl/CreateActionSetFromWebEntries.java @@ -21,6 +21,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.actionmanager.Constants; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.common.ModelConstants; @@ -29,6 +30,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner; import eu.dnetlib.dhp.schema.oaf.utils.PidType; +import io.netty.util.Constant; import scala.Tuple2; /** @@ -44,8 +46,7 @@ public class CreateActionSetFromWebEntries implements Serializable { private static final String PMID_PREFIX = "50|pmid________::"; private static final String PMCID_PREFIX = "50|pmc_________::"; - private static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b"; - private static final String WEB_CRAWL_NAME = "Web Crawl"; + public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static void main(String[] args) throws Exception { @@ -220,7 +221,7 @@ public class CreateActionSetFromWebEntries implements Serializable { ModelConstants.IS_AUTHOR_INSTITUTION_OF, Arrays .asList( - OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)), + OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)), OafMapperUtils .dataInfo( false, null, false, false, @@ -239,7 +240,7 @@ public class CreateActionSetFromWebEntries implements Serializable { ModelConstants.HAS_AUTHOR_INSTITUTION, Arrays .asList( - OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)), + OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)), OafMapperUtils .dataInfo( false, null, false, false, From 4dbce392375e9acd8dbce7ef66b9c24fed5fdee6 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Sat, 29 Jun 2024 18:51:06 +0200 Subject: [PATCH 2/2] [AffiliationInference]Extended the affiliation ingestion from OpenAIRE to include also the links derived from web crawl. Changed the provenance from BIP! to OpenAIRE --- .../PrepareAffiliationRelations.java | 18 ++++++++++++++---- .../input_actionset_parameter.json | 8 +++++++- .../bipaffiliations/job.properties | 1 + .../bipaffiliations/oozie_app/workflow.xml | 6 +++++- .../PrepareAffiliationRelationsTest.java | 7 ++++--- .../bipaffiliations/doi_to_ror.json | 4 +++- 6 files changed, 34 insertions(+), 10 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java index c28d81992..8f911e980 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java @@ -9,6 +9,7 @@ import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.BZip2Codec; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.spark.SparkConf; @@ -40,9 +41,9 @@ public class PrepareAffiliationRelations implements Serializable { private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelations.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final String ID_PREFIX = "50|doi_________::"; - public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:bipinference"; - public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by BIP!"; - public static final String BIP_INFERENCE_PROVENANCE = "bip:affiliation:crossref"; + public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference"; + public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE"; + public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation"; public static void main(String[] args) throws Exception { @@ -70,6 +71,9 @@ public class PrepareAffiliationRelations implements Serializable { final String dataciteInputPath = parser.get("dataciteInputPath"); log.info("dataciteInputPath: {}", dataciteInputPath); + final String webcrawlInputPath = parser.get("webCrawlInputPath"); + log.info("webcrawlInputPath: {}", webcrawlInputPath); + final String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); @@ -101,12 +105,18 @@ public class PrepareAffiliationRelations implements Serializable { JavaPairRDD dataciteRelations = prepareAffiliationRelations( spark, dataciteInputPath, collectedFromDatacite); + List collectedFromWebCrawl = OafMapperUtils + .listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME); + JavaPairRDD webCrawlRelations = prepareAffiliationRelations( + spark, webcrawlInputPath, collectedFromWebCrawl); + crossrefRelations .union(pubmedRelations) .union(openAPCRelations) .union(dataciteRelations) + .union(webCrawlRelations) .saveAsHadoopFile( - outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); + outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class); }); } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json index 9671129f7..4d85cf26b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json @@ -28,7 +28,13 @@ "paramLongName": "dataciteInputPath", "paramDescription": "the path to get the input data from Datacite", "paramRequired": true - }, + },{ + "paramName": "wip", + "paramLongName": "webCrawlInputPath", + "paramDescription": "the path to get the input data from Web Crawl", + "paramRequired": true +} +, { "paramName": "o", "paramLongName": "outputPath", diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties index ded4fe409..3639e11da 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties @@ -35,5 +35,6 @@ crossrefInputPath=/data/bip-affiliations/crossref-data.json pubmedInputPath=/data/bip-affiliations/pubmed-data.json openapcInputPath=/data/bip-affiliations/openapc-data.json dataciteInputPath=/data/bip-affiliations/datacite-data.json +webCrawlInputPath=/data/bip-affiliations/webCrawl/ outputPath=/tmp/crossref-affiliations-output-v5 diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml index e8e6a7c33..2e89c07fd 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml @@ -17,6 +17,10 @@ dataciteInputPath the path where to find the inferred affiliation relations from Datacite + + webCrawlInputPath + the path where to find the inferred affiliation relations from webCrawl + outputPath the path where to store the actionset @@ -112,7 +116,7 @@ --pubmedInputPath${pubmedInputPath} --openapcInputPath${openapcInputPath} --dataciteInputPath${dataciteInputPath} - + --webCrawlInputPath${webCrawlInputPath} --outputPath${outputPath} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java index bceb9d754..bffe41ac7 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java @@ -88,6 +88,7 @@ public class PrepareAffiliationRelationsTest { "-pubmedInputPath", crossrefAffiliationRelationPath, "-openapcInputPath", crossrefAffiliationRelationPath, "-dataciteInputPath", crossrefAffiliationRelationPath, + "-webCrawlInputPath", crossrefAffiliationRelationPath, "-outputPath", outputPath }); @@ -104,7 +105,7 @@ public class PrepareAffiliationRelationsTest { // ); // } // count the number of relations - assertEquals(80, tmp.count()); + assertEquals(120, tmp.count()); Dataset dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); dataset.createOrReplaceTempView("result"); @@ -115,7 +116,7 @@ public class PrepareAffiliationRelationsTest { // verify that we have equal number of bi-directional relations Assertions .assertEquals( - 40, execVerification + 60, execVerification .filter( "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'") .collectAsList() @@ -123,7 +124,7 @@ public class PrepareAffiliationRelationsTest { Assertions .assertEquals( - 40, execVerification + 60, execVerification .filter( "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'") .collectAsList() diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json index 985a8d14b..08dc3f7eb 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json @@ -4,4 +4,6 @@ {"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]} {"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]} {"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]} -{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]} \ No newline at end of file +{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]} +{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]} +{"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]} \ No newline at end of file