diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java index c28d81992..8f911e980 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java @@ -9,6 +9,7 @@ import java.util.List; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.BZip2Codec; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.spark.SparkConf; @@ -40,9 +41,9 @@ public class PrepareAffiliationRelations implements Serializable { private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelations.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final String ID_PREFIX = "50|doi_________::"; - public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:bipinference"; - public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by BIP!"; - public static final String BIP_INFERENCE_PROVENANCE = "bip:affiliation:crossref"; + public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference"; + public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE"; + public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation"; public static void main(String[] args) throws Exception { @@ -70,6 +71,9 @@ public class PrepareAffiliationRelations implements Serializable { final String dataciteInputPath = parser.get("dataciteInputPath"); log.info("dataciteInputPath: {}", dataciteInputPath); + final String webcrawlInputPath = parser.get("webCrawlInputPath"); + log.info("webcrawlInputPath: {}", webcrawlInputPath); + final String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); @@ -101,12 +105,18 @@ public class PrepareAffiliationRelations implements Serializable { JavaPairRDD dataciteRelations = prepareAffiliationRelations( spark, dataciteInputPath, collectedFromDatacite); + List collectedFromWebCrawl = OafMapperUtils + .listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME); + JavaPairRDD webCrawlRelations = prepareAffiliationRelations( + spark, webcrawlInputPath, collectedFromWebCrawl); + crossrefRelations .union(pubmedRelations) .union(openAPCRelations) .union(dataciteRelations) + .union(webCrawlRelations) .saveAsHadoopFile( - outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); + outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class); }); } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json index 9671129f7..4d85cf26b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/input_actionset_parameter.json @@ -28,7 +28,13 @@ "paramLongName": "dataciteInputPath", "paramDescription": "the path to get the input data from Datacite", "paramRequired": true - }, + },{ + "paramName": "wip", + "paramLongName": "webCrawlInputPath", + "paramDescription": "the path to get the input data from Web Crawl", + "paramRequired": true +} +, { "paramName": "o", "paramLongName": "outputPath", diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties index ded4fe409..3639e11da 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties @@ -35,5 +35,6 @@ crossrefInputPath=/data/bip-affiliations/crossref-data.json pubmedInputPath=/data/bip-affiliations/pubmed-data.json openapcInputPath=/data/bip-affiliations/openapc-data.json dataciteInputPath=/data/bip-affiliations/datacite-data.json +webCrawlInputPath=/data/bip-affiliations/webCrawl/ outputPath=/tmp/crossref-affiliations-output-v5 diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml index e8e6a7c33..2e89c07fd 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml @@ -17,6 +17,10 @@ dataciteInputPath the path where to find the inferred affiliation relations from Datacite + + webCrawlInputPath + the path where to find the inferred affiliation relations from webCrawl + outputPath the path where to store the actionset @@ -112,7 +116,7 @@ --pubmedInputPath${pubmedInputPath} --openapcInputPath${openapcInputPath} --dataciteInputPath${dataciteInputPath} - + --webCrawlInputPath${webCrawlInputPath} --outputPath${outputPath} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java index bceb9d754..bffe41ac7 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java @@ -88,6 +88,7 @@ public class PrepareAffiliationRelationsTest { "-pubmedInputPath", crossrefAffiliationRelationPath, "-openapcInputPath", crossrefAffiliationRelationPath, "-dataciteInputPath", crossrefAffiliationRelationPath, + "-webCrawlInputPath", crossrefAffiliationRelationPath, "-outputPath", outputPath }); @@ -104,7 +105,7 @@ public class PrepareAffiliationRelationsTest { // ); // } // count the number of relations - assertEquals(80, tmp.count()); + assertEquals(120, tmp.count()); Dataset dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); dataset.createOrReplaceTempView("result"); @@ -115,7 +116,7 @@ public class PrepareAffiliationRelationsTest { // verify that we have equal number of bi-directional relations Assertions .assertEquals( - 40, execVerification + 60, execVerification .filter( "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'") .collectAsList() @@ -123,7 +124,7 @@ public class PrepareAffiliationRelationsTest { Assertions .assertEquals( - 40, execVerification + 60, execVerification .filter( "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'") .collectAsList() diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json index 985a8d14b..08dc3f7eb 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json @@ -4,4 +4,6 @@ {"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]} {"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]} {"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]} -{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]} \ No newline at end of file +{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]} +{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]} +{"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]} \ No newline at end of file