AffiliationFromPublisher #470

Merged
claudio.atzori merged 3 commits from AffiliationFromPublisher into main 2024-08-07 14:49:29 +02:00
7 changed files with 99 additions and 40 deletions

View File

@ -44,6 +44,8 @@ public class PrepareAffiliationRelations implements Serializable {
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference"; public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference";
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE"; public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE";
public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation"; public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";
public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556";
public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE";
public static <I extends Result> void main(String[] args) throws Exception { public static <I extends Result> void main(String[] args) throws Exception {
@ -74,6 +76,9 @@ public class PrepareAffiliationRelations implements Serializable {
final String webcrawlInputPath = parser.get("webCrawlInputPath"); final String webcrawlInputPath = parser.get("webCrawlInputPath");
log.info("webcrawlInputPath: {}", webcrawlInputPath); log.info("webcrawlInputPath: {}", webcrawlInputPath);
final String publisherInputPath = parser.get("publisherInputPath");
log.info("publisherInputPath: {}", publisherInputPath);
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
@ -84,43 +89,68 @@ public class PrepareAffiliationRelations implements Serializable {
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
Constants.removeOutputDir(spark, outputPath); Constants.removeOutputDir(spark, outputPath);
createActionSet(
List<KeyValue> collectedFromCrossref = OafMapperUtils spark, crossrefInputPath, pubmedInputPath, openapcInputPath, dataciteInputPath, webcrawlInputPath,
.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref"); publisherInputPath, outputPath);
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(
spark, crossrefInputPath, collectedFromCrossref);
List<KeyValue> collectedFromPubmed = OafMapperUtils
.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
spark, pubmedInputPath, collectedFromPubmed);
List<KeyValue> collectedFromOpenAPC = OafMapperUtils
.listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
spark, openapcInputPath, collectedFromOpenAPC);
List<KeyValue> collectedFromDatacite = OafMapperUtils
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
spark, dataciteInputPath, collectedFromDatacite);
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
.listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME);
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
spark, webcrawlInputPath, collectedFromWebCrawl);
crossrefRelations
.union(pubmedRelations)
.union(openAPCRelations)
.union(dataciteRelations)
.union(webCrawlRelations)
.saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
}); });
} }
private static void createActionSet(SparkSession spark, String crossrefInputPath, String pubmedInputPath,
String openapcInputPath, String dataciteInputPath, String webcrawlInputPath, String publisherlInputPath,
String outputPath) {
List<KeyValue> collectedFromCrossref = OafMapperUtils
.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(
spark, crossrefInputPath, collectedFromCrossref);
List<KeyValue> collectedFromPubmed = OafMapperUtils
.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
spark, pubmedInputPath, collectedFromPubmed);
List<KeyValue> collectedFromOpenAPC = OafMapperUtils
.listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
spark, openapcInputPath, collectedFromOpenAPC);
List<KeyValue> collectedFromDatacite = OafMapperUtils
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
spark, dataciteInputPath, collectedFromDatacite);
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
spark, webcrawlInputPath, collectedFromWebCrawl);
List<KeyValue> collectedfromPublisher = OafMapperUtils
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
spark, publisherlInputPath, collectedfromPublisher);
crossrefRelations
.union(pubmedRelations)
.union(openAPCRelations)
.union(dataciteRelations)
.union(webCrawlRelations)
.union(publisherRelations)
.saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
}
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
List<KeyValue> collectedfrom) {
Dataset<Row> df = spark
.read()
.schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
.json(inputPath)
.where("DOI is not null");
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
}
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark, private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
String inputPath, String inputPath,
List<KeyValue> collectedfrom) { List<KeyValue> collectedfrom) {
@ -132,6 +162,10 @@ public class PrepareAffiliationRelations implements Serializable {
.json(inputPath) .json(inputPath)
.where("DOI is not null"); .where("DOI is not null");
return getTextTextJavaPairRDD(collectedfrom, df);
}
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df) {
// unroll nested arrays // unroll nested arrays
df = df df = df
.withColumn("matching", functions.explode(new Column("Matchings"))) .withColumn("matching", functions.explode(new Column("Matchings")))

View File

@ -33,6 +33,11 @@
"paramLongName": "webCrawlInputPath", "paramLongName": "webCrawlInputPath",
"paramDescription": "the path to get the input data from Web Crawl", "paramDescription": "the path to get the input data from Web Crawl",
"paramRequired": true "paramRequired": true
},{
"paramName": "pip",
"paramLongName": "publisherInputPath",
"paramDescription": "the path to get the input data from publishers",
"paramRequired": true
} }
, ,
{ {

View File

@ -78,6 +78,10 @@ public class PrepareAffiliationRelationsTest {
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json") .getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
.getPath(); .getPath();
String publisherAffiliationRelationPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publishers")
.getPath();
String outputPath = workingDir.toString() + "/actionSet"; String outputPath = workingDir.toString() + "/actionSet";
PrepareAffiliationRelations PrepareAffiliationRelations
@ -89,6 +93,7 @@ public class PrepareAffiliationRelationsTest {
"-openapcInputPath", crossrefAffiliationRelationPath, "-openapcInputPath", crossrefAffiliationRelationPath,
"-dataciteInputPath", crossrefAffiliationRelationPath, "-dataciteInputPath", crossrefAffiliationRelationPath,
"-webCrawlInputPath", crossrefAffiliationRelationPath, "-webCrawlInputPath", crossrefAffiliationRelationPath,
"-publisherInputPath", publisherAffiliationRelationPath,
"-outputPath", outputPath "-outputPath", outputPath
}); });
@ -105,7 +110,7 @@ public class PrepareAffiliationRelationsTest {
// ); // );
// } // }
// count the number of relations // count the number of relations
assertEquals(120, tmp.count()); assertEquals(138, tmp.count());
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
dataset.createOrReplaceTempView("result"); dataset.createOrReplaceTempView("result");
@ -116,7 +121,7 @@ public class PrepareAffiliationRelationsTest {
// verify that we have equal number of bi-directional relations // verify that we have equal number of bi-directional relations
Assertions Assertions
.assertEquals( .assertEquals(
60, execVerification 69, execVerification
.filter( .filter(
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'") "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
.collectAsList() .collectAsList()
@ -124,7 +129,7 @@ public class PrepareAffiliationRelationsTest {
Assertions Assertions
.assertEquals( .assertEquals(
60, execVerification 69, execVerification
.filter( .filter(
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'") "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
.collectAsList() .collectAsList()
@ -145,5 +150,11 @@ public class PrepareAffiliationRelationsTest {
.get(0) .get(0)
.getString(4)); .getString(4));
final String publisherid = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s00217-010-1268-9"));
final String rorId = "20|ror_________::" + IdentifierFactory.md5("https://ror.org/03265fv13");
Assertions.assertEquals(1, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId +"'").count()
);
} }
} }

View File

@ -0,0 +1,6 @@
{"DOI": "10.1007/s00217-010-1268-9", "Authors": [{"Name": {"Full": "Martin Zarnkow", "First": null, "Last": null}, "Raw_affiliations": ["TU M\u00fcnchen, Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Andrea Faltermaier", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Werner Back", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Technologie der Brauerei I"], "Organization_PIDs": []}, {"Name": {"Full": "Martina Gastl", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Elkek K. Arendt", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}
{"DOI": "10.1007/BF01154707", "Authors": [{"Name": {"Full": "Buggy, M.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}, {"Name": {"Full": "Carew, A.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}
{"DOI": "10.1007/s10237-017-0974-7", "Authors": [{"Name": {"Full": "Donnacha J. McGrath", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Anja Lena Thiebes", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Christian G. Cornelissen", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Barry O\u2019Brien", "First": null, "Last": null}, "Raw_affiliations": ["Department for Internal Medicine \u2013 Section for Pneumology, Medical Faculty, RWTH Aachen University, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/04xfq0f34", "Confidence": 1}]}, {"Name": {"Full": "Stefan Jockenhoevel", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Mark Bruzzi", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Peter E. McHugh", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}, {"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 1}]}
{"DOI": "10.1007/BF03168973", "Authors": [{"Name": {"Full": "Sheehan, G.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}, {"Name": {"Full": "Chew, N.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}], "Organizations": []}
{"DOI": "10.1007/s00338-009-0480-1", "Authors": [{"Name": {"Full": "Gleason, D. F.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Danilowicz, B. S.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Nolan, C. J.", "First": null, "Last": null}, "Raw_affiliations": ["School of Biology and Environmental Science, University College Dublin, Dublin 4, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/05m7pjf47", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}, {"RORid": "https://ror.org/05m7pjf47", "Confidence": 1}]}
{"DOI": "10.1007/s10993-010-9187-y", "Authors": [{"Name": {"Full": "Martin Howard", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}

View File

@ -1,3 +1,4 @@
package eu.dnetlib.dhp.broker.oa.matchers.simple; package eu.dnetlib.dhp.broker.oa.matchers.simple;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
@ -16,7 +17,8 @@ class EnrichMissingAuthorOrcidTest {
final EnrichMissingAuthorOrcid matcher = new EnrichMissingAuthorOrcid(); final EnrichMissingAuthorOrcid matcher = new EnrichMissingAuthorOrcid();
@BeforeEach @BeforeEach
void setUp() throws Exception {} void setUp() throws Exception {
}
@Test @Test
void testFindDifferences_1() { void testFindDifferences_1() {

View File

@ -23,7 +23,8 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class ConversionUtilsTest { public class ConversionUtilsTest {
@BeforeEach @BeforeEach
public void setUp() throws Exception {} public void setUp() throws Exception {
}
@Test @Test
public void testAllResultPids() { public void testAllResultPids() {

View File

@ -20,7 +20,6 @@ import javax.xml.transform.*;
import javax.xml.transform.dom.DOMSource; import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamResult;
import eu.dnetlib.dhp.oa.provision.model.*;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.lang3.tuple.Pair;
@ -42,6 +41,7 @@ import com.google.common.collect.Sets;
import com.mycila.xmltool.XMLDoc; import com.mycila.xmltool.XMLDoc;
import com.mycila.xmltool.XMLTag; import com.mycila.xmltool.XMLTag;
import eu.dnetlib.dhp.oa.provision.model.*;
import eu.dnetlib.dhp.schema.common.*; import eu.dnetlib.dhp.schema.common.*;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;