[AffiliationFromPublisher]Adding to the creation of the ActrionSet also the links got from the publishers
This commit is contained in:
parent
b8bc237079
commit
907eeadce8
|
@ -44,6 +44,8 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference";
|
||||
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE";
|
||||
public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";
|
||||
public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556";
|
||||
public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE";
|
||||
|
||||
public static <I extends Result> void main(String[] args) throws Exception {
|
||||
|
||||
|
@ -74,6 +76,9 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
final String webcrawlInputPath = parser.get("webCrawlInputPath");
|
||||
log.info("webcrawlInputPath: {}", webcrawlInputPath);
|
||||
|
||||
final String publisherInputPath = parser.get("publisherInputPath");
|
||||
log.info("publisherInputPath: {}", publisherInputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
|
@ -84,43 +89,66 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
Constants.removeOutputDir(spark, outputPath);
|
||||
|
||||
List<KeyValue> collectedFromCrossref = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
|
||||
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(
|
||||
spark, crossrefInputPath, collectedFromCrossref);
|
||||
|
||||
List<KeyValue> collectedFromPubmed = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
|
||||
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
|
||||
spark, pubmedInputPath, collectedFromPubmed);
|
||||
|
||||
List<KeyValue> collectedFromOpenAPC = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
|
||||
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
|
||||
spark, openapcInputPath, collectedFromOpenAPC);
|
||||
|
||||
List<KeyValue> collectedFromDatacite = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
|
||||
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
||||
spark, dataciteInputPath, collectedFromDatacite);
|
||||
|
||||
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
|
||||
.listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME);
|
||||
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
|
||||
spark, webcrawlInputPath, collectedFromWebCrawl);
|
||||
|
||||
crossrefRelations
|
||||
.union(pubmedRelations)
|
||||
.union(openAPCRelations)
|
||||
.union(dataciteRelations)
|
||||
.union(webCrawlRelations)
|
||||
.saveAsHadoopFile(
|
||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||
|
||||
createActionSet(spark, crossrefInputPath, pubmedInputPath, openapcInputPath, dataciteInputPath, webcrawlInputPath, publisherInputPath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void createActionSet(SparkSession spark, String crossrefInputPath, String pubmedInputPath, String openapcInputPath, String dataciteInputPath, String webcrawlInputPath, String publisherlInputPath, String outputPath) {
|
||||
List<KeyValue> collectedFromCrossref = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
|
||||
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(
|
||||
spark, crossrefInputPath, collectedFromCrossref);
|
||||
|
||||
List<KeyValue> collectedFromPubmed = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
|
||||
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
|
||||
spark, pubmedInputPath, collectedFromPubmed);
|
||||
|
||||
List<KeyValue> collectedFromOpenAPC = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
|
||||
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
|
||||
spark, openapcInputPath, collectedFromOpenAPC);
|
||||
|
||||
List<KeyValue> collectedFromDatacite = OafMapperUtils
|
||||
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
|
||||
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
||||
spark, dataciteInputPath, collectedFromDatacite);
|
||||
|
||||
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
|
||||
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
||||
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
|
||||
spark, webcrawlInputPath, collectedFromWebCrawl);
|
||||
|
||||
List<KeyValue> collectedfromPublisher = OafMapperUtils
|
||||
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
||||
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
|
||||
spark, publisherlInputPath, collectedfromPublisher);
|
||||
|
||||
crossrefRelations
|
||||
.union(pubmedRelations)
|
||||
.union(openAPCRelations)
|
||||
.union(dataciteRelations)
|
||||
.union(webCrawlRelations)
|
||||
.union(publisherRelations)
|
||||
.saveAsHadoopFile(
|
||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||
}
|
||||
|
||||
private static JavaPairRDD<Text,Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
|
||||
List<KeyValue> collectedfrom){
|
||||
|
||||
|
||||
Dataset<Row> df = spark
|
||||
.read()
|
||||
.schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
|
||||
.json(inputPath)
|
||||
.where("DOI is not null");
|
||||
|
||||
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
|
||||
|
||||
|
||||
}
|
||||
|
||||
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
|
||||
String inputPath,
|
||||
List<KeyValue> collectedfrom) {
|
||||
|
@ -132,6 +160,10 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
.json(inputPath)
|
||||
.where("DOI is not null");
|
||||
|
||||
return getTextTextJavaPairRDD(collectedfrom, df);
|
||||
}
|
||||
|
||||
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df) {
|
||||
// unroll nested arrays
|
||||
df = df
|
||||
.withColumn("matching", functions.explode(new Column("Matchings")))
|
||||
|
@ -142,41 +174,41 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
|
||||
// prepare action sets for affiliation relations
|
||||
return df
|
||||
.toJavaRDD()
|
||||
.flatMap((FlatMapFunction<Row, Relation>) row -> {
|
||||
.toJavaRDD()
|
||||
.flatMap((FlatMapFunction<Row, Relation>) row -> {
|
||||
|
||||
// DOI to OpenAIRE id
|
||||
final String paperId = ID_PREFIX
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", row.getAs("doi")));
|
||||
// DOI to OpenAIRE id
|
||||
final String paperId = ID_PREFIX
|
||||
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", row.getAs("doi")));
|
||||
|
||||
// ROR id to OpenAIRE id
|
||||
final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid"));
|
||||
// ROR id to OpenAIRE id
|
||||
final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid"));
|
||||
|
||||
Qualifier qualifier = OafMapperUtils
|
||||
.qualifier(
|
||||
BIP_AFFILIATIONS_CLASSID,
|
||||
BIP_AFFILIATIONS_CLASSNAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS);
|
||||
Qualifier qualifier = OafMapperUtils
|
||||
.qualifier(
|
||||
BIP_AFFILIATIONS_CLASSID,
|
||||
BIP_AFFILIATIONS_CLASSNAME,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS);
|
||||
|
||||
// format data info; setting `confidence` into relation's `trust`
|
||||
DataInfo dataInfo = OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
BIP_INFERENCE_PROVENANCE,
|
||||
true,
|
||||
false,
|
||||
qualifier,
|
||||
Double.toString(row.getAs("confidence")));
|
||||
// format data info; setting `confidence` into relation's `trust`
|
||||
DataInfo dataInfo = OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
BIP_INFERENCE_PROVENANCE,
|
||||
true,
|
||||
false,
|
||||
qualifier,
|
||||
Double.toString(row.getAs("confidence")));
|
||||
|
||||
// return bi-directional relations
|
||||
return getAffiliationRelationPair(paperId, affId, collectedfrom, dataInfo).iterator();
|
||||
// return bi-directional relations
|
||||
return getAffiliationRelationPair(paperId, affId, collectedfrom, dataInfo).iterator();
|
||||
|
||||
})
|
||||
.map(p -> new AtomicAction(Relation.class, p))
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
||||
})
|
||||
.map(p -> new AtomicAction(Relation.class, p))
|
||||
.mapToPair(
|
||||
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
||||
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
||||
}
|
||||
|
||||
private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom,
|
||||
|
|
|
@ -33,6 +33,11 @@
|
|||
"paramLongName": "webCrawlInputPath",
|
||||
"paramDescription": "the path to get the input data from Web Crawl",
|
||||
"paramRequired": true
|
||||
},{
|
||||
"paramName": "pip",
|
||||
"paramLongName": "publisherInputPath",
|
||||
"paramDescription": "the path to get the input data from publishers",
|
||||
"paramRequired": true
|
||||
}
|
||||
,
|
||||
{
|
||||
|
|
|
@ -78,6 +78,10 @@ public class PrepareAffiliationRelationsTest {
|
|||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
|
||||
.getPath();
|
||||
|
||||
String publisherAffiliationRelationPath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publishers")
|
||||
.getPath();
|
||||
|
||||
String outputPath = workingDir.toString() + "/actionSet";
|
||||
|
||||
PrepareAffiliationRelations
|
||||
|
@ -89,6 +93,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
"-openapcInputPath", crossrefAffiliationRelationPath,
|
||||
"-dataciteInputPath", crossrefAffiliationRelationPath,
|
||||
"-webCrawlInputPath", crossrefAffiliationRelationPath,
|
||||
"-publisherInputPath", publisherAffiliationRelationPath,
|
||||
"-outputPath", outputPath
|
||||
});
|
||||
|
||||
|
@ -105,7 +110,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
// );
|
||||
// }
|
||||
// count the number of relations
|
||||
assertEquals(120, tmp.count());
|
||||
assertEquals(138, tmp.count());
|
||||
|
||||
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
|
||||
dataset.createOrReplaceTempView("result");
|
||||
|
@ -116,7 +121,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
// verify that we have equal number of bi-directional relations
|
||||
Assertions
|
||||
.assertEquals(
|
||||
60, execVerification
|
||||
69, execVerification
|
||||
.filter(
|
||||
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
|
||||
.collectAsList()
|
||||
|
@ -124,7 +129,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
60, execVerification
|
||||
69, execVerification
|
||||
.filter(
|
||||
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
|
||||
.collectAsList()
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
{"DOI": "10.1007/s00217-010-1268-9", "Authors": [{"Name": {"Full": "Martin Zarnkow", "First": null, "Last": null}, "Raw_affiliations": ["TU M\u00fcnchen, Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Andrea Faltermaier", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Werner Back", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Technologie der Brauerei I"], "Organization_PIDs": []}, {"Name": {"Full": "Martina Gastl", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Elkek K. Arendt", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/BF01154707", "Authors": [{"Name": {"Full": "Buggy, M.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}, {"Name": {"Full": "Carew, A.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/s10237-017-0974-7", "Authors": [{"Name": {"Full": "Donnacha J. McGrath", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Anja Lena Thiebes", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Christian G. Cornelissen", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Barry O\u2019Brien", "First": null, "Last": null}, "Raw_affiliations": ["Department for Internal Medicine \u2013 Section for Pneumology, Medical Faculty, RWTH Aachen University, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/04xfq0f34", "Confidence": 1}]}, {"Name": {"Full": "Stefan Jockenhoevel", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Mark Bruzzi", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Peter E. McHugh", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}, {"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/BF03168973", "Authors": [{"Name": {"Full": "Sheehan, G.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}, {"Name": {"Full": "Chew, N.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}], "Organizations": []}
|
||||
{"DOI": "10.1007/s00338-009-0480-1", "Authors": [{"Name": {"Full": "Gleason, D. F.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Danilowicz, B. S.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Nolan, C. J.", "First": null, "Last": null}, "Raw_affiliations": ["School of Biology and Environmental Science, University College Dublin, Dublin 4, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/05m7pjf47", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}, {"RORid": "https://ror.org/05m7pjf47", "Confidence": 1}]}
|
||||
{"DOI": "10.1007/s10993-010-9187-y", "Authors": [{"Name": {"Full": "Martin Howard", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}
|
Loading…
Reference in New Issue