|
|
@ -89,64 +89,66 @@ public class PrepareAffiliationRelations implements Serializable {
|
|
|
|
isSparkSessionManaged,
|
|
|
|
isSparkSessionManaged,
|
|
|
|
spark -> {
|
|
|
|
spark -> {
|
|
|
|
Constants.removeOutputDir(spark, outputPath);
|
|
|
|
Constants.removeOutputDir(spark, outputPath);
|
|
|
|
createActionSet(spark, crossrefInputPath, pubmedInputPath, openapcInputPath, dataciteInputPath, webcrawlInputPath, publisherInputPath, outputPath);
|
|
|
|
createActionSet(
|
|
|
|
|
|
|
|
spark, crossrefInputPath, pubmedInputPath, openapcInputPath, dataciteInputPath, webcrawlInputPath,
|
|
|
|
|
|
|
|
publisherInputPath, outputPath);
|
|
|
|
});
|
|
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
private static void createActionSet(SparkSession spark, String crossrefInputPath, String pubmedInputPath, String openapcInputPath, String dataciteInputPath, String webcrawlInputPath, String publisherlInputPath, String outputPath) {
|
|
|
|
private static void createActionSet(SparkSession spark, String crossrefInputPath, String pubmedInputPath,
|
|
|
|
|
|
|
|
String openapcInputPath, String dataciteInputPath, String webcrawlInputPath, String publisherlInputPath,
|
|
|
|
|
|
|
|
String outputPath) {
|
|
|
|
List<KeyValue> collectedFromCrossref = OafMapperUtils
|
|
|
|
List<KeyValue> collectedFromCrossref = OafMapperUtils
|
|
|
|
.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
|
|
|
|
.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
|
|
|
|
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(
|
|
|
|
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(
|
|
|
|
spark, crossrefInputPath, collectedFromCrossref);
|
|
|
|
spark, crossrefInputPath, collectedFromCrossref);
|
|
|
|
|
|
|
|
|
|
|
|
List<KeyValue> collectedFromPubmed = OafMapperUtils
|
|
|
|
List<KeyValue> collectedFromPubmed = OafMapperUtils
|
|
|
|
.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
|
|
|
|
.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
|
|
|
|
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
|
|
|
|
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
|
|
|
|
spark, pubmedInputPath, collectedFromPubmed);
|
|
|
|
spark, pubmedInputPath, collectedFromPubmed);
|
|
|
|
|
|
|
|
|
|
|
|
List<KeyValue> collectedFromOpenAPC = OafMapperUtils
|
|
|
|
List<KeyValue> collectedFromOpenAPC = OafMapperUtils
|
|
|
|
.listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
|
|
|
|
.listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
|
|
|
|
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
|
|
|
|
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
|
|
|
|
spark, openapcInputPath, collectedFromOpenAPC);
|
|
|
|
spark, openapcInputPath, collectedFromOpenAPC);
|
|
|
|
|
|
|
|
|
|
|
|
List<KeyValue> collectedFromDatacite = OafMapperUtils
|
|
|
|
List<KeyValue> collectedFromDatacite = OafMapperUtils
|
|
|
|
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
|
|
|
|
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
|
|
|
|
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
|
|
|
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
|
|
|
spark, dataciteInputPath, collectedFromDatacite);
|
|
|
|
spark, dataciteInputPath, collectedFromDatacite);
|
|
|
|
|
|
|
|
|
|
|
|
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
|
|
|
|
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
|
|
|
|
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
|
|
|
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
|
|
|
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
|
|
|
|
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
|
|
|
|
spark, webcrawlInputPath, collectedFromWebCrawl);
|
|
|
|
spark, webcrawlInputPath, collectedFromWebCrawl);
|
|
|
|
|
|
|
|
|
|
|
|
List<KeyValue> collectedfromPublisher = OafMapperUtils
|
|
|
|
List<KeyValue> collectedfromPublisher = OafMapperUtils
|
|
|
|
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
|
|
|
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
|
|
|
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
|
|
|
|
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
|
|
|
|
spark, publisherlInputPath, collectedfromPublisher);
|
|
|
|
spark, publisherlInputPath, collectedfromPublisher);
|
|
|
|
|
|
|
|
|
|
|
|
crossrefRelations
|
|
|
|
crossrefRelations
|
|
|
|
.union(pubmedRelations)
|
|
|
|
.union(pubmedRelations)
|
|
|
|
.union(openAPCRelations)
|
|
|
|
.union(openAPCRelations)
|
|
|
|
.union(dataciteRelations)
|
|
|
|
.union(dataciteRelations)
|
|
|
|
.union(webCrawlRelations)
|
|
|
|
.union(webCrawlRelations)
|
|
|
|
.union(publisherRelations)
|
|
|
|
.union(publisherRelations)
|
|
|
|
.saveAsHadoopFile(
|
|
|
|
.saveAsHadoopFile(
|
|
|
|
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
|
|
|
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
private static JavaPairRDD<Text,Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
|
|
|
|
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
|
|
|
|
List<KeyValue> collectedfrom){
|
|
|
|
List<KeyValue> collectedfrom) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dataset<Row> df = spark
|
|
|
|
Dataset<Row> df = spark
|
|
|
|
.read()
|
|
|
|
.read()
|
|
|
|
.schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
|
|
|
|
.schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
|
|
|
|
.json(inputPath)
|
|
|
|
.json(inputPath)
|
|
|
|
.where("DOI is not null");
|
|
|
|
.where("DOI is not null");
|
|
|
|
|
|
|
|
|
|
|
|
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
|
|
|
|
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
|
|
|
|
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
|
|
|
@ -174,41 +176,41 @@ public class PrepareAffiliationRelations implements Serializable {
|
|
|
|
|
|
|
|
|
|
|
|
// prepare action sets for affiliation relations
|
|
|
|
// prepare action sets for affiliation relations
|
|
|
|
return df
|
|
|
|
return df
|
|
|
|
.toJavaRDD()
|
|
|
|
.toJavaRDD()
|
|
|
|
.flatMap((FlatMapFunction<Row, Relation>) row -> {
|
|
|
|
.flatMap((FlatMapFunction<Row, Relation>) row -> {
|
|
|
|
|
|
|
|
|
|
|
|
// DOI to OpenAIRE id
|
|
|
|
// DOI to OpenAIRE id
|
|
|
|
final String paperId = ID_PREFIX
|
|
|
|
final String paperId = ID_PREFIX
|
|
|
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", row.getAs("doi")));
|
|
|
|
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", row.getAs("doi")));
|
|
|
|
|
|
|
|
|
|
|
|
// ROR id to OpenAIRE id
|
|
|
|
// ROR id to OpenAIRE id
|
|
|
|
final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid"));
|
|
|
|
final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid"));
|
|
|
|
|
|
|
|
|
|
|
|
Qualifier qualifier = OafMapperUtils
|
|
|
|
Qualifier qualifier = OafMapperUtils
|
|
|
|
.qualifier(
|
|
|
|
.qualifier(
|
|
|
|
BIP_AFFILIATIONS_CLASSID,
|
|
|
|
BIP_AFFILIATIONS_CLASSID,
|
|
|
|
BIP_AFFILIATIONS_CLASSNAME,
|
|
|
|
BIP_AFFILIATIONS_CLASSNAME,
|
|
|
|
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
|
|
|
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
|
|
|
ModelConstants.DNET_PROVENANCE_ACTIONS);
|
|
|
|
ModelConstants.DNET_PROVENANCE_ACTIONS);
|
|
|
|
|
|
|
|
|
|
|
|
// format data info; setting `confidence` into relation's `trust`
|
|
|
|
// format data info; setting `confidence` into relation's `trust`
|
|
|
|
DataInfo dataInfo = OafMapperUtils
|
|
|
|
DataInfo dataInfo = OafMapperUtils
|
|
|
|
.dataInfo(
|
|
|
|
.dataInfo(
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
BIP_INFERENCE_PROVENANCE,
|
|
|
|
BIP_INFERENCE_PROVENANCE,
|
|
|
|
true,
|
|
|
|
true,
|
|
|
|
false,
|
|
|
|
false,
|
|
|
|
qualifier,
|
|
|
|
qualifier,
|
|
|
|
Double.toString(row.getAs("confidence")));
|
|
|
|
Double.toString(row.getAs("confidence")));
|
|
|
|
|
|
|
|
|
|
|
|
// return bi-directional relations
|
|
|
|
// return bi-directional relations
|
|
|
|
return getAffiliationRelationPair(paperId, affId, collectedfrom, dataInfo).iterator();
|
|
|
|
return getAffiliationRelationPair(paperId, affId, collectedfrom, dataInfo).iterator();
|
|
|
|
|
|
|
|
|
|
|
|
})
|
|
|
|
})
|
|
|
|
.map(p -> new AtomicAction(Relation.class, p))
|
|
|
|
.map(p -> new AtomicAction(Relation.class, p))
|
|
|
|
.mapToPair(
|
|
|
|
.mapToPair(
|
|
|
|
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
|
|
|
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
|
|
|
|
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
|
|
|
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom,
|
|
|
|
private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom,
|
|
|
|