[createASfromAffRo] adding the provenance datasource used to get the relation (no datasource can be webcrawl = publisher, rawaff means oalex)
This commit is contained in:
parent
56b05cde0b
commit
0e5dd14538
|
@ -104,22 +104,22 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
|
||||
|
||||
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelationsNewModel(
|
||||
spark, crossrefInputPath, collectedfromOpenAIRE);
|
||||
spark, crossrefInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::crossref");
|
||||
|
||||
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
|
||||
spark, pubmedInputPath, collectedfromOpenAIRE);
|
||||
spark, pubmedInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::pubmed");
|
||||
|
||||
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelationsNewModel(
|
||||
spark, openapcInputPath, collectedfromOpenAIRE);
|
||||
spark, openapcInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::openapc");
|
||||
|
||||
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
||||
spark, dataciteInputPath, collectedfromOpenAIRE);
|
||||
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelationsNewModel(
|
||||
spark, dataciteInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::datacite");
|
||||
|
||||
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
|
||||
spark, webcrawlInputPath, collectedfromOpenAIRE);
|
||||
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelationsNewModel(
|
||||
spark, webcrawlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::rawaff");
|
||||
|
||||
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
|
||||
spark, publisherlInputPath, collectedfromOpenAIRE);
|
||||
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisherNewModel(
|
||||
spark, publisherlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::webcrawl");
|
||||
|
||||
crossrefRelations
|
||||
.union(pubmedRelations)
|
||||
|
@ -133,7 +133,8 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
|
||||
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisherNewModel(SparkSession spark,
|
||||
String inputPath,
|
||||
List<KeyValue> collectedfrom) {
|
||||
List<KeyValue> collectedfrom,
|
||||
String dataprovenance) {
|
||||
|
||||
Dataset<Row> df = spark
|
||||
.read()
|
||||
|
@ -142,12 +143,13 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
.json(inputPath)
|
||||
.where("DOI is not null");
|
||||
|
||||
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
|
||||
return getTextTextJavaPairRDDNew(
|
||||
collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"), dataprovenance);
|
||||
|
||||
}
|
||||
|
||||
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
|
||||
List<KeyValue> collectedfrom) {
|
||||
List<KeyValue> collectedfrom, String dataprovenance) {
|
||||
|
||||
Dataset<Row> df = spark
|
||||
.read()
|
||||
|
@ -155,13 +157,14 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
.json(inputPath)
|
||||
.where("DOI is not null");
|
||||
|
||||
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
|
||||
return getTextTextJavaPairRDD(
|
||||
collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"), dataprovenance);
|
||||
|
||||
}
|
||||
|
||||
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelations(SparkSession spark,
|
||||
String inputPath,
|
||||
List<KeyValue> collectedfrom) {
|
||||
List<KeyValue> collectedfrom, String dataprovenance) {
|
||||
|
||||
// load and parse affiliation relations from HDFS
|
||||
Dataset<Row> df = spark
|
||||
|
@ -170,12 +173,12 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
.json(inputPath)
|
||||
.where("DOI is not null");
|
||||
|
||||
return getTextTextJavaPairRDD(collectedfrom, df);
|
||||
return getTextTextJavaPairRDD(collectedfrom, df, dataprovenance);
|
||||
}
|
||||
|
||||
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelationsNewModel(SparkSession spark,
|
||||
String inputPath,
|
||||
List<KeyValue> collectedfrom) {
|
||||
List<KeyValue> collectedfrom, String dataprovenance) {
|
||||
// load and parse affiliation relations from HDFS
|
||||
Dataset<Row> df = spark
|
||||
.read()
|
||||
|
@ -184,10 +187,11 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
.json(inputPath)
|
||||
.where("DOI is not null");
|
||||
|
||||
return getTextTextJavaPairRDDNew(collectedfrom, df);
|
||||
return getTextTextJavaPairRDDNew(collectedfrom, df, dataprovenance);
|
||||
}
|
||||
|
||||
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df) {
|
||||
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df,
|
||||
String dataprovenance) {
|
||||
// unroll nested arrays
|
||||
df = df
|
||||
.withColumn("matching", functions.explode(new Column("Matchings")))
|
||||
|
@ -219,7 +223,7 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
DataInfo dataInfo = OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
BIP_INFERENCE_PROVENANCE,
|
||||
dataprovenance,
|
||||
true,
|
||||
false,
|
||||
qualifier,
|
||||
|
@ -235,7 +239,8 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
|
||||
}
|
||||
|
||||
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDDNew(List<KeyValue> collectedfrom, Dataset<Row> df) {
|
||||
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDDNew(List<KeyValue> collectedfrom, Dataset<Row> df,
|
||||
String dataprovenance) {
|
||||
// unroll nested arrays
|
||||
df = df
|
||||
.withColumn("matching", functions.explode(new Column("Matchings")))
|
||||
|
@ -276,7 +281,7 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
DataInfo dataInfo = OafMapperUtils
|
||||
.dataInfo(
|
||||
false,
|
||||
BIP_INFERENCE_PROVENANCE,
|
||||
dataprovenance,
|
||||
true,
|
||||
false,
|
||||
qualifier,
|
||||
|
|
|
@ -98,9 +98,9 @@ public class PrepareAffiliationRelationsTest {
|
|||
"-crossrefInputPath", crossrefAffiliationRelationPathNew,
|
||||
"-pubmedInputPath", crossrefAffiliationRelationPath,
|
||||
"-openapcInputPath", crossrefAffiliationRelationPathNew,
|
||||
"-dataciteInputPath", crossrefAffiliationRelationPath,
|
||||
"-webCrawlInputPath", crossrefAffiliationRelationPath,
|
||||
"-publisherInputPath", publisherAffiliationRelationOldPath,
|
||||
"-dataciteInputPath", crossrefAffiliationRelationPathNew,
|
||||
"-webCrawlInputPath", crossrefAffiliationRelationPathNew,
|
||||
"-publisherInputPath", publisherAffiliationRelationPath,
|
||||
"-outputPath", outputPath
|
||||
});
|
||||
|
||||
|
@ -112,7 +112,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
.map(aa -> ((Relation) aa.getPayload()));
|
||||
|
||||
// count the number of relations
|
||||
assertEquals(150, tmp.count());// 18 + 24 *3 + 30 * 2 =
|
||||
assertEquals(162, tmp.count());// 18 + 24 + 30 * 4 =
|
||||
|
||||
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
|
||||
dataset.createOrReplaceTempView("result");
|
||||
|
@ -123,7 +123,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
// verify that we have equal number of bi-directional relations
|
||||
Assertions
|
||||
.assertEquals(
|
||||
75, execVerification
|
||||
81, execVerification
|
||||
.filter(
|
||||
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
|
||||
.collectAsList()
|
||||
|
@ -131,7 +131,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
75, execVerification
|
||||
81, execVerification
|
||||
.filter(
|
||||
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
|
||||
.collectAsList()
|
||||
|
@ -158,7 +158,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());
|
||||
4, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
|
@ -173,7 +173,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, execVerification
|
||||
1, execVerification
|
||||
.filter(
|
||||
"source = '" + ID_PREFIX
|
||||
+ IdentifierFactory
|
||||
|
|
Loading…
Reference in New Issue