[AffiliationFromPublisher]Added test, included new path parameter

This commit is contained in:
Miriam Baglioni 2024-08-07 11:03:22 +02:00
parent 7df492fa34
commit a6195fb7de
3 changed files with 20 additions and 7 deletions

View File

@ -17,6 +17,7 @@ import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.*; import org.apache.spark.sql.*;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -76,8 +77,8 @@ public class PrepareAffiliationRelations implements Serializable {
final String webcrawlInputPath = parser.get("webCrawlInputPath"); final String webcrawlInputPath = parser.get("webCrawlInputPath");
log.info("webcrawlInputPath: {}", webcrawlInputPath); log.info("webcrawlInputPath: {}", webcrawlInputPath);
final String publisherlInputPath = parser.get("publisherlInputPath"); final String publisherInputPath = parser.get("publisherInputPath");
log.info("publisherlInputPath: {}", publisherlInputPath); log.info("publisherInputPath: {}", publisherInputPath);
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
@ -89,7 +90,7 @@ public class PrepareAffiliationRelations implements Serializable {
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
Constants.removeOutputDir(spark, outputPath); Constants.removeOutputDir(spark, outputPath);
createActionSet(spark, crossrefInputPath, pubmedInputPath, openapcInputPath, dataciteInputPath, webcrawlInputPath, publisherlInputPath, outputPath); createActionSet(spark, crossrefInputPath, pubmedInputPath, openapcInputPath, dataciteInputPath, webcrawlInputPath, publisherInputPath, outputPath);
}); });
} }
@ -136,9 +137,11 @@ public class PrepareAffiliationRelations implements Serializable {
private static JavaPairRDD<Text,Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath, private static JavaPairRDD<Text,Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
List<KeyValue> collectedfrom){ List<KeyValue> collectedfrom){
Dataset<Row> df = spark Dataset<Row> df = spark
.read() .read()
.schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<RORid`:STRING,`Confidence`:DOUBLE>>") .schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
.json(inputPath) .json(inputPath)
.where("DOI is not null"); .where("DOI is not null");

View File

@ -33,6 +33,11 @@
"paramLongName": "webCrawlInputPath", "paramLongName": "webCrawlInputPath",
"paramDescription": "the path to get the input data from Web Crawl", "paramDescription": "the path to get the input data from Web Crawl",
"paramRequired": true "paramRequired": true
},{
"paramName": "pip",
"paramLongName": "publisherInputPath",
"paramDescription": "the path to get the input data from publishers",
"paramRequired": true
} }
, ,
{ {

View File

@ -78,6 +78,10 @@ public class PrepareAffiliationRelationsTest {
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json") .getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
.getPath(); .getPath();
String publisherAffiliationRelationPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publishers")
.getPath();
String outputPath = workingDir.toString() + "/actionSet"; String outputPath = workingDir.toString() + "/actionSet";
PrepareAffiliationRelations PrepareAffiliationRelations
@ -89,6 +93,7 @@ public class PrepareAffiliationRelationsTest {
"-openapcInputPath", crossrefAffiliationRelationPath, "-openapcInputPath", crossrefAffiliationRelationPath,
"-dataciteInputPath", crossrefAffiliationRelationPath, "-dataciteInputPath", crossrefAffiliationRelationPath,
"-webCrawlInputPath", crossrefAffiliationRelationPath, "-webCrawlInputPath", crossrefAffiliationRelationPath,
"-publisherInputPath", publisherAffiliationRelationPath,
"-outputPath", outputPath "-outputPath", outputPath
}); });
@ -105,7 +110,7 @@ public class PrepareAffiliationRelationsTest {
// ); // );
// } // }
// count the number of relations // count the number of relations
assertEquals(120, tmp.count()); assertEquals(138, tmp.count());
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
dataset.createOrReplaceTempView("result"); dataset.createOrReplaceTempView("result");
@ -116,7 +121,7 @@ public class PrepareAffiliationRelationsTest {
// verify that we have equal number of bi-directional relations // verify that we have equal number of bi-directional relations
Assertions Assertions
.assertEquals( .assertEquals(
60, execVerification 69, execVerification
.filter( .filter(
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'") "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
.collectAsList() .collectAsList()
@ -124,7 +129,7 @@ public class PrepareAffiliationRelationsTest {
Assertions Assertions
.assertEquals( .assertEquals(
60, execVerification 69, execVerification
.filter( .filter(
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'") "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
.collectAsList() .collectAsList()