[AffiliationFromPublisher]Added test, included new path parameter
This commit is contained in:
parent
7df492fa34
commit
a6195fb7de
|
@ -17,6 +17,7 @@ import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
|
import org.apache.spark.sql.types.StructType;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -76,8 +77,8 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
final String webcrawlInputPath = parser.get("webCrawlInputPath");
|
final String webcrawlInputPath = parser.get("webCrawlInputPath");
|
||||||
log.info("webcrawlInputPath: {}", webcrawlInputPath);
|
log.info("webcrawlInputPath: {}", webcrawlInputPath);
|
||||||
|
|
||||||
final String publisherlInputPath = parser.get("publisherlInputPath");
|
final String publisherInputPath = parser.get("publisherInputPath");
|
||||||
log.info("publisherlInputPath: {}", publisherlInputPath);
|
log.info("publisherInputPath: {}", publisherInputPath);
|
||||||
|
|
||||||
final String outputPath = parser.get("outputPath");
|
final String outputPath = parser.get("outputPath");
|
||||||
log.info("outputPath: {}", outputPath);
|
log.info("outputPath: {}", outputPath);
|
||||||
|
@ -89,7 +90,7 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
Constants.removeOutputDir(spark, outputPath);
|
Constants.removeOutputDir(spark, outputPath);
|
||||||
createActionSet(spark, crossrefInputPath, pubmedInputPath, openapcInputPath, dataciteInputPath, webcrawlInputPath, publisherlInputPath, outputPath);
|
createActionSet(spark, crossrefInputPath, pubmedInputPath, openapcInputPath, dataciteInputPath, webcrawlInputPath, publisherInputPath, outputPath);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -136,9 +137,11 @@ public class PrepareAffiliationRelations implements Serializable {
|
||||||
|
|
||||||
private static JavaPairRDD<Text,Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
|
private static JavaPairRDD<Text,Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
|
||||||
List<KeyValue> collectedfrom){
|
List<KeyValue> collectedfrom){
|
||||||
|
|
||||||
|
|
||||||
Dataset<Row> df = spark
|
Dataset<Row> df = spark
|
||||||
.read()
|
.read()
|
||||||
.schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<RORid`:STRING,`Confidence`:DOUBLE>>")
|
.schema("`DOI` STRING, `Organizations` ARRAY<STRUCT<`RORid`:STRING,`Confidence`:DOUBLE>>")
|
||||||
.json(inputPath)
|
.json(inputPath)
|
||||||
.where("DOI is not null");
|
.where("DOI is not null");
|
||||||
|
|
||||||
|
|
|
@ -33,6 +33,11 @@
|
||||||
"paramLongName": "webCrawlInputPath",
|
"paramLongName": "webCrawlInputPath",
|
||||||
"paramDescription": "the path to get the input data from Web Crawl",
|
"paramDescription": "the path to get the input data from Web Crawl",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
|
},{
|
||||||
|
"paramName": "pip",
|
||||||
|
"paramLongName": "publisherInputPath",
|
||||||
|
"paramDescription": "the path to get the input data from publishers",
|
||||||
|
"paramRequired": true
|
||||||
}
|
}
|
||||||
,
|
,
|
||||||
{
|
{
|
||||||
|
|
|
@ -78,6 +78,10 @@ public class PrepareAffiliationRelationsTest {
|
||||||
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
|
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
|
||||||
.getPath();
|
.getPath();
|
||||||
|
|
||||||
|
String publisherAffiliationRelationPath = getClass()
|
||||||
|
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publishers")
|
||||||
|
.getPath();
|
||||||
|
|
||||||
String outputPath = workingDir.toString() + "/actionSet";
|
String outputPath = workingDir.toString() + "/actionSet";
|
||||||
|
|
||||||
PrepareAffiliationRelations
|
PrepareAffiliationRelations
|
||||||
|
@ -89,6 +93,7 @@ public class PrepareAffiliationRelationsTest {
|
||||||
"-openapcInputPath", crossrefAffiliationRelationPath,
|
"-openapcInputPath", crossrefAffiliationRelationPath,
|
||||||
"-dataciteInputPath", crossrefAffiliationRelationPath,
|
"-dataciteInputPath", crossrefAffiliationRelationPath,
|
||||||
"-webCrawlInputPath", crossrefAffiliationRelationPath,
|
"-webCrawlInputPath", crossrefAffiliationRelationPath,
|
||||||
|
"-publisherInputPath", publisherAffiliationRelationPath,
|
||||||
"-outputPath", outputPath
|
"-outputPath", outputPath
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -105,7 +110,7 @@ public class PrepareAffiliationRelationsTest {
|
||||||
// );
|
// );
|
||||||
// }
|
// }
|
||||||
// count the number of relations
|
// count the number of relations
|
||||||
assertEquals(120, tmp.count());
|
assertEquals(138, tmp.count());
|
||||||
|
|
||||||
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
|
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
|
||||||
dataset.createOrReplaceTempView("result");
|
dataset.createOrReplaceTempView("result");
|
||||||
|
@ -116,7 +121,7 @@ public class PrepareAffiliationRelationsTest {
|
||||||
// verify that we have equal number of bi-directional relations
|
// verify that we have equal number of bi-directional relations
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
60, execVerification
|
69, execVerification
|
||||||
.filter(
|
.filter(
|
||||||
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
|
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
|
||||||
.collectAsList()
|
.collectAsList()
|
||||||
|
@ -124,7 +129,7 @@ public class PrepareAffiliationRelationsTest {
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
60, execVerification
|
69, execVerification
|
||||||
.filter(
|
.filter(
|
||||||
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
|
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
|
||||||
.collectAsList()
|
.collectAsList()
|
||||||
|
|
Loading…
Reference in New Issue