package eu.dnetlib.dhp.actionmanager.bipaffiliations;

import static org.junit.jupiter.api.Assertions.*;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
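
/**
 * Tests for {@link PrepareAffiliationRelations}: the job reads DOI-to-ROR affiliation
 * matches (here the same test resource is used for the Crossref, PubMed and OpenAPC
 * inputs) and is expected to produce an action set of bidirectional result/organization
 * affiliation {@link Relation}s.
 */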
public class PrepareAffiliationRelationsTest {

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private static SparkSession spark;

    private static Path workingDir;
    private static final String ID_PREFIX = "50|doi_________::";
    private static final Logger log = LoggerFactory
        .getLogger(PrepareAffiliationRelationsTest.class);

    @BeforeAll
    public static void beforeAll() throws IOException {
        workingDir = Files.createTempDirectory(PrepareAffiliationRelationsTest.class.getSimpleName());

        log.info("Using work dir {}", workingDir);

        SparkConf conf = new SparkConf();
        conf.setAppName(PrepareAffiliationRelationsTest.class.getSimpleName());

        conf.setMaster("local[*]");
        conf.set("spark.driver.host", "localhost");
        conf.set("hive.metastore.local", "true");
        conf.set("spark.ui.enabled", "false");
        conf.set("spark.sql.warehouse.dir", workingDir.toString());
        conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
        spark = SparkSession
            .builder()
            .appName(PrepareAffiliationRelationsTest.class.getSimpleName())
            .config(conf)
            .getOrCreate();
    }

    @AfterAll
    public static void afterAll() throws IOException {
        FileUtils.deleteDirectory(workingDir.toFile());
        spark.stop();
    }

    @Test
    void testMatch() throws Exception {

        String crossrefAffiliationRelationPath = getClass()
            .getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
            .getPath();

        String outputPath = workingDir.toString() + "/actionSet";
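
        // run the job, feeding the same test resource as Crossref, PubMed and OpenAPC input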
        PrepareAffiliationRelations
            .main(
                new String[] {
                    "-isSparkSessionManaged", Boolean.FALSE.toString(),
                    "-crossrefInputPath", crossrefAffiliationRelationPath,
                    "-pubmedInputPath", crossrefAffiliationRelationPath,
                    "-openapcInputPath", crossrefAffiliationRelationPath,
                    "-outputPath", outputPath
                });

        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
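
        // the action set is stored as a SequenceFile whose values are JSON-serialized
        // AtomicAction objects wrapping Relation payloads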
        JavaRDD<Relation> tmp = sc
            .sequenceFile(outputPath, Text.class, Text.class)
            .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
            .map(aa -> ((Relation) aa.getPayload()));

        // for (Relation r : tmp.collect()) {
        //     System.out.println(
        //         r.getSource() + "\t" + r.getTarget() + "\t" + r.getRelType() + "\t" + r.getRelClass() + "\t" + r.getSubRelType() + "\t" + r.getValidationDate() + "\t" + r.getDataInfo().getTrust() + "\t" + r.getDataInfo().getInferred()
        //     );
        // }

        // count the number of relations
        assertEquals(60, tmp.count());
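
        // move to the Dataset API so the produced relations can be inspected with Spark SQL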
        Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
        dataset.createOrReplaceTempView("result");

        Dataset<Row> execVerification = spark
            .sql("select r.relType, r.relClass, r.source, r.target, r.dataInfo.trust from result r");

        // verify that we have an equal number of relations in both directions
        Assertions
            .assertEquals(
                30, execVerification
                    .filter("relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
                    .collectAsList()
                    .size());

        Assertions
            .assertEquals(
                30, execVerification
                    .filter("relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
                    .collectAsList()
                    .size());

        // check the confidence (trust) value of a specific relation
        String sourceDOI = "10.1061/(asce)0733-9399(2002)128:7(759)";
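
        // the OpenAIRE id of the source result is the '50|doi_________::' prefix followed
        // by the md5 of the normalized DOI value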
        final String sourceOpenaireId = ID_PREFIX
            + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", sourceDOI));

        Assertions
            .assertEquals(
                "0.7071067812", execVerification
                    .filter("source='" + sourceOpenaireId + "'")
                    .collectAsList()
                    .get(0)
                    .getString(4));
    }
}