Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
1 changed file with 52 additions and 56 deletions
Showing only changes of commit c2998a14e8 - Show all commits

View File

@ -5,8 +5,11 @@ import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
@ -26,7 +29,6 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Result;
public class PrepareAffiliationRelationsTest {
@ -35,6 +37,7 @@ public class PrepareAffiliationRelationsTest {
/** Spark session used by the tests of this class; it is assigned outside the lines visible here. */
private static SparkSession spark;
/** Base directory under which the tests write their output (e.g. the {@code /actionSet} folder). */
private static Path workingDir;
/** Prefix prepended to the md5 of a normalized DOI to build the source identifier expected by the tests. */
private static final String ID_PREFIX = "50|doi_________::";
/** Logger of this test class. */
private static final Logger log = LoggerFactory
.getLogger(PrepareAffiliationRelationsTest.class);
@ -69,71 +72,64 @@ public class PrepareAffiliationRelationsTest {
@Test
void testMatch() throws Exception {
String affiliationRelationsPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
.getPath();
String outputPath = workingDir.toString() + "/actionSet";
PrepareAffiliationRelations
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
affiliationRelationsPath,
"-outputPath",
workingDir.toString() + "/actionSet"
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-inputPath", affiliationRelationsPath,
"-outputPath", outputPath
});
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
// JavaRDD<Result> tmp = sc
// .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class)
// .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
// .map(aa -> ((Result) aa.getPayload()));
//
// assertEquals(4, tmp.count());
//
// Dataset<Result> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Result.class));
// verificationDataset.createOrReplaceTempView("result");
//
// Dataset<Row> execVerification = spark
// .sql(
// "Select p.id oaid, mes.id, mUnit.value from result p " +
// "lateral view explode(measures) m as mes " +
// "lateral view explode(mes.unit) u as mUnit ");
//
// Assertions.assertEquals(12, execVerification.count());
// Assertions
// .assertEquals(
// "6.63451994567e-09", execVerification
// .filter(
// "oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
// "and id = 'influence'")
// .select("value")
// .collectAsList()
// .get(0)
// .getString(0));
// Assertions
// .assertEquals(
// "0.348694533145", execVerification
// .filter(
// "oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
// "and id = 'popularity_alt'")
// .select("value")
// .collectAsList()
// .get(0)
// .getString(0));
// Assertions
// .assertEquals(
// "2.16094680115e-09", execVerification
// .filter(
// "oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
// "and id = 'popularity'")
// .select("value")
// .collectAsList()
// .get(0)
// .getString(0));
//
JavaRDD<Relation> tmp = sc
.sequenceFile(outputPath, Text.class, Text.class)
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
for (Relation r : tmp.collect()) {
System.out.println(
r.getSource() + "\t" + r.getTarget() + "\t" + r.getRelType() + "\t" + r.getRelClass() + "\t" + r.getSubRelType() + "\t" + r.getValidationDate() + "\t" + r.getDataInfo().getTrust() + "\t" + r.getDataInfo().getInferred()
);
}
// count the number of relations
assertEquals(16, tmp.count());
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
dataset.createOrReplaceTempView("result");
Dataset<Row> execVerification = spark.sql("select r.relType, r.relClass, r.source, r.target, r.dataInfo.trust from result r");
// verify that we have equal number of bi-directional relations
Assertions.assertEquals(8, execVerification
.filter(
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION +"'")
.collectAsList()
.size());
Assertions.assertEquals(8, execVerification
.filter(
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF +"'")
.collectAsList()
.size());
// check confidence value of a specific relation
String sourceDOI = "10.1105/tpc.8.3.343";
final String sourceOpenaireId = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", sourceDOI));
Assertions.assertEquals("0.7071067812", execVerification
.filter(
"source='" + sourceOpenaireId +"'")
.collectAsList().get(0).getString(4));
}
}