Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
1 changed file with 52 additions and 56 deletions
Showing only changes of commit c2998a14e8 - Show all commits

View File

@ -5,8 +5,11 @@ import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.List;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -26,7 +29,6 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Result;
public class PrepareAffiliationRelationsTest { public class PrepareAffiliationRelationsTest {
@ -35,6 +37,7 @@ public class PrepareAffiliationRelationsTest {
private static SparkSession spark; private static SparkSession spark;
private static Path workingDir; private static Path workingDir;
private static final String ID_PREFIX = "50|doi_________::";
private static final Logger log = LoggerFactory private static final Logger log = LoggerFactory
.getLogger(PrepareAffiliationRelationsTest.class); .getLogger(PrepareAffiliationRelationsTest.class);
@ -69,71 +72,64 @@ public class PrepareAffiliationRelationsTest {
// NOTE(review): this span appears to be a side-by-side diff whose two columns were merged onto
// single lines: the earlier text comes first and the revised text second. Lines that appear
// only once seem to exist on one side of the diff only. It is not compilable as written.
//
// testMatch, as given by the revised (second) column:
// - runs PrepareAffiliationRelations.main on the doi_to_ror.json test resource, writing to
// workingDir + "/actionSet";
// - reads that output back as a (Text, Text) sequence file, deserializes each value into an
// AtomicAction and casts its payload to Relation;
// - prints every relation, asserts the relation count, then asserts per-relClass counts and
// the trust value of one relation through a Spark SQL view named "result".
@Test @Test
void testMatch() throws Exception { void testMatch() throws Exception {
// Resolve the input fixture from the test classpath.
String affiliationRelationsPath = getClass() String affiliationRelationsPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json") .getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
.getPath(); .getPath();
String outputPath = workingDir.toString() + "/actionSet";
// Invoke the job under test. The revised column passes the outputPath variable where the
// earlier column repeated the workingDir.toString() + "/actionSet" expression inline.
PrepareAffiliationRelations PrepareAffiliationRelations
.main( .main(
new String[] { new String[] {
"-isSparkSessionManaged", "-isSparkSessionManaged", Boolean.FALSE.toString(),
Boolean.FALSE.toString(), "-inputPath", affiliationRelationsPath,
"-inputPath", "-outputPath", outputPath
affiliationRelationsPath,
"-outputPath",
workingDir.toString() + "/actionSet"
}); });
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
// From here on the earlier column is a commented-out verification that cast payloads to
// Result; the revised column is the Relation-based verification. Because every line below
// starts with "//", the revised text on these lines is comment text to a Java lexer.
// JavaRDD<Result> tmp = sc JavaRDD<Relation> tmp = sc
// .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) .sequenceFile(outputPath, Text.class, Text.class)
// .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
// .map(aa -> ((Result) aa.getPayload())); .map(aa -> ((Relation) aa.getPayload()));
//
// Revised column: print one tab-separated line per relation (source, target, relType,
// relClass, subRelType, validationDate, dataInfo.trust, dataInfo.inferred).
// assertEquals(4, tmp.count()); for (Relation r : tmp.collect()) {
// System.out.println(
// Dataset<Result> verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Result.class)); r.getSource() + "\t" + r.getTarget() + "\t" + r.getRelType() + "\t" + r.getRelClass() + "\t" + r.getSubRelType() + "\t" + r.getValidationDate() + "\t" + r.getDataInfo().getTrust() + "\t" + r.getDataInfo().getInferred()
// verificationDataset.createOrReplaceTempView("result"); );
// }
// Dataset<Row> execVerification = spark // count the number of relations
// .sql( assertEquals(16, tmp.count());
// "Select p.id oaid, mes.id, mUnit.value from result p " +
// Revised column: expose the relations as the temp view "result" and select
// relType, relClass, source, target and dataInfo.trust (trust is column index 4).
// "lateral view explode(measures) m as mes " + Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
// "lateral view explode(mes.unit) u as mUnit "); dataset.createOrReplaceTempView("result");
//
// Assertions.assertEquals(12, execVerification.count()); Dataset<Row> execVerification = spark.sql("select r.relType, r.relClass, r.source, r.target, r.dataInfo.trust from result r");
// Assertions
// .assertEquals( // verify that we have equal number of bi-directional relations
// Revised column: 8 rows expected for each of the two relClass constants, which
// together account for the 16 relations asserted above.
// "6.63451994567e-09", execVerification Assertions.assertEquals(8, execVerification
// .filter( .filter(
// "oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " + "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION +"'")
// "and id = 'influence'") .collectAsList()
// .select("value") .size());
// .collectAsList()
// .get(0) Assertions.assertEquals(8, execVerification
// .getString(0)); .filter(
// Assertions "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF +"'")
// .assertEquals( .collectAsList()
// "0.348694533145", execVerification .size());
// .filter(
// "oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " + // check confidence value of a specific relation
// Revised column: build the expected source id as ID_PREFIX + md5 of the normalized DOI,
// then compare the trust string of the first row whose source matches it.
// "and id = 'popularity_alt'") String sourceDOI = "10.1105/tpc.8.3.343";
// .select("value")
// .collectAsList() final String sourceOpenaireId = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", sourceDOI));
// .get(0)
// .getString(0)); Assertions.assertEquals("0.7071067812", execVerification
// Assertions .filter(
// .assertEquals( "source='" + sourceOpenaireId +"'")
// "2.16094680115e-09", execVerification .collectAsList().get(0).getString(4));
// .filter(
// "oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " +
// "and id = 'popularity'")
// .select("value")
// .collectAsList()
// .get(0)
// .getString(0));
//
} }
} }