package eu.dnetlib.dhp.oa.graph.dump.eosc;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Optional;

import org.apache.commons.io.FileUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.eosc.model.Affiliation;
import eu.dnetlib.dhp.eosc.model.Indicator;
import eu.dnetlib.dhp.eosc.model.Result;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import scala.Tuple2;

/**
 * Tests for the EOSC result dump pipeline: selection of the EOSC results (step 1),
 * extension of the selected results with affiliation/organization information
 * (step 2), and extension with usage-count indicators read from an action set.
 *
 * @author miriam.baglioni
 * @Date 21/09/22
 */
public class SelectEoscResultTest {

	// Shared, thread-safe mapper; reused inside Spark lambdas (local mode) to
	// deserialize the dumped JSON records.
	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static SparkSession spark;

	// Temporary working directory; created in beforeAll, removed in afterAll.
	// It also hosts the Spark SQL / Hive warehouse directories.
	private static Path workingDir;

	private static final Logger log = LoggerFactory
		.getLogger(SelectEoscResultTest.class);

	/**
	 * Creates the temporary working directory and starts a local Spark session
	 * whose warehouse directories live inside it.
	 */
	@BeforeAll
	public static void beforeAll() throws IOException {
		workingDir = Files
			.createTempDirectory(SelectEoscResultTest.class.getSimpleName());
		log.info("using work dir {}", workingDir);

		SparkConf conf = new SparkConf();
		conf.setAppName(SelectEoscResultTest.class.getSimpleName());

		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("hive.metastore.local", "true");
		conf.set("spark.ui.enabled", "false");
		conf.set("spark.sql.warehouse.dir", workingDir.toString());
		conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());

		spark = SparkSession
			.builder()
			.appName(SelectEoscResultTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	/**
	 * Stops the Spark session before deleting the working directory: the
	 * session's warehouse dirs point inside workingDir, so it must not be
	 * removed while the session is still running.
	 */
	@AfterAll
	public static void afterAll() throws IOException {
		spark.stop();
		FileUtils.deleteDirectory(workingDir.toFile());
	}

	/**
	 * Step 1: selects the EOSC publications from the input graph and verifies
	 * that three results are dumped and that none of them carries affiliation
	 * information yet (affiliations are added in step 2).
	 */
	@Test
	public void selectEoscResults() throws Exception {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/eosc/input/publication")
			.getPath();

		final String cmp = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
			.getPath();

		final String mdp = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/eosc/working/masterduplicate")
			.getPath();

		SelectEoscResultsJobStep1
			.main(new String[] {
				"-isSparkSessionManaged", Boolean.FALSE.toString(),
				"-outputPath", workingDir.toString() + "/publication",
				"-sourcePath", sourcePath,
				"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
				"-communityMapPath", cmp,
				"-eoscDatasourceIdsPath", mdp
			});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Result> tmp = sc
			.textFile(workingDir.toString() + "/publication")
			.map(item -> OBJECT_MAPPER.readValue(item, Result.class));

		Assertions.assertEquals(3, tmp.count());

		// step 1 must not populate affiliations
		Assertions
			.assertEquals(
				0,
				tmp
					.filter(r -> Optional.ofNullable(r.getAffiliation()).isPresent() && r.getAffiliation().size() > 0)
					.count());

		tmp.foreach(r -> System.out.println(OBJECT_MAPPER.writeValueAsString(r)));
	}

	// Fixture data used by ExtendEoscResultWithOrganizationTest:
	// "source":"20|13811704aa70::51a6ade52065e3b371d1ae822e07f1ff","subRelType":"affiliation","target":"50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba"
	// legalname = Doris Engineering (France)
	// pid =
	// [(pidtype:"GRID","value":"grid.432986.2"),("classid":"ROR","value":"https://ror.org/03nd0ms94"),("classid":"GRID","value":"grid.432986.2"),("classid":"ROR","value":"https://ror.org/03nd0ms94")]
	// the pids are replicated in the input and should be unique in the dump
	// "source":"20|MetisRadboud::b58bdbe8ae5acead04fc76777d2f8017","subRelType":"affiliation","target":"50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba"
	// legalname = RENNES METROPOLE
	// pid =
	// [(PIC, 892062829)]
	// "source":"20|____________::d1b0ee22411434cf905692d0fac25749","subRelType":"affiliation","target":"50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98"
	// legalname = MIKARE RESEARCH
	// pid = []
	// for 50|06cdd3ff4700::ff21e3c55d527fa7db171137c5fd1f1f no affiliation relation is provided

	/**
	 * Step 2: extends the previously dumped results with organization
	 * (affiliation) information derived from the affiliation relations listed
	 * in the fixture-data comment above, verifying the affiliation names, the
	 * de-duplicated pid lists, and that results without affiliation relations
	 * keep a null affiliation list.
	 */
	@Test
	public void ExtendEoscResultWithOrganizationTest() throws Exception {
		final String sourcePath = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/eosc/input")
			.getPath();

		final String cmp = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/communityMapPath/communitymap.json")
			.getPath();

		String resultPath = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/eosc/working/publication")
			.getPath();

		ExtendEoscResultWithOrganizationStep2
			.main(new String[] {
				"-isSparkSessionManaged", Boolean.FALSE.toString(),
				"-outputPath", workingDir.toString() + "/publication",
				"-sourcePath", sourcePath,
				"-resultPath", resultPath
			});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Result> tmp = sc
			.textFile(workingDir.toString() + "/publication")
			.map(item -> OBJECT_MAPPER.readValue(item, Result.class));

		Assertions.assertEquals(3, tmp.count());

		// two of the three results have at least one affiliation relation
		Assertions
			.assertEquals(
				2,
				tmp
					.filter(r -> Optional.ofNullable(r.getAffiliation()).isPresent() && r.getAffiliation().size() > 0)
					.count());

		Assertions
			.assertEquals(
				2,
				tmp
					.filter(r -> r.getId().equalsIgnoreCase("50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba"))
					.first()
					.getAffiliation()
					.size());

		List<Affiliation> affiliations = tmp
			.filter(r -> r.getId().equalsIgnoreCase("50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba"))
			.first()
			.getAffiliation();

		Assertions
			.assertTrue(
				affiliations.stream().anyMatch(a -> a.getName().equalsIgnoreCase("Doris Engineering (France)")));
		Assertions.assertTrue(affiliations.stream().anyMatch(a -> a.getName().equalsIgnoreCase("RENNES METROPOLE")));

		Affiliation organization = affiliations
			.stream()
			.filter(a -> a.getId().equalsIgnoreCase("20|13811704aa70::51a6ade52065e3b371d1ae822e07f1ff"))
			.findFirst()
			.get();

		Assertions.assertEquals("Doris Engineering (France)", organization.getName());

		Assertions
			.assertTrue(
				organization
					.getPid()
					.stream()
					.anyMatch(
						p -> p.getValue().equalsIgnoreCase("grid.432986.2") && p.getType().equalsIgnoreCase("grid")));

		Assertions
			.assertTrue(
				organization
					.getPid()
					.stream()
					.anyMatch(
						p -> p.getValue().equalsIgnoreCase("https://ror.org/03nd0ms94")
							&& p.getType().equalsIgnoreCase("ror")));

		// the duplicated (GRID, ROR) pairs in the input must be de-duplicated
		Assertions.assertEquals(2, organization.getPid().size());

		organization = affiliations
			.stream()
			.filter(a -> a.getId().equalsIgnoreCase("20|MetisRadboud::b58bdbe8ae5acead04fc76777d2f8017"))
			.findFirst()
			.get();

		Assertions.assertEquals("RENNES METROPOLE", organization.getName());

		Assertions.assertEquals(1, organization.getPid().size());
		Assertions
			.assertTrue(
				organization.getPid().get(0).getValue().equalsIgnoreCase("892062829")
					&& organization.getPid().get(0).getType().equalsIgnoreCase("pic"));

		Assertions
			.assertEquals(
				1,
				tmp
					.filter(r -> r.getId().equalsIgnoreCase("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98"))
					.first()
					.getAffiliation()
					.size());

		Assertions
			.assertEquals(
				"MIKARE RESEARCH",
				tmp
					.filter(r -> r.getId().equalsIgnoreCase("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98"))
					.first()
					.getAffiliation()
					.get(0)
					.getName());

		// MIKARE RESEARCH has no pids in the input
		Assertions
			.assertEquals(
				0,
				tmp
					.filter(r -> r.getId().equalsIgnoreCase("50|06cdd3ff4700::cd7711c65d518859f1d87056e2c45d98"))
					.first()
					.getAffiliation()
					.get(0)
					.getPid()
					.size());

		// no affiliation relation was provided for this result
		Assertions
			.assertFalse(
				Optional
					.ofNullable(
						tmp
							.filter(
								r -> r.getId().equalsIgnoreCase("50|06cdd3ff4700::ff21e3c55d527fa7db171137c5fd1f1f"))
							.first()
							.getAffiliation())
					.isPresent());
	}

	/**
	 * Extends the dumped results with usage-count indicators: materializes the
	 * fixture action set as a Hadoop sequence file, runs ExtendWithUsageCounts,
	 * and verifies the download/view counts attached to the two results that
	 * appear in the action set plus the absence of indicators on the third.
	 */
	@Test
	public void verifyIndicatorsTest() throws Exception {
		// results covered by the action set:
		// 50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba
		// 50|06cdd3ff4700::ff21e3c55d527fa7db171137c5fd1f1f
		final String resultPath = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/eosc/working/publication")
			.getPath();

		final String actionSetPath = getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/dump/eosc/input/actionset")
			.getPath();

		// write the action set as <class-name, serialized-atomic-action> pairs,
		// mirroring the production action-set layout
		Utils
			.readPath(spark, actionSetPath, eu.dnetlib.dhp.schema.oaf.Result.class)
			.toJavaRDD()
			.map(p -> new AtomicAction(p.getClass(), p))
			.mapToPair(
				aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
					new Text(OBJECT_MAPPER.writeValueAsString(aa))))
			.saveAsHadoopFile(
				workingDir.toString() + "/actionSet", Text.class, Text.class, SequenceFileOutputFormat.class);

		ExtendWithUsageCounts
			.main(new String[] {
				"-isSparkSessionManaged", Boolean.FALSE.toString(),
				"-outputPath", workingDir.toString() + "/publication",
				"-actionSetPath", workingDir.toString() + "/actionSet",
				"-resultPath", resultPath
			});

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		JavaRDD<Result> tmp = sc
			.textFile(workingDir.toString() + "/publication")
			.map(item -> OBJECT_MAPPER.readValue(item, Result.class));

		Assertions
			.assertEquals(
				1,
				tmp
					.filter(
						x -> x.getId().equals("50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba")
							&& Optional.ofNullable(x.getIndicator()).isPresent())
					.count());

		Assertions
			.assertEquals(
				1,
				tmp
					.filter(
						x -> x.getId().equals("50|06cdd3ff4700::ff21e3c55d527fa7db171137c5fd1f1f")
							&& Optional.ofNullable(x.getIndicator()).isPresent())
					.count());

		Indicator indicators = tmp
			.filter(x -> x.getId().equals("50|06cdd3ff4700::93859bd27121c3ee7c6ee4bfb1790cba"))
			.first()
			.getIndicator();

		Assertions.assertEquals(79, Integer.valueOf(indicators.getUsageCounts().getDownloads()));
		Assertions.assertEquals(94, Integer.valueOf(indicators.getUsageCounts().getViews()));

		indicators = tmp
			.filter(x -> x.getId().equals("50|06cdd3ff4700::ff21e3c55d527fa7db171137c5fd1f1f"))
			.first()
			.getIndicator();

		Assertions.assertEquals(0, Integer.valueOf(indicators.getUsageCounts().getDownloads()));
		Assertions.assertEquals(1, Integer.valueOf(indicators.getUsageCounts().getViews()));

		// exactly one result is not covered by the action set
		Assertions.assertEquals(1, tmp.filter(r -> !Optional.ofNullable(r.getIndicator()).isPresent()).count());
	}
}