From 7184cc08045ecea6d2695b00d6cf9ed919209472 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 13 Feb 2023 13:03:49 +0100 Subject: [PATCH] [FoS] added check for null on level1 subject --- .../dnetlib/dhp/actionmanager/Constants.java | 2 +- .../createunresolvedentities/GetFosTest.java | 96 +++++++++++++++++++ .../createunresolvedentities/fos/fos_sbs.tsv | 40 ++++++++ 3 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFosTest.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java index 6167182a8..7fd3315c8 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java @@ -61,7 +61,7 @@ public class Constants { public static StructuredProperty getSubject(String sbj, String classid, String classname, String diqualifierclassid) { - if (sbj.equals(NULL)) + if (sbj == null || sbj.equals(NULL)) return null; StructuredProperty sp = new StructuredProperty(); sp.setValue(sbj); diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFosTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFosTest.java new file mode 100644 index 000000000..9a74ae07e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFosTest.java @@ -0,0 +1,96 @@ +package eu.dnetlib.dhp.actionmanager.createunresolvedentities; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocalFileSystem; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * @author miriam.baglioni + * @Date 13/02/23 + */ +public class GetFosTest { + + private static final Logger log = LoggerFactory.getLogger(ProduceTest.class); + + private static Path workingDir; + private static SparkSession spark; + private static LocalFileSystem fs; + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(PrepareTest.class.getSimpleName()); + + fs = FileSystem.getLocal(new Configuration()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(ProduceTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(PrepareTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + @Test + void test3() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv") + .getPath(); + + + final String outputPath = workingDir.toString() + "/fos.json"; + GetFOSSparkJob + .main( + new String[] { + "--isSparkSessionManaged", Boolean.FALSE.toString(), + "--sourcePath", sourcePath, + + "-outputPath", outputPath + + }); + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(outputPath) + .map(item -> OBJECT_MAPPER.readValue(item, FOSDataModel.class)); + + tmp.foreach(t -> Assertions.assertTrue(t.getDoi() != null)); + tmp.foreach(t -> Assertions.assertTrue(t.getLevel1() != null)); + tmp.foreach(t -> Assertions.assertTrue(t.getLevel2() != null)); + tmp.foreach(t -> Assertions.assertTrue(t.getLevel3() != null)); + + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv new file mode 100644 index 000000000..98a338e2d --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos_sbs.tsv @@ -0,0 +1,40 @@ +doi level1 level2 level3 +10.1080/09638237.2018.1466033 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine +10.1016/j.dsi.2015.10.003 03 medical and health sciences 0301 basic medicine 030105 genetics & heredity +10.1007/s10072-017-2914-9 03 medical and health sciences 0302 clinical medicine 030217 neurology & neurosurgery +10.1016/j.bspc.2021.102726 02 engineering and technology 0206 medical engineering 020601 biomedical engineering +10.1177/0306312706069439 06 humanities and the arts 0601 history and archaeology 060101 anthropology +10.1016/j.jacep.2016.05.010 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine +10.1111/anae.13418 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine +10.1142/s1793744210000168 01 natural sciences 0103 physical sciences 010306 general physics +10.1016/j.jadohealth.2019.04.029 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine +10.1109/icais50930.2021.9395847 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020201 artificial intelligence & image processing +10.1145/3154837 01 natural sciences 0101 mathematics 010102 general mathematics +10.1038/srep38130 03 medical and health sciences 0301 basic medicine 030106 microbiology +10.1007/s13369-017-2871-x 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020201 artificial intelligence & image processing +10.1063/1.4964718 03 medical and health sciences 0301 basic medicine 030104 developmental biology +10.1007/s12603-019-1276-9 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine +10.1002/cam4.1463 03 medical and health sciences 0301 basic medicine 030104 developmental biology +10.1164/rccm.201611-2290ed 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine +10.1088/1757-899x/225/1/012132 01 natural sciences 0105 earth and related environmental sciences 010504 meteorology & atmospheric sciences +10.1117/1.jmm.15.1.015501 02 engineering and technology 0210 nano-technology 021001 nanoscience & nanotechnology +10.1088/1361-6587/ab569d 01 natural sciences 0103 physical sciences 010303 astronomy & astrophysics +10.1016/j.rser.2015.11.092 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020209 energy +10.1016/j.jhydrol.2013.06.035 01 natural sciences 0105 earth and related environmental sciences 010504 meteorology & atmospheric sciences +10.1111/php.12892 03 medical and health sciences 0301 basic medicine 030104 developmental biology +10.1088/0264-9381/27/10/105001 01 natural sciences 0103 physical sciences 010308 nuclear & particles physics +10.1016/j.matchemphys.2018.02.039 02 engineering and technology 0210 nano-technology 021001 nanoscience & nanotechnology +10.1098/rsos.160993 03 medical and health sciences 0301 basic medicine 030104 developmental biology +10.1016/j.rinp.2017.07.054 02 engineering and technology 0209 industrial biotechnology 020901 industrial engineering & automation +10.1111/eip.12348 03 medical and health sciences 0302 clinical medicine 030227 psychiatry +10.20965/jrm.2016.p0371 02 engineering and technology 0201 civil engineering 020101 civil engineering +10.2337/dci19-0036 03 medical and health sciences 0302 clinical medicine 030212 general & internal medicine +10.1155/2018/7692913 01 natural sciences 0104 chemical sciences 010404 medicinal & biomolecular chemistry +10.1117/12.2262306 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020206 networking & telecommunications +10.1021/acs.jpcb.7b01885 01 natural sciences 0104 chemical sciences 010405 organic chemistry +10.1177/0033294117711131 05 social sciences 0502 economics and business 050203 business & management +10.1016/j.jrurstud.2017.08.019 05 social sciences 0502 economics and business 050203 business & management +10.1111/febs.15296 03 medical and health sciences 0301 basic medicine 030104 developmental biology +10.3923/jeasci.2017.6922.6927 05 social sciences 0505 law 050501 criminology +10.1007/s10854-017-6376-x 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020208 electrical & electronic engineering +10.3390/app10176095 02 engineering and technology 0202 electrical engineering, electronic engineering, information engineering 020209 energy \ No newline at end of file