From 29150a5d0c69a540b546ffcfbc91114e68c40301 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 21 Apr 2022 13:31:56 +0200 Subject: [PATCH 1/3] code formatting --- .../oa/graph/clean/CleanContextSparkJob.java | 181 +++++---- .../dhp/oa/graph/clean/CleanContextTest.java | 375 ++++++++++++------ 2 files changed, 357 insertions(+), 199 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanContextSparkJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanContextSparkJob.java index b20dcb67bc..3e9b17f3f9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanContextSparkJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleanContextSparkJob.java @@ -1,16 +1,13 @@ + package eu.dnetlib.dhp.oa.graph.clean; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.Context; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -21,93 +18,113 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.Serializable; -import java.util.List; -import java.util.Optional; -import java.util.stream.Collectors; +import com.fasterxml.jackson.databind.ObjectMapper; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Context; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; public class CleanContextSparkJob implements Serializable { - private static final Logger log = LoggerFactory.getLogger(CleanContextSparkJob.class); + private static final Logger log = LoggerFactory.getLogger(CleanContextSparkJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - String jsonConfiguration = IOUtils - .toString( - CleanContextSparkJob.class - .getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/input_clean_context_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + String jsonConfiguration = IOUtils + .toString( + CleanContextSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/input_clean_context_parameters.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("inputPath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); - String workingPath = parser.get("workingPath"); - log.info("workingPath: {}", workingPath); + String workingPath = parser.get("workingPath"); + log.info("workingPath: {}", workingPath); - String contextId = parser.get("contextId"); - log.info("contextId: {}", contextId); + String contextId = parser.get("contextId"); + log.info("contextId: {}", contextId); - String verifyParam = parser.get("verifyParam"); - log.info("verifyParam: {}", verifyParam); + String verifyParam = parser.get("verifyParam"); + log.info("verifyParam: {}", verifyParam); + String graphTableClassName = parser.get("graphTableClassName"); + log.info("graphTableClassName: {}", graphTableClassName); - String graphTableClassName = parser.get("graphTableClassName"); - log.info("graphTableClassName: {}", graphTableClassName); + Class entityClazz = (Class) Class.forName(graphTableClassName); - Class entityClazz = (Class) Class.forName(graphTableClassName); + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + cleanContext(spark, contextId, verifyParam, inputPath, entityClazz, workingPath); + }); + } - SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { + private static void cleanContext(SparkSession spark, String contextId, String verifyParam, + String inputPath, Class entityClazz, String workingPath) { + Dataset res = spark + .read() + .textFile(inputPath) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), + Encoders.bean(entityClazz)); - cleanContext(spark, contextId, verifyParam, inputPath, entityClazz, workingPath); - }); - } + res.map((MapFunction) r -> { + if (!r + .getTitle() + .stream() + .filter( + t -> t + .getQualifier() + .getClassid() + .equalsIgnoreCase(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid())) + .anyMatch(t -> t.getValue().toLowerCase().startsWith(verifyParam.toLowerCase()))) { + return r; + } + r + .setContext( + r + .getContext() + .stream() + .filter( + c -> !c.getId().split("::")[0] + .equalsIgnoreCase(contextId)) + .collect(Collectors.toList())); + return r; + }, Encoders.bean(entityClazz)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(workingPath); - private static void cleanContext(SparkSession spark, String contextId, String verifyParam, String inputPath, Class entityClazz, String workingPath) { - Dataset res = spark - .read() - .textFile(inputPath) - .map( - (MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), - Encoders.bean(entityClazz)); - - res.map((MapFunction) r -> { - if(!r.getTitle() - .stream() - .filter(t -> t.getQualifier().getClassid() - .equalsIgnoreCase(ModelConstants.MAIN_TITLE_QUALIFIER.getClassid())) - .anyMatch(t -> t.getValue().toLowerCase().startsWith(verifyParam.toLowerCase()))){ - return r; - } - r.setContext(r.getContext().stream().filter(c -> !c.getId().split("::")[0] - .equalsIgnoreCase(contextId)).collect(Collectors.toList())); - return r; - } ,Encoders.bean(entityClazz)) - .write() - .mode(SaveMode.Overwrite) - .option("compression","gzip") - .json(workingPath); - - spark.read().textFile(workingPath).map((MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), - Encoders.bean(entityClazz)) - .write() - .mode(SaveMode.Overwrite) - .option("compression","gzip") - .json(inputPath); - } + spark + .read() + .textFile(workingPath) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), + Encoders.bean(entityClazz)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(inputPath); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleanContextTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleanContextTest.java index bba8143463..472d3781d2 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleanContextTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/CleanContextTest.java @@ -1,18 +1,12 @@ + package eu.dnetlib.dhp.oa.graph.clean; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.oa.graph.dump.Constants; -import eu.dnetlib.dhp.oa.graph.dump.DumpJobTest; -import eu.dnetlib.dhp.oa.graph.dump.DumpProducts; -import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.dump.oaf.Instance; -import eu.dnetlib.dhp.schema.dump.oaf.OpenAccessRoute; -import eu.dnetlib.dhp.schema.dump.oaf.graph.GraphResult; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Software; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.Locale; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -27,133 +21,280 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; -import java.util.Locale; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.oa.graph.dump.Constants; +import eu.dnetlib.dhp.oa.graph.dump.DumpJobTest; +import eu.dnetlib.dhp.oa.graph.dump.DumpProducts; +import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.dump.oaf.Instance; +import eu.dnetlib.dhp.schema.dump.oaf.OpenAccessRoute; +import eu.dnetlib.dhp.schema.dump.oaf.graph.GraphResult; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public class CleanContextTest { - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static SparkSession spark; + private static SparkSession spark; - private static Path workingDir; + private static Path workingDir; - private static final Logger log = LoggerFactory.getLogger(CleanContextTest.class); + private static final Logger log = LoggerFactory.getLogger(CleanContextTest.class); - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = Files.createTempDirectory(DumpJobTest.class.getSimpleName()); - log.info("using work dir {}", workingDir); + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(DumpJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); - SparkConf conf = new SparkConf(); - conf.setAppName(DumpJobTest.class.getSimpleName()); + SparkConf conf = new SparkConf(); + conf.setAppName(DumpJobTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - spark = SparkSession - .builder() - .appName(DumpJobTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(DumpJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - @Test - public void testResultClean() throws Exception { - final String sourcePath = getClass() - .getResource("/eu/dnetlib/dhp/oa/graph/clean/publication_clean_context.json") - .getPath(); - final String prefix = "gcube "; + @Test + public void testResultClean() throws Exception { + final String sourcePath = getClass() + .getResource("/eu/dnetlib/dhp/oa/graph/clean/publication_clean_context.json") + .getPath(); + final String prefix = "gcube "; + spark + .read() + .textFile(sourcePath) + .map( + (MapFunction) r -> OBJECT_MAPPER.readValue(r, Publication.class), + Encoders.bean(Publication.class)) + .write() + .json(workingDir.toString() + "/publication"); - spark.read().textFile(sourcePath).map((MapFunction) r -> OBJECT_MAPPER.readValue(r, Publication.class), Encoders.bean(Publication.class)) - .write().json(workingDir.toString() + "/publication"); + CleanContextSparkJob.main(new String[] { + "--isSparkSessionManaged", Boolean.FALSE.toString(), + "--inputPath", workingDir.toString() + "/publication", + "-graphTableClassName", Publication.class.getCanonicalName(), + "-workingPath", workingDir.toString() + "/working", + "-contextId", "sobigdata", + "-verifyParam", "gCube " + }); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/publication") + .map(item -> OBJECT_MAPPER.readValue(item, Publication.class)); - CleanContextSparkJob.main(new String[] { - "--isSparkSessionManaged", Boolean.FALSE.toString(), - "--inputPath", workingDir.toString() + "/publication", - "-graphTableClassName", Publication.class.getCanonicalName(), - "-workingPath", workingDir.toString() + "/working", - "-contextId","sobigdata", - "-verifyParam","gCube " - }); + Assertions.assertEquals(7, tmp.count()); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaRDD tmp = sc - .textFile(workingDir.toString() + "/publication") - .map(item -> OBJECT_MAPPER.readValue(item, Publication.class)); + // original result with sobigdata context and gcube as starting string in the main title for the publication + Assertions + .assertEquals( + 0, + tmp + .filter(p -> p.getId().equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")) + .collect() + .get(0) + .getContext() + .size()); - Assertions.assertEquals(7, tmp.count()); + // original result with sobigdata context without gcube as starting string in the main title for the publication + Assertions + .assertEquals( + 1, + tmp + .filter(p -> p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")) + .collect() + .get(0) + .getContext() + .size()); + Assertions + .assertEquals( + "sobigdata::projects::2", + tmp + .filter(p -> p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")) + .collect() + .get(0) + .getContext() + .get(0) + .getId()); - //original result with sobigdata context and gcube as starting string in the main title for the publication - Assertions.assertEquals(0, - tmp.filter(p->p.getId().equals("50|DansKnawCris::0224aae28af558f21768dbc6439c7a95")).collect().get(0).getContext().size()); + // original result with sobigdata context with gcube as starting string in the subtitle + Assertions + .assertEquals( + 1, + tmp + .filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af")) + .collect() + .get(0) + .getContext() + .size()); + Assertions + .assertEquals( + "sobigdata::projects::2", + tmp + .filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af")) + .collect() + .get(0) + .getContext() + .get(0) + .getId()); + List titles = tmp + .filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af")) + .collect() + .get(0) + .getTitle(); + Assertions.assertEquals(1, titles.size()); + Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix)); + Assertions.assertEquals("subtitle", titles.get(0).getQualifier().getClassid()); - //original result with sobigdata context without gcube as starting string in the main title for the publication - Assertions.assertEquals(1, - tmp.filter(p->p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).collect().get(0).getContext().size()); - Assertions.assertEquals("sobigdata::projects::2",tmp.filter(p->p.getId().equals("50|DansKnawCris::20c414a3b1c742d5dd3851f1b67df2d9")).collect().get(0).getContext().get(0).getId() ); + // original result with sobigdata context with gcube not as starting string in the main title + Assertions + .assertEquals( + 1, + tmp + .filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f")) + .collect() + .get(0) + .getContext() + .size()); + Assertions + .assertEquals( + "sobigdata::projects::1", + tmp + .filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f")) + .collect() + .get(0) + .getContext() + .get(0) + .getId()); + titles = tmp + .filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f")) + .collect() + .get(0) + .getTitle(); + Assertions.assertEquals(1, titles.size()); + Assertions.assertFalse(titles.get(0).getValue().toLowerCase().startsWith(prefix)); + Assertions.assertTrue(titles.get(0).getValue().toLowerCase().contains(prefix.trim())); + Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid()); - //original result with sobigdata context with gcube as starting string in the subtitle - Assertions.assertEquals(1, - tmp.filter(p->p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af")).collect().get(0).getContext().size()); - Assertions.assertEquals("sobigdata::projects::2",tmp.filter(p->p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af")).collect().get(0).getContext().get(0).getId() ); - List titles = tmp.filter(p -> p.getId().equals("50|DansKnawCris::3c81248c335f0aa07e06817ece6fa6af")).collect().get(0).getTitle(); - Assertions.assertEquals(1, titles.size()); - Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix) ); - Assertions.assertEquals("subtitle", titles.get(0).getQualifier().getClassid()); + // original result with sobigdata in context and also other contexts with gcube as starting string for the main + // title + Assertions + .assertEquals( + 1, + tmp + .filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53")) + .collect() + .get(0) + .getContext() + .size()); + Assertions + .assertEquals( + "dh-ch", + tmp + .filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53")) + .collect() + .get(0) + .getContext() + .get(0) + .getId()); + titles = tmp + .filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53")) + .collect() + .get(0) + .getTitle(); + Assertions.assertEquals(1, titles.size()); + Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix)); + Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid()); - //original result with sobigdata context with gcube not as starting string in the main title - Assertions.assertEquals(1, - tmp.filter(p->p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f")).collect().get(0).getContext().size()); - Assertions.assertEquals("sobigdata::projects::1",tmp.filter(p->p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f")).collect().get(0).getContext().get(0).getId() ); - titles = tmp.filter(p -> p.getId().equals("50|DansKnawCris::3c9f068ddc930360bec6925488a9a97f")).collect().get(0).getTitle(); - Assertions.assertEquals(1, titles.size()); - Assertions.assertFalse(titles.get(0).getValue().toLowerCase().startsWith(prefix) ); - Assertions.assertTrue(titles.get(0).getValue().toLowerCase().contains(prefix.trim()) ); - Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid()); + // original result with multiple main title one of which whith gcube as starting string and with 2 contextes + Assertions + .assertEquals( + 1, + tmp + .filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff")) + .collect() + .get(0) + .getContext() + .size()); + Assertions + .assertEquals( + "dh-ch", + tmp + .filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff")) + .collect() + .get(0) + .getContext() + .get(0) + .getId()); + titles = tmp + .filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff")) + .collect() + .get(0) + .getTitle(); + Assertions.assertEquals(2, titles.size()); + Assertions + .assertTrue( + titles + .stream() + .anyMatch( + t -> t.getQualifier().getClassid().equals("main title") + && t.getValue().toLowerCase().startsWith(prefix))); - //original result with sobigdata in context and also other contexts with gcube as starting string for the main title - Assertions.assertEquals(1, - tmp.filter(p->p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53")).collect().get(0).getContext().size()); - Assertions.assertEquals("dh-ch",tmp.filter(p->p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53")).collect().get(0).getContext().get(0).getId() ); - titles = tmp.filter(p -> p.getId().equals("50|DansKnawCris::4669a378a73661417182c208e6fdab53")).collect().get(0).getTitle(); - Assertions.assertEquals(1, titles.size()); - Assertions.assertTrue(titles.get(0).getValue().toLowerCase().startsWith(prefix) ); - Assertions.assertEquals("main title", titles.get(0).getQualifier().getClassid()); + // original result without sobigdata in context with gcube as starting string for the main title + Assertions + .assertEquals( + 1, + tmp + .filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8")) + .collect() + .get(0) + .getContext() + .size()); + Assertions + .assertEquals( + "dh-ch", + tmp + .filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8")) + .collect() + .get(0) + .getContext() + .get(0) + .getId()); + titles = tmp + .filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8")) + .collect() + .get(0) + .getTitle(); + Assertions.assertEquals(2, titles.size()); - //original result with multiple main title one of which whith gcube as starting string and with 2 contextes - Assertions.assertEquals(1, - tmp.filter(p->p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff")).collect().get(0).getContext().size()); - Assertions.assertEquals("dh-ch",tmp.filter(p->p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff")).collect().get(0).getContext().get(0).getId() ); - titles = tmp.filter(p -> p.getId().equals("50|DansKnawCris::4a9152e80f860eab99072e921d74a0ff")).collect().get(0).getTitle(); - Assertions.assertEquals(2, titles.size()); - Assertions.assertTrue(titles.stream().anyMatch(t -> t.getQualifier().getClassid().equals("main title") && t.getValue().toLowerCase().startsWith(prefix)) ); + Assertions + .assertTrue( + titles + .stream() + .anyMatch( + t -> t.getQualifier().getClassid().equals("main title") + && t.getValue().toLowerCase().startsWith(prefix))); - - //original result without sobigdata in context with gcube as starting string for the main title - Assertions.assertEquals(1, - tmp.filter(p->p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8")).collect().get(0).getContext().size()); - Assertions.assertEquals("dh-ch",tmp.filter(p->p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8")).collect().get(0).getContext().get(0).getId() ); - titles = tmp.filter(p -> p.getId().equals("50|dedup_wf_001::01e6a28565ca01376b7548e530c6f6e8")).collect().get(0).getTitle(); - Assertions.assertEquals(2, titles.size()); - - Assertions.assertTrue(titles.stream().anyMatch(t -> t.getQualifier().getClassid().equals("main title") && t.getValue().toLowerCase().startsWith(prefix))); - - - } + } } From aa12429f500e563a114b39650b3d8c13c361d3fa Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 22 Apr 2022 11:05:08 +0200 Subject: [PATCH 2/3] Modified last intersection since we lost many titles. --- .../doiboost/SparkGenerateDoiBoost.scala | 83 ++++++++----------- 1 file changed, 34 insertions(+), 49 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala index 9323c994cf..10a5be969d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala @@ -59,52 +59,6 @@ object SparkGenerateDoiBoost { val workingDirPath = parser.get("workingPath") val openaireOrganizationPath = parser.get("openaireOrganizationPath") - val crossrefAggregator = new Aggregator[(String, Publication), Publication, Publication] with Serializable { - override def zero: Publication = new Publication - - override def reduce(b: Publication, a: (String, Publication)): Publication = { - - if (b == null) { - if (a != null && a._2 != null) { - a._2.setId(a._1) - return a._2 - } - } else { - if (a != null && a._2 != null) { - b.mergeFrom(a._2) - b.setId(a._1) - val authors = AuthorMerger.mergeAuthor(b.getAuthor, a._2.getAuthor) - b.setAuthor(authors) - return b - } - } - new Publication - } - - override def merge(b1: Publication, b2: Publication): Publication = { - if (b1 == null) { - if (b2 != null) - return b2 - } else { - if (b2 != null) { - b1.mergeFrom(b2) - val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor) - b1.setAuthor(authors) - if (b2.getId != null && b2.getId.nonEmpty) - b1.setId(b2.getId) - return b1 - } - } - new Publication - } - - override def finish(reduction: Publication): Publication = reduction - - override def bufferEncoder: Encoder[Publication] = Encoders.kryo[Publication] - - override def outputEncoder: Encoder[Publication] = Encoders.kryo[Publication] - } - implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization] implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset] @@ -175,12 +129,43 @@ object SparkGenerateDoiBoost { .map(DoiBoostMappingUtil.fixPublication) .map(p => (p.getId, p)) .groupByKey(_._1) - .agg(crossrefAggregator.toColumn) - .map(p => p._2) + .reduceGroups((left, right) => + { + //Check left is not null + if (left != null && left._1 != null) + { + //If right is null then return left + if (right == null || right._2 == null) + left + else { + // Here Left and Right are not null + // So we have to merge + val b1 = left._2 + val b2 = right._2 + b1.mergeFrom(b2) + b1.mergeOAFDataInfo(b2) + val authors = AuthorMerger.mergeAuthor(b1.getAuthor, b2.getAuthor) + b1.setAuthor(authors) + if (b2.getId != null && b2.getId.nonEmpty) + b1.setId(b2.getId) + //Return publication Merged + (b1.getId, b1) + } + } + else { + // Left is Null so we return right + right + } + } + + ) + .filter(s => s!= null && s._2!=null) + .map(s => s._2._2) .write .mode(SaveMode.Overwrite) .save(s"$workingDirPath/doiBoostPublicationFiltered") + val affiliationPath = parser.get("affiliationPath") val paperAffiliationPath = parser.get("paperAffiliationPath") @@ -305,4 +290,4 @@ object SparkGenerateDoiBoost { .save(s"$workingDirPath/doiBoostOrganization") } -} +} \ No newline at end of file From a82ec3aaaf60ea77918f3ff1cfab20538ef6fe7e Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 22 Apr 2022 11:08:13 +0200 Subject: [PATCH 3/3] code formatter --- .../doiboost/SparkGenerateDoiBoost.scala | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala index 10a5be969d..2cbd530979 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala @@ -129,11 +129,9 @@ object SparkGenerateDoiBoost { .map(DoiBoostMappingUtil.fixPublication) .map(p => (p.getId, p)) .groupByKey(_._1) - .reduceGroups((left, right) => - { + .reduceGroups((left, right) => { //Check left is not null - if (left != null && left._1 != null) - { + if (left != null && left._1 != null) { //If right is null then return left if (right == null || right._2 == null) left @@ -151,21 +149,17 @@ object SparkGenerateDoiBoost { //Return publication Merged (b1.getId, b1) } - } - else { + } else { // Left is Null so we return right right } - } - - ) - .filter(s => s!= null && s._2!=null) + }) + .filter(s => s != null && s._2 != null) .map(s => s._2._2) .write .mode(SaveMode.Overwrite) .save(s"$workingDirPath/doiBoostPublicationFiltered") - val affiliationPath = parser.get("affiliationPath") val paperAffiliationPath = parser.get("paperAffiliationPath") @@ -290,4 +284,4 @@ object SparkGenerateDoiBoost { .save(s"$workingDirPath/doiBoostOrganization") } -} \ No newline at end of file +}