From 438abdf96fcdc340dfdbe18fa51c56c8a3ce5657 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Wed, 20 Jul 2022 18:07:54 +0200 Subject: [PATCH 1/4] [EOSC TAG] adding eosc interoperability guidelines in the specific element in the result. Removed from subjects. Removed also the deletion of EOSC Jupyter Notebook from subject since now the criteria are searchd for in a different place --- .../eu/dnetlib/dhp/bulktag/SparkEoscTag.java | 82 ++++++------------- pom.xml | 2 +- 2 files changed, 27 insertions(+), 57 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java index b9de5dd11..d31934081 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java @@ -28,28 +28,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; public class SparkEoscTag { private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static final Qualifier EOSC_QUALIFIER = OafMapperUtils - .qualifier( - "EOSC", - "European Open Science Cloud", - ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES); - public static final DataInfo EOSC_DATAINFO = OafMapperUtils - .dataInfo( - false, "propagation", true, false, - OafMapperUtils - .qualifier( - "propagation:subject", "Inferred by OpenAIRE", - ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS), - "0.9"); - public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils - .structuredProperty( - "EOSC::Jupyter Notebook", EOSC_QUALIFIER, EOSC_DATAINFO); - public final static StructuredProperty EOSC_GALAXY = OafMapperUtils - .structuredProperty( - "EOSC::Galaxy Workflow", EOSC_QUALIFIER, EOSC_DATAINFO); - public final static StructuredProperty EOSC_TWITTER = OafMapperUtils - .structuredProperty( - "EOSC::Twitter Data", EOSC_QUALIFIER, EOSC_DATAINFO); + public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils @@ -84,29 +63,30 @@ public class SparkEoscTag { }); } + public static EoscIfGuidelines newInstance(String code, String label, String url, String semantics){ + EoscIfGuidelines eig = new EoscIfGuidelines(); + eig.setCode( code); + eig.setLabel(label); + eig.setUrl(url); + eig.setSemanticRelation(semantics); + return eig; + + } private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) { readPath(spark, inputPath + "/software", Software.class) .map((MapFunction) s -> { - List sbject; - if (!Optional.ofNullable(s.getSubject()).isPresent()) - s.setSubject(new ArrayList<>()); - sbject = s.getSubject(); if (containsCriteriaNotebook(s)) { - sbject.add(EOSC_NOTEBOOK); - if (sbject.stream().anyMatch(sb -> sb.getValue().equals("EOSC Jupyter Notebook"))) { - sbject = sbject.stream().map(sb -> { - if (sb.getValue().equals("EOSC Jupyter Notebook")) { - return null; - } - return sb; - }).filter(Objects::nonNull).collect(Collectors.toList()); - s.setSubject(sbject); - } + if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) + s.setEoscifguidelines(new ArrayList<>()); + s.getEoscifguidelines().add(newInstance("EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", "compliesWith")); } if (containsCriteriaGalaxy(s)) { - sbject.add(EOSC_GALAXY); + if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) + s.setEoscifguidelines(new ArrayList<>()); + + s.getEoscifguidelines().add(newInstance("EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith")); } return s; }, Encoders.bean(Software.class)) @@ -124,14 +104,14 @@ public class SparkEoscTag { readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class) .map((MapFunction) orp -> { List sbject; - if (!Optional.ofNullable(orp.getSubject()).isPresent()) - orp.setSubject(new ArrayList<>()); - sbject = orp.getSubject(); + if (!Optional.ofNullable(orp.getEoscifguidelines()).isPresent()) + orp.setEoscifguidelines(new ArrayList<>()); + if (containsCriteriaGalaxy(orp)) { - sbject.add(EOSC_GALAXY); + orp.getEoscifguidelines().add(newInstance("EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith")); } if (containscriteriaTwitter(orp)) { - sbject.add(EOSC_TWITTER); + orp.getEoscifguidelines().add(newInstance("EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith")); } return orp; }, Encoders.bean(OtherResearchProduct.class)) @@ -149,11 +129,10 @@ public class SparkEoscTag { readPath(spark, inputPath + "/dataset", Dataset.class) .map((MapFunction) d -> { List sbject; - if (!Optional.ofNullable(d.getSubject()).isPresent()) - d.setSubject(new ArrayList<>()); - sbject = d.getSubject(); + if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent()) + d.setEoscifguidelines(new ArrayList<>()); if (containscriteriaTwitter(d)) { - sbject.add(EOSC_TWITTER); + d.getEoscifguidelines().add(newInstance("EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith")); } return d; }, Encoders.bean(Dataset.class)) @@ -212,13 +191,6 @@ public class SparkEoscTag { return false; } - private static Set getSubjects(List s) { - Set subjects = new HashSet<>(); - s.stream().forEach(sbj -> subjects.addAll(Arrays.asList(sbj.getValue().toLowerCase().split(" ")))); - s.stream().forEach(sbj -> subjects.add(sbj.getValue().toLowerCase())); - return subjects; - } - private static Set getWordsSP(List elem) { Set words = new HashSet<>(); Optional @@ -242,9 +214,7 @@ public class SparkEoscTag { t -> words .addAll( Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" "))))); -// elem -// .forEach( -// t -> words.addAll(Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" ")))); + return words; } diff --git a/pom.xml b/pom.xml index 54070f654..973bc3773 100644 --- a/pom.xml +++ b/pom.xml @@ -801,7 +801,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [2.12.0] + [2.12.2-SNAPSHOT] [4.0.3] [6.0.5] [3.1.6] From 5143a802320b3a64a16b09f298f005d4fbce78e9 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 21 Jul 2022 11:56:51 +0200 Subject: [PATCH 2/4] [EOSC TAG] modification of test class to align with new element --- .../dnetlib/dhp/bulktag/EOSCTagJobTest.java | 174 ++++++++++++++++-- 1 file changed, 155 insertions(+), 19 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java index 1ea254157..b1c0cbb84 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java @@ -126,8 +126,9 @@ public class EOSCTagJobTest { .assertEquals( 4, tmp + .filter(s -> s.getEoscifguidelines()!= null) .filter( - s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) + s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) .count()); Assertions @@ -136,17 +137,36 @@ public class EOSCTagJobTest { .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .collect() .get(0) - .getSubject() + .getEoscifguidelines() .size()); + + Assertions + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getSubject() + .size()); Assertions .assertTrue( tmp .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .collect() .get(0) - .getSubject() + .getEoscifguidelines() .stream() - .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); + + Assertions + .assertFalse( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getSubject() + .stream() + .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); Assertions .assertEquals( @@ -166,16 +186,23 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions.assertTrue(tmp + .filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")) + .collect() + .get(0) + .getEoscifguidelines() == null + ); + Assertions .assertEquals( - 9, tmp + 8, tmp .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( tmp .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) .collect() @@ -183,6 +210,23 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); Assertions .assertEquals( @@ -201,17 +245,24 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions.assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")) + .collect() + .get(0) + .getEoscifguidelines() == null + ); Assertions .assertEquals( - 9, tmp + 8, tmp .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( tmp .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) .collect() @@ -219,14 +270,27 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions.assertEquals(1, + tmp + .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions.assertTrue(tmp + .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); List subjects = tmp .filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244")) .collect() .get(0) .getSubject(); - Assertions.assertEquals(8, subjects.size()); - Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions.assertEquals(7, subjects.size()); Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("jupyter"))); Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("Modeling and Simulation"))); Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("structure granulaire"))); @@ -250,6 +314,14 @@ public class EOSCTagJobTest { .filter( ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) .count()); + Assertions + .assertEquals( + 0, sc + .textFile(workingDir.toString() + "/input/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)) + .filter( + ds -> ds.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) + .count()); Assertions .assertEquals( @@ -264,9 +336,18 @@ public class EOSCTagJobTest { .textFile(workingDir.toString() + "/input/otherresearchproduct") .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) .filter( - ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) + orp -> orp.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) .count()); + Assertions + .assertEquals( + 0, sc + .textFile(workingDir.toString() + "/input/otherresearchproduct") + .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) + .filter( + orp -> orp.getSubject().stream().anyMatch(eig -> eig.getValue().equals("EOSC::Jupyter Notebook"))) + .count()); + // spark.stop(); } @@ -326,22 +407,37 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 1, + 0, tmp .filter( s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) .count()); + Assertions + .assertEquals( + 1, + tmp + .filter( + s -> s.getEoscifguidelines()!=null) + .count()); + Assertions + .assertEquals( + 1, + tmp + .filter( + s -> s.getEoscifguidelines()!=null) + .filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) + .count()); Assertions .assertEquals( - 2, tmp + 1, tmp .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( tmp .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .collect() @@ -350,6 +446,19 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); + Assertions.assertEquals(1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .size() ); + Assertions.assertTrue(tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))); + Assertions .assertEquals( 5, tmp @@ -385,22 +494,26 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 1, + 0, orp .filter( s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) .count()); + orp.foreach(o-> System.out.println(OBJECT_MAPPER.writeValueAsString(o))); + + Assertions.assertEquals(1, orp.filter(o -> o.getEoscifguidelines() != null) + .filter(o -> o.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))).count()); Assertions .assertEquals( - 3, orp + 2, orp .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( orp .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) .collect() @@ -408,6 +521,23 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); + Assertions + .assertEquals( + 1, orp + .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + orp + .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Galaxy Workflow"))); Assertions .assertEquals( @@ -516,10 +646,16 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 3, + 0, orp .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data"))) .count()); + Assertions + .assertEquals( + 3, + orp + .filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) + .count()); JavaRDD dats = sc .textFile(workingDir.toString() + "/input/dataset") @@ -531,7 +667,7 @@ public class EOSCTagJobTest { .assertEquals( 3, dats - .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data"))) + .filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) .count()); } From 56d09e6348d63d531250ea6f075c1d306c07ed66 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 21 Jul 2022 14:36:48 +0200 Subject: [PATCH 3/4] [EOSC TAG] before adding the tag added a step to verify the same tag is not already present --- .../eu/dnetlib/dhp/bulktag/SparkEoscTag.java | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java index d31934081..c5ed0b45c 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java @@ -80,13 +80,14 @@ public class SparkEoscTag { if (containsCriteriaNotebook(s)) { if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) s.setEoscifguidelines(new ArrayList<>()); - s.getEoscifguidelines().add(newInstance("EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", "compliesWith")); + addEIG(s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", "compliesWith"); + } if (containsCriteriaGalaxy(s)) { if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) s.setEoscifguidelines(new ArrayList<>()); - s.getEoscifguidelines().add(newInstance("EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith")); + addEIG(s.getEoscifguidelines(),"EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); } return s; }, Encoders.bean(Software.class)) @@ -103,15 +104,15 @@ public class SparkEoscTag { readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class) .map((MapFunction) orp -> { - List sbject; + if (!Optional.ofNullable(orp.getEoscifguidelines()).isPresent()) orp.setEoscifguidelines(new ArrayList<>()); if (containsCriteriaGalaxy(orp)) { - orp.getEoscifguidelines().add(newInstance("EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith")); + addEIG(orp.getEoscifguidelines(),"EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); } if (containscriteriaTwitter(orp)) { - orp.getEoscifguidelines().add(newInstance("EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith")); + addEIG(orp.getEoscifguidelines(),"EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); } return orp; }, Encoders.bean(OtherResearchProduct.class)) @@ -128,11 +129,11 @@ public class SparkEoscTag { readPath(spark, inputPath + "/dataset", Dataset.class) .map((MapFunction) d -> { - List sbject; + if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent()) d.setEoscifguidelines(new ArrayList<>()); if (containscriteriaTwitter(d)) { - d.getEoscifguidelines().add(newInstance("EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith")); + addEIG(d.getEoscifguidelines(),"EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); } return d; }, Encoders.bean(Dataset.class)) @@ -148,6 +149,12 @@ public class SparkEoscTag { .json(inputPath + "/dataset"); } + private static void addEIG(List eoscifguidelines, String code, String label, String url, String sem) { + if (!eoscifguidelines.stream().anyMatch(eig -> eig.getCode().equals(code))) + eoscifguidelines.add(newInstance(code, label, url, sem)); + } + + private static boolean containscriteriaTwitter(Result r) { Set words = getWordsSP(r.getTitle()); words.addAll(getWordsF(r.getDescription())); From 3be036f290f3cb569465cf9566557e51c4660288 Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Thu, 21 Jul 2022 14:45:43 +0200 Subject: [PATCH 4/4] [EOSC TAG] refactoring after compilation --- .../eu/dnetlib/dhp/bulktag/SparkEoscTag.java | 25 +- .../dnetlib/dhp/bulktag/EOSCTagJobTest.java | 284 ++++++++++-------- 2 files changed, 178 insertions(+), 131 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java index c5ed0b45c..730e8a3fe 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java @@ -29,7 +29,6 @@ public class SparkEoscTag { private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils .toString( @@ -63,15 +62,16 @@ public class SparkEoscTag { }); } - public static EoscIfGuidelines newInstance(String code, String label, String url, String semantics){ + public static EoscIfGuidelines newInstance(String code, String label, String url, String semantics) { EoscIfGuidelines eig = new EoscIfGuidelines(); - eig.setCode( code); + eig.setCode(code); eig.setLabel(label); eig.setUrl(url); eig.setSemanticRelation(semantics); return eig; } + private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) { readPath(spark, inputPath + "/software", Software.class) @@ -80,14 +80,17 @@ public class SparkEoscTag { if (containsCriteriaNotebook(s)) { if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) s.setEoscifguidelines(new ArrayList<>()); - addEIG(s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", "compliesWith"); + addEIG( + s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", + "compliesWith"); } if (containsCriteriaGalaxy(s)) { if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) s.setEoscifguidelines(new ArrayList<>()); - addEIG(s.getEoscifguidelines(),"EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); + addEIG( + s.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); } return s; }, Encoders.bean(Software.class)) @@ -109,10 +112,12 @@ public class SparkEoscTag { orp.setEoscifguidelines(new ArrayList<>()); if (containsCriteriaGalaxy(orp)) { - addEIG(orp.getEoscifguidelines(),"EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); + addEIG( + orp.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", + "compliesWith"); } if (containscriteriaTwitter(orp)) { - addEIG(orp.getEoscifguidelines(),"EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); + addEIG(orp.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); } return orp; }, Encoders.bean(OtherResearchProduct.class)) @@ -133,7 +138,7 @@ public class SparkEoscTag { if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent()) d.setEoscifguidelines(new ArrayList<>()); if (containscriteriaTwitter(d)) { - addEIG(d.getEoscifguidelines(),"EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); + addEIG(d.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); } return d; }, Encoders.bean(Dataset.class)) @@ -149,12 +154,12 @@ public class SparkEoscTag { .json(inputPath + "/dataset"); } - private static void addEIG(List eoscifguidelines, String code, String label, String url, String sem) { + private static void addEIG(List eoscifguidelines, String code, String label, String url, + String sem) { if (!eoscifguidelines.stream().anyMatch(eig -> eig.getCode().equals(code))) eoscifguidelines.add(newInstance(code, label, url, sem)); } - private static boolean containscriteriaTwitter(Result r) { Set words = getWordsSP(r.getTitle()); words.addAll(getWordsF(r.getDescription())); diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java index b1c0cbb84..5f47da10e 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java @@ -126,9 +126,12 @@ public class EOSCTagJobTest { .assertEquals( 4, tmp - .filter(s -> s.getEoscifguidelines()!= null) + .filter(s -> s.getEoscifguidelines() != null) .filter( - s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) .count()); Assertions @@ -141,13 +144,13 @@ public class EOSCTagJobTest { .size()); Assertions - .assertEquals( - 1, tmp - .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) - .collect() - .get(0) - .getSubject() - .size()); + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getSubject() + .size()); Assertions .assertTrue( tmp @@ -159,14 +162,14 @@ public class EOSCTagJobTest { .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); Assertions - .assertFalse( - tmp - .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) - .collect() - .get(0) - .getSubject() - .stream() - .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + .assertFalse( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getSubject() + .stream() + .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); Assertions .assertEquals( @@ -186,12 +189,13 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); - Assertions.assertTrue(tmp - .filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")) - .collect() - .get(0) - .getEoscifguidelines() == null - ); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")) + .collect() + .get(0) + .getEoscifguidelines() == null); Assertions .assertEquals( @@ -211,22 +215,22 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); Assertions - .assertEquals( - 1, tmp - .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) - .collect() - .get(0) - .getEoscifguidelines() - .size()); + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); Assertions - .assertTrue( - tmp - .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) - .collect() - .get(0) - .getEoscifguidelines() - .stream() - .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); Assertions .assertEquals( @@ -245,13 +249,13 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); - Assertions.assertTrue( + Assertions + .assertTrue( tmp - .filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")) - .collect() - .get(0) - .getEoscifguidelines() == null - ); + .filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")) + .collect() + .get(0) + .getEoscifguidelines() == null); Assertions .assertEquals( @@ -270,20 +274,24 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); - Assertions.assertEquals(1, + Assertions + .assertEquals( + 1, tmp - .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) - .collect() - .get(0) - .getEoscifguidelines() - .size()); - Assertions.assertTrue(tmp - .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) - .collect() - .get(0) - .getEoscifguidelines() - .stream() - .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); + .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); List subjects = tmp .filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244")) @@ -315,13 +323,16 @@ public class EOSCTagJobTest { ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) .count()); Assertions - .assertEquals( - 0, sc - .textFile(workingDir.toString() + "/input/dataset") - .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)) - .filter( - ds -> ds.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) - .count()); + .assertEquals( + 0, sc + .textFile(workingDir.toString() + "/input/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)) + .filter( + ds -> ds + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) + .count()); Assertions .assertEquals( @@ -336,17 +347,23 @@ public class EOSCTagJobTest { .textFile(workingDir.toString() + "/input/otherresearchproduct") .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) .filter( - orp -> orp.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) + orp -> orp + .getSubject() + .stream() + .anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) .count()); Assertions - .assertEquals( - 0, sc - .textFile(workingDir.toString() + "/input/otherresearchproduct") - .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) - .filter( - orp -> orp.getSubject().stream().anyMatch(eig -> eig.getValue().equals("EOSC::Jupyter Notebook"))) - .count()); + .assertEquals( + 0, sc + .textFile(workingDir.toString() + "/input/otherresearchproduct") + .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) + .filter( + orp -> orp + .getSubject() + .stream() + .anyMatch(eig -> eig.getValue().equals("EOSC::Jupyter Notebook"))) + .count()); // spark.stop(); } @@ -413,20 +430,24 @@ public class EOSCTagJobTest { s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) .count()); Assertions - .assertEquals( - 1, - tmp - .filter( - s -> s.getEoscifguidelines()!=null) - .count()); + .assertEquals( + 1, + tmp + .filter( + s -> s.getEoscifguidelines() != null) + .count()); Assertions - .assertEquals( - 1, - tmp - .filter( - s -> s.getEoscifguidelines()!=null) - .filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) - .count()); + .assertEquals( + 1, + tmp + .filter( + s -> s.getEoscifguidelines() != null) + .filter( + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) + .count()); Assertions .assertEquals( @@ -446,18 +467,23 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); - Assertions.assertEquals(1, tmp - .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) - .collect() - .get(0) - .getEoscifguidelines() - .size() ); - Assertions.assertTrue(tmp - .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) - .collect() - .get(0) - .getEoscifguidelines() - .stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))); + Assertions + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))); Assertions .assertEquals( @@ -499,10 +525,18 @@ public class EOSCTagJobTest { .filter( s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) .count()); - orp.foreach(o-> System.out.println(OBJECT_MAPPER.writeValueAsString(o))); + orp.foreach(o -> System.out.println(OBJECT_MAPPER.writeValueAsString(o))); - Assertions.assertEquals(1, orp.filter(o -> o.getEoscifguidelines() != null) - .filter(o -> o.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))).count()); + Assertions + .assertEquals( + 1, orp + .filter(o -> o.getEoscifguidelines() != null) + .filter( + o -> o + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) + .count()); Assertions .assertEquals( @@ -522,22 +556,22 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); Assertions - .assertEquals( - 1, orp - .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) - .collect() - .get(0) - .getEoscifguidelines() - .size()); + .assertEquals( + 1, orp + .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); Assertions - .assertTrue( - orp - .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) - .collect() - .get(0) - .getEoscifguidelines() - .stream() - .anyMatch(s -> s.getCode().equals("EOSC::Galaxy Workflow"))); + .assertTrue( + orp + .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Galaxy Workflow"))); Assertions .assertEquals( @@ -651,11 +685,15 @@ public class EOSCTagJobTest { .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data"))) .count()); Assertions - .assertEquals( - 3, - orp - .filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) - .count()); + .assertEquals( + 3, + orp + .filter( + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) + .count()); JavaRDD dats = sc .textFile(workingDir.toString() + "/input/dataset") @@ -667,7 +705,11 @@ public class EOSCTagJobTest { .assertEquals( 3, dats - .filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) + .filter( + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) .count()); }