diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java index c5ed0b45ce..730e8a3fe7 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java @@ -29,7 +29,6 @@ public class SparkEoscTag { private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils .toString( @@ -63,15 +62,16 @@ public class SparkEoscTag { }); } - public static EoscIfGuidelines newInstance(String code, String label, String url, String semantics){ + public static EoscIfGuidelines newInstance(String code, String label, String url, String semantics) { EoscIfGuidelines eig = new EoscIfGuidelines(); - eig.setCode( code); + eig.setCode(code); eig.setLabel(label); eig.setUrl(url); eig.setSemanticRelation(semantics); return eig; } + private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) { readPath(spark, inputPath + "/software", Software.class) @@ -80,14 +80,17 @@ public class SparkEoscTag { if (containsCriteriaNotebook(s)) { if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) s.setEoscifguidelines(new ArrayList<>()); - addEIG(s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", "compliesWith"); + addEIG( + s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", + "compliesWith"); } if (containsCriteriaGalaxy(s)) { if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) s.setEoscifguidelines(new ArrayList<>()); - addEIG(s.getEoscifguidelines(),"EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); + addEIG( + s.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); } return s; }, Encoders.bean(Software.class)) @@ -109,10 +112,12 @@ public class SparkEoscTag { orp.setEoscifguidelines(new ArrayList<>()); if (containsCriteriaGalaxy(orp)) { - addEIG(orp.getEoscifguidelines(),"EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); + addEIG( + orp.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", + "compliesWith"); } if (containscriteriaTwitter(orp)) { - addEIG(orp.getEoscifguidelines(),"EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); + addEIG(orp.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); } return orp; }, Encoders.bean(OtherResearchProduct.class)) @@ -133,7 +138,7 @@ public class SparkEoscTag { if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent()) d.setEoscifguidelines(new ArrayList<>()); if (containscriteriaTwitter(d)) { - addEIG(d.getEoscifguidelines(),"EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); + addEIG(d.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); } return d; }, Encoders.bean(Dataset.class)) @@ -149,12 +154,12 @@ public class SparkEoscTag { .json(inputPath + "/dataset"); } - private static void addEIG(List eoscifguidelines, String code, String label, String url, String sem) { + private static void addEIG(List eoscifguidelines, String code, String label, String url, + String sem) { if (!eoscifguidelines.stream().anyMatch(eig -> eig.getCode().equals(code))) eoscifguidelines.add(newInstance(code, label, url, sem)); } - private static boolean containscriteriaTwitter(Result r) { Set words = getWordsSP(r.getTitle()); words.addAll(getWordsF(r.getDescription())); diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java index b1c0cbb84a..5f47da10e5 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java @@ -126,9 +126,12 @@ public class EOSCTagJobTest { .assertEquals( 4, tmp - .filter(s -> s.getEoscifguidelines()!= null) + .filter(s -> s.getEoscifguidelines() != null) .filter( - s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) .count()); Assertions @@ -141,13 +144,13 @@ public class EOSCTagJobTest { .size()); Assertions - .assertEquals( - 1, tmp - .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) - .collect() - .get(0) - .getSubject() - .size()); + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getSubject() + .size()); Assertions .assertTrue( tmp @@ -159,14 +162,14 @@ public class EOSCTagJobTest { .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); Assertions - .assertFalse( - tmp - .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) - .collect() - .get(0) - .getSubject() - .stream() - .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + .assertFalse( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getSubject() + .stream() + .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); Assertions .assertEquals( @@ -186,12 +189,13 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); - Assertions.assertTrue(tmp - .filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")) - .collect() - .get(0) - .getEoscifguidelines() == null - ); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")) + .collect() + .get(0) + .getEoscifguidelines() == null); Assertions .assertEquals( @@ -211,22 +215,22 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); Assertions - .assertEquals( - 1, tmp - .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) - .collect() - .get(0) - .getEoscifguidelines() - .size()); + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); Assertions - .assertTrue( - tmp - .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) - .collect() - .get(0) - .getEoscifguidelines() - .stream() - .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); Assertions .assertEquals( @@ -245,13 +249,13 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); - Assertions.assertTrue( + Assertions + .assertTrue( tmp - .filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")) - .collect() - .get(0) - .getEoscifguidelines() == null - ); + .filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")) + .collect() + .get(0) + .getEoscifguidelines() == null); Assertions .assertEquals( @@ -270,20 +274,24 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); - Assertions.assertEquals(1, + Assertions + .assertEquals( + 1, tmp - .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) - .collect() - .get(0) - .getEoscifguidelines() - .size()); - Assertions.assertTrue(tmp - .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) - .collect() - .get(0) - .getEoscifguidelines() - .stream() - .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); + .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); List subjects = tmp .filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244")) @@ -315,13 +323,16 @@ public class EOSCTagJobTest { ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) .count()); Assertions - .assertEquals( - 0, sc - .textFile(workingDir.toString() + "/input/dataset") - .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)) - .filter( - ds -> ds.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) - .count()); + .assertEquals( + 0, sc + .textFile(workingDir.toString() + "/input/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)) + .filter( + ds -> ds + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) + .count()); Assertions .assertEquals( @@ -336,17 +347,23 @@ public class EOSCTagJobTest { .textFile(workingDir.toString() + "/input/otherresearchproduct") .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) .filter( - orp -> orp.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) + orp -> orp + .getSubject() + .stream() + .anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) .count()); Assertions - .assertEquals( - 0, sc - .textFile(workingDir.toString() + "/input/otherresearchproduct") - .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) - .filter( - orp -> orp.getSubject().stream().anyMatch(eig -> eig.getValue().equals("EOSC::Jupyter Notebook"))) - .count()); + .assertEquals( + 0, sc + .textFile(workingDir.toString() + "/input/otherresearchproduct") + .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) + .filter( + orp -> orp + .getSubject() + .stream() + .anyMatch(eig -> eig.getValue().equals("EOSC::Jupyter Notebook"))) + .count()); // spark.stop(); } @@ -413,20 +430,24 @@ public class EOSCTagJobTest { s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) .count()); Assertions - .assertEquals( - 1, - tmp - .filter( - s -> s.getEoscifguidelines()!=null) - .count()); + .assertEquals( + 1, + tmp + .filter( + s -> s.getEoscifguidelines() != null) + .count()); Assertions - .assertEquals( - 1, - tmp - .filter( - s -> s.getEoscifguidelines()!=null) - .filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) - .count()); + .assertEquals( + 1, + tmp + .filter( + s -> s.getEoscifguidelines() != null) + .filter( + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) + .count()); Assertions .assertEquals( @@ -446,18 +467,23 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); - Assertions.assertEquals(1, tmp - .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) - .collect() - .get(0) - .getEoscifguidelines() - .size() ); - Assertions.assertTrue(tmp - .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) - .collect() - .get(0) - .getEoscifguidelines() - .stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))); + Assertions + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))); Assertions .assertEquals( @@ -499,10 +525,18 @@ public class EOSCTagJobTest { .filter( s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) .count()); - orp.foreach(o-> System.out.println(OBJECT_MAPPER.writeValueAsString(o))); + orp.foreach(o -> System.out.println(OBJECT_MAPPER.writeValueAsString(o))); - Assertions.assertEquals(1, orp.filter(o -> o.getEoscifguidelines() != null) - .filter(o -> o.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))).count()); + Assertions + .assertEquals( + 1, orp + .filter(o -> o.getEoscifguidelines() != null) + .filter( + o -> o + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) + .count()); Assertions .assertEquals( @@ -522,22 +556,22 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); Assertions - .assertEquals( - 1, orp - .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) - .collect() - .get(0) - .getEoscifguidelines() - .size()); + .assertEquals( + 1, orp + .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); Assertions - .assertTrue( - orp - .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) - .collect() - .get(0) - .getEoscifguidelines() - .stream() - .anyMatch(s -> s.getCode().equals("EOSC::Galaxy Workflow"))); + .assertTrue( + orp + .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Galaxy Workflow"))); Assertions .assertEquals( @@ -651,11 +685,15 @@ public class EOSCTagJobTest { .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data"))) .count()); Assertions - .assertEquals( - 3, - orp - .filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) - .count()); + .assertEquals( + 3, + orp + .filter( + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) + .count()); JavaRDD dats = sc .textFile(workingDir.toString() + "/input/dataset") @@ -667,7 +705,11 @@ public class EOSCTagJobTest { .assertEquals( 3, dats - .filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) + .filter( + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) .count()); }