diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java index b9de5dd11..730e8a3fe 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkEoscTag.java @@ -28,28 +28,6 @@ import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; public class SparkEoscTag { private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static final Qualifier EOSC_QUALIFIER = OafMapperUtils - .qualifier( - "EOSC", - "European Open Science Cloud", - ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES); - public static final DataInfo EOSC_DATAINFO = OafMapperUtils - .dataInfo( - false, "propagation", true, false, - OafMapperUtils - .qualifier( - "propagation:subject", "Inferred by OpenAIRE", - ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS), - "0.9"); - public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils - .structuredProperty( - "EOSC::Jupyter Notebook", EOSC_QUALIFIER, EOSC_DATAINFO); - public final static StructuredProperty EOSC_GALAXY = OafMapperUtils - .structuredProperty( - "EOSC::Galaxy Workflow", EOSC_QUALIFIER, EOSC_DATAINFO); - public final static StructuredProperty EOSC_TWITTER = OafMapperUtils - .structuredProperty( - "EOSC::Twitter Data", EOSC_QUALIFIER, EOSC_DATAINFO); public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils @@ -84,29 +62,35 @@ public class SparkEoscTag { }); } + public static EoscIfGuidelines newInstance(String code, String label, String url, String semantics) { + EoscIfGuidelines eig = new EoscIfGuidelines(); + eig.setCode(code); + eig.setLabel(label); + eig.setUrl(url); + eig.setSemanticRelation(semantics); + return eig; + + } + private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) { readPath(spark, inputPath + "/software", Software.class) .map((MapFunction) s -> { - List sbject; - if (!Optional.ofNullable(s.getSubject()).isPresent()) - s.setSubject(new ArrayList<>()); - sbject = s.getSubject(); if (containsCriteriaNotebook(s)) { - sbject.add(EOSC_NOTEBOOK); - if (sbject.stream().anyMatch(sb -> sb.getValue().equals("EOSC Jupyter Notebook"))) { - sbject = sbject.stream().map(sb -> { - if (sb.getValue().equals("EOSC Jupyter Notebook")) { - return null; - } - return sb; - }).filter(Objects::nonNull).collect(Collectors.toList()); - s.setSubject(sbject); - } + if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) + s.setEoscifguidelines(new ArrayList<>()); + addEIG( + s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", + "compliesWith"); + } if (containsCriteriaGalaxy(s)) { - sbject.add(EOSC_GALAXY); + if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) + s.setEoscifguidelines(new ArrayList<>()); + + addEIG( + s.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); } return s; }, Encoders.bean(Software.class)) @@ -123,15 +107,17 @@ public class SparkEoscTag { readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class) .map((MapFunction) orp -> { - List sbject; - if (!Optional.ofNullable(orp.getSubject()).isPresent()) - orp.setSubject(new ArrayList<>()); - sbject = orp.getSubject(); + + if (!Optional.ofNullable(orp.getEoscifguidelines()).isPresent()) + orp.setEoscifguidelines(new ArrayList<>()); + if (containsCriteriaGalaxy(orp)) { - sbject.add(EOSC_GALAXY); + addEIG( + orp.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", + "compliesWith"); } if (containscriteriaTwitter(orp)) { - sbject.add(EOSC_TWITTER); + addEIG(orp.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); } return orp; }, Encoders.bean(OtherResearchProduct.class)) @@ -148,12 +134,11 @@ public class SparkEoscTag { readPath(spark, inputPath + "/dataset", Dataset.class) .map((MapFunction) d -> { - List sbject; - if (!Optional.ofNullable(d.getSubject()).isPresent()) - d.setSubject(new ArrayList<>()); - sbject = d.getSubject(); + + if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent()) + d.setEoscifguidelines(new ArrayList<>()); if (containscriteriaTwitter(d)) { - sbject.add(EOSC_TWITTER); + addEIG(d.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); } return d; }, Encoders.bean(Dataset.class)) @@ -169,6 +154,12 @@ public class SparkEoscTag { .json(inputPath + "/dataset"); } + private static void addEIG(List eoscifguidelines, String code, String label, String url, + String sem) { + if (!eoscifguidelines.stream().anyMatch(eig -> eig.getCode().equals(code))) + eoscifguidelines.add(newInstance(code, label, url, sem)); + } + private static boolean containscriteriaTwitter(Result r) { Set words = getWordsSP(r.getTitle()); words.addAll(getWordsF(r.getDescription())); @@ -212,13 +203,6 @@ public class SparkEoscTag { return false; } - private static Set getSubjects(List s) { - Set subjects = new HashSet<>(); - s.stream().forEach(sbj -> subjects.addAll(Arrays.asList(sbj.getValue().toLowerCase().split(" ")))); - s.stream().forEach(sbj -> subjects.add(sbj.getValue().toLowerCase())); - return subjects; - } - private static Set getWordsSP(List elem) { Set words = new HashSet<>(); Optional @@ -242,9 +226,7 @@ public class SparkEoscTag { t -> words .addAll( Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" "))))); -// elem -// .forEach( -// t -> words.addAll(Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" ")))); + return words; } diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java index 1ea254157..5f47da10e 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java @@ -126,10 +126,23 @@ public class EOSCTagJobTest { .assertEquals( 4, tmp + .filter(s -> s.getEoscifguidelines() != null) .filter( - s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) .count()); + Assertions + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions .assertEquals( 1, tmp @@ -140,6 +153,16 @@ public class EOSCTagJobTest { .size()); Assertions .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); + + Assertions + .assertFalse( tmp .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .collect() @@ -166,16 +189,24 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")) + .collect() + .get(0) + .getEoscifguidelines() == null); + Assertions .assertEquals( - 9, tmp + 8, tmp .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( tmp .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) .collect() @@ -183,6 +214,23 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); Assertions .assertEquals( @@ -201,17 +249,24 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")) + .collect() + .get(0) + .getEoscifguidelines() == null); Assertions .assertEquals( - 9, tmp + 8, tmp .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( tmp .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) .collect() @@ -219,14 +274,31 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions + .assertEquals( + 1, + tmp + .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); List subjects = tmp .filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244")) .collect() .get(0) .getSubject(); - Assertions.assertEquals(8, subjects.size()); - Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); + Assertions.assertEquals(7, subjects.size()); Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("jupyter"))); Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("Modeling and Simulation"))); Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("structure granulaire"))); @@ -250,6 +322,17 @@ public class EOSCTagJobTest { .filter( ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) .count()); + Assertions + .assertEquals( + 0, sc + .textFile(workingDir.toString() + "/input/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)) + .filter( + ds -> ds + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) + .count()); Assertions .assertEquals( @@ -264,7 +347,22 @@ public class EOSCTagJobTest { .textFile(workingDir.toString() + "/input/otherresearchproduct") .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) .filter( - ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) + orp -> orp + .getSubject() + .stream() + .anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) + .count()); + + Assertions + .assertEquals( + 0, sc + .textFile(workingDir.toString() + "/input/otherresearchproduct") + .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) + .filter( + orp -> orp + .getSubject() + .stream() + .anyMatch(eig -> eig.getValue().equals("EOSC::Jupyter Notebook"))) .count()); // spark.stop(); @@ -326,22 +424,41 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 1, + 0, tmp .filter( s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) .count()); + Assertions + .assertEquals( + 1, + tmp + .filter( + s -> s.getEoscifguidelines() != null) + .count()); + Assertions + .assertEquals( + 1, + tmp + .filter( + s -> s.getEoscifguidelines() != null) + .filter( + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) + .count()); Assertions .assertEquals( - 2, tmp + 1, tmp .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( tmp .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .collect() @@ -350,6 +467,24 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); + Assertions + .assertEquals( + 1, tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + tmp + .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))); + Assertions .assertEquals( 5, tmp @@ -385,22 +520,34 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 1, + 0, orp .filter( s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) .count()); + orp.foreach(o -> System.out.println(OBJECT_MAPPER.writeValueAsString(o))); Assertions .assertEquals( - 3, orp + 1, orp + .filter(o -> o.getEoscifguidelines() != null) + .filter( + o -> o + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) + .count()); + + Assertions + .assertEquals( + 2, orp .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) .collect() .get(0) .getSubject() .size()); Assertions - .assertTrue( + .assertFalse( orp .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) .collect() @@ -408,6 +555,23 @@ public class EOSCTagJobTest { .getSubject() .stream() .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); + Assertions + .assertEquals( + 1, orp + .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) + .collect() + .get(0) + .getEoscifguidelines() + .size()); + Assertions + .assertTrue( + orp + .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) + .collect() + .get(0) + .getEoscifguidelines() + .stream() + .anyMatch(s -> s.getCode().equals("EOSC::Galaxy Workflow"))); Assertions .assertEquals( @@ -516,10 +680,20 @@ public class EOSCTagJobTest { Assertions .assertEquals( - 3, + 0, orp .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data"))) .count()); + Assertions + .assertEquals( + 3, + orp + .filter( + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) + .count()); JavaRDD dats = sc .textFile(workingDir.toString() + "/input/dataset") @@ -531,7 +705,11 @@ public class EOSCTagJobTest { .assertEquals( 3, dats - .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data"))) + .filter( + s -> s + .getEoscifguidelines() + .stream() + .anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) .count()); } diff --git a/pom.xml b/pom.xml index 54070f654..973bc3773 100644 --- a/pom.xml +++ b/pom.xml @@ -801,7 +801,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [2.12.0] + [2.12.2-SNAPSHOT] [4.0.3] [6.0.5] [3.1.6]