EOSC IF #230

Merged
claudio.atzori merged 6 commits from tagEosc into beta 2022-07-25 14:14:54 +02:00
2 changed files with 178 additions and 131 deletions
Showing only changes of commit 3be036f290 - Show all commits

View File

@ -29,7 +29,6 @@ public class SparkEoscTag {
private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class); private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class);
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils String jsonConfiguration = IOUtils
.toString( .toString(
@ -63,15 +62,16 @@ public class SparkEoscTag {
}); });
} }
public static EoscIfGuidelines newInstance(String code, String label, String url, String semantics){ public static EoscIfGuidelines newInstance(String code, String label, String url, String semantics) {
EoscIfGuidelines eig = new EoscIfGuidelines(); EoscIfGuidelines eig = new EoscIfGuidelines();
eig.setCode( code); eig.setCode(code);
eig.setLabel(label); eig.setLabel(label);
eig.setUrl(url); eig.setUrl(url);
eig.setSemanticRelation(semantics); eig.setSemanticRelation(semantics);
return eig; return eig;
} }
private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) { private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) {
readPath(spark, inputPath + "/software", Software.class) readPath(spark, inputPath + "/software", Software.class)
@ -80,14 +80,17 @@ public class SparkEoscTag {
if (containsCriteriaNotebook(s)) { if (containsCriteriaNotebook(s)) {
if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent())
s.setEoscifguidelines(new ArrayList<>()); s.setEoscifguidelines(new ArrayList<>());
addEIG(s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", "compliesWith"); addEIG(
s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "",
"compliesWith");
} }
if (containsCriteriaGalaxy(s)) { if (containsCriteriaGalaxy(s)) {
if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent())
s.setEoscifguidelines(new ArrayList<>()); s.setEoscifguidelines(new ArrayList<>());
addEIG(s.getEoscifguidelines(),"EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); addEIG(
s.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith");
} }
return s; return s;
}, Encoders.bean(Software.class)) }, Encoders.bean(Software.class))
@ -109,10 +112,12 @@ public class SparkEoscTag {
orp.setEoscifguidelines(new ArrayList<>()); orp.setEoscifguidelines(new ArrayList<>());
if (containsCriteriaGalaxy(orp)) { if (containsCriteriaGalaxy(orp)) {
addEIG(orp.getEoscifguidelines(),"EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); addEIG(
orp.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "",
"compliesWith");
} }
if (containscriteriaTwitter(orp)) { if (containscriteriaTwitter(orp)) {
addEIG(orp.getEoscifguidelines(),"EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); addEIG(orp.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith");
} }
return orp; return orp;
}, Encoders.bean(OtherResearchProduct.class)) }, Encoders.bean(OtherResearchProduct.class))
@ -133,7 +138,7 @@ public class SparkEoscTag {
if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent()) if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent())
d.setEoscifguidelines(new ArrayList<>()); d.setEoscifguidelines(new ArrayList<>());
if (containscriteriaTwitter(d)) { if (containscriteriaTwitter(d)) {
addEIG(d.getEoscifguidelines(),"EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); addEIG(d.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith");
} }
return d; return d;
}, Encoders.bean(Dataset.class)) }, Encoders.bean(Dataset.class))
@ -149,12 +154,12 @@ public class SparkEoscTag {
.json(inputPath + "/dataset"); .json(inputPath + "/dataset");
} }
private static void addEIG(List<EoscIfGuidelines> eoscifguidelines, String code, String label, String url, String sem) { private static void addEIG(List<EoscIfGuidelines> eoscifguidelines, String code, String label, String url,
String sem) {
if (!eoscifguidelines.stream().anyMatch(eig -> eig.getCode().equals(code))) if (!eoscifguidelines.stream().anyMatch(eig -> eig.getCode().equals(code)))
eoscifguidelines.add(newInstance(code, label, url, sem)); eoscifguidelines.add(newInstance(code, label, url, sem));
} }
private static boolean containscriteriaTwitter(Result r) { private static boolean containscriteriaTwitter(Result r) {
Set<String> words = getWordsSP(r.getTitle()); Set<String> words = getWordsSP(r.getTitle());
words.addAll(getWordsF(r.getDescription())); words.addAll(getWordsF(r.getDescription()));

View File

@ -126,9 +126,12 @@ public class EOSCTagJobTest {
.assertEquals( .assertEquals(
4, 4,
tmp tmp
.filter(s -> s.getEoscifguidelines()!= null) .filter(s -> s.getEoscifguidelines() != null)
.filter( .filter(
s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) s -> s
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook")))
.count()); .count());
Assertions Assertions
@ -141,13 +144,13 @@ public class EOSCTagJobTest {
.size()); .size());
Assertions Assertions
.assertEquals( .assertEquals(
1, tmp 1, tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect() .collect()
.get(0) .get(0)
.getSubject() .getSubject()
.size()); .size());
Assertions Assertions
.assertTrue( .assertTrue(
tmp tmp
@ -159,14 +162,14 @@ public class EOSCTagJobTest {
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
Assertions Assertions
.assertFalse( .assertFalse(
tmp tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect() .collect()
.get(0) .get(0)
.getSubject() .getSubject()
.stream() .stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions Assertions
.assertEquals( .assertEquals(
@ -186,12 +189,13 @@ public class EOSCTagJobTest {
.stream() .stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions.assertTrue(tmp Assertions
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")) .assertTrue(
.collect() tmp
.get(0) .filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
.getEoscifguidelines() == null .collect()
); .get(0)
.getEoscifguidelines() == null);
Assertions Assertions
.assertEquals( .assertEquals(
@ -211,22 +215,22 @@ public class EOSCTagJobTest {
.stream() .stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions Assertions
.assertEquals( .assertEquals(
1, tmp 1, tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect() .collect()
.get(0) .get(0)
.getEoscifguidelines() .getEoscifguidelines()
.size()); .size());
Assertions Assertions
.assertTrue( .assertTrue(
tmp tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")) .filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect() .collect()
.get(0) .get(0)
.getEoscifguidelines() .getEoscifguidelines()
.stream() .stream()
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
Assertions Assertions
.assertEquals( .assertEquals(
@ -245,13 +249,13 @@ public class EOSCTagJobTest {
.getSubject() .getSubject()
.stream() .stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions.assertTrue( Assertions
.assertTrue(
tmp tmp
.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")) .filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
.collect() .collect()
.get(0) .get(0)
.getEoscifguidelines() == null .getEoscifguidelines() == null);
);
Assertions Assertions
.assertEquals( .assertEquals(
@ -270,20 +274,24 @@ public class EOSCTagJobTest {
.getSubject() .getSubject()
.stream() .stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook"))); .anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions.assertEquals(1, Assertions
.assertEquals(
1,
tmp tmp
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
.collect() .collect()
.get(0) .get(0)
.getEoscifguidelines() .getEoscifguidelines()
.size()); .size());
Assertions.assertTrue(tmp Assertions
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")) .assertTrue(
.collect() tmp
.get(0) .filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
.getEoscifguidelines() .collect()
.stream() .get(0)
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); .getEoscifguidelines()
.stream()
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
List<StructuredProperty> subjects = tmp List<StructuredProperty> subjects = tmp
.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244")) .filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244"))
@ -315,13 +323,16 @@ public class EOSCTagJobTest {
ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
.count()); .count());
Assertions Assertions
.assertEquals( .assertEquals(
0, sc 0, sc
.textFile(workingDir.toString() + "/input/dataset") .textFile(workingDir.toString() + "/input/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)) .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
.filter( .filter(
ds -> ds.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook"))) ds -> ds
.count()); .getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook")))
.count());
Assertions Assertions
.assertEquals( .assertEquals(
@ -336,17 +347,23 @@ public class EOSCTagJobTest {
.textFile(workingDir.toString() + "/input/otherresearchproduct") .textFile(workingDir.toString() + "/input/otherresearchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
.filter( .filter(
orp -> orp.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))) orp -> orp
.getSubject()
.stream()
.anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
.count()); .count());
Assertions Assertions
.assertEquals( .assertEquals(
0, sc 0, sc
.textFile(workingDir.toString() + "/input/otherresearchproduct") .textFile(workingDir.toString() + "/input/otherresearchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)) .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
.filter( .filter(
orp -> orp.getSubject().stream().anyMatch(eig -> eig.getValue().equals("EOSC::Jupyter Notebook"))) orp -> orp
.count()); .getSubject()
.stream()
.anyMatch(eig -> eig.getValue().equals("EOSC::Jupyter Notebook")))
.count());
// spark.stop(); // spark.stop();
} }
@ -413,20 +430,24 @@ public class EOSCTagJobTest {
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
.count()); .count());
Assertions Assertions
.assertEquals( .assertEquals(
1, 1,
tmp tmp
.filter( .filter(
s -> s.getEoscifguidelines()!=null) s -> s.getEoscifguidelines() != null)
.count()); .count());
Assertions Assertions
.assertEquals( .assertEquals(
1, 1,
tmp tmp
.filter( .filter(
s -> s.getEoscifguidelines()!=null) s -> s.getEoscifguidelines() != null)
.filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))) .filter(
.count()); s -> s
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow")))
.count());
Assertions Assertions
.assertEquals( .assertEquals(
@ -446,18 +467,23 @@ public class EOSCTagJobTest {
.stream() .stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions.assertEquals(1, tmp Assertions
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .assertEquals(
.collect() 1, tmp
.get(0) .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.getEoscifguidelines() .collect()
.size() ); .get(0)
Assertions.assertTrue(tmp .getEoscifguidelines()
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")) .size());
.collect() Assertions
.get(0) .assertTrue(
.getEoscifguidelines() tmp
.stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))); .filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow")));
Assertions Assertions
.assertEquals( .assertEquals(
@ -499,10 +525,18 @@ public class EOSCTagJobTest {
.filter( .filter(
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))) s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
.count()); .count());
orp.foreach(o-> System.out.println(OBJECT_MAPPER.writeValueAsString(o))); orp.foreach(o -> System.out.println(OBJECT_MAPPER.writeValueAsString(o)));
Assertions.assertEquals(1, orp.filter(o -> o.getEoscifguidelines() != null) Assertions
.filter(o -> o.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))).count()); .assertEquals(
1, orp
.filter(o -> o.getEoscifguidelines() != null)
.filter(
o -> o
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow")))
.count());
Assertions Assertions
.assertEquals( .assertEquals(
@ -522,22 +556,22 @@ public class EOSCTagJobTest {
.stream() .stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow"))); .anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions Assertions
.assertEquals( .assertEquals(
1, orp 1, orp
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
.collect() .collect()
.get(0) .get(0)
.getEoscifguidelines() .getEoscifguidelines()
.size()); .size());
Assertions Assertions
.assertTrue( .assertTrue(
orp orp
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")) .filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
.collect() .collect()
.get(0) .get(0)
.getEoscifguidelines() .getEoscifguidelines()
.stream() .stream()
.anyMatch(s -> s.getCode().equals("EOSC::Galaxy Workflow"))); .anyMatch(s -> s.getCode().equals("EOSC::Galaxy Workflow")));
Assertions Assertions
.assertEquals( .assertEquals(
@ -651,11 +685,15 @@ public class EOSCTagJobTest {
.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data"))) .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
.count()); .count());
Assertions Assertions
.assertEquals( .assertEquals(
3, 3,
orp orp
.filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) .filter(
.count()); s -> s
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data")))
.count());
JavaRDD<Dataset> dats = sc JavaRDD<Dataset> dats = sc
.textFile(workingDir.toString() + "/input/dataset") .textFile(workingDir.toString() + "/input/dataset")
@ -667,7 +705,11 @@ public class EOSCTagJobTest {
.assertEquals( .assertEquals(
3, 3,
dats dats
.filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data"))) .filter(
s -> s
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data")))
.count()); .count());
} }