EOSC IF #230

Merged
claudio.atzori merged 6 commits from tagEosc into beta 2022-07-25 14:14:54 +02:00
2 changed files with 178 additions and 131 deletions
Showing only changes of commit 3be036f290 - Show all commits

View File

@ -29,7 +29,6 @@ public class SparkEoscTag {
private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class);
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
@ -63,15 +62,16 @@ public class SparkEoscTag {
});
}
public static EoscIfGuidelines newInstance(String code, String label, String url, String semantics){
public static EoscIfGuidelines newInstance(String code, String label, String url, String semantics) {
EoscIfGuidelines eig = new EoscIfGuidelines();
eig.setCode( code);
eig.setCode(code);
eig.setLabel(label);
eig.setUrl(url);
eig.setSemanticRelation(semantics);
return eig;
}
private static void execEoscTag(SparkSession spark, String inputPath, String workingPath) {
readPath(spark, inputPath + "/software", Software.class)
@ -80,14 +80,17 @@ public class SparkEoscTag {
if (containsCriteriaNotebook(s)) {
if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent())
s.setEoscifguidelines(new ArrayList<>());
addEIG(s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", "compliesWith");
addEIG(
s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "",
"compliesWith");
}
if (containsCriteriaGalaxy(s)) {
if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent())
s.setEoscifguidelines(new ArrayList<>());
addEIG(s.getEoscifguidelines(),"EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith");
addEIG(
s.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith");
}
return s;
}, Encoders.bean(Software.class))
@ -109,10 +112,12 @@ public class SparkEoscTag {
orp.setEoscifguidelines(new ArrayList<>());
if (containsCriteriaGalaxy(orp)) {
addEIG(orp.getEoscifguidelines(),"EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith");
addEIG(
orp.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "",
"compliesWith");
}
if (containscriteriaTwitter(orp)) {
addEIG(orp.getEoscifguidelines(),"EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith");
addEIG(orp.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith");
}
return orp;
}, Encoders.bean(OtherResearchProduct.class))
@ -133,7 +138,7 @@ public class SparkEoscTag {
if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent())
d.setEoscifguidelines(new ArrayList<>());
if (containscriteriaTwitter(d)) {
addEIG(d.getEoscifguidelines(),"EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith");
addEIG(d.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith");
}
return d;
}, Encoders.bean(Dataset.class))
@ -149,12 +154,12 @@ public class SparkEoscTag {
.json(inputPath + "/dataset");
}
private static void addEIG(List<EoscIfGuidelines> eoscifguidelines, String code, String label, String url, String sem) {
private static void addEIG(List<EoscIfGuidelines> eoscifguidelines, String code, String label, String url,
String sem) {
if (!eoscifguidelines.stream().anyMatch(eig -> eig.getCode().equals(code)))
eoscifguidelines.add(newInstance(code, label, url, sem));
}
private static boolean containscriteriaTwitter(Result r) {
Set<String> words = getWordsSP(r.getTitle());
words.addAll(getWordsF(r.getDescription()));

View File

@ -126,9 +126,12 @@ public class EOSCTagJobTest {
.assertEquals(
4,
tmp
.filter(s -> s.getEoscifguidelines()!= null)
.filter(s -> s.getEoscifguidelines() != null)
.filter(
s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook")))
s -> s
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook")))
.count());
Assertions
@ -141,13 +144,13 @@ public class EOSCTagJobTest {
.size());
Assertions
.assertEquals(
1, tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getSubject()
.size());
.assertEquals(
1, tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertTrue(
tmp
@ -159,14 +162,14 @@ public class EOSCTagJobTest {
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
Assertions
.assertFalse(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
.assertFalse(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions
.assertEquals(
@ -186,12 +189,13 @@ public class EOSCTagJobTest {
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions.assertTrue(tmp
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
.collect()
.get(0)
.getEoscifguidelines() == null
);
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
.collect()
.get(0)
.getEoscifguidelines() == null);
Assertions
.assertEquals(
@ -211,22 +215,22 @@ public class EOSCTagJobTest {
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions
.assertEquals(
1, tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect()
.get(0)
.getEoscifguidelines()
.size());
.assertEquals(
1, tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect()
.get(0)
.getEoscifguidelines()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect()
.get(0)
.getEoscifguidelines()
.stream()
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect()
.get(0)
.getEoscifguidelines()
.stream()
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
Assertions
.assertEquals(
@ -245,13 +249,13 @@ public class EOSCTagJobTest {
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions.assertTrue(
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
.collect()
.get(0)
.getEoscifguidelines() == null
);
.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
.collect()
.get(0)
.getEoscifguidelines() == null);
Assertions
.assertEquals(
@ -270,20 +274,24 @@ public class EOSCTagJobTest {
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions.assertEquals(1,
Assertions
.assertEquals(
1,
tmp
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
.collect()
.get(0)
.getEoscifguidelines()
.size());
Assertions.assertTrue(tmp
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
.collect()
.get(0)
.getEoscifguidelines()
.stream()
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
.collect()
.get(0)
.getEoscifguidelines()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
.collect()
.get(0)
.getEoscifguidelines()
.stream()
.anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook")));
List<StructuredProperty> subjects = tmp
.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244"))
@ -315,13 +323,16 @@ public class EOSCTagJobTest {
ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
.count());
Assertions
.assertEquals(
0, sc
.textFile(workingDir.toString() + "/input/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
.filter(
ds -> ds.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook")))
.count());
.assertEquals(
0, sc
.textFile(workingDir.toString() + "/input/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
.filter(
ds -> ds
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Jupyter Notebook")))
.count());
Assertions
.assertEquals(
@ -336,17 +347,23 @@ public class EOSCTagJobTest {
.textFile(workingDir.toString() + "/input/otherresearchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
.filter(
orp -> orp.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
orp -> orp
.getSubject()
.stream()
.anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
.count());
Assertions
.assertEquals(
0, sc
.textFile(workingDir.toString() + "/input/otherresearchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
.filter(
orp -> orp.getSubject().stream().anyMatch(eig -> eig.getValue().equals("EOSC::Jupyter Notebook")))
.count());
.assertEquals(
0, sc
.textFile(workingDir.toString() + "/input/otherresearchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
.filter(
orp -> orp
.getSubject()
.stream()
.anyMatch(eig -> eig.getValue().equals("EOSC::Jupyter Notebook")))
.count());
// spark.stop();
}
@ -413,20 +430,24 @@ public class EOSCTagJobTest {
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
.count());
Assertions
.assertEquals(
1,
tmp
.filter(
s -> s.getEoscifguidelines()!=null)
.count());
.assertEquals(
1,
tmp
.filter(
s -> s.getEoscifguidelines() != null)
.count());
Assertions
.assertEquals(
1,
tmp
.filter(
s -> s.getEoscifguidelines()!=null)
.filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow")))
.count());
.assertEquals(
1,
tmp
.filter(
s -> s.getEoscifguidelines() != null)
.filter(
s -> s
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow")))
.count());
Assertions
.assertEquals(
@ -446,18 +467,23 @@ public class EOSCTagJobTest {
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions.assertEquals(1, tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getEoscifguidelines()
.size() );
Assertions.assertTrue(tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getEoscifguidelines()
.stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow")));
Assertions
.assertEquals(
1, tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getEoscifguidelines()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow")));
Assertions
.assertEquals(
@ -499,10 +525,18 @@ public class EOSCTagJobTest {
.filter(
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
.count());
orp.foreach(o-> System.out.println(OBJECT_MAPPER.writeValueAsString(o)));
orp.foreach(o -> System.out.println(OBJECT_MAPPER.writeValueAsString(o)));
Assertions.assertEquals(1, orp.filter(o -> o.getEoscifguidelines() != null)
.filter(o -> o.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow"))).count());
Assertions
.assertEquals(
1, orp
.filter(o -> o.getEoscifguidelines() != null)
.filter(
o -> o
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Galaxy Workflow")))
.count());
Assertions
.assertEquals(
@ -522,22 +556,22 @@ public class EOSCTagJobTest {
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions
.assertEquals(
1, orp
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
.collect()
.get(0)
.getEoscifguidelines()
.size());
.assertEquals(
1, orp
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
.collect()
.get(0)
.getEoscifguidelines()
.size());
Assertions
.assertTrue(
orp
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
.collect()
.get(0)
.getEoscifguidelines()
.stream()
.anyMatch(s -> s.getCode().equals("EOSC::Galaxy Workflow")));
.assertTrue(
orp
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
.collect()
.get(0)
.getEoscifguidelines()
.stream()
.anyMatch(s -> s.getCode().equals("EOSC::Galaxy Workflow")));
Assertions
.assertEquals(
@ -651,11 +685,15 @@ public class EOSCTagJobTest {
.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
.count());
Assertions
.assertEquals(
3,
orp
.filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data")))
.count());
.assertEquals(
3,
orp
.filter(
s -> s
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data")))
.count());
JavaRDD<Dataset> dats = sc
.textFile(workingDir.toString() + "/input/dataset")
@ -667,7 +705,11 @@ public class EOSCTagJobTest {
.assertEquals(
3,
dats
.filter(s -> s.getEoscifguidelines().stream().anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data")))
.filter(
s -> s
.getEoscifguidelines()
.stream()
.anyMatch(eig -> eig.getCode().equals("EOSC::Twitter Data")))
.count());
}