1
0
Fork 0

[EOSC tag] avoid NPEs

This commit is contained in:
Claudio Atzori 2022-07-29 11:55:34 +02:00
parent 3329b6ce6b
commit 0727f0ef48
2 changed files with 25 additions and 18 deletions

View File

@ -23,6 +23,10 @@ import eu.dnetlib.dhp.schema.oaf.*;
public class SparkEoscTag { public class SparkEoscTag {
private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class); private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class);
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static final String EOSC_GALAXY_WORKFLOW = "EOSC::Galaxy Workflow";
public static final String EOSC_TWITTER_DATA = "EOSC::Twitter Data";
public static final String EOSC_JUPYTER_NOTEBOOK = "EOSC::Jupyter Notebook";
public static final String COMPLIES_WITH = "compliesWith";
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils String jsonConfiguration = IOUtils
@ -76,8 +80,8 @@ public class SparkEoscTag {
if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent()) if (!Optional.ofNullable(s.getEoscifguidelines()).isPresent())
s.setEoscifguidelines(new ArrayList<>()); s.setEoscifguidelines(new ArrayList<>());
addEIG( addEIG(
s.getEoscifguidelines(), "EOSC::Jupyter Notebook", "EOSC::Jupyter Notebook", "", s.getEoscifguidelines(), EOSC_JUPYTER_NOTEBOOK, EOSC_JUPYTER_NOTEBOOK, "",
"compliesWith"); COMPLIES_WITH);
} }
if (containsCriteriaGalaxy(s)) { if (containsCriteriaGalaxy(s)) {
@ -85,7 +89,7 @@ public class SparkEoscTag {
s.setEoscifguidelines(new ArrayList<>()); s.setEoscifguidelines(new ArrayList<>());
addEIG( addEIG(
s.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", "compliesWith"); s.getEoscifguidelines(), EOSC_GALAXY_WORKFLOW, EOSC_GALAXY_WORKFLOW, "", COMPLIES_WITH);
} }
return s; return s;
}, Encoders.bean(Software.class)) }, Encoders.bean(Software.class))
@ -108,11 +112,11 @@ public class SparkEoscTag {
if (containsCriteriaGalaxy(orp)) { if (containsCriteriaGalaxy(orp)) {
addEIG( addEIG(
orp.getEoscifguidelines(), "EOSC::Galaxy Workflow", "EOSC::Galaxy Workflow", "", orp.getEoscifguidelines(), EOSC_GALAXY_WORKFLOW, EOSC_GALAXY_WORKFLOW, "",
"compliesWith"); COMPLIES_WITH);
} }
if (containscriteriaTwitter(orp)) { if (containscriteriaTwitter(orp)) {
addEIG(orp.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); addEIG(orp.getEoscifguidelines(), EOSC_TWITTER_DATA, EOSC_TWITTER_DATA, "", COMPLIES_WITH);
} }
return orp; return orp;
}, Encoders.bean(OtherResearchProduct.class)) }, Encoders.bean(OtherResearchProduct.class))
@ -133,7 +137,7 @@ public class SparkEoscTag {
if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent()) if (!Optional.ofNullable(d.getEoscifguidelines()).isPresent())
d.setEoscifguidelines(new ArrayList<>()); d.setEoscifguidelines(new ArrayList<>());
if (containscriteriaTwitter(d)) { if (containscriteriaTwitter(d)) {
addEIG(d.getEoscifguidelines(), "EOSC::Twitter Data", "EOSC::Twitter Data", "", "compliesWith"); addEIG(d.getEoscifguidelines(), EOSC_TWITTER_DATA, EOSC_TWITTER_DATA, "", COMPLIES_WITH);
} }
return d; return d;
}, Encoders.bean(Dataset.class)) }, Encoders.bean(Dataset.class))
@ -163,10 +167,12 @@ public class SparkEoscTag {
(words.contains("data") || words.contains("dataset"))) (words.contains("data") || words.contains("dataset")))
return true; return true;
if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("twitter")) && return Optional
r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("data"))) .ofNullable(r.getSubject())
return true; .map(
return false; s -> s.stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("twitter")) &&
s.stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("data")))
.orElse(false);
} }
private static boolean containsCriteriaGalaxy(Result r) { private static boolean containsCriteriaGalaxy(Result r) {
@ -176,14 +182,16 @@ public class SparkEoscTag {
words.contains("workflow")) words.contains("workflow"))
return true; return true;
if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("galaxy")) && return Optional
r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("workflow"))) .ofNullable(r.getSubject())
return true; .map(
return false; s -> s.stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("galaxy")) &&
s.stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("workflow")))
.orElse(false);
} }
private static boolean containsCriteriaNotebook(Software s) { private static boolean containsCriteriaNotebook(Software s) {
if(!Optional.ofNullable(s.getSubject()).isPresent()) if (!Optional.ofNullable(s.getSubject()).isPresent())
return false; return false;
if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter"))) if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter")))
return true; return true;
@ -225,6 +233,5 @@ public class SparkEoscTag {
Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" "))))); Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" ")))));
return words; return words;
} }
} }

View File

@ -6,7 +6,6 @@ import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.List; import java.util.List;
import eu.dnetlib.dhp.bulktag.eosc.SparkEoscTag;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
@ -24,6 +23,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.bulktag.eosc.SparkEoscTag;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
public class EOSCTagJobTest { public class EOSCTagJobTest {