[EOSCTag] last test and change in the implementation to search in title and description

This commit is contained in:
Miriam Baglioni 2022-05-02 17:43:20 +02:00
parent e37177e1ce
commit a21fe310e5
7 changed files with 620 additions and 291 deletions

View File

@ -584,12 +584,10 @@ case object Crossref2Oaf {
if (dp.length == 10) {
return GraphCleaningFunctions.cleanDate(dp)
}
}
else if (res.size ==2) {
} else if (res.size == 2) {
val dp = f"${res.head}-${res(1)}%02d-01"
return GraphCleaningFunctions.cleanDate(dp)
}
else if (res.size ==1) {
} else if (res.size == 1) {
return GraphCleaningFunctions.cleanDate(s"${res.head}-01-01")
}
}

View File

@ -73,10 +73,10 @@ class CrossrefMappingTest {
}
@Test
def crossrefIssueDateTest(): Unit = {
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/issue_date.json")).mkString
val json =
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/issue_date.json")).mkString
assertNotNull(json)
assertFalse(json.isEmpty)
val resultList: List[Oaf] = Crossref2Oaf.convert(json)

View File

@ -1,10 +1,14 @@
package eu.dnetlib.dhp.bulktag;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import static eu.dnetlib.dhp.PropagationConstant.readPath;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
@ -14,29 +18,38 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import com.fasterxml.jackson.databind.ObjectMapper;
import static eu.dnetlib.dhp.PropagationConstant.readPath;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class SparkEoscTag {
private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class);
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static final Qualifier EOSC_QUALIFIER = OafMapperUtils.qualifier("eosc",
public static final Qualifier EOSC_QUALIFIER = OafMapperUtils
.qualifier(
"eosc",
"European Open Science Cloud",
ModelConstants.DNET_SUBJECT_TYPOLOGIES,ModelConstants.DNET_SUBJECT_TYPOLOGIES);
public static final DataInfo EOSC_DATAINFO = OafMapperUtils.dataInfo(false, "propagation", true, false,
OafMapperUtils.qualifier("propagation:subject","Inferred by OpenAIRE",
ModelConstants.DNET_PROVENANCE_ACTIONS,ModelConstants.DNET_PROVENANCE_ACTIONS), "0.9");
public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils.structuredProperty(
"EOSC::Jupyter Notebook", EOSC_QUALIFIER,EOSC_DATAINFO);
public final static StructuredProperty EOSC_GALAXY = OafMapperUtils.structuredProperty(
ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES);
public static final DataInfo EOSC_DATAINFO = OafMapperUtils
.dataInfo(
false, "propagation", true, false,
OafMapperUtils
.qualifier(
"propagation:subject", "Inferred by OpenAIRE",
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
"0.9");
public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils
.structuredProperty(
"EOSC::Jupyter Notebook", EOSC_QUALIFIER, EOSC_DATAINFO);
public final static StructuredProperty EOSC_GALAXY = OafMapperUtils
.structuredProperty(
"EOSC::Galaxy Workflow", EOSC_QUALIFIER, EOSC_DATAINFO);
public final static StructuredProperty EOSC_TWITTER = OafMapperUtils.structuredProperty(
"EOSC::Twitter Data", EOSC_QUALIFIER,EOSC_DATAINFO);
public final static StructuredProperty EOSC_TWITTER = OafMapperUtils
.structuredProperty(
"EOSC::Twitter Data", EOSC_QUALIFIER, EOSC_DATAINFO);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
@ -80,50 +93,49 @@ public class SparkEoscTag {
s.setSubject(new ArrayList<>());
sbject = s.getSubject();
if(containsCriteriaNotebook(s)){
if (containsCriteriaNotebook(s)) {
sbject.add(EOSC_NOTEBOOK);
}
if(containsCriteriaGalaxy(s)){
if (containsCriteriaGalaxy(s)) {
sbject.add(EOSC_GALAXY);
}
return s;
}, Encoders.bean(Software.class) )
}, Encoders.bean(Software.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.option("compression", "gzip")
.json(workingPath + "/software");
readPath(spark, workingPath + "/software" , Software.class)
readPath(spark, workingPath + "/software", Software.class)
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.option("compression", "gzip")
.json(inputPath + "/software");
readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class)
.map((MapFunction<OtherResearchProduct, OtherResearchProduct>) orp ->
{
.map((MapFunction<OtherResearchProduct, OtherResearchProduct>) orp -> {
List<StructuredProperty> sbject;
if (!Optional.ofNullable(orp.getSubject()).isPresent())
orp.setSubject(new ArrayList<>());
sbject = orp.getSubject();
if(containsCriteriaGalaxy(orp)){
if (containsCriteriaGalaxy(orp)) {
sbject.add(EOSC_GALAXY);
}
if(containscriteriaTwitter(orp)){
if (containscriteriaTwitter(orp)) {
sbject.add(EOSC_TWITTER);
}
return orp;
}, Encoders.bean(OtherResearchProduct.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.option("compression", "gzip")
.json(workingPath + "/otherresearchproduct");
readPath(spark, workingPath + "/otherresearchproduct", OtherResearchProduct.class)
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.option("compression", "gzip")
.json(inputPath + "/otherresearchproduct");
readPath(spark, inputPath + "/dataset", Dataset.class)
@ -132,44 +144,45 @@ public class SparkEoscTag {
if (!Optional.ofNullable(d.getSubject()).isPresent())
d.setSubject(new ArrayList<>());
sbject = d.getSubject();
if(containscriteriaTwitter(d)){
if (containscriteriaTwitter(d)) {
sbject.add(EOSC_TWITTER);
}
return d;
} , Encoders.bean(Dataset.class) )
}, Encoders.bean(Dataset.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.option("compression", "gzip")
.json(workingPath + "/dataset");
readPath(spark, workingPath + "/dataset" , Dataset.class)
readPath(spark, workingPath + "/dataset", Dataset.class)
.write()
.mode(SaveMode.Overwrite)
.option("compression","gzip")
.option("compression", "gzip")
.json(inputPath + "/dataset");
}
private static boolean containscriteriaTwitter(Result r) {
if (r.getTitle().stream().anyMatch(t -> t.getValue().toLowerCase().contains("twitter") &&
t.getValue().toLowerCase().contains("data")))
Set<String> words = getWordsSP(r.getTitle());
words.addAll(getWordsF(r.getDescription()));
if (words.contains("twitter") &&
(words.contains("data") || words.contains("dataset")))
return true;
if(r.getDescription().stream().anyMatch(d -> d.getValue().toLowerCase().contains("twitter") &&
d.getValue().toLowerCase().contains("data") ))
return true;
if(r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("twitter")) &&
if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("twitter")) &&
r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("data")))
return true;
return false;
}
private static boolean containsCriteriaGalaxy(Result r) {
if (r.getTitle().stream().anyMatch(t -> t.getValue().toLowerCase().contains("galaxy") &&
(t.getValue().toLowerCase().contains("workflow") || t.getValue().toLowerCase().contains("software"))))
Set<String> words = getWordsSP(r.getTitle());
words.addAll(getWordsF(r.getDescription()));
if (words.contains("galaxy") &&
(words.contains("workflow") || words.contains("software")))
return true;
if(r.getDescription().stream().anyMatch(d -> d.getValue().toLowerCase().contains("galaxy") &&
(d.getValue().toLowerCase().contains("workflow") || d.getValue().toLowerCase().contains("software"))))
return true;
if(r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("galaxy")) &&
if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("galaxy")) &&
(r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("workflow"))) ||
r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("software")))
return true;
@ -177,22 +190,42 @@ public class SparkEoscTag {
}
private static boolean containsCriteriaNotebook(Software s) {
if(s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter")))
if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter")))
return true;
if(s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("python") &&
if (s
.getSubject()
.stream()
.anyMatch(
sbj -> sbj.getValue().toLowerCase().contains("python") &&
sbj.getValue().toLowerCase().contains("notebook")))
return true;
if(s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("python")) &&
if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("python")) &&
s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("notebook")))
return true;
return false;
}
private static boolean containsTitleNotebook(Software s) {
if (s.getTitle().stream().anyMatch(t -> t.getValue().toLowerCase().contains("jupyter") &&
t.getValue().toLowerCase().contains("notebook")))
return true;
return false;
/**
 * Builds the set of lower-cased subject terms for the given subjects: every subject value is
 * added both as a whole and split into its space-separated parts.
 *
 * @param s the subjects to read
 * @return a new mutable set with the collected terms
 */
private static Set<String> getSubjects(List<StructuredProperty> s) {
    Set<String> terms = new HashSet<>();
    for (StructuredProperty subject : s) {
        String lowered = subject.getValue().toLowerCase();
        terms.addAll(Arrays.asList(lowered.split(" ")));
        terms.add(lowered);
    }
    return terms;
}
/**
 * Collects the distinct words found in the values of the given structured properties.
 *
 * <p>Each value is lower-cased (locale-independently), stripped of every character that is not
 * an ASCII letter or a space, and split on single spaces. Empty tokens are discarded.
 * NOTE(review): tabs and newlines are removed rather than treated as separators, so words
 * divided only by such characters are merged - confirm this is intended.
 *
 * @param elem the properties to read; {@code null}, {@code null} entries and {@code null}
 *             values are tolerated and contribute no words
 * @return a new mutable set with the collected words, never {@code null}
 */
private static Set<String> getWordsSP(List<StructuredProperty> elem) {
    Set<String> words = new HashSet<>();
    if (elem == null) {
        return words;
    }
    for (StructuredProperty property : elem) {
        if (property == null || property.getValue() == null) {
            continue;
        }
        // Locale.ROOT avoids locale-specific mappings (e.g. Turkish 'I' -> dotless 'ı') that the
        // letter filter below would then drop.
        String cleaned = property.getValue().toLowerCase(Locale.ROOT).replaceAll("[^a-zA-Z ]", "");
        for (String token : cleaned.split(" ")) {
            if (!token.isEmpty()) {
                words.add(token);
            }
        }
    }
    return words;
}
/**
 * Collects the distinct words found in the values of the given string fields.
 *
 * <p>Each value is lower-cased (locale-independently), stripped of every character that is not
 * an ASCII letter or a space, and split on single spaces. Empty tokens are discarded.
 * NOTE(review): tabs and newlines are removed rather than treated as separators, so words
 * divided only by such characters are merged - confirm this is intended.
 *
 * @param elem the fields to read; {@code null}, {@code null} entries and {@code null} values
 *             are tolerated and contribute no words
 * @return a new mutable set with the collected words, never {@code null}
 */
private static Set<String> getWordsF(List<Field<String>> elem) {
    Set<String> words = new HashSet<>();
    if (elem == null) {
        return words;
    }
    for (Field<String> field : elem) {
        if (field == null || field.getValue() == null) {
            continue;
        }
        // Locale.ROOT avoids locale-specific mappings (e.g. Turkish 'I' -> dotless 'ı') that the
        // letter filter below would then drop.
        String cleaned = field.getValue().toLowerCase(Locale.ROOT).replaceAll("[^a-zA-Z ]", "");
        for (String token : cleaned.split(" ")) {
            if (!token.isEmpty()) {
                words.add(token);
            }
        }
    }
    return words;
}
}

View File

@ -1,8 +1,13 @@
package eu.dnetlib.dhp.bulktag;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.*;
import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
@ -11,6 +16,7 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
@ -19,26 +25,20 @@ import org.junit.jupiter.api.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import com.fasterxml.jackson.databind.ObjectMapper;
import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR;
import eu.dnetlib.dhp.schema.oaf.*;
public class EOSCTagJobTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static SparkSession spark;
private static Path workingDir;
private static final Logger log = LoggerFactory.getLogger(EOSCTagJobTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
workingDir = Files.createTempDirectory(EOSCTagJobTest.class.getSimpleName());
@ -70,22 +70,38 @@ public class EOSCTagJobTest {
@Test
void jupyterUpdatesTest() throws Exception {
spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/software").getPath())
.map((MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class), Encoders.bean(Software.class))
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/software").getPath())
.map(
(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
Encoders.bean(Software.class))
.write()
.option("compression","gzip")
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/software");
spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/dataset").getPath())
.map((MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class), Encoders.bean(Dataset.class))
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/dataset").getPath())
.map(
(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
Encoders.bean(Dataset.class))
.write()
.option("compression","gzip")
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/dataset");
spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/otherresearchproduct").getPath())
.map((MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER.readValue(value, OtherResearchProduct.class), Encoders.bean(OtherResearchProduct.class))
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/otherresearchproduct").getPath())
.map(
(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
.readValue(value, OtherResearchProduct.class),
Encoders.bean(OtherResearchProduct.class))
.write()
.option("compression","gzip")
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/otherresearchproduct");
SparkEoscTag
@ -106,36 +122,109 @@ public class EOSCTagJobTest {
Assertions.assertEquals(10, tmp.count());
Assertions.assertEquals(4, tmp.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))).count());
Assertions
.assertEquals(
4,
tmp
.filter(
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
.count());
Assertions.assertEquals(2, tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
.get(0).getSubject().size());
Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions
.assertEquals(
2, tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions
.assertEquals(
5, tmp
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertFalse(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions.assertEquals(5, tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
.get(0).getSubject().size());
Assertions.assertFalse(tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions
.assertEquals(
9, tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions.assertEquals(9, tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
.get(0).getSubject().size());
Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions
.assertEquals(
5, tmp
.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertFalse(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions.assertEquals(5, tmp.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")).collect()
.get(0).getSubject().size());
Assertions.assertFalse(tmp.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")).collect()
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions
.assertEquals(
9, tmp
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions.assertEquals(9, tmp.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")).collect()
.get(0).getSubject().size());
Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")).collect()
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
List<StructuredProperty> subjects = tmp.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244")).collect()
.get(0).getSubject();
List<StructuredProperty> subjects = tmp
.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244"))
.collect()
.get(0)
.getSubject();
Assertions.assertEquals(8, subjects.size());
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("jupyter")));
@ -146,44 +235,75 @@ public class EOSCTagJobTest {
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("flux de gaz")));
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("flux de liquide")));
Assertions.assertEquals(10, sc
Assertions
.assertEquals(
10, sc
.textFile(workingDir.toString() + "/input/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)).count());
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
.count());
Assertions.assertEquals(0, sc
Assertions
.assertEquals(
0, sc
.textFile(workingDir.toString() + "/input/dataset")
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)).filter(ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))).count());
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
.filter(
ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
.count());
Assertions.assertEquals(10, sc
Assertions
.assertEquals(
10, sc
.textFile(workingDir.toString() + "/input/otherresearchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)).count());
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
.count());
Assertions.assertEquals(0, sc
Assertions
.assertEquals(
0, sc
.textFile(workingDir.toString() + "/input/otherresearchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)).filter(ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))).count());
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
.filter(
ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
.count());
// spark.stop();
}
@Test
void galaxyUpdatesTest() throws Exception {
spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/software").getPath())
.map((MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class), Encoders.bean(Software.class))
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/software").getPath())
.map(
(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
Encoders.bean(Software.class))
.write()
.option("compression","gzip")
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/software");
spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/dataset").getPath())
.map((MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class), Encoders.bean(Dataset.class))
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/dataset").getPath())
.map(
(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
Encoders.bean(Dataset.class))
.write()
.option("compression","gzip")
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/dataset");
spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/otherresearchproduct").getPath())
.map((MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER.readValue(value, OtherResearchProduct.class), Encoders.bean(OtherResearchProduct.class))
spark
.read()
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/otherresearchproduct").getPath())
.map(
(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
.readValue(value, OtherResearchProduct.class),
Encoders.bean(OtherResearchProduct.class))
.write()
.option("compression","gzip")
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingDir.toString() + "/input/otherresearchproduct");
SparkEoscTag
@ -204,46 +324,224 @@ public class EOSCTagJobTest {
Assertions.assertEquals(10, tmp.count());
Assertions.assertEquals(2, tmp.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))).count());
Assertions
.assertEquals(
2,
tmp
.filter(
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
.count());
Assertions.assertEquals(2, tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
.get(0).getSubject().size());
Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions
.assertEquals(
2, tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions
.assertEquals(
6, tmp
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertTrue(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions.assertEquals(6, tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
.get(0).getSubject().size());
Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions
.assertEquals(
8, tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertFalse(
tmp
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions.assertEquals(8, tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
.get(0).getSubject().size());
Assertions.assertFalse(tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
JavaRDD<OtherResearchProduct> orp = sc.textFile(workingDir.toString() + "/input/otherresearchproduct").map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
JavaRDD<OtherResearchProduct> orp = sc
.textFile(workingDir.toString() + "/input/otherresearchproduct")
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
Assertions.assertEquals(10, orp.count());
Assertions.assertEquals(2, orp.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))).count());
Assertions
.assertEquals(
2,
orp
.filter(
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
.count());
Assertions
.assertEquals(
3, orp
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertTrue(
orp
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions.assertEquals(3, orp.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")).collect()
.get(0).getSubject().size());
Assertions.assertTrue(orp.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")).collect()
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions
.assertEquals(
2, orp
.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertFalse(
orp
.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions.assertEquals(2, orp.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5")).collect()
.get(0).getSubject().size());
Assertions.assertFalse(orp.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5")).collect()
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions.assertEquals(3, orp.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72")).collect()
.get(0).getSubject().size());
Assertions.assertTrue(orp.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72")).collect()
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
Assertions
.assertEquals(
3, orp
.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertTrue(
orp
.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
.collect()
.get(0)
.getSubject()
.stream()
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
}
/**
 * Stages the twitter fixtures (software, dataset, otherresearchproduct) under the working
 * directory, runs {@link SparkEoscTag} on them and verifies how many records of each type carry
 * the "EOSC::Twitter Data" subject afterwards.
 */
@Test
void twitterUpdatesTest() throws Exception {
    final String softwareDir = workingDir.toString() + "/input/software";
    final String datasetDir = workingDir.toString() + "/input/dataset";
    final String orpDir = workingDir.toString() + "/input/otherresearchproduct";

    // Stage the three fixtures as gzip-compressed JSON in the job input folder.
    final MapFunction<String, Software> toSoftware = value -> OBJECT_MAPPER.readValue(value, Software.class);
    spark
        .read()
        .textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/software").getPath())
        .map(toSoftware, Encoders.bean(Software.class))
        .write()
        .mode(SaveMode.Overwrite)
        .option("compression", "gzip")
        .json(softwareDir);

    final MapFunction<String, Dataset> toDataset = value -> OBJECT_MAPPER.readValue(value, Dataset.class);
    spark
        .read()
        .textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/dataset").getPath())
        .map(toDataset, Encoders.bean(Dataset.class))
        .write()
        .mode(SaveMode.Overwrite)
        .option("compression", "gzip")
        .json(datasetDir);

    final MapFunction<String, OtherResearchProduct> toOrp = value -> OBJECT_MAPPER
        .readValue(value, OtherResearchProduct.class);
    spark
        .read()
        .textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/otherresearchproduct").getPath())
        .map(toOrp, Encoders.bean(OtherResearchProduct.class))
        .write()
        .mode(SaveMode.Overwrite)
        .option("compression", "gzip")
        .json(orpDir);

    // Run the tagging job; it rewrites the content of the input folder.
    final String[] jobArgs = {
        "-isSparkSessionManaged", Boolean.FALSE.toString(),
        "-sourcePath",
        workingDir.toString() + "/input",
        "-workingPath", workingDir.toString() + "/working"
    };
    SparkEoscTag.main(jobArgs);

    final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

    // Software: all 10 records are kept and none is tagged.
    JavaRDD<Software> tmp = sc
        .textFile(softwareDir)
        .map(item -> OBJECT_MAPPER.readValue(item, Software.class));
    Assertions.assertEquals(10, tmp.count());
    long taggedSoftware = tmp
        .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
        .count();
    Assertions.assertEquals(0, taggedSoftware);

    // Other research products: all 10 records are kept and 3 are tagged.
    JavaRDD<OtherResearchProduct> orp = sc
        .textFile(orpDir)
        .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
    Assertions.assertEquals(10, orp.count());
    long taggedOrp = orp
        .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
        .count();
    Assertions.assertEquals(3, taggedOrp);

    // Datasets: all 10 records are kept and 2 are tagged.
    JavaRDD<Dataset> dats = sc
        .textFile(datasetDir)
        .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
    Assertions.assertEquals(10, dats.count());
    long taggedDatasets = dats
        .filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
        .count();
    Assertions.assertEquals(2, taggedDatasets);
}
}