forked from D-Net/dnet-hadoop
[EOSCTag] last test and change in the implementation to search in title and descriptio
This commit is contained in:
parent
e37177e1ce
commit
a21fe310e5
|
@ -584,12 +584,10 @@ case object Crossref2Oaf {
|
|||
if (dp.length == 10) {
|
||||
return GraphCleaningFunctions.cleanDate(dp)
|
||||
}
|
||||
}
|
||||
else if (res.size ==2) {
|
||||
} else if (res.size == 2) {
|
||||
val dp = f"${res.head}-${res(1)}%02d-01"
|
||||
return GraphCleaningFunctions.cleanDate(dp)
|
||||
}
|
||||
else if (res.size ==1) {
|
||||
} else if (res.size == 1) {
|
||||
return GraphCleaningFunctions.cleanDate(s"${res.head}-01-01")
|
||||
}
|
||||
}
|
||||
|
|
|
@ -73,10 +73,10 @@ class CrossrefMappingTest {
|
|||
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
def crossrefIssueDateTest(): Unit = {
|
||||
val json = Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/issue_date.json")).mkString
|
||||
val json =
|
||||
Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/issue_date.json")).mkString
|
||||
assertNotNull(json)
|
||||
assertFalse(json.isEmpty)
|
||||
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||
|
|
|
@ -1,10 +1,14 @@
|
|||
|
||||
package eu.dnetlib.dhp.bulktag;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import static eu.dnetlib.dhp.PropagationConstant.readPath;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
|
@ -14,29 +18,38 @@ import org.apache.spark.sql.SparkSession;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import static eu.dnetlib.dhp.PropagationConstant.readPath;
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
|
||||
public class SparkEoscTag {
|
||||
private static final Logger log = LoggerFactory.getLogger(SparkEoscTag.class);
|
||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
public static final Qualifier EOSC_QUALIFIER = OafMapperUtils.qualifier("eosc",
|
||||
public static final Qualifier EOSC_QUALIFIER = OafMapperUtils
|
||||
.qualifier(
|
||||
"eosc",
|
||||
"European Open Science Cloud",
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES,ModelConstants.DNET_SUBJECT_TYPOLOGIES);
|
||||
public static final DataInfo EOSC_DATAINFO = OafMapperUtils.dataInfo(false, "propagation", true, false,
|
||||
OafMapperUtils.qualifier("propagation:subject","Inferred by OpenAIRE",
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,ModelConstants.DNET_PROVENANCE_ACTIONS), "0.9");
|
||||
public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils.structuredProperty(
|
||||
"EOSC::Jupyter Notebook", EOSC_QUALIFIER,EOSC_DATAINFO);
|
||||
public final static StructuredProperty EOSC_GALAXY = OafMapperUtils.structuredProperty(
|
||||
ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES);
|
||||
public static final DataInfo EOSC_DATAINFO = OafMapperUtils
|
||||
.dataInfo(
|
||||
false, "propagation", true, false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
"propagation:subject", "Inferred by OpenAIRE",
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"0.9");
|
||||
public final static StructuredProperty EOSC_NOTEBOOK = OafMapperUtils
|
||||
.structuredProperty(
|
||||
"EOSC::Jupyter Notebook", EOSC_QUALIFIER, EOSC_DATAINFO);
|
||||
public final static StructuredProperty EOSC_GALAXY = OafMapperUtils
|
||||
.structuredProperty(
|
||||
"EOSC::Galaxy Workflow", EOSC_QUALIFIER, EOSC_DATAINFO);
|
||||
public final static StructuredProperty EOSC_TWITTER = OafMapperUtils.structuredProperty(
|
||||
"EOSC::Twitter Data", EOSC_QUALIFIER,EOSC_DATAINFO);
|
||||
public final static StructuredProperty EOSC_TWITTER = OafMapperUtils
|
||||
.structuredProperty(
|
||||
"EOSC::Twitter Data", EOSC_QUALIFIER, EOSC_DATAINFO);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
|
@ -80,50 +93,49 @@ public class SparkEoscTag {
|
|||
s.setSubject(new ArrayList<>());
|
||||
sbject = s.getSubject();
|
||||
|
||||
if(containsCriteriaNotebook(s)){
|
||||
if (containsCriteriaNotebook(s)) {
|
||||
sbject.add(EOSC_NOTEBOOK);
|
||||
|
||||
}
|
||||
if(containsCriteriaGalaxy(s)){
|
||||
if (containsCriteriaGalaxy(s)) {
|
||||
sbject.add(EOSC_GALAXY);
|
||||
}
|
||||
return s;
|
||||
}, Encoders.bean(Software.class) )
|
||||
}, Encoders.bean(Software.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression","gzip")
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath + "/software");
|
||||
|
||||
readPath(spark, workingPath + "/software" , Software.class)
|
||||
readPath(spark, workingPath + "/software", Software.class)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression","gzip")
|
||||
.option("compression", "gzip")
|
||||
.json(inputPath + "/software");
|
||||
|
||||
readPath(spark, inputPath + "/otherresearchproduct", OtherResearchProduct.class)
|
||||
.map((MapFunction<OtherResearchProduct, OtherResearchProduct>) orp ->
|
||||
{
|
||||
.map((MapFunction<OtherResearchProduct, OtherResearchProduct>) orp -> {
|
||||
List<StructuredProperty> sbject;
|
||||
if (!Optional.ofNullable(orp.getSubject()).isPresent())
|
||||
orp.setSubject(new ArrayList<>());
|
||||
sbject = orp.getSubject();
|
||||
if(containsCriteriaGalaxy(orp)){
|
||||
if (containsCriteriaGalaxy(orp)) {
|
||||
sbject.add(EOSC_GALAXY);
|
||||
}
|
||||
if(containscriteriaTwitter(orp)){
|
||||
if (containscriteriaTwitter(orp)) {
|
||||
sbject.add(EOSC_TWITTER);
|
||||
}
|
||||
return orp;
|
||||
}, Encoders.bean(OtherResearchProduct.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression","gzip")
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath + "/otherresearchproduct");
|
||||
|
||||
readPath(spark, workingPath + "/otherresearchproduct", OtherResearchProduct.class)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression","gzip")
|
||||
.option("compression", "gzip")
|
||||
.json(inputPath + "/otherresearchproduct");
|
||||
|
||||
readPath(spark, inputPath + "/dataset", Dataset.class)
|
||||
|
@ -132,44 +144,45 @@ public class SparkEoscTag {
|
|||
if (!Optional.ofNullable(d.getSubject()).isPresent())
|
||||
d.setSubject(new ArrayList<>());
|
||||
sbject = d.getSubject();
|
||||
if(containscriteriaTwitter(d)){
|
||||
if (containscriteriaTwitter(d)) {
|
||||
sbject.add(EOSC_TWITTER);
|
||||
}
|
||||
return d;
|
||||
} , Encoders.bean(Dataset.class) )
|
||||
}, Encoders.bean(Dataset.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression","gzip")
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath + "/dataset");
|
||||
|
||||
readPath(spark, workingPath + "/dataset" , Dataset.class)
|
||||
readPath(spark, workingPath + "/dataset", Dataset.class)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression","gzip")
|
||||
.option("compression", "gzip")
|
||||
.json(inputPath + "/dataset");
|
||||
}
|
||||
|
||||
private static boolean containscriteriaTwitter(Result r) {
|
||||
if (r.getTitle().stream().anyMatch(t -> t.getValue().toLowerCase().contains("twitter") &&
|
||||
t.getValue().toLowerCase().contains("data")))
|
||||
Set<String> words = getWordsSP(r.getTitle());
|
||||
words.addAll(getWordsF(r.getDescription()));
|
||||
|
||||
if (words.contains("twitter") &&
|
||||
(words.contains("data") || words.contains("dataset")))
|
||||
return true;
|
||||
if(r.getDescription().stream().anyMatch(d -> d.getValue().toLowerCase().contains("twitter") &&
|
||||
d.getValue().toLowerCase().contains("data") ))
|
||||
return true;
|
||||
if(r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("twitter")) &&
|
||||
|
||||
if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("twitter")) &&
|
||||
r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("data")))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
private static boolean containsCriteriaGalaxy(Result r) {
|
||||
if (r.getTitle().stream().anyMatch(t -> t.getValue().toLowerCase().contains("galaxy") &&
|
||||
(t.getValue().toLowerCase().contains("workflow") || t.getValue().toLowerCase().contains("software"))))
|
||||
Set<String> words = getWordsSP(r.getTitle());
|
||||
words.addAll(getWordsF(r.getDescription()));
|
||||
if (words.contains("galaxy") &&
|
||||
(words.contains("workflow") || words.contains("software")))
|
||||
return true;
|
||||
if(r.getDescription().stream().anyMatch(d -> d.getValue().toLowerCase().contains("galaxy") &&
|
||||
(d.getValue().toLowerCase().contains("workflow") || d.getValue().toLowerCase().contains("software"))))
|
||||
return true;
|
||||
if(r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("galaxy")) &&
|
||||
|
||||
if (r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("galaxy")) &&
|
||||
(r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("workflow"))) ||
|
||||
r.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("software")))
|
||||
return true;
|
||||
|
@ -177,22 +190,42 @@ public class SparkEoscTag {
|
|||
}
|
||||
|
||||
private static boolean containsCriteriaNotebook(Software s) {
|
||||
if(s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter")))
|
||||
if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("jupyter")))
|
||||
return true;
|
||||
if(s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("python") &&
|
||||
if (s
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(
|
||||
sbj -> sbj.getValue().toLowerCase().contains("python") &&
|
||||
sbj.getValue().toLowerCase().contains("notebook")))
|
||||
return true;
|
||||
if(s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("python")) &&
|
||||
if (s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("python")) &&
|
||||
s.getSubject().stream().anyMatch(sbj -> sbj.getValue().toLowerCase().contains("notebook")))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
private static boolean containsTitleNotebook(Software s) {
|
||||
if (s.getTitle().stream().anyMatch(t -> t.getValue().toLowerCase().contains("jupyter") &&
|
||||
t.getValue().toLowerCase().contains("notebook")))
|
||||
return true;
|
||||
return false;
|
||||
private static Set<String> getSubjects(List<StructuredProperty> s) {
|
||||
Set<String> subjects = new HashSet<>();
|
||||
s.stream().forEach(sbj -> subjects.addAll(Arrays.asList(sbj.getValue().toLowerCase().split(" "))));
|
||||
s.stream().forEach(sbj -> subjects.add(sbj.getValue().toLowerCase()));
|
||||
return subjects;
|
||||
}
|
||||
|
||||
private static Set<String> getWordsSP(List<StructuredProperty> elem) {
|
||||
Set<String> words = new HashSet<>();
|
||||
elem
|
||||
.forEach(
|
||||
t -> words.addAll(Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" "))));
|
||||
return words;
|
||||
}
|
||||
|
||||
private static Set<String> getWordsF(List<Field<String>> elem) {
|
||||
Set<String> words = new HashSet<>();
|
||||
elem
|
||||
.forEach(
|
||||
t -> words.addAll(Arrays.asList(t.getValue().toLowerCase().replaceAll("[^a-zA-Z ]", "").split(" "))));
|
||||
return words;
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,8 +1,13 @@
|
|||
|
||||
package eu.dnetlib.dhp.bulktag;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -11,6 +16,7 @@ import org.apache.spark.api.java.JavaSparkContext;
|
|||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
|
@ -19,26 +25,20 @@ import org.junit.jupiter.api.Test;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import static eu.dnetlib.dhp.bulktag.community.TaggingConstants.ZENODO_COMMUNITY_INDICATOR;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class EOSCTagJobTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private static Path workingDir;
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(EOSCTagJobTest.class);
|
||||
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
workingDir = Files.createTempDirectory(EOSCTagJobTest.class.getSimpleName());
|
||||
|
@ -70,22 +70,38 @@ public class EOSCTagJobTest {
|
|||
@Test
|
||||
void jupyterUpdatesTest() throws Exception {
|
||||
|
||||
spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/software").getPath())
|
||||
.map((MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class), Encoders.bean(Software.class))
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/software").getPath())
|
||||
.map(
|
||||
(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
|
||||
Encoders.bean(Software.class))
|
||||
.write()
|
||||
.option("compression","gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/software");
|
||||
|
||||
spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/dataset").getPath())
|
||||
.map((MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class), Encoders.bean(Dataset.class))
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/dataset").getPath())
|
||||
.map(
|
||||
(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
|
||||
Encoders.bean(Dataset.class))
|
||||
.write()
|
||||
.option("compression","gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/dataset");
|
||||
|
||||
spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/otherresearchproduct").getPath())
|
||||
.map((MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER.readValue(value, OtherResearchProduct.class), Encoders.bean(OtherResearchProduct.class))
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/jupyter/otherresearchproduct").getPath())
|
||||
.map(
|
||||
(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
|
||||
.readValue(value, OtherResearchProduct.class),
|
||||
Encoders.bean(OtherResearchProduct.class))
|
||||
.write()
|
||||
.option("compression","gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/otherresearchproduct");
|
||||
|
||||
SparkEoscTag
|
||||
|
@ -106,36 +122,109 @@ public class EOSCTagJobTest {
|
|||
|
||||
Assertions.assertEquals(10, tmp.count());
|
||||
|
||||
Assertions.assertEquals(4, tmp.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))).count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
4,
|
||||
tmp
|
||||
.filter(
|
||||
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
|
||||
.count());
|
||||
|
||||
Assertions.assertEquals(2, tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
|
||||
.get(0).getSubject().size());
|
||||
Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
|
||||
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertTrue(
|
||||
tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
5, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertFalse(
|
||||
tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
|
||||
Assertions.assertEquals(5, tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
|
||||
.get(0).getSubject().size());
|
||||
Assertions.assertFalse(tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
|
||||
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
Assertions
|
||||
.assertEquals(
|
||||
9, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertTrue(
|
||||
tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
|
||||
Assertions.assertEquals(9, tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
|
||||
.get(0).getSubject().size());
|
||||
Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
|
||||
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
Assertions
|
||||
.assertEquals(
|
||||
5, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertFalse(
|
||||
tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
|
||||
Assertions.assertEquals(5, tmp.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")).collect()
|
||||
.get(0).getSubject().size());
|
||||
Assertions.assertFalse(tmp.filter(sw -> sw.getId().equals("50|od______1582::5aec1186054301b66c0c5dc35972a589")).collect()
|
||||
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
Assertions
|
||||
.assertEquals(
|
||||
9, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertTrue(
|
||||
tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
|
||||
Assertions.assertEquals(9, tmp.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")).collect()
|
||||
.get(0).getSubject().size());
|
||||
Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::639909adfad9d708308f2aedb733e4a0")).collect()
|
||||
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
|
||||
List<StructuredProperty> subjects = tmp.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244")).collect()
|
||||
.get(0).getSubject();
|
||||
List<StructuredProperty> subjects = tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject();
|
||||
Assertions.assertEquals(8, subjects.size());
|
||||
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("EOSC::Jupyter Notebook")));
|
||||
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("jupyter")));
|
||||
|
@ -146,44 +235,75 @@ public class EOSCTagJobTest {
|
|||
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("flux de gaz")));
|
||||
Assertions.assertTrue(subjects.stream().anyMatch(s -> s.getValue().equals("flux de liquide")));
|
||||
|
||||
|
||||
Assertions.assertEquals(10, sc
|
||||
Assertions
|
||||
.assertEquals(
|
||||
10, sc
|
||||
.textFile(workingDir.toString() + "/input/dataset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)).count());
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
|
||||
.count());
|
||||
|
||||
Assertions.assertEquals(0, sc
|
||||
Assertions
|
||||
.assertEquals(
|
||||
0, sc
|
||||
.textFile(workingDir.toString() + "/input/dataset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)).filter(ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))).count());
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class))
|
||||
.filter(
|
||||
ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
|
||||
.count());
|
||||
|
||||
|
||||
Assertions.assertEquals(10, sc
|
||||
Assertions
|
||||
.assertEquals(
|
||||
10, sc
|
||||
.textFile(workingDir.toString() + "/input/otherresearchproduct")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)).count());
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
|
||||
.count());
|
||||
|
||||
Assertions.assertEquals(0, sc
|
||||
Assertions
|
||||
.assertEquals(
|
||||
0, sc
|
||||
.textFile(workingDir.toString() + "/input/otherresearchproduct")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)).filter(ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook"))).count());
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class))
|
||||
.filter(
|
||||
ds -> ds.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Jupyter Notebook")))
|
||||
.count());
|
||||
|
||||
// spark.stop();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void galaxyUpdatesTest() throws Exception {
|
||||
spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/software").getPath())
|
||||
.map((MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class), Encoders.bean(Software.class))
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/software").getPath())
|
||||
.map(
|
||||
(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
|
||||
Encoders.bean(Software.class))
|
||||
.write()
|
||||
.option("compression","gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/software");
|
||||
|
||||
spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/dataset").getPath())
|
||||
.map((MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class), Encoders.bean(Dataset.class))
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/dataset").getPath())
|
||||
.map(
|
||||
(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
|
||||
Encoders.bean(Dataset.class))
|
||||
.write()
|
||||
.option("compression","gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/dataset");
|
||||
|
||||
spark.read().textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/otherresearchproduct").getPath())
|
||||
.map((MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER.readValue(value, OtherResearchProduct.class), Encoders.bean(OtherResearchProduct.class))
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/galaxy/otherresearchproduct").getPath())
|
||||
.map(
|
||||
(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
|
||||
.readValue(value, OtherResearchProduct.class),
|
||||
Encoders.bean(OtherResearchProduct.class))
|
||||
.write()
|
||||
.option("compression","gzip")
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/otherresearchproduct");
|
||||
|
||||
SparkEoscTag
|
||||
|
@ -204,46 +324,224 @@ public class EOSCTagJobTest {
|
|||
|
||||
Assertions.assertEquals(10, tmp.count());
|
||||
|
||||
Assertions.assertEquals(2, tmp.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))).count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
tmp
|
||||
.filter(
|
||||
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
|
||||
.count());
|
||||
|
||||
Assertions.assertEquals(2, tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
|
||||
.get(0).getSubject().size());
|
||||
Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4")).collect()
|
||||
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertTrue(
|
||||
tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
6, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertTrue(
|
||||
tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
|
||||
Assertions.assertEquals(6, tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
|
||||
.get(0).getSubject().size());
|
||||
Assertions.assertTrue(tmp.filter(sw -> sw.getId().equals("50|od______1582::501b25d420f808c8eddcd9b16e917f11")).collect()
|
||||
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
Assertions
|
||||
.assertEquals(
|
||||
8, tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertFalse(
|
||||
tmp
|
||||
.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
|
||||
Assertions.assertEquals(8, tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
|
||||
.get(0).getSubject().size());
|
||||
Assertions.assertFalse(tmp.filter(sw -> sw.getId().equals("50|od______1582::581621232a561b7e8b4952b18b8b0e56")).collect()
|
||||
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
|
||||
JavaRDD<OtherResearchProduct> orp = sc.textFile(workingDir.toString() + "/input/otherresearchproduct").map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
|
||||
JavaRDD<OtherResearchProduct> orp = sc
|
||||
.textFile(workingDir.toString() + "/input/otherresearchproduct")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
|
||||
|
||||
Assertions.assertEquals(10, orp.count());
|
||||
|
||||
Assertions.assertEquals(2, orp.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow"))).count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
orp
|
||||
.filter(
|
||||
s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Galaxy Workflow")))
|
||||
.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, orp
|
||||
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertTrue(
|
||||
orp
|
||||
.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
|
||||
Assertions.assertEquals(3, orp.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")).collect()
|
||||
.get(0).getSubject().size());
|
||||
Assertions.assertTrue(orp.filter(sw -> sw.getId().equals("50|od______2017::0750a4d0782265873d669520f5e33c07")).collect()
|
||||
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2, orp
|
||||
.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertFalse(
|
||||
orp
|
||||
.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
|
||||
Assertions.assertEquals(2, orp.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5")).collect()
|
||||
.get(0).getSubject().size());
|
||||
Assertions.assertFalse(orp.filter(sw -> sw.getId().equals("50|od______2017::1bd97baef19dbd2db3203b112bb83bc5")).collect()
|
||||
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
|
||||
Assertions.assertEquals(3, orp.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72")).collect()
|
||||
.get(0).getSubject().size());
|
||||
Assertions.assertTrue(orp.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72")).collect()
|
||||
.get(0).getSubject().stream().anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, orp
|
||||
.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.size());
|
||||
Assertions
|
||||
.assertTrue(
|
||||
orp
|
||||
.filter(sw -> sw.getId().equals("50|od______2017::1e400f1747487fd15998735c41a55c72"))
|
||||
.collect()
|
||||
.get(0)
|
||||
.getSubject()
|
||||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("EOSC::Galaxy Workflow")));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void twitterUpdatesTest() throws Exception {
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/software").getPath())
|
||||
.map(
|
||||
(MapFunction<String, Software>) value -> OBJECT_MAPPER.readValue(value, Software.class),
|
||||
Encoders.bean(Software.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/software");
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/dataset").getPath())
|
||||
.map(
|
||||
(MapFunction<String, Dataset>) value -> OBJECT_MAPPER.readValue(value, Dataset.class),
|
||||
Encoders.bean(Dataset.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/dataset");
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(getClass().getResource("/eu/dnetlib/dhp/eosctag/twitter/otherresearchproduct").getPath())
|
||||
.map(
|
||||
(MapFunction<String, OtherResearchProduct>) value -> OBJECT_MAPPER
|
||||
.readValue(value, OtherResearchProduct.class),
|
||||
Encoders.bean(OtherResearchProduct.class))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingDir.toString() + "/input/otherresearchproduct");
|
||||
|
||||
SparkEoscTag
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath",
|
||||
workingDir.toString() + "/input",
|
||||
"-workingPath", workingDir.toString() + "/working"
|
||||
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Software> tmp = sc
|
||||
.textFile(workingDir.toString() + "/input/software")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Software.class));
|
||||
|
||||
Assertions.assertEquals(10, tmp.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
0,
|
||||
tmp
|
||||
.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
|
||||
.count());
|
||||
|
||||
JavaRDD<OtherResearchProduct> orp = sc
|
||||
.textFile(workingDir.toString() + "/input/otherresearchproduct")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class));
|
||||
|
||||
Assertions.assertEquals(10, orp.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3,
|
||||
orp
|
||||
.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
|
||||
.count());
|
||||
|
||||
JavaRDD<Dataset> dats = sc
|
||||
.textFile(workingDir.toString() + "/input/dataset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
|
||||
|
||||
Assertions.assertEquals(10, dats.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
2,
|
||||
dats
|
||||
.filter(s -> s.getSubject().stream().anyMatch(sbj -> sbj.getValue().equals("EOSC::Twitter Data")))
|
||||
.count());
|
||||
|
||||
}
|
||||
}
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue