FOS #175

Merged
miriam.baglioni merged 5 commits from FOS into beta 2021-12-23 09:06:57 +01:00
2 changed files with 146 additions and 154 deletions
Showing only changes of commit 10579c0dd0 - Show all commits

View File

@ -65,35 +65,35 @@ public class PrepareFOSSparkJob implements Serializable {
private static void distributeFOSdois(SparkSession spark, String sourcePath, String outputPath) { private static void distributeFOSdois(SparkSession spark, String sourcePath, String outputPath) {
Dataset<FOSDataModel> fosDataset = readPath(spark, sourcePath, FOSDataModel.class); Dataset<FOSDataModel> fosDataset = readPath(spark, sourcePath, FOSDataModel.class);
fosDataset.groupByKey((MapFunction<FOSDataModel,String>)v->v.getDoi(), Encoders.STRING()) fosDataset
.mapGroups((MapGroupsFunction<String, FOSDataModel, Result>)(k,it)->{ .groupByKey((MapFunction<FOSDataModel, String>) v -> v.getDoi(), Encoders.STRING())
Result r = new Result(); .mapGroups((MapGroupsFunction<String, FOSDataModel, Result>) (k, it) -> {
FOSDataModel first = it.next(); Result r = new Result();
r.setId(DHPUtils.generateUnresolvedIdentifier(first.getDoi(), DOI)); FOSDataModel first = it.next();
HashSet<String> level1 = new HashSet<>(); r.setId(DHPUtils.generateUnresolvedIdentifier(first.getDoi(), DOI));
HashSet<String> level2 = new HashSet<>(); HashSet<String> level1 = new HashSet<>();
HashSet<String> level3 = new HashSet<>(); HashSet<String> level2 = new HashSet<>();
addLevels(level1, level2, level3, first); HashSet<String> level3 = new HashSet<>();
it.forEachRemaining(v -> addLevels(level1, level2, level3, v)); addLevels(level1, level2, level3, first);
List<StructuredProperty>sbjs = new ArrayList<>(); it.forEachRemaining(v -> addLevels(level1, level2, level3, v));
level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME))); List<StructuredProperty> sbjs = new ArrayList<>();
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME))); level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME)));
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME))); level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME)));
r.setSubject(sbjs); level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME)));
return r; r.setSubject(sbjs);
}, Encoders.bean(Result.class)) return r;
}, Encoders.bean(Result.class))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(outputPath + "/fos"); .json(outputPath + "/fos");
} }
private static void addLevels(HashSet<String> level1, HashSet<String> level2, HashSet<String> level3, FOSDataModel first) { private static void addLevels(HashSet<String> level1, HashSet<String> level2, HashSet<String> level3,
FOSDataModel first) {
level1.add(first.getLevel1()); level1.add(first.getLevel1());
level2.add(first.getLevel2()); level2.add(first.getLevel2());
level3.add(first.getLevel3()); level3.add(first.getLevel3());
} }
} }

View File

@ -67,29 +67,29 @@ public class ProduceTest {
} }
@Test @Test
void produceTestSubjects()throws Exception{ void produceTestSubjects() throws Exception {
JavaRDD<Result> tmp = getResultJavaRDD(); JavaRDD<Result> tmp = getResultJavaRDD();
List<StructuredProperty> sbjs = tmp List<StructuredProperty> sbjs = tmp
.filter(row -> row.getSubject()!= null && row.getSubject().size()>0) .filter(row -> row.getSubject() != null && row.getSubject().size() > 0)
.flatMap(row -> row.getSubject().iterator()) .flatMap(row -> row.getSubject().iterator())
.collect(); .collect();
sbjs.forEach(sbj -> Assertions.assertEquals("FOS", sbj.getQualifier().getClassid())); sbjs.forEach(sbj -> Assertions.assertEquals("FOS", sbj.getQualifier().getClassid()));
sbjs sbjs
.forEach( .forEach(
sbj -> Assertions sbj -> Assertions
.assertEquals( .assertEquals(
"Fields of Science and Technology classification", sbj.getQualifier().getClassname())); "Fields of Science and Technology classification", sbj.getQualifier().getClassname()));
sbjs sbjs
.forEach( .forEach(
sbj -> Assertions sbj -> Assertions
.assertEquals(ModelConstants.DNET_SUBJECT_TYPOLOGIES, sbj.getQualifier().getSchemeid())); .assertEquals(ModelConstants.DNET_SUBJECT_TYPOLOGIES, sbj.getQualifier().getSchemeid()));
sbjs sbjs
.forEach( .forEach(
sbj -> Assertions sbj -> Assertions
.assertEquals(ModelConstants.DNET_SUBJECT_TYPOLOGIES, sbj.getQualifier().getSchemename())); .assertEquals(ModelConstants.DNET_SUBJECT_TYPOLOGIES, sbj.getQualifier().getSchemename()));
sbjs.forEach(sbj -> Assertions.assertEquals(false, sbj.getDataInfo().getDeletedbyinference())); sbjs.forEach(sbj -> Assertions.assertEquals(false, sbj.getDataInfo().getDeletedbyinference()));
sbjs.forEach(sbj -> Assertions.assertEquals(true, sbj.getDataInfo().getInferred())); sbjs.forEach(sbj -> Assertions.assertEquals(true, sbj.getDataInfo().getInferred()));
@ -97,38 +97,36 @@ public class ProduceTest {
sbjs.forEach(sbj -> Assertions.assertEquals("", sbj.getDataInfo().getTrust())); sbjs.forEach(sbj -> Assertions.assertEquals("", sbj.getDataInfo().getTrust()));
sbjs.forEach(sbj -> Assertions.assertEquals("update", sbj.getDataInfo().getInferenceprovenance())); sbjs.forEach(sbj -> Assertions.assertEquals("update", sbj.getDataInfo().getInferenceprovenance()));
sbjs sbjs
.forEach( .forEach(
sbj -> Assertions.assertEquals("subject:fos", sbj.getDataInfo().getProvenanceaction().getClassid())); sbj -> Assertions.assertEquals("subject:fos", sbj.getDataInfo().getProvenanceaction().getClassid()));
sbjs sbjs
.forEach( .forEach(
sbj -> Assertions sbj -> Assertions
.assertEquals("Inferred by OpenAIRE", sbj.getDataInfo().getProvenanceaction().getClassname())); .assertEquals("Inferred by OpenAIRE", sbj.getDataInfo().getProvenanceaction().getClassname()));
sbjs sbjs
.forEach( .forEach(
sbj -> Assertions sbj -> Assertions
.assertEquals( .assertEquals(
ModelConstants.DNET_PROVENANCE_ACTIONS, sbj.getDataInfo().getProvenanceaction().getSchemeid())); ModelConstants.DNET_PROVENANCE_ACTIONS, sbj.getDataInfo().getProvenanceaction().getSchemeid()));
sbjs sbjs
.forEach( .forEach(
sbj -> Assertions sbj -> Assertions
.assertEquals( .assertEquals(
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS,
sbj.getDataInfo().getProvenanceaction().getSchemename())); sbj.getDataInfo().getProvenanceaction().getSchemename()));
} }
@Test @Test
void produceTestMeasuress()throws Exception{ void produceTestMeasuress() throws Exception {
JavaRDD<Result> tmp = getResultJavaRDD(); JavaRDD<Result> tmp = getResultJavaRDD();
List<KeyValue> mes = tmp List<KeyValue> mes = tmp
.filter(row -> row.getInstance()!= null && row.getInstance().size()>0) .filter(row -> row.getInstance() != null && row.getInstance().size() > 0)
.flatMap(row -> row.getInstance().iterator()) .flatMap(row -> row.getInstance().iterator())
.flatMap(i->i.getMeasures().iterator()) .flatMap(i -> i.getMeasures().iterator())
.flatMap(m ->m.getUnit().iterator()) .flatMap(m -> m.getUnit().iterator())
.collect(); .collect();
mes.forEach(sbj -> Assertions.assertEquals(false, sbj.getDataInfo().getDeletedbyinference())); mes.forEach(sbj -> Assertions.assertEquals(false, sbj.getDataInfo().getDeletedbyinference()));
mes.forEach(sbj -> Assertions.assertEquals(true, sbj.getDataInfo().getInferred())); mes.forEach(sbj -> Assertions.assertEquals(true, sbj.getDataInfo().getInferred()));
@ -136,119 +134,118 @@ public class ProduceTest {
mes.forEach(sbj -> Assertions.assertEquals("", sbj.getDataInfo().getTrust())); mes.forEach(sbj -> Assertions.assertEquals("", sbj.getDataInfo().getTrust()));
mes.forEach(sbj -> Assertions.assertEquals("update", sbj.getDataInfo().getInferenceprovenance())); mes.forEach(sbj -> Assertions.assertEquals("update", sbj.getDataInfo().getInferenceprovenance()));
mes mes
.forEach( .forEach(
sbj -> Assertions.assertEquals("measure:bip", sbj.getDataInfo().getProvenanceaction().getClassid())); sbj -> Assertions.assertEquals("measure:bip", sbj.getDataInfo().getProvenanceaction().getClassid()));
mes mes
.forEach( .forEach(
sbj -> Assertions sbj -> Assertions
.assertEquals("Inferred by OpenAIRE", sbj.getDataInfo().getProvenanceaction().getClassname())); .assertEquals("Inferred by OpenAIRE", sbj.getDataInfo().getProvenanceaction().getClassname()));
mes mes
.forEach( .forEach(
sbj -> Assertions sbj -> Assertions
.assertEquals( .assertEquals(
ModelConstants.DNET_PROVENANCE_ACTIONS, sbj.getDataInfo().getProvenanceaction().getSchemeid())); ModelConstants.DNET_PROVENANCE_ACTIONS, sbj.getDataInfo().getProvenanceaction().getSchemeid()));
mes mes
.forEach( .forEach(
sbj -> Assertions sbj -> Assertions
.assertEquals( .assertEquals(
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS,
sbj.getDataInfo().getProvenanceaction().getSchemename())); sbj.getDataInfo().getProvenanceaction().getSchemename()));
} }
@Test @Test
void produceTest6Subjects() throws Exception{ void produceTest6Subjects() throws Exception {
final String doi = "unresolved::10.3390/s18072310::doi"; final String doi = "unresolved::10.3390/s18072310::doi";
JavaRDD<Result> tmp = getResultJavaRDD(); JavaRDD<Result> tmp = getResultJavaRDD();
Assertions Assertions
.assertEquals( .assertEquals(
6, tmp 6, tmp
.filter(row -> row.getId().equals(doi)) .filter(row -> row.getId().equals(doi))
.collect() .collect()
.get(0) .get(0)
.getSubject() .getSubject()
.size()); .size());
List<StructuredProperty> sbjs = tmp List<StructuredProperty> sbjs = tmp
.filter(row -> row.getId().equals(doi)) .filter(row -> row.getId().equals(doi))
.flatMap(row -> row.getSubject().iterator()) .flatMap(row -> row.getSubject().iterator())
.collect(); .collect();
Assertions Assertions
.assertEquals( .assertEquals(
true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("04 agricultural and veterinary sciences"))); true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("04 agricultural and veterinary sciences")));
Assertions Assertions
.assertEquals( .assertEquals(
true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0404 agricultural biotechnology"))); true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0404 agricultural biotechnology")));
Assertions.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("040502 food science"))); Assertions.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("040502 food science")));
Assertions Assertions
.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("03 medical and health sciences"))); .assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("03 medical and health sciences")));
Assertions.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0303 health sciences"))); Assertions.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0303 health sciences")));
Assertions Assertions
.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("030309 nutrition & dietetics"))); .assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("030309 nutrition & dietetics")));
} }
@Test @Test
void produceTest3Measures()throws Exception{ void produceTest3Measures() throws Exception {
final String doi = "unresolved::10.3390/s18072310::doi"; final String doi = "unresolved::10.3390/s18072310::doi";
JavaRDD<Result> tmp = getResultJavaRDD(); JavaRDD<Result> tmp = getResultJavaRDD();
Assertions Assertions
.assertEquals( .assertEquals(
3, tmp 3, tmp
.filter(row -> row.getId().equals(doi)) .filter(row -> row.getId().equals(doi))
.collect() .collect()
.get(0) .get(0)
.getInstance() .getInstance()
.get(0) .get(0)
.getMeasures() .getMeasures()
.size()); .size());
List<Measure> measures = tmp List<Measure> measures = tmp
.filter(row -> row.getId().equals(doi)) .filter(row -> row.getId().equals(doi))
.flatMap(row -> row.getInstance().iterator()) .flatMap(row -> row.getInstance().iterator())
.flatMap(inst -> inst.getMeasures().iterator()) .flatMap(inst -> inst.getMeasures().iterator())
.collect(); .collect();
Assertions Assertions
.assertEquals( .assertEquals(
"7.5597134689e-09", measures "7.5597134689e-09", measures
.stream() .stream()
.filter(mes -> mes.getId().equals("influence")) .filter(mes -> mes.getId().equals("influence"))
.collect(Collectors.toList()) .collect(Collectors.toList())
.get(0) .get(0)
.getUnit() .getUnit()
.get(0) .get(0)
.getValue()); .getValue());
Assertions Assertions
.assertEquals( .assertEquals(
"4.903880192", measures "4.903880192", measures
.stream() .stream()
.filter(mes -> mes.getId().equals("popularity_alt")) .filter(mes -> mes.getId().equals("popularity_alt"))
.collect(Collectors.toList()) .collect(Collectors.toList())
.get(0) .get(0)
.getUnit() .getUnit()
.get(0) .get(0)
.getValue()); .getValue());
Assertions Assertions
.assertEquals( .assertEquals(
"1.17977512835e-08", measures "1.17977512835e-08", measures
.stream() .stream()
.filter(mes -> mes.getId().equals("popularity")) .filter(mes -> mes.getId().equals("popularity"))
.collect(Collectors.toList()) .collect(Collectors.toList())
.get(0) .get(0)
.getUnit() .getUnit()
.get(0) .get(0)
.getValue()); .getValue());
} }
@Test @Test
void produceTestSomeNumbers() throws Exception { void produceTestSomeNumbers() throws Exception {
@ -316,43 +313,38 @@ public class ProduceTest {
.map(item -> OBJECT_MAPPER.readValue(item, Result.class)); .map(item -> OBJECT_MAPPER.readValue(item, Result.class));
} }
@Test @Test
void prepareTest5Subjects()throws Exception{ void prepareTest5Subjects() throws Exception {
final String doi = "unresolved::10.3390/s18072310::doi"; final String doi = "unresolved::10.1063/5.0032658::doi";
JavaRDD<Result> tmp = getResultJavaRDD(); JavaRDD<Result> tmp = getResultJavaRDD();
Assertions.assertEquals(1, tmp.filter(row -> row.getId().equals(doi)).count()); Assertions.assertEquals(1, tmp.filter(row -> row.getId().equals(doi)).count());
Assertions Assertions
.assertEquals( .assertEquals(
5, tmp 5, tmp
.filter(row -> row.getId().equals(doi)) .filter(row -> row.getId().equals(doi))
.collect() .collect()
.get(0) .get(0)
.getSubject() .getSubject()
.size()); .size());
List<StructuredProperty> sbjs = tmp List<StructuredProperty> sbjs = tmp
.filter(row -> row.getId().equals(doi)) .filter(row -> row.getId().equals(doi))
.flatMap(row -> row.getSubject().iterator()) .flatMap(row -> row.getSubject().iterator())
.collect(); .collect();
Assertions Assertions
.assertEquals( .assertEquals(
true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("01 natural sciences"))); true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("01 natural sciences")));
Assertions.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0103 physical sciences"))); Assertions.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0103 physical sciences")));
Assertions Assertions
.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("010304 chemical physics"))); .assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("010304 chemical physics")));
Assertions.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0104 chemical sciences"))); Assertions.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0104 chemical sciences")));
Assertions Assertions
.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("010402 general chemistry"))); .assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("010402 general chemistry")));
} }