[FOS] Added logic to include only different subjects. Test refactoring and extension

This commit is contained in:
Miriam Baglioni 2021-12-22 23:04:22 +01:00
parent b81efb6a9d
commit 6116fc5d40
2 changed files with 251 additions and 139 deletions

View File

@ -79,6 +79,7 @@ public class PrepareFOSSparkJob implements Serializable {
level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME))); level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME)));
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME))); level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME)));
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME))); level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME)));
r.setSubject(sbjs);
return r; return r;
}, Encoders.bean(Result.class)) }, Encoders.bean(Result.class))
.write() .write()

View File

@ -67,72 +67,12 @@ public class ProduceTest {
} }
@Test @Test
void produceTest() throws Exception { void produceTestSubjects()throws Exception{
final String bipPath = getClass() JavaRDD<Result> tmp = getResultJavaRDD();
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json")
.getPath();
PrepareBipFinder
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", bipPath,
"--outputPath", workingDir.toString() + "/work"
});
final String fosPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json")
.getPath();
PrepareFOSSparkJob
.main(
new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", fosPath,
"-outputPath", workingDir.toString() + "/work"
});
SparkSaveUnresolved.main(new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--sourcePath", workingDir.toString() + "/work",
"-outputPath", workingDir.toString() + "/unresolved"
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<Result> tmp = sc
.textFile(workingDir.toString() + "/unresolved")
.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
Assertions.assertEquals(105, tmp.count());
Assertions.assertEquals(1, tmp.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi")).count());
Assertions
.assertEquals(
6, tmp
.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi"))
.collect()
.get(0)
.getSubject()
.size());
Assertions
.assertEquals(
3, tmp
.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi"))
.collect()
.get(0)
.getInstance()
.get(0)
.getMeasures()
.size());
List<StructuredProperty> sbjs = tmp List<StructuredProperty> sbjs = tmp
.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi")) .filter(row -> row.getSubject()!= null && row.getSubject().size()>0)
.flatMap(row -> row.getSubject().iterator()) .flatMap(row -> row.getSubject().iterator())
.collect(); .collect();
@ -174,11 +114,71 @@ public class ProduceTest {
.assertEquals( .assertEquals(
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS,
sbj.getDataInfo().getProvenanceaction().getSchemename())); sbj.getDataInfo().getProvenanceaction().getSchemename()));
}
/**
 * Verifies the data-info/provenance fields carried by every measure unit
 * found in the results returned by {@code getResultJavaRDD()}.
 *
 * Results that have no instances are ignored; for the remaining ones every
 * unit of every measure of every instance is checked.
 */
@Test
void produceTestMeasuress() throws Exception {
    final JavaRDD<Result> results = getResultJavaRDD();

    // Flatten result -> instance -> measure -> unit.
    final List<KeyValue> units = results
        .filter(row -> !(row.getInstance() == null || row.getInstance().isEmpty()))
        .flatMap(row -> row.getInstance().iterator())
        .flatMap(instance -> instance.getMeasures().iterator())
        .flatMap(measure -> measure.getUnit().iterator())
        .collect();

    // Each property is checked over the whole list before moving on to the next one.
    for (KeyValue unit : units) {
        Assertions.assertEquals(false, unit.getDataInfo().getDeletedbyinference());
    }
    for (KeyValue unit : units) {
        Assertions.assertEquals(true, unit.getDataInfo().getInferred());
    }
    for (KeyValue unit : units) {
        Assertions.assertEquals(false, unit.getDataInfo().getInvisible());
    }
    for (KeyValue unit : units) {
        Assertions.assertEquals("", unit.getDataInfo().getTrust());
    }
    for (KeyValue unit : units) {
        Assertions.assertEquals("update", unit.getDataInfo().getInferenceprovenance());
    }
    for (KeyValue unit : units) {
        Assertions.assertEquals("measure:bip", unit.getDataInfo().getProvenanceaction().getClassid());
    }
    for (KeyValue unit : units) {
        Assertions.assertEquals("Inferred by OpenAIRE", unit.getDataInfo().getProvenanceaction().getClassname());
    }
    for (KeyValue unit : units) {
        Assertions
            .assertEquals(
                ModelConstants.DNET_PROVENANCE_ACTIONS, unit.getDataInfo().getProvenanceaction().getSchemeid());
    }
    for (KeyValue unit : units) {
        Assertions
            .assertEquals(
                ModelConstants.DNET_PROVENANCE_ACTIONS, unit.getDataInfo().getProvenanceaction().getSchemename());
    }
}
@Test
void produceTest6Subjects() throws Exception{
final String doi = "unresolved::10.3390/s18072310::doi";
JavaRDD<Result> tmp = getResultJavaRDD();
Assertions
.assertEquals(
6, tmp
.filter(row -> row.getId().equals(doi))
.collect()
.get(0)
.getSubject()
.size());
List<StructuredProperty> sbjs = tmp
.filter(row -> row.getId().equals(doi))
.flatMap(row -> row.getSubject().iterator())
.collect();
Assertions Assertions
.assertEquals( .assertEquals(
true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("04 agricultural and veterinary sciences"))); true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("04 agricultural and veterinary sciences")));
Assertions.assertEquals(false, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("nano-technology")));
Assertions Assertions
.assertEquals( .assertEquals(
true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0404 agricultural biotechnology"))); true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("0404 agricultural biotechnology")));
@ -190,8 +190,28 @@ public class ProduceTest {
Assertions Assertions
.assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("030309 nutrition & dietetics"))); .assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals("030309 nutrition & dietetics")));
}
@Test
void produceTest3Measures()throws Exception{
final String doi = "unresolved::10.3390/s18072310::doi";
JavaRDD<Result> tmp = getResultJavaRDD();
Assertions
.assertEquals(
3, tmp
.filter(row -> row.getId().equals(doi))
.collect()
.get(0)
.getInstance()
.get(0)
.getMeasures()
.size());
List<Measure> measures = tmp List<Measure> measures = tmp
.filter(row -> row.getId().equals("unresolved::10.3390/s18072310::doi")) .filter(row -> row.getId().equals(doi))
.flatMap(row -> row.getInstance().iterator()) .flatMap(row -> row.getInstance().iterator())
.flatMap(inst -> inst.getMeasures().iterator()) .flatMap(inst -> inst.getMeasures().iterator())
.collect(); .collect();
@ -228,10 +248,21 @@ public class ProduceTest {
.get(0) .get(0)
.getValue()); .getValue());
}
@Test
void produceTestSomeNumbers() throws Exception {
final String doi = "unresolved::10.3390/s18072310::doi";
JavaRDD<Result> tmp = getResultJavaRDD();
Assertions.assertEquals(105, tmp.count());
Assertions.assertEquals(1, tmp.filter(row -> row.getId().equals(doi)).count());
Assertions Assertions
.assertEquals( .assertEquals(
19, tmp 19, tmp
.filter(row -> !row.getId().equals("unresolved::10.3390/s18072310::doi")) .filter(row -> !row.getId().equals(doi))
.filter(row -> row.getSubject() != null) .filter(row -> row.getSubject() != null)
.count()); .count());
@ -239,10 +270,90 @@ public class ProduceTest {
.assertEquals( .assertEquals(
85, 85,
tmp tmp
.filter(row -> !row.getId().equals("unresolved::10.3390/s18072310::doi")) .filter(row -> !row.getId().equals(doi))
.filter(r -> r.getInstance() != null && r.getInstance().size() > 0) .filter(r -> r.getInstance() != null && r.getInstance().size() > 0)
.count()); .count());
} }
/**
 * Runs {@code PrepareBipFinder}, {@code PrepareFOSSparkJob} and
 * {@code SparkSaveUnresolved} in sequence and loads what the last job wrote.
 *
 * The first two jobs are given the {@code bip.json} and {@code fos.json}
 * test resources as source and {@code <workingDir>/work} as output; the
 * third one is given {@code <workingDir>/work} as source and
 * {@code <workingDir>/unresolved} as output.
 *
 * @return the lines found under {@code <workingDir>/unresolved}, each one
 *         deserialized as a {@code Result} through {@code OBJECT_MAPPER}
 * @throws Exception propagated from the invoked jobs
 */
private JavaRDD<Result> getResultJavaRDD() throws Exception {
    final String notManaged = Boolean.FALSE.toString();
    final String workPath = workingDir.toString() + "/work";
    final String unresolvedPath = workingDir.toString() + "/unresolved";

    final String bipPath = getClass()
        .getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/bip/bip.json")
        .getPath();
    final String[] bipArgs = {
        "--isSparkSessionManaged", notManaged,
        "--sourcePath", bipPath,
        "--outputPath", workPath
    };
    PrepareBipFinder.main(bipArgs);

    final String fosPath = getClass()
        .getResource("/eu/dnetlib/dhp/actionmanager/createunresolvedentities/fos/fos.json")
        .getPath();
    // NOTE(review): "-outputPath" (single dash) differs from the "--outputPath"
    // passed to PrepareBipFinder above; kept as is -- confirm the argument
    // parser accepts both spellings.
    final String[] fosArgs = {
        "--isSparkSessionManaged", notManaged,
        "--sourcePath", fosPath,
        "-outputPath", workPath
    };
    PrepareFOSSparkJob.main(fosArgs);

    final String[] saveArgs = {
        "--isSparkSessionManaged", notManaged,
        "--sourcePath", workPath,
        "-outputPath", unresolvedPath
    };
    SparkSaveUnresolved.main(saveArgs);

    final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
    final JavaRDD<String> lines = sc.textFile(unresolvedPath);
    return lines.map(item -> OBJECT_MAPPER.readValue(item, Result.class));
}
/**
 * Checks that the result identified by {@code doi} occurs exactly once,
 * carries five subjects, and that five specific subject values are among them.
 *
 * NOTE(review): {@code produceTest6Subjects} uses the same identifier but
 * expects six subjects with different values -- confirm which identifier
 * this test is meant to target.
 */
@Test
void prepareTest5Subjects() throws Exception {
    final String doi = "unresolved::10.3390/s18072310::doi";
    final JavaRDD<Result> results = getResultJavaRDD();

    final long occurrences = results.filter(row -> row.getId().equals(doi)).count();
    Assertions.assertEquals(1, occurrences);

    final int subjectCount = results
        .filter(row -> row.getId().equals(doi))
        .collect()
        .get(0)
        .getSubject()
        .size();
    Assertions.assertEquals(5, subjectCount);

    final List<StructuredProperty> sbjs = results
        .filter(row -> row.getId().equals(doi))
        .flatMap(row -> row.getSubject().iterator())
        .collect();

    // Subject values that must each appear at least once, checked in this order.
    final String[] expectedValues = {
        "01 natural sciences",
        "0103 physical sciences",
        "010304 chemical physics",
        "0104 chemical sciences",
        "010402 general chemistry"
    };
    for (String expected : expectedValues) {
        Assertions
            .assertEquals(true, sbjs.stream().anyMatch(sbj -> sbj.getValue().equals(expected)));
    }
}
} }