forked from D-Net/dnet-hadoop
[graph merge beta] merge datasource originalid, collectedfrom, and pid lists
This commit is contained in:
parent
77bc9863e9
commit
5d3b4a9c25
|
@ -5,6 +5,7 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -136,7 +137,7 @@ public class MergeGraphTableSparkJob {
|
|||
/**
|
||||
* Datasources involved in the merge operation doesn't obey to the infra precedence policy, but relies on a custom
|
||||
* behaviour that, given two datasources from beta and prod returns the one from prod with the highest
|
||||
* compatibility among the two.
|
||||
* compatibility among the two. Furthermore, the procedure merges the collectedfrom, originalId, and pid lists.
|
||||
*
|
||||
* @param p datasource from PROD
|
||||
* @param b datasource from BETA
|
||||
|
@ -160,9 +161,37 @@ public class MergeGraphTableSparkJob {
|
|||
|
||||
List<Qualifier> list = Arrays.asList(dp.getOpenairecompatibility(), db.getOpenairecompatibility());
|
||||
dp.setOpenairecompatibility(Collections.min(list, new DatasourceCompatibilityComparator()));
|
||||
dp
|
||||
.setCollectedfrom(
|
||||
Stream
|
||||
.concat(
|
||||
Optional
|
||||
.ofNullable(dp.getCollectedfrom())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty()),
|
||||
Optional
|
||||
.ofNullable(db.getCollectedfrom())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty()))
|
||||
.distinct() // relies on KeyValue.equals
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
dp.setOriginalId(mergeLists(dp.getOriginalId(), db.getOriginalId()));
|
||||
dp.setPid(mergeLists(dp.getPid(), db.getPid()));
|
||||
|
||||
return (P) dp;
|
||||
}
|
||||
|
||||
private static final <T> List<T> mergeLists(final List<T>... lists) {
|
||||
return Arrays
|
||||
.stream(lists)
|
||||
.filter(Objects::nonNull)
|
||||
.flatMap(List::stream)
|
||||
.filter(Objects::nonNull)
|
||||
.distinct()
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static <P extends Oaf, B extends Oaf> P mergeWithPriorityToPROD(Optional<P> p, Optional<B> b) {
|
||||
if (b.isPresent() & !p.isPresent()) {
|
||||
return (P) b.get();
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
package eu.dnetlib.dhp.oa.graph.merge;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Optional;
|
||||
|
@ -25,7 +26,23 @@ class MergeGraphTableSparkJobTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
void testMergeDatasources() throws IOException {
|
||||
void testMerge() throws IOException {
|
||||
Datasource d = MergeGraphTableSparkJob
|
||||
.mergeDatasource(
|
||||
d("datasource_cris.json"),
|
||||
d("datasource_openaire2.0.json"));
|
||||
|
||||
assertEquals("10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", d.getId());
|
||||
assertNotNull(d.getOriginalId());
|
||||
assertEquals(2, d.getOriginalId().size());
|
||||
assertNotNull(d.getCollectedfrom());
|
||||
assertEquals(2, d.getCollectedfrom().size());
|
||||
assertNotNull(d.getPid());
|
||||
assertEquals(1, d.getPid().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testMergeCompatibility() throws IOException {
|
||||
assertEquals(
|
||||
"openaire-cris_1.1",
|
||||
MergeGraphTableSparkJob
|
||||
|
|
|
@ -1 +1,5 @@
|
|||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire-cris_1.1" }}
|
||||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire-cris_1.1" },
|
||||
"originalId": ["eurocrisdris::1234"],
|
||||
"collectedfrom": [{"key": "eurocrisdris::2b29d08e383ff4cd8a2b6b226ce37e38", "value": "Directory of Research Information System (DRIS)"}],
|
||||
"pid": [{"value": "10.1010.xyx", "qualifier": {"classid": "doi"}}]
|
||||
}
|
|
@ -1 +1,4 @@
|
|||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire2.0" }}
|
||||
{ "id": "10|274269ac6f3b::2a2e2793b500f3f7b47ef24b1a9277b7", "openairecompatibility": { "classid": "openaire2.0" },
|
||||
"originalId": ["opendoar____::1234"],
|
||||
"collectedfrom": [{"key": "openaire____::47ce9e9f4fad46e732cff06419ecaabb", "value": "OpenDOAR"}]
|
||||
}
|
Loading…
Reference in New Issue