1
0
Fork 0

Merge branch 'beta' into graph_cleaning

This commit is contained in:
Claudio Atzori 2022-12-02 14:49:00 +01:00
commit 8248da40d9
28 changed files with 1424 additions and 181 deletions

View File

@ -1,10 +1,12 @@
package eu.dnetlib.dhp.oa.dedup; package eu.dnetlib.dhp.oa.dedup;
import java.util.Collection; import java.lang.reflect.InvocationTargetException;
import java.util.Iterator; import java.util.*;
import java.util.List; import java.util.stream.Collectors;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction; import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
@ -15,6 +17,7 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
import eu.dnetlib.dhp.oa.merge.AuthorMerger; import eu.dnetlib.dhp.oa.merge.AuthorMerger;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
@ -74,33 +77,42 @@ public class DedupRecordFactory {
public static <T extends OafEntity> T entityMerger( public static <T extends OafEntity> T entityMerger(
String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo, Class<T> clazz) String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo, Class<T> clazz)
throws IllegalAccessException, InstantiationException { throws IllegalAccessException, InstantiationException, InvocationTargetException {
T entity = clazz.newInstance(); final Comparator<Identifier<T>> idComparator = new IdentifierComparator<>();
entity.setDataInfo(dataInfo);
final LinkedList<T> entityList = Lists
.newArrayList(entities)
.stream()
.map(t -> Identifier.newInstance(t._2()))
.sorted(idComparator)
.map(Identifier::getEntity)
.collect(Collectors.toCollection(LinkedList::new));
final T entity = clazz.newInstance();
final T first = entityList.removeFirst();
BeanUtils.copyProperties(entity, first);
final Collection<String> dates = Lists.newArrayList(); final Collection<String> dates = Lists.newArrayList();
final List<List<Author>> authors = Lists.newArrayList(); final List<List<Author>> authors = Lists.newArrayList();
entities entityList
.forEachRemaining( .forEach(
t -> { duplicate -> {
T duplicate = t._2();
entity.mergeFrom(duplicate); entity.mergeFrom(duplicate);
if (ModelSupport.isSubClass(duplicate, Result.class)) { if (ModelSupport.isSubClass(duplicate, Result.class)) {
Result r1 = (Result) duplicate; Result r1 = (Result) duplicate;
if (r1.getAuthor() != null && !r1.getAuthor().isEmpty()) if (r1.getAuthor() != null && StringUtils.isNotBlank(r1.getDateofacceptance().getValue()))
authors.add(r1.getAuthor()); authors.add(r1.getAuthor());
if (r1.getDateofacceptance() != null) if (r1.getDateofacceptance() != null)
dates.add(r1.getDateofacceptance().getValue()); dates.add(r1.getDateofacceptance().getValue());
} }
}); });
// set authors and date // set authors and date
if (ModelSupport.isSubClass(entity, Result.class)) { if (ModelSupport.isSubClass(entity, Result.class)) {
((Result) entity).setDateofacceptance(DatePicker.pick(dates)); // ((Result) entity).setDateofacceptance(DatePicker.pick(dates));
((Result) entity).setAuthor(AuthorMerger.merge(authors)); ((Result) entity).setAuthor(AuthorMerger.merge(authors));
} }

View File

@ -18,6 +18,10 @@ public class IdGenerator implements Serializable {
if (pids == null || pids.isEmpty()) if (pids == null || pids.isEmpty())
return defaultID; return defaultID;
return generateId(pids);
}
private static <T extends OafEntity> String generateId(List<Identifier<T>> pids) {
Identifier<T> bp = pids Identifier<T> bp = pids
.stream() .stream()
.min(Identifier::compareTo) .min(Identifier::compareTo)

View File

@ -0,0 +1,81 @@
package eu.dnetlib.dhp.oa.dedup;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
/**
 * Orders {@link Identifier}s so that the preferred one sorts first.
 *
 * <p>Priority of the criteria:
 * <ol>
 * <li>PID type, delegated to {@link PidComparator} when the two types differ;</li>
 * <li>provenance: for publications an identifier collected from Crossref wins, for datasets one collected
 * from Datacite wins;</li>
 * <li>date: the earlier date sorts first;</li>
 * <li>original id, in natural String order.</li>
 * </ol>
 *
 * @param <T> the entity type wrapped by the compared identifiers
 */
public class IdentifierComparator<T extends OafEntity> implements Comparator<Identifier<T>> {

	/**
	 * Static entry point for callers that do not hold a comparator instance.
	 *
	 * <p>The signature is intentionally raw so that existing callers keep compiling unchanged; the unchecked
	 * conversion is confined to this method.
	 *
	 * @param left the first identifier
	 * @param right the second identifier
	 * @return a negative, zero or positive value, as for {@link #compare(Identifier, Identifier)}
	 */
	@SuppressWarnings({
		"rawtypes", "unchecked"
	})
	public static int compareIdentifiers(Identifier left, Identifier right) {
		return new IdentifierComparator<>().compare(left, right);
	}

	/**
	 * Compares two identifiers according to the priority described on the class.
	 *
	 * @param left the first identifier
	 * @param right the second identifier
	 * @return a negative value when {@code left} is preferred, a positive one when {@code right} is preferred
	 */
	@Override
	public int compare(Identifier<T> left, Identifier<T> right) {
		// read each PID type once and reuse it below instead of asking the identifier again
		final PidType leftPidType = left.getPidType();
		final PidType rightPidType = right.getPidType();

		// 1) different PID types: the PID-type ranking decides
		if (leftPidType.compareTo(rightPidType) != 0) {
			return new PidComparator<>(left.getEntity()).compare(toSP(leftPidType), toSP(rightPidType));
		}

		// 2) same PID type: prefer the authoritative datasource for the entity type
		final Set<String> leftKeys = collectedFromKeys(left.getCollectedFrom());
		final Set<String> rightKeys = collectedFromKeys(right.getCollectedFrom());
		final EntityType entityType = left.getEntityType();

		if (entityType == EntityType.publication) {
			final int byCrossref = preferDatasource(leftKeys, rightKeys, ModelConstants.CROSSREF_ID);
			if (byCrossref != 0) {
				return byCrossref;
			}
		}
		if (entityType == EntityType.dataset) {
			final int byDatacite = preferDatasource(leftKeys, rightKeys, ModelConstants.DATACITE_ID);
			if (byDatacite != 0) {
				return byDatacite;
			}
		}

		// 3) the earlier date sorts first; 4) on equal dates the lower original id sorts first
		final int byDate = left.getDate().compareTo(right.getDate());
		if (byDate != 0) {
			return byDate;
		}
		return left.getOriginalID().compareTo(right.getOriginalID());
	}

	/**
	 * Tells whether the given set of datasource ids contains {@code dsId}.
	 *
	 * @param collectedFrom the datasource ids an identifier was collected from
	 * @param dsId the datasource id to look for
	 * @return {@code true} when {@code dsId} is among {@code collectedFrom}
	 */
	public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
		return collectedFrom.contains(dsId);
	}

	/**
	 * Ranks two identifiers by whether exactly one of them was collected from the given datasource.
	 *
	 * @return -1 when only the left one comes from {@code dsId}, 1 when only the right one does, 0 otherwise
	 */
	private int preferDatasource(Set<String> leftKeys, Set<String> rightKeys, String dsId) {
		final boolean inLeft = isFromDatasourceID(leftKeys, dsId);
		final boolean inRight = isFromDatasourceID(rightKeys, dsId);
		if (inLeft == inRight) {
			return 0;
		}
		return inLeft ? -1 : 1;
	}

	/**
	 * Extracts the datasource keys from a collectedFrom list, yielding an empty set for a null list.
	 */
	private static Set<String> collectedFromKeys(List<KeyValue> collectedFrom) {
		return Optional
			.ofNullable(collectedFrom)
			.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
			.orElseGet(Sets::newHashSet);
	}

	/**
	 * Wraps a PID type into the {@link StructuredProperty} shape expected by {@link PidComparator}, using the
	 * type name as both the second and third argument of the factory and empty strings elsewhere.
	 */
	private StructuredProperty toSP(PidType pidType) {
		return OafMapperUtils.structuredProperty("", pidType.toString(), pidType.toString(), "", "", new DataInfo());
	}
}

View File

@ -11,6 +11,7 @@ import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.oa.dedup.DatePicker; import eu.dnetlib.dhp.oa.dedup.DatePicker;
import eu.dnetlib.dhp.oa.dedup.IdentifierComparator;
import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
@ -83,60 +84,12 @@ public class Identifier<T extends OafEntity> implements Serializable, Comparable
return entity.getId(); return entity.getId();
} }
private PidType getPidType() { public PidType getPidType() {
return PidType.tryValueOf(StringUtils.substringBefore(StringUtils.substringAfter(entity.getId(), "|"), "_")); return PidType.tryValueOf(StringUtils.substringBefore(StringUtils.substringAfter(entity.getId(), "|"), "_"));
} }
@Override @Override
public int compareTo(Identifier<T> i) { public int compareTo(Identifier<T> i) {
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4) return IdentifierComparator.compareIdentifiers(this, i);
// alphabetical order of the originalID
Set<String> lKeys = Optional
.ofNullable(getCollectedFrom())
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
.orElse(Sets.newHashSet());
final Optional<List<KeyValue>> cf = Optional.ofNullable(i.getCollectedFrom());
Set<String> rKeys = cf
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
.orElse(Sets.newHashSet());
if (this.getPidType().compareTo(i.getPidType()) == 0) { // same type
if (getEntityType() == EntityType.publication) {
if (isFromDatasourceID(lKeys, ModelConstants.CROSSREF_ID)
&& !isFromDatasourceID(rKeys, ModelConstants.CROSSREF_ID))
return -1;
if (isFromDatasourceID(rKeys, ModelConstants.CROSSREF_ID)
&& !isFromDatasourceID(lKeys, ModelConstants.CROSSREF_ID))
return 1;
}
if (getEntityType() == EntityType.dataset) {
if (isFromDatasourceID(lKeys, ModelConstants.DATACITE_ID)
&& !isFromDatasourceID(rKeys, ModelConstants.DATACITE_ID))
return -1;
if (isFromDatasourceID(rKeys, ModelConstants.DATACITE_ID)
&& !isFromDatasourceID(lKeys, ModelConstants.DATACITE_ID))
return 1;
}
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
// we need to take the alphabetically lower id
return this.getOriginalID().compareTo(i.getOriginalID());
} else
// we need to take the elder date
return this.getDate().compareTo(i.getDate());
} else {
return new PidComparator<>(getEntity()).compare(toSP(getPidType()), toSP(i.getPidType()));
}
}
private StructuredProperty toSP(PidType pidType) {
return OafMapperUtils.structuredProperty("", pidType.toString(), pidType.toString(), "", "", new DataInfo());
}
public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
return collectedFrom.contains(dsId);
} }
} }

View File

@ -7,6 +7,7 @@ import java.io.BufferedReader;
import java.io.FileReader; import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -54,7 +55,7 @@ class EntityMergerTest implements Serializable {
} }
@Test @Test
void softwareMergerTest() throws InstantiationException, IllegalAccessException { void softwareMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
List<Tuple2<String, Software>> softwares = readSample( List<Tuple2<String, Software>> softwares = readSample(
testEntityBasePath + "/software_merge.json", Software.class); testEntityBasePath + "/software_merge.json", Software.class);
@ -69,7 +70,7 @@ class EntityMergerTest implements Serializable {
} }
@Test @Test
void publicationMergerTest() throws InstantiationException, IllegalAccessException { void publicationMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
Publication pub_merged = DedupRecordFactory Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
@ -134,7 +135,7 @@ class EntityMergerTest implements Serializable {
} }
@Test @Test
void publicationMergerTest2() throws InstantiationException, IllegalAccessException { void publicationMergerTest2() throws InstantiationException, IllegalAccessException, InvocationTargetException {
Publication pub_merged = DedupRecordFactory Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
@ -146,7 +147,7 @@ class EntityMergerTest implements Serializable {
} }
@Test @Test
void publicationMergerTest3() throws InstantiationException, IllegalAccessException { void publicationMergerTest3() throws InstantiationException, IllegalAccessException, InvocationTargetException {
Publication pub_merged = DedupRecordFactory Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
@ -156,7 +157,8 @@ class EntityMergerTest implements Serializable {
} }
@Test @Test
void publicationMergerTest4() throws InstantiationException, IllegalStateException, IllegalAccessException { void publicationMergerTest4()
throws InstantiationException, IllegalStateException, IllegalAccessException, InvocationTargetException {
Publication pub_merged = DedupRecordFactory Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
@ -166,7 +168,8 @@ class EntityMergerTest implements Serializable {
} }
@Test @Test
void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException { void publicationMergerTest5()
throws InstantiationException, IllegalStateException, IllegalAccessException, InvocationTargetException {
System.out System.out
.println( .println(

View File

@ -4,8 +4,7 @@ package eu.dnetlib.dhp.oa.dedup;
import static java.nio.file.Files.createTempDirectory; import static java.nio.file.Files.createTempDirectory;
import static org.apache.spark.sql.functions.count; import static org.apache.spark.sql.functions.count;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.Mockito.lenient; import static org.mockito.Mockito.lenient;
import java.io.File; import java.io.File;
@ -14,7 +13,11 @@ import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
@ -35,10 +38,13 @@ import org.mockito.Mock;
import org.mockito.Mockito; import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension; import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.util.MapDocumentUtil; import eu.dnetlib.pace.util.MapDocumentUtil;
@ -105,57 +111,27 @@ public class SparkDedupTest implements Serializable {
lenient() lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId))) .when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId)))
.thenReturn( .thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml"));
IOUtils
.toString(
SparkDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml")));
lenient() lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization"))) .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization")))
.thenReturn( .thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"));
IOUtils
.toString(
SparkDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
lenient() lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication"))) .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication")))
.thenReturn( .thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"));
IOUtils
.toString(
SparkDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")));
lenient() lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software"))) .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software")))
.thenReturn( .thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json"));
IOUtils
.toString(
SparkDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json")));
lenient() lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset"))) .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset")))
.thenReturn( .thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json"));
IOUtils
.toString(
SparkDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json")));
lenient() lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct"))) .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct")))
.thenReturn( .thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json"));
IOUtils
.toString(
SparkDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json")));
} }
@Test @Test
@ -163,11 +139,7 @@ public class SparkDedupTest implements Serializable {
void createSimRelsTest() throws Exception { void createSimRelsTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser( ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"));
.toString(
SparkCreateSimRels.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json")));
parser parser
.parseArgument( .parseArgument(
@ -207,7 +179,7 @@ public class SparkDedupTest implements Serializable {
.count(); .count();
assertEquals(3076, orgs_simrel); assertEquals(3076, orgs_simrel);
assertEquals(7040, pubs_simrel); assertEquals(7046, pubs_simrel);
assertEquals(336, sw_simrel); assertEquals(336, sw_simrel);
assertEquals(442, ds_simrel); assertEquals(442, ds_simrel);
assertEquals(6784, orp_simrel); assertEquals(6784, orp_simrel);
@ -223,11 +195,7 @@ public class SparkDedupTest implements Serializable {
void whitelistSimRelsTest() throws Exception { void whitelistSimRelsTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser( ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/whitelistSimRels_parameters.json"));
.toString(
SparkWhitelistSimRels.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/whitelistSimRels_parameters.json")));
parser parser
.parseArgument( .parseArgument(
@ -264,7 +232,7 @@ public class SparkDedupTest implements Serializable {
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist) // entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
assertEquals(3076, orgs_simrel); assertEquals(3076, orgs_simrel);
assertEquals(7040, pubs_simrel); assertEquals(7046, pubs_simrel);
assertEquals(442, ds_simrel); assertEquals(442, ds_simrel);
assertEquals(6784, orp_simrel); assertEquals(6784, orp_simrel);
// System.out.println("orgs_simrel = " + orgs_simrel); // System.out.println("orgs_simrel = " + orgs_simrel);
@ -306,11 +274,7 @@ public class SparkDedupTest implements Serializable {
void cutMergeRelsTest() throws Exception { void cutMergeRelsTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser( ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"));
.toString(
SparkCreateMergeRels.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
parser parser
.parseArgument( .parseArgument(
@ -402,11 +366,7 @@ public class SparkDedupTest implements Serializable {
void createMergeRelsTest() throws Exception { void createMergeRelsTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser( ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"));
.toString(
SparkCreateMergeRels.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
parser parser
.parseArgument( .parseArgument(
@ -427,10 +387,10 @@ public class SparkDedupTest implements Serializable {
.read() .read()
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel") .load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
.count(); .count();
long pubs_mergerel = spark final Dataset<Relation> pubs = spark
.read() .read()
.load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") .load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel")
.count(); .as(Encoders.bean(Relation.class));
long sw_mergerel = spark long sw_mergerel = spark
.read() .read()
.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel") .load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
@ -445,8 +405,35 @@ public class SparkDedupTest implements Serializable {
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel") .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
.count(); .count();
final List<Relation> merges = pubs
.filter("source == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
.collectAsList();
assertEquals(3, merges.size());
Set<String> dups = Sets
.newHashSet(
"50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
"50|doi_________::d5021b53204e4fdeab6ff5d5bc468032",
"50|arXiv_______::c93aeb433eb90ed7a86e29be00791b7c");
merges.forEach(r -> {
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
assertEquals(ModelConstants.MERGES, r.getRelClass());
assertTrue(dups.contains(r.getTarget()));
});
final List<Relation> mergedIn = pubs
.filter("target == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
.collectAsList();
assertEquals(3, mergedIn.size());
mergedIn.forEach(r -> {
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
assertTrue(dups.contains(r.getSource()));
});
assertEquals(1268, orgs_mergerel); assertEquals(1268, orgs_mergerel);
assertEquals(1444, pubs_mergerel); assertEquals(1450, pubs.count());
assertEquals(286, sw_mergerel); assertEquals(286, sw_mergerel);
assertEquals(472, ds_mergerel); assertEquals(472, ds_mergerel);
assertEquals(738, orp_mergerel); assertEquals(738, orp_mergerel);
@ -463,11 +450,7 @@ public class SparkDedupTest implements Serializable {
void createDedupRecordTest() throws Exception { void createDedupRecordTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser( ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"));
.toString(
SparkCreateDedupRecord.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json")));
parser parser
.parseArgument( .parseArgument(
new String[] { new String[] {
@ -483,12 +466,18 @@ public class SparkDedupTest implements Serializable {
new SparkCreateDedupRecord(parser, spark).run(isLookUpService); new SparkCreateDedupRecord(parser, spark).run(isLookUpService);
final ObjectMapper mapper = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
final Dataset<Publication> pubs = spark
.read()
.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord")
.map(
(MapFunction<String, Publication>) value -> mapper.readValue(value, Publication.class),
Encoders.bean(Publication.class));
long orgs_deduprecord = jsc long orgs_deduprecord = jsc
.textFile(testOutputBasePath + "/" + testActionSetId + "/organization_deduprecord") .textFile(testOutputBasePath + "/" + testActionSetId + "/organization_deduprecord")
.count(); .count();
long pubs_deduprecord = jsc
.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord")
.count();
long sw_deduprecord = jsc long sw_deduprecord = jsc
.textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord") .textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord")
.count(); .count();
@ -499,11 +488,13 @@ public class SparkDedupTest implements Serializable {
.count(); .count();
assertEquals(86, orgs_deduprecord); assertEquals(86, orgs_deduprecord);
assertEquals(67, pubs_deduprecord); assertEquals(68, pubs.count());
assertEquals(49, sw_deduprecord); assertEquals(49, sw_deduprecord);
assertEquals(97, ds_deduprecord); assertEquals(97, ds_deduprecord);
assertEquals(92, orp_deduprecord); assertEquals(92, orp_deduprecord);
verifyRoot_1(mapper, pubs);
// System.out.println("orgs_deduprecord = " + orgs_deduprecord); // System.out.println("orgs_deduprecord = " + orgs_deduprecord);
// System.out.println("pubs_deduprecord = " + pubs_deduprecord); // System.out.println("pubs_deduprecord = " + pubs_deduprecord);
// System.out.println("sw_deduprecord = " + sw_deduprecord); // System.out.println("sw_deduprecord = " + sw_deduprecord);
@ -511,16 +502,63 @@ public class SparkDedupTest implements Serializable {
// System.out.println("orp_deduprecord = " + orp_deduprecord); // System.out.println("orp_deduprecord = " + orp_deduprecord);
} }
/**
 * Checks the dedup root record '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032' against its Crossref
 * duplicate read from the input graph: journal, publisher, pids and the Crossref instance must be carried
 * over to the root.
 *
 * @param mapper the JSON mapper used to parse the input publications
 * @param pubs the dedup records produced for publications
 */
private static void verifyRoot_1(ObjectMapper mapper, Dataset<Publication> pubs) {
	final Publication root = pubs
		.filter("id = '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
		.first();
	assertNotNull(root);

	// load the original (pre-dedup) publications and pick the Crossref record merged into the root
	final Dataset<Publication> inputPublications = spark
		.read()
		.textFile(DedupUtility.createEntityPath(testGraphBasePath, "publication"))
		.map(
			(MapFunction<String, Publication>) json -> mapper.readValue(json, Publication.class),
			Encoders.bean(Publication.class));
	final List<Publication> crossrefMatches = inputPublications
		.filter("id = '50|doi_________::d5021b53204e4fdeab6ff5d5bc468032'")
		.collectAsList();
	final Publication crossref_duplicate = crossrefMatches.get(0);

	// bibliographic fields are expected to match the Crossref duplicate
	assertEquals(crossref_duplicate.getJournal().getName(), root.getJournal().getName());
	assertEquals(crossref_duplicate.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted());
	assertEquals(crossref_duplicate.getPublisher().getValue(), root.getPublisher().getValue());

	// the root must share at least one pid with the duplicate, and expose the expected DOI
	final Set<String> rootPidValues = new HashSet<>();
	for (StructuredProperty pid : root.getPid()) {
		rootPidValues.add(pid.getValue());
	}
	final Set<String> duplicatePidValues = new HashSet<>();
	for (StructuredProperty pid : crossref_duplicate.getPid()) {
		duplicatePidValues.add(pid.getValue());
	}
	assertFalse(Sets.intersection(rootPidValues, duplicatePidValues).isEmpty());
	assertTrue(rootPidValues.contains("10.1109/jstqe.2022.3205716"));

	// the instance collected from Crossref must be present in the root with its access right and type
	final Optional<Instance> crossrefInstance = root
		.getInstance()
		.stream()
		.filter(inst -> inst.getCollectedfrom().getValue().equals("Crossref"))
		.findFirst();
	assertTrue(crossrefInstance.isPresent());

	final Instance instance = crossrefInstance.get();
	assertEquals("OPEN", instance.getAccessright().getClassid());
	assertEquals("Open Access", instance.getAccessright().getClassname());
	assertEquals(OpenAccessRoute.hybrid, instance.getAccessright().getOpenAccessRoute());
	assertEquals(
		"IEEE Journal of Selected Topics in Quantum Electronics", instance.getHostedby().getValue());
	assertEquals("0001", instance.getInstancetype().getClassid());
	assertEquals("Article", instance.getInstancetype().getClassname());
}
@Test @Test
@Order(6) @Order(6)
void updateEntityTest() throws Exception { void updateEntityTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser( ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"));
.toString(
SparkUpdateEntity.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json")));
parser parser
.parseArgument( .parseArgument(
new String[] { new String[] {
@ -587,7 +625,7 @@ public class SparkDedupTest implements Serializable {
.distinct() .distinct()
.count(); .count();
assertEquals(898, publications); assertEquals(902, publications);
assertEquals(839, organizations); assertEquals(839, organizations);
assertEquals(100, projects); assertEquals(100, projects);
assertEquals(100, datasource); assertEquals(100, datasource);
@ -640,11 +678,7 @@ public class SparkDedupTest implements Serializable {
void propagateRelationTest() throws Exception { void propagateRelationTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser( ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"));
.toString(
SparkPropagateRelation.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json")));
parser parser
.parseArgument( .parseArgument(
new String[] { new String[] {
@ -714,4 +748,12 @@ public class SparkDedupTest implements Serializable {
public boolean isDeletedByInference(String s) { public boolean isDeletedByInference(String s) {
return s.contains("\"deletedbyinference\":true"); return s.contains("\"deletedbyinference\":true");
} }
/**
 * Reads a classpath resource, resolved relative to {@code SparkDedupTest}, into a String.
 *
 * <p>The content is decoded as UTF-8 rather than with the platform default charset, and the underlying
 * stream is always closed.
 *
 * @param path the classpath location of the resource
 * @return the resource content
 * @throws IOException when the resource does not exist ({@link java.io.FileNotFoundException}) or cannot be read
 */
private static String classPathResourceAsString(String path) throws IOException {
	try (java.io.InputStream resource = SparkDedupTest.class.getResourceAsStream(path)) {
		// getResourceAsStream signals a missing resource with null: fail with the offending path
		if (resource == null) {
			throw new java.io.FileNotFoundException("classpath resource not found: " + path);
		}
		return IOUtils.toString(resource, java.nio.charset.StandardCharsets.UTF_8.name());
	}
}
} }

View File

@ -143,7 +143,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization")) .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
.count(); .count();
assertEquals(288, orgs_simrel); assertEquals(290, orgs_simrel);
} }
@Test @Test
@ -172,7 +172,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization")) .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
.count(); .count();
assertEquals(324, orgs_simrel); assertEquals(326, orgs_simrel);
} }
@Test @Test

View File

@ -0,0 +1,403 @@
package eu.dnetlib.dhp.oa.dedup;
import static java.nio.file.Files.createTempDirectory;
import static org.apache.spark.sql.functions.count;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.lenient;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class SparkPublicationRootsTest implements Serializable {
@Mock(serializable = true)
ISLookUpService isLookUpService;
private static SparkSession spark;
private static String workingPath;
private static String graphInputPath;
private static String graphOutputPath;
private static final String testActionSetId = "test-orchestrator";
private static Path testBaseTmpPath;
private static final ObjectMapper MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
/**
 * Prepares the shared fixture: copies the {@code /eu/dnetlib/dhp/dedup/root} resources into a fresh temporary
 * directory, derives the working/input/output paths from it and starts a local Spark session.
 *
 * @throws IOException if the temporary directory cannot be created or populated
 * @throws URISyntaxException if the fixture resource URL cannot be converted to a URI
 */
@BeforeAll
public static void init() throws IOException, URISyntaxException {
	final Class<?> self = SparkPublicationRootsTest.class;

	testBaseTmpPath = createTempDirectory(self.getSimpleName() + "-");
	final Path inputRoot = testBaseTmpPath.resolve("input");

	final File fixtureDir = Paths.get(self.getResource("/eu/dnetlib/dhp/dedup/root").toURI()).toFile();
	FileUtils.copyDirectory(fixtureDir, inputRoot.toFile());

	workingPath = testBaseTmpPath.resolve("workingPath").toString();
	graphInputPath = inputRoot.resolve("entities").toString();
	graphOutputPath = testBaseTmpPath.resolve("output").toString();

	// start from empty working and output locations
	for (String stale : new String[] { workingPath, graphOutputPath }) {
		FileUtils.deleteDirectory(new File(stale));
	}

	final SparkConf sparkConf = new SparkConf().set("spark.sql.shuffle.partitions", "10");
	final SparkSession.Builder sessionBuilder = SparkSession
		.builder()
		.appName(self.getSimpleName())
		.master("local[*]")
		.config(sparkConf);
	spark = sessionBuilder.getOrCreate();
}
/**
 * Stubs the mocked lookup service before each test: queries containing the action set id answer with the
 * orchestrator profile, queries containing {@code publication} answer with the publication dedup configuration.
 *
 * @throws IOException if one of the classpath resources cannot be read
 * @throws ISLookUpException declared by the stubbed lookup method
 */
@BeforeEach
public void setUp() throws IOException, ISLookUpException {
	final String orchestratorProfile = classPathResourceAsString(
		"/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator_publication.xml");
	final String publicationConf = classPathResourceAsString(
		"/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json");

	// lenient: not every test triggers both lookups
	lenient()
		.when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId)))
		.thenReturn(orchestratorProfile);
	lenient()
		.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication")))
		.thenReturn(publicationConf);
}
/**
 * Removes the temporary test directory and closes the Spark session.
 *
 * @throws IOException if the temporary directory cannot be deleted
 */
@AfterAll
public static void tearDown() throws IOException {
	try {
		// null when init() failed before the directory was created
		if (testBaseTmpPath != null) {
			FileUtils.deleteDirectory(testBaseTmpPath.toFile());
		}
	} finally {
		// close the session even when the cleanup above throws
		if (spark != null) {
			spark.close();
		}
	}
}
/**
 * Runs {@code SparkCreateSimRels} on the test graph and checks the number of publication similarity relations.
 */
@Test
@Order(1)
void createSimRelsTest() throws Exception {
	final String[] cliArgs = {
		"--graphBasePath", graphInputPath,
		"--actionSetId", testActionSetId,
		"--isLookUpUrl", "lookupurl",
		"--workingPath", workingPath,
		"--numPartitions", "5"
	};
	final ArgumentApplicationParser parser = args(
		"/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json", cliArgs);

	new SparkCreateSimRels(parser, spark).run(isLookUpService);

	final String simRelPath = DedupUtility.createSimRelPath(workingPath, testActionSetId, "publication");
	final long simRelCount = spark.read().load(simRelPath).count();

	assertEquals(74, simRelCount);
}
/**
 * Runs {@code SparkCreateMergeRels} with {@code --cutConnectedComponent 3} and checks that no source has more
 * than three {@code merges} targets; the produced merge relations are deleted afterwards.
 */
@Test
@Order(2)
void cutMergeRelsTest() throws Exception {
	final String[] cliArgs = {
		"--graphBasePath", graphInputPath,
		"--actionSetId", testActionSetId,
		"--isLookUpUrl", "lookupurl",
		"--workingPath", workingPath,
		"--cutConnectedComponent", "3"
	};
	final ArgumentApplicationParser parser = args(
		"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json", cliArgs);

	new SparkCreateMergeRels(parser, spark).run(isLookUpService);

	final String mergeRelPath = workingPath + "/" + testActionSetId + "/publication_mergerel";
	final FilterFunction<Relation> isMerges = rel -> rel.getRelClass().equalsIgnoreCase("merges");

	final Dataset<Relation> mergeRels = spark
		.read()
		.load(mergeRelPath)
		.as(Encoders.bean(Relation.class));

	// number of sources whose group exceeds the requested cut
	final long oversizedGroups = mergeRels
		.filter(isMerges)
		.groupBy("source")
		.agg(count("target").alias("cnt"))
		.select("source", "cnt")
		.where("cnt > 3")
		.count();

	assertEquals(0, oversizedGroups);

	// drop the output written by this run
	FileUtils.deleteDirectory(new File(mergeRelPath));
}
/**
 * Runs {@code SparkCreateMergeRels} without a cut and verifies the relations around the root
 * {@code 50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032}, in both directions, plus the overall count.
 */
@Test
@Order(3)
void createMergeRelsTest() throws Exception {
	final String[] cliArgs = {
		"--graphBasePath", graphInputPath,
		"--actionSetId", testActionSetId,
		"--isLookUpUrl", "lookupurl",
		"--workingPath", workingPath
	};
	final ArgumentApplicationParser parser = args(
		"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json", cliArgs);

	new SparkCreateMergeRels(parser, spark).run(isLookUpService);

	final String mergeRelPath = workingPath + "/" + testActionSetId + "/publication_mergerel";
	final Dataset<Relation> merges = spark
		.read()
		.load(mergeRelPath)
		.as(Encoders.bean(Relation.class));

	final Set<String> dups = Sets
		.newHashSet(
			"50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
			"50|doi_________::d5021b53204e4fdeab6ff5d5bc468032",
			"50|arXiv_______::c93aeb433eb90ed7a86e29be00791b7c");

	// relations going out of the root
	final List<Relation> outgoing = merges
		.filter("source == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
		.collectAsList();
	assertEquals(3, outgoing.size());
	for (Relation rel : outgoing) {
		assertEquals(ModelConstants.RESULT_RESULT, rel.getRelType());
		assertEquals(ModelConstants.DEDUP, rel.getSubRelType());
		assertEquals(ModelConstants.MERGES, rel.getRelClass());
		assertTrue(dups.contains(rel.getTarget()));
	}

	// relations pointing at the root
	final List<Relation> incoming = merges
		.filter("target == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
		.collectAsList();
	assertEquals(3, incoming.size());
	for (Relation rel : incoming) {
		assertEquals(ModelConstants.RESULT_RESULT, rel.getRelType());
		assertEquals(ModelConstants.DEDUP, rel.getSubRelType());
		assertEquals(ModelConstants.IS_MERGED_IN, rel.getRelClass());
		assertTrue(dups.contains(rel.getSource()));
	}

	assertEquals(32, merges.count());
}
/**
 * Runs {@code SparkCreateDedupRecord}, checks that three roots are written and delegates the per-root checks to
 * the {@code verifyRoot_case_*} helpers.
 */
@Test
@Order(4)
void createDedupRecordTest() throws Exception {
	final String[] cliArgs = {
		"--graphBasePath", graphInputPath,
		"--actionSetId", testActionSetId,
		"--isLookUpUrl", "lookupurl",
		"--workingPath", workingPath
	};
	final ArgumentApplicationParser parser = args(
		"/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json", cliArgs);

	new SparkCreateDedupRecord(parser, spark).run(isLookUpService);

	final String rootsPath = workingPath + "/" + testActionSetId + "/publication_deduprecord";
	final Dataset<Publication> roots = spark
		.read()
		.textFile(rootsPath)
		.map(asEntity(Publication.class), Encoders.bean(Publication.class));

	assertEquals(3, roots.count());

	final String inputPubsPath = DedupUtility.createEntityPath(graphInputPath, "publication");
	final Dataset<Publication> pubs = spark
		.read()
		.textFile(inputPubsPath)
		.map(asEntity(Publication.class), Encoders.bean(Publication.class));

	verifyRoot_case_1(roots, pubs);
	verifyRoot_case_2(roots, pubs);
	verifyRoot_case_3(roots, pubs);
}
/**
 * Compares the root {@code 50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032} with the input publication
 * {@code 50|doi_________::d5021b53204e4fdeab6ff5d5bc468032}: journal, publisher, pids and the instance
 * collected from "Crossref".
 *
 * @param roots the dedup records produced by the workflow
 * @param pubs the input publications
 */
private static void verifyRoot_case_1(Dataset<Publication> roots, Dataset<Publication> pubs) {
	final Publication dedupRoot = roots
		.filter("id = '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
		.first();
	assertNotNull(dedupRoot);

	final List<Publication> candidates = pubs
		.filter("id = '50|doi_________::d5021b53204e4fdeab6ff5d5bc468032'")
		.collectAsList();
	final Publication duplicate = candidates.get(0);

	assertEquals(duplicate.getJournal().getName(), dedupRoot.getJournal().getName());
	assertEquals(duplicate.getJournal().getIssnPrinted(), dedupRoot.getJournal().getIssnPrinted());
	assertEquals(duplicate.getPublisher().getValue(), dedupRoot.getPublisher().getValue());

	final Set<String> rootPidValues = new HashSet<>();
	for (StructuredProperty pid : dedupRoot.getPid()) {
		rootPidValues.add(pid.getValue());
	}
	final Set<String> duplicatePidValues = new HashSet<>();
	for (StructuredProperty pid : duplicate.getPid()) {
		duplicatePidValues.add(pid.getValue());
	}

	// root and duplicate must share at least one pid
	assertFalse(Sets.intersection(rootPidValues, duplicatePidValues).isEmpty());
	assertTrue(rootPidValues.contains("10.1109/jstqe.2022.3205716"));

	final Optional<Instance> maybeCrossref = dedupRoot
		.getInstance()
		.stream()
		.filter(inst -> inst.getCollectedfrom().getValue().equals("Crossref"))
		.findFirst();
	assertTrue(maybeCrossref.isPresent());

	final Instance crossrefInstance = maybeCrossref.get();
	assertEquals("OPEN", crossrefInstance.getAccessright().getClassid());
	assertEquals("Open Access", crossrefInstance.getAccessright().getClassname());
	assertEquals(OpenAccessRoute.hybrid, crossrefInstance.getAccessright().getOpenAccessRoute());
	assertEquals(
		"IEEE Journal of Selected Topics in Quantum Electronics", crossrefInstance.getHostedby().getValue());
	assertEquals("0001", crossrefInstance.getInstancetype().getClassid());
	assertEquals("Article", crossrefInstance.getInstancetype().getClassname());
}
/**
 * Compares the root {@code 50|doi_dedup___::18aff3b55fb6876466a5d4bd82434885} with the input publication
 * {@code 50|doi_________::18aff3b55fb6876466a5d4bd82434885} (journal and publisher) and checks that every
 * collectedfrom value of the root also occurs among the input publications.
 *
 * @param roots the dedup records produced by the workflow
 * @param pubs the input publications
 */
private void verifyRoot_case_2(Dataset<Publication> roots, Dataset<Publication> pubs) {
	final Publication dedupRoot = roots
		.filter("id = '50|doi_dedup___::18aff3b55fb6876466a5d4bd82434885'")
		.first();
	assertNotNull(dedupRoot);

	final Publication duplicate = pubs
		.filter("id = '50|doi_________::18aff3b55fb6876466a5d4bd82434885'")
		.first();

	assertEquals(duplicate.getJournal().getName(), dedupRoot.getJournal().getName());
	assertEquals(duplicate.getJournal().getIssnOnline(), dedupRoot.getJournal().getIssnOnline());
	assertEquals(duplicate.getJournal().getVol(), dedupRoot.getJournal().getVol());
	assertEquals(duplicate.getPublisher().getValue(), dedupRoot.getPublisher().getValue());

	// collectedfrom values found anywhere in the input publications
	final Set<String> inputCollectedFrom = new HashSet<>();
	for (Publication pub : pubs.collectAsList()) {
		for (KeyValue cf : pub.getCollectedfrom()) {
			inputCollectedFrom.add(cf.getValue());
		}
	}

	final Set<String> rootCollectedFrom = new HashSet<>();
	for (KeyValue cf : dedupRoot.getCollectedfrom()) {
		rootCollectedFrom.add(cf.getValue());
	}

	// the root must not carry a collectedfrom that no input publication has
	assertTrue(inputCollectedFrom.containsAll(rootCollectedFrom));
}
/**
 * Compares the root {@code 50|dedup_wf_001::31ca734cc22181b704c4aa8fd050062a} with the input publication
 * {@code 50|od_______166::31ca734cc22181b704c4aa8fd050062a} (publisher) and checks that every collectedfrom
 * value of the root also occurs among the input publications.
 *
 * @param roots the dedup records produced by the workflow
 * @param pubs the input publications
 */
private void verifyRoot_case_3(Dataset<Publication> roots, Dataset<Publication> pubs) {
	final Publication dedupRoot = roots
		.filter("id = '50|dedup_wf_001::31ca734cc22181b704c4aa8fd050062a'")
		.first();
	assertNotNull(dedupRoot);

	final Publication pivot = pubs
		.filter("id = '50|od_______166::31ca734cc22181b704c4aa8fd050062a'")
		.first();

	assertEquals(pivot.getPublisher().getValue(), dedupRoot.getPublisher().getValue());

	// collectedfrom values found anywhere in the input publications
	final Set<String> inputCollectedFrom = new HashSet<>();
	for (Publication pub : pubs.collectAsList()) {
		for (KeyValue cf : pub.getCollectedfrom()) {
			inputCollectedFrom.add(cf.getValue());
		}
	}

	final Set<String> rootCollectedFrom = new HashSet<>();
	for (KeyValue cf : dedupRoot.getCollectedfrom()) {
		rootCollectedFrom.add(cf.getValue());
	}

	// the root must not carry a collectedfrom that no input publication has
	assertTrue(inputCollectedFrom.containsAll(rootCollectedFrom));
}
/**
 * Runs {@code SparkUpdateEntity} and checks the size of the output publication set and that the number of
 * publications flagged {@code deletedbyinference} equals the number of distinct {@code merges} targets.
 */
@Test
@Order(6)
void updateEntityTest() throws Exception {
	final String[] cliArgs = {
		"--graphBasePath", graphInputPath,
		"--workingPath", workingPath,
		"--dedupGraphPath", graphOutputPath
	};
	final ArgumentApplicationParser parser = args(
		"/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json", cliArgs);

	new SparkUpdateEntity(parser, spark).run(isLookUpService);

	final String outputPubsPath = graphOutputPath + "/publication";
	final String mergeRelPath = workingPath + "/" + testActionSetId + "/publication_mergerel";

	final long publicationCount = spark.read().textFile(outputPubsPath).count();

	final MapFunction<Relation, String> toTarget = Relation::getTarget;
	final long mergedCount = spark
		.read()
		.load(mergeRelPath)
		.as(Encoders.bean(Relation.class))
		.where("relClass=='merges'")
		.map(toTarget, Encoders.STRING())
		.distinct()
		.count();

	assertEquals(19, publicationCount); // 16 originals + 3 roots

	final MapFunction<Publication, String> toId = OafEntity::getId;
	final long deletedCount = spark
		.read()
		.textFile(outputPubsPath)
		.map(asEntity(Publication.class), Encoders.bean(Publication.class))
		.filter("datainfo.deletedbyinference == true")
		.map(toId, Encoders.STRING())
		.distinct()
		.count();

	assertEquals(mergedCount, deletedCount);
}
/**
 * Loads a classpath resource and returns its whole content decoded as UTF-8.
 *
 * @param path classpath location of the resource, resolved against {@code SparkPublicationRootsTest}
 * @return the content of the resource
 * @throws IOException if the resource does not exist or cannot be read
 */
private static String classPathResourceAsString(String path) throws IOException {
	// try-with-resources: the stream opened here was previously never closed
	try (java.io.InputStream stream = SparkPublicationRootsTest.class.getResourceAsStream(path)) {
		if (stream == null) {
			// getResourceAsStream signals a missing resource with null; report it explicitly instead of an NPE
			throw new IOException("Classpath resource not found: " + path);
		}
		// explicit charset: the one-argument IOUtils.toString decodes with the platform default
		return IOUtils.toString(stream, java.nio.charset.StandardCharsets.UTF_8);
	}
}
/**
 * Builds a Spark map function that deserializes a JSON string into the given entity class using the shared
 * {@code MAPPER}.
 *
 * @param clazz target entity class
 * @param <T> entity type
 * @return the deserializing map function
 */
private static <T extends OafEntity> MapFunction<String, T> asEntity(Class<T> clazz) {
	final MapFunction<String, T> deserializer = json -> MAPPER.readValue(json, clazz);
	return deserializer;
}
/**
 * Creates an argument parser from a parameter specification on the classpath and feeds it the given arguments.
 *
 * @param paramSpecs classpath location of the parameter specification
 * @param args command line style arguments to parse
 * @return the parser after {@code parseArgument} has been applied
 * @throws IOException if the specification cannot be read
 * @throws ParseException if the arguments cannot be parsed
 */
private ArgumentApplicationParser args(String paramSpecs, String[] args) throws IOException, ParseException {
	final String specification = classPathResourceAsString(paramSpecs);
	final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(specification);
	argumentParser.parseArgument(args);
	return argumentParser;
}
}

View File

@ -0,0 +1,251 @@
package eu.dnetlib.dhp.oa.dedup;
import static java.nio.file.Files.createTempDirectory;
import static org.apache.spark.sql.functions.count;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.lenient;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class SparkPublicationRootsTest2 implements Serializable {
@Mock(serializable = true)
ISLookUpService isLookUpService;
private static SparkSession spark;
private static String workingPath;
private static String graphInputPath;
private static String graphOutputPath;
private static final String testActionSetId = "test-orchestrator";
private static Path testBaseTmpPath;
private static final ObjectMapper MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
/**
 * Prepares the shared fixture: copies the {@code /eu/dnetlib/dhp/dedup/root} resources into a fresh temporary
 * directory, adds the {@code alterations/publication/publication_1.gz} file to the publication input, derives
 * the working/input/output paths and starts a local Spark session.
 *
 * @throws IOException if the temporary directory cannot be created or populated
 * @throws URISyntaxException if a fixture resource URL cannot be converted to a URI
 */
@BeforeAll
public static void init() throws IOException, URISyntaxException {
	final Class<?> self = SparkPublicationRootsTest2.class;

	testBaseTmpPath = createTempDirectory(self.getSimpleName() + "-");
	final Path inputRoot = testBaseTmpPath.resolve("input");
	final Path entitiesRoot = inputRoot.resolve("entities");

	final File fixtureDir = Paths.get(self.getResource("/eu/dnetlib/dhp/dedup/root").toURI()).toFile();
	FileUtils.copyDirectory(fixtureDir, inputRoot.toFile());

	// extra publication file placed next to the regular publication input
	final File alteredPublications = Paths
		.get(self.getResource("/eu/dnetlib/dhp/dedup/root/alterations/publication/publication_1.gz").toURI())
		.toFile();
	FileUtils.copyFileToDirectory(alteredPublications, entitiesRoot.resolve("publication").toFile());

	workingPath = testBaseTmpPath.resolve("workingPath").toString();
	graphInputPath = entitiesRoot.toString();
	graphOutputPath = testBaseTmpPath.resolve("output").toString();

	final SparkConf sparkConf = new SparkConf().set("spark.sql.shuffle.partitions", "10");
	final SparkSession.Builder sessionBuilder = SparkSession
		.builder()
		.appName(self.getSimpleName())
		.master("local[*]")
		.config(sparkConf);
	spark = sessionBuilder.getOrCreate();
}
/**
 * Stubs the mocked lookup service before each test: queries containing the action set id answer with the
 * orchestrator profile, queries containing {@code publication} answer with the publication dedup configuration.
 *
 * @throws IOException if one of the classpath resources cannot be read
 * @throws ISLookUpException declared by the stubbed lookup method
 */
@BeforeEach
public void setUp() throws IOException, ISLookUpException {
	final String orchestratorProfile = classPathResourceAsString(
		"/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator_publication.xml");
	final String publicationConf = classPathResourceAsString(
		"/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json");

	// lenient: a test may not trigger both lookups
	lenient()
		.when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId)))
		.thenReturn(orchestratorProfile);
	lenient()
		.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication")))
		.thenReturn(publicationConf);
}
/**
 * Removes the temporary test directory and closes the Spark session obtained in {@code init()}.
 *
 * @throws IOException if the temporary directory cannot be deleted
 */
@AfterAll
public static void tearDown() throws IOException {
	try {
		// null when init() failed before the directory was created
		if (testBaseTmpPath != null) {
			FileUtils.deleteDirectory(testBaseTmpPath.toFile());
		}
	} finally {
		// the session was previously left open after the class finished
		if (spark != null) {
			spark.close();
		}
	}
}
/**
 * Runs similarity relation creation, merge relation creation and dedup record creation in sequence on the
 * altered input, then verifies the merge relations and the root
 * {@code 50|doi_dedup___::b3aec7985136e36827176aaa1dd5082d}.
 */
@Test
@Order(7)
void dedupAlteredDatasetTest() throws Exception {
	// step 1: similarity relations
	final String[] simRelArgs = {
		"--graphBasePath", graphInputPath,
		"--actionSetId", testActionSetId,
		"--isLookUpUrl", "lookupurl",
		"--workingPath", workingPath,
		"--numPartitions", "5"
	};
	new SparkCreateSimRels(
		args("/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json", simRelArgs), spark)
			.run(isLookUpService);

	// step 2: merge relations
	final String[] mergeRelArgs = {
		"--graphBasePath", graphInputPath,
		"--actionSetId", testActionSetId,
		"--isLookUpUrl", "lookupurl",
		"--workingPath", workingPath
	};
	new SparkCreateMergeRels(
		args("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json", mergeRelArgs), spark)
			.run(isLookUpService);

	final String mergeRelPath = workingPath + "/" + testActionSetId + "/publication_mergerel";
	final Dataset<Relation> merges = spark
		.read()
		.load(mergeRelPath)
		.as(Encoders.bean(Relation.class));

	final MapFunction<Relation, String> toTarget = Relation::getTarget;
	final long distinctMergeTargets = merges
		.filter("relclass == 'isMergedIn'")
		.map(toTarget, Encoders.STRING())
		.distinct()
		.count();
	assertEquals(3, distinctMergeTargets);

	final long relsFromRoot = merges
		.filter("source == '50|doi_dedup___::b3aec7985136e36827176aaa1dd5082d'")
		.count();
	assertEquals(4, relsFromRoot);

	// step 3: dedup records
	final String[] dedupRecordArgs = {
		"--graphBasePath", graphInputPath,
		"--actionSetId", testActionSetId,
		"--isLookUpUrl", "lookupurl",
		"--workingPath", workingPath
	};
	new SparkCreateDedupRecord(
		args("/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json", dedupRecordArgs), spark)
			.run(isLookUpService);

	final String rootsPath = workingPath + "/" + testActionSetId + "/publication_deduprecord";
	final Dataset<Publication> roots = spark
		.read()
		.textFile(rootsPath)
		.map(asEntity(Publication.class), Encoders.bean(Publication.class));

	assertEquals(3, roots.count());

	final Dataset<Publication> pubs = spark
		.read()
		.textFile(DedupUtility.createEntityPath(graphInputPath, "publication"))
		.map(asEntity(Publication.class), Encoders.bean(Publication.class));

	final Publication dedupRoot = roots
		.filter("id = '50|doi_dedup___::b3aec7985136e36827176aaa1dd5082d'")
		.first();
	assertNotNull(dedupRoot);

	final List<Publication> candidates = pubs
		.filter("id = '50|doi_________::b3aec7985136e36827176aaa1dd5082d'")
		.collectAsList();
	final Publication duplicate = candidates.get(0);

	assertEquals(duplicate.getDateofacceptance().getValue(), dedupRoot.getDateofacceptance().getValue());
	assertEquals(duplicate.getJournal().getName(), dedupRoot.getJournal().getName());
	assertEquals(duplicate.getJournal().getIssnPrinted(), dedupRoot.getJournal().getIssnPrinted());
	assertEquals(duplicate.getPublisher().getValue(), dedupRoot.getPublisher().getValue());

	final Set<String> rootPidValues = new HashSet<>();
	for (StructuredProperty pid : dedupRoot.getPid()) {
		rootPidValues.add(pid.getValue());
	}
	final Set<String> duplicatePidValues = new HashSet<>();
	for (StructuredProperty pid : duplicate.getPid()) {
		duplicatePidValues.add(pid.getValue());
	}

	// root and duplicate must share at least one pid
	assertFalse(Sets.intersection(rootPidValues, duplicatePidValues).isEmpty());
	assertTrue(rootPidValues.contains("10.1109/jstqe.2022.3205716"));
	assertTrue(rootPidValues.contains("10.1109/jstqe.2023.9999999"));

	final Optional<Instance> maybeCrossref = dedupRoot
		.getInstance()
		.stream()
		.filter(inst -> inst.getCollectedfrom().getValue().equals("Crossref"))
		.findFirst();
	assertTrue(maybeCrossref.isPresent());

	final Instance crossrefInstance = maybeCrossref.get();
	assertEquals("OPEN", crossrefInstance.getAccessright().getClassid());
	assertEquals("Open Access", crossrefInstance.getAccessright().getClassname());
	assertEquals(OpenAccessRoute.hybrid, crossrefInstance.getAccessright().getOpenAccessRoute());
	assertEquals(
		"IEEE Journal of Selected Topics in Quantum Electronics", crossrefInstance.getHostedby().getValue());
	assertEquals("0001", crossrefInstance.getInstancetype().getClassid());
	assertEquals("Article", crossrefInstance.getInstancetype().getClassname());
}
/**
 * Loads a classpath resource and returns its whole content decoded as UTF-8.
 *
 * @param path classpath location of the resource, resolved against {@code SparkPublicationRootsTest2}
 * @return the content of the resource
 * @throws IOException if the resource does not exist or cannot be read
 */
private static String classPathResourceAsString(String path) throws IOException {
	// try-with-resources: the stream opened here was previously never closed
	try (java.io.InputStream stream = SparkPublicationRootsTest2.class.getResourceAsStream(path)) {
		if (stream == null) {
			// getResourceAsStream signals a missing resource with null; report it explicitly instead of an NPE
			throw new IOException("Classpath resource not found: " + path);
		}
		// explicit charset: the one-argument IOUtils.toString decodes with the platform default
		return IOUtils.toString(stream, java.nio.charset.StandardCharsets.UTF_8);
	}
}
/**
 * Builds a Spark map function that deserializes a JSON string into the given entity class using the shared
 * {@code MAPPER}.
 *
 * @param clazz target entity class
 * @param <T> entity type
 * @return the deserializing map function
 */
private static <T extends OafEntity> MapFunction<String, T> asEntity(Class<T> clazz) {
	final MapFunction<String, T> deserializer = json -> MAPPER.readValue(json, clazz);
	return deserializer;
}
/**
 * Creates an argument parser from a parameter specification on the classpath and feeds it the given arguments.
 *
 * @param paramSpecs classpath location of the parameter specification
 * @param args command line style arguments to parse
 * @return the parser after {@code parseArgument} has been applied
 * @throws IOException if the specification cannot be read
 * @throws ParseException if the arguments cannot be parsed
 */
private ArgumentApplicationParser args(String paramSpecs, String[] args) throws IOException, ParseException {
	final String specification = classPathResourceAsString(paramSpecs);
	final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(specification);
	argumentParser.parseArgument(args);
	return argumentParser;
}
}

View File

@ -168,11 +168,11 @@ public class SparkStatsTest implements Serializable {
.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats") .textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats")
.count(); .count();
assertEquals(477, orgs_blocks); assertEquals(480, orgs_blocks);
assertEquals(295, pubs_blocks); assertEquals(295, pubs_blocks);
assertEquals(122, sw_blocks); assertEquals(122, sw_blocks);
assertEquals(191, ds_blocks); assertEquals(191, ds_blocks);
assertEquals(171, orp_blocks); assertEquals(178, orp_blocks);
} }
@AfterAll @AfterAll

View File

@ -0,0 +1,24 @@
<RESOURCE_PROFILE>
<HEADER>
<RESOURCE_IDENTIFIER value=""/>
<RESOURCE_TYPE value="DedupOrchestrationDSResourceType"/>
<RESOURCE_KIND value="DedupOrchestrationDSResources"/>
<RESOURCE_URI value=""/>
<DATE_OF_CREATION value="2001-12-31T12:00:00"/>
</HEADER>
<BODY>
<CONFIGURATION enabled="true">
<DEDUPLICATION>
<ENTITY code="20" label="Organization" name="organization"/>
<ACTION_SET id="test-orchestrator"/>
<SCAN_SEQUENCE>
<SCAN id="publication"/>
</SCAN_SEQUENCE>
</DEDUPLICATION>
</CONFIGURATION>
<STATUS>
<LAST_UPDATE value="2001-12-31T12:00:00"/>
</STATUS>
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
</BODY>
</RESOURCE_PROFILE>

View File

@ -0,0 +1,47 @@
# Root logger option
log4j.rootLogger=DEBUG, stdout
# Direct log messages to stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
# Change this to set Spark log level
log4j.logger.org.apache.spark=ERROR
log4j.rootCategory=WARN
# Silence akka remoting
log4j.logger.Remoting=WARN
# Ignore messages below warning level from Jetty, because it's a bit verbose
log4j.logger.org.eclipse.jetty=WARN
log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitterFactory=WARN
log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter=WARN
#log4j.logger.org.apache.parquet.hadoop.ParquetOutputFormat=WARN
#log4j.logger.org.apache.parquet.hadoop.InternalParquetRecordWriter=WARN
log4j.logger.org.apache.hadoop.io.compress.CodecPool=WARN
#log4j.logger.org.apache.hadoop.io.compress=WARN
#log4j.logger.org.apache.parquet.hadoop.codec.CodecConfig=WARN
log4j.logger.parquet.hadoop.ColumnChunkPageWriteStore=ERROR
log4j.logger.com.jayway.jsonpath.internal.path.CompiledPath=WARN
log4j.logger.org.apache.parquet.hadoop.ParquetRecordReader=ERROR
log4j.logger.parquet.hadoop=WARN
log4j.logger.org.eclipse.jetty.server.handler.ContextHandlerCollection=WARN
log4j.logger.org.spark_project.jetty.util.component.ContainerLifeCycle=WARN
log4j.logger.org.apache.hadoop.mapred.FileInputFormat=WARN
log4j.logger.org.spark_project.jetty.servlet.ServletHandler=WARN
log4j.logger.org.apache.commons.beanutils.converters.BooleanConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.StringConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.LongConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.ArrayConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.FloatConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.IntegerConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.DoubleConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.CharacterConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.ByteConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.BigIntegerConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.BigDecimalConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.ShortConverter=WARN
log4j.logger.org.apache.commons.beanutils.BeanUtils=WARN

View File

@ -26,6 +26,9 @@ public class QueryInformationSystem {
+ " return " + " return "
+ " <community> " + " <community> "
+ " { $x//CONFIGURATION/context/@id} " + " { $x//CONFIGURATION/context/@id} "
+ " <advancedConstraints>" +
"{$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() }" +
"</advancedConstraints>"
+ " <subjects> " + " <subjects> "
+ " {for $y in tokenize($subj,',') " + " {for $y in tokenize($subj,',') "
+ " return " + " return "

View File

@ -9,16 +9,16 @@ import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import com.google.gson.Gson; import com.google.gson.Gson;
import com.jayway.jsonpath.DocumentContext; import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath; import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
/** Created by miriam on 02/08/2018. */ /** Created by miriam on 02/08/2018. */
public class ResultTagger implements Serializable { public class ResultTagger implements Serializable {

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;
/**
 * Selection criterion annotated as the {@code starts_with} verb: a value is accepted when it begins with the
 * configured parameter.
 */
@VerbClass("starts_with")
public class StartsWithVerb implements Selection, Serializable {

	/** Prefix the candidate values are compared against. */
	private String param;

	/** Creates a verb without a prefix; one can be supplied later through {@link #setParam(String)}. */
	public StartsWithVerb() {
	}

	/**
	 * Creates a verb for the given prefix.
	 *
	 * @param param prefix the candidate values must start with
	 */
	public StartsWithVerb(final String param) {
		this.param = param;
	}

	/**
	 * Tests a value against the configured prefix.
	 *
	 * @param value the string to test
	 * @return {@code true} when {@code value} starts with the configured prefix
	 */
	@Override
	public boolean apply(String value) {
		final boolean hasPrefix = value.startsWith(this.param);
		return hasPrefix;
	}

	/**
	 * @return the configured prefix
	 */
	public String getParam() {
		return this.param;
	}

	/**
	 * @param param the prefix to use from now on
	 */
	public void setParam(String param) {
		this.param = param;
	}
}

View File

@ -16,6 +16,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row; import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
@ -45,7 +46,9 @@ public class BulkTagJobTest {
+ " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ " \"contributor\" : \"$['contributor'][*]['value']\"," + " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\", " + " \"description\" : \"$['description'][*]['value']\", "
+ " \"subject\" :\"$['subject'][*]['value']\" }"; + " \"subject\" :\"$['subject'][*]['value']\" , " +
"\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='subject:fos')].value\"} ";
private static SparkSession spark; private static SparkSession spark;
@ -769,28 +772,14 @@ public class BulkTagJobTest {
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query); org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
idExplodeCommunity.show(false); idExplodeCommunity.show(false);
Assertions.assertEquals(4, idExplodeCommunity.count()); Assertions.assertEquals(5, idExplodeCommunity.count());
Assertions Assertions
.assertEquals( .assertEquals(
3, idExplodeCommunity.filter("provenance = 'community:datasource'").count()); 3, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
Assertions Assertions
.assertEquals( .assertEquals(
1, idExplodeCommunity.filter("provenance = 'community:advconstraint'").count()); 2, idExplodeCommunity.filter("provenance = 'community:advconstraint'").count());
} }
// @Test
// void test1(){
// ProtoMap params = new Gson().fromJson(pathMap, ProtoMap.class);
// HashMap<String, String> param = new HashMap<>();
// for (String key : params.keySet()) {
// try {
// param.put(key, jsonContext.read(params.get(key)));
// } catch (com.jayway.jsonpath.PathNotFoundException e) {
// param.put(key, new ArrayList<>());
// }
// }
// return param;
// }
// }
} }

View File

@ -844,6 +844,89 @@
<organizations/> <organizations/>
</community> </community>
<community id="dariah"> <community id="dariah">
<advancedConstraints>
{
"criteria": [
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "North America"
},
{
"verb": "contains",
"field": "fos",
"value": "05"
}
]
},
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "North America"
},
{
"verb": "contains",
"field": "fos",
"value": "06"
}
]
},
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Mexico"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "United States"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Canada"
},
{
"verb": "contains",
"field": "fos",
"value": "05"
}
]
},
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Mexico"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "United States"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Canada"
},
{
"verb": "contains",
"field": "fos",
"value": "06"
}
]
}
]
}
</advancedConstraints>
<subjects/> <subjects/>
<datasources> <datasources>
<datasource> <datasource>
@ -1174,7 +1257,9 @@
</zenodocommunities> </zenodocommunities>
<organizations/> <organizations/>
</community> </community>
<community id="euromarine"> <community id="euromarine">
<subjects/> <subjects/>
<datasources/> <datasources/>
<zenodocommunities/> <zenodocommunities/>

View File

@ -142,6 +142,26 @@ public class IndexRecordTransformerTest {
testRecordTransformation(record); testRecordTransformation(record);
} }
/**
 * Runs the record transformation on the {@code eosc-future/zenodo7353841.xml} test resource.
 */
@Test
public void testForEOSCFutureZenodo7353841() throws IOException, TransformerException {
	// close the resource stream when done; decode explicitly as UTF-8 instead of the platform default
	try (java.io.InputStream stream = getClass().getResourceAsStream("eosc-future/zenodo7353841.xml")) {
		final String record = IOUtils.toString(stream, java.nio.charset.StandardCharsets.UTF_8);
		testRecordTransformation(record);
	}
}
/**
 * Runs the record transformation on the {@code eosc-future/zenodo7351393.xml} test resource.
 */
@Test
public void testForEOSCFutureZenodo7351393() throws IOException, TransformerException {
	// close the resource stream when done; decode explicitly as UTF-8 instead of the platform default
	try (java.io.InputStream stream = getClass().getResourceAsStream("eosc-future/zenodo7351393.xml")) {
		final String record = IOUtils.toString(stream, java.nio.charset.StandardCharsets.UTF_8);
		testRecordTransformation(record);
	}
}
/**
 * Runs the record transformation on the {@code eosc-future/zenodo7351221.xml} test resource.
 */
@Test
public void testForEOSCFutureZenodo7351221() throws IOException, TransformerException {
	// close the resource stream when done; decode explicitly as UTF-8 instead of the platform default
	try (java.io.InputStream stream = getClass().getResourceAsStream("eosc-future/zenodo7351221.xml")) {
		final String record = IOUtils.toString(stream, java.nio.charset.StandardCharsets.UTF_8);
		testRecordTransformation(record);
	}
}
@Test @Test
void testDoiUrlNormalization() throws MalformedURLException { void testDoiUrlNormalization() throws MalformedURLException {

View File

@ -0,0 +1,99 @@
<record>
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header>
<dri:objIdentifier>doi_________::9cb0664d4c891c4baaf73f007c0c9de0</dri:objIdentifier>
<dri:dateOfCollection>2022-11-25T12:55:13Z</dri:dateOfCollection>
<dri:status>under curation</dri:status>
<counters />
</header>
<metadata>
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
<oaf:result>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">COVID-19 Knowledge Graph: A semantic resource embedding biological and chemical entities</title>
<creator rank="1" name="" surname="">Karki, Reagon</creator>
<dateofacceptance />
<resulttype classid="software" classname="software" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
<language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages" />
<description><p>A Knowledge graph representation of compounds and associated biological entities in the BY-COVID and EOSC Future project.</p> <p><strong>Current status</strong></p> <ul> <li>Number of Nodes: 35952</li> <li>Number of Edges: 279462</li> <li>Human Proteins: 1347</li> <li>Assay: 15835</li> <li>Chemical/Compound: 4096</li> <li>Mechanism of Action: 739</li> <li>Pathway: 1513</li> <li>Disease: 1585</li> <li>SideEffect: 7420</li> <li>Biological Process: 2085</li> <li>Molecular Function: 1332</li> </ul> <p>Please check the BY_COVID_update_August.ipynb for understanding step wise process of KG generation and KG statistics. The KG has been exported to formats such as graphml, sif and so on for visualizations in other platforms. For example, the graphml file can be imported to Cytoscape directly. These files are located in &#39;data\export&#39; folder.</p> <p></p></description>
<country classid="" classname="" schemeid="" schemename="" />
<subject classid="" classname="" schemeid="" schemename="" />
<relevantdate classid="" classname="" schemeid="" schemename="" />
<publisher>Zenodo</publisher>
<embargoenddate />
<journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol="" />
<source />
<fulltext />
<format />
<storagedate />
<resourcetype classid="" classname="" schemeid="" schemename="" />
<device />
<size />
<version />
<lastmetadataupdate />
<metadataversionnumber />
<documentationUrl />
<codeRepositoryUrl />
<programmingLanguage classid="" classname="" schemeid="" schemename="" />
<contactperson />
<contactgroup />
<tool />
<originalId>oai:zenodo.org:7351221</originalId>
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<pid classid="oai" classname="Open Archives Initiative" schemeid="dnet:pid_types" schemename="dnet:pid_types">oai:zenodo.org:7351221</pid>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5281/zenodo.7351221</pid>
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<context id="covid-19" label="COVID-19" type="community"></context>
<datainfo>
<inferred>false</inferred>
<deletedbyinference>false</deletedbyinference>
<trust>0.9</trust>
<inferenceprovenance />
<provenanceaction classid="user:insert" classname="user:insert" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
</datainfo>
<rels>
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::256485716fdb9f5ca69007b7ca5a072b</to>
<code>101017536</code>
<acronym>EOSC Future</acronym>
<title>EOSC Future</title>
<contracttype classid="" classname="" schemeid="" schemename="" />
<funding>
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
<funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
</funding>
<websiteurl />
</rel>
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::4a3254eac2997eee0a9dcb7a7daedb81</to>
<code>101046203</code>
<acronym>BY-COVID</acronym>
<title>Beyond COVID</title>
<contracttype classid="" classname="" schemeid="" schemename="" />
<funding>
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
<funding_level_0 name="Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based">ec__________::EC::Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based</funding_level_0>
</funding>
<websiteurl />
</rel>
</rels>
<children>
<instance id="od______2659::040cee965a4544e343a2ba149783c3fc">
<instancetype classid="0029" classname="Software" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<dateofacceptance />
<webresource>
<url>https://zenodo.org/record/7351221</url>
</webresource>
</instance>
</children>
</oaf:result>
</oaf:entity>
</metadata>
</result>
</record>

View File

@ -0,0 +1,100 @@
<record>
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header>
<dri:objIdentifier>doi_________::07fdccabd77830e3caccf0b33c083f1b</dri:objIdentifier>
<dri:dateOfCollection>2022-11-25T01:08:31Z</dri:dateOfCollection>
<dri:status>under curation</dri:status>
<counters />
</header>
<metadata>
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
<oaf:result>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">Monkeypox Knowledge Graph: A semantic resource embedding biological and chemical entities</title>
<creator rank="1" name="" surname="">Karki, Reagon</creator>
<creator rank="2" name="" surname="">Andrea, Zaliani</creator>
<creator rank="3" name="" surname="">Gadiya, Yojana</creator>
<creator rank="4" name="" surname="">Gribbon, Philip</creator>
<dateofacceptance />
<resulttype classid="software" classname="software" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
<language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages" />
<description><p>The Monkeypox KG is built using viral and human proteins reported in different resources. Additionally, the KG represents chemicals tested against Monkeypox and their targets, associated biological processes, molecular functions, diseases and side effects.</p> <p><strong>KG status</strong></p> <p>Version 1 stats:</p> <ul> <li>Number of Nodes: 8235</li> <li>Number of Edges: 40422</li> </ul> <p>Version 2 stats (2nd September) :</p> <ul> <li>Number of Nodes: 9129</li> <li>Number of Edges: 44568</li> </ul> <p>Please check the graph.ipynb for understanding step wise process of KG generation and KG statistics. The KG has been exported to formats such as graphml, sif and so on for visualizations in other platforms. For example, the graphml file can be imported to Cytoscape directly. These files are located in &#39;data\export&#39; folder.</p> <p></p></description>
<country classid="" classname="" schemeid="" schemename="" />
<subject classid="" classname="" schemeid="" schemename="" />
<relevantdate classid="" classname="" schemeid="" schemename="" />
<publisher>Zenodo</publisher>
<embargoenddate />
<journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol="" />
<source />
<fulltext />
<format />
<storagedate />
<resourcetype classid="" classname="" schemeid="" schemename="" />
<device />
<size />
<version />
<lastmetadataupdate />
<metadataversionnumber />
<documentationUrl />
<codeRepositoryUrl />
<programmingLanguage classid="" classname="" schemeid="" schemename="" />
<contactperson />
<contactgroup />
<tool />
<originalId>oai:zenodo.org:7351393</originalId>
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<pid classid="oai" classname="Open Archives Initiative" schemeid="dnet:pid_types" schemename="dnet:pid_types">oai:zenodo.org:7351393</pid>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5281/zenodo.7351393</pid>
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<datainfo>
<inferred>false</inferred>
<deletedbyinference>false</deletedbyinference>
<trust>0.9</trust>
<inferenceprovenance />
<provenanceaction classid="user:insert" classname="user:insert" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
</datainfo>
<rels>
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::256485716fdb9f5ca69007b7ca5a072b</to>
<code>101017536</code>
<acronym>EOSC Future</acronym>
<title>EOSC Future</title>
<contracttype classid="" classname="" schemeid="" schemename="" />
<funding>
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
<funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
</funding>
<websiteurl />
</rel>
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::4a3254eac2997eee0a9dcb7a7daedb81</to>
<code>101046203</code>
<acronym>BY-COVID</acronym>
<title>Beyond COVID</title>
<contracttype classid="" classname="" schemeid="" schemename="" />
<funding>
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
<funding_level_0 name="Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based">ec__________::EC::Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based</funding_level_0>
</funding>
<websiteurl />
</rel>
</rels>
<children>
<instance id="od______2659::db2bc6381545f80dc9feec808a173ec0">
<instancetype classid="0029" classname="Software" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<dateofacceptance />
<webresource>
<url>https://zenodo.org/record/7351393</url>
</webresource>
</instance>
</children>
</oaf:result>
</oaf:entity>
</metadata>
</result>
</record>

View File

@ -0,0 +1,85 @@
<record>
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header>
<dri:objIdentifier>doi_________::93d39dd7edef016928788c3500e149f1</dri:objIdentifier>
<dri:dateOfCollection>2022-11-24T08:41:37Z</dri:dateOfCollection>
<dri:status>under curation</dri:status>
<counters/>
</header>
<metadata>
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
<oaf:result>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">ENVRI SP - Dashboard State of the Environment - Ocean Indicators</title>
<creator rank="1" name="" surname="">Tjerk Krijger</creator>
<dateofacceptance />
<resulttype classid="other" classname="other" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
<language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages" />
<description><p>The attached .yaml file is used as input to the Dashboard State of the Environment, which is a science project of the ENVRI-FAIR science cluster within EOSC-FUTURE. The contents of the file enable the visualization of Ocean indicators on the dashboard. It is possible to download the attached file and change the contents to include indicators from different domains such as atmosphere or biodiversity.</p></description>
<country classid="" classname="" schemeid="" schemename="" />
<subject classid="" classname="" schemeid="" schemename="" />
<relevantdate classid="" classname="" schemeid="" schemename="" />
<publisher>Zenodo</publisher>
<embargoenddate />
<journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol="" />
<source />
<fulltext />
<format />
<storagedate />
<resourcetype classid="" classname="" schemeid="" schemename="" />
<device />
<size />
<version />
<lastmetadataupdate />
<metadataversionnumber />
<documentationUrl />
<codeRepositoryUrl />
<programmingLanguage classid="" classname="" schemeid="" schemename="" />
<contactperson />
<contactgroup />
<tool />
<originalId>oai:zenodo.org:7353841</originalId>
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<pid classid="oai" classname="Open Archives Initiative" schemeid="dnet:pid_types" schemename="dnet:pid_types">oai:zenodo.org:7353841</pid>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5281/zenodo.7353841</pid>
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<datainfo>
<inferred>false</inferred>
<deletedbyinference>false</deletedbyinference>
<trust>0.9</trust>
<inferenceprovenance />
<provenanceaction classid="user:insert" classname="user:insert" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
</datainfo>
<rels>
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::256485716fdb9f5ca69007b7ca5a072b</to>
<code>101017536</code>
<acronym>EOSC Future</acronym>
<title>EOSC Future</title>
<contracttype classid="" classname="" schemeid="" schemename="" />
<funding>
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
<funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
</funding>
<websiteurl />
</rel>
</rels>
<children>
<instance id="od______2659::3e4323c221f269e5f3d6db4c61dd2ec8">
<instancetype classid="0020" classname="Other ORP type" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<dateofacceptance />
<webresource>
<url>https://zenodo.org/record/7353841</url>
</webresource>
</instance>
</children>
</oaf:result>
</oaf:entity>
</metadata>
</result>
</record>

View File

@ -48,7 +48,9 @@ create table TARGET.result stored as parquet as
'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
'openorgs____::e15adb13c4dadd49de4d35c39b5da93a' -- Nanyang Technological University 'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb' -- McMaster University
) )) foo; ) )) foo;
compute stats TARGET.result; compute stats TARGET.result;