WIP: subjectPropagation #269

Draft
miriam.baglioni wants to merge 5 commits from subjectPropagation into beta
90 changed files with 8386 additions and 636 deletions
Showing only changes of commit e1317edd23 - Show all commits

View File

@ -0,0 +1,81 @@
package eu.dnetlib.dhp.common.action;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.sql.ResultSet;
import java.sql.SQLException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.common.action.model.MasterDuplicate;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
/**
 * Exports the master/duplicate associations returned by {@link #QUERY} to a file on HDFS.
 * Every result row is converted to a {@link MasterDuplicate} and written as one JSON document per line.
 */
public class ReadDatasourceMasterDuplicateFromDB {

    private static final Logger log = LoggerFactory.getLogger(ReadDatasourceMasterDuplicateFromDB.class);

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private static final String QUERY = "SELECT distinct dd.id as masterId, d.officialname as masterName, dd.duplicate as duplicateId "
        +
        "FROM dsm_dedup_services dd join dsm_services d on (dd.id = d.id);";

    /**
     * Runs {@link #QUERY} against the given database and stores the results in {@code hdfsPath}.
     *
     * @param dbUrl the database connection URL
     * @param dbUser the database user
     * @param dbPassword the database password
     * @param hdfsPath the path of the output file to create
     * @param hdfsNameNode the value used for the {@code fs.defaultFS} Hadoop property
     * @return the number of rows written to the output file
     * @throws IOException if the HDFS output cannot be created or closed
     * @throws RuntimeException if reading a row or writing its JSON serialization fails
     */
    public static int execute(String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode)
        throws IOException {
        // single-element holder: a lambda can only capture effectively final locals
        final int[] count = { 0 };
        try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", hdfsNameNode);
            FileSystem fileSystem = FileSystem.get(conf);
            FSDataOutputStream fos = fileSystem.create(new Path(hdfsPath));

            log.info("running query: {}", QUERY);
            log.info("storing results in: {}", hdfsPath);

            // closing the writer also closes the wrapped HDFS stream
            try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
                dbClient.processResults(QUERY, rs -> {
                    writeMap(datasourceMasterMap(rs), writer);
                    // counted per row, so the returned value reflects what was actually written
                    count[0]++;
                });
            }
        }
        return count[0];
    }

    /**
     * Builds a {@link MasterDuplicate} from the current row of the result set.
     * The two identifiers are converted with {@code OafMapperUtils.createOpenaireId(10, id, true)};
     * NOTE(review): prefix 10 presumably denotes datasource entities - confirm against OafMapperUtils.
     *
     * @param rs the result set, positioned on the row to read
     * @return the bean populated from the {@code duplicateId}, {@code masterId} and {@code masterName} columns
     * @throws RuntimeException wrapping any {@link SQLException} raised while reading the columns
     */
    private static MasterDuplicate datasourceMasterMap(ResultSet rs) {
        try {
            final MasterDuplicate md = new MasterDuplicate();

            final String duplicateId = rs.getString("duplicateId");
            final String masterId = rs.getString("masterId");
            final String masterName = rs.getString("masterName");

            md.setDuplicateId(OafMapperUtils.createOpenaireId(10, duplicateId, true));
            md.setMasterId(OafMapperUtils.createOpenaireId(10, masterId, true));
            md.setMasterName(masterName);

            return md;
        } catch (final SQLException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Serializes the given bean as JSON and appends it to the writer, followed by a line separator.
     *
     * @param dm the bean to serialize
     * @param writer the destination writer
     * @throws RuntimeException wrapping any {@link IOException} raised while serializing or writing
     */
    private static void writeMap(final MasterDuplicate dm, final BufferedWriter writer) {
        try {
            writer.write(OBJECT_MAPPER.writeValueAsString(dm));
            writer.newLine();
        } catch (final IOException e) {
            throw new RuntimeException(e);
        }
    }
}

View File

@ -0,0 +1,38 @@
package eu.dnetlib.dhp.common.action.model;
import java.io.Serializable;
/**
 * Serializable bean associating the identifier of a duplicate with the identifier and the name of its master.
 *
 * @author miriam.baglioni
 * @Date 21/07/22
 */
public class MasterDuplicate implements Serializable {

    /** Identifier of the duplicate. */
    private String duplicateId;

    /** Identifier of the master the duplicate refers to. */
    private String masterId;

    /** Name of the master. */
    private String masterName;

    /** @return the identifier of the duplicate, or {@code null} when not set */
    public String getDuplicateId() {
        return this.duplicateId;
    }

    /** @param duplicateId the identifier of the duplicate */
    public void setDuplicateId(final String duplicateId) {
        this.duplicateId = duplicateId;
    }

    /** @return the identifier of the master, or {@code null} when not set */
    public String getMasterId() {
        return this.masterId;
    }

    /** @param masterId the identifier of the master */
    public void setMasterId(final String masterId) {
        this.masterId = masterId;
    }

    /** @return the name of the master, or {@code null} when not set */
    public String getMasterName() {
        return this.masterName;
    }

    /** @param masterName the name of the master */
    public void setMasterName(final String masterName) {
        this.masterName = masterName;
    }
}

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.common.vocabulary;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.lang3.StringUtils;
@ -66,27 +67,39 @@ public class Vocabulary implements Serializable {
}
public Qualifier getTermAsQualifier(final String termId) {
if (StringUtils.isBlank(termId)) {
return getTermAsQualifier(termId, false);
}
public Qualifier getTermAsQualifier(final String termId, boolean strict) {
final VocabularyTerm term = getTerm(termId);
if (Objects.nonNull(term)) {
return OafMapperUtils.qualifier(term.getId(), term.getName(), getId(), getName());
} else if (Objects.isNull(term) && strict) {
return OafMapperUtils.unknown(getId(), getName());
} else if (termExists(termId)) {
final VocabularyTerm t = getTerm(termId);
return OafMapperUtils.qualifier(t.getId(), t.getName(), getId(), getName());
} else {
return OafMapperUtils.qualifier(termId, termId, getId(), getName());
}
}
public Qualifier getSynonymAsQualifier(final String syn) {
return getSynonymAsQualifier(syn, false);
}
public Qualifier getSynonymAsQualifier(final String syn, boolean strict) {
return Optional
.ofNullable(getTermBySynonym(syn))
.map(term -> getTermAsQualifier(term.getId()))
.map(term -> getTermAsQualifier(term.getId(), strict))
.orElse(null);
}
public Qualifier lookup(String id) {
return lookup(id, false);
}
public Qualifier lookup(String id, boolean strict) {
return Optional
.ofNullable(getSynonymAsQualifier(id))
.orElse(getTermAsQualifier(id));
.ofNullable(getSynonymAsQualifier(id, strict))
.orElse(getTermAsQualifier(id, strict));
}
}

View File

@ -23,6 +23,8 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import me.xuender.unidecode.Unidecode;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
public class GraphCleaningFunctions extends CleaningFunctions {
@ -201,6 +203,13 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(s -> {
if ("dnet:result_subject".equals(s.getQualifier().getClassid())) {
s.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_TYPOLOGIES);
s.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_TYPOLOGIES);
}
return s;
})
.map(GraphCleaningFunctions::cleanValue)
.collect(
Collectors
@ -211,7 +220,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.orElse(s.getValue()),
Function.identity(),
(s1, s2) -> Collections
.min(Lists.newArrayList(s1, s1), new SubjectProvenanceComparator())))
.min(Lists.newArrayList(s1, s2), new SubjectProvenanceComparator())))
.values());
r.setSubject(subjects);
}
@ -333,7 +342,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
}
if (Objects.isNull(i.getRefereed())) {
if (Objects.isNull(i.getRefereed()) || StringUtils.isBlank(i.getRefereed().getClassid())) {
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
}
if (Objects.nonNull(i.getDateofacceptance())) {

View File

@ -1,100 +0,0 @@
package eu.dnetlib.dhp.oa.merge;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
/**
 * Exercises {@code AuthorMerger.merge} on the author lists read from the
 * {@code publications_with_authors.json} test resource.
 */
class AuthorMergerTest {

    private String publicationsBasePath;

    private List<List<Author>> authors;

    /** Resolves the test resource folder and loads one author list per sample publication. */
    @BeforeEach
    public void setUp() throws Exception {

        publicationsBasePath = Paths
            .get(AuthorMergerTest.class.getResource("/eu/dnetlib/dhp/oa/merge").toURI())
            .toFile()
            .getAbsolutePath();

        authors = readSample(publicationsBasePath + "/publications_with_authors.json", Publication.class)
            .stream()
            .map(p -> p._2().getAuthor())
            .collect(Collectors.toList());
    }

    /** Prints the input author lists and the merged result, then checks the merged list size. */
    @Test
    void mergeTest() { // used in the dedup: threshold set to 0.95

        // running counter instead of indexOf: linear, and correct when two lists are equal
        int listNumber = 0;
        for (List<Author> authors1 : authors) {
            listNumber++;
            System.out.println("List " + listNumber);
            for (Author author : authors1) {
                System.out.println(authorToString(author));
            }
        }

        List<Author> merge = AuthorMerger.merge(authors);

        System.out.println("Merge ");
        for (Author author : merge) {
            System.out.println(authorToString(author));
        }

        Assertions.assertEquals(7, merge.size());
    }

    /**
     * Reads a file containing one JSON document per line.
     *
     * @param path the path of the file to read
     * @param clazz the type each line is deserialized to
     * @param <T> the type of the deserialized records
     * @return one tuple per line: the value found at {@code $.id} and the deserialized record
     * @throws IllegalStateException if the file cannot be read or a line cannot be deserialized
     */
    public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
        final List<Tuple2<String, T>> res = new ArrayList<>();
        // one mapper for the whole file rather than one per line
        final ObjectMapper mapper = new ObjectMapper();
        // try-with-resources closes the reader also when a line fails to parse
        // NOTE(review): FileReader decodes with the platform default charset - confirm the sample is ASCII/UTF-8 safe
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = reader.readLine()) != null) {
                res
                    .add(
                        new Tuple2<>(
                            MapDocumentUtil.getJPathString("$.id", line),
                            mapper.readValue(line, clazz)));
            }
        } catch (IOException e) {
            // fail fast: a partially loaded sample would only surface later as a misleading assertion failure
            throw new IllegalStateException("unable to read sample file: " + path, e);
        }

        return res;
    }

    /**
     * Renders an author as {@code Fullname = <name> pid = [<pid> <pid> ]}.
     *
     * @param a the author to render
     * @return the textual representation; the pid section is empty when the author has no pid list
     */
    public String authorToString(Author a) {
        final StringBuilder print = new StringBuilder("Fullname = ");
        print.append(a.getFullname()).append(" pid = [");
        if (a.getPid() != null) {
            for (StructuredProperty sp : a.getPid()) {
                print.append(sp.toComparableString()).append(" ");
            }
        }
        print.append("]");
        return print.toString();
    }
}

File diff suppressed because one or more lines are too long

View File

@ -49,7 +49,7 @@ object DataciteToOAFTransformation {
/** This method should skip record if json contains invalid text
* defined in file datacite_filter
*
* @param record : unparsed datacite record
* @param record : not parsed Datacite record
* @param json : parsed record
* @return True if the record should be skipped
*/
@ -98,6 +98,10 @@ object DataciteToOAFTransformation {
}
/** This utility method indicates whether the embargo date has been reached
* @param embargo_end_date
* @return True if the embargo date has been reached, false otherwise
*/
def embargo_end(embargo_end_date: String): Boolean = {
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
val td = LocalDate.now()
@ -142,6 +146,21 @@ object DataciteToOAFTransformation {
}
}
/** *
* Use the vocabulary dnet:publication_resource to find a synonym to one of these terms and get the instance.type.
* Using the dnet:result_typologies vocabulary, we look up the instance.type synonym
* to generate one of the following main entities:
* - publication
* - dataset
* - software
* - otherresearchproduct
*
* @param resourceType
* @param resourceTypeGeneral
* @param schemaOrg
* @param vocabularies
* @return
*/
def getTypeQualifier(
resourceType: String,
resourceTypeGeneral: String,
@ -330,6 +349,7 @@ object DataciteToOAFTransformation {
if (result == null)
return List()
// The DOI is mapped to a PID inside an Instance object
val doi_q = OafMapperUtils.qualifier(
"doi",
"doi",
@ -338,6 +358,8 @@ object DataciteToOAFTransformation {
)
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
result.setPid(List(pid).asJava)
// This identifier will be replaced at a later stage by the PID-based identifier generation logic
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
result.setOriginalId(List(doi).asJava)
@ -386,6 +408,10 @@ object DataciteToOAFTransformation {
a
}
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
return List()
result.setAuthor(authors.asJava)
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
result.setTitle(
@ -409,10 +435,6 @@ object DataciteToOAFTransformation {
.asJava
)
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
return List()
result.setAuthor(authors.asJava)
val dates = (json \\ "dates").extract[List[DateType]]
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)

View File

@ -27,7 +27,7 @@ object SparkCreateBaselineDataFrame {
def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = {
val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
val result = data.lines
val result = data.linesWithSeparators.map(l =>l.stripLineEnd)
.filter(l => l.startsWith("<a href="))
.map { l =>
val end = l.lastIndexOf("\">")

View File

@ -130,7 +130,7 @@
</xsl:if>
<oaf:hostedBy name="{$varOfficialName}" id="{$varDataSourceId}" />
<oaf:collectedFrom name="{$varOfficialName}" id="{$varDataSourceId}ß" />
<oaf:collectedFrom name="{$varOfficialName}" id="{$varDataSourceId}" />
<xsl:variable name="varKnownFileEndings" select="('.bmp', '.doc', '.docx', '.epub', '.flv', '.jpeg', '.jpg', '.m4v', '.mp4', '.mpg', '.odp', '.pdf', '.png', '.ppt', '.tiv', '.txt', '.xls', '.xlsx', '.zip')" />
<xsl:variable name="varIdDoi" select="distinct-values((//dc:identifier[starts-with(., '10.')][matches(., '(10[.][0-9]{4,}[^\s/&gt;]*/[^\s&gt;]+)')], //dc:identifier[starts-with(., 'http') and (contains(., '://dx.doi.org/10.') or contains(., '://doi.org/10.'))]/substring-after(., 'doi.org/'), //dc:identifier[starts-with(lower-case(.), 'doi:10.')]/substring-after(lower-case(.), 'doi:')))" />

View File

@ -63,7 +63,7 @@ class BioScholixTest extends AbstractVocabularyTest {
val records: String = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump"))
.mkString
val r: List[Oaf] = records.lines.toList
val r: List[Oaf] = records.linesWithSeparators.map(l =>l.stripLineEnd).toList
.map(s => mapper.readValue(s, classOf[PMArticle]))
.map(a => PubMedToOaf.convert(a, vocabularies))
assertEquals(10, r.size)
@ -173,9 +173,9 @@ class BioScholixTest extends AbstractVocabularyTest {
val records: String = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump"))
.mkString
records.lines.foreach(s => assertTrue(s.nonEmpty))
records.linesWithSeparators.map(l =>l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty))
val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.pdbTOOaf(o))
val result: List[Oaf] = records.linesWithSeparators.map(l =>l.stripLineEnd).toList.flatMap(o => BioDBToOAF.pdbTOOaf(o))
assertTrue(result.nonEmpty)
result.foreach(r => assertNotNull(r))
@ -194,9 +194,9 @@ class BioScholixTest extends AbstractVocabularyTest {
val records: String = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump"))
.mkString
records.lines.foreach(s => assertTrue(s.nonEmpty))
records.linesWithSeparators.map(l =>l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty))
val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.uniprotToOAF(o))
val result: List[Oaf] = records.linesWithSeparators.map(l =>l.stripLineEnd).toList.flatMap(o => BioDBToOAF.uniprotToOAF(o))
assertTrue(result.nonEmpty)
result.foreach(r => assertNotNull(r))
@ -239,9 +239,9 @@ class BioScholixTest extends AbstractVocabularyTest {
val records: String = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links"))
.mkString
records.lines.foreach(s => assertTrue(s.nonEmpty))
records.linesWithSeparators.map(l =>l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty))
val result: List[Oaf] = records.lines.map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList
val result: List[Oaf] = records.linesWithSeparators.map(l =>l.stripLineEnd).map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList
assertNotNull(result)
assertTrue(result.nonEmpty)
@ -276,11 +276,11 @@ class BioScholixTest extends AbstractVocabularyTest {
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved")
)
.mkString
records.lines.foreach(s => assertTrue(s.nonEmpty))
records.linesWithSeparators.map(l =>l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty))
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
val l: List[ScholixResolved] = records.lines.map { input =>
val l: List[ScholixResolved] = records.linesWithSeparators.map(l =>l.stripLineEnd).map { input =>
lazy val json = parse(input)
json.extract[ScholixResolved]
}.toList

View File

@ -1,10 +1,12 @@
package eu.dnetlib.dhp.oa.dedup;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.lang.reflect.InvocationTargetException;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
@ -15,6 +17,7 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
import eu.dnetlib.dhp.oa.merge.AuthorMerger;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
@ -74,33 +77,39 @@ public class DedupRecordFactory {
public static <T extends OafEntity> T entityMerger(
String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo, Class<T> clazz)
throws IllegalAccessException, InstantiationException {
throws IllegalAccessException, InstantiationException, InvocationTargetException {
T entity = clazz.newInstance();
entity.setDataInfo(dataInfo);
final Comparator<Identifier<T>> idComparator = new IdentifierComparator<>();
final LinkedList<T> entityList = Lists
.newArrayList(entities)
.stream()
.map(t -> Identifier.newInstance(t._2()))
.sorted(idComparator)
.map(Identifier::getEntity)
.collect(Collectors.toCollection(LinkedList::new));
final T entity = clazz.newInstance();
final T first = entityList.removeFirst();
BeanUtils.copyProperties(entity, first);
final Collection<String> dates = Lists.newArrayList();
final List<List<Author>> authors = Lists.newArrayList();
entities
.forEachRemaining(
t -> {
T duplicate = t._2();
entityList
.forEach(
duplicate -> {
entity.mergeFrom(duplicate);
if (ModelSupport.isSubClass(duplicate, Result.class)) {
Result r1 = (Result) duplicate;
if (r1.getAuthor() != null && !r1.getAuthor().isEmpty())
authors.add(r1.getAuthor());
if (r1.getDateofacceptance() != null)
dates.add(r1.getDateofacceptance().getValue());
Optional
.ofNullable(r1.getAuthor())
.ifPresent(a -> authors.add(a));
}
});
// set authors and date
if (ModelSupport.isSubClass(entity, Result.class)) {
((Result) entity).setDateofacceptance(DatePicker.pick(dates));
((Result) entity).setAuthor(AuthorMerger.merge(authors));
}

View File

@ -18,6 +18,10 @@ public class IdGenerator implements Serializable {
if (pids == null || pids.isEmpty())
return defaultID;
return generateId(pids);
}
private static <T extends OafEntity> String generateId(List<Identifier<T>> pids) {
Identifier<T> bp = pids
.stream()
.min(Identifier::compareTo)

View File

@ -0,0 +1,81 @@
package eu.dnetlib.dhp.oa.dedup;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
/**
 * Orders {@link Identifier}s with the following priority: 1) pid type, 2) collectedfrom (depending on the
 * entity type), 3) date, 4) alphabetical order of the original id.
 *
 * @param <T> the type of entity wrapped by the compared identifiers
 */
public class IdentifierComparator<T extends OafEntity> implements Comparator<Identifier<T>> {

    /**
     * Convenience entry point comparing two identifiers with a new comparator instance.
     *
     * @param left the first identifier
     * @param right the second identifier
     * @return the result of {@link #compare(Identifier, Identifier)} applied to the two identifiers
     */
    public static int compareIdentifiers(Identifier left, Identifier right) {
        return new IdentifierComparator<>().compare(left, right);
    }

    @Override
    public int compare(Identifier<T> left, Identifier<T> i) {
        final Set<String> leftSources = datasourceKeys(left);
        final Set<String> rightSources = datasourceKeys(i);

        // different pid types: the ordering is delegated to the PidComparator
        if (left.getPidType().compareTo(i.getPidType()) != 0) {
            return new PidComparator<>(left.getEntity()).compare(toSP(left.getPidType()), toSP(i.getPidType()));
        }

        // same pid type: publications coming from Crossref go first
        if (left.getEntityType() == EntityType.publication) {
            final int byCrossref = preferDatasource(leftSources, rightSources, ModelConstants.CROSSREF_ID);
            if (byCrossref != 0) {
                return byCrossref;
            }
        }

        // same pid type: datasets coming from Datacite go first
        if (left.getEntityType() == EntityType.dataset) {
            final int byDatacite = preferDatasource(leftSources, rightSources, ModelConstants.DATACITE_ID);
            if (byDatacite != 0) {
                return byDatacite;
            }
        }

        // the elder date wins; on equal dates the alphabetically lower original id wins
        final int byDate = left.getDate().compareTo(i.getDate());
        if (byDate != 0) {
            return byDate;
        }
        return left.getOriginalID().compareTo(i.getOriginalID());
    }

    /**
     * @param collectedFrom the datasource keys of an identifier
     * @param dsId the datasource identifier to look for
     * @return true if {@code collectedFrom} contains {@code dsId}
     */
    public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
        return collectedFrom.contains(dsId);
    }

    /** Collects the keys of the collectedFrom entries of the identifier; empty when it has none. */
    private Set<String> datasourceKeys(Identifier<T> identifier) {
        final List<KeyValue> sources = identifier.getCollectedFrom();
        return Optional
            .ofNullable(sources)
            .map(s -> s.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
            .orElse(Sets.newHashSet());
    }

    /** Returns -1 when only the left side comes from the datasource, 1 when only the right side does, 0 otherwise. */
    private int preferDatasource(Set<String> leftSources, Set<String> rightSources, String dsId) {
        final boolean inLeft = isFromDatasourceID(leftSources, dsId);
        final boolean inRight = isFromDatasourceID(rightSources, dsId);
        if (inLeft && !inRight) {
            return -1;
        }
        if (inRight && !inLeft) {
            return 1;
        }
        return 0;
    }

    /** Wraps the pid type in a StructuredProperty with an empty value, the form accepted by the PidComparator. */
    private StructuredProperty toSP(PidType pidType) {
        final String type = pidType.toString();
        return OafMapperUtils.structuredProperty("", type, type, "", "", new DataInfo());
    }
}

View File

@ -11,6 +11,7 @@ import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.oa.dedup.DatePicker;
import eu.dnetlib.dhp.oa.dedup.IdentifierComparator;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
@ -83,60 +84,12 @@ public class Identifier<T extends OafEntity> implements Serializable, Comparable
return entity.getId();
}
private PidType getPidType() {
public PidType getPidType() {
return PidType.tryValueOf(StringUtils.substringBefore(StringUtils.substringAfter(entity.getId(), "|"), "_"));
}
@Override
public int compareTo(Identifier<T> i) {
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
// alphabetical order of the originalID
Set<String> lKeys = Optional
.ofNullable(getCollectedFrom())
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
.orElse(Sets.newHashSet());
final Optional<List<KeyValue>> cf = Optional.ofNullable(i.getCollectedFrom());
Set<String> rKeys = cf
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
.orElse(Sets.newHashSet());
if (this.getPidType().compareTo(i.getPidType()) == 0) { // same type
if (getEntityType() == EntityType.publication) {
if (isFromDatasourceID(lKeys, ModelConstants.CROSSREF_ID)
&& !isFromDatasourceID(rKeys, ModelConstants.CROSSREF_ID))
return -1;
if (isFromDatasourceID(rKeys, ModelConstants.CROSSREF_ID)
&& !isFromDatasourceID(lKeys, ModelConstants.CROSSREF_ID))
return 1;
}
if (getEntityType() == EntityType.dataset) {
if (isFromDatasourceID(lKeys, ModelConstants.DATACITE_ID)
&& !isFromDatasourceID(rKeys, ModelConstants.DATACITE_ID))
return -1;
if (isFromDatasourceID(rKeys, ModelConstants.DATACITE_ID)
&& !isFromDatasourceID(lKeys, ModelConstants.DATACITE_ID))
return 1;
}
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
// we need to take the alphabetically lower id
return this.getOriginalID().compareTo(i.getOriginalID());
} else
// we need to take the elder date
return this.getDate().compareTo(i.getDate());
} else {
return new PidComparator<>(getEntity()).compare(toSP(getPidType()), toSP(i.getPidType()));
}
}
private StructuredProperty toSP(PidType pidType) {
return OafMapperUtils.structuredProperty("", pidType.toString(), pidType.toString(), "", "", new DataInfo());
}
public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
return collectedFrom.contains(dsId);
return IdentifierComparator.compareIdentifiers(this, i);
}
}

View File

@ -7,6 +7,7 @@ import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
@ -54,7 +55,7 @@ class EntityMergerTest implements Serializable {
}
@Test
void softwareMergerTest() throws InstantiationException, IllegalAccessException {
void softwareMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
List<Tuple2<String, Software>> softwares = readSample(
testEntityBasePath + "/software_merge.json", Software.class);
@ -69,7 +70,7 @@ class EntityMergerTest implements Serializable {
}
@Test
void publicationMergerTest() throws InstantiationException, IllegalAccessException {
void publicationMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
@ -134,7 +135,7 @@ class EntityMergerTest implements Serializable {
}
@Test
void publicationMergerTest2() throws InstantiationException, IllegalAccessException {
void publicationMergerTest2() throws InstantiationException, IllegalAccessException, InvocationTargetException {
Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
@ -146,7 +147,7 @@ class EntityMergerTest implements Serializable {
}
@Test
void publicationMergerTest3() throws InstantiationException, IllegalAccessException {
void publicationMergerTest3() throws InstantiationException, IllegalAccessException, InvocationTargetException {
Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
@ -156,7 +157,8 @@ class EntityMergerTest implements Serializable {
}
@Test
void publicationMergerTest4() throws InstantiationException, IllegalStateException, IllegalAccessException {
void publicationMergerTest4()
throws InstantiationException, IllegalStateException, IllegalAccessException, InvocationTargetException {
Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
@ -166,7 +168,8 @@ class EntityMergerTest implements Serializable {
}
@Test
void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {
void publicationMergerTest5()
throws InstantiationException, IllegalStateException, IllegalAccessException, InvocationTargetException {
System.out
.println(

View File

@ -4,8 +4,7 @@ package eu.dnetlib.dhp.oa.dedup;
import static java.nio.file.Files.createTempDirectory;
import static org.apache.spark.sql.functions.count;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.lenient;
import java.io.File;
@ -14,7 +13,11 @@ import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
@ -35,10 +38,13 @@ import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.util.MapDocumentUtil;
@ -105,57 +111,27 @@ public class SparkDedupTest implements Serializable {
lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId)))
.thenReturn(
IOUtils
.toString(
SparkDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml")));
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml"));
lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization")))
.thenReturn(
IOUtils
.toString(
SparkDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"));
lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication")))
.thenReturn(
IOUtils
.toString(
SparkDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")));
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"));
lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software")))
.thenReturn(
IOUtils
.toString(
SparkDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json")));
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json"));
lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset")))
.thenReturn(
IOUtils
.toString(
SparkDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json")));
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json"));
lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct")))
.thenReturn(
IOUtils
.toString(
SparkDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json")));
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json"));
}
@Test
@ -163,11 +139,7 @@ public class SparkDedupTest implements Serializable {
void createSimRelsTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkCreateSimRels.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json")));
classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"));
parser
.parseArgument(
@ -207,7 +179,7 @@ public class SparkDedupTest implements Serializable {
.count();
assertEquals(3076, orgs_simrel);
assertEquals(7040, pubs_simrel);
assertEquals(7046, pubs_simrel);
assertEquals(336, sw_simrel);
assertEquals(442, ds_simrel);
assertEquals(6784, orp_simrel);
@ -223,11 +195,7 @@ public class SparkDedupTest implements Serializable {
void whitelistSimRelsTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkWhitelistSimRels.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/whitelistSimRels_parameters.json")));
classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/whitelistSimRels_parameters.json"));
parser
.parseArgument(
@ -264,7 +232,7 @@ public class SparkDedupTest implements Serializable {
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
assertEquals(3076, orgs_simrel);
assertEquals(7040, pubs_simrel);
assertEquals(7046, pubs_simrel);
assertEquals(442, ds_simrel);
assertEquals(6784, orp_simrel);
// System.out.println("orgs_simrel = " + orgs_simrel);
@ -306,11 +274,7 @@ public class SparkDedupTest implements Serializable {
void cutMergeRelsTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkCreateMergeRels.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"));
parser
.parseArgument(
@ -402,11 +366,7 @@ public class SparkDedupTest implements Serializable {
void createMergeRelsTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkCreateMergeRels.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"));
parser
.parseArgument(
@ -427,10 +387,10 @@ public class SparkDedupTest implements Serializable {
.read()
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
.count();
long pubs_mergerel = spark
final Dataset<Relation> pubs = spark
.read()
.load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel")
.count();
.as(Encoders.bean(Relation.class));
long sw_mergerel = spark
.read()
.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
@ -445,8 +405,35 @@ public class SparkDedupTest implements Serializable {
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
.count();
final List<Relation> merges = pubs
.filter("source == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
.collectAsList();
assertEquals(3, merges.size());
Set<String> dups = Sets
.newHashSet(
"50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
"50|doi_________::d5021b53204e4fdeab6ff5d5bc468032",
"50|arXiv_______::c93aeb433eb90ed7a86e29be00791b7c");
merges.forEach(r -> {
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
assertEquals(ModelConstants.MERGES, r.getRelClass());
assertTrue(dups.contains(r.getTarget()));
});
final List<Relation> mergedIn = pubs
.filter("target == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
.collectAsList();
assertEquals(3, mergedIn.size());
mergedIn.forEach(r -> {
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
assertTrue(dups.contains(r.getSource()));
});
assertEquals(1268, orgs_mergerel);
assertEquals(1444, pubs_mergerel);
assertEquals(1450, pubs.count());
assertEquals(286, sw_mergerel);
assertEquals(472, ds_mergerel);
assertEquals(738, orp_mergerel);
@ -463,11 +450,7 @@ public class SparkDedupTest implements Serializable {
void createDedupRecordTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkCreateDedupRecord.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json")));
classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"));
parser
.parseArgument(
new String[] {
@ -483,12 +466,18 @@ public class SparkDedupTest implements Serializable {
new SparkCreateDedupRecord(parser, spark).run(isLookUpService);
final ObjectMapper mapper = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
final Dataset<Publication> pubs = spark
.read()
.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord")
.map(
(MapFunction<String, Publication>) value -> mapper.readValue(value, Publication.class),
Encoders.bean(Publication.class));
long orgs_deduprecord = jsc
.textFile(testOutputBasePath + "/" + testActionSetId + "/organization_deduprecord")
.count();
long pubs_deduprecord = jsc
.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord")
.count();
long sw_deduprecord = jsc
.textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord")
.count();
@ -499,11 +488,13 @@ public class SparkDedupTest implements Serializable {
.count();
assertEquals(86, orgs_deduprecord);
assertEquals(67, pubs_deduprecord);
assertEquals(68, pubs.count());
assertEquals(49, sw_deduprecord);
assertEquals(97, ds_deduprecord);
assertEquals(92, orp_deduprecord);
verifyRoot_1(mapper, pubs);
// System.out.println("orgs_deduprecord = " + orgs_deduprecord);
// System.out.println("pubs_deduprecord = " + pubs_deduprecord);
// System.out.println("sw_deduprecord = " + sw_deduprecord);
@ -511,16 +502,63 @@ public class SparkDedupTest implements Serializable {
// System.out.println("orp_deduprecord = " + orp_deduprecord);
}
/**
 * Compares the dedup root {@code 50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032} with the record
 * {@code 50|doi_________::d5021b53204e4fdeab6ff5d5bc468032} read from the input graph: journal name,
 * printed ISSN and publisher must be equal, the two PID sets must overlap, and the root must expose
 * an instance collected from "Crossref" with the expected access right, host and instance type.
 *
 * @param mapper JSON mapper used to deserialize the input publications
 * @param pubs   dedup records among which the root is looked up
 */
private static void verifyRoot_1(ObjectMapper mapper, Dataset<Publication> pubs) {
    final Publication root = pubs
        .filter("id = '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
        .first();
    assertNotNull(root);

    // Typed local instead of an inline cast; MapFunction keeps the lambda serializable for Spark.
    final MapFunction<String, Publication> parse = json -> mapper.readValue(json, Publication.class);
    final Publication crossref = spark
        .read()
        .textFile(DedupUtility.createEntityPath(testGraphBasePath, "publication"))
        .map(parse, Encoders.bean(Publication.class))
        .filter("id = '50|doi_________::d5021b53204e4fdeab6ff5d5bc468032'")
        .collectAsList()
        .get(0);

    assertEquals(crossref.getJournal().getName(), root.getJournal().getName());
    assertEquals(crossref.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted());
    assertEquals(crossref.getPublisher().getValue(), root.getPublisher().getValue());

    final Set<String> rootPids = new HashSet<>();
    for (StructuredProperty pid : root.getPid()) {
        rootPids.add(pid.getValue());
    }
    final Set<String> crossrefPids = new HashSet<>();
    for (StructuredProperty pid : crossref.getPid()) {
        crossrefPids.add(pid.getValue());
    }

    // The root must share at least one PID with the duplicate it was built from.
    assertFalse(Sets.intersection(rootPids, crossrefPids).isEmpty());
    assertTrue(rootPids.contains("10.1109/jstqe.2022.3205716"));

    final Optional<Instance> maybeCrossref = root
        .getInstance()
        .stream()
        .filter(i -> i.getCollectedfrom().getValue().equals("Crossref"))
        .findFirst();
    assertTrue(maybeCrossref.isPresent());

    final Instance crossrefInstance = maybeCrossref.get();
    assertEquals("OPEN", crossrefInstance.getAccessright().getClassid());
    assertEquals("Open Access", crossrefInstance.getAccessright().getClassname());
    assertEquals(OpenAccessRoute.hybrid, crossrefInstance.getAccessright().getOpenAccessRoute());
    assertEquals(
        "IEEE Journal of Selected Topics in Quantum Electronics", crossrefInstance.getHostedby().getValue());
    assertEquals("0001", crossrefInstance.getInstancetype().getClassid());
    assertEquals("Article", crossrefInstance.getInstancetype().getClassname());
}
@Test
@Order(6)
void updateEntityTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkUpdateEntity.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json")));
classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"));
parser
.parseArgument(
new String[] {
@ -587,7 +625,7 @@ public class SparkDedupTest implements Serializable {
.distinct()
.count();
assertEquals(898, publications);
assertEquals(902, publications);
assertEquals(839, organizations);
assertEquals(100, projects);
assertEquals(100, datasource);
@ -640,11 +678,7 @@ public class SparkDedupTest implements Serializable {
void propagateRelationTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SparkPropagateRelation.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json")));
classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"));
parser
.parseArgument(
new String[] {
@ -714,4 +748,12 @@ public class SparkDedupTest implements Serializable {
/**
 * Tells whether a serialized record contains the literal {@code "deletedbyinference":true}.
 *
 * @param s serialized record to inspect
 * @return {@code true} when the flag text occurs anywhere in {@code s}
 */
public boolean isDeletedByInference(String s) {
    final String flag = "\"deletedbyinference\":true";
    return s.indexOf(flag) >= 0;
}
/**
 * Reads a classpath resource fully into a string.
 *
 * @param path resource path, resolved through {@code SparkDedupTest.class.getResourceAsStream}
 * @return the resource content decoded as UTF-8
 * @throws IOException if the resource does not exist or cannot be read
 */
private static String classPathResourceAsString(String path) throws IOException {
    // try-with-resources: the stream used to be left open after reading
    try (java.io.InputStream in = SparkDedupTest.class.getResourceAsStream(path)) {
        if (in == null) {
            // getResourceAsStream returns null for a missing resource; report the path instead of an NPE
            throw new java.io.FileNotFoundException("classpath resource not found: " + path);
        }
        // explicit charset so the result does not depend on the platform default
        return IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
    }
}
}

View File

@ -143,7 +143,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
.count();
assertEquals(288, orgs_simrel);
assertEquals(290, orgs_simrel);
}
@Test
@ -172,7 +172,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
.count();
assertEquals(324, orgs_simrel);
assertEquals(326, orgs_simrel);
}
@Test

View File

@ -0,0 +1,403 @@
package eu.dnetlib.dhp.oa.dedup;
import static java.nio.file.Files.createTempDirectory;
import static org.apache.spark.sql.functions.count;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.lenient;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
/**
 * Runs the publication dedup steps in {@code @Order} sequence on a local Spark session: similarity
 * relations, merge relations (with and without a connected-component cut), dedup records and entity update.
 * Input entities are copied from the {@code /eu/dnetlib/dhp/dedup/root} test resources into a temp directory.
 */
@ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class SparkPublicationRootsTest implements Serializable {
// Mocked lookup service; setUp() stubs it to return the orchestrator profile and the dedup configuration.
@Mock(serializable = true)
ISLookUpService isLookUpService;
// Spark session shared by all test methods; created in init().
private static SparkSession spark;
// Directory where the dedup steps write their intermediate output.
private static String workingPath;
// Location of the copied input entities.
private static String graphInputPath;
// Destination of the updated graph.
private static String graphOutputPath;
// Action set id passed to every step and matched by the lookup stub.
private static final String testActionSetId = "test-orchestrator";
// Temp directory holding input, working and output paths.
private static Path testBaseTmpPath;
// JSON mapper that ignores unknown properties when deserializing entities.
private static final ObjectMapper MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
/**
 * Prepares the test sandbox: creates a temp directory, copies the {@code /eu/dnetlib/dhp/dedup/root}
 * resources under {@code input}, derives working/input/output paths and starts a local Spark session.
 *
 * @throws IOException        if the temp directory cannot be created or populated
 * @throws URISyntaxException if the resource URL cannot be converted to a URI
 */
@BeforeAll
public static void init() throws IOException, URISyntaxException {
    final Class<SparkPublicationRootsTest> self = SparkPublicationRootsTest.class;

    testBaseTmpPath = createTempDirectory(self.getSimpleName() + "-");
    final Path inputRoot = testBaseTmpPath.resolve("input");

    final File fixtures = Paths.get(self.getResource("/eu/dnetlib/dhp/dedup/root").toURI()).toFile();
    FileUtils.copyDirectory(fixtures, inputRoot.toFile());

    workingPath = testBaseTmpPath.resolve("workingPath").toString();
    graphInputPath = inputRoot.resolve("entities").toString();
    graphOutputPath = testBaseTmpPath.resolve("output").toString();

    // Make sure no stale working/output directories exist before the first step runs.
    FileUtils.deleteDirectory(new File(workingPath));
    FileUtils.deleteDirectory(new File(graphOutputPath));

    final SparkConf sparkConf = new SparkConf().set("spark.sql.shuffle.partitions", "10");

    spark = SparkSession
        .builder()
        .appName(self.getSimpleName())
        .master("local[*]")
        .config(sparkConf)
        .getOrCreate();
}
/**
 * Stubs the lookup service: queries containing the action set id answer with the mock orchestrator
 * profile, queries containing "publication" answer with the publication dedup configuration.
 * Stubs are lenient because not every test method triggers both lookups.
 */
@BeforeEach
public void setUp() throws IOException, ISLookUpException {
    final String orchestratorProfile = classPathResourceAsString(
        "/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator_publication.xml");
    final String publicationConf = classPathResourceAsString(
        "/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json");

    lenient()
        .when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId)))
        .thenReturn(orchestratorProfile);

    lenient()
        .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication")))
        .thenReturn(publicationConf);
}
/**
 * Removes the temp directory and closes the Spark session.
 * Both steps are attempted even if the first one fails, and each is skipped when the
 * corresponding field was never initialised (e.g. the {@code @BeforeAll} method failed early).
 *
 * @throws IOException if the temp directory cannot be deleted
 */
@AfterAll
public static void tearDown() throws IOException {
    try {
        if (testBaseTmpPath != null) {
            FileUtils.deleteDirectory(testBaseTmpPath.toFile());
        }
    } finally {
        // close the session even when the directory cleanup throws, so it is not leaked
        if (spark != null) {
            spark.close();
        }
    }
}
/**
 * Runs the similarity-relation step and checks the number of publication sim rels written.
 */
@Test
@Order(1)
void createSimRelsTest() throws Exception {
    final String[] cliArgs = {
        "--graphBasePath", graphInputPath,
        "--actionSetId", testActionSetId,
        "--isLookUpUrl", "lookupurl",
        "--workingPath", workingPath,
        "--numPartitions", "5"
    };
    final ArgumentApplicationParser parser = args(
        "/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json", cliArgs);

    new SparkCreateSimRels(parser, spark).run(isLookUpService);

    final String simRelPath = DedupUtility.createSimRelPath(workingPath, testActionSetId, "publication");
    final long simRelCount = spark.read().load(simRelPath).count();

    assertEquals(74, simRelCount);
}
/**
 * Runs the merge-relation step with {@code --cutConnectedComponent 3} and checks that no source
 * has more than 3 "merges" targets; the produced merge rels are deleted afterwards.
 */
@Test
@Order(2)
void cutMergeRelsTest() throws Exception {
    final String[] cliArgs = {
        "--graphBasePath", graphInputPath,
        "--actionSetId", testActionSetId,
        "--isLookUpUrl", "lookupurl",
        "--workingPath", workingPath,
        "--cutConnectedComponent", "3"
    };
    final ArgumentApplicationParser parser = args(
        "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json", cliArgs);

    new SparkCreateMergeRels(parser, spark).run(isLookUpService);

    final String mergeRelPath = workingPath + "/" + testActionSetId + "/publication_mergerel";
    final FilterFunction<Relation> isMerges = r -> r.getRelClass().equalsIgnoreCase("merges");

    // Count the sources whose number of merged targets exceeds the requested cut.
    final long oversizedGroups = spark
        .read()
        .load(mergeRelPath)
        .as(Encoders.bean(Relation.class))
        .filter(isMerges)
        .groupBy("source")
        .agg(count("target").alias("cnt"))
        .select("source", "cnt")
        .where("cnt > 3")
        .count();

    assertEquals(0, oversizedGroups);

    FileUtils.deleteDirectory(new File(mergeRelPath));
}
/**
 * Runs the merge-relation step without a cut and checks the relations around the root
 * {@code 50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032}: three "merges" going out, three
 * "isMergedIn" coming in, all pointing to/from the expected duplicates, and 32 relations overall.
 */
@Test
@Order(3)
void createMergeRelsTest() throws Exception {
    final String[] cliArgs = {
        "--graphBasePath", graphInputPath,
        "--actionSetId", testActionSetId,
        "--isLookUpUrl", "lookupurl",
        "--workingPath", workingPath
    };
    final ArgumentApplicationParser parser = args(
        "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json", cliArgs);

    new SparkCreateMergeRels(parser, spark).run(isLookUpService);

    final Dataset<Relation> merges = spark
        .read()
        .load(workingPath + "/" + testActionSetId + "/publication_mergerel")
        .as(Encoders.bean(Relation.class));

    final List<Relation> outgoing = merges
        .filter("source == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
        .collectAsList();
    assertEquals(3, outgoing.size());

    final Set<String> dups = Sets
        .newHashSet(
            "50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
            "50|doi_________::d5021b53204e4fdeab6ff5d5bc468032",
            "50|arXiv_______::c93aeb433eb90ed7a86e29be00791b7c");

    for (Relation rel : outgoing) {
        assertEquals(ModelConstants.RESULT_RESULT, rel.getRelType());
        assertEquals(ModelConstants.DEDUP, rel.getSubRelType());
        assertEquals(ModelConstants.MERGES, rel.getRelClass());
        assertTrue(dups.contains(rel.getTarget()));
    }

    final List<Relation> incoming = merges
        .filter("target == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
        .collectAsList();
    assertEquals(3, incoming.size());

    for (Relation rel : incoming) {
        assertEquals(ModelConstants.RESULT_RESULT, rel.getRelType());
        assertEquals(ModelConstants.DEDUP, rel.getSubRelType());
        assertEquals(ModelConstants.IS_MERGED_IN, rel.getRelClass());
        assertTrue(dups.contains(rel.getSource()));
    }

    assertEquals(32, merges.count());
}
/**
 * Runs the dedup-record step, expects three roots and verifies each of them against the
 * input publications.
 */
@Test
@Order(4)
void createDedupRecordTest() throws Exception {
    final String[] cliArgs = {
        "--graphBasePath", graphInputPath,
        "--actionSetId", testActionSetId,
        "--isLookUpUrl", "lookupurl",
        "--workingPath", workingPath
    };
    final ArgumentApplicationParser parser = args(
        "/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json", cliArgs);

    new SparkCreateDedupRecord(parser, spark).run(isLookUpService);

    final String rootsPath = workingPath + "/" + testActionSetId + "/publication_deduprecord";
    final Dataset<Publication> roots = spark
        .read()
        .textFile(rootsPath)
        .map(asEntity(Publication.class), Encoders.bean(Publication.class));

    assertEquals(3, roots.count());

    final String inputPath = DedupUtility.createEntityPath(graphInputPath, "publication");
    final Dataset<Publication> pubs = spark
        .read()
        .textFile(inputPath)
        .map(asEntity(Publication.class), Encoders.bean(Publication.class));

    verifyRoot_case_1(roots, pubs);
    verifyRoot_case_2(roots, pubs);
    verifyRoot_case_3(roots, pubs);
}
/**
 * Compares the root {@code 50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032} with the input record
 * {@code 50|doi_________::d5021b53204e4fdeab6ff5d5bc468032}: journal name, printed ISSN and publisher
 * must be equal, the PID sets must overlap, and the root must expose an instance collected from
 * "Crossref" with the expected access right, host and instance type.
 *
 * @param roots dedup records
 * @param pubs  input publications
 */
private static void verifyRoot_case_1(Dataset<Publication> roots, Dataset<Publication> pubs) {
    final Publication root = roots
        .filter("id = '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
        .first();
    assertNotNull(root);

    final Publication crossref = pubs
        .filter("id = '50|doi_________::d5021b53204e4fdeab6ff5d5bc468032'")
        .collectAsList()
        .get(0);

    assertEquals(crossref.getJournal().getName(), root.getJournal().getName());
    assertEquals(crossref.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted());
    assertEquals(crossref.getPublisher().getValue(), root.getPublisher().getValue());

    final Set<String> rootPids = new HashSet<>();
    for (StructuredProperty pid : root.getPid()) {
        rootPids.add(pid.getValue());
    }
    final Set<String> crossrefPids = new HashSet<>();
    for (StructuredProperty pid : crossref.getPid()) {
        crossrefPids.add(pid.getValue());
    }

    // The root must share at least one PID with the duplicate it was built from.
    assertFalse(Sets.intersection(rootPids, crossrefPids).isEmpty());
    assertTrue(rootPids.contains("10.1109/jstqe.2022.3205716"));

    final Optional<Instance> maybeCrossref = root
        .getInstance()
        .stream()
        .filter(i -> i.getCollectedfrom().getValue().equals("Crossref"))
        .findFirst();
    assertTrue(maybeCrossref.isPresent());

    final Instance crossrefInstance = maybeCrossref.get();
    assertEquals("OPEN", crossrefInstance.getAccessright().getClassid());
    assertEquals("Open Access", crossrefInstance.getAccessright().getClassname());
    assertEquals(OpenAccessRoute.hybrid, crossrefInstance.getAccessright().getOpenAccessRoute());
    assertEquals(
        "IEEE Journal of Selected Topics in Quantum Electronics", crossrefInstance.getHostedby().getValue());
    assertEquals("0001", crossrefInstance.getInstancetype().getClassid());
    assertEquals("Article", crossrefInstance.getInstancetype().getClassname());
}
/**
 * Compares the root {@code 50|doi_dedup___::18aff3b55fb6876466a5d4bd82434885} with the input record
 * {@code 50|doi_________::18aff3b55fb6876466a5d4bd82434885}: journal name, online ISSN, volume and
 * publisher must be equal, and every collectedfrom value of the root must occur among the
 * collectedfrom values of the input publications.
 *
 * @param roots dedup records
 * @param pubs  input publications
 */
private void verifyRoot_case_2(Dataset<Publication> roots, Dataset<Publication> pubs) {
    final Publication root = roots
        .filter("id = '50|doi_dedup___::18aff3b55fb6876466a5d4bd82434885'")
        .first();
    assertNotNull(root);

    final Publication crossref = pubs
        .filter("id = '50|doi_________::18aff3b55fb6876466a5d4bd82434885'")
        .first();

    assertEquals(crossref.getJournal().getName(), root.getJournal().getName());
    assertEquals(crossref.getJournal().getIssnOnline(), root.getJournal().getIssnOnline());
    assertEquals(crossref.getJournal().getVol(), root.getJournal().getVol());
    assertEquals(crossref.getPublisher().getValue(), root.getPublisher().getValue());

    final Set<String> inputSources = new HashSet<>();
    for (Publication p : pubs.collectAsList()) {
        for (KeyValue cf : p.getCollectedfrom()) {
            inputSources.add(cf.getValue());
        }
    }

    final Set<String> rootSources = new HashSet<>();
    for (KeyValue cf : root.getCollectedfrom()) {
        rootSources.add(cf.getValue());
    }

    // The root must not introduce a provenance absent from the input.
    assertTrue(inputSources.containsAll(rootSources));
}
/**
 * Compares the root {@code 50|dedup_wf_001::31ca734cc22181b704c4aa8fd050062a} with the input record
 * {@code 50|od_______166::31ca734cc22181b704c4aa8fd050062a}: the publisher must be equal, and every
 * collectedfrom value of the root must occur among the collectedfrom values of the input publications.
 *
 * @param roots dedup records
 * @param pubs  input publications
 */
private void verifyRoot_case_3(Dataset<Publication> roots, Dataset<Publication> pubs) {
    final Publication root = roots
        .filter("id = '50|dedup_wf_001::31ca734cc22181b704c4aa8fd050062a'")
        .first();
    assertNotNull(root);

    final Publication pivot = pubs
        .filter("id = '50|od_______166::31ca734cc22181b704c4aa8fd050062a'")
        .first();

    assertEquals(pivot.getPublisher().getValue(), root.getPublisher().getValue());

    final Set<String> inputSources = new HashSet<>();
    for (Publication p : pubs.collectAsList()) {
        for (KeyValue cf : p.getCollectedfrom()) {
            inputSources.add(cf.getValue());
        }
    }

    final Set<String> rootSources = new HashSet<>();
    for (KeyValue cf : root.getCollectedfrom()) {
        rootSources.add(cf.getValue());
    }

    // The root must not introduce a provenance absent from the input.
    assertTrue(inputSources.containsAll(rootSources));
}
/**
 * Runs the entity-update step and checks that the output holds 19 publications and that the number
 * of distinct records flagged deletedbyinference equals the number of distinct "merges" targets.
 */
@Test
@Order(6)
void updateEntityTest() throws Exception {
    final String[] cliArgs = {
        "--graphBasePath", graphInputPath,
        "--workingPath", workingPath,
        "--dedupGraphPath", graphOutputPath
    };
    final ArgumentApplicationParser parser = args(
        "/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json", cliArgs);

    new SparkUpdateEntity(parser, spark).run(isLookUpService);

    final String outputPublications = graphOutputPath + "/publication";
    final long publications = spark.read().textFile(outputPublications).count();

    final MapFunction<Relation, String> targetOf = Relation::getTarget;
    final long mergedPubs = spark
        .read()
        .load(workingPath + "/" + testActionSetId + "/publication_mergerel")
        .as(Encoders.bean(Relation.class))
        .where("relClass=='merges'")
        .map(targetOf, Encoders.STRING())
        .distinct()
        .count();

    // 16 originals + 3 roots
    assertEquals(19, publications);

    final MapFunction<Publication, String> idOf = OafEntity::getId;
    final long deletedPubs = spark
        .read()
        .textFile(outputPublications)
        .map(asEntity(Publication.class), Encoders.bean(Publication.class))
        .filter("datainfo.deletedbyinference == true")
        .map(idOf, Encoders.STRING())
        .distinct()
        .count();

    assertEquals(mergedPubs, deletedPubs);
}
/**
 * Reads a classpath resource fully into a string.
 *
 * @param path resource path, resolved through {@code SparkPublicationRootsTest.class.getResourceAsStream}
 * @return the resource content decoded as UTF-8
 * @throws IOException if the resource does not exist or cannot be read
 */
private static String classPathResourceAsString(String path) throws IOException {
    // try-with-resources: the stream used to be left open after reading
    try (java.io.InputStream in = SparkPublicationRootsTest.class.getResourceAsStream(path)) {
        if (in == null) {
            // getResourceAsStream returns null for a missing resource; report the path instead of an NPE
            throw new java.io.FileNotFoundException("classpath resource not found: " + path);
        }
        // explicit charset so the result does not depend on the platform default
        return IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
    }
}
/**
 * Builds a Spark map function that deserializes a JSON line into the given entity class
 * using the shared {@code MAPPER}.
 *
 * @param clazz target entity class
 * @param <T>   entity type
 * @return function turning one JSON string into a {@code T}
 */
private static <T extends OafEntity> MapFunction<String, T> asEntity(Class<T> clazz) {
    return new MapFunction<String, T>() {
        @Override
        public T call(String value) throws Exception {
            return MAPPER.readValue(value, clazz);
        }
    };
}
/**
 * Creates an argument parser from a parameter specification on the classpath and feeds it
 * the given command-line arguments.
 *
 * @param paramSpecs classpath location of the JSON parameter specification
 * @param args       command-line style arguments to parse
 * @return the parser after parsing {@code args}
 */
private ArgumentApplicationParser args(String paramSpecs, String[] args) throws IOException, ParseException {
    final String specJson = classPathResourceAsString(paramSpecs);
    final ArgumentApplicationParser parsed = new ArgumentApplicationParser(specJson);
    parsed.parseArgument(args);
    return parsed;
}
}

View File

@ -0,0 +1,251 @@
package eu.dnetlib.dhp.oa.dedup;
import static java.nio.file.Files.createTempDirectory;
import static org.apache.spark.sql.functions.count;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.lenient;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
/**
 * Runs the publication dedup steps on the {@code /eu/dnetlib/dhp/dedup/root} test resources extended
 * with the additional file {@code alterations/publication/publication_1.gz}, and verifies the
 * resulting root {@code 50|doi_dedup___::b3aec7985136e36827176aaa1dd5082d}.
 */
@ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class SparkPublicationRootsTest2 implements Serializable {
// Mocked lookup service; setUp() stubs it to return the orchestrator profile and the dedup configuration.
@Mock(serializable = true)
ISLookUpService isLookUpService;
// Spark session used by the test; created in init().
private static SparkSession spark;
// Directory where the dedup steps write their intermediate output.
private static String workingPath;
// Location of the copied input entities.
private static String graphInputPath;
// Derived in init(); not read by the test method of this class.
private static String graphOutputPath;
// Action set id passed to every step and matched by the lookup stub.
private static final String testActionSetId = "test-orchestrator";
// Temp directory holding input, working and output paths.
private static Path testBaseTmpPath;
// JSON mapper that ignores unknown properties when deserializing entities.
private static final ObjectMapper MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
/**
 * Prepares the test sandbox: creates a temp directory, copies the {@code /eu/dnetlib/dhp/dedup/root}
 * resources under {@code input}, adds the altered {@code publication_1.gz} to the publication
 * entities, derives working/input/output paths and starts a local Spark session.
 *
 * @throws IOException        if the temp directory cannot be created or populated
 * @throws URISyntaxException if a resource URL cannot be converted to a URI
 */
@BeforeAll
public static void init() throws IOException, URISyntaxException {
    final Class<SparkPublicationRootsTest2> self = SparkPublicationRootsTest2.class;

    testBaseTmpPath = createTempDirectory(self.getSimpleName() + "-");
    final Path inputRoot = testBaseTmpPath.resolve("input");

    final File fixtures = Paths.get(self.getResource("/eu/dnetlib/dhp/dedup/root").toURI()).toFile();
    FileUtils.copyDirectory(fixtures, inputRoot.toFile());

    // Add the altered publication dump next to the regular publication entities.
    final File alteredPublications = Paths
        .get(self.getResource("/eu/dnetlib/dhp/dedup/root/alterations/publication/publication_1.gz").toURI())
        .toFile();
    final File publicationDir = inputRoot.resolve("entities").resolve("publication").toFile();
    FileUtils.copyFileToDirectory(alteredPublications, publicationDir);

    workingPath = testBaseTmpPath.resolve("workingPath").toString();
    graphInputPath = inputRoot.resolve("entities").toString();
    graphOutputPath = testBaseTmpPath.resolve("output").toString();

    final SparkConf sparkConf = new SparkConf().set("spark.sql.shuffle.partitions", "10");

    spark = SparkSession
        .builder()
        .appName(self.getSimpleName())
        .master("local[*]")
        .config(sparkConf)
        .getOrCreate();
}
/**
 * Stubs the lookup service: queries containing the action set id answer with the mock orchestrator
 * profile, queries containing "publication" answer with the publication dedup configuration.
 */
@BeforeEach
public void setUp() throws IOException, ISLookUpException {
    final String orchestratorProfile = classPathResourceAsString(
        "/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator_publication.xml");
    final String publicationConf = classPathResourceAsString(
        "/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json");

    lenient()
        .when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId)))
        .thenReturn(orchestratorProfile);

    lenient()
        .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication")))
        .thenReturn(publicationConf);
}
/**
 * Removes the temp directory and closes the Spark session started by the {@code @BeforeAll} method.
 * The session was previously left open after the class finished; it is now closed even when the
 * directory cleanup fails. Each step is skipped when the corresponding field was never initialised.
 *
 * @throws IOException if the temp directory cannot be deleted
 */
@AfterAll
public static void tearDown() throws IOException {
    try {
        if (testBaseTmpPath != null) {
            FileUtils.deleteDirectory(testBaseTmpPath.toFile());
        }
    } finally {
        if (spark != null) {
            spark.close();
        }
    }
}
/**
 * Runs sim rels, merge rels and dedup records on the altered dataset, then verifies the root
 * {@code 50|doi_dedup___::b3aec7985136e36827176aaa1dd5082d} against the input record
 * {@code 50|doi_________::b3aec7985136e36827176aaa1dd5082d}: acceptance date, journal, publisher,
 * PIDs and the instance collected from "Crossref".
 */
@Test
@Order(7)
void dedupAlteredDatasetTest() throws Exception {
    // Step 1: similarity relations.
    final String[] simRelArgs = {
        "--graphBasePath", graphInputPath,
        "--actionSetId", testActionSetId,
        "--isLookUpUrl", "lookupurl",
        "--workingPath", workingPath,
        "--numPartitions", "5"
    };
    new SparkCreateSimRels(
        args("/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json", simRelArgs), spark)
            .run(isLookUpService);

    // Step 2: merge relations.
    final String[] mergeRelArgs = {
        "--graphBasePath", graphInputPath,
        "--actionSetId", testActionSetId,
        "--isLookUpUrl", "lookupurl",
        "--workingPath", workingPath
    };
    new SparkCreateMergeRels(
        args("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json", mergeRelArgs), spark)
            .run(isLookUpService);

    final Dataset<Relation> merges = spark
        .read()
        .load(workingPath + "/" + testActionSetId + "/publication_mergerel")
        .as(Encoders.bean(Relation.class));

    final MapFunction<Relation, String> targetOf = Relation::getTarget;
    final long distinctRoots = merges
        .filter("relclass == 'isMergedIn'")
        .map(targetOf, Encoders.STRING())
        .distinct()
        .count();
    assertEquals(3, distinctRoots);

    final long relsFromRoot = merges
        .filter("source == '50|doi_dedup___::b3aec7985136e36827176aaa1dd5082d'")
        .count();
    assertEquals(4, relsFromRoot);

    // Step 3: dedup records.
    final String[] dedupRecordArgs = {
        "--graphBasePath", graphInputPath,
        "--actionSetId", testActionSetId,
        "--isLookUpUrl", "lookupurl",
        "--workingPath", workingPath
    };
    new SparkCreateDedupRecord(
        args("/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json", dedupRecordArgs), spark)
            .run(isLookUpService);

    final Dataset<Publication> roots = spark
        .read()
        .textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord")
        .map(asEntity(Publication.class), Encoders.bean(Publication.class));

    assertEquals(3, roots.count());

    final Dataset<Publication> pubs = spark
        .read()
        .textFile(DedupUtility.createEntityPath(graphInputPath, "publication"))
        .map(asEntity(Publication.class), Encoders.bean(Publication.class));

    final Publication root = roots
        .filter("id = '50|doi_dedup___::b3aec7985136e36827176aaa1dd5082d'")
        .first();
    assertNotNull(root);

    final Publication crossref = pubs
        .filter("id = '50|doi_________::b3aec7985136e36827176aaa1dd5082d'")
        .collectAsList()
        .get(0);

    assertEquals(crossref.getDateofacceptance().getValue(), root.getDateofacceptance().getValue());
    assertEquals(crossref.getJournal().getName(), root.getJournal().getName());
    assertEquals(crossref.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted());
    assertEquals(crossref.getPublisher().getValue(), root.getPublisher().getValue());

    final Set<String> rootPids = new HashSet<>();
    for (StructuredProperty pid : root.getPid()) {
        rootPids.add(pid.getValue());
    }
    final Set<String> crossrefPids = new HashSet<>();
    for (StructuredProperty pid : crossref.getPid()) {
        crossrefPids.add(pid.getValue());
    }

    // The root must share at least one PID with the duplicate it was built from.
    assertFalse(Sets.intersection(rootPids, crossrefPids).isEmpty());
    assertTrue(rootPids.contains("10.1109/jstqe.2022.3205716"));
    assertTrue(rootPids.contains("10.1109/jstqe.2023.9999999"));

    final Optional<Instance> maybeCrossref = root
        .getInstance()
        .stream()
        .filter(i -> i.getCollectedfrom().getValue().equals("Crossref"))
        .findFirst();
    assertTrue(maybeCrossref.isPresent());

    final Instance crossrefInstance = maybeCrossref.get();
    assertEquals("OPEN", crossrefInstance.getAccessright().getClassid());
    assertEquals("Open Access", crossrefInstance.getAccessright().getClassname());
    assertEquals(OpenAccessRoute.hybrid, crossrefInstance.getAccessright().getOpenAccessRoute());
    assertEquals(
        "IEEE Journal of Selected Topics in Quantum Electronics", crossrefInstance.getHostedby().getValue());
    assertEquals("0001", crossrefInstance.getInstancetype().getClassid());
    assertEquals("Article", crossrefInstance.getInstancetype().getClassname());
}
/**
 * Reads a classpath resource fully into a string.
 *
 * @param path resource path, resolved through {@code SparkPublicationRootsTest2.class.getResourceAsStream}
 * @return the resource content decoded as UTF-8
 * @throws IOException if the resource does not exist or cannot be read
 */
private static String classPathResourceAsString(String path) throws IOException {
    // try-with-resources: the stream used to be left open after reading
    try (java.io.InputStream in = SparkPublicationRootsTest2.class.getResourceAsStream(path)) {
        if (in == null) {
            // getResourceAsStream returns null for a missing resource; report the path instead of an NPE
            throw new java.io.FileNotFoundException("classpath resource not found: " + path);
        }
        // explicit charset so the result does not depend on the platform default
        return IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8);
    }
}
/**
 * Builds a Spark map function that deserializes a JSON line into the given entity class
 * using the shared {@code MAPPER}.
 *
 * @param clazz target entity class
 * @param <T>   entity type
 * @return function turning one JSON string into a {@code T}
 */
private static <T extends OafEntity> MapFunction<String, T> asEntity(Class<T> clazz) {
    return new MapFunction<String, T>() {
        @Override
        public T call(String value) throws Exception {
            return MAPPER.readValue(value, clazz);
        }
    };
}
/**
 * Creates an argument parser from a parameter specification on the classpath and feeds it
 * the given command-line arguments.
 *
 * @param paramSpecs classpath location of the JSON parameter specification
 * @param args       command-line style arguments to parse
 * @return the parser after parsing {@code args}
 */
private ArgumentApplicationParser args(String paramSpecs, String[] args) throws IOException, ParseException {
    final String specJson = classPathResourceAsString(paramSpecs);
    final ArgumentApplicationParser parsed = new ArgumentApplicationParser(specJson);
    parsed.parseArgument(args);
    return parsed;
}
}

View File

@ -168,11 +168,11 @@ public class SparkStatsTest implements Serializable {
.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats")
.count();
assertEquals(477, orgs_blocks);
assertEquals(480, orgs_blocks);
assertEquals(295, pubs_blocks);
assertEquals(122, sw_blocks);
assertEquals(191, ds_blocks);
assertEquals(171, orp_blocks);
assertEquals(178, orp_blocks);
}
@AfterAll

View File

@ -0,0 +1,24 @@
<!-- Dedup orchestration profile for action set "test-orchestrator" with a single scan: "publication". -->
<!-- Presumably the mock orchestrator profile the publication root tests load from test resources; confirm. -->
<RESOURCE_PROFILE>
<HEADER>
<RESOURCE_IDENTIFIER value=""/>
<RESOURCE_TYPE value="DedupOrchestrationDSResourceType"/>
<RESOURCE_KIND value="DedupOrchestrationDSResources"/>
<RESOURCE_URI value=""/>
<DATE_OF_CREATION value="2001-12-31T12:00:00"/>
</HEADER>
<BODY>
<CONFIGURATION enabled="true">
<DEDUPLICATION>
<!-- NOTE(review): ENTITY describes "organization" while the scan sequence below targets "publication"; looks like a copy-paste leftover. Confirm whether ENTITY is read by the dedup workflow before changing it. -->
<ENTITY code="20" label="Organization" name="organization"/>
<ACTION_SET id="test-orchestrator"/>
<SCAN_SEQUENCE>
<SCAN id="publication"/>
</SCAN_SEQUENCE>
</DEDUPLICATION>
</CONFIGURATION>
<STATUS>
<LAST_UPDATE value="2001-12-31T12:00:00"/>
</STATUS>
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
</BODY>
</RESOURCE_PROFILE>

View File

@ -0,0 +1,47 @@
# Root logger option: DEBUG level, routed to the "stdout" appender defined below
log4j.rootLogger=DEBUG, stdout
# Direct log messages to stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
# Change this to set Spark log level
log4j.logger.org.apache.spark=ERROR
# NOTE(review): log4j 1.x reads log4j.rootLogger first and falls back to log4j.rootCategory only when
# rootLogger is absent, so this WARN setting is presumably ignored and the root level stays DEBUG.
# Confirm the intended root level and drop one of the two entries.
log4j.rootCategory=WARN
# Silence akka remoting
log4j.logger.Remoting=WARN
# Ignore messages below warning level from Jetty, because it's a bit verbose
log4j.logger.org.eclipse.jetty=WARN
# Hadoop / Parquet output and codec loggers
log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitterFactory=WARN
log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter=WARN
#log4j.logger.org.apache.parquet.hadoop.ParquetOutputFormat=WARN
#log4j.logger.org.apache.parquet.hadoop.InternalParquetRecordWriter=WARN
log4j.logger.org.apache.hadoop.io.compress.CodecPool=WARN
#log4j.logger.org.apache.hadoop.io.compress=WARN
#log4j.logger.org.apache.parquet.hadoop.codec.CodecConfig=WARN
log4j.logger.parquet.hadoop.ColumnChunkPageWriteStore=ERROR
log4j.logger.com.jayway.jsonpath.internal.path.CompiledPath=WARN
log4j.logger.org.apache.parquet.hadoop.ParquetRecordReader=ERROR
log4j.logger.parquet.hadoop=WARN
log4j.logger.org.eclipse.jetty.server.handler.ContextHandlerCollection=WARN
log4j.logger.org.spark_project.jetty.util.component.ContainerLifeCycle=WARN
log4j.logger.org.apache.hadoop.mapred.FileInputFormat=WARN
log4j.logger.org.spark_project.jetty.servlet.ServletHandler=WARN
# commons-beanutils converters and BeanUtils are limited to WARN and above
log4j.logger.org.apache.commons.beanutils.converters.BooleanConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.StringConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.LongConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.ArrayConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.FloatConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.IntegerConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.DoubleConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.CharacterConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.ByteConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.BigIntegerConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.BigDecimalConverter=WARN
log4j.logger.org.apache.commons.beanutils.converters.ShortConverter=WARN
log4j.logger.org.apache.commons.beanutils.BeanUtils=WARN

View File

@ -554,7 +554,7 @@ public class PublicationToOaf implements Serializable {
private KeyValue createCollectedFrom() {
KeyValue cf = new KeyValue();
cf.setValue(ModelConstants.ORCID.toUpperCase());
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "cd0f74b5955dc87fd0605745c4b49ee8");
return cf;
}

View File

@ -31,13 +31,13 @@ class CrossrefMappingTest {
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi"))
.mkString
for (line <- funder_doi.lines) {
for (line <- funder_doi.linesWithSeparators.map(l =>l.stripLineEnd)) {
val json = template.replace("%s", line)
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
assertTrue(resultList.nonEmpty)
checkRelation(resultList)
}
for (line <- funder_name.lines) {
for (line <- funder_name.linesWithSeparators.map(l =>l.stripLineEnd)) {
val json = template.replace("%s", line)
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
assertTrue(resultList.nonEmpty)

View File

@ -25,7 +25,7 @@ class MappingORCIDToOAFTest {
.mkString
assertNotNull(json)
assertFalse(json.isEmpty)
json.lines.foreach(s => {
json.linesWithSeparators.map(l =>l.stripLineEnd).foreach(s => {
assertNotNull(ORCIDToOAF.extractValueFromInputString(s))
})
}

View File

@ -22,7 +22,7 @@ class UnpayWallMappingTest {
.mkString
var i: Int = 0
for (line <- Ilist.lines) {
for (line <- Ilist.linesWithSeparators.map(l =>l.stripLineEnd)) {
val p = UnpayWallToOAF.convertToOAF(line)
if (p != null) {
@ -43,7 +43,7 @@ class UnpayWallMappingTest {
i = i + 1
}
val l = Ilist.lines.next()
val l = Ilist.linesWithSeparators.map(l =>l.stripLineEnd).next()
val item = UnpayWallToOAF.convertToOAF(l)

View File

@ -230,10 +230,15 @@ public class PropagationConstant {
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
if (HdfsSupport.exists(inputPath, spark.sparkContext().hadoopConfiguration())) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
} else {
return spark.emptyDataset(Encoders.bean(clazz));
}
}
public static <R extends Oaf> Dataset<R> readOafKryoPath(

View File

@ -90,12 +90,12 @@ public class CommunityConfigurationFactory {
}
private static SelectionConstraints parseConstrains(Node node) {
Node aconstraints = node.selectSingleNode("./advancedConstraints");
if (aconstraints == null) {
Node advConstsNode = node.selectSingleNode("./advancedConstraints");
if (advConstsNode == null || StringUtils.isBlank(StringUtils.trim(advConstsNode.getText()))) {
return null;
}
SelectionConstraints selectionConstraints = new Gson()
.fromJson(aconstraints.getText(), SelectionConstraints.class);
.fromJson(advConstsNode.getText(), SelectionConstraints.class);
selectionConstraints.setSelection(resolver);
return selectionConstraints;

View File

@ -1,8 +1,10 @@
package eu.dnetlib.dhp.bulktag.community;
import java.io.IOException;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.dom4j.DocumentException;
import org.xml.sax.SAXException;
@ -13,71 +15,17 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class QueryInformationSystem {
private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
+ " let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text() "
+ " let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept "
+ " let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept "
+ " let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept "
+ " let $fos := $x//CONFIGURATION/context/param[./@name='fos']/text() "
+ " let $sdg := $x//CONFIGURATION/context/param[./@name='sdg']/text() "
+
"let $zenodo := $x//param[./@name='zenodoCommunity']/text() "
+ " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] and $x//context/param[./@name = 'status']/text() != 'hidden' "
+ " return "
+ " <community> "
+ " { $x//CONFIGURATION/context/@id} "
+ " <subjects> "
+ " {for $y in tokenize($subj,',') "
+ " return "
+ " <subject>{$y}</subject>} "
+ " {for $y in tokenize($fos,',') "
+ " return "
+ " <subject>{$y}</subject>} "
+ " {for $y in tokenize($sdg,',') "
+ " return "
+ " <subject>{$y}</subject>} "
+ " </subjects> "
+ " <datasources> "
+ " {for $d in $datasources "
+ " where $d/param[./@name='enabled']/text()='true' "
+ " return "
+ " <datasource> "
+ " <openaireId> "
+ " {$d//param[./@name='openaireId']/text()} "
+ " </openaireId> "
+ " <selcriteria> "
+ " {$d/param[./@name='selcriteria']/text()} "
+ " </selcriteria> "
+ " </datasource> } "
+ " </datasources> " +
" <zenodocommunities> " +
"{for $zc in $zenodo " +
"return " +
"<zenodocommunity> " +
"<zenodoid> " +
"{$zc} " +
"</zenodoid> " +
"</zenodocommunity>}"
+ " {for $zc in $communities "
+ " return "
+ " <zenodocommunity> "
+ " <zenodoid> "
+ " {$zc/param[./@name='zenodoid']/text()} "
+ " </zenodoid> "
+ " <selcriteria> "
+ " {$zc/param[./@name='selcriteria']/text()} "
+ " </selcriteria> "
+ " </zenodocommunity>} "
+ " </zenodocommunities> "
+ "<advancedConstraint>"
+ "{$x//CONFIGURATION/context/param[./@name='advancedConstraint']/text()} "
+ "</advancedConstraint>"
+ " </community>";
public static CommunityConfiguration getCommunityConfiguration(final String isLookupUrl)
throws ISLookUpException, DocumentException, SAXException {
throws ISLookUpException, DocumentException, SAXException, IOException {
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
final List<String> res = isLookUp.quickSearchProfile(XQUERY);
final List<String> res = isLookUp
.quickSearchProfile(
IOUtils
.toString(
QueryInformationSystem.class
.getResourceAsStream(
"/eu/dnetlib/dhp/bulktag/query.xq")));
final String xmlConf = "<communities>" + Joiner.on(" ").join(res) + "</communities>";

View File

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;
@VerbClass("contains_ignorecase")
@VerbClass("contains_caseinsensitive")
public class ContainsVerbIgnoreCase implements Selection, Serializable {
private String param;

View File

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;
@VerbClass("equals_ignorecase")
@VerbClass("equals_caseinsensitive")
public class EqualVerbIgnoreCase implements Selection, Serializable {
private String param;

View File

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;
@VerbClass("not_contains_ignorecase")
@VerbClass("not_contains_caseinsensitive")
public class NotContainsVerbIgnoreCase implements Selection, Serializable {
private String param;

View File

@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;
@VerbClass("not_equals_ignorecase")
@VerbClass("not_equals_caseinsensitive")
public class NotEqualVerbIgnoreCase implements Selection, Serializable {
private String param;

View File

@ -0,0 +1,30 @@
package eu.dnetlib.dhp.bulktag.criteria;
import java.io.Serializable;
/**
 * Selection criterion annotated with the verb name {@code starts_with}.
 * A value satisfies the criterion when it begins with the configured prefix;
 * the comparison is the case-sensitive one performed by {@link String#startsWith(String)}.
 */
@VerbClass("starts_with")
public class StartsWithVerb implements Selection, Serializable {

	/** Prefix that candidate values are checked against. */
	private String param;

	/** Creates an instance with no prefix set; supply one through {@link #setParam(String)}. */
	public StartsWithVerb() {
		super();
	}

	/**
	 * Creates an instance bound to the given prefix.
	 *
	 * @param param the prefix a value must start with
	 */
	public StartsWithVerb(final String param) {
		super();
		this.param = param;
	}

	/**
	 * Checks the given value against the configured prefix.
	 *
	 * @param value the string under test
	 * @return {@code true} when {@code value} starts with the configured prefix
	 */
	@Override
	public boolean apply(String value) {
		final boolean matches = value.startsWith(this.param);
		return matches;
	}

	/** @return the prefix currently configured */
	public String getParam() {
		return this.param;
	}

	/** @param param the prefix to use for subsequent checks */
	public void setParam(String param) {
		this.param = param;
	}
}

View File

@ -38,13 +38,13 @@
{
"paramName": "test",
"paramLongName": "isTest",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramDescription": "Parameter intended for testing purposes only. True if the run is related to a test and so the taggingConf parameter should be loaded",
"paramRequired": false
},
{
"paramName": "tg",
"paramLongName": "taggingConf",
"paramDescription": "true if the spark session is managed, false otherwise",
"paramDescription": "this parameter is intended for testing purposes only. It is a possible tagging configuration obtained via the XQUERY. Intended to be removed",
"paramRequired": false
}

View File

@ -0,0 +1,58 @@
(: Emits one <community> element for every context of type 'community' or 'ri'
   whose status parameter is not 'hidden'. Each element carries the context id,
   the advanced constraints, the subjects (subject, fos and sdg parameters split
   on commas), the enabled datasources and the zenodo communities. :)
for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType')
let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text()
let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept
let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept
let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept
let $fos := $x//CONFIGURATION/context/param[./@name='fos']/text()
let $sdg := $x//CONFIGURATION/context/param[./@name='sdg']/text()
let $zenodo := $x//param[./@name='zenodoCommunity']/text()
where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] and $x//context/param[./@name = 'status']/text() != 'hidden'
return
<community>
{ $x//CONFIGURATION/context/@id}
<advancedConstraints>
{$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() }
</advancedConstraints>
<subjects>
{for $y in tokenize($subj,',')
return
<subject>{$y}</subject>}
{for $y in tokenize($fos,',')
return
<subject>{$y}</subject>}
{for $y in tokenize($sdg,',')
return
<subject>{$y}</subject>}
</subjects>
<datasources>
{for $d in $datasources
where $d/param[./@name='enabled']/text()='true'
return
<datasource>
<openaireId>
{$d//param[./@name='openaireId']/text()}
</openaireId>
<selcriteria>
{$d/param[./@name='selcriteria']/text()}
</selcriteria>
</datasource> }
</datasources>
<zenodocommunities>
{for $zc in $zenodo
return
<zenodocommunity>
<zenodoid>
{$zc}
</zenodoid>
</zenodocommunity>}
{for $zc in $communities
return
<zenodocommunity>
<zenodoid>
{$zc/param[./@name='zenodoid']/text()}
</zenodoid>
<selcriteria>
{$zc/param[./@name='selcriteria']/text()}
</selcriteria>
</zenodocommunity>}
</zenodocommunities>
</community>

View File

@ -16,6 +16,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
@ -45,7 +46,9 @@ public class BulkTagJobTest {
+ " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
+ " \"contributor\" : \"$['contributor'][*]['value']\","
+ " \"description\" : \"$['description'][*]['value']\", "
+ " \"subject\" :\"$['subject'][*]['value']\" }";
+ " \"subject\" :\"$['subject'][*]['value']\" , " +
"\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='subject:fos')].value\"} ";
private static SparkSession spark;
@ -769,28 +772,14 @@ public class BulkTagJobTest {
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
idExplodeCommunity.show(false);
Assertions.assertEquals(4, idExplodeCommunity.count());
Assertions.assertEquals(5, idExplodeCommunity.count());
Assertions
.assertEquals(
3, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
Assertions
.assertEquals(
1, idExplodeCommunity.filter("provenance = 'community:advconstraint'").count());
2, idExplodeCommunity.filter("provenance = 'community:advconstraint'").count());
}
// @Test
// void test1(){
// ProtoMap params = new Gson().fromJson(pathMap, ProtoMap.class);
// HashMap<String, String> param = new HashMap<>();
// for (String key : params.keySet()) {
// try {
// param.put(key, jsonContext.read(params.get(key)));
// } catch (com.jayway.jsonpath.PathNotFoundException e) {
// param.put(key, new ArrayList<>());
// }
// }
// return param;
// }
// }
}

View File

@ -83,4 +83,36 @@ class CommunityConfigurationFactoryTest {
Assertions.assertEquals("dariah", comm.get(0));
}
/**
 * Loads the {@code community_configuration_selcrit2.xml} fixture and checks that, for the
 * given author/description/contributor values, no community is returned for datasource
 * {@code openaire____::1cfdb2e14977f31a98e0118283401f32}.
 */
@Test
void loadSelCriteriaTest2() throws DocumentException, IOException, SAXException {
	final String configurationXml = IOUtils
		.toString(
			getClass()
				.getResourceAsStream(
					"/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration_selcrit2.xml"));
	final CommunityConfiguration configuration = CommunityConfigurationFactory.newInstance(configurationXml);

	// one single-valued, mutable list per field handed to the selection criteria
	final List<String> authors = new ArrayList<>(Collections.singletonList("Pippo Pippi"));
	final List<String> descriptions = new ArrayList<>(
		Collections.singletonList("This work has been partially supported by DARIAH-EU infrastructure"));
	final List<String> contributors = new ArrayList<>(
		Collections.singletonList("Author X helped to write the paper. X works for DARIAH"));

	final Map<String, List<String>> fieldValues = new HashMap<>();
	fieldValues.put("author", authors);
	fieldValues.put("description", descriptions);
	fieldValues.put("contributor", contributors);

	final List<String> communities = configuration
		.getCommunityForDatasource("openaire____::1cfdb2e14977f31a98e0118283401f32", fieldValues);

	// TODO add more assertions
	Assertions.assertEquals(0, communities.size());
}
}

View File

@ -844,6 +844,89 @@
<organizations/>
</community>
<community id="dariah">
<advancedConstraints>
{
"criteria": [
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "North America"
},
{
"verb": "contains",
"field": "fos",
"value": "05"
}
]
},
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "North America"
},
{
"verb": "contains",
"field": "fos",
"value": "06"
}
]
},
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Mexico"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "United States"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Canada"
},
{
"verb": "contains",
"field": "fos",
"value": "05"
}
]
},
{
"constraint": [
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Mexico"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "United States"
},
{
"verb": "equals_caseinsensitive",
"field": "subject",
"value": "Canada"
},
{
"verb": "contains",
"field": "fos",
"value": "06"
}
]
}
]
}
</advancedConstraints>
<subjects/>
<datasources>
<datasource>
@ -1174,7 +1257,9 @@
</zenodocommunities>
<organizations/>
</community>
<community id="euromarine">
<subjects/>
<datasources/>
<zenodocommunities/>
@ -1193,7 +1278,7 @@
<organizations/>
</community>
<community id="science-innovation-policy">
<advancedConstraints>{"criteria":[{"constraint":[{"verb":"equals_ignorecase","field":"subject","value":"ciencias de la comunicación"},
<advancedConstraints>{"criteria":[{"constraint":[{"verb":"equals_caseinsensitive","field":"subject","value":"ciencias de la comunicación"},
{"verb":"equals","field":"subject","value":"Miriam"}]},
{"constraint":[{"verb":"equals","field":"subject","value":"miriam"}]}]}</advancedConstraints>
<subjects>
@ -1317,81 +1402,81 @@
<datasources>
<datasource>
<openaireId>opendoar____::358aee4cc897452c00244351e4d91f69</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}}]}
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}}]}
</selcriteria>
</datasource>
<datasource>
<openaireId>re3data_____::7b0ad08687b2c960d5aeef06f811d5e6</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
</selcriteria>
</datasource>
<datasource>
<openaireId>driver______::bee53aa31dc2cbb538c10c2b65fa5824</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
</selcriteria>
</datasource>
<datasource>
<openaireId>openaire____::437f4b072b1aa198adcbc35910ff3b98</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
</selcriteria>
</datasource>
<datasource>
<openaireId>openaire____::081b82f96300b6a6e3d282bad31cb6e2</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
</selcriteria>
</datasource>
<datasource>
<openaireId>openaire____::9e3be59865b2c1c335d32dae2fe7b254</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
</selcriteria>
</datasource>
<datasource>
<openaireId>opendoar____::8b6dd7db9af49e67306feb59a8bdc52c</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
</selcriteria>
</datasource>
<datasource>
<openaireId>share_______::4719356ec8d7d55d3feb384ce879ad6c</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
</selcriteria>
</datasource>
<datasource>
<openaireId>share_______::bbd802baad85d1fd440f32a7a3a2c2b1</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
</selcriteria>
</datasource>
<datasource>
<openaireId>opendoar____::6f4922f45568161a8cdf4ad2299f6d23</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
</selcriteria>
</datasource>
<datasource>
<openaireId>re3data_____::7980778c78fb4cf0fab13ce2159030dc</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCov"}]}]}</selcriteria>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCov"}]}]}</selcriteria>
</datasource>
<datasource>
<openaireId>re3data_____::978378def740bbf2bfb420de868c460b</openaireId>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCov"}]}]}</selcriteria>
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCov"}]}]}</selcriteria>
</datasource>
</datasources>
<zenodocommunities>

View File

@ -0,0 +1,25 @@
# Root logger option
log4j.rootLogger=DEBUG, stdout
# Direct log messages to stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
# Change this to set Spark log level
log4j.logger.org.apache.spark=ERROR
log4j.rootCategory=WARN
# Silence akka remoting
log4j.logger.Remoting=WARN
# Ignore messages below warning level from Jetty, because it's a bit verbose
log4j.logger.org.eclipse.jetty=WARN
log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitterFactory=WARN
log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter=WARN
log4j.logger.org.apache.parquet.hadoop.ParquetOutputFormat=WARN
log4j.logger.org.apache.parquet.hadoop.InternalParquetRecordWriter=WARN
log4j.logger.org.apache.hadoop.io.compress.CodecPool=WARN
log4j.logger.org.apache.parquet.hadoop.codec.CodecConfig=WARN

View File

@ -47,8 +47,8 @@ public class CleanContextSparkJob implements Serializable {
String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
String contextId = parser.get("contextId");
log.info("contextId: {}", contextId);
@ -67,12 +67,12 @@ public class CleanContextSparkJob implements Serializable {
isSparkSessionManaged,
spark -> {
cleanContext(spark, contextId, verifyParam, inputPath, entityClazz, workingPath);
cleanContext(spark, contextId, verifyParam, inputPath, entityClazz, workingDir);
});
}
private static <T extends Result> void cleanContext(SparkSession spark, String contextId, String verifyParam,
String inputPath, Class<T> entityClazz, String workingPath) {
String inputPath, Class<T> entityClazz, String workingDir) {
Dataset<T> res = spark
.read()
.textFile(inputPath)
@ -106,11 +106,11 @@ public class CleanContextSparkJob implements Serializable {
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath);
.json(workingDir);
spark
.read()
.textFile(workingPath)
.textFile(workingDir)
.map(
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
Encoders.bean(entityClazz))

View File

@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.graph.clean;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.commons.lang3.SerializationUtils;
@ -10,6 +11,7 @@ import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
@ -31,29 +33,30 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
}
private static void cleanSubject(VocabularyGroup vocabularies, Subject subject) {
if (cleanSubjectForVocabulary(ModelConstants.DNET_SUBJECT_FOS_CLASSID, vocabularies, subject)) {
return;
} else {
// TODO cleaning based on different subject vocabs can be added here
}
cleanSubjectForVocabulary(ModelConstants.DNET_SUBJECT_FOS_CLASSID, vocabularies, subject);
// TODO cleaning based on different subject vocabs can be added here
}
private static boolean cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
private static void cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
Subject subject) {
AtomicReference<Boolean> modified = new AtomicReference<>(false);
vocabularies.find(vocabularyId).ifPresent(vocabulary -> {
if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) {
return;
}
Qualifier newValue = vocabulary.lookup(subject.getValue());
if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) {
subject.setValue(newValue.getClassid());
subject.getQualifier().setClassid(vocabularyId);
subject.getQualifier().setClassname(vocabulary.getName());
modified.set(true);
if (ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) {
Qualifier newValue = vocabulary.lookup(subject.getValue(), true);
if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) {
subject.setValue(newValue.getClassid());
subject.getQualifier().setClassid(vocabularyId);
subject.getQualifier().setClassname(vocabulary.getName());
}
} else if (vocabularyId.equals(subject.getQualifier().getClassid())) {
Qualifier syn = vocabulary.getSynonymAsQualifier(subject.getValue());
VocabularyTerm term = vocabulary.getTerm(subject.getValue());
if (Objects.isNull(syn) && Objects.isNull(term)) {
subject.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_KEYWORD);
subject.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_KEYWORD);
}
}
});
return modified.get();
}
private static void cleanRelation(VocabularyGroup vocabularies, Relation r) {

View File

@ -0,0 +1,45 @@
package eu.dnetlib.dhp.oa.graph.clean;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.action.ReadDatasourceMasterDuplicateFromDB;
/**
 * Command-line entry point that reads the PostgreSQL connection settings and the HDFS
 * destination from the arguments described in {@code datasourcemaster_parameters.json}
 * and hands them to {@link ReadDatasourceMasterDuplicateFromDB#execute}.
 */
public class MasterDuplicateAction {

	private static final Logger log = LoggerFactory.getLogger(MasterDuplicateAction.class);

	/**
	 * Parses the arguments, logs them and runs the export.
	 *
	 * @param args command-line arguments matching {@code datasourcemaster_parameters.json}
	 * @throws Exception if the arguments cannot be parsed or the export fails
	 */
	public static void main(final String[] args) throws Exception {
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(
					MasterDuplicateAction.class
						.getResourceAsStream(
							"/eu/dnetlib/dhp/oa/graph/datasourcemaster_parameters.json")));

		parser.parseArgument(args);

		final String dbUrl = parser.get("postgresUrl");
		log.info("postgresUrl: {}", dbUrl);

		final String dbUser = parser.get("postgresUser");
		log.info("postgresUser: {}", dbUser);

		final String dbPassword = parser.get("postgresPassword");
		// never write the credential to the log: only record that the parameter was read
		log.info("postgresPassword: xxx");

		final String hdfsPath = parser.get("hdfsPath");
		log.info("hdfsPath: {}", hdfsPath);

		final String hdfsNameNode = parser.get("hdfsNameNode");
		log.info("hdfsNameNode: {}", hdfsNameNode);

		// execute returns an int, logged below as the number of written rows
		int rows = ReadDatasourceMasterDuplicateFromDB.execute(dbUrl, dbUser, dbPassword, hdfsPath, hdfsNameNode);

		log.info("written {} rows", rows);
	}

}

View File

@ -0,0 +1,227 @@
package eu.dnetlib.dhp.oa.graph.clean.cfhb;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.expressions.Aggregator;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.common.action.model.MasterDuplicate;
import eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
public class CleanCfHbSparkJob {
private static final Logger log = LoggerFactory.getLogger(CleanCfHbSparkJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
/**
 * Entry point of the Spark job that patches the collectedfrom / hostedby datasource references of one graph
 * result table.
 *
 * Arguments are described by {@code input_clean_cfhb_parameters.json}: {@code isSparkSessionManaged} (optional,
 * defaults to true), {@code inputPath}, {@code resolvedPath}, {@code outputPath}, {@code masterDuplicatePath} and
 * {@code graphTableClassName}.
 *
 * @param args the command line arguments
 * @throws Exception when the parameter definition cannot be loaded or parsed, when {@code graphTableClassName}
 *                   cannot be resolved to a class, or when the Spark job fails
 */
public static void main(String[] args) throws Exception {
	final String parametersResource = "/eu/dnetlib/dhp/oa/graph/input_clean_cfhb_parameters.json";
	// The definition is resolved through this job's own class (it used to go through CleanCountrySparkJob);
	// a missing resource is reported by name instead of surfacing as an anonymous NullPointerException.
	String jsonConfiguration = IOUtils
		.toString(
			Objects
				.requireNonNull(
					CleanCfHbSparkJob.class.getResourceAsStream(parametersResource),
					"missing classpath resource: " + parametersResource));
	final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
	parser.parseArgument(args);

	Boolean isSparkSessionManaged = Optional
		.ofNullable(parser.get("isSparkSessionManaged"))
		.map(Boolean::valueOf)
		.orElse(Boolean.TRUE);
	log.info("isSparkSessionManaged: {}", isSparkSessionManaged);

	String inputPath = parser.get("inputPath");
	log.info("inputPath: {}", inputPath);

	String resolvedPath = parser.get("resolvedPath");
	log.info("resolvedPath: {}", resolvedPath);

	String outputPath = parser.get("outputPath");
	log.info("outputPath: {}", outputPath);

	String dsMasterDuplicatePath = parser.get("masterDuplicatePath");
	log.info("masterDuplicatePath: {}", dsMasterDuplicatePath);

	String graphTableClassName = parser.get("graphTableClassName");
	log.info("graphTableClassName: {}", graphTableClassName);

	// The cast cannot be verified at compile time: the class name comes from the command line.
	// NOTE(review): a class that is not a Result subtype would only fail later, inside the Spark job.
	@SuppressWarnings("unchecked")
	Class<? extends Result> entityClazz = (Class<? extends Result>) Class.forName(graphTableClassName);

	SparkConf conf = new SparkConf();
	runWithSparkSession(
		conf,
		isSparkSessionManaged,
		spark -> {
			// clear both target paths before running
			HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
			HdfsSupport.remove(resolvedPath, spark.sparkContext().hadoopConfiguration());
			cleanCfHb(
				spark, inputPath, entityClazz, resolvedPath, dsMasterDuplicatePath, outputPath);
		});
}
/**
 * Replaces, in every result read from {@code inputPath}, the collectedfrom / hostedby datasource references that
 * point to a duplicate datasource with the id and name of the corresponding master datasource.
 *
 * The job runs in two steps: the (resultId, cfhb, masterId, masterName) associations are first computed and
 * stored as JSON under {@code resolvedPath}; they are then read back, joined with the results and applied, and
 * the patched results are written as gzip-compressed JSON under {@code outputPath}.
 *
 * @param spark               the Spark session
 * @param inputPath           path of the result table (one JSON record per line)
 * @param entityClazz         class modelling the records of the result table
 * @param resolvedPath        path where the resolved associations are stored
 * @param masterDuplicatePath path of the master/duplicate datasource tuples (one JSON record per line)
 * @param outputPath          path where the patched results are stored
 */
private static <T extends Result> void cleanCfHb(SparkSession spark, String inputPath, Class<T> entityClazz,
	String resolvedPath, String masterDuplicatePath, String outputPath) {

	final Encoder<T> resultEncoder = Encoders.bean(entityClazz);
	final Encoder<IdCfHbMapping> mappingEncoder = Encoders.bean(IdCfHbMapping.class);

	// master/duplicate datasource tuples
	final Dataset<MasterDuplicate> masterDuplicates = spark
		.read()
		.textFile(masterDuplicatePath)
		.map(as(MasterDuplicate.class), Encoders.bean(MasterDuplicate.class));

	// one row per (result id, referenced datasource id); the master id/name are not set yet
	final Dataset<IdCfHbMapping> references = spark
		.read()
		.textFile(inputPath)
		.map(as(entityClazz), resultEncoder)
		.flatMap(flattenCfHbFn(), mappingEncoder);

	// keep only the references matching a duplicate, enrich them with the master id/name and store them
	final Column referencesDuplicate = references.col("cfhb").equalTo(masterDuplicates.col("duplicateId"));
	references
		.joinWith(masterDuplicates, referencesDuplicate)
		.map(asIdCfHbMapping(), mappingEncoder)
		.filter((FilterFunction<IdCfHbMapping>) mapping -> mapping.getMasterId() != null)
		.write()
		.mode(SaveMode.Overwrite)
		.json(resolvedPath);

	// reload the stored associations
	final Dataset<IdCfHbMapping> storedMappings = spark
		.read()
		.textFile(resolvedPath)
		.map(as(IdCfHbMapping.class), mappingEncoder);

	// the result table to be patched
	final Dataset<T> results = spark
		.read()
		.textFile(inputPath)
		.map(as(entityClazz), resultEncoder);

	// left join: results without any association are kept unchanged; rows are then grouped by result id so that
	// all the associations of one result are applied to a single record
	final Column sameResult = results.col("id").equalTo(storedMappings.col("resultId"));
	results
		.joinWith(storedMappings, sameResult, "left")
		.groupByKey((MapFunction<Tuple2<T, IdCfHbMapping>, String>) row -> row._1().getId(), Encoders.STRING())
		.mapGroups(getMapGroupsFunction(), resultEncoder)
		.write()
		.mode(SaveMode.Overwrite)
		.option("compression", "gzip")
		.json(outputPath);
}
/**
 * Builds the function that copies the master id and master name of the joined {@link MasterDuplicate} onto the
 * {@link IdCfHbMapping} it was joined with.
 *
 * @return a function returning the (mutated) left element of the tuple; the mapping is returned untouched when
 *         the right element is null
 */
private static MapFunction<Tuple2<IdCfHbMapping, MasterDuplicate>, IdCfHbMapping> asIdCfHbMapping() {
	return joined -> {
		final IdCfHbMapping target = joined._1();
		final MasterDuplicate master = joined._2();
		if (master != null) {
			target.setMasterId(master.getMasterId());
			target.setMasterName(master.getMasterName());
		}
		return target;
	};
}
/**
 * Builds the function that lists the datasource ids referenced by a result: the keys of its collectedfrom
 * entries, followed by the hostedby keys and then the collectedfrom keys of its instances.
 *
 * Missing lists, missing hostedby / collectedfrom values and blank keys are skipped; every id is emitted once.
 *
 * @return a function producing one {@link IdCfHbMapping} (result id + datasource id) per distinct referenced id
 */
private static <T extends Result> FlatMapFunction<T, IdCfHbMapping> flattenCfHbFn() {
	return r -> {
		// keys of the result-level collectedfrom
		final Stream<String> resultCollectedFrom = r.getCollectedfrom() == null
			? Stream.<String> empty()
			: r.getCollectedfrom().stream().map(KeyValue::getKey);

		// keys of the instance-level hostedby
		final Stream<String> instanceHostedBy = r.getInstance() == null
			? Stream.<String> empty()
			: r
				.getInstance()
				.stream()
				.map((Instance i) -> i.getHostedby() == null ? null : i.getHostedby().getKey())
				.filter(StringUtils::isNotBlank);

		// keys of the instance-level collectedfrom
		final Stream<String> instanceCollectedFrom = r.getInstance() == null
			? Stream.<String> empty()
			: r
				.getInstance()
				.stream()
				.map((Instance i) -> i.getCollectedfrom() == null ? null : i.getCollectedfrom().getKey())
				.filter(StringUtils::isNotBlank);

		return Stream
			.concat(resultCollectedFrom, Stream.concat(instanceHostedBy, instanceCollectedFrom))
			.distinct()
			.filter(StringUtils::isNotBlank)
			.map(datasourceId -> asIdCfHbMapping(r.getId(), datasourceId))
			.iterator();
	};
}
/**
 * Builds the function that collapses all the rows sharing the same result id into one patched result.
 *
 * The result carried by the first row of the group is used as the output record; the mapping of every row in
 * the group (null for results without any resolved reference) is applied to it.
 *
 * @return the group function, yielding the patched result
 */
private static <T extends Result> MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T> getMapGroupsFunction() {
	return new MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T>() {
		@Override
		public T call(String key, Iterator<Tuple2<T, IdCfHbMapping>> values) {
			final Tuple2<T, IdCfHbMapping> first = values.next();
			final T res = first._1();

			updateResult(res, first._2());
			values.forEachRemaining(t -> updateResult(res, t._2()));
			return res;
		}

		/**
		 * Applies one mapping to the result-level collectedfrom and to the hostedby / collectedfrom of each
		 * instance. Missing lists and null instances are skipped: records with such gaps are tolerated when the
		 * references are extracted, so they must not fail here either.
		 */
		private void updateResult(T res, IdCfHbMapping m) {
			if (Objects.isNull(m)) {
				return;
			}
			if (Objects.nonNull(res.getCollectedfrom())) {
				res.getCollectedfrom().forEach(kv -> updateKeyValue(kv, m));
			}
			if (Objects.nonNull(res.getInstance())) {
				res
					.getInstance()
					.stream()
					.filter(Objects::nonNull)
					.forEach(i -> {
						updateKeyValue(i.getHostedby(), m);
						updateKeyValue(i.getCollectedfrom(), m);
					});
			}
		}

		/**
		 * Replaces key and value with the master id and name when the key is the duplicate id of the mapping.
		 * A null KeyValue, or one without a key, is left untouched.
		 */
		private void updateKeyValue(final KeyValue kv, final IdCfHbMapping a) {
			if (Objects.nonNull(kv) && Objects.nonNull(kv.getKey()) && kv.getKey().equals(a.getCfhb())) {
				kv.setKey(a.getMasterId());
				kv.setValue(a.getMasterName());
			}
		}
	};
}
/**
 * Creates a mapping carrying only the result id and the referenced datasource id; master id and name are
 * left unset.
 *
 * @param resultId the result identifier
 * @param cfHb     the datasource identifier found in a collectedfrom / hostedby field
 * @return the new mapping
 */
private static IdCfHbMapping asIdCfHbMapping(String resultId, String cfHb) {
	final IdCfHbMapping mapping = new IdCfHbMapping(resultId);
	mapping.setCfhb(cfHb);
	return mapping;
}
/**
 * Builds the function that deserialises a JSON line into an instance of the given class using the shared
 * {@code OBJECT_MAPPER}.
 *
 * @param clazz the target class
 * @return the JSON-to-object function
 */
private static <R> MapFunction<String, R> as(Class<R> clazz) {
	return json -> OBJECT_MAPPER.readValue(json, clazz);
}
}

View File

@ -0,0 +1,54 @@
package eu.dnetlib.dhp.oa.graph.clean.cfhb;
import java.io.Serializable;
/**
 * Bean associating a result with one datasource id referenced by it and, once resolved, with the id and name
 * of the master datasource.
 */
public class IdCfHbMapping implements Serializable {

	/** Identifier of the result. */
	private String resultId;

	/** Datasource identifier referenced by the result. */
	private String cfhb;

	/** Identifier of the master datasource. */
	private String masterId;

	/** Name of the master datasource. */
	private String masterName;

	/** Creates an empty mapping. */
	public IdCfHbMapping() {
	}

	/**
	 * Creates a mapping for the given result.
	 *
	 * @param id the result identifier
	 */
	public IdCfHbMapping(String id) {
		this.resultId = id;
	}

	// --- getters ---

	public String getResultId() {
		return this.resultId;
	}

	public String getCfhb() {
		return this.cfhb;
	}

	public String getMasterId() {
		return this.masterId;
	}

	public String getMasterName() {
		return this.masterName;
	}

	// --- setters ---

	public void setResultId(String resultId) {
		this.resultId = resultId;
	}

	public void setCfhb(String cfhb) {
		this.cfhb = cfhb;
	}

	public void setMasterId(String masterId) {
		this.masterId = masterId;
	}

	public void setMasterName(String masterName) {
		this.masterName = masterName;
	}
}

View File

@ -4,9 +4,12 @@ package eu.dnetlib.dhp.oa.graph.clean.country;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.swing.text.html.Option;
@ -30,6 +33,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
@ -43,7 +47,7 @@ public class CleanCountrySparkJob implements Serializable {
String jsonConfiguration = IOUtils
.toString(
CleanContextSparkJob.class
CleanCountrySparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/input_clean_country_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
@ -58,8 +62,8 @@ public class CleanCountrySparkJob implements Serializable {
String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
String workingDir = parser.get("workingDir");
log.info("workingDir: {}", workingDir);
String datasourcePath = parser.get("hostedBy");
log.info("datasourcePath: {}", datasourcePath);
@ -85,12 +89,12 @@ public class CleanCountrySparkJob implements Serializable {
spark -> {
cleanCountry(
spark, country, verifyParam, inputPath, entityClazz, workingPath, collectedfrom, datasourcePath);
spark, country, verifyParam, inputPath, entityClazz, workingDir, collectedfrom, datasourcePath);
});
}
private static <T extends Result> void cleanCountry(SparkSession spark, String country, String[] verifyParam,
String inputPath, Class<T> entityClazz, String workingPath, String collectedfrom, String datasourcePath) {
String inputPath, Class<T> entityClazz, String workingDir, String collectedfrom, String datasourcePath) {
List<String> hostedBy = spark
.read()
@ -110,8 +114,8 @@ public class CleanCountrySparkJob implements Serializable {
return r;
}
if (r
.getPid()
List<StructuredProperty> ids = getPidsAndAltIds(r).collect(Collectors.toList());
if (ids
.stream()
.anyMatch(
p -> p
@ -134,11 +138,11 @@ public class CleanCountrySparkJob implements Serializable {
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath);
.json(workingDir);
spark
.read()
.textFile(workingPath)
.textFile(workingDir)
.map(
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
Encoders.bean(entityClazz))
@ -148,6 +152,42 @@ public class CleanCountrySparkJob implements Serializable {
.json(inputPath);
}
/**
 * Collects the identifiers of a result: its own pids, followed by the pids and then the alternate identifiers
 * of its instances. Missing lists contribute nothing.
 *
 * @param r the result
 * @return the stream of identifiers, in the order described above
 */
private static <T extends Result> Stream<StructuredProperty> getPidsAndAltIds(T r) {
	final Stream<StructuredProperty> resultPids = r.getPid() == null
		? Stream.<StructuredProperty> empty()
		: r.getPid().stream();

	final Stream<StructuredProperty> instancePids = r.getInstance() == null
		? Stream.<StructuredProperty> empty()
		: r
			.getInstance()
			.stream()
			.flatMap(
				(Instance i) -> i.getPid() == null
					? Stream.<StructuredProperty> empty()
					: i.getPid().stream());

	final Stream<StructuredProperty> instanceAltIds = r.getInstance() == null
		? Stream.<StructuredProperty> empty()
		: r
			.getInstance()
			.stream()
			.flatMap(
				(Instance i) -> i.getAlternateIdentifier() == null
					? Stream.<StructuredProperty> empty()
					: i.getAlternateIdentifier().stream());

	final Stream<StructuredProperty> allPids = Stream.concat(resultPids, instancePids);
	return Stream.concat(allPids, instanceAltIds);
}
private static boolean pidInParam(String value, String[] verifyParam) {
for (String s : verifyParam)
if (value.startsWith(s))

View File

@ -54,8 +54,8 @@ public class GetDatasourceFromCountry implements Serializable {
String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
String workingPath = parser.get("workingPath");
log.info("workingPath: {}", workingPath);
String workingPath = parser.get("workingDir");
log.info("workingDir: {}", workingPath);
String country = parser.get("country");
log.info("country: {}", country);
@ -65,13 +65,12 @@ public class GetDatasourceFromCountry implements Serializable {
conf,
isSparkSessionManaged,
spark -> {
getDatasourceFromCountry(spark, country, inputPath, workingPath);
});
}
private static void getDatasourceFromCountry(SparkSession spark, String country, String inputPath,
String workingPath) {
String workingDir) {
Dataset<Organization> organization = spark
.read()
@ -83,7 +82,6 @@ public class GetDatasourceFromCountry implements Serializable {
(FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference() &&
o.getCountry().getClassid().length() > 0 &&
o.getCountry().getClassid().equals(country));
;
// filtering of the relations taking the non deleted by inference and those with IsProvidedBy as relclass
Dataset<Relation> relation = spark
@ -97,12 +95,12 @@ public class GetDatasourceFromCountry implements Serializable {
!rel.getDataInfo().getDeletedbyinference());
organization
.joinWith(relation, organization.col("id").equalTo(relation.col("target")), "left")
.joinWith(relation, organization.col("id").equalTo(relation.col("target")))
.map((MapFunction<Tuple2<Organization, Relation>, String>) t2 -> t2._2().getSource(), Encoders.STRING())
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(workingPath);
.json(workingDir);
}
}

View File

@ -366,6 +366,7 @@ public abstract class AbstractMdRecordToOafMapper {
r.setInstance(instances);
r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances));
r.setEoscifguidelines(prepareEOSCIfGuidelines(doc, info));
}
protected abstract List<StructuredProperty> prepareResultPids(Document doc, DataInfo info);
@ -384,6 +385,25 @@ public abstract class AbstractMdRecordToOafMapper {
return list;
}
/**
 * Extracts the EOSC interoperability guidelines declared in the record, one for each
 * {@code oaf:eoscifguidelines} node carrying a non-blank {@code code} attribute.
 *
 * Entries are gathered in a set before being returned as a list, so entries the set considers equal are kept
 * once and the order of the returned list is not defined.
 *
 * @param doc  the metadata record
 * @param info not used by this method
 * @return the guidelines found in the record, possibly empty
 */
private List<EoscIfGuidelines> prepareEOSCIfGuidelines(Document doc, DataInfo info) {
	final Set<EoscIfGuidelines> guidelines = Sets.newHashSet();
	for (final Object item : doc.selectNodes("//oaf:eoscifguidelines")) {
		final Node node = (Node) item;
		final String code = node.valueOf("@code");
		if (StringUtils.isBlank(code)) {
			// nodes without a code are ignored
			continue;
		}
		final EoscIfGuidelines guideline = new EoscIfGuidelines();
		guideline.setCode(code);
		guideline.setLabel(node.valueOf("@label"));
		guideline.setUrl(node.valueOf("@url"));
		guideline.setSemanticRelation(node.valueOf("@semanticrelation"));
		guidelines.add(guideline);
	}
	return Lists.newArrayList(guidelines);
}
protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
protected abstract List<Instance> prepareInstances(

View File

@ -317,13 +317,13 @@
</switch>
</decision>
<fork name="fork_clean_context">
<path start="clean_publication_context"/>
<path start="clean_dataset_context"/>
<path start="clean_otherresearchproduct_context"/>
<path start="clean_software_context"/>
</fork>
<action name="clean_publication_context">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
@ -343,7 +343,7 @@
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--workingPath</arg><arg>${workingDir}/working/publication</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/publication</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
</spark>
@ -370,7 +370,7 @@
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--workingPath</arg><arg>${workingDir}/working/dataset</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/dataset</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
</spark>
@ -397,7 +397,7 @@
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--workingPath</arg><arg>${workingDir}/working/otherresearchproduct</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/otherresearchproduct</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
</spark>
@ -424,7 +424,7 @@
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--workingPath</arg><arg>${workingDir}/working/software</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/software</arg>
<arg>--contextId</arg><arg>${contextId}</arg>
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
</spark>
@ -432,14 +432,13 @@
<error to="Kill"/>
</action>
<join name="wait_clean_context" to="getHostedby"/>
<join name="wait_clean_context" to="select_datasourceId_from_country"/>
<action name="getHostedby">
<action name="select_datasourceId_from_country">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean publications context</name>
<name>Select datasource ID from country</name>
<class>eu.dnetlib.dhp.oa.graph.clean.country.GetDatasourceFromCountry</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
@ -453,25 +452,25 @@
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}</arg>
<arg>--workingPath</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--country</arg><arg>${country}</arg>
</spark>
<ok to="fork_clean_country"/>
<error to="Kill"/>
</action>
<fork name="fork_clean_country">
<path start="clean_publication_country"/>
<path start="clean_dataset_country"/>
<path start="clean_otherresearchproduct_country"/>
<path start="clean_software_country"/>
</fork>
<action name="clean_publication_country">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean publications counmtry</name>
<name>Clean publication country</name>
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
@ -486,13 +485,13 @@
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--workingPath</arg><arg>${workingDir}/working/publication</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/publication</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
<arg>--datasourcePath</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
</spark>
<ok to="wait_clean_context"/>
<ok to="wait_clean_country"/>
<error to="Kill"/>
</action>
@ -500,7 +499,7 @@
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean datasets Country</name>
<name>Clean dataset country</name>
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
@ -515,13 +514,13 @@
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--workingPath</arg><arg>${workingDir}/working/dataset</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/dataset</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
<arg>--datasourcePath</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
</spark>
<ok to="wait_clean_context"/>
<ok to="wait_clean_country"/>
<error to="Kill"/>
</action>
@ -529,7 +528,7 @@
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean otherresearchproducts country</name>
<name>Clean otherresearchproduct country</name>
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
@ -544,13 +543,13 @@
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--workingPath</arg><arg>${workingDir}/working/otherresearchproduct</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/otherresearchproduct</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
<arg>--datasourcePath</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
</spark>
<ok to="wait_clean_context"/>
<ok to="wait_clean_country"/>
<error to="Kill"/>
</action>
@ -558,7 +557,7 @@
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean softwares country</name>
<name>Clean software country</name>
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
@ -573,17 +572,212 @@
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--workingPath</arg><arg>${workingDir}/working/software</arg>
<arg>--workingDir</arg><arg>${workingDir}/working/software</arg>
<arg>--country</arg><arg>${country}</arg>
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
<arg>--datasourcePath</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
</spark>
<ok to="wait_clean_country"/>
<error to="Kill"/>
</action>
<join name="wait_clean_country" to="End"/>
<join name="wait_clean_country" to="should_patch_datasource_ids"/>
<decision name="should_patch_datasource_ids">
<switch>
<case to="get_ds_master_duplicate">${wf:conf('shouldClean') eq true}</case>
<default to="End"/>
</switch>
</decision>
<action name="get_ds_master_duplicate">
<java>
<main-class>eu.dnetlib.dhp.oa.graph.clean.MasterDuplicateAction</main-class>
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
<arg>--hdfsPath</arg><arg>${workingDir}/masterduplicate</arg>
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
</java>
<ok to="fork_patch_cfhb"/>
<error to="Kill"/>
</action>
<fork name="fork_patch_cfhb">
<path start="patch_publication_cfhb"/>
<path start="patch_dataset_cfhb"/>
<path start="patch_otherresearchproduct_cfhb"/>
<path start="patch_software_cfhb"/>
</fork>
<action name="patch_publication_cfhb">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>patch publication cfhb</name>
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
</spark>
<ok to="wait_clean_cfhb"/>
<error to="Kill"/>
</action>
<action name="patch_dataset_cfhb">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>patch dataset cfhb</name>
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
</spark>
<ok to="wait_clean_cfhb"/>
<error to="Kill"/>
</action>
<action name="patch_otherresearchproduct_cfhb">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>patch otherresearchproduct cfhb</name>
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/otherresearchproduct</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
</spark>
<ok to="wait_clean_cfhb"/>
<error to="Kill"/>
</action>
<action name="patch_software_cfhb">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>patch software cfhb</name>
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/software</arg>
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/software</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
</spark>
<ok to="wait_clean_cfhb"/>
<error to="Kill"/>
</action>
<join name="wait_clean_cfhb" to="fork_copy_cfhb_patched_results"/>
<fork name="fork_copy_cfhb_patched_results">
<path start="copy_cfhb_patched_publication"/>
<path start="copy_cfhb_patched_dataset"/>
<path start="copy_cfhb_patched_otherresearchproduct"/>
<path start="copy_cfhb_patched_software"/>
</fork>
<action name="copy_cfhb_patched_publication">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<prepare>
<delete path="${graphOutputPath}/publication"/>
</prepare>
<arg>${workingDir}/cfHbPatched/publication</arg>
<arg>${graphOutputPath}/publication</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_cfhb_patched_dataset">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<prepare>
<delete path="${graphOutputPath}/dataset"/>
</prepare>
<arg>${workingDir}/cfHbPatched/dataset</arg>
<arg>${graphOutputPath}/dataset</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_cfhb_patched_otherresearchproduct">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<prepare>
<delete path="${graphOutputPath}/otherresearchproduct"/>
</prepare>
<arg>${workingDir}/cfHbPatched/otherresearchproduct</arg>
<arg>${graphOutputPath}/otherresearchproduct</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<action name="copy_cfhb_patched_software">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<prepare>
<delete path="${graphOutputPath}/software"/>
</prepare>
<arg>${workingDir}/cfHbPatched/software</arg>
<arg>${graphOutputPath}/software</arg>
</distcp>
<ok to="copy_wait"/>
<error to="Kill"/>
</action>
<join name="copy_wait" to="End"/>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,32 @@
[
{
"paramName": "pu",
"paramLongName": "postgresUrl",
"paramDescription": "the jdbc url to the postgres",
"paramRequired": true
},
{
"paramName": "uid",
"paramLongName": "postgresUser",
"paramDescription": "the postgres user",
"paramRequired": true
},
{
"paramName": "pwd",
"paramLongName": "postgresPassword",
"paramDescription": "the postgres password",
"paramRequired": true
},
{
"paramName": "p",
"paramLongName": "hdfsPath",
"paramDescription": "the target path on HDFS",
"paramRequired": true
},
{
"paramName": "nn",
"paramLongName": "hdfsNameNode",
"paramDescription": "the HDFS nameNode",
"paramRequired": true
}
]

View File

@ -126,6 +126,7 @@
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--numPartitions</arg><arg>8000</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>
@ -152,6 +153,7 @@
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--numPartitions</arg><arg>4000</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>
@ -178,6 +180,7 @@
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--numPartitions</arg><arg>3000</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>
@ -204,6 +207,7 @@
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--numPartitions</arg><arg>300</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>
@ -230,6 +234,7 @@
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--numPartitions</arg><arg>100</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>
@ -256,6 +261,7 @@
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--numPartitions</arg><arg>400</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>
@ -309,6 +315,7 @@
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--numPartitions</arg><arg>10000</arg>
</spark>
<ok to="join_import"/>
<error to="Kill"/>

View File

@ -0,0 +1,38 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "in",
"paramLongName": "inputPath",
"paramDescription": "the path to the graph data dump to read",
"paramRequired": true
},
{
"paramName": "rp",
"paramLongName": "resolvedPath",
"paramDescription": "the path to store the resolved records",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path to store the output graph",
"paramRequired": true
},
{
"paramName": "class",
"paramLongName": "graphTableClassName",
"paramDescription": "class name modelling the graph table",
"paramRequired": true
},
{
"paramName": "md",
"paramLongName": "masterDuplicatePath",
"paramDescription": "path to the file on HDFS holding the datasource id tuples [master, duplicate]",
"paramRequired": true
}
]

View File

@ -12,8 +12,8 @@
"paramRequired": true
},
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramName": "wd",
"paramLongName": "workingDir",
"paramDescription": "the path to store the output graph",
"paramRequired": true
},

View File

@ -12,8 +12,8 @@
"paramRequired": true
},
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramName": "wd",
"paramLongName": "workingDir",
"paramDescription": "the path to store the output graph",
"paramRequired": true
},

View File

@ -12,8 +12,8 @@
"paramRequired": true
},
{
"paramName": "wp",
"paramLongName": "workingPath",
"paramName": "wd",
"paramLongName": "workingDir",
"paramDescription": "the path to store the output graph",
"paramRequired": true
},

View File

@ -116,54 +116,45 @@ object SparkConvertRDDtoDataset {
.map(s => mapper.readValue(s, classOf[Relation]))
.filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference)
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
.filter(r => filterRelations(subRelTypeFilter, relClassFilter, r))
//filter OpenCitations relations
.filter(r =>
r.getDataInfo.getProvenanceaction != null &&
!"sysimport:crosswalk:opencitations".equals(r.getDataInfo.getProvenanceaction.getClassid)
)
.filter(r => filterRelations(r))
//filter OpenCitations relations
// .filter(r =>
// r.getDataInfo.getProvenanceaction != null &&
// !"sysimport:crosswalk:opencitations".equals(r.getDataInfo.getProvenanceaction.getClassid)
// )
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
}
private def filterRelations(subRelTypeFilter: String, relClassFilter: List[String], r: Relation): Boolean = {
if (StringUtils.isNotBlank(subRelTypeFilter)) {
subRelTypeFilter.equalsIgnoreCase(r.getSubRelType)
} else {
!relClassFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))
private def filterRelations(r: Relation): Boolean = {

  /** Decides whether a relation is kept (true) or dropped (false).
    *
    * Dropped relations are:
    *  - those generated by deduplication / similarity (see the relClass list below);
    *  - those without any collectedFrom;
    *  - those whose only collectedFrom is OpenCitations.
    */
  val excludedRelClasses = List(
    ModelConstants.MERGES,
    ModelConstants.IS_MERGED_IN,
    ModelConstants.HAS_AMONG_TOP_N_SIMILAR_DOCS,
    ModelConstants.IS_AMONG_TOP_N_SIMILAR_DOCS
  )

  if (excludedRelClasses.exists(_.equalsIgnoreCase(r.getRelClass))) {
    false
  } else {
    val sources = r.getCollectedfrom
    if (sources == null || sources.size() == 0) {
      // no provenance at all
      false
    } else if (sources.size() > 1) {
      // more than one provider: kept regardless of who the providers are
      true
    } else {
      // exactly one provider: keep it unless that provider is OpenCitations
      val only = sources.get(0)
      !(only != null && "OpenCitations".equalsIgnoreCase(only.getValue))
    }
  }
}
/*
//TODO: finalise implementation
private def processResult[T<: Result](
implicit ct: ClassTag[T],
log: Logger,
spark: SparkSession,
sourcePath: String,
entityPath: String,
clazz: Class[T]
): Unit = {
val entityType = clazz.getSimpleName.toLowerCase
log.info(s"Converting $entityType")
val mapper = new ObjectMapper() with ScalaObjectMapper
mapper.registerModule(DefaultScalaModule)
val rdd = spark.sparkContext
.textFile(s"$sourcePath/$entityType")
.map(s => mapper.readValue(s, clazz))
.filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference);
implicit val encoder: Encoder[T] = Encoders.kryo(clazz)
spark
.createDataset(rdd)
.as[T]
.write
.mode(SaveMode.Overwrite)
.save(s"$entityPath/$entityType")
}
*/
}

View File

@ -82,10 +82,10 @@ public class CleanContextTest {
CleanContextSparkJob.main(new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--inputPath", workingDir.toString() + "/publication",
"-graphTableClassName", Publication.class.getCanonicalName(),
"-workingPath", workingDir.toString() + "/working",
"-contextId", "sobigdata",
"-verifyParam", "gCube "
"--graphTableClassName", Publication.class.getCanonicalName(),
"--workingDir", workingDir.toString() + "/working",
"--contextId", "sobigdata",
"--verifyParam", "gCube "
});
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

View File

@ -5,6 +5,7 @@ import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
@ -83,12 +84,12 @@ public class CleanCountryTest {
CleanCountrySparkJob.main(new String[] {
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--inputPath", workingDir.toString() + "/publication",
"-graphTableClassName", Publication.class.getCanonicalName(),
"-workingPath", workingDir.toString() + "/working",
"-country", "NL",
"-verifyParam", "10.17632",
"-collectedfrom", "NARCIS",
"-hostedBy", getClass()
"--graphTableClassName", Publication.class.getCanonicalName(),
"--workingDir", workingDir.toString() + "/working",
"--country", "NL",
"--verifyParam", "10.17632",
"--collectedfrom", "NARCIS",
"--hostedBy", getClass()
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")
.getPath()
});
@ -147,4 +148,44 @@ public class CleanCountryTest {
.size());
}
/**
 * Runs {@link CleanCountrySparkJob} on a single Dataset record and checks that the
 * record is still present afterwards and that its country list has been emptied.
 */
@Test
public void testDatasetClean() throws Exception {
	final String sourcePath = getClass()
		.getResource("/eu/dnetlib/dhp/oa/graph/clean/dataset_clean_country.json")
		.getPath();

	// materialise the fixture as the job input under <workingDir>/dataset
	spark
		.read()
		.textFile(sourcePath)
		.map(
			(MapFunction<String, Dataset>) r -> OBJECT_MAPPER.readValue(r, Dataset.class),
			Encoders.bean(Dataset.class))
		.write()
		.json(workingDir.toString() + "/dataset");

	// long option names use the "--" prefix, consistently with the other tests of this class
	CleanCountrySparkJob.main(new String[] {
		"--isSparkSessionManaged", Boolean.FALSE.toString(),
		"--inputPath", workingDir.toString() + "/dataset",
		"--graphTableClassName", Dataset.class.getCanonicalName(),
		"--workingDir", workingDir.toString() + "/working",
		"--country", "NL",
		"--verifyParam", "10.17632",
		"--collectedfrom", "NARCIS",
		"--hostedBy", getClass()
			.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")
			.getPath()
	});

	final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
	JavaRDD<Dataset> tmp = sc
		.textFile(workingDir.toString() + "/dataset")
		.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));

	Assertions.assertEquals(1, tmp.count());
	Assertions.assertEquals(0, tmp.first().getCountry().size());
}
}

View File

@ -7,6 +7,7 @@ import static org.mockito.Mockito.lenient;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
@ -278,10 +279,25 @@ public class GraphCleaningFunctionsTest {
s -> "0102 computer and information sciences".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
verify_keyword(p_cleaned, "In Situ Hybridization");
verify_keyword(p_cleaned, "Avicennia");
// TODO add more assertions to verity the cleaned values
System.out.println(MAPPER.writeValueAsString(p_cleaned));
}
/**
 * Asserts that the cleaned publication contains a subject with the given value and that
 * its qualifier carries the keyword class, both as classid and as classname.
 *
 * @param p_cleaned the publication after cleaning
 * @param subject the exact subject value to look for
 */
private static void verify_keyword(Publication p_cleaned, String subject) {
	final Optional<Subject> match = p_cleaned
		.getSubject()
		.stream()
		.filter(s -> s.getValue().equals(subject))
		.findFirst();

	assertTrue(match.isPresent());

	final Qualifier qualifier = match.get().getQualifier();
	assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, qualifier.getClassid());
	assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, qualifier.getClassname());
}
private Stream<Qualifier> getAuthorPidTypes(Result pub) {
return pub
.getAuthor()

View File

@ -0,0 +1,213 @@
package eu.dnetlib.dhp.oa.graph.clean.cfhb;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Publication;
/**
 * Checks that {@link CleanCfHbSparkJob} rewrites the collectedfrom / hostedby references of
 * Dataset records: references to a duplicate datasource listed in the master-duplicate file
 * must be replaced by the master id and name, while other references must stay untouched.
 */
public class CleanCfHbSparkJobTest {

	private static final Logger log = LoggerFactory.getLogger(CleanCfHbSparkJobTest.class);

	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

	private static SparkSession spark;

	private static Path testBaseTmpPath;

	private static String resolvedPath;

	private static String graphInputPath;

	private static String graphOutputPath;

	private static String dsMasterDuplicatePath;

	/**
	 * Copies the test entities and the master-duplicate file into a fresh temp directory
	 * and starts a local Spark session.
	 */
	@BeforeAll
	public static void beforeAll() throws IOException, URISyntaxException {

		testBaseTmpPath = Files.createTempDirectory(CleanCfHbSparkJobTest.class.getSimpleName());
		log.info("using test base path {}", testBaseTmpPath);

		final File entitiesSources = Paths
			.get(CleanCfHbSparkJobTest.class.getResource("/eu/dnetlib/dhp/oa/graph/clean/cfhb/entities").toURI())
			.toFile();

		FileUtils
			.copyDirectory(
				entitiesSources,
				testBaseTmpPath.resolve("input").resolve("entities").toFile());

		FileUtils
			.copyFileToDirectory(
				Paths
					.get(
						CleanCfHbSparkJobTest.class
							.getResource("/eu/dnetlib/dhp/oa/graph/clean/cfhb/masterduplicate.json")
							.toURI())
					.toFile(),
				testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toFile());

		graphInputPath = testBaseTmpPath.resolve("input").resolve("entities").toString();
		resolvedPath = testBaseTmpPath.resolve("workingDir").resolve("cfHbResolved").toString();
		graphOutputPath = testBaseTmpPath.resolve("workingDir").resolve("cfHbPatched").toString();
		dsMasterDuplicatePath = testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toString();

		SparkConf conf = new SparkConf();
		conf.setAppName(CleanCfHbSparkJobTest.class.getSimpleName());

		conf.setMaster("local[*]");
		conf.set("spark.driver.host", "localhost");
		conf.set("spark.ui.enabled", "false");

		spark = SparkSession
			.builder()
			.appName(CleanCfHbSparkJobTest.class.getSimpleName())
			.config(conf)
			.getOrCreate();
	}

	/**
	 * Stops Spark first, then removes the temp directory; the directory is removed even when
	 * stopping the session fails, and the session is always stopped before files are deleted.
	 */
	@AfterAll
	public static void afterAll() throws IOException {
		try {
			spark.stop();
		} finally {
			FileUtils.deleteDirectory(testBaseTmpPath.toFile());
		}
	}

	@Test
	void testCleanCfHbSparkJob() throws Exception {

		final String outputPath = graphOutputPath + "/dataset";
		final String inputPath = graphInputPath + "/dataset";

		// --- state of the input records, before the job runs ---
		org.apache.spark.sql.Dataset<Dataset> records = read(spark, inputPath, Dataset.class);

		Dataset d = findById(records, "50|doi_________::09821844208a5cd6300b2bfb13bca1b9");
		assertCollectedFrom(
			d, "10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", "Bacterial Protein Interaction Database - DUP");

		d = findById(records, "50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a");
		assertCollectedFrom(d, "10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", "FILUR DATA - DUP");
		assertHostedBy(d, "10|re3data_____::6ffd7bc058f762912dc494cd9c175341", "depositar - DUP");

		d = findById(records, "50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c");
		assertCollectedFrom(
			d, "10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", "DANS (Data Archiving and Networked Services)");
		assertHostedBy(
			d, "10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", "DANS (Data Archiving and Networked Services)");

		CleanCfHbSparkJob
			.main(
				new String[] {
					"--isSparkSessionManaged", Boolean.FALSE.toString(),
					"--inputPath", inputPath,
					"--outputPath", outputPath,
					"--resolvedPath", resolvedPath + "/dataset",
					"--graphTableClassName", Dataset.class.getCanonicalName(),
					"--masterDuplicatePath", dsMasterDuplicatePath
				});

		// --- state of the output records, after the job ran ---
		assertTrue(Files.exists(Paths.get(graphOutputPath, "dataset")));

		records = read(spark, outputPath, Dataset.class);

		assertEquals(3, records.count());

		// duplicate collectedfrom replaced by its master
		d = findById(records, "50|doi_________::09821844208a5cd6300b2bfb13bca1b9");
		assertCollectedFrom(
			d, "10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", "Bacterial Protein Interaction Database");

		// both duplicate collectedfrom and duplicate hostedby replaced by their masters
		d = findById(records, "50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a");
		assertCollectedFrom(d, "10|re3data_____::fc1db64b3964826913b1e9eafe830490", "FULIR Data");
		assertHostedBy(d, "10|fairsharing_::3f647cadf56541fb9513cb63ec370187", "depositar");

		// record not referring to any duplicate: left as it was
		d = findById(records, "50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c");
		assertCollectedFrom(
			d, "10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", "DANS (Data Archiving and Networked Services)");
		assertHostedBy(
			d, "10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", "DANS (Data Archiving and Networked Services)");
	}

	/** Returns the first record having the given identifier. */
	private static Dataset findById(org.apache.spark.sql.Dataset<Dataset> records, String id) {
		return records
			.filter("id = '" + id + "'")
			.first();
	}

	/** Asserts key and name of the first collectedfrom, both at result level and on the first instance. */
	private static void assertCollectedFrom(Dataset d, String key, String name) {
		assertEquals(key, d.getCollectedfrom().get(0).getKey());
		assertEquals(name, d.getCollectedfrom().get(0).getValue());
		assertEquals(key, d.getInstance().get(0).getCollectedfrom().getKey());
		assertEquals(name, d.getInstance().get(0).getCollectedfrom().getValue());
	}

	/** Asserts key and name of the hostedby of the first instance. */
	private static void assertHostedBy(Dataset d, String key, String name) {
		assertEquals(key, d.getInstance().get(0).getHostedby().getKey());
		assertEquals(name, d.getInstance().get(0).getHostedby().getValue());
	}

	/** Reads a text file of JSON lines into a typed Spark dataset of beans of the given class. */
	private <R> org.apache.spark.sql.Dataset<R> read(SparkSession spark, String path, Class<R> clazz) {
		return spark
			.read()
			.textFile(path)
			.map(as(clazz), Encoders.bean(clazz));
	}

	/** Builds the function deserialising one JSON line into an instance of the given class. */
	private static <R> MapFunction<String, R> as(Class<R> clazz) {
		return s -> OBJECT_MAPPER.readValue(s, clazz);
	}
}

View File

@ -26,6 +26,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -238,7 +239,11 @@ class MappersTest {
assertNotNull(i.getAccessright());
assertEquals("OPEN", i.getAccessright().getClassid());
});
assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid());
Publication p_cleaned = cleanup(p, vocs);
assertEquals("0000", p_cleaned.getInstance().get(0).getRefereed().getClassid());
assertEquals("Unknown", p_cleaned.getInstance().get(0).getRefereed().getClassname());
assertNotNull(p.getInstance().get(0).getPid());
assertEquals(2, p.getInstance().get(0).getPid().size());
@ -453,7 +458,10 @@ class MappersTest {
assertNotNull(i.getAccessright());
assertEquals("OPEN", i.getAccessright().getClassid());
});
assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid());
Publication p_cleaned = cleanup(p, vocs);
assertEquals("0000", p_cleaned.getInstance().get(0).getRefereed().getClassid());
assertEquals("Unknown", p_cleaned.getInstance().get(0).getRefereed().getClassname());
}
@Test
@ -570,7 +578,9 @@ class MappersTest {
assertTrue(i.getUrl().contains("http://apps.who.int/trialsearch/Trial3.aspx?trialid=NCT02321059"));
assertTrue(i.getUrl().contains("https://clinicaltrials.gov/ct2/show/NCT02321059"));
assertEquals("UNKNOWN", i.getRefereed().getClassid());
Dataset d_cleaned = cleanup(d, vocs);
assertEquals("0000", d_cleaned.getInstance().get(0).getRefereed().getClassid());
assertEquals("Unknown", d_cleaned.getInstance().get(0).getRefereed().getClassname());
}
@Test
@ -871,7 +881,10 @@ class MappersTest {
assertNotNull(i.getAccessright());
assertEquals("UNKNOWN", i.getAccessright().getClassid());
});
assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid());
Dataset p_cleaned = cleanup(p, vocs);
assertEquals("0000", p_cleaned.getInstance().get(0).getRefereed().getClassid());
assertEquals("Unknown", p_cleaned.getInstance().get(0).getRefereed().getClassname());
}
@Test
@ -947,6 +960,11 @@ class MappersTest {
Instance inst = p.getInstance().get(0);
assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", inst.getPid().get(0).getValue());
assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", inst.getUrl().get(0));
assertEquals(1, p.getEoscifguidelines().size());
assertEquals("EOSC::RO-crate", p.getEoscifguidelines().get(0).getCode());
assertEquals("EOSC::RO-crate", p.getEoscifguidelines().get(0).getLabel());
assertEquals("", p.getEoscifguidelines().get(0).getUrl());
assertEquals("compliesWith", p.getEoscifguidelines().get(0).getSemanticRelation());
}
@ -995,6 +1013,18 @@ class MappersTest {
}
// Maps the "photic-zone-transformed.xml" test resource with OdfToOafMapper and checks that
// the first produced entity, cast to OtherResearchProduct, has a non-null eoscifguidelines list.
// NOTE(review): only non-nullness is asserted; the content of the list is not verified here.
@Test
void testEOSCFuture_ROHub() throws IOException {
final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("photic-zone-transformed.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
final OtherResearchProduct rocrate = (OtherResearchProduct) list.get(0);
assertNotNull(rocrate.getEoscifguidelines());
// dump the mapped record as JSON on stdout for manual inspection
System.out.println("***************");
System.out.println(new ObjectMapper().writeValueAsString(rocrate));
System.out.println("***************");
}
@Test
void testNotWellFormed() throws IOException {
final String xml = IOUtils

View File

@ -0,0 +1,4 @@
{ "duplicateId" : "10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", "masterId" : "10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", "masterName" : "Bacterial Protein Interaction Database" }
{ "duplicateId" : "10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", "masterId" : "10|re3data_____::fc1db64b3964826913b1e9eafe830490", "masterName" : "FULIR Data" }
{ "duplicateId" : "10|re3data_____::6ffd7bc058f762912dc494cd9c175341", "masterId" : "10|fairsharing_::3f647cadf56541fb9513cb63ec370187", "masterName" : "depositar" }
{ "duplicateId" : "10|scindeksserb::07022f78a8cc6d1171092454ecdbb47c", "masterId" : "10|doajarticles::07022f78a8cc6d1171092454ecdbb47c", "masterName" : "Artefact" }

File diff suppressed because one or more lines are too long

View File

@ -706,6 +706,28 @@
"source": [
],
"subject": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:repository",
"classname": "sysimport:crosswalk:repository",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": false,
"trust": "0.9"
},
"qualifier": {
"classid": "FOS",
"classname": "Fields of Science and Technology classification",
"schemeid": "dnet:result_subject",
"schemename": "dnet:result_subject"
},
"value": "In Situ Hybridization"
},
{
"dataInfo": {
"deletedbyinference": false,
@ -885,24 +907,23 @@
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"classid": "sysimport:actionset",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
"classid": "FOS",
"classname": "Fields of Science and Technology classification",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"value": "doped silicon"
"value": "Avicennia"
},
{
"dataInfo": {

View File

@ -0,0 +1,108 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<header xmlns="http://www.openarchives.org/OAI/2.0/">
<dri:objIdentifier>fsh_____4119::68126da991bd76d8be494bddfbf7a1bb</dri:objIdentifier>
<dri:recordIdentifier>https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be</dri:recordIdentifier>
<dri:dateOfCollection/>
<dri:mdFormat/>
<dri:mdFormatInterpretation/>
<dri:repositoryId/>
<dr:objectIdentifier/>
<dr:dateOfCollection>2022-11-15T12:29:19Z</dr:dateOfCollection>
<dr:dateOfTransformation>2022-11-15T12:29:19Z</dr:dateOfTransformation>
<oaf:datasourceprefix>fsh_____4119</oaf:datasourceprefix>
<identifier>https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be</identifier>
<datestamp>2022-11-15T12:29:19Z</datestamp>
<setSpec>rohub_data</setSpec>
<setSpec>ro-crate_data</setSpec>
</header>
<metadata>
<datacite:resource>
<datacite:identifier identifierType="w3id">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be</datacite:identifier>
<datacite:alternateIdentifiers>
<datacite:alternateIdentifier alternateIdentifierType="w3id">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be</datacite:alternateIdentifier>
</datacite:alternateIdentifiers>
<datacite:relatedIdentifiers>
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/b1b617b2-6b79-4bae-9fa6-b76945645626</datacite:relatedIdentifier>
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/78103994-30be-4875-bf89-5acd752b5c3d</datacite:relatedIdentifier>
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/18fd1c70-249b-4c67-80ee-539f801a0da7</datacite:relatedIdentifier>
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/32faa2eb-4cc8-401f-ac5c-bec2849b70e1</datacite:relatedIdentifier>
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/4c253f5a-d427-40c2-9e9f-6063ae087239</datacite:relatedIdentifier>
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/371b1957-078c-472b-a195-af7bce152c10</datacite:relatedIdentifier>
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/82f9e4b8-01b4-4e50-9e27-ec9d337c8d74</datacite:relatedIdentifier>
</datacite:relatedIdentifiers>
<datacite:resourceType xs:anyURI="http://purl.org/coar/resource_type/c_1843">RO-crate</datacite:resourceType>
<datacite:rightsList>
<datacite:rights rightsURI="https://creativecommons.org/licenses/by/4.0/legalcode">Creative Commons Attribution 4.0 International</datacite:rights>
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights>
</datacite:rightsList>
<datacite:titles>
<datacite:title>Mapping the photic zone of the Mediterranean Sea</datacite:title>
</datacite:titles>
<datacite:descriptions>
<datacite:description descriptionType="Abstract">Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea</datacite:description>
</datacite:descriptions>
<datacite:publisher>CNR-ISMAR</datacite:publisher>
<creators xmlns="http://datacite.org/schema/kernel-4">
<creator>
<creatorName>Giorgio Castellan</creatorName>
</creator>
<creator>
<creatorName>Lorenzo Angeletti</creatorName>
</creator>
<creator>
<creatorName>Paolo Montagna</creatorName>
</creator>
<creator>
<creatorName>Marco Taviani</creatorName>
</creator>
</creators>
<dates xmlns="http://datacite.org/schema/kernel-4">
<date dateType="Issued">2022-11-14T16:32:45Z</date>
</dates>
<dc:descriptions>
<dc:description descriptionType="Abstract">Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea</dc:description>
</dc:descriptions>
<dc:publicationYear>2022</dc:publicationYear>
<rightsList xmlns="http://datacite.org/schema/kernel-4">
<rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</rights>
</rightsList>
<sizes xmlns="http://datacite.org/schema/kernel-4">
<size>813.478 KB</size>
</sizes>
<subjects xmlns="http://datacite.org/schema/kernel-4">
<subject>Earth sciences</subject>
<subject>Ecology</subject>
<subject>Optics</subject>
</subjects>
</datacite:resource>
<oaf:identifier identifierType="w3id">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be</oaf:identifier>
<dr:CobjCategory type="other">0048</dr:CobjCategory>
<oaf:dateAccepted>2022-11-14</oaf:dateAccepted>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:license>https://creativecommons.org/licenses/by/4.0/legalcode</oaf:license>
<oaf:language/>
<oaf:hostedBy name="ROHub" id="fairsharing_::4119"/>
<oaf:collectedFrom name="ROHub" id="fairsharing_::4119"/>
<oaf:eoscifguidelines code="EOSC::RO-crate"
label="EOSC::RO-crate"
url=""
semanticrelation="compliesWith"/>
<oaf:eoscifguidelines code="EOSC::Jupyter Notebook"
label="EOSC::Jupyter Notebook"
url=""
semanticrelation="compliesWith"/>
<oaf:eoscifguidelines code="EOSC::Data Cube"
label="EOSC::Data Cube"
url=""
semanticrelation="compliesWith"/>
</metadata>
</record>

View File

@ -65,7 +65,6 @@
</sizes>
<subjects xmlns="http://datacite.org/schema/kernel-4">
<subject>Ecology</subject>
<subject>EOSC::RO-crate</subject>
</subjects>
</datacite:resource>
<oaf:identifier identifierType="w3id">https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca</oaf:identifier>
@ -75,5 +74,9 @@
<oaf:language/>
<oaf:hostedBy name="ROHub" id="fairsharing_::4119"/>
<oaf:collectedFrom name="ROHub" id="fairsharing_::4119"/>
<oaf:eoscifguidelines code="EOSC::RO-crate"
label="EOSC::RO-crate"
url=""
semanticrelation="compliesWith"/>
</metadata>
</record>

View File

@ -53,7 +53,7 @@ class ResolveEntitiesTest extends Serializable {
def generateUpdates(spark: SparkSession): Unit = {
val template = Source.fromInputStream(this.getClass.getResourceAsStream("updates")).mkString
val pids: List[String] = template.lines
val pids: List[String] = template.linesWithSeparators.map(l =>l.stripLineEnd)
.map { id =>
val r = new Result
r.setId(id.toLowerCase.trim)
@ -127,7 +127,7 @@ class ResolveEntitiesTest extends Serializable {
entities.foreach { e =>
val template = Source.fromInputStream(this.getClass.getResourceAsStream(s"$e")).mkString
spark
.createDataset(spark.sparkContext.parallelize(template.lines.toList))
.createDataset(spark.sparkContext.parallelize(template.linesWithSeparators.map(l =>l.stripLineEnd).toList))
.as[String]
.write
.option("compression", "gzip")
@ -264,7 +264,7 @@ class ResolveEntitiesTest extends Serializable {
Source
.fromInputStream(this.getClass.getResourceAsStream(s"publication"))
.mkString
.lines
.linesWithSeparators.map(l =>l.stripLineEnd)
.next(),
classOf[Publication]
)

View File

@ -47,7 +47,7 @@ class ScholixGraphTest extends AbstractVocabularyTest {
val inputRelations = Source
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary"))
.mkString
val items = inputRelations.lines.toList
val items = inputRelations.linesWithSeparators.map(l =>l.stripLineEnd).toList
assertNotNull(items)
items.foreach(i => assertTrue(i.nonEmpty))
val result =
@ -69,7 +69,7 @@ class ScholixGraphTest extends AbstractVocabularyTest {
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/merge_result_scholix")
)
.mkString
val result: List[(Relation, ScholixSummary)] = inputRelations.lines
val result: List[(Relation, ScholixSummary)] = inputRelations.linesWithSeparators.map(l =>l.stripLineEnd)
.sliding(2)
.map(s => (s.head, s(1)))
.map(p => (mapper.readValue(p._1, classOf[Relation]), mapper.readValue(p._2, classOf[ScholixSummary])))

View File

@ -0,0 +1,91 @@
package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.io.IOException;
import java.io.StringReader;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import org.apache.commons.io.IOUtils;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrInputDocument;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory;
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
/**
 * Builds the XML record of an EOSC Future (ROHub) research product from its JSON
 * serialization and runs it through the layout transformation and the Solr document factory.
 */
public class EOSCFuture_Test {

	/** Lenient mapper: unknown JSON properties in the fixtures are ignored. */
	public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
		.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

	public static final String VERSION = "2021-04-15T10:05:53Z";
	public static final String DSID = "b9ee796a-c49f-4473-a708-e7d67b84c16d_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl";

	private ContextMapper contextMapper;

	@BeforeEach
	public void setUp() {
		contextMapper = new ContextMapper();
	}

	@Test
	public void testEOSC_ROHub() throws IOException, DocumentException, TransformerException {

		// uses the ContextMapper prepared in setUp() instead of shadowing it with a local one
		final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
			XmlConverterJob.schemaLocation);

		final OtherResearchProduct p = OBJECT_MAPPER
			.readValue(
				IOUtils.toString(getClass().getResourceAsStream("eosc-future/photic-zone.json")),
				OtherResearchProduct.class);

		final String xml = xmlRecordFactory.build(new JoinedEntity<>(p));

		assertNotNull(xml);

		// the generated record must parse as XML
		final Document doc = new SAXReader().read(new StringReader(xml));

		assertNotNull(doc);
		System.out.println(doc.asXML());

		testRecordTransformation(xml);
	}

	/**
	 * Applies the layout transformer to the given XML record, feeds the resulting index
	 * record to the Solr input document factory and checks that its XML form is not null.
	 */
	private void testRecordTransformation(final String record) throws IOException, TransformerException {
		final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml"));
		final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl"));

		final String transformer = XmlIndexingJob.getLayoutTransformer("DMF", fields, xslt);

		final Transformer tr = SaxonTransformerFactory.newInstance(transformer);

		final String indexRecordXML = XmlIndexingJob.toIndexRecord(tr, record);

		final SolrInputDocument solrDoc = new StreamingInputDocumentFactory(VERSION, DSID)
			.parseDocument(indexRecordXML);

		final String xmlDoc = ClientUtils.toXML(solrDoc);

		Assertions.assertNotNull(xmlDoc);
		System.out.println(xmlDoc);
	}
}

View File

@ -128,6 +128,41 @@ public class IndexRecordTransformerTest {
testRecordTransformation(record);
}
/**
 * Loads the {@code eosc-future/software-justthink.xml} fixture and passes it to
 * {@code testRecordTransformation}.
 */
@Test
public void testForEOSCFutureSoftwareNotebook() throws IOException, TransformerException {
	// Decode explicitly as UTF-8; the single-argument IOUtils.toString relies on the platform default charset.
	final String record = IOUtils
		.toString(
			getClass().getResourceAsStream("eosc-future/software-justthink.xml"),
			java.nio.charset.StandardCharsets.UTF_8);
	testRecordTransformation(record);
}
/**
 * Loads the {@code eosc-future/software-justthink-claim.xml} fixture and passes it to
 * {@code testRecordTransformation}.
 */
@Test
public void testForEOSCFutureSoftwareNotebookClaim() throws IOException, TransformerException {
	// Decode explicitly as UTF-8; the single-argument IOUtils.toString relies on the platform default charset.
	final String record = IOUtils
		.toString(
			getClass().getResourceAsStream("eosc-future/software-justthink-claim.xml"),
			java.nio.charset.StandardCharsets.UTF_8);
	testRecordTransformation(record);
}
/**
 * Loads the {@code eosc-future/zenodo7353841.xml} fixture and passes it to {@code testRecordTransformation}.
 */
@Test
public void testForEOSCFutureZenodo7353841() throws IOException, TransformerException {
	// Decode explicitly as UTF-8; the single-argument IOUtils.toString relies on the platform default charset.
	final String record = IOUtils
		.toString(
			getClass().getResourceAsStream("eosc-future/zenodo7353841.xml"),
			java.nio.charset.StandardCharsets.UTF_8);
	testRecordTransformation(record);
}
/**
 * Loads the {@code eosc-future/zenodo7351393.xml} fixture and passes it to {@code testRecordTransformation}.
 */
@Test
public void testForEOSCFutureZenodo7351393() throws IOException, TransformerException {
	// Decode explicitly as UTF-8; the single-argument IOUtils.toString relies on the platform default charset.
	final String record = IOUtils
		.toString(
			getClass().getResourceAsStream("eosc-future/zenodo7351393.xml"),
			java.nio.charset.StandardCharsets.UTF_8);
	testRecordTransformation(record);
}
/**
 * Loads the {@code eosc-future/zenodo7351221.xml} fixture and passes it to {@code testRecordTransformation}.
 */
@Test
public void testForEOSCFutureZenodo7351221() throws IOException, TransformerException {
	// Decode explicitly as UTF-8; the single-argument IOUtils.toString relies on the platform default charset.
	final String record = IOUtils
		.toString(
			getClass().getResourceAsStream("eosc-future/zenodo7351221.xml"),
			java.nio.charset.StandardCharsets.UTF_8);
	testRecordTransformation(record);
}
@Test
void testDoiUrlNormalization() throws MalformedURLException {

View File

@ -0,0 +1,305 @@
<record>
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri">
<header xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<dri:objIdentifier>od______2659::3801993ea8f970cfc991277160edf277</dri:objIdentifier>
<dri:dateOfCollection>2022-08-08T03:06:13Z</dri:dateOfCollection>
<status>under curation</status>
<counters/>
</header>
<metadata>
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://namespace.openaire.eu/oaf https://www.openaire.eu/schema/1.0/oaf-1.0.xsd">
<oaf:result>
<title classid="main title" classname="main title"
schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">JUSThink
Alignment Analysis</title>
<creator rank="1" name="" surname="">Norman, Utku</creator>
<creator rank="2" name="" surname="">Dinkar, Tanvi</creator>
<creator rank="3" name="" surname="">Bruno, Barbara</creator>
<creator rank="4" name="" surname="">Clavel, Chloé</creator>
<dateofacceptance/>
<resulttype classid="software" classname="software"
schemeid="dnet:result_typologies" schemename="dnet:result_typologies"/>
<language classid="eng" classname="English" schemeid="dnet:languages"
schemename="dnet:languages"/>
<description>
<p>
<strong>1. Description</strong>
</p>
<p>This repository contains<strong> tools to automatically analyse how
participants align their use of task-specific referents in their
dialogue and actions for a collaborative learning activity, and how
it relates to the task success</strong> (i.e. their learning
outcomes and task performance).</p>
<p>As a use case, it processes data from a collaborative problem solving
activity named JUSThink <a
href="https://zenodo.org/record/4675070#references">[1, 2]</a>, i.e.
JUSThink Dialogue and Actions Corpus data set that is available from the
Zenodo Repository, DOI: <a href="http://doi.org/10.5281/zenodo.4627104"
>10.5281/zenodo.4627104</a>, and reproduces the results and figures
in <a href="https://zenodo.org/record/4675070#references">[3]</a>.</p>
<p>In brief: </p>
<ol>
<li><strong>JUSThink Dialogue and Actions Corpus</strong> contains
transcripts, event logs, and test responses of children aged 9
through 12, as they participate in the JUSThink activity <a
href="https://zenodo.org/record/4675070#references">[1, 2]</a>
in pairs of two, to solve a problem on graphs together. </li>
<li><strong>The JUSThink activity and its study</strong> is first
described in <a href="https://zenodo.org/record/4675070#references"
>[1]</a>, and elaborated with findings concerning the link
between children&#39;s learning, performance in the activity, and
perception of self, the other and the robot in <a
href="https://zenodo.org/record/4675070#references">[2]</a>. </li>
<li><strong>Alignment analysis in our work <a
href="https://zenodo.org/record/4675070#references"
>[3]</a></strong> studies the participants&#39; use of
expressions that are related to the task at hand, their follow up
actions of these expressions, and how it links to task success.</li>
</ol>
<p>
<strong>2. Publications</strong>
</p>
<p>If you use this work in an academic context, please cite the following
publications:</p>
<ul>
<li>
<p>Norman*, U., Dinkar*, T., Bruno, B., &amp; Clavel, C. (2022).
Studying Alignment in a Collaborative Learning Activity via
Automatic Methods: The Link Between What We Say and Do. Dialogue
&amp; Discourse, 13(2), 1 - ;48. *Contributed equally to this
work. <a href="https://doi.org/10.5210/dad.2022.201"
>https://doi.org/10.5210/dad.2022.201</a></p>
</li>
<li>
<p>Norman, U., Dinkar, T., Bruno, B., &amp; Clavel, C. (2021).
JUSThink Alignment Analysis. In Dialogue &amp; Discourse
(v1.0.0, Vol. 13, Number 2, pp. 1 - ;48). Zenodo. <a
href="https://doi.org/10.5281/zenodo.4675070"
>https://doi.org/10.5281/zenodo.4675070</a></p>
</li>
</ul>
<p>
<strong>3. Content</strong>
</p>
<p>The tools provided in this repository consists of 7 Jupyter Notebooks
written in Python 3, and two additional external tools utilised by the
notebooks.</p>
<p>
<strong>3.1. Jupyter Notebooks</strong>
</p>
<p>We highlight that the notebooks up until the last (i.e. to test the
hypotheses (tools/7_test_the_hypotheses.ipynb)) present a general
pipeline to process event logs, test responses and transcripts to
extract measures of task performance, learning outcomes, and measures of
alignment.</p>
<ol>
<li><strong>Extract task performance (and other features) from the logs
</strong>(tools/1_extract_performance_and_other_features_from_logs.ipynb):
Extracts various measures of task behaviour from the logs, at
varying granularities of the activity (i.e. the whole corpus, task,
attempt, and turn levels). In later notebooks, we focus on one of
the features to estimate the task performance of a team: (minimum)
error.</li>
<li><strong>Extract learning outcomes from the test responses</strong>
(tools/2_extract_learning_gain_from_test_responses.ipynb): Extracts
measures of learning outcomes from the responses to the pre-test and
the post-test. In later notebooks, we focus on one of the features
to estimate the learning outcome of a team: relative learning gain
<a href="https://sandbox.zenodo.org/record/742549#references"
>[4]</a></li>
<li><strong>Select and visualise a subset of teams for
transcription</strong>
(tools/3_visualise_transcribed_teams.ipynb): Visualises the
transcribed teams among the other teams in the feature space spanned
by task performance and learning outcome, as well as the
distribution of their number of attempts and turns.</li>
<li><strong>Extract routines from transcripts</strong>
(tools/4_extract_routines_from_transcripts.ipynb) (uses <a
href="https://github.com/GuillaumeDD/dialign">dialign</a> to
extract routines): Extracts routines of referring expressions that
are &quot;fixed&quot;, i.e. become shared or established amongst
interlocutors.</li>
<li><strong>Combine transcripts with logs</strong>
(tools/5_construct_the_corpus_by_combining_transcripts_with_logs.ipynb):
Merges transcripts with event logs to have a combined dialogue and
actions corpus, to be processed e.g. to detect follow-up
actions.</li>
<li><strong>Recognise instructions and detect follow-up actions</strong>
(tools/6_recognise_instructions_detect_follow-up_actions.ipynb):
Extracts verbalised instruction such as &quot;connect Mount Basel to
Montreux&quot;, and pairs them with the follow-up action that may
<em>match</em> (e.g. if the other connects Basel to Montreux) or
<em>mismatch</em> (e.g. if the other connects Basel to
Neuchatel) with the instruction.</li>
<li><strong>Test the hypotheses </strong>in <a
href="https://sandbox.zenodo.org/record/742549#references"
>[3]</a> (tools/7_test_the_hypotheses.ipynb) (uses
<strong>effsize</strong> to estimate effect size, specifically
Cliff&#39;s Delta): Considers each research questions and hypotheses
studied in <a
href="https://sandbox.zenodo.org/record/742549#references"
>[3]</a> and generates the results in <a
href="https://sandbox.zenodo.org/record/742549#references"
>[3]</a>.</li>
</ol>
<p>
<strong>3.2. External Tools</strong>
</p>
<ol>
<li><strong><a href="https://github.com/GuillaumeDD/dialign">dialign</a>
tool</strong> to extract routines, specifically <a
href="https://github.com/GuillaumeDD/dialign/releases/tag/v1.0"
>Release 1.0</a> from <a
href="https://github.com/GuillaumeDD/dialign/releases/download/v1.0/dialign-1.0.zip"
>dialign-1.0.zip</a>:\n It extracts routine expressions that are
&quot;shared&quot; among the participants from transcripts. \n It is
used as an external module (in accordance with its CeCILL-B License,
see <strong>License</strong>).</li>
<li><strong>effsize tool</strong> to compute estimators of effect
size.\n We specifically use it to compute Cliff&#39;s Delta, which
quantifies the amount difference between two groups of observations,
by computing the Cliff&#39;s Delta statistic.\n It is taken from
project <a
href="https://acclab.github.io/DABEST-python-docs/index.html"
>DABEST</a> (see <strong>License</strong>).</li>
</ol>
<p>
<strong>4. Research Questions and Hypotheses in <a
href="https://sandbox.zenodo.org/record/742549#references"
>[3]</a></strong>
</p>
<ul>
<li><strong>RQ1 Lexical alignment</strong>: How do the interlocutors
<em>use</em> expressions related to the task? Is this associated
with task success? <ul>
<li><strong>H1.1</strong>: Task-specific referents become
routine early for more successful teams.</li>
<li><strong>H1.2</strong>: Hesitation phenomena are more likely
to occur in the vicinity of priming and establishment of
task-specific referents for more successful teams.</li>
</ul>
</li>
<li><strong>RQ2 Behavioural alignment</strong>: How do the interlocutors
<em>follow up</em> these expressions with actions? Is this
associated with task success? <ul>
<li><strong>H2.1</strong>: Instructions are more likely to be
followed by a corresponding action early in the dialogue for
more successful teams.</li>
<li><strong>H2.2</strong>: When instructions are followed by a
corresponding or a different action, the action is more
likely to be in the vicinity of information management
phenomena for more successful teams.</li>
</ul>
</li>
</ul>
<p>The RQs and Hs are addressed in the notebook for testing the hypotheses
(i.e. tools/7_test_the_hypotheses.ipynb).</p>
<p>
<strong>Acknowledgements</strong>
</p>
<p>This project has received funding from the European Union&#39;s Horizon
2020 research and innovation programme under grant agreement No 765955.
Namely, the <a href="https://www.animatas.eu/">ANIMATAS Project</a>.</p>
<p>
<strong>License</strong>
</p>
<p>The whole package is under MIT License, see the <strong>LICENSE</strong>
file.</p>
<p>Classes under the <strong>tools/effsize</strong> package were taken from
project <a href="https://acclab.github.io/DABEST-python-docs/index.html"
><strong>DABEST</strong></a>, Copyright 2016-2020 Joses W. Ho.
These classes are licensed under the BSD 3-Clause Clear License. See
<strong>tools/effsize/LICENSE</strong> file for additional
details.</p>
<p>Classes under the <strong>tools/dialign-1.0</strong> package were taken
from project <strong><a href="https://github.com/GuillaumeDD/dialign"
>dialign</a></strong>. These classes are licensed under the
CeCILL-B License. This package is used as an &quot;external
module&quot;, see<strong> tools/dialign-1.0/LICENSE.txt</strong> for
additional details.</p>
</description>
<country classid="" classname="" schemeid="" schemename=""/>
<subject classid="" classname="" schemeid="" schemename=""/>
<relevantdate classid="" classname="" schemeid="" schemename=""/>
<publisher>Zenodo</publisher>
<embargoenddate/>
<journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol=""/>
<source/>
<fulltext/>
<format/>
<storagedate/>
<resourcetype classid="" classname="" schemeid="" schemename=""/>
<device/>
<size/>
<version/>
<lastmetadataupdate/>
<metadataversionnumber/>
<documentationUrl/>
<codeRepositoryUrl/>
<programmingLanguage classid="" classname="" schemeid="" schemename=""/>
<contactperson/>
<contactgroup/>
<tool/>
<originalId>oai:zenodo.org:4675070</originalId>
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
<pid classid="oai" classname="Open Archives Initiative"
schemeid="dnet:pid_types" schemename="dnet:pid_types"
>oai:zenodo.org:4675070</pid>
<pid classid="doi" classname="Digital Object Identifier"
schemeid="dnet:pid_types" schemename="dnet:pid_types"
>10.5281/zenodo.4675070</pid>
<bestaccessright classid="OPEN" classname="Open Access"
schemeid="dnet:access_modes" schemename="dnet:access_modes"/>
<eoscifguidelines code="EOSC::Jupyter Notebook" label="EOSC::Jupyter Notebook"
url="" semanticrelation="compliesWith"/>
<datainfo>
<inferred>false</inferred>
<deletedbyinference>false</deletedbyinference>
<trust>0.9</trust>
<inferenceprovenance/>
<provenanceaction classid="user:insert" classname="user:insert"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</datainfo>
<rels>
<rel inferred="false" trust="0.9" inferenceprovenance=""
provenanceaction="user:claim">
<to class="isProducedBy" scheme="dnet:result_project_relations"
type="project">corda__h2020::c4515ebef538a734cf11f795347f5dac</to>
<code>765955</code>
<acronym>ANIMATAS</acronym>
<title>Advancing intuitive human-machine interaction with human-like
social capabilities for education in schools</title>
<contracttype classid="" classname="" schemeid="" schemename=""/>
<funding>
<funder id="ec__________::EC" shortname="EC"
name="European Commission" jurisdiction=""/>
<funding_level_0 name="H2020"
>ec__________::EC::H2020</funding_level_0>
</funding>
<websiteurl/>
</rel>
</rels>
<children>
<instance id="od______2659::3801993ea8f970cfc991277160edf277">
<instancetype classid="0029" classname="Software"
schemeid="dnet:publication_resource"
schemename="dnet:publication_resource"/>
<collectedfrom name="ZENODO"
id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
<hostedby name="ZENODO"
id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
<accessright classid="OPEN" classname="Open Access"
schemeid="dnet:access_modes" schemename="dnet:access_modes"/>
<dateofacceptance/>
<webresource>
<url>https://zenodo.org/record/4675070</url>
</webresource>
</instance>
</children>
</oaf:result>
</oaf:entity>
</metadata>
</result>
</record>

View File

@ -0,0 +1,429 @@
<record>
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri">
<header xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<dri:objIdentifier>doi_dedup___::c054151b6a8c4f41c7acf160651a6503</dri:objIdentifier>
<dri:dateOfCollection>2022-10-13T00:15:44+0000</dri:dateOfCollection>
<dri:dateOfTransformation>2022-10-13T07:44:29.152Z</dri:dateOfTransformation>
</header>
<metadata>
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://namespace.openaire.eu/oaf https://www.openaire.eu/schema/1.0/oaf-1.0.xsd">
<oaf:result>
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
<originalId>oai:zenodo.org:4675070</originalId>
<originalId>50|od______2659::3801993ea8f970cfc991277160edf277</originalId>
<originalId>oai:zenodo.org:6974562</originalId>
<originalId>50|od______2659::9c87ff4a5e7710052b873088e7265072</originalId>
<originalId>10.5281/zenodo.4675069</originalId>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
>10.5281/zenodo.4675070</pid>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
>10.5281/zenodo.6974562</pid>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.4675069</pid>
<measure id="influence" score="4.916186E-9" class="C5"/>
<measure id="popularity" score="6.885733E-9" class="C5"/>
<measure id="influence_alt" score="0" class="C5"/>
<measure id="popularity_alt" score="0.0" class="C5"/>
<measure id="impulse" score="0" class="C5"/>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
schemename="dnet:dataCite_title" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">JUSThink Alignment
Analysis</title>
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
schemename="dnet:access_modes"/>
<creator rank="1" name="Utku" surname="Norman" orcid_pending="0000-0002-6802-1444"
>Norman, Utku</creator>
<creator rank="2" name="Tanvi" surname="Dinkar">Dinkar, Tanvi</creator>
<creator rank="3" name="Barbara" surname="Bruno" orcid_pending="0000-0003-0953-7173"
>Bruno, Barbara</creator>
<creator rank="4" name="Chloé" surname="Clavel" orcid_pending="0000-0003-4850-3398"
>Clavel, Chloé</creator>
<dateofacceptance>2022-08-08</dateofacceptance>
<description>&amp;lt;strong>1. Description&amp;lt;/strong> This repository
contains&amp;lt;strong> tools to automatically analyse how participants align
their use of task-specific referents in their dialogue and actions for a
collaborative learning activity, and how it relates to the task
success&amp;lt;/strong> (i.e. their learning outcomes and task performance). As
a use case, it processes data from a collaborative problem solving activity
named JUSThink [1, 2], i.e. JUSThink Dialogue and Actions Corpus data set that
is available from the Zenodo Repository, DOI: 10.5281/zenodo.4627104, and
reproduces the results and figures in [3]. In brief: &amp;lt;strong>JUSThink
Dialogue and Actions Corpus&amp;lt;/strong> contains transcripts, event logs,
and test responses of children aged 9 through 12, as they participate in the
JUSThink activity [1, 2] in pairs of two, to solve a problem on graphs together.
&amp;lt;strong>The JUSThink activity and its study&amp;lt;/strong> is first
described in [1], and elaborated with findings concerning the link between
children's learning, performance in the activity, and perception of self, the
other and the robot in [2]. &amp;lt;strong>Alignment analysis in our work
[3]&amp;lt;/strong> studies the participants' use of expressions that are
related to the task at hand, their follow up actions of these expressions, and
how it links to task success. &amp;lt;strong>Changes in Release
v1.1.0:&amp;lt;/strong> updated with the publication information, finalized
paper structure, research questions and hypotheses as in the published article:
U. Norman*&amp;lt;em>, &amp;lt;/em>T. Dinkar*, B. Bruno, and C. Clavel,
"Studying Alignment in a Collaborative Learning Activity via Automatic Methods:
The Link Between What We Say and Do," Dialogue &amp;amp;amp; Discourse, 13(2),
148. *Contributed equally to this work. 10.5210/dad.2022.201.
&amp;lt;strong>Full Changelog:&amp;lt;/strong>
https://github.com/chili-epfl/justhink-alignment-analysis/compare/v1.0.0...v1.1.0
&amp;lt;strong>2. Publications&amp;lt;/strong> If you use this work in an
academic context, please cite the following publications: Norman*, U., Dinkar*,
T., Bruno, B., &amp;amp;amp; Clavel, C. (2022). Studying Alignment in a
Collaborative Learning Activity via Automatic Methods: The Link Between What We
Say and Do. Dialogue &amp;amp;amp; Discourse, 13(2), 148. *Contributed equally
to this work. https://doi.org/10.5210/dad.2022.201 Norman, U., Dinkar, T.,
Bruno, B., &amp;amp;amp; Clavel, C. (2021). JUSThink Alignment Analysis. In
Dialogue &amp;amp;amp; Discourse (v1.1.0, Vol. 13, Number 2, pp. 148). Zenodo.
https://doi.org/10.5281/zenodo.6974562 &amp;lt;strong>3. Content&amp;lt;/strong>
The tools provided in this repository consists of 7 Jupyter Notebooks written in
Python 3, and two additional external tools utilised by the notebooks.
&amp;lt;strong>3.1. Jupyter Notebooks&amp;lt;/strong> We highlight that the
notebooks up until the last (i.e. to test the hypotheses
(tools/7_test_the_hypotheses.ipynb)) present a general pipeline to process event
logs, test responses and transcripts to extract measures of task performance,
learning outcomes, and measures of alignment. &amp;lt;strong>Extract task
performance (and other features) from the logs
&amp;lt;/strong>(tools/1_extract_performance_and_other_features_from_logs.ipynb):
Extracts various measures of task behaviour from the logs, at varying
granularities of the activity (i.e. the whole corpus, task, attempt, and turn
levels). In later notebooks, we focus on one of the features to estimate the
task performance of a team: (minimum) error. &amp;lt;strong>Extract learning
outcomes from the test responses&amp;lt;/strong>
(tools/2_extract_learning_gain_from_test_responses.ipynb): Extracts measures of
learning outcomes from the responses to the pre-test and the post-test. In later
notebooks, we focus on one of the features to estimate the learning outcome of a
team: relative learning gain [4] &amp;lt;strong>Select and visualise a subset of
teams for transcription&amp;lt;/strong>
(tools/3_visualise_transcribed_teams.ipynb): Visualises the transcribed teams
among the other teams in the feature space spanned by task performance and
learning outcome, as well as the distribution of their number of attempts and
turns. &amp;lt;strong>Extract routines from transcripts&amp;lt;/strong>
(tools/4_extract_routines_from_transcripts.ipynb) (uses dialign to extract
routines): Extracts routines of referring expressions that are "fixed", i.e.
become shared or established amongst interlocutors. &amp;lt;strong>Combine
transcripts with logs&amp;lt;/strong>
(tools/5_construct_the_corpus_by_combining_transcripts_with_logs.ipynb): Merges
transcripts with event logs to have a combined dialogue and actions corpus, to
be processed e.g. to detect follow-up actions. &amp;lt;strong>Recognise
instructions and detect follow-up actions&amp;lt;/strong>
(tools/6_recognise_instructions_detect_follow-up_actions.ipynb): Extracts
verbalised instruction such as "connect Mount Basel to Montreux", and pairs them
with the follow-up action that may &amp;lt;em>match&amp;lt;/em> (e.g. if the
other connects Basel to Montreux) or &amp;lt;em>mismatch&amp;lt;/em> (e.g. if
the other connects Basel to Neuchatel) with the instruction. &amp;lt;strong>Test
the hypotheses &amp;lt;/strong>in [3] (tools/7_test_the_hypotheses.ipynb) (uses
&amp;lt;strong>effsize&amp;lt;/strong> to estimate effect size, specifically
Cliff's Delta): Considers each research questions and hypotheses studied in [3]
and generates the results in [3]. &amp;lt;strong>3.2. External
Tools&amp;lt;/strong> &amp;lt;strong>dialign tool&amp;lt;/strong> to extract
routines, specifically Release 1.0 from dialign-1.0.zip:&amp;lt;br> It extracts
routine expressions that are "shared" among the participants from transcripts.
&amp;lt;br> It is used as an external module (in accordance with its CeCILL-B
License, see &amp;lt;strong>License&amp;lt;/strong>). &amp;lt;strong>effsize
tool&amp;lt;/strong> to compute estimators of effect size.&amp;lt;br> We
specifically use it to compute Cliff's Delta, which quantifies the amount
difference between two groups of observations, by computing the Cliff's Delta
statistic.&amp;lt;br> It is taken from project DABEST (see
&amp;lt;strong>License&amp;lt;/strong>). &amp;lt;strong>4. Research Questions
and Hypotheses in [3]&amp;lt;/strong> &amp;lt;strong>RQ1 Lexical
alignment&amp;lt;/strong>: How do the interlocutors &amp;lt;em>use&amp;lt;/em>
expressions related to the task? Is this associated with task success?
&amp;lt;strong>H1.1&amp;lt;/strong>: Task-specific referents become routine
early for more successful teams. &amp;lt;strong>H1.2&amp;lt;/strong>: Hesitation
phenomena are more likely to occur in the vicinity of priming and establishment
of task-specific referents for more successful teams. &amp;lt;strong>RQ2
Behavioural alignment&amp;lt;/strong>: How do the interlocutors
&amp;lt;em>follow up&amp;lt;/em> these expressions with actions? Is this
associated with task success? &amp;lt;strong>H2.1&amp;lt;/strong>: Instructions
are more likely to be followed by a corresponding action early in the dialogue
for more successful teams. &amp;lt;strong>H2.2&amp;lt;/strong>: When
instructions are followed by a corresponding or a different action, the action
is more likely to be in the vicinity of information management phenomena for
more successful teams. The RQs and Hs are addressed in the notebook for testing
the hypotheses (i.e. tools/7_test_the_hypotheses.ipynb).
&amp;lt;strong>Acknowledgements&amp;lt;/strong> This project has received
funding from the European Union's Horizon 2020 research and innovation programme
under grant agreement No 765955. Namely, the ANIMATAS Project.
&amp;lt;strong>License&amp;lt;/strong> The whole package is under MIT License,
see the &amp;lt;strong>LICENSE&amp;lt;/strong> file. Classes under the
&amp;lt;strong>tools/effsize&amp;lt;/strong> package were taken from project
&amp;lt;strong>DABEST&amp;lt;/strong>, Copyright 2016-2020 Joses W. Ho. These
classes are licensed under the BSD 3-Clause Clear License. See
&amp;lt;strong>tools/effsize/LICENSE&amp;lt;/strong> file for additional
details. Classes under the &amp;lt;strong>tools/dialign-1.0&amp;lt;/strong>
package were taken from project &amp;lt;strong>dialign&amp;lt;/strong>. These
classes are licensed under the CeCILL-B License. This package is used as an
"external module", see&amp;lt;strong>
tools/dialign-1.0/LICENSE.txt&amp;lt;/strong> for additional
details.</description>
<description>{"references": ["[1] J. Nasir, U. Norman, B. Bruno, and P. Dillenbourg,
\"You Tell, I Do, and We Swap until we Connect All the Gold Mines!,\" ERCIM
News, vol. 2020, no. 120, 2020, [Online]. Available:
https://ercim-news.ercim.eu/en120/special/you-tell-i-do-and-we-swap-until-we-connect-all-the-gold-mines",
"[2] J. Nasir*, U. Norman*, B. Bruno, and P. Dillenbourg, \"When Positive
Perception of the Robot Has No Effect on Learning,\" in 2020 29th IEEE
International Conference on Robot and Human Interactive Communication (RO-MAN),
Aug. 2020, pp. 313\u2013320, doi: 10.1109/RO-MAN47096.2020.9223343", "[3] U.
Norman*, T. Dinkar*, B. Bruno, and C. Clavel, \"Studying Alignment in a
Collaborative Learning Activity via Automatic Methods: The Link Between What We
Say and Do,\" Dialogue &amp;amp;amp; Discourse, vol. 13, no. 2, pp. 1\u201348,
Aug. 2022, doi: 10.5210/dad.2022.201.", "[4] M. Sangin, G. Molinari, M.-A.
N\u00fcssli, and P. Dillenbourg, \"Facilitating peer knowledge modeling: Effects
of a knowledge awareness tool on collaborative learning outcomes and
processes,\"\" Computers in Human Behavior, vol. 27, no. 3, pp. 1059\u20131067,
May 2011, doi: 10.1016/j.chb.2010.05.032."]}</description>
<subject classid="keyword" classname="keyword"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
>alignment</subject>
<subject classid="keyword" classname="keyword"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">situated
dialogue</subject>
<subject classid="keyword" classname="keyword"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">collaborative
learning</subject>
<subject classid="keyword" classname="keyword"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">spontaneous
speech</subject>
<subject classid="keyword" classname="keyword"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
>disfluency</subject>
<subject classid="keyword" classname="keyword"
schemeid="dnet:subject_classification_typologies"
schemename="dnet:subject_classification_typologies" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9">mutual
understanding</subject>
<language classid="eng" classname="English" schemeid="dnet:languages"
schemename="dnet:languages"/>
<relevantdate classid="issued" classname="issued" schemeid="dnet:dataCite_date"
schemename="dnet:dataCite_date" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
>2021-04-09</relevantdate>
<relevantdate classid="issued" classname="issued" schemeid="dnet:dataCite_date"
schemename="dnet:dataCite_date" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
>2022-08-08</relevantdate>
<publisher>Zenodo</publisher>
<resulttype classid="software" classname="software"
schemeid="dnet:result_typologies" schemename="dnet:result_typologies"/>
<resourcetype classid="UNKNOWN" classname="UNKNOWN"
schemeid="dnet:dataCite_resource" schemename="dnet:dataCite_resource"/>
<programmingLanguage/>
<context id="EC" label="European Commission" type="funding">
<category id="EC::H2020" label="Horizon 2020 Framework Programme">
<concept id="EC::H2020::MSCA-ITN-ETN" label="European Training Networks"/>
</category>
</context>
<eoscifguidelines code="EOSC::Jupyter Notebook"
label="EOSC::Jupyter Notebook"
url=""
semanticrelation="compliesWith"/>
<datainfo>
<inferred>true</inferred>
<deletedbyinference>false</deletedbyinference>
<trust>0.8</trust>
<inferenceprovenance>dedup-result-decisiontree-v3</inferenceprovenance>
<provenanceaction classid="sysimport:dedup" classname="Inferred by OpenAIRE"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</datainfo>
<rels>
<rel inferred="false" trust="0.9" inferenceprovenance=""
provenanceaction="sysimport:actionset">
<to class="IsSupplementTo" scheme="dnet:result_result_relations"
type="publication">doi_dedup___::ae235765bbc422195a6c9f632b2d77eb</to>
<collectedfrom name="arXiv.org e-Print Archive"
id="opendoar____::6f4922f45568161a8cdf4ad2299f6d23"/>
<pid classid="arXiv" classname="arXiv" schemeid="dnet:pid_types"
schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
>2104.04429</pid>
<collectedfrom name="Infoscience - EPFL scientific publications"
id="opendoar____::eecca5b6365d9607ee5a9d336962c534"/>
<publisher>arXiv</publisher>
<collectedfrom name="Crossref"
id="openaire____::081b82f96300b6a6e3d282bad31cb6e2"/>
<dateofacceptance>2022-08-05</dateofacceptance>
<title classid="main title" classname="main title"
schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">Studying
Alignment in a Collaborative Learning Activity via Automatic Methods:
The Link Between What We Say and Do</title>
<collectedfrom name="ORCID"
id="openaire____::806360c771262b4d6770e7cdf04b5c5a"/>
<collectedfrom name="Datacite"
id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
<pid classid="doi" classname="Digital Object Identifier"
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:actionset" trust="0.9"
>10.48550/arxiv.2104.04429</pid>
<pid classid="doi" classname="Digital Object Identifier"
schemeid="dnet:pid_types" schemename="dnet:pid_types"
>10.5210/dad.2022.201</pid>
</rel>
<rel inferred="false" trust="0.9" inferenceprovenance=""
provenanceaction="sysimport:actionset">
<to class="isProducedBy" scheme="dnet:result_project_relations"
type="project">corda__h2020::c4515ebef538a734cf11f795347f5dac</to>
<title>Advancing intuitive human-machine interaction with human-like social
capabilities for education in schools</title>
<code>765955</code>
<funding>
<funder id="ec__________::EC" shortname="EC" name="European Commission"
jurisdiction="EU"/>
<funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
<funding_level_1 name="MSCA-ITN-ETN"
>ec__________::EC::H2020::MSCA-ITN-ETN</funding_level_1>
</funding>
<acronym>ANIMATAS</acronym>
</rel>
<rel inferred="false" trust="0.9" inferenceprovenance=""
provenanceaction="sysimport:actionset">
<to class="IsSupplementedBy" scheme="dnet:result_result_relations"
type="dataset">doi_dedup___::0a6314b0ed275d915f5b57a259375691</to>
<dateofacceptance>2021-03-22</dateofacceptance>
<publisher>Zenodo</publisher>
<pid classid="doi" classname="Digital Object Identifier"
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
>10.5281/zenodo.4627104</pid>
<title classid="main title" classname="main title"
schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title"
inferred="false" provenanceaction="sysimport:crosswalk:repository"
trust="0.9">JUSThink Dialogue and Actions Corpus</title>
<pid classid="doi" classname="Digital Object Identifier"
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:actionset" trust="0.9"
>10.5281/zenodo.4627103</pid>
<collectedfrom name="ZENODO"
id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
<collectedfrom name="Datacite"
id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
</rel>
</rels>
<children>
<result objidentifier="doi_________::c054151b6a8c4f41c7acf160651a6503">
<publisher>Zenodo</publisher>
<pid classid="doi" classname="Digital Object Identifier"
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
>10.5281/zenodo.4675070</pid>
<title classid="main title" classname="main title"
schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title"
inferred="false" provenanceaction="sysimport:crosswalk:repository"
trust="0.9">JUSThink Alignment Analysis</title>
<dateofacceptance>2021-04-09</dateofacceptance>
<collectedfrom name="ZENODO"
id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
</result>
<result objidentifier="doi_________::04aaa160a921cafdc90e03483de0a26f">
<dateofacceptance>2022-08-08</dateofacceptance>
<publisher>Zenodo</publisher>
<pid classid="doi" classname="Digital Object Identifier"
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
>10.5281/zenodo.6974562</pid>
<collectedfrom name="ZENODO"
id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
<title classid="main title" classname="main title"
schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title"
inferred="false" provenanceaction="sysimport:crosswalk:repository"
trust="0.9">JUSThink Alignment Analysis (v1.1.0)</title>
</result>
<result objidentifier="doi_________::684a8fbe0ff09f288e9d29db897233bb">
<title classid="main title" classname="main title"
schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">JUSThink
Alignment Analysis (v1.1.0)</title>
<dateofacceptance>2022-08-08</dateofacceptance>
<publisher>Zenodo</publisher>
<pid classid="doi" classname="Digital Object Identifier"
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:actionset" trust="0.9"
>10.5281/zenodo.4675069</pid>
<collectedfrom name="Datacite"
id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
</result>
<instance>
<accessright classid="OPEN" classname="Open Access"
schemeid="dnet:access_modes" schemename="dnet:access_modes"/>
<collectedfrom name="Datacite"
id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
<dateofacceptance>2022-08-08</dateofacceptance>
<instancetype classid="0029" classname="Software"
schemeid="dnet:publication_resource"
schemename="dnet:publication_resource"/>
<pid classid="doi" classname="Digital Object Identifier"
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:actionset" trust="0.9"
>10.5281/zenodo.4675069</pid>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels"
schemename="dnet:review_levels"/>
<license>https://opensource.org/licenses/MIT</license>
<webresource>
<url>https://doi.org/10.5281/zenodo.4675069</url>
</webresource>
</instance>
<instance>
<accessright classid="OPEN" classname="Open Access"
schemeid="dnet:access_modes" schemename="dnet:access_modes"/>
<collectedfrom name="ZENODO"
id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
<dateofacceptance>2022-08-08</dateofacceptance>
<instancetype classid="0029" classname="Software"
schemeid="dnet:publication_resource"
schemename="dnet:publication_resource"/>
<pid classid="doi" classname="Digital Object Identifier"
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
>10.5281/zenodo.6974562</pid>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels"
schemename="dnet:review_levels"/>
<license>https://opensource.org/licenses/MIT</license>
<webresource>
<url>https://doi.org/10.5281/zenodo.6974562</url>
</webresource>
</instance>
<instance>
<accessright classid="OPEN" classname="Open Access"
schemeid="dnet:access_modes" schemename="dnet:access_modes"/>
<collectedfrom name="ZENODO"
id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
<dateofacceptance>2021-04-09</dateofacceptance>
<instancetype classid="0029" classname="Software"
schemeid="dnet:publication_resource"
schemename="dnet:publication_resource"/>
<pid classid="doi" classname="Digital Object Identifier"
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
>10.5281/zenodo.4675070</pid>
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels"
schemename="dnet:review_levels"/>
<license>https://opensource.org/licenses/MIT</license>
<webresource>
<url>https://doi.org/10.5281/zenodo.4675070</url>
</webresource>
</instance>
</children>
</oaf:result>
</oaf:entity>
</metadata>
</result>
</record>

View File

@ -0,0 +1,99 @@
<record>
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header>
<dri:objIdentifier>doi_________::9cb0664d4c891c4baaf73f007c0c9de0</dri:objIdentifier>
<dri:dateOfCollection>2022-11-25T12:55:13Z</dri:dateOfCollection>
<dri:status>under curation</dri:status>
<counters />
</header>
<metadata>
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
<oaf:result>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">COVID-19 Knowledge Graph: A semantic resource embedding biological and chemical entities</title>
<creator rank="1" name="" surname="">Karki, Reagon</creator>
<dateofacceptance />
<resulttype classid="software" classname="software" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
<language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages" />
<description><p>A Knowledge graph representation of compounds and associated biological entities in the BY-COVID and EOSC Future project.</p> <p><strong>Current status</strong></p> <ul> <li>Number of Nodes: 35952</li> <li>Number of Edges: 279462</li> <li>Human Proteins: 1347</li> <li>Assay: 15835</li> <li>Chemical/Compound: 4096</li> <li>Mechanism of Action: 739</li> <li>Pathway: 1513</li> <li>Disease: 1585</li> <li>SideEffect: 7420</li> <li>Biological Process: 2085</li> <li>Molecular Function: 1332</li> </ul> <p>Please check the BY_COVID_update_August.ipynb for understanding step wise process of KG generation and KG statistics. The KG has been exported to formats such as graphml, sif and so on for visualizations in other platforms. For example, the graphml file can be imported to Cytoscape directly. These files are located in &#39;data\export&#39; folder.</p> <p></p></description>
<country classid="" classname="" schemeid="" schemename="" />
<subject classid="" classname="" schemeid="" schemename="" />
<relevantdate classid="" classname="" schemeid="" schemename="" />
<publisher>Zenodo</publisher>
<embargoenddate />
<journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol="" />
<source />
<fulltext />
<format />
<storagedate />
<resourcetype classid="" classname="" schemeid="" schemename="" />
<device />
<size />
<version />
<lastmetadataupdate />
<metadataversionnumber />
<documentationUrl />
<codeRepositoryUrl />
<programmingLanguage classid="" classname="" schemeid="" schemename="" />
<contactperson />
<contactgroup />
<tool />
<originalId>oai:zenodo.org:7351221</originalId>
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<pid classid="oai" classname="Open Archives Initiative" schemeid="dnet:pid_types" schemename="dnet:pid_types">oai:zenodo.org:7351221</pid>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5281/zenodo.7351221</pid>
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<context id="covid-19" label="COVID-19" type="community"></context>
<datainfo>
<inferred>false</inferred>
<deletedbyinference>false</deletedbyinference>
<trust>0.9</trust>
<inferenceprovenance />
<provenanceaction classid="user:insert" classname="user:insert" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
</datainfo>
<rels>
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::256485716fdb9f5ca69007b7ca5a072b</to>
<code>101017536</code>
<acronym>EOSC Future</acronym>
<title>EOSC Future</title>
<contracttype classid="" classname="" schemeid="" schemename="" />
<funding>
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
<funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
</funding>
<websiteurl />
</rel>
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::4a3254eac2997eee0a9dcb7a7daedb81</to>
<code>101046203</code>
<acronym>BY-COVID</acronym>
<title>Beyond COVID</title>
<contracttype classid="" classname="" schemeid="" schemename="" />
<funding>
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
<funding_level_0 name="Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based">ec__________::EC::Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based</funding_level_0>
</funding>
<websiteurl />
</rel>
</rels>
<children>
<instance id="od______2659::040cee965a4544e343a2ba149783c3fc">
<instancetype classid="0029" classname="Software" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<dateofacceptance />
<webresource>
<url>https://zenodo.org/record/7351221</url>
</webresource>
</instance>
</children>
</oaf:result>
</oaf:entity>
</metadata>
</result>
</record>

View File

@ -0,0 +1,100 @@
<record>
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header>
<dri:objIdentifier>doi_________::07fdccabd77830e3caccf0b33c083f1b</dri:objIdentifier>
<dri:dateOfCollection>2022-11-25T01:08:31Z</dri:dateOfCollection>
<dri:status>under curation</dri:status>
<counters />
</header>
<metadata>
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
<oaf:result>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">Monkeypox Knowledge Graph: A semantic resource embedding biological and chemical entities</title>
<creator rank="1" name="" surname="">Karki, Reagon</creator>
<creator rank="2" name="" surname="">Andrea, Zaliani</creator>
<creator rank="3" name="" surname="">Gadiya, Yojana</creator>
<creator rank="4" name="" surname="">Gribbon, Philip</creator>
<dateofacceptance />
<resulttype classid="software" classname="software" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
<language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages" />
<description><p>The Monkeypox KG is built using viral and human proteins reported in different resources. Additionally, the KG represents chemicals tested against Monkeypox and their targets, associated biological processes, molecular functions, diseases and side effects.</p> <p><strong>KG status</strong></p> <p>Version 1 stats:</p> <ul> <li>Number of Nodes: 8235</li> <li>Number of Edges: 40422</li> </ul> <p>Version 2 stats (2nd September) :</p> <ul> <li>Number of Nodes: 9129</li> <li>Number of Edges: 44568</li> </ul> <p>Please check the graph.ipynb for understanding step wise process of KG generation and KG statistics. The KG has been exported to formats such as graphml, sif and so on for visualizations in other platforms. For example, the graphml file can be imported to Cytoscape directly. These files are located in &#39;data\export&#39; folder.</p> <p></p></description>
<country classid="" classname="" schemeid="" schemename="" />
<subject classid="" classname="" schemeid="" schemename="" />
<relevantdate classid="" classname="" schemeid="" schemename="" />
<publisher>Zenodo</publisher>
<embargoenddate />
<journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol="" />
<source />
<fulltext />
<format />
<storagedate />
<resourcetype classid="" classname="" schemeid="" schemename="" />
<device />
<size />
<version />
<lastmetadataupdate />
<metadataversionnumber />
<documentationUrl />
<codeRepositoryUrl />
<programmingLanguage classid="" classname="" schemeid="" schemename="" />
<contactperson />
<contactgroup />
<tool />
<originalId>oai:zenodo.org:7351393</originalId>
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<pid classid="oai" classname="Open Archives Initiative" schemeid="dnet:pid_types" schemename="dnet:pid_types">oai:zenodo.org:7351393</pid>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5281/zenodo.7351393</pid>
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<datainfo>
<inferred>false</inferred>
<deletedbyinference>false</deletedbyinference>
<trust>0.9</trust>
<inferenceprovenance />
<provenanceaction classid="user:insert" classname="user:insert" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
</datainfo>
<rels>
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::256485716fdb9f5ca69007b7ca5a072b</to>
<code>101017536</code>
<acronym>EOSC Future</acronym>
<title>EOSC Future</title>
<contracttype classid="" classname="" schemeid="" schemename="" />
<funding>
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
<funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
</funding>
<websiteurl />
</rel>
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::4a3254eac2997eee0a9dcb7a7daedb81</to>
<code>101046203</code>
<acronym>BY-COVID</acronym>
<title>Beyond COVID</title>
<contracttype classid="" classname="" schemeid="" schemename="" />
<funding>
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
<funding_level_0 name="Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based">ec__________::EC::Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based</funding_level_0>
</funding>
<websiteurl />
</rel>
</rels>
<children>
<instance id="od______2659::db2bc6381545f80dc9feec808a173ec0">
<instancetype classid="0029" classname="Software" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<dateofacceptance />
<webresource>
<url>https://zenodo.org/record/7351393</url>
</webresource>
</instance>
</children>
</oaf:result>
</oaf:entity>
</metadata>
</result>
</record>

View File

@ -0,0 +1,85 @@
<record>
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header>
<dri:objIdentifier>doi_________::93d39dd7edef016928788c3500e149f1</dri:objIdentifier>
<dri:dateOfCollection>2022-11-24T08:41:37Z</dri:dateOfCollection>
<dri:status>under curation</dri:status>
<counters/>
</header>
<metadata>
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
<oaf:result>
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">ENVRI SP - Dashboard State of the Environment - Ocean Indicators</title>
<creator rank="1" name="" surname="">Tjerk Krijger</creator>
<dateofacceptance />
<resulttype classid="other" classname="other" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
<language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages" />
<description><p>The attached .yaml file is used as input to the Dashboard State of the Environment, which is a science project of the ENVRI-FAIR science cluster within EOSC-FUTURE. The contents of the file enable the visualization of Ocean indicators on the dashboard. It is possible to download the attached file and change the contents to include indicators from different domains such as atmosphere or biodiversity.</p></description>
<country classid="" classname="" schemeid="" schemename="" />
<subject classid="" classname="" schemeid="" schemename="" />
<relevantdate classid="" classname="" schemeid="" schemename="" />
<publisher>Zenodo</publisher>
<embargoenddate />
<journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol="" />
<source />
<fulltext />
<format />
<storagedate />
<resourcetype classid="" classname="" schemeid="" schemename="" />
<device />
<size />
<version />
<lastmetadataupdate />
<metadataversionnumber />
<documentationUrl />
<codeRepositoryUrl />
<programmingLanguage classid="" classname="" schemeid="" schemename="" />
<contactperson />
<contactgroup />
<tool />
<originalId>oai:zenodo.org:7353841</originalId>
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<pid classid="oai" classname="Open Archives Initiative" schemeid="dnet:pid_types" schemename="dnet:pid_types">oai:zenodo.org:7353841</pid>
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5281/zenodo.7353841</pid>
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<datainfo>
<inferred>false</inferred>
<deletedbyinference>false</deletedbyinference>
<trust>0.9</trust>
<inferenceprovenance />
<provenanceaction classid="user:insert" classname="user:insert" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
</datainfo>
<rels>
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::256485716fdb9f5ca69007b7ca5a072b</to>
<code>101017536</code>
<acronym>EOSC Future</acronym>
<title>EOSC Future</title>
<contracttype classid="" classname="" schemeid="" schemename="" />
<funding>
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
<funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
</funding>
<websiteurl />
</rel>
</rels>
<children>
<instance id="od______2659::3e4323c221f269e5f3d6db4c61dd2ec8">
<instancetype classid="0020" classname="Other ORP type" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
<dateofacceptance />
<webresource>
<url>https://zenodo.org/record/7353841</url>
</webresource>
</instance>
</children>
</oaf:result>
</oaf:entity>
</metadata>
</result>
</record>

View File

@ -2,11 +2,11 @@
<FIELDS>
<FIELD indexable="false" name="oafentity" result="true" stat="false" tokenizable="false" xpath="//*[local-name() = 'entity']"/>
<FIELD indexable="true" name="oaftype" result="false" stat="false" tokenizable="false" value="local-name(//*[local-name()='entity']/*[local-name() != 'extraInfo'])"/>
<FIELD indexable="true" name="objIdentifier" result="false" stat="false" tokenizable="false" xpath="//header/dri:objIdentifier"/><!-- DATASOURCE FIELDS -->
<FIELD indexable="true" name="datasourceofficialname" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/officialname"/>
<FIELD indexable="true" name="datasourceenglishname" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/englishname"/>
<FIELD indexable="true" name="datasourceoddescription" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/oddescription"/>
<FIELD indexable="true" name="datasourceodsubjects" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/odsubjects"/>
<FIELD indexable="true" name="objIdentifier" result="false" stat="false" tokenizable="false" xpath="//header/dri:objIdentifier"/><!-- DATASOURCE FIELDS -->
<FIELD copy="true" indexable="true" name="datasourceofficialname" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/officialname"/>
<FIELD copy="true" indexable="true" name="datasourceenglishname" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/englishname"/>
<FIELD copy="true" indexable="true" name="datasourceoddescription" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/oddescription"/>
<FIELD copy="true" indexable="true" name="datasourceodsubjects" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/odsubjects"/>
<FIELD indexable="true" name="datasourceodlanguages" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/odlanguages"/>
<FIELD indexable="true" name="datasourceodcontenttypes" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/odcontenttypes"/>
<FIELD indexable="true" multivalued="false" name="datasourcetypename" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/datasourcetype/@classname"/>
@ -14,17 +14,16 @@
<FIELD indexable="true" multivalued="false" name="datasourcetypeuiname" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/datasourcetypeui/@classname"/>
<FIELD indexable="true" multivalued="false" name="datasourcecompatibilityid" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/openairecompatibility/@classid"/>
<FIELD indexable="true" multivalued="false" name="datasourcecompatibilityname" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/openairecompatibility/@classname"/>
<FIELD indexable="true" multivalued="true" name="datasourcesubject" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='datasource']/subjects"/>
<FIELD indexable="true" name="versioning" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/versioning"/>
<!-- datasource fields for EOSC -->
<FIELD indexable="true" name="datasourcejurisdiction" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/jurisdiction"/>
<FIELD copy="true" indexable="true" multivalued="true" name="datasourcesubject" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='datasource']/subjects"/>
<FIELD indexable="true" name="versioning" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/versioning"/><!-- datasource fields for EOSC -->
<FIELD indexable="true" name="datasourcejurisdiction" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/jurisdiction/@classname"/>
<FIELD indexable="true" name="datasourcethematic" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/thematic"/>
<FIELD indexable="true" name="datasourceknowledge_graph" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/knowledgegraph"/>
<FIELD indexable="true" name="datasourcecontentpolicy" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/contentpolicy"/>
<!-- ORGANIZATION FIELDS -->
<FIELD indexable="true" name="organizationlegalshortname" result="false" stat="false" type="ngramtext" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='organization']//legalshortname)"/>
<FIELD indexable="true" name="organizationlegalname" result="false" stat="false" type="ngramtext" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='organization']//legalname)"/>
<FIELD indexable="true" name="organizationalternativenames" result="false" stat="false" type="ngramtext" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='organization']//alternativeNames)"/>
<FIELD indexable="true" name="datasourcecontentpolicy" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/contentpolicy/@classname"/>
<FIELD indexable="true" name="eosctype" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/eosctype/@classname"/>
<FIELD indexable="true" name="eoscdatasourcetype" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/eoscdatasourcetype/@classname"/><!-- ORGANIZATION FIELDS -->
<FIELD copy="true" indexable="true" name="organizationlegalshortname" result="false" stat="false" type="ngramtext" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='organization']//legalshortname)"/>
<FIELD copy="true" indexable="true" name="organizationlegalname" result="false" stat="false" type="ngramtext" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='organization']//legalname)"/>
<FIELD copy="true" indexable="true" name="organizationalternativenames" result="false" stat="false" type="ngramtext" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='organization']//alternativeNames)"/>
<FIELD indexable="true" name="organizationeclegalbody" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/eclegalbody"/>
<FIELD indexable="true" name="organizationeclegalperson" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/eclegalperson"/>
<FIELD indexable="true" name="organizationecnonprofit" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/ecnonprofit"/>
@ -34,18 +33,17 @@
<FIELD indexable="true" name="organizationecenterprise" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/ecenterprise"/>
<FIELD indexable="true" name="organizationecsmevalidated" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/ecsmevalidated"/>
<FIELD indexable="true" name="organizationecnutscode" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/ecnutscode"/>
<FIELD indexable="true" multivalued="false" name="organizationcountryname" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/country/@classname"/>
<!-- PROJECT FIELDS -->
<FIELD indexable="true" name="projectcode" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='project']/code"/>
<FIELD indexable="true" multivalued="false" name="organizationcountryname" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/country/@classname"/><!-- PROJECT FIELDS -->
<FIELD copy="true" indexable="true" name="projectcode" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='project']/code"/>
<FIELD indexable="true" name="projectcode_nt" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/code"/>
<FIELD indexable="true" name="projectacronym" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='project']/acronym"/>
<FIELD indexable="true" name="projecttitle" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='project']/title"/>
<FIELD indexable="true" multivalued="false" name="projectstartdate" result="false" stat="false" type="pdate" value="//*[local-name()='entity']/*[local-name()='project']/startdate"/>
<FIELD copy="true" indexable="true" name="projectacronym" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='project']/acronym"/>
<FIELD copy="true" indexable="true" name="projecttitle" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='project']/title"/>
<FIELD indexable="true" multivalued="false" name="projectstartdate" result="false" stat="false" type="date" value="//*[local-name()='entity']/*[local-name()='project']/startdate"/>
<FIELD indexable="true" multivalued="false" name="projectstartyear" result="false" stat="false" tokenizable="false" value="dnet:extractYear(//*[local-name()='entity']/*[local-name()='project']/startdate)"/>
<FIELD indexable="true" multivalued="false" name="projectenddate" result="false" stat="false" type="pdate" value="//*[local-name()='entity']/*[local-name()='project']/enddate"/>
<FIELD indexable="true" multivalued="false" name="projectenddate" result="false" stat="false" type="date" value="//*[local-name()='entity']/*[local-name()='project']/enddate"/>
<FIELD indexable="true" multivalued="false" name="projectendyear" result="false" stat="false" tokenizable="false" value="dnet:extractYear(//*[local-name()='entity']/*[local-name()='project']/enddate)"/>
<FIELD indexable="true" multivalued="false" name="projectcallidentifier" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/callidentifier"/>
<FIELD indexable="true" name="projectkeywords" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/keywords"/>
<FIELD copy="true" indexable="true" name="projectkeywords" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/keywords"/>
<FIELD indexable="true" multivalued="false" name="projectduration" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/duration"/>
<FIELD indexable="true" multivalued="false" name="projectecsc39" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='project']/ecsc39)"/>
<FIELD indexable="true" multivalued="false" name="projectoamandatepublications" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/oamandatepublications"/>
@ -54,35 +52,36 @@
<FIELD indexable="true" multivalued="false" name="projectcontracttypename" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/contracttype/@classname"/>
<FIELD indexable="true" name="fundinglevel0_id" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_0/id"/>
<FIELD indexable="true" name="fundinglevel0_name" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_0/name"/>
<FIELD indexable="true" name="fundinglevel0_description" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_0/description"/>
<FIELD copy="true" indexable="true" name="fundinglevel0_description" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_0/description"/>
<FIELD indexable="true" name="fundinglevel1_id" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_1/id"/>
<FIELD indexable="true" name="fundinglevel1_name" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_1/name"/>
<FIELD indexable="true" name="fundinglevel1_description" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_1/description"/>
<FIELD copy="true" indexable="true" name="fundinglevel1_description" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_1/description"/>
<FIELD indexable="true" name="fundinglevel2_id" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_2/id"/>
<FIELD indexable="true" name="fundinglevel2_name" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_2/name"/>
<FIELD indexable="true" name="fundinglevel2_description" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_2/description"/><!-- PROJECTS' FUNDER FIELDS: indexable only with the new funding path/context handling -->
<FIELD copy="true" indexable="true" name="fundinglevel2_description" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_2/description"/><!-- PROJECTS' FUNDER FIELDS: indexable only with the new funding path/context handling -->
<FIELD indexable="true" name="funder" result="false" stat="false" tokenizable="false" value="concat(./id/text(), '||', ./name/text(), '||', ./shortname/text())" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree/funder"/>
<FIELD indexable="true" name="fundershortname" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree/funder/shortname"/>
<FIELD indexable="true" name="funderid" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree/funder/id"/>
<FIELD indexable="true" name="fundername" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree/funder/name"/>
<FIELD indexable="true" name="funderoriginalname" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree/funder/originalname"/>
<FIELD indexable="true" name="funderjurisdiction" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree/funder/jurisdiction"/><!-- RESULT FIELDS -->
<FIELD indexable="true" name="resulttitle" result="false" stat="false" xpath="//*[local-name() = 'entity']/*[local-name() ='result']/title | //*[local-name()='entity']/*[local-name()='result']/children/result/title"/>
<FIELD indexable="true" name="resultsubject" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/subject)"/>
<FIELD copy="true" indexable="true" name="resulttitle" result="false" stat="false" type="text_en" xpath="//*[local-name() = 'entity']/*[local-name() ='result']/title | //*[local-name()='entity']/*[local-name()='result']/children/result/title"/>
<FIELD indexable="true" name="resultsubject" result="false" stat="false" type="text_en" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/subject)"/>
<FIELD indexable="true" name="resultsubjectclass" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/subject/@classname)"/>
<FIELD indexable="true" multivalued="false" name="resultembargoenddate" result="false" stat="false" type="pdate" value="//*[local-name()='entity']/*[local-name()='result']/embargoenddate"/>
<FIELD indexable="true" multivalued="false" name="resultembargoenddate" result="false" stat="false" type="date" value="//*[local-name()='entity']/*[local-name()='result']/embargoenddate"/>
<FIELD indexable="true" multivalued="false" name="resultembargoendyear" result="false" stat="false" tokenizable="false" value="dnet:extractYear(//*[local-name()='entity']/*[local-name()='result']/embargoenddate)"/>
<FIELD indexable="true" multivalued="false" name="resulttypeid" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='result']/resulttype/@classid"/>
<FIELD indexable="true" multivalued="false" name="resulttypename" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='result']/resulttype/@classname"/>
<FIELD indexable="true" multivalued="false" name="resultlanguagename" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='result']/language/@classname"/>
<FIELD indexable="true" name="resultpublisher" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='result']/*[local-name()='publisher']"/>
<FIELD indexable="true" name="resultdescription" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='result']//*[local-name()='description']"/>
<FIELD copy="true" indexable="true" name="resultpublisher" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='result']/*[local-name()='publisher']"/>
<FIELD copy="true" indexable="true" name="resultdescription" result="false" stat="false" type="text_en" xpath="//*[local-name()='entity']/*[local-name()='result']//*[local-name()='description']"/>
<FIELD indexable="true" name="resultlicense" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='result']/children/instance/license"/>
<FIELD indexable="true" name="resultaccessright" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='result']/children/instance/accessright/@classname"/>
<FIELD indexable="true" name="resultresourcetypename" result="false" stat="false" type="string_ci" xpath="//*[local-name()='entity']/*[local-name()='result']/resourcetype/@classname"/>
<FIELD indexable="true" multivalued="false" name="resultbestaccessright" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/bestaccessright/@classname)"/>
<FIELD indexable="true" multivalued="false" name="resultdateofacceptance" result="false" stat="false" type="pdate" value="//*[local-name()='entity']/*[local-name()='result']/dateofacceptance"/>
<FIELD indexable="true" multivalued="false" name="resultacceptanceyear" result="false" stat="false" tokenizable="false" value="dnet:extractYear(//*[local-name()='entity']/*[local-name()='result']/dateofacceptance)"/>
<FIELD indexable="true" multivalued="true" name="resultauthor" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='result']/creator"/>
<FIELD indexable="true" multivalued="false" name="resultdateofacceptance" result="false" stat="false" type="date" value="//*[local-name()='entity']/*[local-name()='result']/dateofacceptance"/>
<FIELD copy="true" indexable="true" multivalued="false" name="resultacceptanceyear" result="false" stat="false" tokenizable="false" value="dnet:extractYear(//*[local-name()='entity']/*[local-name()='result']/dateofacceptance)"/>
<FIELD copy="true" indexable="true" multivalued="true" name="resultauthor" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='result']/creator"/>
<FIELD indexable="true" multivalued="true" name="resultauthor_nt" result="false" stat="false" type="string_ci" xpath="//*[local-name()='entity']/*[local-name()='result']/creator"/>
<FIELD indexable="true" multivalued="true" name="authorid" result="false" stat="false" type="string_ci" xpath="//*[local-name()='entity']/*[local-name()='result']/creator/@*[local-name() != 'rank' and local-name() != 'name' and local-name() != 'surname']"/>
<FIELD indexable="true" multivalued="true" name="authoridtype" result="false" stat="false" type="string_ci" xpath="//*[local-name()='entity']/*[local-name()='result']/creator/@*[local-name() != 'rank' and local-name() != 'name' and local-name() != 'surname']/local-name()"/>
@ -94,26 +93,29 @@
<FIELD indexable="true" name="resultdupid" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*//children/result/@objidentifier"/>
<FIELD indexable="true" name="organizationdupid" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*//children/organization/@objidentifier"/>
<FIELD indexable="true" name="externalrefsite" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//children/externalreference/sitename)"/>
<FIELD indexable="true" name="externalreflabel" result="false" stat="false" tokenizable="true" xpath="distinct-values(//*[local-name()='entity']/*//children/externalreference/label)"/>
<FIELD copy="true" indexable="true" name="externalreflabel" result="false" stat="false" tokenizable="true" xpath="distinct-values(//*[local-name()='entity']/*//children/externalreference/label)"/>
<FIELD indexable="true" name="externalrefclass" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//children/externalreference/qualifier/@classid)"/>
<FIELD indexable="true" name="externalrefid" result="false" stat="false" tokenizable="false" xpath="(//*[local-name()='entity']/*//children/externalreference/refidentifier)"/>
<FIELD indexable="true" name="resultidentifier" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/children/instance/webresource/*[local-name()='url'])"/>
<FIELD indexable="true" name="resultsource" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/source)"/><!-- REL FIELDS -->
<FIELD copy="true" indexable="true" name="resultidentifier" result="false" stat="false" type="string_ci" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/children/instance/webresource/*[local-name()='url'])"/>
<FIELD copy="true" indexable="true" name="resultsource" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/source)"/>
<FIELD indexable="true" name="eoscifguidelines" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name() = 'result']/eoscifguidelines/@code)"/><!-- FOS and SDG subjects: non-tokenizable, for faceted search -->
<FIELD indexable="true" name="fos" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/subject[@classid='FOS'])"/>
<FIELD indexable="true" name="sdg" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/subject[@classid='SDG'])"/><!-- REL FIELDS -->
<FIELD indexable="true" name="reldatasourcecompatibilityid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='datasource']/openairecompatibility/@classid)"/>
<FIELD indexable="true" name="relproject" result="false" stat="false" tokenizable="false" value="distinct-values(concat(./text(), '||', dnet:pickFirst(../acronym/text(), ../title/text())))" xpath="//*[local-name()='entity']/*//rel/to[@type='project']"/>
<FIELD indexable="true" name="relprojectid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='project'])"/>
<FIELD indexable="true" name="relprojectcode" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/code)"/>
<FIELD indexable="true" name="relprojectname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/acronym)"/>
<FIELD indexable="true" name="relprojecttitle" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/title)"/>
<FIELD copy="true" indexable="true" name="relprojectname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/acronym)"/>
<FIELD copy="true" indexable="true" name="relprojecttitle" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/title)"/>
<FIELD indexable="true" name="relcontracttypeid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/contracttype/@classid)"/>
<FIELD indexable="true" name="relcontracttypename" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/contracttype/@classname)"/>
<FIELD copy="true" indexable="true" name="relcontracttypename" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/contracttype/@classname)"/>
<FIELD indexable="true" name="relorganizationcountryid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classid)"/>
<FIELD indexable="true" name="relorganizationcountryname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classname)"/>
<FIELD copy="true" indexable="true" name="relorganizationcountryname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classname)"/>
<FIELD indexable="true" name="relorganizationid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='organization'])"/>
<FIELD indexable="true" name="relorganizationname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalname)"/>
<FIELD indexable="true" name="relorganizationshortname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalshortname)"/>
<FIELD copy="true" indexable="true" name="relorganizationname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalname)"/>
<FIELD copy="true" indexable="true" name="relorganizationshortname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalshortname)"/>
<FIELD indexable="true" name="relresultid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='result'])"/>
<FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/resulttype/@classid)"/>
<FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@type)"/>
<FIELD indexable="true" name="relclass" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@class)"/>
<FIELD indexable="true" name="relfundinglevel0_id" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//rel/funding/funding_level_0"/>
<FIELD indexable="true" name="relfundinglevel0_name" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//rel/funding/funding_level_0/@name/string()"/>
@ -132,13 +134,15 @@
<FIELD indexable="true" name="relfunderjurisdiction" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']//rel/funding/funder/@jurisdiction)"/><!-- Collected-from information of the related entity. Available for result-result relationships. -->
<FIELD indexable="true" name="relcollectedfromid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@id)"/>
<FIELD indexable="true" name="relcollectedfromname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@name)"/>
<FIELD indexable="true" name="relvalidated" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./validated]/to[@type='project'])"/>
<FIELD indexable="true" name="semrelid" result="false" stat="false" tokenizable="false" value="concat(./to/text(), '||', ./to/@class/string())" xpath="//*[local-name()='entity']//rel"/><!-- COMMON FIELDS -->
<FIELD indexable="true" multivalued="false" name="dateofcollection" result="false" stat="false" type="pdate" value="//header/*[local-name()='dateOfCollection']"/>
<FIELD indexable="true" multivalued="false" name="dateofcollection" result="false" stat="false" type="date" value="//header/*[local-name()='dateOfCollection']"/>
<FIELD indexable="true" name="status" result="false" stat="false" tokenizable="false" type="string_ci" xpath="//header/*[local-name()='status']"/>
<FIELD indexable="true" name="collectedfrom" result="false" stat="false" tokenizable="false" value="distinct-values(concat(./@id, '||', ./@name))" xpath="//*[local-name()='entity']/*/*[local-name()='collectedfrom'] | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']"/>
<FIELD indexable="true" name="collectedfromdatasourceid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/*[local-name()='collectedfrom']/@id | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']/@id)"/>
<FIELD indexable="true" name="collectedfromname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/*[local-name()='collectedfrom']/@name | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']/@name)"/>
<FIELD indexable="true" name="originalid" result="false" stat="false" tokenizable="false" type="string_ci" xpath="//*[local-name()='entity']/*/*[local-name()='originalId']"/>
<FIELD indexable="true" name="pid" result="false" stat="false" tokenizable="false" type="string_ci" xpath="distinct-values(//*[local-name()='entity']/*/pid/text()|//*[local-name()='instance']/*[local-name()='alternateidentifier']/text())"/>
<FIELD indexable="true" name="pid" result="false" stat="false" tokenizable="false" type="string_ci" xpath="//*[local-name()='entity']/*/pid/text()"/>
<FIELD indexable="true" name="pidclassid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/pid/@classid)"/>
<FIELD indexable="true" name="pidclassname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/pid/@classname)"/>
<FIELD indexable="true" name="inferred" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//datainfo/inferred"/>
@ -156,20 +160,6 @@
<FIELD indexable="true" name="categoryname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/context/category/@label)"/>
<FIELD indexable="true" name="conceptid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/context/category//concept/@id)"/>
<FIELD indexable="true" name="conceptname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/context/category//concept/@label)"/><!-- new index field for country info from different xpaths for any type of entity -->
<FIELD indexable="true" name="country" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/country/@classid | //*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classid | //*[local-name()='entity']//funder/@jurisdiction)"/><!-- COUNTER FIELDS -->
<!-- Counter fields: each counter_* field is single-valued (multivalued="false"), stored, typed "pint", and read from the @value attribute of the same-named element under /record/result/header/counters.
     The counter_publicationdataset* field names are all lowercase while the elements they read are camel case (counter_publicationDataset*).
     NOTE(review): by the hunk header (-156,20 +160,6) these fourteen lines, together with the country line just above them, sit on the removed side of this change; only the country declaration that follows them is kept. -->
<FIELD header="true" indexable="true" multivalued="false" name="counter_dedup" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_dedup/@value"/>
<FIELD header="true" indexable="true" multivalued="false" name="counter_authorship" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_authorship/@value"/>
<FIELD header="true" indexable="true" multivalued="false" name="counter_participation" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_participation/@value"/>
<FIELD header="true" indexable="true" multivalued="false" name="counter_similarity" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_similarity/@value"/>
<FIELD header="true" indexable="true" multivalued="false" name="counter_publicationdataset" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_publicationDataset/@value"/>
<FIELD header="true" indexable="true" multivalued="false" name="counter_publicationdataset_claimed" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_publicationDataset_claimed/@value"/>
<FIELD header="true" indexable="true" multivalued="false" name="counter_publicationdataset_collected" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_publicationDataset_collected/@value"/>
<FIELD header="true" indexable="true" multivalued="false" name="counter_publicationdataset_inferred" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_publicationDataset_inferred/@value"/>
<FIELD header="true" indexable="true" multivalued="false" name="counter_outcome" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_outcome/@value"/>
<FIELD header="true" indexable="true" multivalued="false" name="counter_outcome_claimed" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_outcome_claimed/@value"/>
<FIELD header="true" indexable="true" multivalued="false" name="counter_outcome_collected" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_outcome_collected/@value"/>
<FIELD header="true" indexable="true" multivalued="false" name="counter_outcome_inferred" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_outcome_inferred/@value"/>
<FIELD header="true" indexable="true" multivalued="false" name="counter_affiliation" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_affiliation/@value"/>
<FIELD header="true" indexable="true" multivalued="false" name="counter_doi" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_doi/@value"/>
<FIELD indexable="true" name="country" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/country/@classid | //*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classid | //*[local-name()='entity']//funder/@jurisdiction)"/>
</FIELDS>
</LAYOUT>

View File

@ -21,7 +21,7 @@
</property>
<!-- hive_jdbc_url: JDBC connection string for the Hive server.
     Two <value> lines appear here because this is a rendered diff hunk (-21,7 +21,7) showing both sides of a one-line change: the first is the bare host:port URL, the second appends UseNativeQuery=1 plus Spark executor/driver memory and memoryOverhead settings after the '?'.
     NOTE(review): presumably the second line is the added one; the real file should contain only one <value>. Confirm against the actual file. -->
<property>
<name>hive_jdbc_url</name>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=19166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=11596411699;spark.yarn.driver.memoryOverhead=1228</value>
</property>
<property>
<name>oozie.wf.workflow.notification.url</name>

View File

@ -42,7 +42,9 @@ SELECT p.id,
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs,
p.callidentifier,
p.code,
p.totalcost
p.totalcost,
p.fundedamount,
p.currency
FROM ${stats_db_name}.project_tmp p
LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
FROM ${stats_db_name}.project_results pr

View File

@ -59,7 +59,7 @@ UNION ALL
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
create table ${stats_db_name}.result_orcid STORED AS PARQUET as
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as
select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid
from (
SELECT substr(res.id, 4) as id, auth_pid.value as orcid
@ -69,7 +69,7 @@ from (
LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type
WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res;
create table ${stats_db_name}.result_result stored as parquet as
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as
select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
from ${openaire_db_name}.relation rel
join ${openaire_db_name}.result r1 on rel.source=r1.id
@ -82,7 +82,7 @@ where reltype='resultResult'
and r2.resulttype.classname != 'other'
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;
create table ${stats_db_name}.result_citations_oc stored as parquet as
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as
select substr(target, 4) as id, count(distinct substr(source, 4)) as citations
from ${openaire_db_name}.relation rel
join ${openaire_db_name}.result r1 on rel.source=r1.id
@ -97,7 +97,7 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
group by substr(target, 4);
create table ${stats_db_name}.result_references_oc stored as parquet as
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as
select substr(source, 4) as id, count(distinct substr(target, 4)) as references
from ${openaire_db_name}.relation rel
join ${openaire_db_name}.result r1 on rel.source=r1.id

View File

@ -48,7 +48,9 @@ create table TARGET.result stored as parquet as
'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
'openorgs____::e15adb13c4dadd49de4d35c39b5da93a' -- Nanyang Technological University
'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb' -- McMaster University
) )) foo;
compute stats TARGET.result;

View File

@ -48,7 +48,9 @@ CREATE TABLE ${stats_db_name}.project_tmp
delayedpubs INT,
callidentifier STRING,
code STRING,
totalcost FLOAT
totalcost FLOAT,
fundedamount FLOAT,
currency STRING
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');
INSERT INTO ${stats_db_name}.project_tmp
@ -72,7 +74,9 @@ SELECT substr(p.id, 4) AS id,
0 AS delayedpubs,
p.callidentifier.value AS callidentifier,
p.code.value AS code,
p.totalcost AS totalcost
p.totalcost AS totalcost,
p.fundedamount AS fundedamount,
p.currency.value AS currency
FROM ${openaire_db_name}.project p
WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;