merging with branch beta
This commit is contained in:
commit
e1317edd23
|
@ -0,0 +1,81 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.action;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.common.DbClient;
|
||||
import eu.dnetlib.dhp.common.action.model.MasterDuplicate;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
|
||||
public class ReadDatasourceMasterDuplicateFromDB {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(ReadDatasourceMasterDuplicateFromDB.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static final String QUERY = "SELECT distinct dd.id as masterId, d.officialname as masterName, dd.duplicate as duplicateId "
|
||||
+
|
||||
"FROM dsm_dedup_services dd join dsm_services d on (dd.id = d.id);";
|
||||
|
||||
public static int execute(String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode)
|
||||
throws IOException {
|
||||
int count = 0;
|
||||
try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) {
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
FSDataOutputStream fos = fileSystem.create(new Path(hdfsPath));
|
||||
|
||||
log.info("running query: {}", QUERY);
|
||||
log.info("storing results in: {}", hdfsPath);
|
||||
|
||||
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) {
|
||||
dbClient.processResults(QUERY, rs -> writeMap(datasourceMasterMap(rs), writer));
|
||||
count++;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
private static MasterDuplicate datasourceMasterMap(ResultSet rs) {
|
||||
try {
|
||||
final MasterDuplicate md = new MasterDuplicate();
|
||||
|
||||
final String duplicateId = rs.getString("duplicateId");
|
||||
final String masterId = rs.getString("masterId");
|
||||
final String masterName = rs.getString("masterName");
|
||||
|
||||
md.setDuplicateId(OafMapperUtils.createOpenaireId(10, duplicateId, true));
|
||||
md.setMasterId(OafMapperUtils.createOpenaireId(10, masterId, true));
|
||||
md.setMasterName(masterName);
|
||||
|
||||
return md;
|
||||
} catch (final SQLException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static void writeMap(final MasterDuplicate dm, final BufferedWriter writer) {
|
||||
try {
|
||||
writer.write(OBJECT_MAPPER.writeValueAsString(dm));
|
||||
writer.newLine();
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
|
||||
package eu.dnetlib.dhp.common.action.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean pairing the identifier of a duplicate with the identifier and the name of its master.
 *
 * @author miriam.baglioni
 * @Date 21/07/22
 */
public class MasterDuplicate implements Serializable {

	/** Identifier of the duplicate. */
	private String duplicateId;

	/** Identifier of the master the duplicate is associated with. */
	private String masterId;

	/** Name of the master. */
	private String masterName;

	/**
	 * @return the identifier of the duplicate, or {@code null} when not set
	 */
	public String getDuplicateId() {
		return this.duplicateId;
	}

	/**
	 * @param duplicateId the identifier of the duplicate
	 */
	public void setDuplicateId(final String duplicateId) {
		this.duplicateId = duplicateId;
	}

	/**
	 * @return the identifier of the master, or {@code null} when not set
	 */
	public String getMasterId() {
		return this.masterId;
	}

	/**
	 * @param masterId the identifier of the master
	 */
	public void setMasterId(final String masterId) {
		this.masterId = masterId;
	}

	/**
	 * @return the name of the master, or {@code null} when not set
	 */
	public String getMasterName() {
		return this.masterName;
	}

	/**
	 * @param masterName the name of the master
	 */
	public void setMasterName(final String masterName) {
		this.masterName = masterName;
	}

}
|
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.common.vocabulary;
|
|||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
@ -66,27 +67,39 @@ public class Vocabulary implements Serializable {
|
|||
}
|
||||
|
||||
public Qualifier getTermAsQualifier(final String termId) {
|
||||
if (StringUtils.isBlank(termId)) {
|
||||
return getTermAsQualifier(termId, false);
|
||||
}
|
||||
|
||||
public Qualifier getTermAsQualifier(final String termId, boolean strict) {
|
||||
final VocabularyTerm term = getTerm(termId);
|
||||
if (Objects.nonNull(term)) {
|
||||
return OafMapperUtils.qualifier(term.getId(), term.getName(), getId(), getName());
|
||||
} else if (Objects.isNull(term) && strict) {
|
||||
return OafMapperUtils.unknown(getId(), getName());
|
||||
} else if (termExists(termId)) {
|
||||
final VocabularyTerm t = getTerm(termId);
|
||||
return OafMapperUtils.qualifier(t.getId(), t.getName(), getId(), getName());
|
||||
} else {
|
||||
return OafMapperUtils.qualifier(termId, termId, getId(), getName());
|
||||
}
|
||||
}
|
||||
|
||||
public Qualifier getSynonymAsQualifier(final String syn) {
|
||||
return getSynonymAsQualifier(syn, false);
|
||||
}
|
||||
|
||||
public Qualifier getSynonymAsQualifier(final String syn, boolean strict) {
|
||||
return Optional
|
||||
.ofNullable(getTermBySynonym(syn))
|
||||
.map(term -> getTermAsQualifier(term.getId()))
|
||||
.map(term -> getTermAsQualifier(term.getId(), strict))
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
public Qualifier lookup(String id) {
|
||||
return lookup(id, false);
|
||||
}
|
||||
|
||||
public Qualifier lookup(String id, boolean strict) {
|
||||
return Optional
|
||||
.ofNullable(getSynonymAsQualifier(id))
|
||||
.orElse(getTermAsQualifier(id));
|
||||
.ofNullable(getSynonymAsQualifier(id, strict))
|
||||
.orElse(getTermAsQualifier(id, strict));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -23,6 +23,8 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import me.xuender.unidecode.Unidecode;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
|
||||
public class GraphCleaningFunctions extends CleaningFunctions {
|
||||
|
||||
|
@ -201,6 +203,13 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||
.map(s -> {
|
||||
if ("dnet:result_subject".equals(s.getQualifier().getClassid())) {
|
||||
s.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_TYPOLOGIES);
|
||||
s.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_TYPOLOGIES);
|
||||
}
|
||||
return s;
|
||||
})
|
||||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.collect(
|
||||
Collectors
|
||||
|
@ -211,7 +220,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.orElse(s.getValue()),
|
||||
Function.identity(),
|
||||
(s1, s2) -> Collections
|
||||
.min(Lists.newArrayList(s1, s1), new SubjectProvenanceComparator())))
|
||||
.min(Lists.newArrayList(s1, s2), new SubjectProvenanceComparator())))
|
||||
.values());
|
||||
r.setSubject(subjects);
|
||||
}
|
||||
|
@ -333,7 +342,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
|
||||
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
|
||||
}
|
||||
if (Objects.isNull(i.getRefereed())) {
|
||||
if (Objects.isNull(i.getRefereed()) || StringUtils.isBlank(i.getRefereed().getClassid())) {
|
||||
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
|
||||
}
|
||||
if (Objects.nonNull(i.getDateofacceptance())) {
|
||||
|
|
|
@ -1,100 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.merge;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import scala.Tuple2;
|
||||
|
||||
class AuthorMergerTest {
|
||||
|
||||
private String publicationsBasePath;
|
||||
|
||||
private List<List<Author>> authors;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws Exception {
|
||||
|
||||
publicationsBasePath = Paths
|
||||
.get(AuthorMergerTest.class.getResource("/eu/dnetlib/dhp/oa/merge").toURI())
|
||||
.toFile()
|
||||
.getAbsolutePath();
|
||||
|
||||
authors = readSample(publicationsBasePath + "/publications_with_authors.json", Publication.class)
|
||||
.stream()
|
||||
.map(p -> p._2().getAuthor())
|
||||
.collect(Collectors.toList());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void mergeTest() { // used in the dedup: threshold set to 0.95
|
||||
|
||||
for (List<Author> authors1 : authors) {
|
||||
System.out.println("List " + (authors.indexOf(authors1) + 1));
|
||||
for (Author author : authors1) {
|
||||
System.out.println(authorToString(author));
|
||||
}
|
||||
}
|
||||
|
||||
List<Author> merge = AuthorMerger.merge(authors);
|
||||
|
||||
System.out.println("Merge ");
|
||||
for (Author author : merge) {
|
||||
System.out.println(authorToString(author));
|
||||
}
|
||||
|
||||
Assertions.assertEquals(7, merge.size());
|
||||
|
||||
}
|
||||
|
||||
public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
|
||||
List<Tuple2<String, T>> res = new ArrayList<>();
|
||||
BufferedReader reader;
|
||||
try {
|
||||
reader = new BufferedReader(new FileReader(path));
|
||||
String line = reader.readLine();
|
||||
while (line != null) {
|
||||
res
|
||||
.add(
|
||||
new Tuple2<>(
|
||||
MapDocumentUtil.getJPathString("$.id", line),
|
||||
new ObjectMapper().readValue(line, clazz)));
|
||||
// read next line
|
||||
line = reader.readLine();
|
||||
}
|
||||
reader.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
public String authorToString(Author a) {
|
||||
|
||||
String print = "Fullname = ";
|
||||
print += a.getFullname() + " pid = [";
|
||||
if (a.getPid() != null)
|
||||
for (StructuredProperty sp : a.getPid()) {
|
||||
print += sp.toComparableString() + " ";
|
||||
}
|
||||
print += "]";
|
||||
return print;
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
|
@ -49,7 +49,7 @@ object DataciteToOAFTransformation {
|
|||
/** This method should skip record if json contains invalid text
|
||||
* defined in file datacite_filter
|
||||
*
|
||||
* @param record : unparsed datacite record
|
||||
* @param record : not parsed Datacite record
|
||||
* @param json : parsed record
|
||||
* @return True if the record should be skipped
|
||||
*/
|
||||
|
@ -98,6 +98,10 @@ object DataciteToOAFTransformation {
|
|||
|
||||
}
|
||||
|
||||
/** This utility method indicates whether the embargo date has been reached
|
||||
* @param embargo_end_date
|
||||
* @return True if the embargo date has been reached, false otherwise
|
||||
*/
|
||||
def embargo_end(embargo_end_date: String): Boolean = {
|
||||
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
|
||||
val td = LocalDate.now()
|
||||
|
@ -142,6 +146,21 @@ object DataciteToOAFTransformation {
|
|||
}
|
||||
}
|
||||
|
||||
/** *
|
||||
* Use the vocabulary dnet:publication_resource to find a synonym to one of these terms and get the instance.type.
|
||||
* Using the dnet:result_typologies vocabulary, we look up the instance.type synonym
|
||||
* to generate one of the following main entities:
|
||||
* - publication
|
||||
* - dataset
|
||||
* - software
|
||||
* - otherresearchproduct
|
||||
*
|
||||
* @param resourceType
|
||||
* @param resourceTypeGeneral
|
||||
* @param schemaOrg
|
||||
* @param vocabularies
|
||||
* @return
|
||||
*/
|
||||
def getTypeQualifier(
|
||||
resourceType: String,
|
||||
resourceTypeGeneral: String,
|
||||
|
@ -330,6 +349,7 @@ object DataciteToOAFTransformation {
|
|||
if (result == null)
|
||||
return List()
|
||||
|
||||
// DOI is mapped on a PID inside an Instance object
|
||||
val doi_q = OafMapperUtils.qualifier(
|
||||
"doi",
|
||||
"doi",
|
||||
|
@ -338,6 +358,8 @@ object DataciteToOAFTransformation {
|
|||
)
|
||||
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
|
||||
result.setPid(List(pid).asJava)
|
||||
|
||||
// This identifier will be replaced at a later stage by the PID-based identifier generation logic
|
||||
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
|
||||
result.setOriginalId(List(doi).asJava)
|
||||
|
||||
|
@ -386,6 +408,10 @@ object DataciteToOAFTransformation {
|
|||
a
|
||||
}
|
||||
|
||||
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
|
||||
return List()
|
||||
result.setAuthor(authors.asJava)
|
||||
|
||||
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
|
||||
|
||||
result.setTitle(
|
||||
|
@ -409,10 +435,6 @@ object DataciteToOAFTransformation {
|
|||
.asJava
|
||||
)
|
||||
|
||||
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
|
||||
return List()
|
||||
result.setAuthor(authors.asJava)
|
||||
|
||||
val dates = (json \\ "dates").extract[List[DateType]]
|
||||
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
|
||||
|
||||
|
|
|
@ -27,7 +27,7 @@ object SparkCreateBaselineDataFrame {
|
|||
def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = {
|
||||
val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
|
||||
|
||||
val result = data.lines
|
||||
val result = data.linesWithSeparators.map(l =>l.stripLineEnd)
|
||||
.filter(l => l.startsWith("<a href="))
|
||||
.map { l =>
|
||||
val end = l.lastIndexOf("\">")
|
||||
|
|
|
@ -130,7 +130,7 @@
|
|||
</xsl:if>
|
||||
|
||||
<oaf:hostedBy name="{$varOfficialName}" id="{$varDataSourceId}" />
|
||||
<oaf:collectedFrom name="{$varOfficialName}" id="{$varDataSourceId}ß" />
|
||||
<oaf:collectedFrom name="{$varOfficialName}" id="{$varDataSourceId}" />
|
||||
|
||||
<xsl:variable name="varKnownFileEndings" select="('.bmp', '.doc', '.docx', '.epub', '.flv', '.jpeg', '.jpg', '.m4v', '.mp4', '.mpg', '.odp', '.pdf', '.png', '.ppt', '.tiv', '.txt', '.xls', '.xlsx', '.zip')" />
|
||||
<xsl:variable name="varIdDoi" select="distinct-values((//dc:identifier[starts-with(., '10.')][matches(., '(10[.][0-9]{4,}[^\s/>]*/[^\s>]+)')], //dc:identifier[starts-with(., 'http') and (contains(., '://dx.doi.org/10.') or contains(., '://doi.org/10.'))]/substring-after(., 'doi.org/'), //dc:identifier[starts-with(lower-case(.), 'doi:10.')]/substring-after(lower-case(.), 'doi:')))" />
|
||||
|
|
|
@ -63,7 +63,7 @@ class BioScholixTest extends AbstractVocabularyTest {
|
|||
val records: String = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump"))
|
||||
.mkString
|
||||
val r: List[Oaf] = records.lines.toList
|
||||
val r: List[Oaf] = records.linesWithSeparators.map(l =>l.stripLineEnd).toList
|
||||
.map(s => mapper.readValue(s, classOf[PMArticle]))
|
||||
.map(a => PubMedToOaf.convert(a, vocabularies))
|
||||
assertEquals(10, r.size)
|
||||
|
@ -173,9 +173,9 @@ class BioScholixTest extends AbstractVocabularyTest {
|
|||
val records: String = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump"))
|
||||
.mkString
|
||||
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
||||
records.linesWithSeparators.map(l =>l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty))
|
||||
|
||||
val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.pdbTOOaf(o))
|
||||
val result: List[Oaf] = records.linesWithSeparators.map(l =>l.stripLineEnd).toList.flatMap(o => BioDBToOAF.pdbTOOaf(o))
|
||||
|
||||
assertTrue(result.nonEmpty)
|
||||
result.foreach(r => assertNotNull(r))
|
||||
|
@ -194,9 +194,9 @@ class BioScholixTest extends AbstractVocabularyTest {
|
|||
val records: String = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump"))
|
||||
.mkString
|
||||
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
||||
records.linesWithSeparators.map(l =>l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty))
|
||||
|
||||
val result: List[Oaf] = records.lines.toList.flatMap(o => BioDBToOAF.uniprotToOAF(o))
|
||||
val result: List[Oaf] = records.linesWithSeparators.map(l =>l.stripLineEnd).toList.flatMap(o => BioDBToOAF.uniprotToOAF(o))
|
||||
|
||||
assertTrue(result.nonEmpty)
|
||||
result.foreach(r => assertNotNull(r))
|
||||
|
@ -239,9 +239,9 @@ class BioScholixTest extends AbstractVocabularyTest {
|
|||
val records: String = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links"))
|
||||
.mkString
|
||||
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
||||
records.linesWithSeparators.map(l =>l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty))
|
||||
|
||||
val result: List[Oaf] = records.lines.map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList
|
||||
val result: List[Oaf] = records.linesWithSeparators.map(l =>l.stripLineEnd).map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList
|
||||
|
||||
assertNotNull(result)
|
||||
assertTrue(result.nonEmpty)
|
||||
|
@ -276,11 +276,11 @@ class BioScholixTest extends AbstractVocabularyTest {
|
|||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved")
|
||||
)
|
||||
.mkString
|
||||
records.lines.foreach(s => assertTrue(s.nonEmpty))
|
||||
records.linesWithSeparators.map(l =>l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty))
|
||||
|
||||
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
|
||||
|
||||
val l: List[ScholixResolved] = records.lines.map { input =>
|
||||
val l: List[ScholixResolved] = records.linesWithSeparators.map(l =>l.stripLineEnd).map { input =>
|
||||
lazy val json = parse(input)
|
||||
json.extract[ScholixResolved]
|
||||
}.toList
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.beanutils.BeanUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
|
@ -15,6 +17,7 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
|
|||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
||||
import eu.dnetlib.dhp.oa.merge.AuthorMerger;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
@ -74,33 +77,39 @@ public class DedupRecordFactory {
|
|||
|
||||
public static <T extends OafEntity> T entityMerger(
|
||||
String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo, Class<T> clazz)
|
||||
throws IllegalAccessException, InstantiationException {
|
||||
throws IllegalAccessException, InstantiationException, InvocationTargetException {
|
||||
|
||||
T entity = clazz.newInstance();
|
||||
entity.setDataInfo(dataInfo);
|
||||
final Comparator<Identifier<T>> idComparator = new IdentifierComparator<>();
|
||||
|
||||
final LinkedList<T> entityList = Lists
|
||||
.newArrayList(entities)
|
||||
.stream()
|
||||
.map(t -> Identifier.newInstance(t._2()))
|
||||
.sorted(idComparator)
|
||||
.map(Identifier::getEntity)
|
||||
.collect(Collectors.toCollection(LinkedList::new));
|
||||
|
||||
final T entity = clazz.newInstance();
|
||||
final T first = entityList.removeFirst();
|
||||
|
||||
BeanUtils.copyProperties(entity, first);
|
||||
|
||||
final Collection<String> dates = Lists.newArrayList();
|
||||
final List<List<Author>> authors = Lists.newArrayList();
|
||||
|
||||
entities
|
||||
.forEachRemaining(
|
||||
t -> {
|
||||
T duplicate = t._2();
|
||||
|
||||
entityList
|
||||
.forEach(
|
||||
duplicate -> {
|
||||
entity.mergeFrom(duplicate);
|
||||
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
||||
Result r1 = (Result) duplicate;
|
||||
if (r1.getAuthor() != null && !r1.getAuthor().isEmpty())
|
||||
authors.add(r1.getAuthor());
|
||||
if (r1.getDateofacceptance() != null)
|
||||
dates.add(r1.getDateofacceptance().getValue());
|
||||
Optional
|
||||
.ofNullable(r1.getAuthor())
|
||||
.ifPresent(a -> authors.add(a));
|
||||
}
|
||||
|
||||
});
|
||||
|
||||
// set authors and date
|
||||
if (ModelSupport.isSubClass(entity, Result.class)) {
|
||||
((Result) entity).setDateofacceptance(DatePicker.pick(dates));
|
||||
((Result) entity).setAuthor(AuthorMerger.merge(authors));
|
||||
}
|
||||
|
||||
|
|
|
@ -18,6 +18,10 @@ public class IdGenerator implements Serializable {
|
|||
if (pids == null || pids.isEmpty())
|
||||
return defaultID;
|
||||
|
||||
return generateId(pids);
|
||||
}
|
||||
|
||||
private static <T extends OafEntity> String generateId(List<Identifier<T>> pids) {
|
||||
Identifier<T> bp = pids
|
||||
.stream()
|
||||
.min(Identifier::compareTo)
|
||||
|
|
|
@ -0,0 +1,81 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||
|
||||
public class IdentifierComparator<T extends OafEntity> implements Comparator<Identifier<T>> {
|
||||
|
||||
public static int compareIdentifiers(Identifier left, Identifier right) {
|
||||
return new IdentifierComparator<>().compare(left, right);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compare(Identifier<T> left, Identifier<T> i) {
|
||||
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
|
||||
// alphabetical order of the originalID
|
||||
|
||||
Set<String> lKeys = Optional
|
||||
.ofNullable(left.getCollectedFrom())
|
||||
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
|
||||
.orElse(Sets.newHashSet());
|
||||
|
||||
final Optional<List<KeyValue>> cf = Optional.ofNullable(i.getCollectedFrom());
|
||||
Set<String> rKeys = cf
|
||||
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
|
||||
.orElse(Sets.newHashSet());
|
||||
|
||||
if (left.getPidType().compareTo(i.getPidType()) == 0) { // same type
|
||||
if (left.getEntityType() == EntityType.publication) {
|
||||
if (isFromDatasourceID(lKeys, ModelConstants.CROSSREF_ID)
|
||||
&& !isFromDatasourceID(rKeys, ModelConstants.CROSSREF_ID))
|
||||
return -1;
|
||||
if (isFromDatasourceID(rKeys, ModelConstants.CROSSREF_ID)
|
||||
&& !isFromDatasourceID(lKeys, ModelConstants.CROSSREF_ID))
|
||||
return 1;
|
||||
}
|
||||
if (left.getEntityType() == EntityType.dataset) {
|
||||
if (isFromDatasourceID(lKeys, ModelConstants.DATACITE_ID)
|
||||
&& !isFromDatasourceID(rKeys, ModelConstants.DATACITE_ID))
|
||||
return -1;
|
||||
if (isFromDatasourceID(rKeys, ModelConstants.DATACITE_ID)
|
||||
&& !isFromDatasourceID(lKeys, ModelConstants.DATACITE_ID))
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (left.getDate().compareTo(i.getDate()) == 0) {// same date
|
||||
// we need to take the alphabetically lower id
|
||||
return left.getOriginalID().compareTo(i.getOriginalID());
|
||||
} else
|
||||
// we need to take the elder date
|
||||
return left.getDate().compareTo(i.getDate());
|
||||
} else {
|
||||
return new PidComparator<>(left.getEntity()).compare(toSP(left.getPidType()), toSP(i.getPidType()));
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
|
||||
return collectedFrom.contains(dsId);
|
||||
}
|
||||
|
||||
private StructuredProperty toSP(PidType pidType) {
|
||||
return OafMapperUtils.structuredProperty("", pidType.toString(), pidType.toString(), "", "", new DataInfo());
|
||||
}
|
||||
|
||||
}
|
|
@ -11,6 +11,7 @@ import org.apache.commons.lang3.StringUtils;
|
|||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.dhp.oa.dedup.DatePicker;
|
||||
import eu.dnetlib.dhp.oa.dedup.IdentifierComparator;
|
||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
|
@ -83,60 +84,12 @@ public class Identifier<T extends OafEntity> implements Serializable, Comparable
|
|||
return entity.getId();
|
||||
}
|
||||
|
||||
private PidType getPidType() {
|
||||
public PidType getPidType() {
|
||||
return PidType.tryValueOf(StringUtils.substringBefore(StringUtils.substringAfter(entity.getId(), "|"), "_"));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(Identifier<T> i) {
|
||||
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
|
||||
// alphabetical order of the originalID
|
||||
|
||||
Set<String> lKeys = Optional
|
||||
.ofNullable(getCollectedFrom())
|
||||
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
|
||||
.orElse(Sets.newHashSet());
|
||||
|
||||
final Optional<List<KeyValue>> cf = Optional.ofNullable(i.getCollectedFrom());
|
||||
Set<String> rKeys = cf
|
||||
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
|
||||
.orElse(Sets.newHashSet());
|
||||
|
||||
if (this.getPidType().compareTo(i.getPidType()) == 0) { // same type
|
||||
if (getEntityType() == EntityType.publication) {
|
||||
if (isFromDatasourceID(lKeys, ModelConstants.CROSSREF_ID)
|
||||
&& !isFromDatasourceID(rKeys, ModelConstants.CROSSREF_ID))
|
||||
return -1;
|
||||
if (isFromDatasourceID(rKeys, ModelConstants.CROSSREF_ID)
|
||||
&& !isFromDatasourceID(lKeys, ModelConstants.CROSSREF_ID))
|
||||
return 1;
|
||||
}
|
||||
if (getEntityType() == EntityType.dataset) {
|
||||
if (isFromDatasourceID(lKeys, ModelConstants.DATACITE_ID)
|
||||
&& !isFromDatasourceID(rKeys, ModelConstants.DATACITE_ID))
|
||||
return -1;
|
||||
if (isFromDatasourceID(rKeys, ModelConstants.DATACITE_ID)
|
||||
&& !isFromDatasourceID(lKeys, ModelConstants.DATACITE_ID))
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
|
||||
// we need to take the alphabetically lower id
|
||||
return this.getOriginalID().compareTo(i.getOriginalID());
|
||||
} else
|
||||
// we need to take the elder date
|
||||
return this.getDate().compareTo(i.getDate());
|
||||
} else {
|
||||
return new PidComparator<>(getEntity()).compare(toSP(getPidType()), toSP(i.getPidType()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private StructuredProperty toSP(PidType pidType) {
|
||||
return OafMapperUtils.structuredProperty("", pidType.toString(), pidType.toString(), "", "", new DataInfo());
|
||||
}
|
||||
|
||||
public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
|
||||
return collectedFrom.contains(dsId);
|
||||
return IdentifierComparator.compareIdentifiers(this, i);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -7,6 +7,7 @@ import java.io.BufferedReader;
|
|||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
@ -54,7 +55,7 @@ class EntityMergerTest implements Serializable {
|
|||
}
|
||||
|
||||
@Test
|
||||
void softwareMergerTest() throws InstantiationException, IllegalAccessException {
|
||||
void softwareMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
|
||||
|
||||
List<Tuple2<String, Software>> softwares = readSample(
|
||||
testEntityBasePath + "/software_merge.json", Software.class);
|
||||
|
@ -69,7 +70,7 @@ class EntityMergerTest implements Serializable {
|
|||
}
|
||||
|
||||
@Test
|
||||
void publicationMergerTest() throws InstantiationException, IllegalAccessException {
|
||||
void publicationMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException {
|
||||
|
||||
Publication pub_merged = DedupRecordFactory
|
||||
.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
|
||||
|
@ -134,7 +135,7 @@ class EntityMergerTest implements Serializable {
|
|||
}
|
||||
|
||||
@Test
|
||||
void publicationMergerTest2() throws InstantiationException, IllegalAccessException {
|
||||
void publicationMergerTest2() throws InstantiationException, IllegalAccessException, InvocationTargetException {
|
||||
|
||||
Publication pub_merged = DedupRecordFactory
|
||||
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
|
||||
|
@ -146,7 +147,7 @@ class EntityMergerTest implements Serializable {
|
|||
}
|
||||
|
||||
@Test
|
||||
void publicationMergerTest3() throws InstantiationException, IllegalAccessException {
|
||||
void publicationMergerTest3() throws InstantiationException, IllegalAccessException, InvocationTargetException {
|
||||
|
||||
Publication pub_merged = DedupRecordFactory
|
||||
.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
|
||||
|
@ -156,7 +157,8 @@ class EntityMergerTest implements Serializable {
|
|||
}
|
||||
|
||||
@Test
|
||||
void publicationMergerTest4() throws InstantiationException, IllegalStateException, IllegalAccessException {
|
||||
void publicationMergerTest4()
|
||||
throws InstantiationException, IllegalStateException, IllegalAccessException, InvocationTargetException {
|
||||
|
||||
Publication pub_merged = DedupRecordFactory
|
||||
.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
|
||||
|
@ -166,7 +168,8 @@ class EntityMergerTest implements Serializable {
|
|||
}
|
||||
|
||||
@Test
|
||||
void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {
|
||||
void publicationMergerTest5()
|
||||
throws InstantiationException, IllegalStateException, IllegalAccessException, InvocationTargetException {
|
||||
|
||||
System.out
|
||||
.println(
|
||||
|
|
|
@ -4,8 +4,7 @@ package eu.dnetlib.dhp.oa.dedup;
|
|||
import static java.nio.file.Files.createTempDirectory;
|
||||
|
||||
import static org.apache.spark.sql.functions.count;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
|
||||
import java.io.File;
|
||||
|
@ -14,7 +13,11 @@ import java.io.IOException;
|
|||
import java.io.Serializable;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
@ -35,10 +38,13 @@ import org.mockito.Mock;
|
|||
import org.mockito.Mockito;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
|
@ -105,57 +111,27 @@ public class SparkDedupTest implements Serializable {
|
|||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId)))
|
||||
.thenReturn(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml")));
|
||||
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml"));
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization")))
|
||||
.thenReturn(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"));
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication")))
|
||||
.thenReturn(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")));
|
||||
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"));
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software")))
|
||||
.thenReturn(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json")));
|
||||
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json"));
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset")))
|
||||
.thenReturn(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json")));
|
||||
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json"));
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct")))
|
||||
.thenReturn(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json")));
|
||||
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json"));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -163,11 +139,7 @@ public class SparkDedupTest implements Serializable {
|
|||
void createSimRelsTest() throws Exception {
|
||||
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkCreateSimRels.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json")));
|
||||
classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"));
|
||||
|
||||
parser
|
||||
.parseArgument(
|
||||
|
@ -207,7 +179,7 @@ public class SparkDedupTest implements Serializable {
|
|||
.count();
|
||||
|
||||
assertEquals(3076, orgs_simrel);
|
||||
assertEquals(7040, pubs_simrel);
|
||||
assertEquals(7046, pubs_simrel);
|
||||
assertEquals(336, sw_simrel);
|
||||
assertEquals(442, ds_simrel);
|
||||
assertEquals(6784, orp_simrel);
|
||||
|
@ -223,11 +195,7 @@ public class SparkDedupTest implements Serializable {
|
|||
void whitelistSimRelsTest() throws Exception {
|
||||
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkWhitelistSimRels.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/dedup/whitelistSimRels_parameters.json")));
|
||||
classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/whitelistSimRels_parameters.json"));
|
||||
|
||||
parser
|
||||
.parseArgument(
|
||||
|
@ -264,7 +232,7 @@ public class SparkDedupTest implements Serializable {
|
|||
|
||||
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
|
||||
assertEquals(3076, orgs_simrel);
|
||||
assertEquals(7040, pubs_simrel);
|
||||
assertEquals(7046, pubs_simrel);
|
||||
assertEquals(442, ds_simrel);
|
||||
assertEquals(6784, orp_simrel);
|
||||
// System.out.println("orgs_simrel = " + orgs_simrel);
|
||||
|
@ -306,11 +274,7 @@ public class SparkDedupTest implements Serializable {
|
|||
void cutMergeRelsTest() throws Exception {
|
||||
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkCreateMergeRels.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
|
||||
classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"));
|
||||
|
||||
parser
|
||||
.parseArgument(
|
||||
|
@ -402,11 +366,7 @@ public class SparkDedupTest implements Serializable {
|
|||
void createMergeRelsTest() throws Exception {
|
||||
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkCreateMergeRels.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
|
||||
classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"));
|
||||
|
||||
parser
|
||||
.parseArgument(
|
||||
|
@ -427,10 +387,10 @@ public class SparkDedupTest implements Serializable {
|
|||
.read()
|
||||
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
|
||||
.count();
|
||||
long pubs_mergerel = spark
|
||||
final Dataset<Relation> pubs = spark
|
||||
.read()
|
||||
.load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel")
|
||||
.count();
|
||||
.as(Encoders.bean(Relation.class));
|
||||
long sw_mergerel = spark
|
||||
.read()
|
||||
.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
|
||||
|
@ -445,8 +405,35 @@ public class SparkDedupTest implements Serializable {
|
|||
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
|
||||
.count();
|
||||
|
||||
final List<Relation> merges = pubs
|
||||
.filter("source == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
|
||||
.collectAsList();
|
||||
assertEquals(3, merges.size());
|
||||
Set<String> dups = Sets
|
||||
.newHashSet(
|
||||
"50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
|
||||
"50|doi_________::d5021b53204e4fdeab6ff5d5bc468032",
|
||||
"50|arXiv_______::c93aeb433eb90ed7a86e29be00791b7c");
|
||||
merges.forEach(r -> {
|
||||
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
||||
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
||||
assertEquals(ModelConstants.MERGES, r.getRelClass());
|
||||
assertTrue(dups.contains(r.getTarget()));
|
||||
});
|
||||
|
||||
final List<Relation> mergedIn = pubs
|
||||
.filter("target == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
|
||||
.collectAsList();
|
||||
assertEquals(3, mergedIn.size());
|
||||
mergedIn.forEach(r -> {
|
||||
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
||||
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
||||
assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
|
||||
assertTrue(dups.contains(r.getSource()));
|
||||
});
|
||||
|
||||
assertEquals(1268, orgs_mergerel);
|
||||
assertEquals(1444, pubs_mergerel);
|
||||
assertEquals(1450, pubs.count());
|
||||
assertEquals(286, sw_mergerel);
|
||||
assertEquals(472, ds_mergerel);
|
||||
assertEquals(738, orp_mergerel);
|
||||
|
@ -463,11 +450,7 @@ public class SparkDedupTest implements Serializable {
|
|||
void createDedupRecordTest() throws Exception {
|
||||
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkCreateDedupRecord.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json")));
|
||||
classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"));
|
||||
parser
|
||||
.parseArgument(
|
||||
new String[] {
|
||||
|
@ -483,12 +466,18 @@ public class SparkDedupTest implements Serializable {
|
|||
|
||||
new SparkCreateDedupRecord(parser, spark).run(isLookUpService);
|
||||
|
||||
final ObjectMapper mapper = new ObjectMapper()
|
||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
final Dataset<Publication> pubs = spark
|
||||
.read()
|
||||
.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord")
|
||||
.map(
|
||||
(MapFunction<String, Publication>) value -> mapper.readValue(value, Publication.class),
|
||||
Encoders.bean(Publication.class));
|
||||
long orgs_deduprecord = jsc
|
||||
.textFile(testOutputBasePath + "/" + testActionSetId + "/organization_deduprecord")
|
||||
.count();
|
||||
long pubs_deduprecord = jsc
|
||||
.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord")
|
||||
.count();
|
||||
long sw_deduprecord = jsc
|
||||
.textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord")
|
||||
.count();
|
||||
|
@ -499,11 +488,13 @@ public class SparkDedupTest implements Serializable {
|
|||
.count();
|
||||
|
||||
assertEquals(86, orgs_deduprecord);
|
||||
assertEquals(67, pubs_deduprecord);
|
||||
assertEquals(68, pubs.count());
|
||||
assertEquals(49, sw_deduprecord);
|
||||
assertEquals(97, ds_deduprecord);
|
||||
assertEquals(92, orp_deduprecord);
|
||||
|
||||
verifyRoot_1(mapper, pubs);
|
||||
|
||||
// System.out.println("orgs_deduprecord = " + orgs_deduprecord);
|
||||
// System.out.println("pubs_deduprecord = " + pubs_deduprecord);
|
||||
// System.out.println("sw_deduprecord = " + sw_deduprecord);
|
||||
|
@ -511,16 +502,63 @@ public class SparkDedupTest implements Serializable {
|
|||
// System.out.println("orp_deduprecord = " + orp_deduprecord);
|
||||
}
|
||||
|
||||
private static void verifyRoot_1(ObjectMapper mapper, Dataset<Publication> pubs) {
|
||||
Publication root = pubs
|
||||
.filter("id = '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
|
||||
.first();
|
||||
assertNotNull(root);
|
||||
|
||||
final Dataset<String> publication = spark
|
||||
.read()
|
||||
.textFile(DedupUtility.createEntityPath(testGraphBasePath, "publication"));
|
||||
|
||||
Publication crossref_duplicate = publication
|
||||
.map(
|
||||
(MapFunction<String, Publication>) value -> mapper.readValue(value, Publication.class),
|
||||
Encoders.bean(Publication.class))
|
||||
.filter("id = '50|doi_________::d5021b53204e4fdeab6ff5d5bc468032'")
|
||||
.collectAsList()
|
||||
.get(0);
|
||||
|
||||
assertEquals(crossref_duplicate.getJournal().getName(), root.getJournal().getName());
|
||||
assertEquals(crossref_duplicate.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted());
|
||||
assertEquals(crossref_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
|
||||
|
||||
Set<String> rootPids = root
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(StructuredProperty::getValue)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
Set<String> dupPids = crossref_duplicate
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(StructuredProperty::getValue)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
|
||||
assertFalse(Sets.intersection(rootPids, dupPids).isEmpty());
|
||||
assertTrue(rootPids.contains("10.1109/jstqe.2022.3205716"));
|
||||
|
||||
Optional<Instance> instance_cr = root
|
||||
.getInstance()
|
||||
.stream()
|
||||
.filter(i -> i.getCollectedfrom().getValue().equals("Crossref"))
|
||||
.findFirst();
|
||||
assertTrue(instance_cr.isPresent());
|
||||
assertEquals("OPEN", instance_cr.get().getAccessright().getClassid());
|
||||
assertEquals("Open Access", instance_cr.get().getAccessright().getClassname());
|
||||
assertEquals(OpenAccessRoute.hybrid, instance_cr.get().getAccessright().getOpenAccessRoute());
|
||||
assertEquals(
|
||||
"IEEE Journal of Selected Topics in Quantum Electronics", instance_cr.get().getHostedby().getValue());
|
||||
assertEquals("0001", instance_cr.get().getInstancetype().getClassid());
|
||||
assertEquals("Article", instance_cr.get().getInstancetype().getClassname());
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(6)
|
||||
void updateEntityTest() throws Exception {
|
||||
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkUpdateEntity.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json")));
|
||||
classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"));
|
||||
parser
|
||||
.parseArgument(
|
||||
new String[] {
|
||||
|
@ -587,7 +625,7 @@ public class SparkDedupTest implements Serializable {
|
|||
.distinct()
|
||||
.count();
|
||||
|
||||
assertEquals(898, publications);
|
||||
assertEquals(902, publications);
|
||||
assertEquals(839, organizations);
|
||||
assertEquals(100, projects);
|
||||
assertEquals(100, datasource);
|
||||
|
@ -640,11 +678,7 @@ public class SparkDedupTest implements Serializable {
|
|||
void propagateRelationTest() throws Exception {
|
||||
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkPropagateRelation.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json")));
|
||||
classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"));
|
||||
parser
|
||||
.parseArgument(
|
||||
new String[] {
|
||||
|
@ -714,4 +748,12 @@ public class SparkDedupTest implements Serializable {
|
|||
public boolean isDeletedByInference(String s) {
|
||||
return s.contains("\"deletedbyinference\":true");
|
||||
}
|
||||
|
||||
private static String classPathResourceAsString(String path) throws IOException {
|
||||
return IOUtils
|
||||
.toString(
|
||||
SparkDedupTest.class
|
||||
.getResourceAsStream(path));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -143,7 +143,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
|||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
||||
.count();
|
||||
|
||||
assertEquals(288, orgs_simrel);
|
||||
assertEquals(290, orgs_simrel);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -172,7 +172,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
|||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
||||
.count();
|
||||
|
||||
assertEquals(324, orgs_simrel);
|
||||
assertEquals(326, orgs_simrel);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -0,0 +1,403 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import static java.nio.file.Files.createTempDirectory;
|
||||
|
||||
import static org.apache.spark.sql.functions.count;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.Mockito;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||
public class SparkPublicationRootsTest implements Serializable {
|
||||
|
||||
@Mock(serializable = true)
|
||||
ISLookUpService isLookUpService;
|
||||
|
||||
private static SparkSession spark;
|
||||
private static String workingPath;
|
||||
|
||||
private static String graphInputPath;
|
||||
private static String graphOutputPath;
|
||||
private static final String testActionSetId = "test-orchestrator";
|
||||
|
||||
private static Path testBaseTmpPath;
|
||||
|
||||
private static final ObjectMapper MAPPER = new ObjectMapper()
|
||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
@BeforeAll
|
||||
public static void init() throws IOException, URISyntaxException {
|
||||
|
||||
testBaseTmpPath = createTempDirectory(SparkPublicationRootsTest.class.getSimpleName() + "-");
|
||||
|
||||
final File entitiesSources = Paths
|
||||
.get(SparkPublicationRootsTest.class.getResource("/eu/dnetlib/dhp/dedup/root").toURI())
|
||||
.toFile();
|
||||
|
||||
FileUtils
|
||||
.copyDirectory(
|
||||
entitiesSources,
|
||||
testBaseTmpPath.resolve("input").toFile());
|
||||
|
||||
workingPath = testBaseTmpPath.resolve("workingPath").toString();
|
||||
graphInputPath = testBaseTmpPath.resolve("input").resolve("entities").toString();
|
||||
graphOutputPath = testBaseTmpPath.resolve("output").toString();
|
||||
|
||||
FileUtils.deleteDirectory(new File(workingPath));
|
||||
FileUtils.deleteDirectory(new File(graphOutputPath));
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
conf.set("spark.sql.shuffle.partitions", "10");
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkPublicationRootsTest.class.getSimpleName())
|
||||
.master("local[*]")
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException, ISLookUpException {
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId)))
|
||||
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator_publication.xml"));
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication")))
|
||||
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"));
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void tearDown() throws IOException {
|
||||
FileUtils.deleteDirectory(testBaseTmpPath.toFile());
|
||||
spark.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(1)
|
||||
void createSimRelsTest() throws Exception {
|
||||
new SparkCreateSimRels(args(
|
||||
"/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json",
|
||||
new String[] {
|
||||
"--graphBasePath", graphInputPath,
|
||||
"--actionSetId", testActionSetId,
|
||||
"--isLookUpUrl", "lookupurl",
|
||||
"--workingPath", workingPath,
|
||||
"--numPartitions", "5"
|
||||
}), spark)
|
||||
.run(isLookUpService);
|
||||
|
||||
long pubs_simrel = spark
|
||||
.read()
|
||||
.load(DedupUtility.createSimRelPath(workingPath, testActionSetId, "publication"))
|
||||
.count();
|
||||
|
||||
assertEquals(74, pubs_simrel);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(2)
|
||||
void cutMergeRelsTest() throws Exception {
|
||||
new SparkCreateMergeRels(args(
|
||||
"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json",
|
||||
new String[] {
|
||||
"--graphBasePath", graphInputPath,
|
||||
"--actionSetId", testActionSetId,
|
||||
"--isLookUpUrl", "lookupurl",
|
||||
"--workingPath", workingPath,
|
||||
"--cutConnectedComponent", "3"
|
||||
}), spark)
|
||||
.run(isLookUpService);
|
||||
|
||||
long pubs_mergerel = spark
|
||||
.read()
|
||||
.load(workingPath + "/" + testActionSetId + "/publication_mergerel")
|
||||
.as(Encoders.bean(Relation.class))
|
||||
.filter((FilterFunction<Relation>) r -> r.getRelClass().equalsIgnoreCase("merges"))
|
||||
.groupBy("source")
|
||||
.agg(count("target").alias("cnt"))
|
||||
.select("source", "cnt")
|
||||
.where("cnt > 3")
|
||||
.count();
|
||||
|
||||
assertEquals(0, pubs_mergerel);
|
||||
|
||||
FileUtils.deleteDirectory(new File(workingPath + "/" + testActionSetId + "/publication_mergerel"));
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(3)
|
||||
void createMergeRelsTest() throws Exception {
|
||||
new SparkCreateMergeRels(args(
|
||||
"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json",
|
||||
new String[] {
|
||||
"--graphBasePath", graphInputPath,
|
||||
"--actionSetId", testActionSetId,
|
||||
"--isLookUpUrl", "lookupurl",
|
||||
"--workingPath", workingPath
|
||||
}), spark)
|
||||
.run(isLookUpService);
|
||||
|
||||
final Dataset<Relation> merges = spark
|
||||
.read()
|
||||
.load(workingPath + "/" + testActionSetId + "/publication_mergerel")
|
||||
.as(Encoders.bean(Relation.class));
|
||||
|
||||
final List<Relation> mergeList = merges
|
||||
.filter("source == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
|
||||
.collectAsList();
|
||||
assertEquals(3, mergeList.size());
|
||||
Set<String> dups = Sets
|
||||
.newHashSet(
|
||||
"50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
|
||||
"50|doi_________::d5021b53204e4fdeab6ff5d5bc468032",
|
||||
"50|arXiv_______::c93aeb433eb90ed7a86e29be00791b7c");
|
||||
mergeList.forEach(r -> {
|
||||
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
||||
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
||||
assertEquals(ModelConstants.MERGES, r.getRelClass());
|
||||
assertTrue(dups.contains(r.getTarget()));
|
||||
});
|
||||
|
||||
final List<Relation> mergedIn = merges
|
||||
.filter("target == '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
|
||||
.collectAsList();
|
||||
assertEquals(3, mergedIn.size());
|
||||
mergedIn.forEach(r -> {
|
||||
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
||||
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
||||
assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
|
||||
assertTrue(dups.contains(r.getSource()));
|
||||
});
|
||||
|
||||
assertEquals(32, merges.count());
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(4)
|
||||
void createDedupRecordTest() throws Exception {
|
||||
new SparkCreateDedupRecord(args(
|
||||
"/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json",
|
||||
new String[] {
|
||||
"--graphBasePath", graphInputPath,
|
||||
"--actionSetId", testActionSetId,
|
||||
"--isLookUpUrl", "lookupurl",
|
||||
"--workingPath", workingPath
|
||||
}), spark)
|
||||
.run(isLookUpService);
|
||||
|
||||
final Dataset<Publication> roots = spark
|
||||
.read()
|
||||
.textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord")
|
||||
.map(asEntity(Publication.class), Encoders.bean(Publication.class));
|
||||
|
||||
assertEquals(3, roots.count());
|
||||
|
||||
final Dataset<Publication> pubs = spark
|
||||
.read()
|
||||
.textFile(DedupUtility.createEntityPath(graphInputPath, "publication"))
|
||||
.map(asEntity(Publication.class), Encoders.bean(Publication.class));
|
||||
|
||||
verifyRoot_case_1(roots, pubs);
|
||||
verifyRoot_case_2(roots, pubs);
|
||||
verifyRoot_case_3(roots, pubs);
|
||||
}
|
||||
|
||||
private static void verifyRoot_case_1(Dataset<Publication> roots, Dataset<Publication> pubs) {
|
||||
Publication root = roots
|
||||
.filter("id = '50|doi_dedup___::d5021b53204e4fdeab6ff5d5bc468032'")
|
||||
.first();
|
||||
assertNotNull(root);
|
||||
|
||||
Publication crossref_duplicate = pubs
|
||||
.filter("id = '50|doi_________::d5021b53204e4fdeab6ff5d5bc468032'")
|
||||
.collectAsList()
|
||||
.get(0);
|
||||
|
||||
assertEquals(crossref_duplicate.getJournal().getName(), root.getJournal().getName());
|
||||
assertEquals(crossref_duplicate.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted());
|
||||
assertEquals(crossref_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
|
||||
|
||||
Set<String> rootPids = root
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(StructuredProperty::getValue)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
Set<String> dupPids = crossref_duplicate
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(StructuredProperty::getValue)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
|
||||
assertFalse(Sets.intersection(rootPids, dupPids).isEmpty());
|
||||
assertTrue(rootPids.contains("10.1109/jstqe.2022.3205716"));
|
||||
|
||||
Optional<Instance> instance_cr = root
|
||||
.getInstance()
|
||||
.stream()
|
||||
.filter(i -> i.getCollectedfrom().getValue().equals("Crossref"))
|
||||
.findFirst();
|
||||
assertTrue(instance_cr.isPresent());
|
||||
assertEquals("OPEN", instance_cr.get().getAccessright().getClassid());
|
||||
assertEquals("Open Access", instance_cr.get().getAccessright().getClassname());
|
||||
assertEquals(OpenAccessRoute.hybrid, instance_cr.get().getAccessright().getOpenAccessRoute());
|
||||
assertEquals(
|
||||
"IEEE Journal of Selected Topics in Quantum Electronics", instance_cr.get().getHostedby().getValue());
|
||||
assertEquals("0001", instance_cr.get().getInstancetype().getClassid());
|
||||
assertEquals("Article", instance_cr.get().getInstancetype().getClassname());
|
||||
}
|
||||
|
||||
private void verifyRoot_case_2(Dataset<Publication> roots, Dataset<Publication> pubs) {
|
||||
Publication root = roots
|
||||
.filter("id = '50|doi_dedup___::18aff3b55fb6876466a5d4bd82434885'")
|
||||
.first();
|
||||
assertNotNull(root);
|
||||
|
||||
Publication crossref_duplicate = pubs
|
||||
.filter("id = '50|doi_________::18aff3b55fb6876466a5d4bd82434885'")
|
||||
.first();
|
||||
|
||||
// System.err.println(new ObjectMapper().writeValueAsString(root));
|
||||
|
||||
assertEquals(crossref_duplicate.getJournal().getName(), root.getJournal().getName());
|
||||
assertEquals(crossref_duplicate.getJournal().getIssnOnline(), root.getJournal().getIssnOnline());
|
||||
assertEquals(crossref_duplicate.getJournal().getVol(), root.getJournal().getVol());
|
||||
|
||||
assertEquals(crossref_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
|
||||
|
||||
Set<String> dups_cf = pubs
|
||||
.collectAsList()
|
||||
.stream()
|
||||
.flatMap(p -> p.getCollectedfrom().stream())
|
||||
.map(KeyValue::getValue)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
|
||||
Set<String> root_cf = root
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.map(KeyValue::getValue)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
|
||||
assertTrue(Sets.difference(root_cf, dups_cf).isEmpty());
|
||||
}
|
||||
|
||||
private void verifyRoot_case_3(Dataset<Publication> roots, Dataset<Publication> pubs) {
|
||||
Publication root = roots
|
||||
.filter("id = '50|dedup_wf_001::31ca734cc22181b704c4aa8fd050062a'")
|
||||
.first();
|
||||
assertNotNull(root);
|
||||
|
||||
Publication pivot_duplicate = pubs
|
||||
.filter("id = '50|od_______166::31ca734cc22181b704c4aa8fd050062a'")
|
||||
.first();
|
||||
|
||||
assertEquals(pivot_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
|
||||
|
||||
Set<String> dups_cf = pubs
|
||||
.collectAsList()
|
||||
.stream()
|
||||
.flatMap(p -> p.getCollectedfrom().stream())
|
||||
.map(KeyValue::getValue)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
|
||||
Set<String> root_cf = root
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.map(KeyValue::getValue)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
|
||||
assertTrue(Sets.difference(root_cf, dups_cf).isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(6)
|
||||
void updateEntityTest() throws Exception {
|
||||
new SparkUpdateEntity(args(
|
||||
"/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json",
|
||||
new String[] {
|
||||
"--graphBasePath", graphInputPath,
|
||||
"--workingPath", workingPath,
|
||||
"--dedupGraphPath", graphOutputPath
|
||||
}), spark)
|
||||
.run(isLookUpService);
|
||||
|
||||
long publications = spark.read().textFile(graphOutputPath + "/publication").count();
|
||||
|
||||
long mergedPubs = spark
|
||||
.read()
|
||||
.load(workingPath + "/" + testActionSetId + "/publication_mergerel")
|
||||
.as(Encoders.bean(Relation.class))
|
||||
.where("relClass=='merges'")
|
||||
.map((MapFunction<Relation, String>) Relation::getTarget, Encoders.STRING())
|
||||
.distinct()
|
||||
.count();
|
||||
|
||||
assertEquals(19, publications); // 16 originals + 3 roots
|
||||
|
||||
long deletedPubs = spark
|
||||
.read()
|
||||
.textFile(graphOutputPath + "/publication")
|
||||
.map(asEntity(Publication.class), Encoders.bean(Publication.class))
|
||||
.filter("datainfo.deletedbyinference == true")
|
||||
.map((MapFunction<Publication, String>) OafEntity::getId, Encoders.STRING())
|
||||
.distinct()
|
||||
.count();
|
||||
|
||||
assertEquals(mergedPubs, deletedPubs);
|
||||
}
|
||||
|
||||
private static String classPathResourceAsString(String path) throws IOException {
|
||||
return IOUtils
|
||||
.toString(
|
||||
SparkPublicationRootsTest.class
|
||||
.getResourceAsStream(path));
|
||||
}
|
||||
|
||||
private static <T extends OafEntity> MapFunction<String, T> asEntity(Class<T> clazz) {
|
||||
return value -> MAPPER.readValue(value, clazz);
|
||||
}
|
||||
|
||||
private ArgumentApplicationParser args(String paramSpecs, String[] args) throws IOException, ParseException {
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(classPathResourceAsString(paramSpecs));
|
||||
parser.parseArgument(args);
|
||||
return parser;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,251 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import static java.nio.file.Files.createTempDirectory;
|
||||
|
||||
import static org.apache.spark.sql.functions.count;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.cli.ParseException;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.Mockito;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||
public class SparkPublicationRootsTest2 implements Serializable {
|
||||
|
||||
@Mock(serializable = true)
|
||||
ISLookUpService isLookUpService;
|
||||
private static SparkSession spark;
|
||||
|
||||
private static String workingPath;
|
||||
|
||||
private static String graphInputPath;
|
||||
|
||||
private static String graphOutputPath;
|
||||
|
||||
private static final String testActionSetId = "test-orchestrator";
|
||||
|
||||
private static Path testBaseTmpPath;
|
||||
|
||||
private static final ObjectMapper MAPPER = new ObjectMapper()
|
||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
@BeforeAll
|
||||
public static void init() throws IOException, URISyntaxException {
|
||||
|
||||
testBaseTmpPath = createTempDirectory(SparkPublicationRootsTest2.class.getSimpleName() + "-");
|
||||
|
||||
final File entitiesSources = Paths
|
||||
.get(SparkPublicationRootsTest2.class.getResource("/eu/dnetlib/dhp/dedup/root").toURI())
|
||||
.toFile();
|
||||
|
||||
FileUtils
|
||||
.copyDirectory(
|
||||
entitiesSources,
|
||||
testBaseTmpPath.resolve("input").toFile());
|
||||
|
||||
FileUtils
|
||||
.copyFileToDirectory(
|
||||
Paths
|
||||
.get(
|
||||
SparkPublicationRootsTest2.class
|
||||
.getResource(
|
||||
"/eu/dnetlib/dhp/dedup/root/alterations/publication/publication_1.gz")
|
||||
.toURI())
|
||||
.toFile(),
|
||||
testBaseTmpPath.resolve("input").resolve("entities").resolve("publication").toFile());
|
||||
|
||||
workingPath = testBaseTmpPath.resolve("workingPath").toString();
|
||||
graphInputPath = testBaseTmpPath.resolve("input").resolve("entities").toString();
|
||||
graphOutputPath = testBaseTmpPath.resolve("output").toString();
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
conf.set("spark.sql.shuffle.partitions", "10");
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(SparkPublicationRootsTest2.class.getSimpleName())
|
||||
.master("local[*]")
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException, ISLookUpException {
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId)))
|
||||
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator_publication.xml"));
|
||||
|
||||
lenient()
|
||||
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication")))
|
||||
.thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"));
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void tearDown() throws IOException {
|
||||
FileUtils.deleteDirectory(testBaseTmpPath.toFile());
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(7)
|
||||
void dedupAlteredDatasetTest() throws Exception {
|
||||
|
||||
new SparkCreateSimRels(args(
|
||||
"/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json",
|
||||
new String[] {
|
||||
"--graphBasePath", graphInputPath,
|
||||
"--actionSetId", testActionSetId,
|
||||
"--isLookUpUrl", "lookupurl",
|
||||
"--workingPath", workingPath,
|
||||
"--numPartitions", "5"
|
||||
}), spark)
|
||||
.run(isLookUpService);
|
||||
|
||||
new SparkCreateMergeRels(args(
|
||||
"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json",
|
||||
new String[] {
|
||||
"--graphBasePath", graphInputPath,
|
||||
"--actionSetId", testActionSetId,
|
||||
"--isLookUpUrl", "lookupurl",
|
||||
"--workingPath", workingPath
|
||||
}), spark)
|
||||
.run(isLookUpService);
|
||||
|
||||
final Dataset<Relation> merges = spark
|
||||
.read()
|
||||
.load(workingPath + "/" + testActionSetId + "/publication_mergerel")
|
||||
.as(Encoders.bean(Relation.class));
|
||||
|
||||
assertEquals(
|
||||
3, merges
|
||||
.filter("relclass == 'isMergedIn'")
|
||||
.map((MapFunction<Relation, String>) Relation::getTarget, Encoders.STRING())
|
||||
.distinct()
|
||||
.count());
|
||||
assertEquals(
|
||||
4, merges
|
||||
.filter("source == '50|doi_dedup___::b3aec7985136e36827176aaa1dd5082d'")
|
||||
.count());
|
||||
|
||||
new SparkCreateDedupRecord(args(
|
||||
"/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json",
|
||||
new String[] {
|
||||
"--graphBasePath", graphInputPath,
|
||||
"--actionSetId", testActionSetId,
|
||||
"--isLookUpUrl", "lookupurl",
|
||||
"--workingPath", workingPath
|
||||
}), spark)
|
||||
.run(isLookUpService);
|
||||
|
||||
final Dataset<Publication> roots = spark
|
||||
.read()
|
||||
.textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord")
|
||||
.map(asEntity(Publication.class), Encoders.bean(Publication.class));
|
||||
|
||||
assertEquals(3, roots.count());
|
||||
|
||||
final Dataset<Publication> pubs = spark
|
||||
.read()
|
||||
.textFile(DedupUtility.createEntityPath(graphInputPath, "publication"))
|
||||
.map(asEntity(Publication.class), Encoders.bean(Publication.class));
|
||||
|
||||
Publication root = roots
|
||||
.filter("id = '50|doi_dedup___::b3aec7985136e36827176aaa1dd5082d'")
|
||||
.first();
|
||||
assertNotNull(root);
|
||||
|
||||
Publication crossref_duplicate = pubs
|
||||
.filter("id = '50|doi_________::b3aec7985136e36827176aaa1dd5082d'")
|
||||
.collectAsList()
|
||||
.get(0);
|
||||
|
||||
assertEquals(crossref_duplicate.getDateofacceptance().getValue(), root.getDateofacceptance().getValue());
|
||||
assertEquals(crossref_duplicate.getJournal().getName(), root.getJournal().getName());
|
||||
assertEquals(crossref_duplicate.getJournal().getIssnPrinted(), root.getJournal().getIssnPrinted());
|
||||
assertEquals(crossref_duplicate.getPublisher().getValue(), root.getPublisher().getValue());
|
||||
|
||||
Set<String> rootPids = root
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(StructuredProperty::getValue)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
Set<String> dupPids = crossref_duplicate
|
||||
.getPid()
|
||||
.stream()
|
||||
.map(StructuredProperty::getValue)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
|
||||
assertFalse(Sets.intersection(rootPids, dupPids).isEmpty());
|
||||
assertTrue(rootPids.contains("10.1109/jstqe.2022.3205716"));
|
||||
assertTrue(rootPids.contains("10.1109/jstqe.2023.9999999"));
|
||||
|
||||
Optional<Instance> instance_cr = root
|
||||
.getInstance()
|
||||
.stream()
|
||||
.filter(i -> i.getCollectedfrom().getValue().equals("Crossref"))
|
||||
.findFirst();
|
||||
assertTrue(instance_cr.isPresent());
|
||||
assertEquals("OPEN", instance_cr.get().getAccessright().getClassid());
|
||||
assertEquals("Open Access", instance_cr.get().getAccessright().getClassname());
|
||||
assertEquals(OpenAccessRoute.hybrid, instance_cr.get().getAccessright().getOpenAccessRoute());
|
||||
assertEquals(
|
||||
"IEEE Journal of Selected Topics in Quantum Electronics", instance_cr.get().getHostedby().getValue());
|
||||
assertEquals("0001", instance_cr.get().getInstancetype().getClassid());
|
||||
assertEquals("Article", instance_cr.get().getInstancetype().getClassname());
|
||||
|
||||
}
|
||||
|
||||
private static String classPathResourceAsString(String path) throws IOException {
|
||||
return IOUtils
|
||||
.toString(
|
||||
SparkPublicationRootsTest2.class
|
||||
.getResourceAsStream(path));
|
||||
}
|
||||
|
||||
private static <T extends OafEntity> MapFunction<String, T> asEntity(Class<T> clazz) {
|
||||
return value -> MAPPER.readValue(value, clazz);
|
||||
}
|
||||
|
||||
private ArgumentApplicationParser args(String paramSpecs, String[] args) throws IOException, ParseException {
|
||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(classPathResourceAsString(paramSpecs));
|
||||
parser.parseArgument(args);
|
||||
return parser;
|
||||
}
|
||||
|
||||
}
|
|
@ -168,11 +168,11 @@ public class SparkStatsTest implements Serializable {
|
|||
.textFile(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_blockstats")
|
||||
.count();
|
||||
|
||||
assertEquals(477, orgs_blocks);
|
||||
assertEquals(480, orgs_blocks);
|
||||
assertEquals(295, pubs_blocks);
|
||||
assertEquals(122, sw_blocks);
|
||||
assertEquals(191, ds_blocks);
|
||||
assertEquals(171, orp_blocks);
|
||||
assertEquals(178, orp_blocks);
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
|
|
Binary file not shown.
|
@ -0,0 +1,24 @@
|
|||
<RESOURCE_PROFILE>
|
||||
<HEADER>
|
||||
<RESOURCE_IDENTIFIER value=""/>
|
||||
<RESOURCE_TYPE value="DedupOrchestrationDSResourceType"/>
|
||||
<RESOURCE_KIND value="DedupOrchestrationDSResources"/>
|
||||
<RESOURCE_URI value=""/>
|
||||
<DATE_OF_CREATION value="2001-12-31T12:00:00"/>
|
||||
</HEADER>
|
||||
<BODY>
|
||||
<CONFIGURATION enabled="true">
|
||||
<DEDUPLICATION>
|
||||
<ENTITY code="20" label="Organization" name="organization"/>
|
||||
<ACTION_SET id="test-orchestrator"/>
|
||||
<SCAN_SEQUENCE>
|
||||
<SCAN id="publication"/>
|
||||
</SCAN_SEQUENCE>
|
||||
</DEDUPLICATION>
|
||||
</CONFIGURATION>
|
||||
<STATUS>
|
||||
<LAST_UPDATE value="2001-12-31T12:00:00"/>
|
||||
</STATUS>
|
||||
<SECURITY_PARAMETERS>SECURITY_PARAMETERS</SECURITY_PARAMETERS>
|
||||
</BODY>
|
||||
</RESOURCE_PROFILE>
|
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,47 @@
|
|||
# Root logger option
|
||||
log4j.rootLogger=DEBUG, stdout
|
||||
|
||||
# Direct log messages to stdout
|
||||
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
|
||||
log4j.appender.stdout.Target=System.out
|
||||
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
|
||||
|
||||
# Change this to set Spark log level
|
||||
log4j.logger.org.apache.spark=ERROR
|
||||
log4j.rootCategory=WARN
|
||||
|
||||
# Silence akka remoting
|
||||
log4j.logger.Remoting=WARN
|
||||
|
||||
# Ignore messages below warning level from Jetty, because it's a bit verbose
|
||||
log4j.logger.org.eclipse.jetty=WARN
|
||||
|
||||
log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitterFactory=WARN
|
||||
log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter=WARN
|
||||
#log4j.logger.org.apache.parquet.hadoop.ParquetOutputFormat=WARN
|
||||
#log4j.logger.org.apache.parquet.hadoop.InternalParquetRecordWriter=WARN
|
||||
log4j.logger.org.apache.hadoop.io.compress.CodecPool=WARN
|
||||
#log4j.logger.org.apache.hadoop.io.compress=WARN
|
||||
#log4j.logger.org.apache.parquet.hadoop.codec.CodecConfig=WARN
|
||||
log4j.logger.parquet.hadoop.ColumnChunkPageWriteStore=ERROR
|
||||
log4j.logger.com.jayway.jsonpath.internal.path.CompiledPath=WARN
|
||||
log4j.logger.org.apache.parquet.hadoop.ParquetRecordReader=ERROR
|
||||
log4j.logger.parquet.hadoop=WARN
|
||||
log4j.logger.org.eclipse.jetty.server.handler.ContextHandlerCollection=WARN
|
||||
log4j.logger.org.spark_project.jetty.util.component.ContainerLifeCycle=WARN
|
||||
log4j.logger.org.apache.hadoop.mapred.FileInputFormat=WARN
|
||||
log4j.logger.org.spark_project.jetty.servlet.ServletHandler=WARN
|
||||
log4j.logger.org.apache.commons.beanutils.converters.BooleanConverter=WARN
|
||||
log4j.logger.org.apache.commons.beanutils.converters.StringConverter=WARN
|
||||
log4j.logger.org.apache.commons.beanutils.converters.LongConverter=WARN
|
||||
log4j.logger.org.apache.commons.beanutils.converters.ArrayConverter=WARN
|
||||
log4j.logger.org.apache.commons.beanutils.converters.FloatConverter=WARN
|
||||
log4j.logger.org.apache.commons.beanutils.converters.IntegerConverter=WARN
|
||||
log4j.logger.org.apache.commons.beanutils.converters.DoubleConverter=WARN
|
||||
log4j.logger.org.apache.commons.beanutils.converters.CharacterConverter=WARN
|
||||
log4j.logger.org.apache.commons.beanutils.converters.ByteConverter=WARN
|
||||
log4j.logger.org.apache.commons.beanutils.converters.BigIntegerConverter=WARN
|
||||
log4j.logger.org.apache.commons.beanutils.converters.BigDecimalConverter=WARN
|
||||
log4j.logger.org.apache.commons.beanutils.converters.ShortConverter=WARN
|
||||
log4j.logger.org.apache.commons.beanutils.BeanUtils=WARN
|
|
@ -554,7 +554,7 @@ public class PublicationToOaf implements Serializable {
|
|||
private KeyValue createCollectedFrom() {
|
||||
KeyValue cf = new KeyValue();
|
||||
cf.setValue(ModelConstants.ORCID.toUpperCase());
|
||||
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "806360c771262b4d6770e7cdf04b5c5a");
|
||||
cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + "cd0f74b5955dc87fd0605745c4b49ee8");
|
||||
return cf;
|
||||
}
|
||||
|
||||
|
|
|
@ -31,13 +31,13 @@ class CrossrefMappingTest {
|
|||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/doiboost/crossref/funder_doi"))
|
||||
.mkString
|
||||
|
||||
for (line <- funder_doi.lines) {
|
||||
for (line <- funder_doi.linesWithSeparators.map(l =>l.stripLineEnd)) {
|
||||
val json = template.replace("%s", line)
|
||||
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||
assertTrue(resultList.nonEmpty)
|
||||
checkRelation(resultList)
|
||||
}
|
||||
for (line <- funder_name.lines) {
|
||||
for (line <- funder_name.linesWithSeparators.map(l =>l.stripLineEnd)) {
|
||||
val json = template.replace("%s", line)
|
||||
val resultList: List[Oaf] = Crossref2Oaf.convert(json)
|
||||
assertTrue(resultList.nonEmpty)
|
||||
|
|
|
@ -25,7 +25,7 @@ class MappingORCIDToOAFTest {
|
|||
.mkString
|
||||
assertNotNull(json)
|
||||
assertFalse(json.isEmpty)
|
||||
json.lines.foreach(s => {
|
||||
json.linesWithSeparators.map(l =>l.stripLineEnd).foreach(s => {
|
||||
assertNotNull(ORCIDToOAF.extractValueFromInputString(s))
|
||||
})
|
||||
}
|
||||
|
|
|
@ -22,7 +22,7 @@ class UnpayWallMappingTest {
|
|||
.mkString
|
||||
|
||||
var i: Int = 0
|
||||
for (line <- Ilist.lines) {
|
||||
for (line <- Ilist.linesWithSeparators.map(l =>l.stripLineEnd)) {
|
||||
val p = UnpayWallToOAF.convertToOAF(line)
|
||||
|
||||
if (p != null) {
|
||||
|
@ -43,7 +43,7 @@ class UnpayWallMappingTest {
|
|||
i = i + 1
|
||||
}
|
||||
|
||||
val l = Ilist.lines.next()
|
||||
val l = Ilist.linesWithSeparators.map(l =>l.stripLineEnd).next()
|
||||
|
||||
val item = UnpayWallToOAF.convertToOAF(l)
|
||||
|
||||
|
|
|
@ -230,10 +230,15 @@ public class PropagationConstant {
|
|||
|
||||
public static <R> Dataset<R> readPath(
|
||||
SparkSession spark, String inputPath, Class<R> clazz) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
|
||||
if (HdfsSupport.exists(inputPath, spark.sparkContext().hadoopConfiguration())) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
} else {
|
||||
return spark.emptyDataset(Encoders.bean(clazz));
|
||||
}
|
||||
}
|
||||
|
||||
public static <R extends Oaf> Dataset<R> readOafKryoPath(
|
||||
|
|
|
@ -90,12 +90,12 @@ public class CommunityConfigurationFactory {
|
|||
}
|
||||
|
||||
private static SelectionConstraints parseConstrains(Node node) {
|
||||
Node aconstraints = node.selectSingleNode("./advancedConstraints");
|
||||
if (aconstraints == null) {
|
||||
Node advConstsNode = node.selectSingleNode("./advancedConstraints");
|
||||
if (advConstsNode == null || StringUtils.isBlank(StringUtils.trim(advConstsNode.getText()))) {
|
||||
return null;
|
||||
}
|
||||
SelectionConstraints selectionConstraints = new Gson()
|
||||
.fromJson(aconstraints.getText(), SelectionConstraints.class);
|
||||
.fromJson(advConstsNode.getText(), SelectionConstraints.class);
|
||||
|
||||
selectionConstraints.setSelection(resolver);
|
||||
return selectionConstraints;
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
|
||||
package eu.dnetlib.dhp.bulktag.community;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
|
@ -13,71 +15,17 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
|||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
public class QueryInformationSystem {
|
||||
private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') "
|
||||
+ " let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text() "
|
||||
+ " let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept "
|
||||
+ " let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept "
|
||||
+ " let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept "
|
||||
+ " let $fos := $x//CONFIGURATION/context/param[./@name='fos']/text() "
|
||||
+ " let $sdg := $x//CONFIGURATION/context/param[./@name='sdg']/text() "
|
||||
+
|
||||
"let $zenodo := $x//param[./@name='zenodoCommunity']/text() "
|
||||
+ " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] and $x//context/param[./@name = 'status']/text() != 'hidden' "
|
||||
+ " return "
|
||||
+ " <community> "
|
||||
+ " { $x//CONFIGURATION/context/@id} "
|
||||
+ " <subjects> "
|
||||
+ " {for $y in tokenize($subj,',') "
|
||||
+ " return "
|
||||
+ " <subject>{$y}</subject>} "
|
||||
+ " {for $y in tokenize($fos,',') "
|
||||
+ " return "
|
||||
+ " <subject>{$y}</subject>} "
|
||||
+ " {for $y in tokenize($sdg,',') "
|
||||
+ " return "
|
||||
+ " <subject>{$y}</subject>} "
|
||||
+ " </subjects> "
|
||||
+ " <datasources> "
|
||||
+ " {for $d in $datasources "
|
||||
+ " where $d/param[./@name='enabled']/text()='true' "
|
||||
+ " return "
|
||||
+ " <datasource> "
|
||||
+ " <openaireId> "
|
||||
+ " {$d//param[./@name='openaireId']/text()} "
|
||||
+ " </openaireId> "
|
||||
+ " <selcriteria> "
|
||||
+ " {$d/param[./@name='selcriteria']/text()} "
|
||||
+ " </selcriteria> "
|
||||
+ " </datasource> } "
|
||||
+ " </datasources> " +
|
||||
" <zenodocommunities> " +
|
||||
"{for $zc in $zenodo " +
|
||||
"return " +
|
||||
"<zenodocommunity> " +
|
||||
"<zenodoid> " +
|
||||
"{$zc} " +
|
||||
"</zenodoid> " +
|
||||
"</zenodocommunity>}"
|
||||
+ " {for $zc in $communities "
|
||||
+ " return "
|
||||
+ " <zenodocommunity> "
|
||||
+ " <zenodoid> "
|
||||
+ " {$zc/param[./@name='zenodoid']/text()} "
|
||||
+ " </zenodoid> "
|
||||
+ " <selcriteria> "
|
||||
+ " {$zc/param[./@name='selcriteria']/text()} "
|
||||
+ " </selcriteria> "
|
||||
+ " </zenodocommunity>} "
|
||||
+ " </zenodocommunities> "
|
||||
+ "<advancedConstraint>"
|
||||
+ "{$x//CONFIGURATION/context/param[./@name='advancedConstraint']/text()} "
|
||||
+ "</advancedConstraint>"
|
||||
+ " </community>";
|
||||
|
||||
public static CommunityConfiguration getCommunityConfiguration(final String isLookupUrl)
|
||||
throws ISLookUpException, DocumentException, SAXException {
|
||||
throws ISLookUpException, DocumentException, SAXException, IOException {
|
||||
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
final List<String> res = isLookUp.quickSearchProfile(XQUERY);
|
||||
final List<String> res = isLookUp
|
||||
.quickSearchProfile(
|
||||
IOUtils
|
||||
.toString(
|
||||
QueryInformationSystem.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/bulktag/query.xq")));
|
||||
|
||||
final String xmlConf = "<communities>" + Joiner.on(" ").join(res) + "</communities>";
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria;
|
|||
|
||||
import java.io.Serializable;
|
||||
|
||||
@VerbClass("contains_ignorecase")
|
||||
@VerbClass("contains_caseinsensitive")
|
||||
public class ContainsVerbIgnoreCase implements Selection, Serializable {
|
||||
|
||||
private String param;
|
||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria;
|
|||
|
||||
import java.io.Serializable;
|
||||
|
||||
@VerbClass("equals_ignorecase")
|
||||
@VerbClass("equals_caseinsensitive")
|
||||
public class EqualVerbIgnoreCase implements Selection, Serializable {
|
||||
|
||||
private String param;
|
||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria;
|
|||
|
||||
import java.io.Serializable;
|
||||
|
||||
@VerbClass("not_contains_ignorecase")
|
||||
@VerbClass("not_contains_caseinsensitive")
|
||||
public class NotContainsVerbIgnoreCase implements Selection, Serializable {
|
||||
|
||||
private String param;
|
||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.dhp.bulktag.criteria;
|
|||
|
||||
import java.io.Serializable;
|
||||
|
||||
@VerbClass("not_equals_ignorecase")
|
||||
@VerbClass("not_equals_caseinsensitive")
|
||||
public class NotEqualVerbIgnoreCase implements Selection, Serializable {
|
||||
|
||||
private String param;
|
||||
|
|
|
@ -0,0 +1,30 @@
|
|||
|
||||
package eu.dnetlib.dhp.bulktag.criteria;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
@VerbClass("starts_with")
|
||||
public class StartsWithVerb implements Selection, Serializable {
|
||||
|
||||
private String param;
|
||||
|
||||
public StartsWithVerb() {
|
||||
}
|
||||
|
||||
public StartsWithVerb(final String param) {
|
||||
this.param = param;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean apply(String value) {
|
||||
return value.startsWith(param);
|
||||
}
|
||||
|
||||
public String getParam() {
|
||||
return param;
|
||||
}
|
||||
|
||||
public void setParam(String param) {
|
||||
this.param = param;
|
||||
}
|
||||
}
|
|
@ -38,13 +38,13 @@
|
|||
{
|
||||
"paramName": "test",
|
||||
"paramLongName": "isTest",
|
||||
"paramDescription": "true if the spark session is managed, false otherwise",
|
||||
"paramDescription": "Parameter intended for testing purposes only. True if the run is related to a test and so the taggingConf parameter should be loaded",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "tg",
|
||||
"paramLongName": "taggingConf",
|
||||
"paramDescription": "true if the spark session is managed, false otherwise",
|
||||
"paramDescription": "this parameter is intended for testing purposes only. It is a possible tagging configuration obtained via the XQUERY. Intended to be removed",
|
||||
"paramRequired": false
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
(: Emits one <community> element for every context profile of type 'community' or 'ri'
   whose status is not 'hidden'. Each element carries the context id, the advanced
   constraints, the subjects (tokenized from the subject, fos and sdg params), the
   enabled datasources and the zenodo communities.
   NOTE(review): $organizations is bound but not used in the result. :)
for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType')
let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text()
let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept
let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept
let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept
let $fos := $x//CONFIGURATION/context/param[./@name='fos']/text()
let $sdg := $x//CONFIGURATION/context/param[./@name='sdg']/text()
let $zenodo := $x//param[./@name='zenodoCommunity']/text()
where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] and $x//context/param[./@name = 'status']/text() != 'hidden'
return
<community>
    { $x//CONFIGURATION/context/@id}
    <advancedConstraints>
        {$x//CONFIGURATION/context/param[./@name='advancedConstraints']/text() }
    </advancedConstraints>
    <subjects>
        {for $y in tokenize($subj,',')
        return
        <subject>{$y}</subject>}
        {for $y in tokenize($fos,',')
        return
        <subject>{$y}</subject>}
        {for $y in tokenize($sdg,',')
        return
        <subject>{$y}</subject>}
    </subjects>
    <datasources>
        {for $d in $datasources
        where $d/param[./@name='enabled']/text()='true'
        return
        <datasource>
            <openaireId>
                {$d//param[./@name='openaireId']/text()}
            </openaireId>
            <selcriteria>
                {$d/param[./@name='selcriteria']/text()}
            </selcriteria>
        </datasource> }
    </datasources>
    <zenodocommunities>
        {for $zc in $zenodo
        return
        <zenodocommunity>
            <zenodoid>
                {$zc}
            </zenodoid>
        </zenodocommunity>}
        {for $zc in $communities
        return
        <zenodocommunity>
            <zenodoid>
                {$zc/param[./@name='zenodoid']/text()}
            </zenodoid>
            <selcriteria>
                {$zc/param[./@name='selcriteria']/text()}
            </selcriteria>
        </zenodocommunity>}
    </zenodocommunities>
</community>
|
|
@ -16,6 +16,7 @@ import org.apache.commons.io.IOUtils;
|
|||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.ForeachFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
@ -45,7 +46,9 @@ public class BulkTagJobTest {
|
|||
+ " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\","
|
||||
+ " \"contributor\" : \"$['contributor'][*]['value']\","
|
||||
+ " \"description\" : \"$['description'][*]['value']\", "
|
||||
+ " \"subject\" :\"$['subject'][*]['value']\" }";
|
||||
+ " \"subject\" :\"$['subject'][*]['value']\" , " +
|
||||
|
||||
"\"fos\" : \"$['subject'][?(@['qualifier']['classid']=='subject:fos')].value\"} ";
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
|
@ -769,28 +772,14 @@ public class BulkTagJobTest {
|
|||
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
|
||||
|
||||
idExplodeCommunity.show(false);
|
||||
Assertions.assertEquals(4, idExplodeCommunity.count());
|
||||
Assertions.assertEquals(5, idExplodeCommunity.count());
|
||||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, idExplodeCommunity.filter("provenance = 'community:datasource'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
1, idExplodeCommunity.filter("provenance = 'community:advconstraint'").count());
|
||||
2, idExplodeCommunity.filter("provenance = 'community:advconstraint'").count());
|
||||
}
|
||||
|
||||
// @Test
|
||||
// void test1(){
|
||||
// ProtoMap params = new Gson().fromJson(pathMap, ProtoMap.class);
|
||||
// HashMap<String, String> param = new HashMap<>();
|
||||
// for (String key : params.keySet()) {
|
||||
// try {
|
||||
// param.put(key, jsonContext.read(params.get(key)));
|
||||
// } catch (com.jayway.jsonpath.PathNotFoundException e) {
|
||||
// param.put(key, new ArrayList<>());
|
||||
// }
|
||||
// }
|
||||
// return param;
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
|
|
@ -83,4 +83,36 @@ class CommunityConfigurationFactoryTest {
|
|||
Assertions.assertEquals("dariah", comm.get(0));
|
||||
}
|
||||
|
||||
@Test
|
||||
void loadSelCriteriaTest2() throws DocumentException, IOException, SAXException {
|
||||
String xml = IOUtils
|
||||
.toString(
|
||||
getClass()
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/bulktag/communityconfiguration/community_configuration_selcrit2.xml"));
|
||||
final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml);
|
||||
Map<String, List<String>> param = new HashMap<>();
|
||||
param.put("author", new ArrayList<>(Collections.singletonList("Pippo Pippi")));
|
||||
param
|
||||
.put(
|
||||
"description",
|
||||
new ArrayList<>(
|
||||
Collections
|
||||
.singletonList(
|
||||
"This work has been partially supported by DARIAH-EU infrastructure")));
|
||||
param
|
||||
.put(
|
||||
"contributor",
|
||||
new ArrayList<>(
|
||||
Collections
|
||||
.singletonList(
|
||||
"Author X helped to write the paper. X works for DARIAH")));
|
||||
List<String> comm = cc
|
||||
.getCommunityForDatasource(
|
||||
"openaire____::1cfdb2e14977f31a98e0118283401f32", param);
|
||||
|
||||
// TODO add more assertions
|
||||
Assertions.assertEquals(0, comm.size());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -844,6 +844,89 @@
|
|||
<organizations/>
|
||||
</community>
|
||||
<community id="dariah">
|
||||
<advancedConstraints>
|
||||
{
|
||||
"criteria": [
|
||||
{
|
||||
"constraint": [
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "North America"
|
||||
},
|
||||
{
|
||||
"verb": "contains",
|
||||
"field": "fos",
|
||||
"value": "05"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"constraint": [
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "North America"
|
||||
},
|
||||
{
|
||||
"verb": "contains",
|
||||
"field": "fos",
|
||||
"value": "06"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"constraint": [
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "Mexico"
|
||||
},
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "United States"
|
||||
},
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "Canada"
|
||||
},
|
||||
{
|
||||
"verb": "contains",
|
||||
"field": "fos",
|
||||
"value": "05"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"constraint": [
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "Mexico"
|
||||
},
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "United States"
|
||||
},
|
||||
{
|
||||
"verb": "equals_caseinsensitive",
|
||||
"field": "subject",
|
||||
"value": "Canada"
|
||||
},
|
||||
{
|
||||
"verb": "contains",
|
||||
"field": "fos",
|
||||
"value": "06"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
</advancedConstraints>
|
||||
<subjects/>
|
||||
<datasources>
|
||||
<datasource>
|
||||
|
@ -1174,7 +1257,9 @@
|
|||
</zenodocommunities>
|
||||
<organizations/>
|
||||
</community>
|
||||
|
||||
<community id="euromarine">
|
||||
|
||||
<subjects/>
|
||||
<datasources/>
|
||||
<zenodocommunities/>
|
||||
|
@ -1193,7 +1278,7 @@
|
|||
<organizations/>
|
||||
</community>
|
||||
<community id="science-innovation-policy">
|
||||
<advancedConstraints>{"criteria":[{"constraint":[{"verb":"equals_ignorecase","field":"subject","value":"ciencias de la comunicación"},
|
||||
<advancedConstraints>{"criteria":[{"constraint":[{"verb":"equals_caseinsensitive","field":"subject","value":"ciencias de la comunicación"},
|
||||
{"verb":"equals","field":"subject","value":"Miriam"}]},
|
||||
{"constraint":[{"verb":"equals","field":"subject","value":"miriam"}]}]}</advancedConstraints>
|
||||
<subjects>
|
||||
|
@ -1317,81 +1402,81 @@
|
|||
<datasources>
|
||||
<datasource>
|
||||
<openaireId>opendoar____::358aee4cc897452c00244351e4d91f69</openaireId>
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}}]}
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
|
||||
</selcriteria>
|
||||
</datasource>
|
||||
<datasource>
|
||||
<openaireId>re3data_____::7b0ad08687b2c960d5aeef06f811d5e6</openaireId>
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
|
||||
</selcriteria>
|
||||
</datasource>
|
||||
<datasource>
|
||||
<openaireId>driver______::bee53aa31dc2cbb538c10c2b65fa5824</openaireId>
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
|
||||
</selcriteria>
|
||||
</datasource>
|
||||
<datasource>
|
||||
<openaireId>openaire____::437f4b072b1aa198adcbc35910ff3b98</openaireId>
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
|
||||
</selcriteria>
|
||||
</datasource>
|
||||
<datasource>
|
||||
<openaireId>openaire____::081b82f96300b6a6e3d282bad31cb6e2</openaireId>
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
|
||||
</selcriteria>
|
||||
</datasource>
|
||||
<datasource>
|
||||
<openaireId>openaire____::9e3be59865b2c1c335d32dae2fe7b254</openaireId>
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
|
||||
</selcriteria>
|
||||
</datasource>
|
||||
<datasource>
|
||||
<openaireId>opendoar____::8b6dd7db9af49e67306feb59a8bdc52c</openaireId>
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
|
||||
</selcriteria>
|
||||
</datasource>
|
||||
<datasource>
|
||||
<openaireId>share_______::4719356ec8d7d55d3feb384ce879ad6c</openaireId>
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
|
||||
</selcriteria>
|
||||
</datasource>
|
||||
<datasource>
|
||||
<openaireId>share_______::bbd802baad85d1fd440f32a7a3a2c2b1</openaireId>
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
|
||||
</selcriteria>
|
||||
</datasource>
|
||||
<datasource>
|
||||
<openaireId>opendoar____::6f4922f45568161a8cdf4ad2299f6d23</openaireId>
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCoV"}]}]}
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},
|
||||
{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCoV"}]}]}
|
||||
</selcriteria>
|
||||
</datasource>
|
||||
<datasource>
|
||||
<openaireId>re3data_____::7980778c78fb4cf0fab13ce2159030dc</openaireId>
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCov"}]}]}</selcriteria>
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCov"}]}]}</selcriteria>
|
||||
</datasource>
|
||||
<datasource>
|
||||
<openaireId>re3data_____::978378def740bbf2bfb420de868c460b</openaireId>
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_ignorecase","field":"title","value":"2019-nCov"}]}]}</selcriteria>
|
||||
<selcriteria>{"criteria":[{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"SARS-CoV-2"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"COVID-19"}]},{"constraint":[{"verb":"contains_caseinsensitive","field":"title","value":"2019-nCov"}]}]}</selcriteria>
|
||||
</datasource>
|
||||
</datasources>
|
||||
<zenodocommunities>
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
|
@ -0,0 +1,25 @@
|
|||
# Root logger option
|
||||
log4j.rootLogger=DEBUG, stdout
|
||||
|
||||
# Direct log messages to stdout
|
||||
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
|
||||
log4j.appender.stdout.Target=System.out
|
||||
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
|
||||
|
||||
# Change this to set Spark log level
|
||||
log4j.logger.org.apache.spark=ERROR
|
||||
log4j.rootCategory=WARN
|
||||
|
||||
# Silence akka remoting
|
||||
log4j.logger.Remoting=WARN
|
||||
|
||||
# Ignore messages below warning level from Jetty, because it's a bit verbose
|
||||
log4j.logger.org.eclipse.jetty=WARN
|
||||
|
||||
log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitterFactory=WARN
|
||||
log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter=WARN
|
||||
log4j.logger.org.apache.parquet.hadoop.ParquetOutputFormat=WARN
|
||||
log4j.logger.org.apache.parquet.hadoop.InternalParquetRecordWriter=WARN
|
||||
log4j.logger.org.apache.hadoop.io.compress.CodecPool=WARN
|
||||
log4j.logger.org.apache.parquet.hadoop.codec.CodecConfig=WARN
|
|
@ -47,8 +47,8 @@ public class CleanContextSparkJob implements Serializable {
|
|||
String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
String contextId = parser.get("contextId");
|
||||
log.info("contextId: {}", contextId);
|
||||
|
@ -67,12 +67,12 @@ public class CleanContextSparkJob implements Serializable {
|
|||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
|
||||
cleanContext(spark, contextId, verifyParam, inputPath, entityClazz, workingPath);
|
||||
cleanContext(spark, contextId, verifyParam, inputPath, entityClazz, workingDir);
|
||||
});
|
||||
}
|
||||
|
||||
private static <T extends Result> void cleanContext(SparkSession spark, String contextId, String verifyParam,
|
||||
String inputPath, Class<T> entityClazz, String workingPath) {
|
||||
String inputPath, Class<T> entityClazz, String workingDir) {
|
||||
Dataset<T> res = spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
|
@ -106,11 +106,11 @@ public class CleanContextSparkJob implements Serializable {
|
|||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath);
|
||||
.json(workingDir);
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(workingPath)
|
||||
.textFile(workingDir)
|
||||
.map(
|
||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
|
||||
Encoders.bean(entityClazz))
|
||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.graph.clean;
|
|||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
|
||||
import org.apache.commons.lang3.SerializationUtils;
|
||||
|
@ -10,6 +11,7 @@ import org.apache.commons.lang3.StringUtils;
|
|||
|
||||
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
|
@ -31,29 +33,30 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
|
|||
}
|
||||
|
||||
private static void cleanSubject(VocabularyGroup vocabularies, Subject subject) {
|
||||
if (cleanSubjectForVocabulary(ModelConstants.DNET_SUBJECT_FOS_CLASSID, vocabularies, subject)) {
|
||||
return;
|
||||
} else {
|
||||
// TODO cleaning based on different subject vocabs can be added here
|
||||
}
|
||||
cleanSubjectForVocabulary(ModelConstants.DNET_SUBJECT_FOS_CLASSID, vocabularies, subject);
|
||||
// TODO cleaning based on different subject vocabs can be added here
|
||||
}
|
||||
|
||||
private static boolean cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
|
||||
private static void cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
|
||||
Subject subject) {
|
||||
AtomicReference<Boolean> modified = new AtomicReference<>(false);
|
||||
|
||||
vocabularies.find(vocabularyId).ifPresent(vocabulary -> {
|
||||
if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) {
|
||||
return;
|
||||
}
|
||||
Qualifier newValue = vocabulary.lookup(subject.getValue());
|
||||
if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) {
|
||||
subject.setValue(newValue.getClassid());
|
||||
subject.getQualifier().setClassid(vocabularyId);
|
||||
subject.getQualifier().setClassname(vocabulary.getName());
|
||||
modified.set(true);
|
||||
if (ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) {
|
||||
Qualifier newValue = vocabulary.lookup(subject.getValue(), true);
|
||||
if (!ModelConstants.UNKNOWN.equals(newValue.getClassid())) {
|
||||
subject.setValue(newValue.getClassid());
|
||||
subject.getQualifier().setClassid(vocabularyId);
|
||||
subject.getQualifier().setClassname(vocabulary.getName());
|
||||
}
|
||||
} else if (vocabularyId.equals(subject.getQualifier().getClassid())) {
|
||||
Qualifier syn = vocabulary.getSynonymAsQualifier(subject.getValue());
|
||||
VocabularyTerm term = vocabulary.getTerm(subject.getValue());
|
||||
if (Objects.isNull(syn) && Objects.isNull(term)) {
|
||||
subject.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_KEYWORD);
|
||||
subject.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_KEYWORD);
|
||||
}
|
||||
}
|
||||
});
|
||||
return modified.get();
|
||||
}
|
||||
|
||||
private static void cleanRelation(VocabularyGroup vocabularies, Relation r) {
|
||||
|
|
|
@ -0,0 +1,45 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.action.ReadDatasourceMasterDuplicateFromDB;
|
||||
|
||||
public class MasterDuplicateAction {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(MasterDuplicateAction.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
MasterDuplicateAction.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/datasourcemaster_parameters.json")));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String dbUrl = parser.get("postgresUrl");
|
||||
log.info("postgresUrl: {}", dbUrl);
|
||||
|
||||
final String dbUser = parser.get("postgresUser");
|
||||
log.info("postgresUser: {}", dbUser);
|
||||
|
||||
final String dbPassword = parser.get("postgresPassword");
|
||||
log.info("postgresPassword: {}", dbPassword);
|
||||
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
log.info("hdfsPath: {}", hdfsPath);
|
||||
|
||||
final String hdfsNameNode = parser.get("hdfsNameNode");
|
||||
log.info("hdfsNameNode: {}", hdfsNameNode);
|
||||
|
||||
int rows = ReadDatasourceMasterDuplicateFromDB.execute(dbUrl, dbUser, dbPassword, hdfsPath, hdfsNameNode);
|
||||
|
||||
log.info("written {} rows", rows);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,227 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean.cfhb;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||
import org.apache.spark.sql.*;
|
||||
import org.apache.spark.sql.expressions.Aggregator;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.common.action.model.MasterDuplicate;
|
||||
import eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class CleanCfHbSparkJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(CleanCfHbSparkJob.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
CleanCountrySparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/input_clean_cfhb_parameters.json"));
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
String resolvedPath = parser.get("resolvedPath");
|
||||
log.info("resolvedPath: {}", resolvedPath);
|
||||
|
||||
String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
String dsMasterDuplicatePath = parser.get("masterDuplicatePath");
|
||||
log.info("masterDuplicatePath: {}", dsMasterDuplicatePath);
|
||||
|
||||
String graphTableClassName = parser.get("graphTableClassName");
|
||||
log.info("graphTableClassName: {}", graphTableClassName);
|
||||
|
||||
Class<? extends Result> entityClazz = (Class<? extends Result>) Class.forName(graphTableClassName);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
|
||||
HdfsSupport.remove(resolvedPath, spark.sparkContext().hadoopConfiguration());
|
||||
cleanCfHb(
|
||||
spark, inputPath, entityClazz, resolvedPath, dsMasterDuplicatePath, outputPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static <T extends Result> void cleanCfHb(SparkSession spark, String inputPath, Class<T> entityClazz,
|
||||
String resolvedPath, String masterDuplicatePath, String outputPath) {
|
||||
|
||||
// read the master-duplicate tuples
|
||||
Dataset<MasterDuplicate> md = spark
|
||||
.read()
|
||||
.textFile(masterDuplicatePath)
|
||||
.map(as(MasterDuplicate.class), Encoders.bean(MasterDuplicate.class));
|
||||
|
||||
// prepare the resolved CF|HB references with the corresponding EMPTY master ID
|
||||
Dataset<IdCfHbMapping> resolved = spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map(as(entityClazz), Encoders.bean(entityClazz))
|
||||
.flatMap(flattenCfHbFn(), Encoders.bean(IdCfHbMapping.class));
|
||||
|
||||
// set the EMPTY master ID/NAME and save it
|
||||
resolved
|
||||
.joinWith(md, resolved.col("cfhb").equalTo(md.col("duplicateId")))
|
||||
.map(asIdCfHbMapping(), Encoders.bean(IdCfHbMapping.class))
|
||||
.filter((FilterFunction<IdCfHbMapping>) m -> Objects.nonNull(m.getMasterId()))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.json(resolvedPath);
|
||||
|
||||
// read again the resolved CF|HB mapping
|
||||
Dataset<IdCfHbMapping> resolvedDS = spark
|
||||
.read()
|
||||
.textFile(resolvedPath)
|
||||
.map(as(IdCfHbMapping.class), Encoders.bean(IdCfHbMapping.class));
|
||||
|
||||
// read the result table
|
||||
Dataset<T> res = spark
|
||||
.read()
|
||||
.textFile(inputPath)
|
||||
.map(as(entityClazz), Encoders.bean(entityClazz));
|
||||
|
||||
// Join the results with the resolved CF|HB mapping, apply the mapping and save it
|
||||
res
|
||||
.joinWith(resolvedDS, res.col("id").equalTo(resolvedDS.col("resultId")), "left")
|
||||
.groupByKey((MapFunction<Tuple2<T, IdCfHbMapping>, String>) t -> t._1().getId(), Encoders.STRING())
|
||||
.mapGroups(getMapGroupsFunction(), Encoders.bean(entityClazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath);
|
||||
}
|
||||
|
||||
private static MapFunction<Tuple2<IdCfHbMapping, MasterDuplicate>, IdCfHbMapping> asIdCfHbMapping() {
|
||||
return t -> {
|
||||
final IdCfHbMapping mapping = t._1();
|
||||
Optional
|
||||
.ofNullable(t._2())
|
||||
.ifPresent(t2 -> {
|
||||
mapping.setMasterId(t2.getMasterId());
|
||||
mapping.setMasterName(t2.getMasterName());
|
||||
|
||||
});
|
||||
return mapping;
|
||||
};
|
||||
}
|
||||
|
||||
private static <T extends Result> FlatMapFunction<T, IdCfHbMapping> flattenCfHbFn() {
|
||||
return r -> Stream
|
||||
.concat(
|
||||
Optional
|
||||
.ofNullable(r.getCollectedfrom())
|
||||
.map(cf -> cf.stream().map(KeyValue::getKey))
|
||||
.orElse(Stream.empty()),
|
||||
Stream
|
||||
.concat(
|
||||
Optional
|
||||
.ofNullable(r.getInstance())
|
||||
.map(
|
||||
instances -> instances
|
||||
.stream()
|
||||
.map(i -> Optional.ofNullable(i.getHostedby()).map(KeyValue::getKey).orElse("")))
|
||||
.orElse(Stream.empty())
|
||||
.filter(StringUtils::isNotBlank),
|
||||
Optional
|
||||
.ofNullable(r.getInstance())
|
||||
.map(
|
||||
instances -> instances
|
||||
.stream()
|
||||
.map(
|
||||
i -> Optional
|
||||
.ofNullable(i.getCollectedfrom())
|
||||
.map(KeyValue::getKey)
|
||||
.orElse("")))
|
||||
.orElse(Stream.empty())
|
||||
.filter(StringUtils::isNotBlank)))
|
||||
.distinct()
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.map(cfHb -> asIdCfHbMapping(r.getId(), cfHb))
|
||||
.iterator();
|
||||
}
|
||||
|
||||
private static <T extends Result> MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T> getMapGroupsFunction() {
|
||||
return new MapGroupsFunction<String, Tuple2<T, IdCfHbMapping>, T>() {
|
||||
@Override
|
||||
public T call(String key, Iterator<Tuple2<T, IdCfHbMapping>> values) {
|
||||
final Tuple2<T, IdCfHbMapping> first = values.next();
|
||||
final T res = first._1();
|
||||
|
||||
updateResult(res, first._2());
|
||||
values.forEachRemaining(t -> updateResult(res, t._2()));
|
||||
return res;
|
||||
}
|
||||
|
||||
private void updateResult(T res, IdCfHbMapping m) {
|
||||
if (Objects.nonNull(m)) {
|
||||
res.getCollectedfrom().forEach(kv -> updateKeyValue(kv, m));
|
||||
res.getInstance().forEach(i -> {
|
||||
updateKeyValue(i.getHostedby(), m);
|
||||
updateKeyValue(i.getCollectedfrom(), m);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
private void updateKeyValue(final KeyValue kv, final IdCfHbMapping a) {
|
||||
if (kv.getKey().equals(a.getCfhb())) {
|
||||
kv.setKey(a.getMasterId());
|
||||
kv.setValue(a.getMasterName());
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
private static IdCfHbMapping asIdCfHbMapping(String resultId, String cfHb) {
|
||||
IdCfHbMapping m = new IdCfHbMapping(resultId);
|
||||
m.setCfhb(cfHb);
|
||||
return m;
|
||||
}
|
||||
|
||||
private static <R> MapFunction<String, R> as(Class<R> clazz) {
|
||||
return s -> OBJECT_MAPPER.readValue(s, clazz);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean.cfhb;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Bean associating a result with one of the datasource keys it references (collectedfrom or hostedby) and with
 * the master datasource assigned to that key.
 */
public class IdCfHbMapping implements Serializable {

	/** Identifier of the result. */
	private String resultId;

	/** Datasource key (collectedfrom or hostedby) referenced by the result. */
	private String cfhb;

	/** Identifier of the master datasource. */
	private String masterId;

	/** Name of the master datasource. */
	private String masterName;

	/** Creates a mapping with every property unset. */
	public IdCfHbMapping() {
	}

	/**
	 * Creates a mapping for the given result, leaving the other properties unset.
	 *
	 * @param id the identifier of the result
	 */
	public IdCfHbMapping(String id) {
		resultId = id;
	}

	/** @return the identifier of the result */
	public String getResultId() {
		return this.resultId;
	}

	/** @return the datasource key referenced by the result */
	public String getCfhb() {
		return this.cfhb;
	}

	/** @return the identifier of the master datasource */
	public String getMasterId() {
		return this.masterId;
	}

	/** @return the name of the master datasource */
	public String getMasterName() {
		return this.masterName;
	}

	/** @param resultId the identifier of the result */
	public void setResultId(String resultId) {
		this.resultId = resultId;
	}

	/** @param cfhb the datasource key referenced by the result */
	public void setCfhb(String cfhb) {
		this.cfhb = cfhb;
	}

	/** @param masterId the identifier of the master datasource */
	public void setMasterId(String masterId) {
		this.masterId = masterId;
	}

	/** @param masterName the name of the master datasource */
	public void setMasterName(String masterName) {
		this.masterName = masterName;
	}
}
|
|
@ -4,9 +4,12 @@ package eu.dnetlib.dhp.oa.graph.clean.country;
|
|||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import javax.swing.text.html.Option;
|
||||
|
||||
|
@ -30,6 +33,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
|
||||
import eu.dnetlib.dhp.schema.oaf.Country;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||
|
@ -43,7 +47,7 @@ public class CleanCountrySparkJob implements Serializable {
|
|||
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
CleanContextSparkJob.class
|
||||
CleanCountrySparkJob.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/oa/graph/input_clean_country_parameters.json"));
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
|
@ -58,8 +62,8 @@ public class CleanCountrySparkJob implements Serializable {
|
|||
String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
String workingDir = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingDir);
|
||||
|
||||
String datasourcePath = parser.get("hostedBy");
|
||||
log.info("datasourcePath: {}", datasourcePath);
|
||||
|
@ -85,12 +89,12 @@ public class CleanCountrySparkJob implements Serializable {
|
|||
spark -> {
|
||||
|
||||
cleanCountry(
|
||||
spark, country, verifyParam, inputPath, entityClazz, workingPath, collectedfrom, datasourcePath);
|
||||
spark, country, verifyParam, inputPath, entityClazz, workingDir, collectedfrom, datasourcePath);
|
||||
});
|
||||
}
|
||||
|
||||
private static <T extends Result> void cleanCountry(SparkSession spark, String country, String[] verifyParam,
|
||||
String inputPath, Class<T> entityClazz, String workingPath, String collectedfrom, String datasourcePath) {
|
||||
String inputPath, Class<T> entityClazz, String workingDir, String collectedfrom, String datasourcePath) {
|
||||
|
||||
List<String> hostedBy = spark
|
||||
.read()
|
||||
|
@ -110,8 +114,8 @@ public class CleanCountrySparkJob implements Serializable {
|
|||
return r;
|
||||
}
|
||||
|
||||
if (r
|
||||
.getPid()
|
||||
List<StructuredProperty> ids = getPidsAndAltIds(r).collect(Collectors.toList());
|
||||
if (ids
|
||||
.stream()
|
||||
.anyMatch(
|
||||
p -> p
|
||||
|
@ -134,11 +138,11 @@ public class CleanCountrySparkJob implements Serializable {
|
|||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath);
|
||||
.json(workingDir);
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(workingPath)
|
||||
.textFile(workingDir)
|
||||
.map(
|
||||
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, entityClazz),
|
||||
Encoders.bean(entityClazz))
|
||||
|
@ -148,6 +152,42 @@ public class CleanCountrySparkJob implements Serializable {
|
|||
.json(inputPath);
|
||||
}
|
||||
|
||||
private static <T extends Result> Stream<StructuredProperty> getPidsAndAltIds(T r) {
|
||||
final Stream<StructuredProperty> resultPids = Optional
|
||||
.ofNullable(r.getPid())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty());
|
||||
|
||||
final Stream<StructuredProperty> instancePids = Optional
|
||||
.ofNullable(r.getInstance())
|
||||
.map(
|
||||
instance -> instance
|
||||
.stream()
|
||||
.flatMap(
|
||||
i -> Optional
|
||||
.ofNullable(i.getPid())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty())))
|
||||
.orElse(Stream.empty());
|
||||
|
||||
final Stream<StructuredProperty> instanceAltIds = Optional
|
||||
.ofNullable(r.getInstance())
|
||||
.map(
|
||||
instance -> instance
|
||||
.stream()
|
||||
.flatMap(
|
||||
i -> Optional
|
||||
.ofNullable(i.getAlternateIdentifier())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty())))
|
||||
.orElse(Stream.empty());
|
||||
|
||||
return Stream
|
||||
.concat(
|
||||
Stream.concat(resultPids, instancePids),
|
||||
instanceAltIds);
|
||||
}
|
||||
|
||||
private static boolean pidInParam(String value, String[] verifyParam) {
|
||||
for (String s : verifyParam)
|
||||
if (value.startsWith(s))
|
||||
|
|
|
@ -54,8 +54,8 @@ public class GetDatasourceFromCountry implements Serializable {
|
|||
String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
String workingPath = parser.get("workingPath");
|
||||
log.info("workingPath: {}", workingPath);
|
||||
String workingPath = parser.get("workingDir");
|
||||
log.info("workingDir: {}", workingPath);
|
||||
|
||||
String country = parser.get("country");
|
||||
log.info("country: {}", country);
|
||||
|
@ -65,13 +65,12 @@ public class GetDatasourceFromCountry implements Serializable {
|
|||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
|
||||
getDatasourceFromCountry(spark, country, inputPath, workingPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static void getDatasourceFromCountry(SparkSession spark, String country, String inputPath,
|
||||
String workingPath) {
|
||||
String workingDir) {
|
||||
|
||||
Dataset<Organization> organization = spark
|
||||
.read()
|
||||
|
@ -83,7 +82,6 @@ public class GetDatasourceFromCountry implements Serializable {
|
|||
(FilterFunction<Organization>) o -> !o.getDataInfo().getDeletedbyinference() &&
|
||||
o.getCountry().getClassid().length() > 0 &&
|
||||
o.getCountry().getClassid().equals(country));
|
||||
;
|
||||
|
||||
// filtering of the relations taking the non deleted by inference and those with IsProvidedBy as relclass
|
||||
Dataset<Relation> relation = spark
|
||||
|
@ -97,12 +95,12 @@ public class GetDatasourceFromCountry implements Serializable {
|
|||
!rel.getDataInfo().getDeletedbyinference());
|
||||
|
||||
organization
|
||||
.joinWith(relation, organization.col("id").equalTo(relation.col("target")), "left")
|
||||
.joinWith(relation, organization.col("id").equalTo(relation.col("target")))
|
||||
.map((MapFunction<Tuple2<Organization, Relation>, String>) t2 -> t2._2().getSource(), Encoders.STRING())
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(workingPath);
|
||||
.json(workingDir);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -366,6 +366,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
|
||||
r.setInstance(instances);
|
||||
r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances));
|
||||
r.setEoscifguidelines(prepareEOSCIfGuidelines(doc, info));
|
||||
}
|
||||
|
||||
protected abstract List<StructuredProperty> prepareResultPids(Document doc, DataInfo info);
|
||||
|
@ -384,6 +385,25 @@ public abstract class AbstractMdRecordToOafMapper {
|
|||
return list;
|
||||
}
|
||||
|
||||
private List<EoscIfGuidelines> prepareEOSCIfGuidelines(Document doc, DataInfo info) {
|
||||
final Set<EoscIfGuidelines> set = Sets.newHashSet();
|
||||
for (final Object o : doc.selectNodes("//oaf:eoscifguidelines")) {
|
||||
final String code = ((Node) o).valueOf("@code");
|
||||
final String label = ((Node) o).valueOf("@label");
|
||||
final String url = ((Node) o).valueOf("@url");
|
||||
final String semrel = ((Node) o).valueOf("@semanticrelation");
|
||||
if (StringUtils.isNotBlank(code)) {
|
||||
final EoscIfGuidelines eig = new EoscIfGuidelines();
|
||||
eig.setCode(code);
|
||||
eig.setLabel(label);
|
||||
eig.setUrl(url);
|
||||
eig.setSemanticRelation(semrel);
|
||||
set.add(eig);
|
||||
}
|
||||
}
|
||||
return Lists.newArrayList(set);
|
||||
}
|
||||
|
||||
protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
|
||||
|
||||
protected abstract List<Instance> prepareInstances(
|
||||
|
|
|
@ -317,13 +317,13 @@
|
|||
</switch>
|
||||
</decision>
|
||||
|
||||
|
||||
<fork name="fork_clean_context">
|
||||
<path start="clean_publication_context"/>
|
||||
<path start="clean_dataset_context"/>
|
||||
<path start="clean_otherresearchproduct_context"/>
|
||||
<path start="clean_software_context"/>
|
||||
</fork>
|
||||
|
||||
<action name="clean_publication_context">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
|
@ -343,7 +343,7 @@
|
|||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/working/publication</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}/working/publication</arg>
|
||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||
</spark>
|
||||
|
@ -370,7 +370,7 @@
|
|||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/working/dataset</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}/working/dataset</arg>
|
||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||
</spark>
|
||||
|
@ -397,7 +397,7 @@
|
|||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/working/otherresearchproduct</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}/working/otherresearchproduct</arg>
|
||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||
</spark>
|
||||
|
@ -424,7 +424,7 @@
|
|||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/working/software</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}/working/software</arg>
|
||||
<arg>--contextId</arg><arg>${contextId}</arg>
|
||||
<arg>--verifyParam</arg><arg>${verifyParam}</arg>
|
||||
</spark>
|
||||
|
@ -432,14 +432,13 @@
|
|||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait_clean_context" to="getHostedby"/>
|
||||
<join name="wait_clean_context" to="select_datasourceId_from_country"/>
|
||||
|
||||
|
||||
<action name="getHostedby">
|
||||
<action name="select_datasourceId_from_country">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean publications context</name>
|
||||
<name>Select datasource ID from country</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.country.GetDatasourceFromCountry</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
|
@ -453,25 +452,25 @@
|
|||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/working/hostedby</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}/working/hostedby</arg>
|
||||
<arg>--country</arg><arg>${country}</arg>
|
||||
</spark>
|
||||
<ok to="fork_clean_country"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<fork name="fork_clean_country">
|
||||
<path start="clean_publication_country"/>
|
||||
<path start="clean_dataset_country"/>
|
||||
<path start="clean_otherresearchproduct_country"/>
|
||||
<path start="clean_software_country"/>
|
||||
</fork>
|
||||
|
||||
<action name="clean_publication_country">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean publications counmtry</name>
|
||||
<name>Clean publication country</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
|
@ -486,13 +485,13 @@
|
|||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/working/publication</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}/working/publication</arg>
|
||||
<arg>--country</arg><arg>${country}</arg>
|
||||
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
|
||||
<arg>--datasourcePath</arg><arg>${workingDir}/working/hostedby</arg>
|
||||
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean_context"/>
|
||||
<ok to="wait_clean_country"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -500,7 +499,7 @@
|
|||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean datasets Country</name>
|
||||
<name>Clean dataset country</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
|
@ -515,13 +514,13 @@
|
|||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/working/dataset</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}/working/dataset</arg>
|
||||
<arg>--country</arg><arg>${country}</arg>
|
||||
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
|
||||
<arg>--datasourcePath</arg><arg>${workingDir}/working/hostedby</arg>
|
||||
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean_context"/>
|
||||
<ok to="wait_clean_country"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -529,7 +528,7 @@
|
|||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean otherresearchproducts country</name>
|
||||
<name>Clean otherresearchproduct country</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
|
@ -544,13 +543,13 @@
|
|||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/working/otherresearchproduct</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}/working/otherresearchproduct</arg>
|
||||
<arg>--country</arg><arg>${country}</arg>
|
||||
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
|
||||
<arg>--datasourcePath</arg><arg>${workingDir}/working/hostedby</arg>
|
||||
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean_context"/>
|
||||
<ok to="wait_clean_country"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
@ -558,7 +557,7 @@
|
|||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>Clean softwares country</name>
|
||||
<name>Clean software country</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.country.CleanCountrySparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
|
@ -573,17 +572,212 @@
|
|||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--workingPath</arg><arg>${workingDir}/working/software</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}/working/software</arg>
|
||||
<arg>--country</arg><arg>${country}</arg>
|
||||
<arg>--verifyParam</arg><arg>${verifyCountryParam}</arg>
|
||||
<arg>--datasourcePath</arg><arg>${workingDir}/working/hostedby</arg>
|
||||
<arg>--hostedBy</arg><arg>${workingDir}/working/hostedby</arg>
|
||||
<arg>--collectedfrom</arg><arg>${collectedfrom}</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean_country"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait_clean_country" to="End"/>
|
||||
<join name="wait_clean_country" to="should_patch_datasource_ids"/>
|
||||
|
||||
<decision name="should_patch_datasource_ids">
|
||||
<switch>
|
||||
<case to="get_ds_master_duplicate">${wf:conf('shouldClean') eq true}</case>
|
||||
<default to="End"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="get_ds_master_duplicate">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.clean.MasterDuplicateAction</main-class>
|
||||
<arg>--postgresUrl</arg><arg>${postgresURL}</arg>
|
||||
<arg>--postgresUser</arg><arg>${postgresUser}</arg>
|
||||
<arg>--postgresPassword</arg><arg>${postgresPassword}</arg>
|
||||
<arg>--hdfsPath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||
<arg>--hdfsNameNode</arg><arg>${nameNode}</arg>
|
||||
</java>
|
||||
<ok to="fork_patch_cfhb"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<fork name="fork_patch_cfhb">
|
||||
<path start="patch_publication_cfhb"/>
|
||||
<path start="patch_dataset_cfhb"/>
|
||||
<path start="patch_otherresearchproduct_cfhb"/>
|
||||
<path start="patch_software_cfhb"/>
|
||||
</fork>
|
||||
|
||||
<action name="patch_publication_cfhb">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>patch publication cfhb</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/publication</arg>
|
||||
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/publication</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/publication</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean_cfhb"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="patch_dataset_cfhb">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>patch dataset cfhb</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/dataset</arg>
|
||||
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/dataset</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/dataset</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean_cfhb"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="patch_otherresearchproduct_cfhb">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>patch otherresearchproduct cfhb</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
|
||||
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/otherresearchproduct</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/otherresearchproduct</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean_cfhb"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="patch_software_cfhb">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>patch software cfhb</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${graphOutputPath}/software</arg>
|
||||
<arg>--resolvedPath</arg><arg>${workingDir}/cfHbResolved/software</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/cfHbPatched/software</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--masterDuplicatePath</arg><arg>${workingDir}/masterduplicate</arg>
|
||||
</spark>
|
||||
<ok to="wait_clean_cfhb"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait_clean_cfhb" to="fork_copy_cfhb_patched_results"/>
|
||||
|
||||
<fork name="fork_copy_cfhb_patched_results">
|
||||
<path start="copy_cfhb_patched_publication"/>
|
||||
<path start="copy_cfhb_patched_dataset"/>
|
||||
<path start="copy_cfhb_patched_otherresearchproduct"/>
|
||||
<path start="copy_cfhb_patched_software"/>
|
||||
</fork>
|
||||
|
||||
<action name="copy_cfhb_patched_publication">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<prepare>
|
||||
<delete path="${graphOutputPath}/publication"/>
|
||||
</prepare>
|
||||
<arg>${workingDir}/cfHbPatched/publication</arg>
|
||||
<arg>${graphOutputPath}/publication</arg>
|
||||
</distcp>
|
||||
<ok to="copy_wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="copy_cfhb_patched_dataset">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<prepare>
|
||||
<delete path="${graphOutputPath}/dataset"/>
|
||||
</prepare>
|
||||
<arg>${workingDir}/cfHbPatched/dataset</arg>
|
||||
<arg>${graphOutputPath}/dataset</arg>
|
||||
</distcp>
|
||||
<ok to="copy_wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="copy_cfhb_patched_otherresearchproduct">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<prepare>
|
||||
<delete path="${graphOutputPath}/otherresearchproduct"/>
|
||||
</prepare>
|
||||
<arg>${workingDir}/cfHbPatched/otherresearchproduct</arg>
|
||||
<arg>${graphOutputPath}/otherresearchproduct</arg>
|
||||
</distcp>
|
||||
<ok to="copy_wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="copy_cfhb_patched_software">
|
||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||
<prepare>
|
||||
<delete path="${graphOutputPath}/software"/>
|
||||
</prepare>
|
||||
<arg>${workingDir}/cfHbPatched/software</arg>
|
||||
<arg>${graphOutputPath}/software</arg>
|
||||
</distcp>
|
||||
<ok to="copy_wait"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="copy_wait" to="End"/>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -0,0 +1,32 @@
|
|||
[
|
||||
{
|
||||
"paramName": "pu",
|
||||
"paramLongName": "postgresUrl",
|
||||
"paramDescription": "the jdbc url to the postgres",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "uid",
|
||||
"paramLongName": "postgresUser",
|
||||
"paramDescription": "the postgres user",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "pwd",
|
||||
"paramLongName": "postgresPassword",
|
||||
"paramDescription": "the postgres password",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "p",
|
||||
"paramLongName": "hdfsPath",
|
||||
"paramDescription": "the target path on HDFS",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "nn",
|
||||
"paramLongName": "hdfsNameNode",
|
||||
"paramDescription": "the HDFS nameNode",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -126,6 +126,7 @@
|
|||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--numPartitions</arg><arg>8000</arg>
|
||||
</spark>
|
||||
<ok to="join_import"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -152,6 +153,7 @@
|
|||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--numPartitions</arg><arg>4000</arg>
|
||||
</spark>
|
||||
<ok to="join_import"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -178,6 +180,7 @@
|
|||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--numPartitions</arg><arg>3000</arg>
|
||||
</spark>
|
||||
<ok to="join_import"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -204,6 +207,7 @@
|
|||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--numPartitions</arg><arg>300</arg>
|
||||
</spark>
|
||||
<ok to="join_import"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -230,6 +234,7 @@
|
|||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--numPartitions</arg><arg>100</arg>
|
||||
</spark>
|
||||
<ok to="join_import"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -256,6 +261,7 @@
|
|||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--numPartitions</arg><arg>400</arg>
|
||||
</spark>
|
||||
<ok to="join_import"/>
|
||||
<error to="Kill"/>
|
||||
|
@ -309,6 +315,7 @@
|
|||
<arg>--hiveDbName</arg><arg>${hiveDbName}</arg>
|
||||
<arg>--className</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--numPartitions</arg><arg>10000</arg>
|
||||
</spark>
|
||||
<ok to="join_import"/>
|
||||
<error to="Kill"/>
|
||||
|
|
|
@ -0,0 +1,38 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "in",
|
||||
"paramLongName": "inputPath",
|
||||
"paramDescription": "the path to the graph data dump to read",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "rp",
|
||||
"paramLongName": "resolvedPath",
|
||||
"paramDescription": "the path to store the resolved records",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "out",
|
||||
"paramLongName": "outputPath",
|
||||
"paramDescription": "the path to store the output graph",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "class",
|
||||
"paramLongName": "graphTableClassName",
|
||||
"paramDescription": "class name modelling the graph table",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "md",
|
||||
"paramLongName": "masterDuplicatePath",
|
||||
"paramDescription": "path to the file on HDFS holding the datasource id tuples [master, duplicate]",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -12,8 +12,8 @@
|
|||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "wp",
|
||||
"paramLongName": "workingPath",
|
||||
"paramName": "wd",
|
||||
"paramLongName": "workingDir",
|
||||
"paramDescription": "the path to store the output graph",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
|
|
@ -12,8 +12,8 @@
|
|||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "wp",
|
||||
"paramLongName": "workingPath",
|
||||
"paramName": "wd",
|
||||
"paramLongName": "workingDir",
|
||||
"paramDescription": "the path to store the output graph",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
|
|
@ -12,8 +12,8 @@
|
|||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "wp",
|
||||
"paramLongName": "workingPath",
|
||||
"paramName": "wd",
|
||||
"paramLongName": "workingDir",
|
||||
"paramDescription": "the path to store the output graph",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
|
|
@ -116,54 +116,45 @@ object SparkConvertRDDtoDataset {
|
|||
.map(s => mapper.readValue(s, classOf[Relation]))
|
||||
.filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference)
|
||||
.filter(r => r.getSource.startsWith("50") && r.getTarget.startsWith("50"))
|
||||
.filter(r => filterRelations(subRelTypeFilter, relClassFilter, r))
|
||||
//filter OpenCitations relations
|
||||
.filter(r =>
|
||||
r.getDataInfo.getProvenanceaction != null &&
|
||||
!"sysimport:crosswalk:opencitations".equals(r.getDataInfo.getProvenanceaction.getClassid)
|
||||
)
|
||||
.filter(r => filterRelations(r))
|
||||
//filter OpenCitations relations
|
||||
// .filter(r =>
|
||||
// r.getDataInfo.getProvenanceaction != null &&
|
||||
// !"sysimport:crosswalk:opencitations".equals(r.getDataInfo.getProvenanceaction.getClassid)
|
||||
// )
|
||||
|
||||
spark.createDataset(rddRelation).as[Relation].write.mode(SaveMode.Overwrite).save(s"$relPath")
|
||||
}
|
||||
|
||||
private def filterRelations(subRelTypeFilter: String, relClassFilter: List[String], r: Relation): Boolean = {
|
||||
if (StringUtils.isNotBlank(subRelTypeFilter)) {
|
||||
subRelTypeFilter.equalsIgnoreCase(r.getSubRelType)
|
||||
} else {
|
||||
!relClassFilter.exists(k => k.equalsIgnoreCase(r.getRelClass))
|
||||
private def filterRelations(r: Relation): Boolean = {
|
||||
|
||||
/** *
|
||||
* We filter out the relations generated by dedup
|
||||
* and all the relations whose only collectedFrom is OpenCitations
|
||||
*/
|
||||
|
||||
val relClassFilter = List(
|
||||
ModelConstants.MERGES,
|
||||
ModelConstants.IS_MERGED_IN,
|
||||
ModelConstants.HAS_AMONG_TOP_N_SIMILAR_DOCS,
|
||||
ModelConstants.IS_AMONG_TOP_N_SIMILAR_DOCS
|
||||
)
|
||||
if (relClassFilter.exists(k => k.equalsIgnoreCase(r.getRelClass)))
|
||||
false
|
||||
else {
|
||||
if (r.getCollectedfrom == null || r.getCollectedfrom.size() == 0)
|
||||
false
|
||||
else if (r.getCollectedfrom.size() > 1)
|
||||
true
|
||||
else if (
|
||||
r.getCollectedfrom.size() == 1 && r.getCollectedfrom.get(0) != null && "OpenCitations".equalsIgnoreCase(
|
||||
r.getCollectedfrom.get(0).getValue
|
||||
)
|
||||
)
|
||||
false
|
||||
else
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
//TODO: finalise implementation
|
||||
private def processResult[T<: Result](
|
||||
implicit ct: ClassTag[T],
|
||||
log: Logger,
|
||||
spark: SparkSession,
|
||||
sourcePath: String,
|
||||
entityPath: String,
|
||||
clazz: Class[T]
|
||||
): Unit = {
|
||||
val entityType = clazz.getSimpleName.toLowerCase
|
||||
|
||||
log.info(s"Converting $entityType")
|
||||
|
||||
val mapper = new ObjectMapper() with ScalaObjectMapper
|
||||
mapper.registerModule(DefaultScalaModule)
|
||||
|
||||
val rdd = spark.sparkContext
|
||||
.textFile(s"$sourcePath/$entityType")
|
||||
.map(s => mapper.readValue(s, clazz))
|
||||
.filter(r => r.getDataInfo != null && !r.getDataInfo.getDeletedbyinference);
|
||||
|
||||
implicit val encoder: Encoder[T] = Encoders.kryo(clazz)
|
||||
spark
|
||||
.createDataset(rdd)
|
||||
.as[T]
|
||||
.write
|
||||
.mode(SaveMode.Overwrite)
|
||||
.save(s"$entityPath/$entityType")
|
||||
}
|
||||
*/
|
||||
|
||||
}
|
||||
|
|
|
@ -82,10 +82,10 @@ public class CleanContextTest {
|
|||
CleanContextSparkJob.main(new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--inputPath", workingDir.toString() + "/publication",
|
||||
"-graphTableClassName", Publication.class.getCanonicalName(),
|
||||
"-workingPath", workingDir.toString() + "/working",
|
||||
"-contextId", "sobigdata",
|
||||
"-verifyParam", "gCube "
|
||||
"--graphTableClassName", Publication.class.getCanonicalName(),
|
||||
"--workingDir", workingDir.toString() + "/working",
|
||||
"--contextId", "sobigdata",
|
||||
"--verifyParam", "gCube "
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
|
|
@ -5,6 +5,7 @@ import java.io.IOException;
|
|||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
@ -83,12 +84,12 @@ public class CleanCountryTest {
|
|||
CleanCountrySparkJob.main(new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--inputPath", workingDir.toString() + "/publication",
|
||||
"-graphTableClassName", Publication.class.getCanonicalName(),
|
||||
"-workingPath", workingDir.toString() + "/working",
|
||||
"-country", "NL",
|
||||
"-verifyParam", "10.17632",
|
||||
"-collectedfrom", "NARCIS",
|
||||
"-hostedBy", getClass()
|
||||
"--graphTableClassName", Publication.class.getCanonicalName(),
|
||||
"--workingDir", workingDir.toString() + "/working",
|
||||
"--country", "NL",
|
||||
"--verifyParam", "10.17632",
|
||||
"--collectedfrom", "NARCIS",
|
||||
"--hostedBy", getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")
|
||||
.getPath()
|
||||
});
|
||||
|
@ -147,4 +148,44 @@ public class CleanCountryTest {
|
|||
.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDatasetClean() throws Exception {
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/dataset_clean_country.json")
|
||||
.getPath();
|
||||
|
||||
spark
|
||||
.read()
|
||||
.textFile(sourcePath)
|
||||
.map(
|
||||
(MapFunction<String, Dataset>) r -> OBJECT_MAPPER.readValue(r, Dataset.class),
|
||||
Encoders.bean(Dataset.class))
|
||||
.write()
|
||||
.json(workingDir.toString() + "/dataset");
|
||||
|
||||
CleanCountrySparkJob.main(new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--inputPath", workingDir.toString() + "/dataset",
|
||||
"-graphTableClassName", Dataset.class.getCanonicalName(),
|
||||
"-workingDir", workingDir.toString() + "/working",
|
||||
"-country", "NL",
|
||||
"-verifyParam", "10.17632",
|
||||
"-collectedfrom", "NARCIS",
|
||||
"-hostedBy", getClass()
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/hostedBy")
|
||||
.getPath()
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
JavaRDD<Dataset> tmp = sc
|
||||
.textFile(workingDir.toString() + "/dataset")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Dataset.class));
|
||||
|
||||
Assertions.assertEquals(1, tmp.count());
|
||||
|
||||
Assertions.assertEquals(0, tmp.first().getCountry().size());
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -7,6 +7,7 @@ import static org.mockito.Mockito.lenient;
|
|||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
@ -278,10 +279,25 @@ public class GraphCleaningFunctionsTest {
|
|||
s -> "0102 computer and information sciences".equals(s.getValue()) &
|
||||
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
|
||||
|
||||
verify_keyword(p_cleaned, "In Situ Hybridization");
|
||||
verify_keyword(p_cleaned, "Avicennia");
|
||||
|
||||
// TODO add more assertions to verify the cleaned values
|
||||
System.out.println(MAPPER.writeValueAsString(p_cleaned));
|
||||
}
|
||||
|
||||
private static void verify_keyword(Publication p_cleaned, String subject) {
|
||||
Optional<Subject> s1 = p_cleaned
|
||||
.getSubject()
|
||||
.stream()
|
||||
.filter(s -> s.getValue().equals(subject))
|
||||
.findFirst();
|
||||
|
||||
assertTrue(s1.isPresent());
|
||||
assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get().getQualifier().getClassid());
|
||||
assertEquals(ModelConstants.DNET_SUBJECT_KEYWORD, s1.get().getQualifier().getClassname());
|
||||
}
|
||||
|
||||
private Stream<Qualifier> getAuthorPidTypes(Result pub) {
|
||||
return pub
|
||||
.getAuthor()
|
||||
|
|
|
@ -0,0 +1,213 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.graph.clean.cfhb;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
|
||||
public class CleanCfHbSparkJobTest {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(CleanCfHbSparkJobTest.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private static Path testBaseTmpPath;
|
||||
|
||||
private static String resolvedPath;
|
||||
|
||||
private static String graphInputPath;
|
||||
|
||||
private static String graphOutputPath;
|
||||
|
||||
private static String dsMasterDuplicatePath;
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException, URISyntaxException {
|
||||
|
||||
testBaseTmpPath = Files.createTempDirectory(CleanCfHbSparkJobTest.class.getSimpleName());
|
||||
log.info("using test base path {}", testBaseTmpPath);
|
||||
|
||||
final File entitiesSources = Paths
|
||||
.get(CleanCfHbSparkJobTest.class.getResource("/eu/dnetlib/dhp/oa/graph/clean/cfhb/entities").toURI())
|
||||
.toFile();
|
||||
|
||||
FileUtils
|
||||
.copyDirectory(
|
||||
entitiesSources,
|
||||
testBaseTmpPath.resolve("input").resolve("entities").toFile());
|
||||
|
||||
FileUtils
|
||||
.copyFileToDirectory(
|
||||
Paths
|
||||
.get(
|
||||
CleanCfHbSparkJobTest.class
|
||||
.getResource("/eu/dnetlib/dhp/oa/graph/clean/cfhb/masterduplicate.json")
|
||||
.toURI())
|
||||
.toFile(),
|
||||
testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toFile());
|
||||
|
||||
graphInputPath = testBaseTmpPath.resolve("input").resolve("entities").toString();
|
||||
resolvedPath = testBaseTmpPath.resolve("workingDir").resolve("cfHbResolved").toString();
|
||||
graphOutputPath = testBaseTmpPath.resolve("workingDir").resolve("cfHbPatched").toString();
|
||||
dsMasterDuplicatePath = testBaseTmpPath.resolve("workingDir").resolve("masterduplicate").toString();
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(CleanCfHbSparkJobTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(CleanCfHbSparkJobTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(testBaseTmpPath.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
void testCleanCfHbSparkJob() throws Exception {
|
||||
final String outputPath = graphOutputPath + "/dataset";
|
||||
final String inputPath = graphInputPath + "/dataset";
|
||||
|
||||
org.apache.spark.sql.Dataset<Dataset> records = read(spark, inputPath, Dataset.class);
|
||||
Dataset d = records
|
||||
.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13bca1b9'")
|
||||
.first();
|
||||
assertEquals("10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", d.getCollectedfrom().get(0).getKey());
|
||||
assertEquals("Bacterial Protein Interaction Database - DUP", d.getCollectedfrom().get(0).getValue());
|
||||
assertEquals(
|
||||
"10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", d.getInstance().get(0).getCollectedfrom().getKey());
|
||||
assertEquals(
|
||||
"Bacterial Protein Interaction Database - DUP", d.getInstance().get(0).getCollectedfrom().getValue());
|
||||
|
||||
d = records
|
||||
.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a'")
|
||||
.first();
|
||||
assertEquals("10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", d.getCollectedfrom().get(0).getKey());
|
||||
assertEquals("FILUR DATA - DUP", d.getCollectedfrom().get(0).getValue());
|
||||
assertEquals(
|
||||
"10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", d.getInstance().get(0).getCollectedfrom().getKey());
|
||||
assertEquals("FILUR DATA - DUP", d.getInstance().get(0).getCollectedfrom().getValue());
|
||||
assertEquals(
|
||||
"10|re3data_____::6ffd7bc058f762912dc494cd9c175341", d.getInstance().get(0).getHostedby().getKey());
|
||||
assertEquals("depositar - DUP", d.getInstance().get(0).getHostedby().getValue());
|
||||
|
||||
d = records
|
||||
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c'")
|
||||
.first();
|
||||
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getCollectedfrom().get(0).getKey());
|
||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getCollectedfrom().get(0).getValue());
|
||||
assertEquals(
|
||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getCollectedfrom().getKey());
|
||||
assertEquals(
|
||||
"DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getCollectedfrom().getValue());
|
||||
assertEquals(
|
||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getHostedby().getKey());
|
||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getHostedby().getValue());
|
||||
|
||||
CleanCfHbSparkJob
|
||||
.main(
|
||||
new String[] {
|
||||
"--isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"--inputPath", inputPath,
|
||||
"--outputPath", outputPath,
|
||||
"--resolvedPath", resolvedPath + "/dataset",
|
||||
"--graphTableClassName", Dataset.class.getCanonicalName(),
|
||||
"--masterDuplicatePath", dsMasterDuplicatePath
|
||||
});
|
||||
|
||||
assertTrue(Files.exists(Paths.get(graphOutputPath, "dataset")));
|
||||
|
||||
records = read(spark, outputPath, Dataset.class);
|
||||
|
||||
assertEquals(3, records.count());
|
||||
|
||||
d = records
|
||||
.filter("id = '50|doi_________::09821844208a5cd6300b2bfb13bca1b9'")
|
||||
.first();
|
||||
assertEquals("10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", d.getCollectedfrom().get(0).getKey());
|
||||
assertEquals("Bacterial Protein Interaction Database", d.getCollectedfrom().get(0).getValue());
|
||||
assertEquals(
|
||||
"10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", d.getInstance().get(0).getCollectedfrom().getKey());
|
||||
assertEquals("Bacterial Protein Interaction Database", d.getInstance().get(0).getCollectedfrom().getValue());
|
||||
|
||||
d = records
|
||||
.filter("id = '50|DansKnawCris::0dd644304b7116e8e58da3a5e3adc37a'")
|
||||
.first();
|
||||
assertEquals("10|re3data_____::fc1db64b3964826913b1e9eafe830490", d.getCollectedfrom().get(0).getKey());
|
||||
assertEquals("FULIR Data", d.getCollectedfrom().get(0).getValue());
|
||||
assertEquals(
|
||||
"10|re3data_____::fc1db64b3964826913b1e9eafe830490", d.getInstance().get(0).getCollectedfrom().getKey());
|
||||
assertEquals("FULIR Data", d.getInstance().get(0).getCollectedfrom().getValue());
|
||||
assertEquals(
|
||||
"10|fairsharing_::3f647cadf56541fb9513cb63ec370187", d.getInstance().get(0).getHostedby().getKey());
|
||||
assertEquals("depositar", d.getInstance().get(0).getHostedby().getValue());
|
||||
|
||||
d = records
|
||||
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c'")
|
||||
.first();
|
||||
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getCollectedfrom().get(0).getKey());
|
||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getCollectedfrom().get(0).getValue());
|
||||
assertEquals(
|
||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getCollectedfrom().getKey());
|
||||
assertEquals(
|
||||
"DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getCollectedfrom().getValue());
|
||||
assertEquals(
|
||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getHostedby().getKey());
|
||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getHostedby().getValue());
|
||||
|
||||
d = records
|
||||
.filter("id = '50|DansKnawCris::203a27996ddc0fd1948258e5b7dec61c'")
|
||||
.first();
|
||||
assertEquals("10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getCollectedfrom().get(0).getKey());
|
||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getCollectedfrom().get(0).getValue());
|
||||
assertEquals(
|
||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getCollectedfrom().getKey());
|
||||
assertEquals(
|
||||
"DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getCollectedfrom().getValue());
|
||||
assertEquals(
|
||||
"10|openaire____::c6df70599aa984f16ee52b4b86d2e89f", d.getInstance().get(0).getHostedby().getKey());
|
||||
assertEquals("DANS (Data Archiving and Networked Services)", d.getInstance().get(0).getHostedby().getValue());
|
||||
}
|
||||
|
||||
private <R> org.apache.spark.sql.Dataset<R> read(SparkSession spark, String path, Class<R> clazz) {
|
||||
return spark
|
||||
.read()
|
||||
.textFile(path)
|
||||
.map(as(clazz), Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
private static <R> MapFunction<String, R> as(Class<R> clazz) {
|
||||
return s -> OBJECT_MAPPER.readValue(s, clazz);
|
||||
}
|
||||
}
|
|
@ -26,6 +26,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
@ -238,7 +239,11 @@ class MappersTest {
|
|||
assertNotNull(i.getAccessright());
|
||||
assertEquals("OPEN", i.getAccessright().getClassid());
|
||||
});
|
||||
assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid());
|
||||
|
||||
Publication p_cleaned = cleanup(p, vocs);
|
||||
assertEquals("0000", p_cleaned.getInstance().get(0).getRefereed().getClassid());
|
||||
assertEquals("Unknown", p_cleaned.getInstance().get(0).getRefereed().getClassname());
|
||||
|
||||
assertNotNull(p.getInstance().get(0).getPid());
|
||||
assertEquals(2, p.getInstance().get(0).getPid().size());
|
||||
|
||||
|
@ -453,7 +458,10 @@ class MappersTest {
|
|||
assertNotNull(i.getAccessright());
|
||||
assertEquals("OPEN", i.getAccessright().getClassid());
|
||||
});
|
||||
assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid());
|
||||
|
||||
Publication p_cleaned = cleanup(p, vocs);
|
||||
assertEquals("0000", p_cleaned.getInstance().get(0).getRefereed().getClassid());
|
||||
assertEquals("Unknown", p_cleaned.getInstance().get(0).getRefereed().getClassname());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -570,7 +578,9 @@ class MappersTest {
|
|||
assertTrue(i.getUrl().contains("http://apps.who.int/trialsearch/Trial3.aspx?trialid=NCT02321059"));
|
||||
assertTrue(i.getUrl().contains("https://clinicaltrials.gov/ct2/show/NCT02321059"));
|
||||
|
||||
assertEquals("UNKNOWN", i.getRefereed().getClassid());
|
||||
Dataset d_cleaned = cleanup(d, vocs);
|
||||
assertEquals("0000", d_cleaned.getInstance().get(0).getRefereed().getClassid());
|
||||
assertEquals("Unknown", d_cleaned.getInstance().get(0).getRefereed().getClassname());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -871,7 +881,10 @@ class MappersTest {
|
|||
assertNotNull(i.getAccessright());
|
||||
assertEquals("UNKNOWN", i.getAccessright().getClassid());
|
||||
});
|
||||
assertEquals("UNKNOWN", p.getInstance().get(0).getRefereed().getClassid());
|
||||
|
||||
Dataset p_cleaned = cleanup(p, vocs);
|
||||
assertEquals("0000", p_cleaned.getInstance().get(0).getRefereed().getClassid());
|
||||
assertEquals("Unknown", p_cleaned.getInstance().get(0).getRefereed().getClassname());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -947,6 +960,11 @@ class MappersTest {
|
|||
Instance inst = p.getInstance().get(0);
|
||||
assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", inst.getPid().get(0).getValue());
|
||||
assertEquals("https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", inst.getUrl().get(0));
|
||||
assertEquals(1, p.getEoscifguidelines().size());
|
||||
assertEquals("EOSC::RO-crate", p.getEoscifguidelines().get(0).getCode());
|
||||
assertEquals("EOSC::RO-crate", p.getEoscifguidelines().get(0).getLabel());
|
||||
assertEquals("", p.getEoscifguidelines().get(0).getUrl());
|
||||
assertEquals("compliesWith", p.getEoscifguidelines().get(0).getSemanticRelation());
|
||||
|
||||
}
|
||||
|
||||
|
@ -995,6 +1013,18 @@ class MappersTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testEOSCFuture_ROHub() throws IOException {
|
||||
final String xml = IOUtils
|
||||
.toString(Objects.requireNonNull(getClass().getResourceAsStream("photic-zone-transformed.xml")));
|
||||
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||
final OtherResearchProduct rocrate = (OtherResearchProduct) list.get(0);
|
||||
assertNotNull(rocrate.getEoscifguidelines());
|
||||
System.out.println("***************");
|
||||
System.out.println(new ObjectMapper().writeValueAsString(rocrate));
|
||||
System.out.println("***************");
|
||||
}
|
||||
|
||||
@Test
|
||||
void testNotWellFormed() throws IOException {
|
||||
final String xml = IOUtils
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,4 @@
|
|||
{ "duplicateId" : "10|re3data_____::4c4416659cb74c2e0e891a883a047cbc", "masterId" : "10|fairsharing_::a29d1598024f9e87beab4b98411d48ce", "masterName" : "Bacterial Protein Interaction Database" }
|
||||
{ "duplicateId" : "10|opendoar____::788b4ac1e172d8e520c2b9461c0a3d35", "masterId" : "10|re3data_____::fc1db64b3964826913b1e9eafe830490", "masterName" : "FULIR Data" }
|
||||
{ "duplicateId" : "10|re3data_____::6ffd7bc058f762912dc494cd9c175341", "masterId" : "10|fairsharing_::3f647cadf56541fb9513cb63ec370187", "masterName" : "depositar" }
|
||||
{ "duplicateId" : "10|scindeksserb::07022f78a8cc6d1171092454ecdbb47c", "masterId" : "10|doajarticles::07022f78a8cc6d1171092454ecdbb47c", "masterName" : "Artefact" }
|
File diff suppressed because one or more lines are too long
|
@ -706,6 +706,28 @@
|
|||
"source": [
|
||||
],
|
||||
"subject": [
|
||||
{
|
||||
"dataInfo": {
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:repository",
|
||||
"classname": "sysimport:crosswalk:repository",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"deletedbyinference": false,
|
||||
"inferred": false,
|
||||
"inferenceprovenance": "",
|
||||
"invisible": false,
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "FOS",
|
||||
"classname": "Fields of Science and Technology classification",
|
||||
"schemeid": "dnet:result_subject",
|
||||
"schemename": "dnet:result_subject"
|
||||
},
|
||||
"value": "In Situ Hybridization"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
|
@ -885,24 +907,23 @@
|
|||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"classid": "sysimport:actionset",
|
||||
"classname": "Harvested",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "",
|
||||
"classname": "",
|
||||
"schemeid": "",
|
||||
"schemename": ""
|
||||
"classid": "FOS",
|
||||
"classname": "Fields of Science and Technology classification",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "doped silicon"
|
||||
"value": "Avicennia"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
|
|
|
@ -0,0 +1,108 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<record xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
|
||||
xmlns:datacite="http://datacite.org/schema/kernel-4"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:xs="http://www.w3.org/2001/XMLSchema"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
<header xmlns="http://www.openarchives.org/OAI/2.0/">
|
||||
<dri:objIdentifier>fsh_____4119::68126da991bd76d8be494bddfbf7a1bb</dri:objIdentifier>
|
||||
<dri:recordIdentifier>https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be</dri:recordIdentifier>
|
||||
<dri:dateOfCollection/>
|
||||
<dri:mdFormat/>
|
||||
<dri:mdFormatInterpretation/>
|
||||
<dri:repositoryId/>
|
||||
<dr:objectIdentifier/>
|
||||
<dr:dateOfCollection>2022-11-15T12:29:19Z</dr:dateOfCollection>
|
||||
<dr:dateOfTransformation>2022-11-15T12:29:19Z</dr:dateOfTransformation>
|
||||
<oaf:datasourceprefix>fsh_____4119</oaf:datasourceprefix>
|
||||
<identifier>https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be</identifier>
|
||||
<datestamp>2022-11-15T12:29:19Z</datestamp>
|
||||
<setSpec>rohub_data</setSpec>
|
||||
<setSpec>ro-crate_data</setSpec>
|
||||
</header>
|
||||
<metadata>
|
||||
<datacite:resource>
|
||||
<datacite:identifier identifierType="w3id">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be</datacite:identifier>
|
||||
<datacite:alternateIdentifiers>
|
||||
<datacite:alternateIdentifier alternateIdentifierType="w3id">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be</datacite:alternateIdentifier>
|
||||
</datacite:alternateIdentifiers>
|
||||
<datacite:relatedIdentifiers>
|
||||
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/b1b617b2-6b79-4bae-9fa6-b76945645626</datacite:relatedIdentifier>
|
||||
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/78103994-30be-4875-bf89-5acd752b5c3d</datacite:relatedIdentifier>
|
||||
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/18fd1c70-249b-4c67-80ee-539f801a0da7</datacite:relatedIdentifier>
|
||||
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/32faa2eb-4cc8-401f-ac5c-bec2849b70e1</datacite:relatedIdentifier>
|
||||
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/4c253f5a-d427-40c2-9e9f-6063ae087239</datacite:relatedIdentifier>
|
||||
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/371b1957-078c-472b-a195-af7bce152c10</datacite:relatedIdentifier>
|
||||
<datacite:relatedIdentifier relatedIdentifierType="w3id" relationType="HasPart">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be/resources/82f9e4b8-01b4-4e50-9e27-ec9d337c8d74</datacite:relatedIdentifier>
|
||||
</datacite:relatedIdentifiers>
|
||||
<datacite:resourceType xs:anyURI="http://purl.org/coar/resource_type/c_1843">RO-crate</datacite:resourceType>
|
||||
<datacite:rightsList>
|
||||
<datacite:rights rightsURI="https://creativecommons.org/licenses/by/4.0/legalcode">Creative Commons Attribution 4.0 International</datacite:rights>
|
||||
<datacite:rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</datacite:rights>
|
||||
</datacite:rightsList>
|
||||
<datacite:titles>
|
||||
<datacite:title>Mapping the photic zone of the Mediterranean Sea</datacite:title>
|
||||
</datacite:titles>
|
||||
<datacite:descriptions>
|
||||
<datacite:description descriptionType="Abstract">Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea</datacite:description>
|
||||
</datacite:descriptions>
|
||||
<datacite:publisher>CNR-ISMAR</datacite:publisher>
|
||||
<creators xmlns="http://datacite.org/schema/kernel-4">
|
||||
<creator>
|
||||
<creatorName>Giorgio Castellan</creatorName>
|
||||
</creator>
|
||||
<creator>
|
||||
<creatorName>Lorenzo Angeletti</creatorName>
|
||||
</creator>
|
||||
<creator>
|
||||
<creatorName>Paolo Montagna</creatorName>
|
||||
</creator>
|
||||
<creator>
|
||||
<creatorName>Marco Taviani</creatorName>
|
||||
</creator>
|
||||
</creators>
|
||||
<dates xmlns="http://datacite.org/schema/kernel-4">
|
||||
<date dateType="Issued">2022-11-14T16:32:45Z</date>
|
||||
</dates>
|
||||
<dc:descriptions>
|
||||
<dc:description descriptionType="Abstract">Estimating the penetration of light along the water column from satellite data to map the photic zone in the Mediterranean Sea</dc:description>
|
||||
</dc:descriptions>
|
||||
<dc:publicationYear>2022</dc:publicationYear>
|
||||
<rightsList xmlns="http://datacite.org/schema/kernel-4">
|
||||
<rights rightsURI="http://purl.org/coar/access_right/c_abf2">open access</rights>
|
||||
</rightsList>
|
||||
<sizes xmlns="http://datacite.org/schema/kernel-4">
|
||||
<size>813.478 KB</size>
|
||||
</sizes>
|
||||
<subjects xmlns="http://datacite.org/schema/kernel-4">
|
||||
<subject>Earth sciences</subject>
|
||||
<subject>Ecology</subject>
|
||||
<subject>Optics</subject>
|
||||
</subjects>
|
||||
</datacite:resource>
|
||||
<oaf:identifier identifierType="w3id">https://w3id.org/ro-id/28499bdf-a0c6-46aa-a96f-50bd9490b8be</oaf:identifier>
|
||||
<dr:CobjCategory type="other">0048</dr:CobjCategory>
|
||||
<oaf:dateAccepted>2022-11-14</oaf:dateAccepted>
|
||||
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||
<oaf:license>https://creativecommons.org/licenses/by/4.0/legalcode</oaf:license>
|
||||
<oaf:language/>
|
||||
<oaf:hostedBy name="ROHub" id="fairsharing_::4119"/>
|
||||
<oaf:collectedFrom name="ROHub" id="fairsharing_::4119"/>
|
||||
<oaf:eoscifguidelines code="EOSC::RO-crate"
|
||||
label="EOSC::RO-crate"
|
||||
url=""
|
||||
semanticrelation="compliesWith"/>
|
||||
<oaf:eoscifguidelines code="EOSC::Jupyter Notebook"
|
||||
label="EOSC::Jupyter Notebook"
|
||||
url=""
|
||||
semanticrelation="compliesWith"/>
|
||||
<oaf:eoscifguidelines code="EOSC::Data Cube"
|
||||
label="EOSC::Data Cube"
|
||||
url=""
|
||||
semanticrelation="compliesWith"/>
|
||||
</metadata>
|
||||
</record>
|
|
@ -65,7 +65,6 @@
|
|||
</sizes>
|
||||
<subjects xmlns="http://datacite.org/schema/kernel-4">
|
||||
<subject>Ecology</subject>
|
||||
<subject>EOSC::RO-crate</subject>
|
||||
</subjects>
|
||||
</datacite:resource>
|
||||
<oaf:identifier identifierType="w3id">https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca</oaf:identifier>
|
||||
|
@ -75,5 +74,9 @@
|
|||
<oaf:language/>
|
||||
<oaf:hostedBy name="ROHub" id="fairsharing_::4119"/>
|
||||
<oaf:collectedFrom name="ROHub" id="fairsharing_::4119"/>
|
||||
<oaf:eoscifguidelines code="EOSC::RO-crate"
|
||||
label="EOSC::RO-crate"
|
||||
url=""
|
||||
semanticrelation="compliesWith"/>
|
||||
</metadata>
|
||||
</record>
|
|
@ -53,7 +53,7 @@ class ResolveEntitiesTest extends Serializable {
|
|||
def generateUpdates(spark: SparkSession): Unit = {
|
||||
val template = Source.fromInputStream(this.getClass.getResourceAsStream("updates")).mkString
|
||||
|
||||
val pids: List[String] = template.lines
|
||||
val pids: List[String] = template.linesWithSeparators.map(l =>l.stripLineEnd)
|
||||
.map { id =>
|
||||
val r = new Result
|
||||
r.setId(id.toLowerCase.trim)
|
||||
|
@ -127,7 +127,7 @@ class ResolveEntitiesTest extends Serializable {
|
|||
entities.foreach { e =>
|
||||
val template = Source.fromInputStream(this.getClass.getResourceAsStream(s"$e")).mkString
|
||||
spark
|
||||
.createDataset(spark.sparkContext.parallelize(template.lines.toList))
|
||||
.createDataset(spark.sparkContext.parallelize(template.linesWithSeparators.map(l =>l.stripLineEnd).toList))
|
||||
.as[String]
|
||||
.write
|
||||
.option("compression", "gzip")
|
||||
|
@ -264,7 +264,7 @@ class ResolveEntitiesTest extends Serializable {
|
|||
Source
|
||||
.fromInputStream(this.getClass.getResourceAsStream(s"publication"))
|
||||
.mkString
|
||||
.lines
|
||||
.linesWithSeparators.map(l =>l.stripLineEnd)
|
||||
.next(),
|
||||
classOf[Publication]
|
||||
)
|
||||
|
|
|
@ -47,7 +47,7 @@ class ScholixGraphTest extends AbstractVocabularyTest {
|
|||
val inputRelations = Source
|
||||
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/oaf_to_summary"))
|
||||
.mkString
|
||||
val items = inputRelations.lines.toList
|
||||
val items = inputRelations.linesWithSeparators.map(l =>l.stripLineEnd).toList
|
||||
assertNotNull(items)
|
||||
items.foreach(i => assertTrue(i.nonEmpty))
|
||||
val result =
|
||||
|
@ -69,7 +69,7 @@ class ScholixGraphTest extends AbstractVocabularyTest {
|
|||
getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/merge_result_scholix")
|
||||
)
|
||||
.mkString
|
||||
val result: List[(Relation, ScholixSummary)] = inputRelations.lines
|
||||
val result: List[(Relation, ScholixSummary)] = inputRelations.linesWithSeparators.map(l =>l.stripLineEnd)
|
||||
.sliding(2)
|
||||
.map(s => (s.head, s(1)))
|
||||
.map(p => (mapper.readValue(p._1, classOf[Relation]), mapper.readValue(p._2, classOf[ScholixSummary])))
|
||||
|
|
|
@ -0,0 +1,91 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerException;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.solr.client.solrj.util.ClientUtils;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
|
||||
|
||||
public class EOSCFuture_Test {
|
||||
|
||||
public static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
public static final String VERSION = "2021-04-15T10:05:53Z";
|
||||
public static final String DSID = "b9ee796a-c49f-4473-a708-e7d67b84c16d_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl";
|
||||
|
||||
private ContextMapper contextMapper;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
contextMapper = new ContextMapper();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEOSC_ROHub() throws IOException, DocumentException, TransformerException {
|
||||
|
||||
final ContextMapper contextMapper = new ContextMapper();
|
||||
|
||||
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
|
||||
XmlConverterJob.schemaLocation);
|
||||
|
||||
final OtherResearchProduct p = OBJECT_MAPPER
|
||||
.readValue(
|
||||
IOUtils.toString(getClass().getResourceAsStream("eosc-future/photic-zone.json")),
|
||||
OtherResearchProduct.class);
|
||||
|
||||
final String xml = xmlRecordFactory.build(new JoinedEntity<>(p));
|
||||
|
||||
assertNotNull(xml);
|
||||
|
||||
final Document doc = new SAXReader().read(new StringReader(xml));
|
||||
|
||||
assertNotNull(doc);
|
||||
System.out.println(doc.asXML());
|
||||
|
||||
testRecordTransformation(xml);
|
||||
}
|
||||
|
||||
private void testRecordTransformation(final String record) throws IOException, TransformerException {
|
||||
final String fields = IOUtils.toString(getClass().getResourceAsStream("fields.xml"));
|
||||
final String xslt = IOUtils.toString(getClass().getResourceAsStream("layoutToRecordTransformer.xsl"));
|
||||
|
||||
final String transformer = XmlIndexingJob.getLayoutTransformer("DMF", fields, xslt);
|
||||
|
||||
final Transformer tr = SaxonTransformerFactory.newInstance(transformer);
|
||||
|
||||
final String indexRecordXML = XmlIndexingJob.toIndexRecord(tr, record);
|
||||
|
||||
final SolrInputDocument solrDoc = new StreamingInputDocumentFactory(VERSION, DSID)
|
||||
.parseDocument(indexRecordXML);
|
||||
|
||||
final String xmlDoc = ClientUtils.toXML(solrDoc);
|
||||
|
||||
Assertions.assertNotNull(xmlDoc);
|
||||
System.out.println(xmlDoc);
|
||||
}
|
||||
|
||||
}
|
|
@ -128,6 +128,41 @@ public class IndexRecordTransformerTest {
|
|||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testForEOSCFutureSoftwareNotebook() throws IOException, TransformerException {
|
||||
final String record = IOUtils
|
||||
.toString(getClass().getResourceAsStream("eosc-future/software-justthink.xml"));
|
||||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testForEOSCFutureSoftwareNotebookClaim() throws IOException, TransformerException {
|
||||
final String record = IOUtils
|
||||
.toString(getClass().getResourceAsStream("eosc-future/software-justthink-claim.xml"));
|
||||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testForEOSCFutureZenodo7353841() throws IOException, TransformerException {
|
||||
final String record = IOUtils
|
||||
.toString(getClass().getResourceAsStream("eosc-future/zenodo7353841.xml"));
|
||||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testForEOSCFutureZenodo7351393() throws IOException, TransformerException {
|
||||
final String record = IOUtils
|
||||
.toString(getClass().getResourceAsStream("eosc-future/zenodo7351393.xml"));
|
||||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testForEOSCFutureZenodo7351221() throws IOException, TransformerException {
|
||||
final String record = IOUtils
|
||||
.toString(getClass().getResourceAsStream("eosc-future/zenodo7351221.xml"));
|
||||
testRecordTransformation(record);
|
||||
}
|
||||
|
||||
@Test
|
||||
void testDoiUrlNormalization() throws MalformedURLException {
|
||||
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,305 @@
|
|||
<record>
|
||||
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri">
|
||||
<header xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<dri:objIdentifier>od______2659::3801993ea8f970cfc991277160edf277</dri:objIdentifier>
|
||||
<dri:dateOfCollection>2022-08-08T03:06:13Z</dri:dateOfCollection>
|
||||
<status>under curation</status>
|
||||
<counters/>
|
||||
</header>
|
||||
<metadata>
|
||||
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://namespace.openaire.eu/oaf https://www.openaire.eu/schema/1.0/oaf-1.0.xsd">
|
||||
<oaf:result>
|
||||
<title classid="main title" classname="main title"
|
||||
schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">JUSThink
|
||||
Alignment Analysis</title>
|
||||
<creator rank="1" name="" surname="">Norman, Utku</creator>
|
||||
<creator rank="2" name="" surname="">Dinkar, Tanvi</creator>
|
||||
<creator rank="3" name="" surname="">Bruno, Barbara</creator>
|
||||
<creator rank="4" name="" surname="">Clavel, Chloé</creator>
|
||||
<dateofacceptance/>
|
||||
<resulttype classid="software" classname="software"
|
||||
schemeid="dnet:result_typologies" schemename="dnet:result_typologies"/>
|
||||
<language classid="eng" classname="English" schemeid="dnet:languages"
|
||||
schemename="dnet:languages"/>
|
||||
<description>
|
||||
<p>
|
||||
<strong>1. Description</strong>
|
||||
</p>
|
||||
<p>This repository contains<strong> tools to automatically analyse how
|
||||
participants align their use of task-specific referents in their
|
||||
dialogue and actions for a collaborative learning activity, and how
|
||||
it relates to the task success</strong> (i.e. their learning
|
||||
outcomes and task performance).</p>
|
||||
<p>As a use case, it processes data from a collaborative problem solving
|
||||
activity named JUSThink <a
|
||||
href="https://zenodo.org/record/4675070#references">[1, 2]</a>, i.e.
|
||||
JUSThink Dialogue and Actions Corpus data set that is available from the
|
||||
Zenodo Repository, DOI: <a href="http://doi.org/10.5281/zenodo.4627104"
|
||||
>10.5281/zenodo.4627104</a>, and reproduces the results and figures
|
||||
in <a href="https://zenodo.org/record/4675070#references">[3]</a>.</p>
|
||||
<p>In brief: </p>
|
||||
<ol>
|
||||
<li><strong>JUSThink Dialogue and Actions Corpus</strong> contains
|
||||
transcripts, event logs, and test responses of children aged 9
|
||||
through 12, as they participate in the JUSThink activity <a
|
||||
href="https://zenodo.org/record/4675070#references">[1, 2]</a>
|
||||
in pairs of two, to solve a problem on graphs together. </li>
|
||||
<li><strong>The JUSThink activity and its study</strong> is first
|
||||
described in <a href="https://zenodo.org/record/4675070#references"
|
||||
>[1]</a>, and elaborated with findings concerning the link
|
||||
between children's learning, performance in the activity, and
|
||||
perception of self, the other and the robot in <a
|
||||
href="https://zenodo.org/record/4675070#references">[2]</a>. </li>
|
||||
<li><strong>Alignment analysis in our work <a
|
||||
href="https://zenodo.org/record/4675070#references"
|
||||
>[3]</a></strong> studies the participants' use of
|
||||
expressions that are related to the task at hand, their follow up
|
||||
actions of these expressions, and how it links to task success.</li>
|
||||
</ol>
|
||||
<p>
|
||||
<strong>2. Publications</strong>
|
||||
</p>
|
||||
<p>If you use this work in an academic context, please cite the following
|
||||
publications:</p>
|
||||
<ul>
|
||||
<li>
|
||||
<p>Norman*, U., Dinkar*, T., Bruno, B., & Clavel, C. (2022).
|
||||
Studying Alignment in a Collaborative Learning Activity via
|
||||
Automatic Methods: The Link Between What We Say and Do. Dialogue
|
||||
& Discourse, 13(2), 1 - ;48. *Contributed equally to this
|
||||
work. <a href="https://doi.org/10.5210/dad.2022.201"
|
||||
>https://doi.org/10.5210/dad.2022.201</a></p>
|
||||
</li>
|
||||
<li>
|
||||
<p>Norman, U., Dinkar, T., Bruno, B., & Clavel, C. (2021).
|
||||
JUSThink Alignment Analysis. In Dialogue & Discourse
|
||||
(v1.0.0, Vol. 13, Number 2, pp. 1 - ;48). Zenodo. <a
|
||||
href="https://doi.org/10.5281/zenodo.4675070"
|
||||
>https://doi.org/10.5281/zenodo.4675070</a></p>
|
||||
</li>
|
||||
</ul>
|
||||
<p>
|
||||
<strong>3. Content</strong>
|
||||
</p>
|
||||
<p>The tools provided in this repository consists of 7 Jupyter Notebooks
|
||||
written in Python 3, and two additional external tools utilised by the
|
||||
notebooks.</p>
|
||||
<p>
|
||||
<strong>3.1. Jupyter Notebooks</strong>
|
||||
</p>
|
||||
<p>We highlight that the notebooks up until the last (i.e. to test the
|
||||
hypotheses (tools/7_test_the_hypotheses.ipynb)) present a general
|
||||
pipeline to process event logs, test responses and transcripts to
|
||||
extract measures of task performance, learning outcomes, and measures of
|
||||
alignment.</p>
|
||||
<ol>
|
||||
<li><strong>Extract task performance (and other features) from the logs
|
||||
</strong>(tools/1_extract_performance_and_other_features_from_logs.ipynb):
|
||||
Extracts various measures of task behaviour from the logs, at
|
||||
varying granularities of the activity (i.e. the whole corpus, task,
|
||||
attempt, and turn levels). In later notebooks, we focus on one of
|
||||
the features to estimate the task performance of a team: (minimum)
|
||||
error.</li>
|
||||
<li><strong>Extract learning outcomes from the test responses</strong>
|
||||
(tools/2_extract_learning_gain_from_test_responses.ipynb): Extracts
|
||||
measures of learning outcomes from the responses to the pre-test and
|
||||
the post-test. In later notebooks, we focus on one of the features
|
||||
to estimate the learning outcome of a team: relative learning gain
|
||||
<a href="https://sandbox.zenodo.org/record/742549#references"
|
||||
>[4]</a></li>
|
||||
<li><strong>Select and visualise a subset of teams for
|
||||
transcription</strong>
|
||||
(tools/3_visualise_transcribed_teams.ipynb): Visualises the
|
||||
transcribed teams among the other teams in the feature space spanned
|
||||
by task performance and learning outcome, as well as the
|
||||
distribution of their number of attempts and turns.</li>
|
||||
<li><strong>Extract routines from transcripts</strong>
|
||||
(tools/4_extract_routines_from_transcripts.ipynb) (uses <a
|
||||
href="https://github.com/GuillaumeDD/dialign">dialign</a> to
|
||||
extract routines): Extracts routines of referring expressions that
|
||||
are "fixed", i.e. become shared or established amongst
|
||||
interlocutors.</li>
|
||||
<li><strong>Combine transcripts with logs</strong>
|
||||
(tools/5_construct_the_corpus_by_combining_transcripts_with_logs.ipynb):
|
||||
Merges transcripts with event logs to have a combined dialogue and
|
||||
actions corpus, to be processed e.g. to detect follow-up
|
||||
actions.</li>
|
||||
<li><strong>Recognise instructions and detect follow-up actions</strong>
|
||||
(tools/6_recognise_instructions_detect_follow-up_actions.ipynb):
|
||||
Extracts verbalised instruction such as "connect Mount Basel to
|
||||
Montreux", and pairs them with the follow-up action that may
|
||||
<em>match</em> (e.g. if the other connects Basel to Montreux) or
|
||||
<em>mismatch</em> (e.g. if the other connects Basel to
|
||||
Neuchatel) with the instruction.</li>
|
||||
<li><strong>Test the hypotheses </strong>in <a
|
||||
href="https://sandbox.zenodo.org/record/742549#references"
|
||||
>[3]</a> (tools/7_test_the_hypotheses.ipynb) (uses
|
||||
<strong>effsize</strong> to estimate effect size, specifically
|
||||
Cliff's Delta): Considers each research questions and hypotheses
|
||||
studied in <a
|
||||
href="https://sandbox.zenodo.org/record/742549#references"
|
||||
>[3]</a> and generates the results in <a
|
||||
href="https://sandbox.zenodo.org/record/742549#references"
|
||||
>[3]</a>.</li>
|
||||
</ol>
|
||||
<p>
|
||||
<strong>3.2. External Tools</strong>
|
||||
</p>
|
||||
<ol>
|
||||
<li><strong><a href="https://github.com/GuillaumeDD/dialign">dialign</a>
|
||||
tool</strong> to extract routines, specifically <a
|
||||
href="https://github.com/GuillaumeDD/dialign/releases/tag/v1.0"
|
||||
>Release 1.0</a> from <a
|
||||
href="https://github.com/GuillaumeDD/dialign/releases/download/v1.0/dialign-1.0.zip"
|
||||
>dialign-1.0.zip</a>:\n It extracts routine expressions that are
|
||||
"shared" among the participants from transcripts. \n It is
|
||||
used as an external module (in accordance with its CeCILL-B License,
|
||||
see <strong>License</strong>).</li>
|
||||
<li><strong>effsize tool</strong> to compute estimators of effect
|
||||
size.\n We specifically use it to compute Cliff's Delta, which
|
||||
quantifies the amount difference between two groups of observations,
|
||||
by computing the Cliff's Delta statistic.\n It is taken from
|
||||
project <a
|
||||
href="https://acclab.github.io/DABEST-python-docs/index.html"
|
||||
>DABEST</a> (see <strong>License</strong>).</li>
|
||||
</ol>
|
||||
<p>
|
||||
<strong>4. Research Questions and Hypotheses in <a
|
||||
href="https://sandbox.zenodo.org/record/742549#references"
|
||||
>[3]</a></strong>
|
||||
</p>
|
||||
<ul>
|
||||
<li><strong>RQ1 Lexical alignment</strong>: How do the interlocutors
|
||||
<em>use</em> expressions related to the task? Is this associated
|
||||
with task success? <ul>
|
||||
<li><strong>H1.1</strong>: Task-specific referents become
|
||||
routine early for more successful teams.</li>
|
||||
<li><strong>H1.2</strong>: Hesitation phenomena are more likely
|
||||
to occur in the vicinity of priming and establishment of
|
||||
task-specific referents for more successful teams.</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li><strong>RQ2 Behavioural alignment</strong>: How do the interlocutors
|
||||
<em>follow up</em> these expressions with actions? Is this
|
||||
associated with task success? <ul>
|
||||
<li><strong>H2.1</strong>: Instructions are more likely to be
|
||||
followed by a corresponding action early in the dialogue for
|
||||
more successful teams.</li>
|
||||
<li><strong>H2.2</strong>: When instructions are followed by a
|
||||
corresponding or a different action, the action is more
|
||||
likely to be in the vicinity of information management
|
||||
phenomena for more successful teams.</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<p>The RQs and Hs are addressed in the notebook for testing the hypotheses
|
||||
(i.e. tools/7_test_the_hypotheses.ipynb).</p>
|
||||
<p>
|
||||
<strong>Acknowledgements</strong>
|
||||
</p>
|
||||
<p>This project has received funding from the European Union's Horizon
|
||||
2020 research and innovation programme under grant agreement No 765955.
|
||||
Namely, the <a href="https://www.animatas.eu/">ANIMATAS Project</a>.</p>
|
||||
<p>
|
||||
<strong>License</strong>
|
||||
</p>
|
||||
<p>The whole package is under MIT License, see the <strong>LICENSE</strong>
|
||||
file.</p>
|
||||
<p>Classes under the <strong>tools/effsize</strong> package were taken from
|
||||
project <a href="https://acclab.github.io/DABEST-python-docs/index.html"
|
||||
><strong>DABEST</strong></a>, Copyright 2016-2020 Joses W. Ho.
|
||||
These classes are licensed under the BSD 3-Clause Clear License. See
|
||||
<strong>tools/effsize/LICENSE</strong> file for additional
|
||||
details.</p>
|
||||
<p>Classes under the <strong>tools/dialign-1.0</strong> package were taken
|
||||
from project <strong><a href="https://github.com/GuillaumeDD/dialign"
|
||||
>dialign</a></strong>. These classes are licensed under the
|
||||
CeCILL-B License. This package is used as an "external
|
||||
module", see<strong> tools/dialign-1.0/LICENSE.txt</strong> for
|
||||
additional details.</p>
|
||||
</description>
|
||||
<country classid="" classname="" schemeid="" schemename=""/>
|
||||
<subject classid="" classname="" schemeid="" schemename=""/>
|
||||
<relevantdate classid="" classname="" schemeid="" schemename=""/>
|
||||
<publisher>Zenodo</publisher>
|
||||
<embargoenddate/>
|
||||
<journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol=""/>
|
||||
<source/>
|
||||
<fulltext/>
|
||||
<format/>
|
||||
<storagedate/>
|
||||
<resourcetype classid="" classname="" schemeid="" schemename=""/>
|
||||
<device/>
|
||||
<size/>
|
||||
<version/>
|
||||
<lastmetadataupdate/>
|
||||
<metadataversionnumber/>
|
||||
<documentationUrl/>
|
||||
<codeRepositoryUrl/>
|
||||
<programmingLanguage classid="" classname="" schemeid="" schemename=""/>
|
||||
<contactperson/>
|
||||
<contactgroup/>
|
||||
<tool/>
|
||||
<originalId>oai:zenodo.org:4675070</originalId>
|
||||
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
|
||||
<pid classid="oai" classname="Open Archives Initiative"
|
||||
schemeid="dnet:pid_types" schemename="dnet:pid_types"
|
||||
>oai:zenodo.org:4675070</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier"
|
||||
schemeid="dnet:pid_types" schemename="dnet:pid_types"
|
||||
>10.5281/zenodo.4675070</pid>
|
||||
<bestaccessright classid="OPEN" classname="Open Access"
|
||||
schemeid="dnet:access_modes" schemename="dnet:access_modes"/>
|
||||
<eoscifguidelines code="EOSC::Jupyter Notebook" label="EOSC::Jupyter Notebook"
|
||||
url="" semanticrelation="compliesWith"/>
|
||||
<datainfo>
|
||||
<inferred>false</inferred>
|
||||
<deletedbyinference>false</deletedbyinference>
|
||||
<trust>0.9</trust>
|
||||
<inferenceprovenance/>
|
||||
<provenanceaction classid="user:insert" classname="user:insert"
|
||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</datainfo>
|
||||
<rels>
|
||||
<rel inferred="false" trust="0.9" inferenceprovenance=""
|
||||
provenanceaction="user:claim">
|
||||
<to class="isProducedBy" scheme="dnet:result_project_relations"
|
||||
type="project">corda__h2020::c4515ebef538a734cf11f795347f5dac</to>
|
||||
<code>765955</code>
|
||||
<acronym>ANIMATAS</acronym>
|
||||
<title>Advancing intuitive human-machine interaction with human-like
|
||||
social capabilities for education in schools</title>
|
||||
<contracttype classid="" classname="" schemeid="" schemename=""/>
|
||||
<funding>
|
||||
<funder id="ec__________::EC" shortname="EC"
|
||||
name="European Commission" jurisdiction=""/>
|
||||
<funding_level_0 name="H2020"
|
||||
>ec__________::EC::H2020</funding_level_0>
|
||||
</funding>
|
||||
<websiteurl/>
|
||||
</rel>
|
||||
</rels>
|
||||
<children>
|
||||
<instance id="od______2659::3801993ea8f970cfc991277160edf277">
|
||||
<instancetype classid="0029" classname="Software"
|
||||
schemeid="dnet:publication_resource"
|
||||
schemename="dnet:publication_resource"/>
|
||||
<collectedfrom name="ZENODO"
|
||||
id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
|
||||
<hostedby name="ZENODO"
|
||||
id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
|
||||
<accessright classid="OPEN" classname="Open Access"
|
||||
schemeid="dnet:access_modes" schemename="dnet:access_modes"/>
|
||||
<dateofacceptance/>
|
||||
<webresource>
|
||||
<url>https://zenodo.org/record/4675070</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
</children>
|
||||
</oaf:result>
|
||||
</oaf:entity>
|
||||
</metadata>
|
||||
</result>
|
||||
</record>
|
|
@ -0,0 +1,429 @@
|
|||
<record>
|
||||
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri">
|
||||
<header xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<dri:objIdentifier>doi_dedup___::c054151b6a8c4f41c7acf160651a6503</dri:objIdentifier>
|
||||
<dri:dateOfCollection>2022-10-13T00:15:44+0000</dri:dateOfCollection>
|
||||
<dri:dateOfTransformation>2022-10-13T07:44:29.152Z</dri:dateOfTransformation>
|
||||
</header>
|
||||
<metadata>
|
||||
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://namespace.openaire.eu/oaf https://www.openaire.eu/schema/1.0/oaf-1.0.xsd">
|
||||
<oaf:result>
|
||||
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
|
||||
<collectedfrom name="Datacite" id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
|
||||
<originalId>oai:zenodo.org:4675070</originalId>
|
||||
<originalId>50|od______2659::3801993ea8f970cfc991277160edf277</originalId>
|
||||
<originalId>oai:zenodo.org:6974562</originalId>
|
||||
<originalId>50|od______2659::9c87ff4a5e7710052b873088e7265072</originalId>
|
||||
<originalId>10.5281/zenodo.4675069</originalId>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
|
||||
>10.5281/zenodo.4675070</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
|
||||
>10.5281/zenodo.6974562</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:actionset" trust="0.9">10.5281/zenodo.4675069</pid>
|
||||
<measure id="influence" score="4.916186E-9" class="C5"/>
|
||||
<measure id="popularity" score="6.885733E-9" class="C5"/>
|
||||
<measure id="influence_alt" score="0" class="C5"/>
|
||||
<measure id="popularity_alt" score="0.0" class="C5"/>
|
||||
<measure id="impulse" score="0" class="C5"/>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title"
|
||||
schemename="dnet:dataCite_title" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9">JUSThink Alignment
|
||||
Analysis</title>
|
||||
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes"
|
||||
schemename="dnet:access_modes"/>
|
||||
<creator rank="1" name="Utku" surname="Norman" orcid_pending="0000-0002-6802-1444"
|
||||
>Norman, Utku</creator>
|
||||
<creator rank="2" name="Tanvi" surname="Dinkar">Dinkar, Tanvi</creator>
|
||||
<creator rank="3" name="Barbara" surname="Bruno" orcid_pending="0000-0003-0953-7173"
|
||||
>Bruno, Barbara</creator>
|
||||
<creator rank="4" name="Chloé" surname="Clavel" orcid_pending="0000-0003-4850-3398"
|
||||
>Clavel, Chloé</creator>
|
||||
<dateofacceptance>2022-08-08</dateofacceptance>
|
||||
<description>&lt;strong>1. Description&lt;/strong> This repository
|
||||
contains&lt;strong> tools to automatically analyse how participants align
|
||||
their use of task-specific referents in their dialogue and actions for a
|
||||
collaborative learning activity, and how it relates to the task
|
||||
success&lt;/strong> (i.e. their learning outcomes and task performance). As
|
||||
a use case, it processes data from a collaborative problem solving activity
|
||||
named JUSThink [1, 2], i.e. JUSThink Dialogue and Actions Corpus data set that
|
||||
is available from the Zenodo Repository, DOI: 10.5281/zenodo.4627104, and
|
||||
reproduces the results and figures in [3]. In brief: &lt;strong>JUSThink
|
||||
Dialogue and Actions Corpus&lt;/strong> contains transcripts, event logs,
|
||||
and test responses of children aged 9 through 12, as they participate in the
|
||||
JUSThink activity [1, 2] in pairs of two, to solve a problem on graphs together.
|
||||
&lt;strong>The JUSThink activity and its study&lt;/strong> is first
|
||||
described in [1], and elaborated with findings concerning the link between
|
||||
children's learning, performance in the activity, and perception of self, the
|
||||
other and the robot in [2]. &lt;strong>Alignment analysis in our work
|
||||
[3]&lt;/strong> studies the participants' use of expressions that are
|
||||
related to the task at hand, their follow up actions of these expressions, and
|
||||
how it links to task success. &lt;strong>Changes in Release
|
||||
v1.1.0:&lt;/strong> updated with the publication information, finalized
|
||||
paper structure, research questions and hypotheses as in the published article:
|
||||
U. Norman*&lt;em>, &lt;/em>T. Dinkar*, B. Bruno, and C. Clavel,
|
||||
"Studying Alignment in a Collaborative Learning Activity via Automatic Methods:
|
||||
The Link Between What We Say and Do," Dialogue &amp;amp; Discourse, 13(2),
|
||||
1–48. *Contributed equally to this work. 10.5210/dad.2022.201.
|
||||
&lt;strong>Full Changelog:&lt;/strong>
|
||||
https://github.com/chili-epfl/justhink-alignment-analysis/compare/v1.0.0...v1.1.0
|
||||
&lt;strong>2. Publications&lt;/strong> If you use this work in an
|
||||
academic context, please cite the following publications: Norman*, U., Dinkar*,
|
||||
T., Bruno, B., &amp;amp; Clavel, C. (2022). Studying Alignment in a
|
||||
Collaborative Learning Activity via Automatic Methods: The Link Between What We
|
||||
Say and Do. Dialogue &amp;amp; Discourse, 13(2), 1–48. *Contributed equally
|
||||
to this work. https://doi.org/10.5210/dad.2022.201 Norman, U., Dinkar, T.,
|
||||
Bruno, B., &amp;amp; Clavel, C. (2021). JUSThink Alignment Analysis. In
|
||||
Dialogue &amp;amp; Discourse (v1.1.0, Vol. 13, Number 2, pp. 1–48). Zenodo.
|
||||
https://doi.org/10.5281/zenodo.6974562 &lt;strong>3. Content&lt;/strong>
|
||||
The tools provided in this repository consists of 7 Jupyter Notebooks written in
|
||||
Python 3, and two additional external tools utilised by the notebooks.
|
||||
&lt;strong>3.1. Jupyter Notebooks&lt;/strong> We highlight that the
|
||||
notebooks up until the last (i.e. to test the hypotheses
|
||||
(tools/7_test_the_hypotheses.ipynb)) present a general pipeline to process event
|
||||
logs, test responses and transcripts to extract measures of task performance,
|
||||
learning outcomes, and measures of alignment. &lt;strong>Extract task
|
||||
performance (and other features) from the logs
|
||||
&lt;/strong>(tools/1_extract_performance_and_other_features_from_logs.ipynb):
|
||||
Extracts various measures of task behaviour from the logs, at varying
|
||||
granularities of the activity (i.e. the whole corpus, task, attempt, and turn
|
||||
levels). In later notebooks, we focus on one of the features to estimate the
|
||||
task performance of a team: (minimum) error. &lt;strong>Extract learning
|
||||
outcomes from the test responses&lt;/strong>
|
||||
(tools/2_extract_learning_gain_from_test_responses.ipynb): Extracts measures of
|
||||
learning outcomes from the responses to the pre-test and the post-test. In later
|
||||
notebooks, we focus on one of the features to estimate the learning outcome of a
|
||||
team: relative learning gain [4] &lt;strong>Select and visualise a subset of
|
||||
teams for transcription&lt;/strong>
|
||||
(tools/3_visualise_transcribed_teams.ipynb): Visualises the transcribed teams
|
||||
among the other teams in the feature space spanned by task performance and
|
||||
learning outcome, as well as the distribution of their number of attempts and
|
||||
turns. &lt;strong>Extract routines from transcripts&lt;/strong>
|
||||
(tools/4_extract_routines_from_transcripts.ipynb) (uses dialign to extract
|
||||
routines): Extracts routines of referring expressions that are "fixed", i.e.
|
||||
become shared or established amongst interlocutors. &lt;strong>Combine
|
||||
transcripts with logs&lt;/strong>
|
||||
(tools/5_construct_the_corpus_by_combining_transcripts_with_logs.ipynb): Merges
|
||||
transcripts with event logs to have a combined dialogue and actions corpus, to
|
||||
be processed e.g. to detect follow-up actions. &lt;strong>Recognise
|
||||
instructions and detect follow-up actions&lt;/strong>
|
||||
(tools/6_recognise_instructions_detect_follow-up_actions.ipynb): Extracts
|
||||
verbalised instruction such as "connect Mount Basel to Montreux", and pairs them
|
||||
with the follow-up action that may &lt;em>match&lt;/em> (e.g. if the
|
||||
other connects Basel to Montreux) or &lt;em>mismatch&lt;/em> (e.g. if
|
||||
the other connects Basel to Neuchatel) with the instruction. &lt;strong>Test
|
||||
the hypotheses &lt;/strong>in [3] (tools/7_test_the_hypotheses.ipynb) (uses
|
||||
&lt;strong>effsize&lt;/strong> to estimate effect size, specifically
|
||||
Cliff's Delta): Considers each research questions and hypotheses studied in [3]
|
||||
and generates the results in [3]. &lt;strong>3.2. External
|
||||
Tools&lt;/strong> &lt;strong>dialign tool&lt;/strong> to extract
|
||||
routines, specifically Release 1.0 from dialign-1.0.zip:&lt;br> It extracts
|
||||
routine expressions that are "shared" among the participants from transcripts.
|
||||
&lt;br> It is used as an external module (in accordance with its CeCILL-B
|
||||
License, see &lt;strong>License&lt;/strong>). &lt;strong>effsize
|
||||
tool&lt;/strong> to compute estimators of effect size.&lt;br> We
|
||||
specifically use it to compute Cliff's Delta, which quantifies the amount
|
||||
difference between two groups of observations, by computing the Cliff's Delta
|
||||
statistic.&lt;br> It is taken from project DABEST (see
|
||||
&lt;strong>License&lt;/strong>). &lt;strong>4. Research Questions
|
||||
and Hypotheses in [3]&lt;/strong> &lt;strong>RQ1 Lexical
|
||||
alignment&lt;/strong>: How do the interlocutors &lt;em>use&lt;/em>
|
||||
expressions related to the task? Is this associated with task success?
|
||||
&lt;strong>H1.1&lt;/strong>: Task-specific referents become routine
|
||||
early for more successful teams. &lt;strong>H1.2&lt;/strong>: Hesitation
|
||||
phenomena are more likely to occur in the vicinity of priming and establishment
|
||||
of task-specific referents for more successful teams. &lt;strong>RQ2
|
||||
Behavioural alignment&lt;/strong>: How do the interlocutors
|
||||
&lt;em>follow up&lt;/em> these expressions with actions? Is this
|
||||
associated with task success? &lt;strong>H2.1&lt;/strong>: Instructions
|
||||
are more likely to be followed by a corresponding action early in the dialogue
|
||||
for more successful teams. &lt;strong>H2.2&lt;/strong>: When
|
||||
instructions are followed by a corresponding or a different action, the action
|
||||
is more likely to be in the vicinity of information management phenomena for
|
||||
more successful teams. The RQs and Hs are addressed in the notebook for testing
|
||||
the hypotheses (i.e. tools/7_test_the_hypotheses.ipynb).
|
||||
&lt;strong>Acknowledgements&lt;/strong> This project has received
|
||||
funding from the European Union's Horizon 2020 research and innovation programme
|
||||
under grant agreement No 765955. Namely, the ANIMATAS Project.
|
||||
&lt;strong>License&lt;/strong> The whole package is under MIT License,
|
||||
see the &lt;strong>LICENSE&lt;/strong> file. Classes under the
|
||||
&lt;strong>tools/effsize&lt;/strong> package were taken from project
|
||||
&lt;strong>DABEST&lt;/strong>, Copyright 2016-2020 Joses W. Ho. These
|
||||
classes are licensed under the BSD 3-Clause Clear License. See
|
||||
&lt;strong>tools/effsize/LICENSE&lt;/strong> file for additional
|
||||
details. Classes under the &lt;strong>tools/dialign-1.0&lt;/strong>
|
||||
package were taken from project &lt;strong>dialign&lt;/strong>. These
|
||||
classes are licensed under the CeCILL-B License. This package is used as an
|
||||
"external module", see&lt;strong>
|
||||
tools/dialign-1.0/LICENSE.txt&lt;/strong> for additional
|
||||
details.</description>
|
||||
<description>{"references": ["[1] J. Nasir, U. Norman, B. Bruno, and P. Dillenbourg,
|
||||
\"You Tell, I Do, and We Swap until we Connect All the Gold Mines!,\" ERCIM
|
||||
News, vol. 2020, no. 120, 2020, [Online]. Available:
|
||||
https://ercim-news.ercim.eu/en120/special/you-tell-i-do-and-we-swap-until-we-connect-all-the-gold-mines",
|
||||
"[2] J. Nasir*, U. Norman*, B. Bruno, and P. Dillenbourg, \"When Positive
|
||||
Perception of the Robot Has No Effect on Learning,\" in 2020 29th IEEE
|
||||
International Conference on Robot and Human Interactive Communication (RO-MAN),
|
||||
Aug. 2020, pp. 313\u2013320, doi: 10.1109/RO-MAN47096.2020.9223343", "[3] U.
|
||||
Norman*, T. Dinkar*, B. Bruno, and C. Clavel, \"Studying Alignment in a
|
||||
Collaborative Learning Activity via Automatic Methods: The Link Between What We
|
||||
Say and Do,\" Dialogue &amp;amp; Discourse, vol. 13, no. 2, pp. 1\u201348,
|
||||
Aug. 2022, doi: 10.5210/dad.2022.201.", "[4] M. Sangin, G. Molinari, M.-A.
|
||||
N\u00fcssli, and P. Dillenbourg, \"Facilitating peer knowledge modeling: Effects
|
||||
of a knowledge awareness tool on collaborative learning outcomes and
|
||||
processes,\"\" Computers in Human Behavior, vol. 27, no. 3, pp. 1059\u20131067,
|
||||
May 2011, doi: 10.1016/j.chb.2010.05.032."]}</description>
|
||||
<subject classid="keyword" classname="keyword"
|
||||
schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
|
||||
>alignment</subject>
|
||||
<subject classid="keyword" classname="keyword"
|
||||
schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9">situated
|
||||
dialogue</subject>
|
||||
<subject classid="keyword" classname="keyword"
|
||||
schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9">collaborative
|
||||
learning</subject>
|
||||
<subject classid="keyword" classname="keyword"
|
||||
schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9">spontaneous
|
||||
speech</subject>
|
||||
<subject classid="keyword" classname="keyword"
|
||||
schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
|
||||
>disfluency</subject>
|
||||
<subject classid="keyword" classname="keyword"
|
||||
schemeid="dnet:subject_classification_typologies"
|
||||
schemename="dnet:subject_classification_typologies" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9">mutual
|
||||
understanding</subject>
|
||||
<language classid="eng" classname="English" schemeid="dnet:languages"
|
||||
schemename="dnet:languages"/>
|
||||
<relevantdate classid="issued" classname="issued" schemeid="dnet:dataCite_date"
|
||||
schemename="dnet:dataCite_date" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
|
||||
>2021-04-09</relevantdate>
|
||||
<relevantdate classid="issued" classname="issued" schemeid="dnet:dataCite_date"
|
||||
schemename="dnet:dataCite_date" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
|
||||
>2022-08-08</relevantdate>
|
||||
<publisher>Zenodo</publisher>
|
||||
<resulttype classid="software" classname="software"
|
||||
schemeid="dnet:result_typologies" schemename="dnet:result_typologies"/>
|
||||
<resourcetype classid="UNKNOWN" classname="UNKNOWN"
|
||||
schemeid="dnet:dataCite_resource" schemename="dnet:dataCite_resource"/>
|
||||
<programmingLanguage/>
|
||||
<context id="EC" label="European Commission" type="funding">
|
||||
<category id="EC::H2020" label="Horizon 2020 Framework Programme">
|
||||
<concept id="EC::H2020::MSCA-ITN-ETN" label="European Training Networks"/>
|
||||
</category>
|
||||
</context>
|
||||
<eoscifguidelines code="EOSC::Jupyter Notebook"
|
||||
label="EOSC::Jupyter Notebook"
|
||||
url=""
|
||||
semanticrelation="compliesWith"/>
|
||||
<datainfo>
|
||||
<inferred>true</inferred>
|
||||
<deletedbyinference>false</deletedbyinference>
|
||||
<trust>0.8</trust>
|
||||
<inferenceprovenance>dedup-result-decisiontree-v3</inferenceprovenance>
|
||||
<provenanceaction classid="sysimport:dedup" classname="Inferred by OpenAIRE"
|
||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</datainfo>
|
||||
<rels>
|
||||
<rel inferred="false" trust="0.9" inferenceprovenance=""
|
||||
provenanceaction="sysimport:actionset">
|
||||
<to class="IsSupplementTo" scheme="dnet:result_result_relations"
|
||||
type="publication">doi_dedup___::ae235765bbc422195a6c9f632b2d77eb</to>
|
||||
<collectedfrom name="arXiv.org e-Print Archive"
|
||||
id="opendoar____::6f4922f45568161a8cdf4ad2299f6d23"/>
|
||||
<pid classid="arXiv" classname="arXiv" schemeid="dnet:pid_types"
|
||||
schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
|
||||
>2104.04429</pid>
|
||||
<collectedfrom name="Infoscience - EPFL scientific publications"
|
||||
id="opendoar____::eecca5b6365d9607ee5a9d336962c534"/>
|
||||
<publisher>arXiv</publisher>
|
||||
<collectedfrom name="Crossref"
|
||||
id="openaire____::081b82f96300b6a6e3d282bad31cb6e2"/>
|
||||
<dateofacceptance>2022-08-05</dateofacceptance>
|
||||
<title classid="main title" classname="main title"
|
||||
schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">Studying
|
||||
Alignment in a Collaborative Learning Activity via Automatic Methods:
|
||||
The Link Between What We Say and Do</title>
|
||||
<collectedfrom name="ORCID"
|
||||
id="openaire____::806360c771262b4d6770e7cdf04b5c5a"/>
|
||||
<collectedfrom name="Datacite"
|
||||
id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
|
||||
<pid classid="doi" classname="Digital Object Identifier"
|
||||
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:actionset" trust="0.9"
|
||||
>10.48550/arxiv.2104.04429</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier"
|
||||
schemeid="dnet:pid_types" schemename="dnet:pid_types"
|
||||
>10.5210/dad.2022.201</pid>
|
||||
</rel>
|
||||
<rel inferred="false" trust="0.9" inferenceprovenance=""
|
||||
provenanceaction="sysimport:actionset">
|
||||
<to class="isProducedBy" scheme="dnet:result_project_relations"
|
||||
type="project">corda__h2020::c4515ebef538a734cf11f795347f5dac</to>
|
||||
<title>Advancing intuitive human-machine interaction with human-like social
|
||||
capabilities for education in schools</title>
|
||||
<code>765955</code>
|
||||
<funding>
|
||||
<funder id="ec__________::EC" shortname="EC" name="European Commission"
|
||||
jurisdiction="EU"/>
|
||||
<funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
|
||||
<funding_level_1 name="MSCA-ITN-ETN"
|
||||
>ec__________::EC::H2020::MSCA-ITN-ETN</funding_level_1>
|
||||
</funding>
|
||||
<acronym>ANIMATAS</acronym>
|
||||
</rel>
|
||||
<rel inferred="false" trust="0.9" inferenceprovenance=""
|
||||
provenanceaction="sysimport:actionset">
|
||||
<to class="IsSupplementedBy" scheme="dnet:result_result_relations"
|
||||
type="dataset">doi_dedup___::0a6314b0ed275d915f5b57a259375691</to>
|
||||
<dateofacceptance>2021-03-22</dateofacceptance>
|
||||
<publisher>Zenodo</publisher>
|
||||
<pid classid="doi" classname="Digital Object Identifier"
|
||||
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
|
||||
>10.5281/zenodo.4627104</pid>
|
||||
<title classid="main title" classname="main title"
|
||||
schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title"
|
||||
inferred="false" provenanceaction="sysimport:crosswalk:repository"
|
||||
trust="0.9">JUSThink Dialogue and Actions Corpus</title>
|
||||
<pid classid="doi" classname="Digital Object Identifier"
|
||||
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:actionset" trust="0.9"
|
||||
>10.5281/zenodo.4627103</pid>
|
||||
<collectedfrom name="ZENODO"
|
||||
id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
|
||||
<collectedfrom name="Datacite"
|
||||
id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
|
||||
</rel>
|
||||
</rels>
|
||||
<children>
|
||||
<result objidentifier="doi_________::c054151b6a8c4f41c7acf160651a6503">
|
||||
<publisher>Zenodo</publisher>
|
||||
<pid classid="doi" classname="Digital Object Identifier"
|
||||
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
|
||||
>10.5281/zenodo.4675070</pid>
|
||||
<title classid="main title" classname="main title"
|
||||
schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title"
|
||||
inferred="false" provenanceaction="sysimport:crosswalk:repository"
|
||||
trust="0.9">JUSThink Alignment Analysis</title>
|
||||
<dateofacceptance>2021-04-09</dateofacceptance>
|
||||
<collectedfrom name="ZENODO"
|
||||
id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
|
||||
</result>
|
||||
<result objidentifier="doi_________::04aaa160a921cafdc90e03483de0a26f">
|
||||
<dateofacceptance>2022-08-08</dateofacceptance>
|
||||
<publisher>Zenodo</publisher>
|
||||
<pid classid="doi" classname="Digital Object Identifier"
|
||||
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
|
||||
>10.5281/zenodo.6974562</pid>
|
||||
<collectedfrom name="ZENODO"
|
||||
id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
|
||||
<title classid="main title" classname="main title"
|
||||
schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title"
|
||||
inferred="false" provenanceaction="sysimport:crosswalk:repository"
|
||||
trust="0.9">JUSThink Alignment Analysis (v1.1.0)</title>
|
||||
</result>
|
||||
<result objidentifier="doi_________::684a8fbe0ff09f288e9d29db897233bb">
|
||||
<title classid="main title" classname="main title"
|
||||
schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">JUSThink
|
||||
Alignment Analysis (v1.1.0)</title>
|
||||
<dateofacceptance>2022-08-08</dateofacceptance>
|
||||
<publisher>Zenodo</publisher>
|
||||
<pid classid="doi" classname="Digital Object Identifier"
|
||||
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:actionset" trust="0.9"
|
||||
>10.5281/zenodo.4675069</pid>
|
||||
<collectedfrom name="Datacite"
|
||||
id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
|
||||
</result>
|
||||
<instance>
|
||||
<accessright classid="OPEN" classname="Open Access"
|
||||
schemeid="dnet:access_modes" schemename="dnet:access_modes"/>
|
||||
<collectedfrom name="Datacite"
|
||||
id="openaire____::9e3be59865b2c1c335d32dae2fe7b254"/>
|
||||
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
|
||||
<dateofacceptance>2022-08-08</dateofacceptance>
|
||||
<instancetype classid="0029" classname="Software"
|
||||
schemeid="dnet:publication_resource"
|
||||
schemename="dnet:publication_resource"/>
|
||||
<pid classid="doi" classname="Digital Object Identifier"
|
||||
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:actionset" trust="0.9"
|
||||
>10.5281/zenodo.4675069</pid>
|
||||
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels"
|
||||
schemename="dnet:review_levels"/>
|
||||
<license>https://opensource.org/licenses/MIT</license>
|
||||
<webresource>
|
||||
<url>https://doi.org/10.5281/zenodo.4675069</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
<instance>
|
||||
<accessright classid="OPEN" classname="Open Access"
|
||||
schemeid="dnet:access_modes" schemename="dnet:access_modes"/>
|
||||
<collectedfrom name="ZENODO"
|
||||
id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
|
||||
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
|
||||
<dateofacceptance>2022-08-08</dateofacceptance>
|
||||
<instancetype classid="0029" classname="Software"
|
||||
schemeid="dnet:publication_resource"
|
||||
schemename="dnet:publication_resource"/>
|
||||
<pid classid="doi" classname="Digital Object Identifier"
|
||||
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
|
||||
>10.5281/zenodo.6974562</pid>
|
||||
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels"
|
||||
schemename="dnet:review_levels"/>
|
||||
<license>https://opensource.org/licenses/MIT</license>
|
||||
<webresource>
|
||||
<url>https://doi.org/10.5281/zenodo.6974562</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
<instance>
|
||||
<accessright classid="OPEN" classname="Open Access"
|
||||
schemeid="dnet:access_modes" schemename="dnet:access_modes"/>
|
||||
<collectedfrom name="ZENODO"
|
||||
id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
|
||||
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69"/>
|
||||
<dateofacceptance>2021-04-09</dateofacceptance>
|
||||
<instancetype classid="0029" classname="Software"
|
||||
schemeid="dnet:publication_resource"
|
||||
schemename="dnet:publication_resource"/>
|
||||
<pid classid="doi" classname="Digital Object Identifier"
|
||||
schemeid="dnet:pid_types" schemename="dnet:pid_types" inferred="false"
|
||||
provenanceaction="sysimport:crosswalk:repository" trust="0.9"
|
||||
>10.5281/zenodo.4675070</pid>
|
||||
<refereed classid="0000" classname="UNKNOWN" schemeid="dnet:review_levels"
|
||||
schemename="dnet:review_levels"/>
|
||||
<license>https://opensource.org/licenses/MIT</license>
|
||||
<webresource>
|
||||
<url>https://doi.org/10.5281/zenodo.4675070</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
</children>
|
||||
</oaf:result>
|
||||
</oaf:entity>
|
||||
</metadata>
|
||||
</result>
|
||||
</record>
|
|
@ -0,0 +1,99 @@
|
|||
<record>
|
||||
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header>
|
||||
<dri:objIdentifier>doi_________::9cb0664d4c891c4baaf73f007c0c9de0</dri:objIdentifier>
|
||||
<dri:dateOfCollection>2022-11-25T12:55:13Z</dri:dateOfCollection>
|
||||
<dri:status>under curation</dri:status>
|
||||
<counters />
|
||||
</header>
|
||||
<metadata>
|
||||
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
|
||||
<oaf:result>
|
||||
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">COVID-19 Knowledge Graph: A semantic resource embedding biological and chemical entities</title>
|
||||
<creator rank="1" name="" surname="">Karki, Reagon</creator>
|
||||
<dateofacceptance />
|
||||
<resulttype classid="software" classname="software" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
|
||||
<language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages" />
|
||||
<description><p>A Knowledge graph representation of compounds and associated biological entities in the BY-COVID and EOSC Future project.</p> <p><strong>Current status</strong></p> <ul> <li>Number of Nodes: 35952</li> <li>Number of Edges: 279462</li> <li>Human Proteins: 1347</li> <li>Assay: 15835</li> <li>Chemical/Compound: 4096</li> <li>Mechanism of Action: 739</li> <li>Pathway: 1513</li> <li>Disease: 1585</li> <li>SideEffect: 7420</li> <li>Biological Process: 2085</li> <li>Molecular Function: 1332</li> </ul> <p>Please check the BY_COVID_update_August.ipynb for understanding step wise process of KG generation and KG statistics. The KG has been exported to formats such as graphml, sif and so on for visualizations in other platforms. For example, the graphml file can be imported to Cytoscape directly. These files are located in 'data\export' folder.</p> <p></p></description>
|
||||
<country classid="" classname="" schemeid="" schemename="" />
|
||||
<subject classid="" classname="" schemeid="" schemename="" />
|
||||
<relevantdate classid="" classname="" schemeid="" schemename="" />
|
||||
<publisher>Zenodo</publisher>
|
||||
<embargoenddate />
|
||||
<journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol="" />
|
||||
<source />
|
||||
<fulltext />
|
||||
<format />
|
||||
<storagedate />
|
||||
<resourcetype classid="" classname="" schemeid="" schemename="" />
|
||||
<device />
|
||||
<size />
|
||||
<version />
|
||||
<lastmetadataupdate />
|
||||
<metadataversionnumber />
|
||||
<documentationUrl />
|
||||
<codeRepositoryUrl />
|
||||
<programmingLanguage classid="" classname="" schemeid="" schemename="" />
|
||||
<contactperson />
|
||||
<contactgroup />
|
||||
<tool />
|
||||
<originalId>oai:zenodo.org:7351221</originalId>
|
||||
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
|
||||
<pid classid="oai" classname="Open Archives Initiative" schemeid="dnet:pid_types" schemename="dnet:pid_types">oai:zenodo.org:7351221</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5281/zenodo.7351221</pid>
|
||||
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
|
||||
<context id="covid-19" label="COVID-19" type="community"></context>
|
||||
<datainfo>
|
||||
<inferred>false</inferred>
|
||||
<deletedbyinference>false</deletedbyinference>
|
||||
<trust>0.9</trust>
|
||||
<inferenceprovenance />
|
||||
<provenanceaction classid="user:insert" classname="user:insert" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
|
||||
</datainfo>
|
||||
<rels>
|
||||
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
|
||||
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::256485716fdb9f5ca69007b7ca5a072b</to>
|
||||
<code>101017536</code>
|
||||
<acronym>EOSC Future</acronym>
|
||||
<title>EOSC Future</title>
|
||||
<contracttype classid="" classname="" schemeid="" schemename="" />
|
||||
<funding>
|
||||
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
|
||||
<funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
|
||||
</funding>
|
||||
<websiteurl />
|
||||
</rel>
|
||||
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
|
||||
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::4a3254eac2997eee0a9dcb7a7daedb81</to>
|
||||
<code>101046203</code>
|
||||
<acronym>BY-COVID</acronym>
|
||||
<title>Beyond COVID</title>
|
||||
<contracttype classid="" classname="" schemeid="" schemename="" />
|
||||
<funding>
|
||||
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
|
||||
<funding_level_0 name="Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based">ec__________::EC::Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based</funding_level_0>
|
||||
</funding>
|
||||
<websiteurl />
|
||||
</rel>
|
||||
</rels>
|
||||
<children>
|
||||
<instance id="od______2659::040cee965a4544e343a2ba149783c3fc">
|
||||
<instancetype classid="0029" classname="Software" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
|
||||
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
|
||||
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
|
||||
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
|
||||
<dateofacceptance />
|
||||
<webresource>
|
||||
<url>https://zenodo.org/record/7351221</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
</children>
|
||||
</oaf:result>
|
||||
</oaf:entity>
|
||||
</metadata>
|
||||
</result>
|
||||
</record>
|
|
@ -0,0 +1,100 @@
|
|||
<record>
|
||||
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header>
|
||||
<dri:objIdentifier>doi_________::07fdccabd77830e3caccf0b33c083f1b</dri:objIdentifier>
|
||||
<dri:dateOfCollection>2022-11-25T01:08:31Z</dri:dateOfCollection>
|
||||
<dri:status>under curation</dri:status>
|
||||
<counters />
|
||||
</header>
|
||||
<metadata>
|
||||
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
|
||||
<oaf:result>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">Monkeypox Knowledge Graph: A semantic resource embedding biological and chemical entities</title>
|
||||
<creator rank="1" name="" surname="">Karki, Reagon</creator>
|
||||
<creator rank="2" name="" surname="">Andrea, Zaliani</creator>
|
||||
<creator rank="3" name="" surname="">Gadiya, Yojana</creator>
|
||||
<creator rank="4" name="" surname="">Gribbon, Philip</creator>
|
||||
<dateofacceptance />
|
||||
<resulttype classid="software" classname="software" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
|
||||
<language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages" />
|
||||
<description><p>The Monkeypox KG is built using viral and human proteins reported in different resources. Additionally, the KG represents chemicals tested against Monkeypox and their targets, associated biological processes, molecular functions, diseases and side effects.</p> <p><strong>KG status</strong></p> <p>Version 1 stats:</p> <ul> <li>Number of Nodes: 8235</li> <li>Number of Edges: 40422</li> </ul> <p>Version 2 stats (2nd September) :</p> <ul> <li>Number of Nodes: 9129</li> <li>Number of Edges: 44568</li> </ul> <p>Please check the graph.ipynb for understanding step wise process of KG generation and KG statistics. The KG has been exported to formats such as graphml, sif and so on for visualizations in other platforms. For example, the graphml file can be imported to Cytoscape directly. These files are located in 'data\export' folder.</p> <p></p></description>
|
||||
<country classid="" classname="" schemeid="" schemename="" />
|
||||
<subject classid="" classname="" schemeid="" schemename="" />
|
||||
<relevantdate classid="" classname="" schemeid="" schemename="" />
|
||||
<publisher>Zenodo</publisher>
|
||||
<embargoenddate />
|
||||
<journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol="" />
|
||||
<source />
|
||||
<fulltext />
|
||||
<format />
|
||||
<storagedate />
|
||||
<resourcetype classid="" classname="" schemeid="" schemename="" />
|
||||
<device />
|
||||
<size />
|
||||
<version />
|
||||
<lastmetadataupdate />
|
||||
<metadataversionnumber />
|
||||
<documentationUrl />
|
||||
<codeRepositoryUrl />
|
||||
<programmingLanguage classid="" classname="" schemeid="" schemename="" />
|
||||
<contactperson />
|
||||
<contactgroup />
|
||||
<tool />
|
||||
<originalId>oai:zenodo.org:7351393</originalId>
|
||||
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
|
||||
<pid classid="oai" classname="Open Archives Initiative" schemeid="dnet:pid_types" schemename="dnet:pid_types">oai:zenodo.org:7351393</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5281/zenodo.7351393</pid>
|
||||
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
|
||||
<datainfo>
|
||||
<inferred>false</inferred>
|
||||
<deletedbyinference>false</deletedbyinference>
|
||||
<trust>0.9</trust>
|
||||
<inferenceprovenance />
|
||||
<provenanceaction classid="user:insert" classname="user:insert" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
|
||||
</datainfo>
|
||||
<rels>
|
||||
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
|
||||
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::256485716fdb9f5ca69007b7ca5a072b</to>
|
||||
<code>101017536</code>
|
||||
<acronym>EOSC Future</acronym>
|
||||
<title>EOSC Future</title>
|
||||
<contracttype classid="" classname="" schemeid="" schemename="" />
|
||||
<funding>
|
||||
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
|
||||
<funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
|
||||
</funding>
|
||||
<websiteurl />
|
||||
</rel>
|
||||
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
|
||||
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::4a3254eac2997eee0a9dcb7a7daedb81</to>
|
||||
<code>101046203</code>
|
||||
<acronym>BY-COVID</acronym>
|
||||
<title>Beyond COVID</title>
|
||||
<contracttype classid="" classname="" schemeid="" schemename="" />
|
||||
<funding>
|
||||
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
|
||||
<funding_level_0 name="Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based">ec__________::EC::Horizon Europe Framework Programme - HORIZON-RIA\HORIZON Action Grant Budget-Based</funding_level_0>
|
||||
</funding>
|
||||
<websiteurl />
|
||||
</rel>
|
||||
</rels>
|
||||
<children>
|
||||
<instance id="od______2659::db2bc6381545f80dc9feec808a173ec0">
|
||||
<instancetype classid="0029" classname="Software" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
|
||||
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
|
||||
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
|
||||
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
|
||||
<dateofacceptance />
|
||||
<webresource>
|
||||
<url>https://zenodo.org/record/7351393</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
</children>
|
||||
</oaf:result>
|
||||
</oaf:entity>
|
||||
</metadata>
|
||||
</result>
|
||||
</record>
|
|
@ -0,0 +1,85 @@
|
|||
<record>
|
||||
<result xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header>
|
||||
<dri:objIdentifier>doi_________::93d39dd7edef016928788c3500e149f1</dri:objIdentifier>
|
||||
<dri:dateOfCollection>2022-11-24T08:41:37Z</dri:dateOfCollection>
|
||||
<dri:status>under curation</dri:status>
|
||||
<counters/>
|
||||
</header>
|
||||
<metadata>
|
||||
<oaf:entity xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://namespace.openaire.eu/oaf http://namespace.openaire.eu/oaf http://www.openaire.eu/schema/0.2/oaf-0.2.xsd">
|
||||
<oaf:result>
|
||||
<title classid="main title" classname="main title" schemeid="dnet:dataCite_title" schemename="dnet:dataCite_title">ENVRI SP - Dashboard State of the Environment - Ocean Indicators</title>
|
||||
<creator rank="1" name="" surname="">Tjerk Krijger</creator>
|
||||
<dateofacceptance />
|
||||
<resulttype classid="other" classname="other" schemeid="dnet:result_typologies" schemename="dnet:result_typologies" />
|
||||
<language classid="" classname="" schemeid="dnet:languages" schemename="dnet:languages" />
|
||||
<description><p>The attached .yaml file is used as input to the Dashboard State of the Environment, which is a science project of the ENVRI-FAIR science cluster within EOSC-FUTURE. The contents of the file enable the visualization of Ocean indicators on the dashboard. It is possible to download the attached file and change the contents to include indicators from different domains such as atmosphere or biodiversity.</p></description>
|
||||
<country classid="" classname="" schemeid="" schemename="" />
|
||||
<subject classid="" classname="" schemeid="" schemename="" />
|
||||
<relevantdate classid="" classname="" schemeid="" schemename="" />
|
||||
<publisher>Zenodo</publisher>
|
||||
<embargoenddate />
|
||||
<journal issn="" eissn="" lissn="" ep="" iss="" sp="" vol="" />
|
||||
<source />
|
||||
<fulltext />
|
||||
<format />
|
||||
<storagedate />
|
||||
<resourcetype classid="" classname="" schemeid="" schemename="" />
|
||||
<device />
|
||||
<size />
|
||||
<version />
|
||||
<lastmetadataupdate />
|
||||
<metadataversionnumber />
|
||||
<documentationUrl />
|
||||
<codeRepositoryUrl />
|
||||
<programmingLanguage classid="" classname="" schemeid="" schemename="" />
|
||||
<contactperson />
|
||||
<contactgroup />
|
||||
<tool />
|
||||
<originalId>oai:zenodo.org:7353841</originalId>
|
||||
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
|
||||
<pid classid="oai" classname="Open Archives Initiative" schemeid="dnet:pid_types" schemename="dnet:pid_types">oai:zenodo.org:7353841</pid>
|
||||
<pid classid="doi" classname="Digital Object Identifier" schemeid="dnet:pid_types" schemename="dnet:pid_types">10.5281/zenodo.7353841</pid>
|
||||
<bestaccessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
|
||||
<datainfo>
|
||||
<inferred>false</inferred>
|
||||
<deletedbyinference>false</deletedbyinference>
|
||||
<trust>0.9</trust>
|
||||
<inferenceprovenance />
|
||||
<provenanceaction classid="user:insert" classname="user:insert" schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions" />
|
||||
</datainfo>
|
||||
<rels>
|
||||
<rel inferred="false" trust="0.9" inferenceprovenance="" provenanceaction="user:claim">
|
||||
<to class="isProducedBy" scheme="dnet:result_project_relations" type="project">corda__h2020::256485716fdb9f5ca69007b7ca5a072b</to>
|
||||
<code>101017536</code>
|
||||
<acronym>EOSC Future</acronym>
|
||||
<title>EOSC Future</title>
|
||||
<contracttype classid="" classname="" schemeid="" schemename="" />
|
||||
<funding>
|
||||
<funder id="ec__________::EC" shortname="EC" name="European Commission" jurisdiction="" />
|
||||
<funding_level_0 name="H2020">ec__________::EC::H2020</funding_level_0>
|
||||
</funding>
|
||||
<websiteurl />
|
||||
</rel>
|
||||
</rels>
|
||||
<children>
|
||||
<instance id="od______2659::3e4323c221f269e5f3d6db4c61dd2ec8">
|
||||
<instancetype classid="0020" classname="Other ORP type" schemeid="dnet:publication_resource" schemename="dnet:publication_resource" />
|
||||
<collectedfrom name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
|
||||
<hostedby name="ZENODO" id="opendoar____::358aee4cc897452c00244351e4d91f69" />
|
||||
<accessright classid="OPEN" classname="Open Access" schemeid="dnet:access_modes" schemename="dnet:access_modes" />
|
||||
<dateofacceptance />
|
||||
<webresource>
|
||||
<url>https://zenodo.org/record/7353841</url>
|
||||
</webresource>
|
||||
</instance>
|
||||
</children>
|
||||
</oaf:result>
|
||||
</oaf:entity>
|
||||
</metadata>
|
||||
</result>
|
||||
</record>
|
|
@ -2,11 +2,11 @@
|
|||
<FIELDS>
|
||||
<FIELD indexable="false" name="oafentity" result="true" stat="false" tokenizable="false" xpath="//*[local-name() = 'entity']"/>
|
||||
<FIELD indexable="true" name="oaftype" result="false" stat="false" tokenizable="false" value="local-name(//*[local-name()='entity']/*[local-name() != 'extraInfo'])"/>
|
||||
<FIELD indexable="true" name="objIdentifier" result="false" stat="false" tokenizable="false" xpath="//header/dri:objIdentifier"/><!-- DATASOURCE FIELDS -->
|
||||
<FIELD indexable="true" name="datasourceofficialname" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/officialname"/>
|
||||
<FIELD indexable="true" name="datasourceenglishname" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/englishname"/>
|
||||
<FIELD indexable="true" name="datasourceoddescription" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/oddescription"/>
|
||||
<FIELD indexable="true" name="datasourceodsubjects" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/odsubjects"/>
|
||||
<FIELD indexable="true" name="objIdentifier" result="false" stat="false" tokenizable="false" xpath="//header/dri:objIdentifier"/><!-- DATASOURCE FIELDS -->
|
||||
<FIELD copy="true" indexable="true" name="datasourceofficialname" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/officialname"/>
|
||||
<FIELD copy="true" indexable="true" name="datasourceenglishname" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/englishname"/>
|
||||
<FIELD copy="true" indexable="true" name="datasourceoddescription" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/oddescription"/>
|
||||
<FIELD copy="true" indexable="true" name="datasourceodsubjects" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/odsubjects"/>
|
||||
<FIELD indexable="true" name="datasourceodlanguages" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/odlanguages"/>
|
||||
<FIELD indexable="true" name="datasourceodcontenttypes" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/odcontenttypes"/>
|
||||
<FIELD indexable="true" multivalued="false" name="datasourcetypename" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/datasourcetype/@classname"/>
|
||||
|
@ -14,17 +14,16 @@
|
|||
<FIELD indexable="true" multivalued="false" name="datasourcetypeuiname" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/datasourcetypeui/@classname"/>
|
||||
<FIELD indexable="true" multivalued="false" name="datasourcecompatibilityid" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/openairecompatibility/@classid"/>
|
||||
<FIELD indexable="true" multivalued="false" name="datasourcecompatibilityname" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/openairecompatibility/@classname"/>
|
||||
<FIELD indexable="true" multivalued="true" name="datasourcesubject" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='datasource']/subjects"/>
|
||||
<FIELD indexable="true" name="versioning" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/versioning"/>
|
||||
<!-- datasource fields for EOSC -->
|
||||
<FIELD indexable="true" name="datasourcejurisdiction" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/jurisdiction"/>
|
||||
<FIELD copy="true" indexable="true" multivalued="true" name="datasourcesubject" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='datasource']/subjects"/>
|
||||
<FIELD indexable="true" name="versioning" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/versioning"/><!-- datasource fields for EOSC -->
|
||||
<FIELD indexable="true" name="datasourcejurisdiction" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/jurisdiction/@classname"/>
|
||||
<FIELD indexable="true" name="datasourcethematic" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/thematic"/>
|
||||
<FIELD indexable="true" name="datasourceknowledge_graph" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/knowledgegraph"/>
|
||||
<FIELD indexable="true" name="datasourcecontentpolicy" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/contentpolicy"/>
|
||||
<!-- ORGANIZATION FIELDS -->
|
||||
<FIELD indexable="true" name="organizationlegalshortname" result="false" stat="false" type="ngramtext" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='organization']//legalshortname)"/>
|
||||
<FIELD indexable="true" name="organizationlegalname" result="false" stat="false" type="ngramtext" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='organization']//legalname)"/>
|
||||
<FIELD indexable="true" name="organizationalternativenames" result="false" stat="false" type="ngramtext" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='organization']//alternativeNames)"/>
|
||||
<FIELD indexable="true" name="datasourcecontentpolicy" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/contentpolicy/@classname"/>
|
||||
<FIELD indexable="true" name="eosctype" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/eosctype/@classname"/>
|
||||
<FIELD indexable="true" name="eoscdatasourcetype" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='datasource']/eoscdatasourcetype/@classname"/><!-- ORGANIZATION FIELDS --><!-- ORGANIZATION FIELDS --><!-- ORGANIZATION FIELDS -->
|
||||
<FIELD copy="true" indexable="true" name="organizationlegalshortname" result="false" stat="false" type="ngramtext" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='organization']//legalshortname)"/>
|
||||
<FIELD copy="true" indexable="true" name="organizationlegalname" result="false" stat="false" type="ngramtext" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='organization']//legalname)"/>
|
||||
<FIELD copy="true" indexable="true" name="organizationalternativenames" result="false" stat="false" type="ngramtext" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='organization']//alternativeNames)"/>
|
||||
<FIELD indexable="true" name="organizationeclegalbody" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/eclegalbody"/>
|
||||
<FIELD indexable="true" name="organizationeclegalperson" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/eclegalperson"/>
|
||||
<FIELD indexable="true" name="organizationecnonprofit" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/ecnonprofit"/>
|
||||
|
@ -34,18 +33,17 @@
|
|||
<FIELD indexable="true" name="organizationecenterprise" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/ecenterprise"/>
|
||||
<FIELD indexable="true" name="organizationecsmevalidated" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/ecsmevalidated"/>
|
||||
<FIELD indexable="true" name="organizationecnutscode" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/ecnutscode"/>
|
||||
<FIELD indexable="true" multivalued="false" name="organizationcountryname" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/country/@classname"/>
|
||||
<!-- PROJECT FIELDS -->
|
||||
<FIELD indexable="true" name="projectcode" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='project']/code"/>
|
||||
<FIELD indexable="true" multivalued="false" name="organizationcountryname" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='organization']/country/@classname"/><!-- PROJECT FIELDS -->
|
||||
<FIELD copy="true" indexable="true" name="projectcode" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='project']/code"/>
|
||||
<FIELD indexable="true" name="projectcode_nt" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/code"/>
|
||||
<FIELD indexable="true" name="projectacronym" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='project']/acronym"/>
|
||||
<FIELD indexable="true" name="projecttitle" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='project']/title"/>
|
||||
<FIELD indexable="true" multivalued="false" name="projectstartdate" result="false" stat="false" type="pdate" value="//*[local-name()='entity']/*[local-name()='project']/startdate"/>
|
||||
<FIELD copy="true" indexable="true" name="projectacronym" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='project']/acronym"/>
|
||||
<FIELD copy="true" indexable="true" name="projecttitle" result="false" stat="false" type="ngramtext" xpath="//*[local-name()='entity']/*[local-name()='project']/title"/>
|
||||
<FIELD indexable="true" multivalued="false" name="projectstartdate" result="false" stat="false" type="date" value="//*[local-name()='entity']/*[local-name()='project']/startdate"/>
|
||||
<FIELD indexable="true" multivalued="false" name="projectstartyear" result="false" stat="false" tokenizable="false" value="dnet:extractYear(//*[local-name()='entity']/*[local-name()='project']/startdate)"/>
|
||||
<FIELD indexable="true" multivalued="false" name="projectenddate" result="false" stat="false" type="pdate" value="//*[local-name()='entity']/*[local-name()='project']/enddate"/>
|
||||
<FIELD indexable="true" multivalued="false" name="projectenddate" result="false" stat="false" type="date" value="//*[local-name()='entity']/*[local-name()='project']/enddate"/>
|
||||
<FIELD indexable="true" multivalued="false" name="projectendyear" result="false" stat="false" tokenizable="false" value="dnet:extractYear(//*[local-name()='entity']/*[local-name()='project']/enddate)"/>
|
||||
<FIELD indexable="true" multivalued="false" name="projectcallidentifier" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/callidentifier"/>
|
||||
<FIELD indexable="true" name="projectkeywords" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/keywords"/>
|
||||
<FIELD copy="true" indexable="true" name="projectkeywords" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/keywords"/>
|
||||
<FIELD indexable="true" multivalued="false" name="projectduration" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/duration"/>
|
||||
<FIELD indexable="true" multivalued="false" name="projectecsc39" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='project']/ecsc39)"/>
|
||||
<FIELD indexable="true" multivalued="false" name="projectoamandatepublications" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/oamandatepublications"/>
|
||||
|
@ -54,35 +52,36 @@
|
|||
<FIELD indexable="true" multivalued="false" name="projectcontracttypename" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/contracttype/@classname"/>
|
||||
<FIELD indexable="true" name="fundinglevel0_id" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_0/id"/>
|
||||
<FIELD indexable="true" name="fundinglevel0_name" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_0/name"/>
|
||||
<FIELD indexable="true" name="fundinglevel0_description" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_0/description"/>
|
||||
<FIELD copy="true" indexable="true" name="fundinglevel0_description" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_0/description"/>
|
||||
<FIELD indexable="true" name="fundinglevel1_id" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_1/id"/>
|
||||
<FIELD indexable="true" name="fundinglevel1_name" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_1/name"/>
|
||||
<FIELD indexable="true" name="fundinglevel1_description" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_1/description"/>
|
||||
<FIELD copy="true" indexable="true" name="fundinglevel1_description" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_1/description"/>
|
||||
<FIELD indexable="true" name="fundinglevel2_id" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_2/id"/>
|
||||
<FIELD indexable="true" name="fundinglevel2_name" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_2/name"/>
|
||||
<FIELD indexable="true" name="fundinglevel2_description" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_2/description"/><!-- PROJECTS' FUNDER FIELDS: indexable only with the new funding path/context handling -->
|
||||
<FIELD copy="true" indexable="true" name="fundinglevel2_description" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree//funding_level_2/description"/><!-- PROJECTS' FUNDER FIELDS: indexable only with the new funding path/context handling -->
|
||||
<FIELD indexable="true" name="funder" result="false" stat="false" tokenizable="false" value="concat(./id/text(), '||', ./name/text(), '||', ./shortname/text())" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree/funder"/>
|
||||
<FIELD indexable="true" name="fundershortname" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree/funder/shortname"/>
|
||||
<FIELD indexable="true" name="funderid" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree/funder/id"/>
|
||||
<FIELD indexable="true" name="fundername" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree/funder/name"/>
|
||||
<FIELD indexable="true" name="funderoriginalname" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree/funder/originalname"/>
|
||||
<FIELD indexable="true" name="funderjurisdiction" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='project']/fundingtree/funder/jurisdiction"/><!-- RESULT FIELDS -->
|
||||
<FIELD indexable="true" name="resulttitle" result="false" stat="false" xpath="//*[local-name() = 'entity']/*[local-name() ='result']/title | //*[local-name()='entity']/*[local-name()='result']/children/result/title"/>
|
||||
<FIELD indexable="true" name="resultsubject" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/subject)"/>
|
||||
<FIELD copy="true" indexable="true" name="resulttitle" result="false" stat="false" type="text_en" xpath="//*[local-name() = 'entity']/*[local-name() ='result']/title | //*[local-name()='entity']/*[local-name()='result']/children/result/title"/>
|
||||
<FIELD indexable="true" name="resultsubject" result="false" stat="false" type="text_en" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/subject)"/>
|
||||
<FIELD indexable="true" name="resultsubjectclass" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/subject/@classname)"/>
|
||||
<FIELD indexable="true" multivalued="false" name="resultembargoenddate" result="false" stat="false" type="pdate" value="//*[local-name()='entity']/*[local-name()='result']/embargoenddate"/>
|
||||
<FIELD indexable="true" multivalued="false" name="resultembargoenddate" result="false" stat="false" type="date" value="//*[local-name()='entity']/*[local-name()='result']/embargoenddate"/>
|
||||
<FIELD indexable="true" multivalued="false" name="resultembargoendyear" result="false" stat="false" tokenizable="false" value="dnet:extractYear(//*[local-name()='entity']/*[local-name()='result']/embargoenddate)"/>
|
||||
<FIELD indexable="true" multivalued="false" name="resulttypeid" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='result']/resulttype/@classid"/>
|
||||
<FIELD indexable="true" multivalued="false" name="resulttypename" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='result']/resulttype/@classname"/>
|
||||
<FIELD indexable="true" multivalued="false" name="resultlanguagename" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='result']/language/@classname"/>
|
||||
<FIELD indexable="true" name="resultpublisher" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='result']/*[local-name()='publisher']"/>
|
||||
<FIELD indexable="true" name="resultdescription" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='result']//*[local-name()='description']"/>
|
||||
<FIELD copy="true" indexable="true" name="resultpublisher" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='result']/*[local-name()='publisher']"/>
|
||||
<FIELD copy="true" indexable="true" name="resultdescription" result="false" stat="false" type="text_en" xpath="//*[local-name()='entity']/*[local-name()='result']//*[local-name()='description']"/>
|
||||
<FIELD indexable="true" name="resultlicense" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='result']/children/instance/license"/>
|
||||
<FIELD indexable="true" name="resultaccessright" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*[local-name()='result']/children/instance/accessright/@classname"/>
|
||||
<FIELD indexable="true" name="resultresourcetypename" result="false" stat="false" type="string_ci" xpath="//*[local-name()='entity']/*[local-name()='result']/resourcetype/@classname"/>
|
||||
<FIELD indexable="true" multivalued="false" name="resultbestaccessright" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/bestaccessright/@classname)"/>
|
||||
<FIELD indexable="true" multivalued="false" name="resultdateofacceptance" result="false" stat="false" type="pdate" value="//*[local-name()='entity']/*[local-name()='result']/dateofacceptance"/>
|
||||
<FIELD indexable="true" multivalued="false" name="resultacceptanceyear" result="false" stat="false" tokenizable="false" value="dnet:extractYear(//*[local-name()='entity']/*[local-name()='result']/dateofacceptance)"/>
|
||||
<FIELD indexable="true" multivalued="true" name="resultauthor" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='result']/creator"/>
|
||||
<FIELD indexable="true" multivalued="false" name="resultdateofacceptance" result="false" stat="false" type="date" value="//*[local-name()='entity']/*[local-name()='result']/dateofacceptance"/>
|
||||
<FIELD copy="true" indexable="true" multivalued="false" name="resultacceptanceyear" result="false" stat="false" tokenizable="false" value="dnet:extractYear(//*[local-name()='entity']/*[local-name()='result']/dateofacceptance)"/>
|
||||
<FIELD copy="true" indexable="true" multivalued="true" name="resultauthor" result="false" stat="false" xpath="//*[local-name()='entity']/*[local-name()='result']/creator"/>
|
||||
<FIELD indexable="true" multivalued="true" name="resultauthor_nt" result="false" stat="false" type="string_ci" xpath="//*[local-name()='entity']/*[local-name()='result']/creator"/>
|
||||
<FIELD indexable="true" multivalued="true" name="authorid" result="false" stat="false" type="string_ci" xpath="//*[local-name()='entity']/*[local-name()='result']/creator/@*[local-name() != 'rank' and local-name() != 'name' and local-name() != 'surname']"/>
|
||||
<FIELD indexable="true" multivalued="true" name="authoridtype" result="false" stat="false" type="string_ci" xpath="//*[local-name()='entity']/*[local-name()='result']/creator/@*[local-name() != 'rank' and local-name() != 'name' and local-name() != 'surname']/local-name()"/>
|
||||
|
@ -94,26 +93,29 @@
|
|||
<FIELD indexable="true" name="resultdupid" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*//children/result/@objidentifier"/>
|
||||
<FIELD indexable="true" name="organizationdupid" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']/*//children/organization/@objidentifier"/>
|
||||
<FIELD indexable="true" name="externalrefsite" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//children/externalreference/sitename)"/>
|
||||
<FIELD indexable="true" name="externalreflabel" result="false" stat="false" tokenizable="true" xpath="distinct-values(//*[local-name()='entity']/*//children/externalreference/label)"/>
|
||||
<FIELD copy="true" indexable="true" name="externalreflabel" result="false" stat="false" tokenizable="true" xpath="distinct-values(//*[local-name()='entity']/*//children/externalreference/label)"/>
|
||||
<FIELD indexable="true" name="externalrefclass" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//children/externalreference/qualifier/@classid)"/>
|
||||
<FIELD indexable="true" name="externalrefid" result="false" stat="false" tokenizable="false" xpath="(//*[local-name()='entity']/*//children/externalreference/refidentifier)"/>
|
||||
<FIELD indexable="true" name="resultidentifier" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/children/instance/webresource/*[local-name()='url'])"/>
|
||||
<FIELD indexable="true" name="resultsource" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/source)"/><!-- REL FIELDS -->
|
||||
<FIELD copy="true" indexable="true" name="resultidentifier" result="false" stat="false" type="string_ci" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/children/instance/webresource/*[local-name()='url'])"/>
|
||||
<FIELD copy="true" indexable="true" name="resultsource" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/source)"/>
|
||||
<FIELD indexable="true" name="eoscifguidelines" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name() = 'result']/eoscifguidelines/@code)"/><!-- FOS and SDGs non tokenizable for faceted search-->
|
||||
<FIELD indexable="true" name="fos" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/subject[@classid='FOS'])"/>
|
||||
<FIELD indexable="true" name="sdg" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/subject[@classid='SDG'])"/><!-- REL FIELDS -->
|
||||
<FIELD indexable="true" name="reldatasourcecompatibilityid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='datasource']/openairecompatibility/@classid)"/>
|
||||
<FIELD indexable="true" name="relproject" result="false" stat="false" tokenizable="false" value="distinct-values(concat(./text(), '||', dnet:pickFirst(../acronym/text(), ../title/text())))" xpath="//*[local-name()='entity']/*//rel/to[@type='project']"/>
|
||||
<FIELD indexable="true" name="relprojectid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='project'])"/>
|
||||
<FIELD indexable="true" name="relprojectcode" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/code)"/>
|
||||
<FIELD indexable="true" name="relprojectname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/acronym)"/>
|
||||
<FIELD indexable="true" name="relprojecttitle" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/title)"/>
|
||||
<FIELD copy="true" indexable="true" name="relprojectname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/acronym)"/>
|
||||
<FIELD copy="true" indexable="true" name="relprojecttitle" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/title)"/>
|
||||
<FIELD indexable="true" name="relcontracttypeid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/contracttype/@classid)"/>
|
||||
<FIELD indexable="true" name="relcontracttypename" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/contracttype/@classname)"/>
|
||||
<FIELD copy="true" indexable="true" name="relcontracttypename" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='project']/contracttype/@classname)"/>
|
||||
<FIELD indexable="true" name="relorganizationcountryid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classid)"/>
|
||||
<FIELD indexable="true" name="relorganizationcountryname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classname)"/>
|
||||
<FIELD copy="true" indexable="true" name="relorganizationcountryname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classname)"/>
|
||||
<FIELD indexable="true" name="relorganizationid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='organization'])"/>
|
||||
<FIELD indexable="true" name="relorganizationname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalname)"/>
|
||||
<FIELD indexable="true" name="relorganizationshortname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalshortname)"/>
|
||||
<FIELD copy="true" indexable="true" name="relorganizationname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalname)"/>
|
||||
<FIELD copy="true" indexable="true" name="relorganizationshortname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalshortname)"/>
|
||||
<FIELD indexable="true" name="relresultid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='result'])"/>
|
||||
<FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/resulttype/@classid)"/>
|
||||
<FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@type)"/>
|
||||
<FIELD indexable="true" name="relclass" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@class)"/>
|
||||
<FIELD indexable="true" name="relfundinglevel0_id" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//rel/funding/funding_level_0"/>
|
||||
<FIELD indexable="true" name="relfundinglevel0_name" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//rel/funding/funding_level_0/@name/string()"/>
|
||||
|
@ -132,13 +134,15 @@
|
|||
<FIELD indexable="true" name="relfunderjurisdiction" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']//rel/funding/funder/@jurisdiction)"/><!-- Collected from of the related entity. Available for result-result relationships -->
|
||||
<FIELD indexable="true" name="relcollectedfromid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@id)"/>
|
||||
<FIELD indexable="true" name="relcollectedfromname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/collectedfrom/@name)"/>
|
||||
<FIELD indexable="true" name="relvalidated" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./validated]/to[@type='project'])"/>
|
||||
<FIELD indexable="true" name="semrelid" result="false" stat="false" tokenizable="false" value="concat(./to/text(), '||', ./to/@class/string())" xpath="//*[local-name()='entity']//rel"/><!-- COMMON FIELDS -->
|
||||
<FIELD indexable="true" multivalued="false" name="dateofcollection" result="false" stat="false" type="pdate" value="//header/*[local-name()='dateOfCollection']"/>
|
||||
<FIELD indexable="true" multivalued="false" name="dateofcollection" result="false" stat="false" type="date" value="//header/*[local-name()='dateOfCollection']"/>
|
||||
<FIELD indexable="true" name="status" result="false" stat="false" tokenizable="false" type="string_ci" xpath="//header/*[local-name()='status']"/>
|
||||
<FIELD indexable="true" name="collectedfrom" result="false" stat="false" tokenizable="false" value="distinct-values(concat(./@id, '||', ./@name))" xpath="//*[local-name()='entity']/*/*[local-name()='collectedfrom'] | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']"/>
|
||||
<FIELD indexable="true" name="collectedfromdatasourceid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/*[local-name()='collectedfrom']/@id | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']/@id)"/>
|
||||
<FIELD indexable="true" name="collectedfromname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/*[local-name()='collectedfrom']/@name | //*[local-name()='entity']/*//*[local-name() = 'instance']/*[local-name()='collectedfrom']/@name)"/>
|
||||
<FIELD indexable="true" name="originalid" result="false" stat="false" tokenizable="false" type="string_ci" xpath="//*[local-name()='entity']/*/*[local-name()='originalId']"/>
|
||||
<FIELD indexable="true" name="pid" result="false" stat="false" tokenizable="false" type="string_ci" xpath="distinct-values(//*[local-name()='entity']/*/pid/text()|//*[local-name()='instance']/*[local-name()='alternateidentifier']/text())"/>
|
||||
<FIELD indexable="true" name="pid" result="false" stat="false" tokenizable="false" type="string_ci" xpath="//*[local-name()='entity']/*/pid/text()"/>
|
||||
<FIELD indexable="true" name="pidclassid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/pid/@classid)"/>
|
||||
<FIELD indexable="true" name="pidclassname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/pid/@classname)"/>
|
||||
<FIELD indexable="true" name="inferred" result="false" stat="false" tokenizable="false" xpath="//*[local-name()='entity']//datainfo/inferred"/>
|
||||
|
@ -156,20 +160,6 @@
|
|||
<FIELD indexable="true" name="categoryname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/context/category/@label)"/>
|
||||
<FIELD indexable="true" name="conceptid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/context/category//concept/@id)"/>
|
||||
<FIELD indexable="true" name="conceptname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/context/category//concept/@label)"/><!-- new index field for country info from different xpaths for any type of entity -->
|
||||
<FIELD indexable="true" name="country" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/country/@classid | //*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classid | //*[local-name()='entity']//funder/@jurisdiction)"/><!-- COUNTER FIELDS -->
|
||||
<FIELD header="true" indexable="true" multivalued="false" name="counter_dedup" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_dedup/@value"/>
|
||||
<FIELD header="true" indexable="true" multivalued="false" name="counter_authorship" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_authorship/@value"/>
|
||||
<FIELD header="true" indexable="true" multivalued="false" name="counter_participation" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_participation/@value"/>
|
||||
<FIELD header="true" indexable="true" multivalued="false" name="counter_similarity" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_similarity/@value"/>
|
||||
<FIELD header="true" indexable="true" multivalued="false" name="counter_publicationdataset" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_publicationDataset/@value"/>
|
||||
<FIELD header="true" indexable="true" multivalued="false" name="counter_publicationdataset_claimed" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_publicationDataset_claimed/@value"/>
|
||||
<FIELD header="true" indexable="true" multivalued="false" name="counter_publicationdataset_collected" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_publicationDataset_collected/@value"/>
|
||||
<FIELD header="true" indexable="true" multivalued="false" name="counter_publicationdataset_inferred" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_publicationDataset_inferred/@value"/>
|
||||
<FIELD header="true" indexable="true" multivalued="false" name="counter_outcome" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_outcome/@value"/>
|
||||
<FIELD header="true" indexable="true" multivalued="false" name="counter_outcome_claimed" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_outcome_claimed/@value"/>
|
||||
<FIELD header="true" indexable="true" multivalued="false" name="counter_outcome_collected" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_outcome_collected/@value"/>
|
||||
<FIELD header="true" indexable="true" multivalued="false" name="counter_outcome_inferred" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_outcome_inferred/@value"/>
|
||||
<FIELD header="true" indexable="true" multivalued="false" name="counter_affiliation" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_affiliation/@value"/>
|
||||
<FIELD header="true" indexable="true" multivalued="false" name="counter_doi" result="false" stored="true" type="pint" xpath="/record/result/*[local-name()='header']/*[local-name()='counters']/counter_doi/@value"/>
|
||||
<FIELD indexable="true" name="country" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/country/@classid | //*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classid | //*[local-name()='entity']//funder/@jurisdiction)"/>
|
||||
</FIELDS>
|
||||
</LAYOUT>
|
|
@ -21,7 +21,7 @@
|
|||
</property>
|
||||
<property>
|
||||
<name>hive_jdbc_url</name>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000/;UseNativeQuery=1;?spark.executor.memory=19166291558;spark.yarn.executor.memoryOverhead=3225;spark.driver.memory=11596411699;spark.yarn.driver.memoryOverhead=1228</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.wf.workflow.notification.url</name>
|
||||
|
|
|
@ -42,7 +42,9 @@ SELECT p.id,
|
|||
CASE WHEN prr2.id IS NULL THEN 0 ELSE prr2.dp END AS delayedpubs,
|
||||
p.callidentifier,
|
||||
p.code,
|
||||
p.totalcost
|
||||
p.totalcost,
|
||||
p.fundedamount,
|
||||
p.currency
|
||||
FROM ${stats_db_name}.project_tmp p
|
||||
LEFT JOIN (SELECT pr.id, count(distinct pr.result) AS np
|
||||
FROM ${stats_db_name}.project_results pr
|
||||
|
|
|
@ -59,7 +59,7 @@ UNION ALL
|
|||
SELECT * FROM ${stats_db_name}.otherresearchproduct_sources;
|
||||
|
||||
|
||||
create table ${stats_db_name}.result_orcid STORED AS PARQUET as
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_orcid STORED AS PARQUET as
|
||||
select distinct res.id, regexp_replace(res.orcid, 'http://orcid.org/' ,'') as orcid
|
||||
from (
|
||||
SELECT substr(res.id, 4) as id, auth_pid.value as orcid
|
||||
|
@ -69,7 +69,7 @@ from (
|
|||
LATERAL VIEW explode(auth.pid.qualifier.classid) apt as author_pid_type
|
||||
WHERE res.datainfo.deletedbyinference = FALSE and res.datainfo.invisible = FALSE and author_pid_type = 'orcid') as res;
|
||||
|
||||
create table ${stats_db_name}.result_result stored as parquet as
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_result stored as parquet as
|
||||
select substr(rel.source, 4) as source, substr(rel.target, 4) as target, relclass, subreltype
|
||||
from ${openaire_db_name}.relation rel
|
||||
join ${openaire_db_name}.result r1 on rel.source=r1.id
|
||||
|
@ -82,7 +82,7 @@ where reltype='resultResult'
|
|||
and r2.resulttype.classname != 'other'
|
||||
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE;
|
||||
|
||||
create table ${stats_db_name}.result_citations_oc stored as parquet as
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_citations_oc stored as parquet as
|
||||
select substr(target, 4) as id, count(distinct substr(source, 4)) as citations
|
||||
from ${openaire_db_name}.relation rel
|
||||
join ${openaire_db_name}.result r1 on rel.source=r1.id
|
||||
|
@ -97,7 +97,7 @@ where relClass='Cites' and rel.datainfo.provenanceaction.classid = 'sysimport:cr
|
|||
and rel.datainfo.deletedbyinference=false and rel.datainfo.invisible = FALSE
|
||||
group by substr(target, 4);
|
||||
|
||||
create table ${stats_db_name}.result_references_oc stored as parquet as
|
||||
CREATE TABLE IF NOT EXISTS ${stats_db_name}.result_references_oc stored as parquet as
|
||||
select substr(source, 4) as id, count(distinct substr(target, 4)) as references
|
||||
from ${openaire_db_name}.relation rel
|
||||
join ${openaire_db_name}.result r1 on rel.source=r1.id
|
||||
|
|
|
@ -48,7 +48,9 @@ create table TARGET.result stored as parquet as
|
|||
'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki
|
||||
'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank
|
||||
'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3', -- École des Ponts ParisTech
|
||||
'openorgs____::e15adb13c4dadd49de4d35c39b5da93a' -- Nanyang Technological University
|
||||
'openorgs____::e15adb13c4dadd49de4d35c39b5da93a', -- Nanyang Technological University
|
||||
'openorgs____::4b34103bde246228fcd837f5f1bf4212', -- Autonomous University of Barcelona
|
||||
'openorgs____::72ec75fcfc4e0df1a76dc4c49007fceb' -- McMaster University
|
||||
) )) foo;
|
||||
compute stats TARGET.result;
|
||||
|
||||
|
|
|
@ -48,7 +48,9 @@ CREATE TABLE ${stats_db_name}.project_tmp
|
|||
delayedpubs INT,
|
||||
callidentifier STRING,
|
||||
code STRING,
|
||||
totalcost FLOAT
|
||||
totalcost FLOAT,
|
||||
fundedamount FLOAT,
|
||||
currency STRING
|
||||
) CLUSTERED BY (id) INTO 100 buckets stored AS orc tblproperties ('transactional' = 'true');
|
||||
|
||||
INSERT INTO ${stats_db_name}.project_tmp
|
||||
|
@ -72,7 +74,9 @@ SELECT substr(p.id, 4) AS id,
|
|||
0 AS delayedpubs,
|
||||
p.callidentifier.value AS callidentifier,
|
||||
p.code.value AS code,
|
||||
p.totalcost AS totalcost
|
||||
p.totalcost AS totalcost,
|
||||
p.fundedamount AS fundedamount,
|
||||
p.currency.value AS currency
|
||||
FROM ${openaire_db_name}.project p
|
||||
WHERE p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;
|
||||
|
||||
|
|
Loading…
Reference in New Issue