Merge branch 'beta' into dedup_new_comparators

This commit is contained in:
Michele De Bonis 2024-10-14 15:24:38 +02:00
commit 6c17993d16
47 changed files with 574 additions and 298 deletions

View File

@ -0,0 +1,70 @@
/*
* Copyright (c) 2024.
* SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche
* SPDX-License-Identifier: AGPL-3.0-or-later
*/
package eu.dnetlib.dhp.schema.oaf;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;
public class HashableStructuredProperty extends StructuredProperty {
private static final long serialVersionUID = 8371670185221126045L;
public static HashableStructuredProperty newInstance(String value, Qualifier qualifier, DataInfo dataInfo) {
if (value == null) {
return null;
}
final HashableStructuredProperty sp = new HashableStructuredProperty();
sp.setValue(value);
sp.setQualifier(qualifier);
sp.setDataInfo(dataInfo);
return sp;
}
public static HashableStructuredProperty newInstance(StructuredProperty sp) {
HashableStructuredProperty hsp = new HashableStructuredProperty();
hsp.setQualifier(sp.getQualifier());
hsp.setValue(sp.getValue());
hsp.setQualifier(sp.getQualifier());
return hsp;
}
public static StructuredProperty toStructuredProperty(HashableStructuredProperty hsp) {
StructuredProperty sp = new StructuredProperty();
sp.setQualifier(hsp.getQualifier());
sp.setValue(hsp.getValue());
sp.setQualifier(hsp.getQualifier());
return sp;
}
@Override
public int hashCode() {
return new HashCodeBuilder(11, 91)
.append(getQualifier().getClassid())
.append(getQualifier().getSchemeid())
.append(getValue())
.hashCode();
}
@Override
public boolean equals(Object obj) {
if (obj == null) {
return false;
}
if (obj == this) {
return true;
}
if (obj.getClass() != getClass()) {
return false;
}
final HashableStructuredProperty rhs = (HashableStructuredProperty) obj;
return new EqualsBuilder()
.append(getQualifier().getClassid(), rhs.getQualifier().getClassid())
.append(getQualifier().getSchemeid(), rhs.getQualifier().getSchemeid())
.append(getValue(), rhs.getValue())
.isEquals();
}
}

View File

@ -43,34 +43,4 @@ public class CleaningFunctions {
return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue);
}
/**
* Utility method that normalises PID values on a per-type basis.
* @param pid the PID whose value will be normalised.
* @return the PID containing the normalised value.
*/
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
pid
.setValue(
normalizePidValue(
pid.getQualifier().getClassid(),
pid.getValue()));
return pid;
}
public static String normalizePidValue(String pidType, String pidValue) {
String value = Optional
.ofNullable(pidValue)
.map(String::trim)
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
switch (pidType) {
// TODO add cleaning for more PID types as needed
case "doi":
return value.toLowerCase().replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX);
}
return value;
}
}

View File

@ -6,18 +6,11 @@ import org.apache.commons.lang3.StringUtils;
public class DoiCleaningRule {
public static String clean(final String doi) {
return doi
.toLowerCase()
.replaceAll("\\s", "")
.replaceAll("^doi:", "")
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
}
public static String normalizeDoi(final String input) {
if (input == null)
if (doi == null)
return null;
final String replaced = input
final String replaced = doi
.replaceAll("\\n|\\r|\\t|\\s", "")
.replaceAll("^doi:", "")
.toLowerCase()
.replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX);
if (StringUtils.isEmpty(replaced))
@ -32,7 +25,6 @@ public class DoiCleaningRule {
return null;
return ret;
}
}

View File

@ -563,12 +563,24 @@ public class GraphCleaningFunctions extends CleaningFunctions {
Optional
.ofNullable(i.getPid())
.ifPresent(pid -> {
final Set<StructuredProperty> pids = Sets.newHashSet(pid);
final Set<HashableStructuredProperty> pids = pid
.stream()
.map(HashableStructuredProperty::newInstance)
.collect(Collectors.toCollection(HashSet::new));
Optional
.ofNullable(i.getAlternateIdentifier())
.ifPresent(altId -> {
final Set<StructuredProperty> altIds = Sets.newHashSet(altId);
i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
final Set<HashableStructuredProperty> altIds = altId
.stream()
.map(HashableStructuredProperty::newInstance)
.collect(Collectors.toCollection(HashSet::new));
i
.setAlternateIdentifier(
Sets
.difference(altIds, pids)
.stream()
.map(HashableStructuredProperty::toStructuredProperty)
.collect(Collectors.toList()));
});
});

View File

@ -175,7 +175,7 @@ public class IdentifierFactory implements Serializable {
return entity
.getPid()
.stream()
.map(CleaningFunctions::normalizePidValue)
.map(PidCleaner::normalizePidValue)
.filter(CleaningFunctions::pidFilter)
.collect(
Collectors
@ -207,7 +207,7 @@ public class IdentifierFactory implements Serializable {
// filter away PIDs provided by a DS that is not considered an authority for the
// given PID Type
.filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles))
.map(CleaningFunctions::normalizePidValue)
.map(PidCleaner::normalizePidValue)
.filter(p -> isNotFromDelegatedAuthority(collectedFrom, p))
.filter(CleaningFunctions::pidFilter))
.orElse(Stream.empty());

View File

@ -972,7 +972,7 @@ public class MergeUtils {
private static String extractKeyFromPid(final StructuredProperty pid) {
if (pid == null)
return null;
final StructuredProperty normalizedPid = CleaningFunctions.normalizePidValue(pid);
final StructuredProperty normalizedPid = PidCleaner.normalizePidValue(pid);
return String.format("%s::%s", normalizedPid.getQualifier().getClassid(), normalizedPid.getValue());
}

View File

@ -18,8 +18,8 @@ public class PidValueComparator implements Comparator<StructuredProperty> {
if (right == null)
return -1;
StructuredProperty l = CleaningFunctions.normalizePidValue(left);
StructuredProperty r = CleaningFunctions.normalizePidValue(right);
StructuredProperty l = PidCleaner.normalizePidValue(left);
StructuredProperty r = PidCleaner.normalizePidValue(right);
return Optional
.ofNullable(l.getValue())

View File

@ -28,6 +28,7 @@ import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
import net.minidev.json.JSONArray;
import scala.collection.JavaConverters;
import scala.collection.Seq;
@ -104,7 +105,7 @@ public class DHPUtils {
public static String generateUnresolvedIdentifier(final String pid, final String pidType) {
final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid);
final String cleanedPid = PidCleaner.normalizePidValue(pidType, pid);
return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim());
}

View File

@ -29,7 +29,7 @@ class IdentifierFactoryTest {
"publication_doi2.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
verifyIdentifier(
"publication_doi3.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
"publication_doi3.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true);
verifyIdentifier(
"publication_doi4.json", "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66", true);
@ -41,7 +41,7 @@ class IdentifierFactoryTest {
"publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", true);
verifyIdentifier(
"publication_pmc2.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
"publication_pmc2.json", "50|pmc_________::e2a339e0e11bfbf55462e14a07f1b304", true);
verifyIdentifier(
"publication_openapc.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);

View File

@ -29,7 +29,7 @@
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
"value": "PMC21459329"
}
]
}

View File

@ -13,7 +13,7 @@
},
{
"qualifier":{"classid":"pmc"},
"value":"21459329"
"value":"PMC21459329"
}
]
}

View File

@ -48,7 +48,7 @@ public class TreeNodeDef implements Serializable {
// function for the evaluation of the node
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
TreeNodeStats stats = new TreeNodeStats(ignoreUndefined);
TreeNodeStats stats = new TreeNodeStats();
// for each field in the node, it computes the
for (FieldConf fieldConf : fields) {

View File

@ -9,11 +9,8 @@ public class TreeNodeStats implements Serializable {
private Map<String, FieldStats> results; // this is an accumulator for the results of the node
private final boolean ignoreUndefined;
public TreeNodeStats(boolean ignoreUndefined) {
public TreeNodeStats() {
this.results = new HashMap<>();
this.ignoreUndefined = ignoreUndefined;
}
public Map<String, FieldStats> getResults() {
@ -25,10 +22,7 @@ public class TreeNodeStats implements Serializable {
}
public int fieldsCount() {
if (ignoreUndefined)
return this.results.size();
else
return this.results.size() - undefinedCount(); // do not count undefined
return this.results.size();
}
public int undefinedCount() {
@ -84,22 +78,11 @@ public class TreeNodeStats implements Serializable {
double min = 100.0; // random high value
for (FieldStats fs : this.results.values()) {
if (fs.getResult() < min) {
if (fs.getResult() == -1) {
if (fs.isCountIfUndefined()) {
min = 0.0;
} else {
min = -1;
}
} else {
if (fs.getResult() >= 0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
min = fs.getResult();
}
}
}
if (ignoreUndefined) {
return min == -1.0 ? 0.0 : min;
} else {
return min;
}
return min;
}
// if at least one is true, return 1.0
@ -108,11 +91,7 @@ public class TreeNodeStats implements Serializable {
if (fieldStats.getResult() >= fieldStats.getThreshold())
return 1.0;
}
if (!ignoreUndefined && undefinedCount() > 0) {
return -1.0;
} else {
return 0.0;
}
return 0.0;
}
// if at least one is false, return 0.0
@ -121,7 +100,7 @@ public class TreeNodeStats implements Serializable {
if (fieldStats.getResult() == -1) {
if (fieldStats.isCountIfUndefined())
return ignoreUndefined ? 0.0 : -1.0;
return 0.0;
} else {
if (fieldStats.getResult() < fieldStats.getThreshold())
return 0.0;

View File

@ -44,10 +44,12 @@ public class TreeProcessor {
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
treeStats.addNodeStats(nextNodeName, stats);
double finalScore = stats.getFinalScore(currentNode.getAggregation());
if (finalScore == -1.0)
// if ignoreUndefined=false the miss is considered as undefined
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
nextNodeName = currentNode.getUndefined();
else if (finalScore >= currentNode.getThreshold()) {
}
// if ignoreUndefined=true the miss is ignored and the score computed anyway
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
nextNodeName = currentNode.getPositive();
} else {
nextNodeName = currentNode.getNegative();

View File

@ -10,7 +10,6 @@ import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
@ -29,6 +28,7 @@ import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.DoiCleaningRule;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import scala.Tuple2;
@ -46,6 +46,8 @@ public class PrepareAffiliationRelations implements Serializable {
public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";
public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556";
public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE";
public static final String DOI_URL_PREFIX = "https://doi.org/";
public static final int DOI_URL_PREFIX_LENGTH = 16;
public static <I extends Result> void main(String[] args) throws Exception {
@ -98,35 +100,26 @@ public class PrepareAffiliationRelations implements Serializable {
private static void createActionSet(SparkSession spark, String crossrefInputPath, String pubmedInputPath,
String openapcInputPath, String dataciteInputPath, String webcrawlInputPath, String publisherlInputPath,
String outputPath) {
List<KeyValue> collectedFromCrossref = OafMapperUtils
.listKeyValues(ModelConstants.CROSSREF_ID, "Crossref");
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelations(
spark, crossrefInputPath, collectedFromCrossref);
List<KeyValue> collectedfromOpenAIRE = OafMapperUtils
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
JavaPairRDD<Text, Text> crossrefRelations = prepareAffiliationRelationsNewModel(
spark, crossrefInputPath, collectedfromOpenAIRE);
List<KeyValue> collectedFromPubmed = OafMapperUtils
.listKeyValues(ModelConstants.PUBMED_CENTRAL_ID, "Pubmed");
JavaPairRDD<Text, Text> pubmedRelations = prepareAffiliationRelations(
spark, pubmedInputPath, collectedFromPubmed);
spark, pubmedInputPath, collectedfromOpenAIRE);
List<KeyValue> collectedFromOpenAPC = OafMapperUtils
.listKeyValues(ModelConstants.OPEN_APC_ID, "OpenAPC");
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelations(
spark, openapcInputPath, collectedFromOpenAPC);
JavaPairRDD<Text, Text> openAPCRelations = prepareAffiliationRelationsNewModel(
spark, openapcInputPath, collectedfromOpenAIRE);
List<KeyValue> collectedFromDatacite = OafMapperUtils
.listKeyValues(ModelConstants.DATACITE_ID, "Datacite");
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
spark, dataciteInputPath, collectedFromDatacite);
spark, dataciteInputPath, collectedfromOpenAIRE);
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
spark, webcrawlInputPath, collectedFromWebCrawl);
spark, webcrawlInputPath, collectedfromOpenAIRE);
List<KeyValue> collectedfromPublisher = OafMapperUtils
.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
JavaPairRDD<Text, Text> publisherRelations = prepareAffiliationRelationFromPublisher(
spark, publisherlInputPath, collectedfromPublisher);
spark, publisherlInputPath, collectedfromOpenAIRE);
crossrefRelations
.union(pubmedRelations)
@ -138,6 +131,21 @@ public class PrepareAffiliationRelations implements Serializable {
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
}
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisherNewModel(SparkSession spark,
String inputPath,
List<KeyValue> collectedfrom) {
Dataset<Row> df = spark
.read()
.schema(
"`DOI` STRING, `Organizations` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
.json(inputPath)
.where("DOI is not null");
return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"));
}
private static JavaPairRDD<Text, Text> prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath,
List<KeyValue> collectedfrom) {
@ -165,6 +173,20 @@ public class PrepareAffiliationRelations implements Serializable {
return getTextTextJavaPairRDD(collectedfrom, df);
}
private static <I extends Result> JavaPairRDD<Text, Text> prepareAffiliationRelationsNewModel(SparkSession spark,
String inputPath,
List<KeyValue> collectedfrom) {
// load and parse affiliation relations from HDFS
Dataset<Row> df = spark
.read()
.schema(
"`DOI` STRING, `Matchings` ARRAY<STRUCT<`PID`:STRING, `Value`:STRING,`Confidence`:DOUBLE, `Status`:STRING>>")
.json(inputPath)
.where("DOI is not null");
return getTextTextJavaPairRDDNew(collectedfrom, df);
}
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDD(List<KeyValue> collectedfrom, Dataset<Row> df) {
// unroll nested arrays
df = df
@ -181,7 +203,7 @@ public class PrepareAffiliationRelations implements Serializable {
// DOI to OpenAIRE id
final String paperId = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", row.getAs("doi")));
+ IdentifierFactory.md5(DoiCleaningRule.clean(removePrefix(row.getAs("doi"))));
// ROR id to OpenAIRE id
final String affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("rorid"));
@ -213,6 +235,69 @@ public class PrepareAffiliationRelations implements Serializable {
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
}
private static JavaPairRDD<Text, Text> getTextTextJavaPairRDDNew(List<KeyValue> collectedfrom, Dataset<Row> df) {
// unroll nested arrays
df = df
.withColumn("matching", functions.explode(new Column("Matchings")))
.select(
new Column("DOI").as("doi"),
new Column("matching.PID").as("pidtype"),
new Column("matching.Value").as("pidvalue"),
new Column("matching.Confidence").as("confidence"),
new Column("matching.Status").as("status"))
.where("status = 'active'");
// prepare action sets for affiliation relations
return df
.toJavaRDD()
.flatMap((FlatMapFunction<Row, Relation>) row -> {
// DOI to OpenAIRE id
final String paperId = ID_PREFIX
+ IdentifierFactory.md5(DoiCleaningRule.clean(removePrefix(row.getAs("doi"))));
// Organization to OpenAIRE identifier
String affId = null;
if (row.getAs("pidtype").equals("ROR"))
// ROR id to OpenIARE id
affId = GenerateRorActionSetJob.calculateOpenaireId(row.getAs("pidvalue"));
else
// getting the OpenOrgs identifier for the organization
affId = row.getAs("pidvalue");
Qualifier qualifier = OafMapperUtils
.qualifier(
BIP_AFFILIATIONS_CLASSID,
BIP_AFFILIATIONS_CLASSNAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS);
// format data info; setting `confidence` into relation's `trust`
DataInfo dataInfo = OafMapperUtils
.dataInfo(
false,
BIP_INFERENCE_PROVENANCE,
true,
false,
qualifier,
Double.toString(row.getAs("confidence")));
// return bi-directional relations
return getAffiliationRelationPair(paperId, affId, collectedfrom, dataInfo).iterator();
})
.map(p -> new AtomicAction(Relation.class, p))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))));
}
private static String removePrefix(String doi) {
if (doi.startsWith(DOI_URL_PREFIX))
return doi.substring(DOI_URL_PREFIX_LENGTH);
return doi;
}
private static List<Relation> getAffiliationRelationPair(String paperId, String affId, List<KeyValue> collectedfrom,
DataInfo dataInfo) {
return Arrays

View File

@ -112,7 +112,7 @@ public class SparkAtomicActionUsageJob implements Serializable {
.joinWith(datasource, resultModel.col("datasourceId").equalTo(datasource.col("id")), "left")
.map((MapFunction<Tuple2<UsageStatsResultModel, Datasource>, UsageStatsResultModel>) t2 -> {
UsageStatsResultModel usrm = t2._1();
if(Optional.ofNullable(t2._2()).isPresent())
if (Optional.ofNullable(t2._2()).isPresent())
usrm.setDatasourceId(usrm.getDatasourceId() + "||" + t2._2().getOfficialname().getValue());
else
usrm.setDatasourceId(usrm.getDatasourceId() + "||NO_MATCH_FOUND");

View File

@ -28,13 +28,19 @@
"paramLongName": "dataciteInputPath",
"paramDescription": "the path to get the input data from Datacite",
"paramRequired": true
},{
},
{
"paramName": "wip",
"paramLongName": "webCrawlInputPath",
"paramDescription": "the path to get the input data from Web Crawl",
"paramRequired": true
}
,
},
{
"paramName": "pub",
"paramLongName": "publisherInputPath",
"paramDescription": "the path to get the input data from publishers",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",

View File

@ -332,7 +332,7 @@ case object Crossref2Oaf {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
//MAPPING Crossref DOI into PID
val doi: String = DoiCleaningRule.normalizeDoi((json \ "DOI").extract[String])
val doi: String = DoiCleaningRule.clean((json \ "DOI").extract[String])
result.setPid(
List(
structuredProperty(
@ -673,7 +673,7 @@ case object Crossref2Oaf {
val doi = input.getString(0)
val rorId = input.getString(1)
val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.normalizeDoi(doi)}"
val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.clean(doi)}"
val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId)
val r: Relation = new Relation

View File

@ -407,10 +407,9 @@ object DataciteToOAFTransformation {
)
}
if (c.affiliation.isDefined)
a.setAffiliation(
a.setRawAffiliationString(
c.affiliation.get
.filter(af => af.nonEmpty)
.map(af => OafMapperUtils.field(af, dataInfo))
.asJava
)
a.setRank(idx + 1)

View File

@ -28,8 +28,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
public class PrepareAffiliationRelationsTest {
@ -39,8 +39,7 @@ public class PrepareAffiliationRelationsTest {
private static Path workingDir;
private static final String ID_PREFIX = "50|doi_________::";
private static final Logger log = LoggerFactory
.getLogger(PrepareAffiliationRelationsTest.class);
private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelationsTest.class);
@BeforeAll
public static void beforeAll() throws IOException {
@ -74,26 +73,34 @@ public class PrepareAffiliationRelationsTest {
@Test
void testMatch() throws Exception {
String crossrefAffiliationRelationPath = getClass()
String crossrefAffiliationRelationPathNew = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror.json")
.getPath();
String crossrefAffiliationRelationPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/doi_to_ror_old.json")
.getPath();
String publisherAffiliationRelationPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publishers")
.getPath();
String publisherAffiliationRelationOldPath = getClass()
.getResource("/eu/dnetlib/dhp/actionmanager/bipaffiliations/publichers_old")
.getPath();
String outputPath = workingDir.toString() + "/actionSet";
PrepareAffiliationRelations
.main(
new String[] {
"-isSparkSessionManaged", Boolean.FALSE.toString(),
"-crossrefInputPath", crossrefAffiliationRelationPath,
"-crossrefInputPath", crossrefAffiliationRelationPathNew,
"-pubmedInputPath", crossrefAffiliationRelationPath,
"-openapcInputPath", crossrefAffiliationRelationPath,
"-openapcInputPath", crossrefAffiliationRelationPathNew,
"-dataciteInputPath", crossrefAffiliationRelationPath,
"-webCrawlInputPath", crossrefAffiliationRelationPath,
"-publisherInputPath", publisherAffiliationRelationPath,
"-publisherInputPath", publisherAffiliationRelationOldPath,
"-outputPath", outputPath
});
@ -104,13 +111,8 @@ public class PrepareAffiliationRelationsTest {
.map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class))
.map(aa -> ((Relation) aa.getPayload()));
// for (Relation r : tmp.collect()) {
// System.out.println(
// r.getSource() + "\t" + r.getTarget() + "\t" + r.getRelType() + "\t" + r.getRelClass() + "\t" + r.getSubRelType() + "\t" + r.getValidationDate() + "\t" + r.getDataInfo().getTrust() + "\t" + r.getDataInfo().getInferred()
// );
// }
// count the number of relations
assertEquals(138, tmp.count());
assertEquals(150, tmp.count());// 18 + 24 *3 + 30 * 2 =
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
dataset.createOrReplaceTempView("result");
@ -121,7 +123,7 @@ public class PrepareAffiliationRelationsTest {
// verify that we have equal number of bi-directional relations
Assertions
.assertEquals(
69, execVerification
75, execVerification
.filter(
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
.collectAsList()
@ -129,21 +131,21 @@ public class PrepareAffiliationRelationsTest {
Assertions
.assertEquals(
69, execVerification
75, execVerification
.filter(
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
.collectAsList()
.size());
// check confidence value of a specific relation
String sourceDOI = "10.1061/(asce)0733-9399(2002)128:7(759)";
String sourceDOI = "10.1089/10872910260066679";
final String sourceOpenaireId = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", sourceDOI));
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", sourceDOI));
Assertions
.assertEquals(
"0.7071067812", execVerification
"1.0", execVerification
.filter(
"source='" + sourceOpenaireId + "'")
.collectAsList()
@ -151,11 +153,34 @@ public class PrepareAffiliationRelationsTest {
.getString(4));
final String publisherid = ID_PREFIX
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s00217-010-1268-9"));
final String rorId = "20|ror_________::" + IdentifierFactory.md5("https://ror.org/03265fv13");
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1089/10872910260066679"));
final String rorId = "20|ror_________::" + IdentifierFactory.md5("https://ror.org/05cf8a891");
Assertions
.assertEquals(
1, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());
2, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count());
Assertions
.assertEquals(
1, execVerification
.filter(
"source = '" + ID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue("doi", "10.1007/s00217-010-1268-9"))
+ "' and target = '" + "20|ror_________::"
+ IdentifierFactory.md5("https://ror.org/03265fv13") + "'")
.count());
Assertions
.assertEquals(
3, execVerification
.filter(
"source = '" + ID_PREFIX
+ IdentifierFactory
.md5(PidCleaner.normalizePidValue("doi", "10.1007/3-540-47984-8_14"))
+ "' and target = '" + "20|ror_________::"
+ IdentifierFactory.md5("https://ror.org/00a0n9e72") + "'")
.count());
}
}

View File

@ -31,6 +31,7 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
public class CreateOpenCitationsASTest {
@ -280,17 +281,17 @@ public class CreateOpenCitationsASTest {
@Test
void testRelationsSourceTargetCouple() throws Exception {
final String doi1 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
final String doi2 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
final String doi3 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
final String doi4 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
final String doi5 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
final String doi6 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
String inputPath = getClass()
.getResource(

View File

@ -28,6 +28,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
/**
* @author miriam.baglioni
@ -270,17 +271,17 @@ public class CreateTAActionSetTest {
@Test
void testRelationsSourceTargetCouple() throws Exception {
final String doi1 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-015-3684-x"));
final String doi2 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1111/j.1551-2916.2008.02408.x"));
final String doi3 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-014-2114-9"));
final String doi4 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/j.ceramint.2013.09.069"));
final String doi5 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1007/s10854-009-9913-4"));
final String doi6 = "50|doi_________::"
+ IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("doi", "10.1016/0038-1098(72)90370-5"));
String inputPath = getClass()
.getResource(

View File

@ -1,9 +1,10 @@
{"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":"https:\/\/ror.org\/03yxnpp24","Confidence":0.7071067812},{"RORid":"https:\/\/ror.org\/01teme464","Confidence":0.89}]}
{"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":"https:\/\/ror.org\/02k40bc56","Confidence":0.7071067812}]}
{"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":"https:\/\/ror.org\/00qjgza05","Confidence":1}]}
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
{"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}
{"DOI":"10.1021\/ac020069k","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/01f5ytq51","Status":"active","Confidence":1}]}
{"DOI":"10.1161\/01.cir.0000013846.72805.7e","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/02pttbw34","Status":"active","Confidence":1}]}
{"DOI":"10.1161\/hy02t2.102992","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/00qqv6244","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/00p991c53","Status":"active","Confidence":1}]}
{"DOI":"10.1126\/science.1073633","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/03xez1567","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/006w34k90","Status":"active","Confidence":1}]}
{"DOI":"10.1089\/10872910260066679","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/05cf8a891","Status":"active","Confidence":1}]}
{"DOI":"10.1108\/02656719610116117","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/03mnm0t94","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/007tn5k56","Status":"active","Confidence":1}]}
{"DOI":"10.1080\/01443610050111986","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/001x4vz59","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/01tmqtf75","Status":"active","Confidence":1}]}
{"DOI":"10.1021\/cm020118+","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/02cf1je33","Confidence":1,"Status":"inactive"},{"PID":"ROR","Value":"https:\/\/ror.org\/01hvx5h04","Confidence":1,"Status":"active"}]}
{"DOI":"10.1161\/hc1202.104524","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/040r8fr65","Status":"active","Confidence":1},{"PID":"ROR","Value":"https:\/\/ror.org\/04fctr677","Status":"active","Confidence":1}]}
{"DOI":"10.1021\/ma011134f","Matchings":[{"PID":"ROR","Value":"https:\/\/ror.org\/04tj63d06","Status":"active","Confidence":1}]}

View File

@ -0,0 +1,9 @@
{"DOI":"10.1061\/(asce)0733-9399(2002)128:7(759)","Matchings":[{"RORid":"https:\/\/ror.org\/03yxnpp24","Confidence":0.7071067812},{"RORid":"https:\/\/ror.org\/01teme464","Confidence":0.89}]}
{"DOI":"10.1105\/tpc.8.3.343","Matchings":[{"RORid":"https:\/\/ror.org\/02k40bc56","Confidence":0.7071067812}]}
{"DOI":"10.1161\/01.cir.0000013305.01850.37","Matchings":[{"RORid":"https:\/\/ror.org\/00qjgza05","Confidence":1}]}
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
{"DOI": "https://doi.org/10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}

View File

@ -0,0 +1,6 @@
{"DOI": "10.1007/s00217-010-1268-9", "Authors": [{"Name": {"Full": "Martin Zarnkow", "First": null, "Last": null}, "Raw_affiliations": ["TU M\u00fcnchen, Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Andrea Faltermaier", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Werner Back", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Technologie der Brauerei I"], "Organization_PIDs": []}, {"Name": {"Full": "Martina Gastl", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Elkek K. Arendt", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}
{"DOI": "10.1007/BF01154707", "Authors": [{"Name": {"Full": "Buggy, M.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}, {"Name": {"Full": "Carew, A.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/00a0n9e72", "Confidence": 1}]}
{"DOI": "10.1007/s10237-017-0974-7", "Authors": [{"Name": {"Full": "Donnacha J. McGrath", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Anja Lena Thiebes", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Christian G. Cornelissen", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Barry O\u2019Brien", "First": null, "Last": null}, "Raw_affiliations": ["Department for Internal Medicine \u2013 Section for Pneumology, Medical Faculty, RWTH Aachen University, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/04xfq0f34", "Confidence": 1}]}, {"Name": {"Full": "Stefan Jockenhoevel", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Mark Bruzzi", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Peter E. McHugh", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03bea9k73", "Confidence": 1}, {"RORid": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 0.87}, {"RORid": "https://ror.org/04xfq0f34", "Confidence": 1}]}
{"DOI": "10.1007/BF03168973", "Authors": [{"Name": {"Full": "Sheehan, G.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}, {"Name": {"Full": "Chew, N.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}], "Organizations": []}
{"DOI": "10.1007/s00338-009-0480-1", "Authors": [{"Name": {"Full": "Gleason, D. F.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Danilowicz, B. S.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Nolan, C. J.", "First": null, "Last": null}, "Raw_affiliations": ["School of Biology and Environmental Science, University College Dublin, Dublin 4, Ireland"], "Organization_PIDs": [{"RORid": "https://ror.org/05m7pjf47", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/04agmb972", "Confidence": 1}, {"RORid": "https://ror.org/05m7pjf47", "Confidence": 1}]}
{"DOI": "10.1007/s10993-010-9187-y", "Authors": [{"Name": {"Full": "Martin Howard", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"RORid": "https://ror.org/03265fv13", "Confidence": 1}]}

View File

@ -0,0 +1,6 @@
{"DOI": "10.1007/s00217-010-1268-9", "Authors": [{"Name": {"Full": "Martin Zarnkow", "First": null, "Last": null}, "Raw_affiliations": ["TU M\u00fcnchen, Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Andrea Faltermaier", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Werner Back", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Technologie der Brauerei I"], "Organization_PIDs": []}, {"Name": {"Full": "Martina Gastl", "First": null, "Last": null}, "Raw_affiliations": ["Lehrstuhl f\u00fcr Brau- und Getr\u00e4nketechnologie"], "Organization_PIDs": []}, {"Name": {"Full": "Elkek K. Arendt", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"Value": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/03265fv13", "Confidence": 1}]}
{"DOI": "10.1007/BF01154707", "Authors": [{"Name": {"Full": "Buggy, M.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/00a0n9e72", "Confidence": 1}]}, {"Name": {"Full": "Carew, A.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Materials Science and Technology, University of Limerick, Limerick, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/00a0n9e72", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/00a0n9e72", "Confidence": 1}]}
{"DOI": "10.1007/s10237-017-0974-7", "Authors": [{"Name": {"Full": "Donnacha J. McGrath", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Anja Lena Thiebes", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Christian G. Cornelissen", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Barry O\u2019Brien", "First": null, "Last": null}, "Raw_affiliations": ["Department for Internal Medicine \u2013 Section for Pneumology, Medical Faculty, RWTH Aachen University, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/04xfq0f34", "Confidence": 1}]}, {"Name": {"Full": "Stefan Jockenhoevel", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/03bea9k73", "Confidence": 1}]}, {"Name": {"Full": "Mark Bruzzi", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biohybrid and Medical Textiles (BioTex), AME-Helmholtz Institute for Biomedical Engineering, ITA-Institut f\u00fcr Textiltechnik, RWTH Aachen University and at AMIBM Maastricht University, Maastricht, The Netherlands, Aachen, Germany"], "Organization_PIDs": [{"Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}]}, {"Name": {"Full": "Peter E. McHugh", "First": null, "Last": null}, "Raw_affiliations": ["Biomechanics Research Centre (BMEC), Biomedical Engineering, College of Engineering and Informatics, NUI Galway, Galway, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/03bea9k73", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/03bea9k73", "Confidence": 1}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/02jz4aj89", "Confidence": 0.82}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/04xfq0f34", "Confidence": 0.87}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/04xfq0f34", "Confidence": 1}]}
{"DOI": "10.1007/BF03168973", "Authors": [{"Name": {"Full": "Sheehan, G.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}, {"Name": {"Full": "Chew, N.", "First": null, "Last": null}, "Raw_affiliations": ["Dept of Infectious Diseases, Mater Misercordiae Hospital, Dublin 7"], "Organization_PIDs": []}], "Organizations": []}
{"DOI": "10.1007/s00338-009-0480-1", "Authors": [{"Name": {"Full": "Gleason, D. F.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"Value": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Danilowicz, B. S.", "First": null, "Last": null}, "Raw_affiliations": ["Department of Biology, Georgia Southern University, Statesboro, USA"], "Organization_PIDs": [{"Value": "https://ror.org/04agmb972", "Confidence": 1}]}, {"Name": {"Full": "Nolan, C. J.", "First": null, "Last": null}, "Raw_affiliations": ["School of Biology and Environmental Science, University College Dublin, Dublin 4, Ireland"], "Organization_PIDs": [{"Value": "https://ror.org/05m7pjf47", "Confidence": 1}]}], "Organizations": [{"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/04agmb972", "Confidence": 1}, {"Provenance":"AffRo","PID":"ROR","Status":"active","Value": "https://ror.org/05m7pjf47", "Confidence": 1}]}
{"DOI": "10.1007/s10993-010-9187-y", "Authors": [{"Name": {"Full": "Martin Howard", "First": null, "Last": null}, "Raw_affiliations": ["University College Cork"], "Organization_PIDs": [{"Value": "https://ror.org/03265fv13", "Confidence": 1}]}], "Organizations": [{"PID":"ROR","Status":"active","Value": "https://ror.org/03265fv13", "Confidence": 1}]}

View File

@ -313,7 +313,7 @@ case object ConversionUtil {
if (f.author.DisplayName.isDefined)
a.setFullname(f.author.DisplayName.get)
if (f.affiliation != null)
a.setAffiliation(List(asField(f.affiliation)).asJava)
a.setRawAffiliationString(List(f.affiliation).asJava)
a.setPid(
List(
createSP(
@ -386,7 +386,7 @@ case object ConversionUtil {
a.setFullname(f.author.DisplayName.get)
if (f.affiliation != null)
a.setAffiliation(List(asField(f.affiliation)).asJava)
a.setRawAffiliationString(List(f.affiliation).asJava)
a.setPid(
List(

View File

@ -9,10 +9,7 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -25,8 +22,6 @@ public class GraphHiveTableImporterJob {
private static final Logger log = LoggerFactory.getLogger(GraphHiveTableImporterJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@ -74,7 +69,12 @@ public class GraphHiveTableImporterJob {
private static <T extends Oaf> void loadGraphTable(SparkSession spark, String inputPath, String hiveDbName,
Class<T> clazz, int numPartitions) {
Dataset<String> dataset = spark.read().textFile(inputPath);
final Encoder<T> clazzEncoder = Encoders.bean(clazz);
Dataset<Row> dataset = spark
.read()
.schema(clazzEncoder.schema())
.json(inputPath);
if (numPartitions > 0) {
log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions);
@ -82,7 +82,6 @@ public class GraphHiveTableImporterJob {
}
dataset
.map((MapFunction<String, T>) s -> OBJECT_MAPPER.readValue(s, clazz), Encoders.bean(clazz))
.write()
.mode(SaveMode.Overwrite)
.saveAsTable(tableIdentifier(hiveDbName, clazz));

View File

@ -55,29 +55,7 @@ import eu.dnetlib.dhp.common.Constants;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.AccessRight;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.EoscIfGuidelines;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.InstanceTypeMapping;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.Subject;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
@ -667,22 +645,25 @@ public abstract class AbstractMdRecordToOafMapper {
return this.vocs.getTermAsQualifier(schemeId, classId);
}
protected List<StructuredProperty> prepareListStructPropsWithValidQualifier(
protected List<HashableStructuredProperty> prepareListStructPropsWithValidQualifier(
final Node node,
final String xpath,
final String xpathClassId,
final String schemeId,
final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>();
final Set<HashableStructuredProperty> res = new HashSet<>();
for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o;
final String classId = n.valueOf(xpathClassId).trim();
if (this.vocs.termExists(schemeId, classId)) {
res.add(structuredProperty(n.getText(), this.vocs.getTermAsQualifier(schemeId, classId), info));
res
.add(
HashableStructuredProperty
.newInstance(n.getText(), this.vocs.getTermAsQualifier(schemeId, classId), info));
}
}
return res;
return Lists.newArrayList(res);
}
protected List<StructuredProperty> prepareListStructProps(

View File

@ -25,6 +25,7 @@ import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@ -380,7 +381,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
return prepareListStructPropsWithValidQualifier(
doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info)
.stream()
.map(CleaningFunctions::normalizePidValue)
.map(PidCleaner::normalizePidValue)
.collect(Collectors.toList());
}

View File

@ -24,6 +24,7 @@ import eu.dnetlib.dhp.schema.common.RelationInverse;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner;
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@ -93,7 +94,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
author.setFullname(String.format("%s, %s", author.getSurname(), author.getName()));
}
author.setAffiliation(prepareListFields(n, "./*[local-name()='affiliation']", info));
author.setRawAffiliationString(prepareListString(n, "./*[local-name()='affiliation']"));
author.setPid(preparePids(n, info));
author.setRank(pos++);
res.add(author);
@ -504,7 +505,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@Override
protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
final Set<StructuredProperty> res = new HashSet<>();
final Set<HashableStructuredProperty> res = new HashSet<>();
res
.addAll(
prepareListStructPropsWithValidQualifier(
@ -524,7 +525,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
return res
.stream()
.map(CleaningFunctions::normalizePidValue)
.map(PidCleaner::normalizePidValue)
.filter(CleaningFunctions::pidFilter)
.collect(Collectors.toList());
}

View File

@ -73,14 +73,10 @@ public class GraphHiveImporterJobTest {
GraphHiveImporterJob
.main(
new String[] {
"-isSparkSessionManaged",
Boolean.FALSE.toString(),
"-inputPath",
getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(),
"-hiveMetastoreUris",
"",
"-hiveDbName",
dbName
"--isSparkSessionManaged", Boolean.FALSE.toString(),
"--inputPath", getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(),
"--hiveMetastoreUris", "",
"--hiveDbName", dbName
});
ModelSupport.oafTypes

View File

@ -388,7 +388,7 @@ public class CleanGraphSparkJobTest {
.collect(Collectors.toList());
assertNotNull(fos_subjects);
assertEquals(2, fos_subjects.size());
assertEquals(3, fos_subjects.size());
assertTrue(
fos_subjects
@ -396,18 +396,10 @@ public class CleanGraphSparkJobTest {
.anyMatch(
s -> "0101 mathematics".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
"sysimport:crosswalk:datasetarchive"
.equals(s.getDataInfo().getProvenanceaction().getClassid())));
"subject:fos".equals(s.getDataInfo().getProvenanceaction().getClassid())));
assertTrue(
fos_subjects
.stream()
.anyMatch(
s -> "0102 computer and information sciences".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
verify_keyword(p, "In Situ Hybridization");
verify_keyword(p, "Avicennia");
verify_keyword(p, "FOS: Mathematics");
verify_keyword(p, "FOS: Computer and information sciences");
}
@Test

View File

@ -266,7 +266,7 @@ public class GraphCleaningFunctionsTest {
.collect(Collectors.toList());
assertNotNull(fos_subjects);
assertEquals(2, fos_subjects.size());
assertEquals(3, fos_subjects.size());
assertTrue(
fos_subjects
@ -274,18 +274,18 @@ public class GraphCleaningFunctionsTest {
.anyMatch(
s -> "0101 mathematics".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
"sysimport:crosswalk:datasetarchive"
.equals(s.getDataInfo().getProvenanceaction().getClassid())));
"subject:fos".equals(s.getDataInfo().getProvenanceaction().getClassid())));
assertTrue(
fos_subjects
.stream()
.anyMatch(
s -> "0102 computer and information sciences".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
"subject:fos".equals(s.getDataInfo().getProvenanceaction().getClassid())));
verify_keyword(p_cleaned, "In Situ Hybridization");
verify_keyword(p_cleaned, "Avicennia");
verify_keyword(p_cleaned, "FOS: Computer and information sciences");
verify_keyword(p_cleaned, "FOS: Mathematics");
// TODO add more assertions to verity the cleaned values
System.out.println(MAPPER.writeValueAsString(p_cleaned));

View File

@ -44,7 +44,7 @@ class GenerateEntitiesApplicationTest {
}
@Test
void testMergeResult() throws IOException, DocumentException {
void testMergeResult() throws IOException {
Result publication = getResult("oaf_record.xml", Publication.class);
Result dataset = getResult("odf_dataset.xml", Dataset.class);
Result software = getResult("odf_software.xml", Software.class);
@ -69,15 +69,15 @@ class GenerateEntitiesApplicationTest {
verifyMerge(orp, software, Software.class, ModelConstants.SOFTWARE_RESULTTYPE_CLASSID);
}
protected <T extends Result> void verifyMerge(Result publication, Result dataset, Class<T> clazz,
protected <T extends Result> void verifyMerge(Result r1, Result r2, Class<T> clazz,
String resultType) {
final Result merge = (Result) MergeUtils.merge(publication, dataset);
final Result merge = MergeUtils.checkedMerge(r1, r2, true);
assertTrue(clazz.isAssignableFrom(merge.getClass()));
assertEquals(resultType, merge.getResulttype().getClassid());
}
protected <T extends Result> Result getResult(String xmlFileName, Class<T> clazz)
throws IOException, DocumentException {
throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream(xmlFileName));
return new OdfToOafMapper(vocs, false, true)
.processMdRecord(xml)

View File

@ -216,7 +216,7 @@ class MappersTest {
}
@Test
void testPublication_PubMed() throws IOException, DocumentException {
void testPublication_PubMed() throws IOException {
final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_record_pubmed.xml")));
@ -264,8 +264,17 @@ class MappersTest {
assertFalse(p.getSubject().isEmpty());
assertFalse(p.getPid().isEmpty());
assertEquals("PMC1517292", p.getPid().get(0).getValue());
assertEquals("pmc", p.getPid().get(0).getQualifier().getClassid());
assertTrue(p.getPid().stream().anyMatch(pi -> "pmc".equals(pi.getQualifier().getClassid())));
assertEquals(
"PMC1517292",
p
.getPid()
.stream()
.filter(pi -> "pmc".equals(pi.getQualifier().getClassid()))
.findFirst()
.get()
.getValue());
assertNotNull(p.getInstance());
assertFalse(p.getInstance().isEmpty());
@ -292,7 +301,7 @@ class MappersTest {
}
@Test
void testPublicationInvisible() throws IOException, DocumentException {
void testPublicationInvisible() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_record.xml")));
@ -307,6 +316,25 @@ class MappersTest {
}
@Test
void testPublicationInvisible_BASE() throws IOException {
final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_record_base.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, true, true).processMdRecord(xml);
assertFalse(list.isEmpty());
assertTrue(list.get(0) instanceof Publication);
final Publication p = (Publication) list.get(0);
assertTrue(p.getDataInfo().getInvisible());
System.out.println(new ObjectMapper().writeValueAsString(p));
}
@Test
void testOdfFwfEBookLibrary() throws IOException {
final String xml = IOUtils
@ -318,7 +346,7 @@ class MappersTest {
}
@Test
void testDataset() throws IOException, DocumentException {
void testDataset() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_dataset.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -332,19 +360,19 @@ class MappersTest {
final Relation r1 = (Relation) list.get(1);
final Relation r2 = (Relation) list.get(2);
assertEquals(d.getId(), r1.getSource());
assertEquals("40|corda_______::e06332dee33bec6c2ba4c98601053229", r1.getTarget());
assertEquals(d.getId(), r1.getTarget());
assertEquals("40|corda_______::e06332dee33bec6c2ba4c98601053229", r1.getSource());
assertEquals(ModelConstants.RESULT_PROJECT, r1.getRelType());
assertEquals(ModelConstants.OUTCOME, r1.getSubRelType());
assertEquals(ModelConstants.IS_PRODUCED_BY, r1.getRelClass());
assertEquals(ModelConstants.PRODUCES, r1.getRelClass());
assertTrue(r1.getValidated());
assertEquals("2020-01-01", r1.getValidationDate());
assertEquals(d.getId(), r2.getTarget());
assertEquals("40|corda_______::e06332dee33bec6c2ba4c98601053229", r2.getSource());
assertEquals(d.getId(), r2.getSource());
assertEquals("40|corda_______::e06332dee33bec6c2ba4c98601053229", r2.getTarget());
assertEquals(ModelConstants.RESULT_PROJECT, r2.getRelType());
assertEquals(ModelConstants.OUTCOME, r2.getSubRelType());
assertEquals(ModelConstants.PRODUCES, r2.getRelClass());
assertEquals(ModelConstants.IS_PRODUCED_BY, r2.getRelClass());
assertTrue(r2.getValidated());
assertEquals("2020-01-01", r2.getValidationDate());
@ -378,15 +406,15 @@ class MappersTest {
assertEquals("Baracchini", author.get().getSurname());
assertEquals("Theo", author.get().getName());
assertEquals(1, author.get().getAffiliation().size());
final Optional<Field<String>> opAff = author
assertEquals(1, author.get().getRawAffiliationString().size());
final Optional<String> opAff = author
.get()
.getAffiliation()
.getRawAffiliationString()
.stream()
.findFirst();
assertTrue(opAff.isPresent());
final Field<String> affiliation = opAff.get();
assertEquals("ISTI-CNR", affiliation.getValue());
final String affiliation = opAff.get();
assertEquals("ISTI-CNR", affiliation);
assertFalse(d.getSubject().isEmpty());
assertFalse(d.getInstance().isEmpty());
@ -450,7 +478,7 @@ class MappersTest {
}
@Test
void testOdfBielefeld() throws IOException, DocumentException {
void testOdfBielefeld() throws IOException {
final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_bielefeld.xml")));
@ -501,7 +529,7 @@ class MappersTest {
}
@Test
void testOpentrial() throws IOException, DocumentException {
void testOpentrial() throws IOException {
final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_opentrial.xml")));
@ -741,7 +769,7 @@ class MappersTest {
}
@Test
void testSoftware() throws IOException, DocumentException {
void testSoftware() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_software.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -763,22 +791,21 @@ class MappersTest {
final Relation r1 = (Relation) list.get(1);
final Relation r2 = (Relation) list.get(2);
assertEquals(s.getId(), r1.getSource());
assertEquals("50|doi_________::b453e7b4b2130ace57ff0c3db470a982", r1.getTarget());
assertEquals(s.getId(), r1.getTarget());
assertEquals("50|doi_________::b453e7b4b2130ace57ff0c3db470a982", r1.getSource());
assertEquals(ModelConstants.RESULT_RESULT, r1.getRelType());
assertEquals(ModelConstants.RELATIONSHIP, r1.getSubRelType());
assertEquals(ModelConstants.IS_REFERENCED_BY, r1.getRelClass());
assertEquals(ModelConstants.REFERENCES, r1.getRelClass());
assertEquals(s.getId(), r2.getTarget());
assertEquals("50|doi_________::b453e7b4b2130ace57ff0c3db470a982", r2.getSource());
assertEquals(s.getId(), r2.getSource());
assertEquals("50|doi_________::b453e7b4b2130ace57ff0c3db470a982", r2.getTarget());
assertEquals(ModelConstants.RESULT_RESULT, r2.getRelType());
assertEquals(ModelConstants.RELATIONSHIP, r2.getSubRelType());
assertEquals(ModelConstants.REFERENCES, r2.getRelClass());
assertEquals(ModelConstants.IS_REFERENCED_BY, r2.getRelClass());
}
@Test
void testClaimDedup() throws IOException, DocumentException {
void testClaimDedup() throws IOException {
final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_claim_dedup.xml")));
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
@ -792,7 +819,7 @@ class MappersTest {
}
@Test
void testNakala() throws IOException, DocumentException {
void testNakala() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_nakala.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -820,7 +847,7 @@ class MappersTest {
}
@Test
void testEnermaps() throws IOException, DocumentException {
void testEnermaps() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("enermaps.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -845,7 +872,7 @@ class MappersTest {
}
@Test
void testClaimFromCrossref() throws IOException, DocumentException {
void testClaimFromCrossref() throws IOException {
final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_claim_crossref.xml")));
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
@ -862,7 +889,7 @@ class MappersTest {
}
@Test
void testODFRecord() throws IOException, DocumentException {
void testODFRecord() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_record.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************");
@ -882,7 +909,7 @@ class MappersTest {
}
@Test
void testTextGrid() throws IOException, DocumentException {
void testTextGrid() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("textgrid.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -916,7 +943,7 @@ class MappersTest {
}
@Test
void testBologna() throws IOException, DocumentException {
void testBologna() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf-bologna.xml")));
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
@ -933,7 +960,7 @@ class MappersTest {
}
@Test
void testJairo() throws IOException, DocumentException {
void testJairo() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_jairo.xml")));
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
@ -971,7 +998,7 @@ class MappersTest {
}
@Test
void testZenodo() throws IOException, DocumentException {
void testZenodo() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_zenodo.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -1016,7 +1043,7 @@ class MappersTest {
}
@Test
void testOdfFromHdfs() throws IOException, DocumentException {
void testOdfFromHdfs() throws IOException {
final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_from_hdfs.xml")));
@ -1065,7 +1092,7 @@ class MappersTest {
}
@Test
void testXMLEncodedURL() throws IOException, DocumentException {
void testXMLEncodedURL() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url.xml")));
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
@ -1081,7 +1108,7 @@ class MappersTest {
}
@Test
void testXMLEncodedURL_ODF() throws IOException, DocumentException {
void testXMLEncodedURL_ODF() throws IOException {
final String xml = IOUtils
.toString(Objects.requireNonNull(getClass().getResourceAsStream("encoded-url_odf.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
@ -1245,7 +1272,7 @@ class MappersTest {
}
@Test
void testRiunet() throws IOException, DocumentException {
void testRiunet() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("riunet.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************");
@ -1291,7 +1318,7 @@ class MappersTest {
}
@Test
void testIRISPub() throws IOException, DocumentException {
void testIRISPub() throws IOException {
final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("iris-odf.xml")));
final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);
System.out.println("***************");

View File

@ -794,28 +794,6 @@
},
"value": "FOS: Computer and information sciences"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "keyword",
"classname": "keyword",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"value": "0101 mathematics"
},
{
"dataInfo": {
"deletedbyinference": false,
@ -831,8 +809,8 @@
"trust": "0.9"
},
"qualifier": {
"classid": "keyword",
"classname": "keyword",
"classid": "FOS",
"classname": "Fields of Science and Technology classification",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
@ -910,8 +888,8 @@
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:actionset",
"classname": "Harvested",
"classid": "subject:fos",
"classname": "subject:fos",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
@ -923,7 +901,7 @@
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"value": "Avicennia"
"value": "0102 computer and information sciences"
},
{
"dataInfo": {

View File

@ -0,0 +1,129 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:oaire="http://namespace.openaire.eu/schema/oaire/"
xmlns:datacite="http://datacite.org/schema/kernel-4"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<header xmlns="http://www.openarchives.org/OAI/2.0/">
<dri:objIdentifier>base_oa_____::7ecf1ef502253efffe203ca9a22bb9f1</dri:objIdentifier>
<identifier>ftunivqespace:oai:espace.library.uq.edu.au:UQ:336902</identifier>
<datestamp>2020-12-22T10:30:27Z</datestamp>
<dr:dateOfTransformation>2024-09-10T17:21:36.972Z</dr:dateOfTransformation>
</header>
<metadata>
<datacite:resource>
<datacite:identifier identifierType="DOI">https://doi.org/10.1016/j.envint.2014.07.004</datacite:identifier>
<datacite:alternateIdentifiers>
<datacite:identifier alternateIdentifierType="url">https://espace.library.uq.edu.au/view/UQ:336902</datacite:identifier>
<datacite:identifier alternateIdentifierType="oai-original">ftunivqespace:oai:espace.library.uq.edu.au:UQ:336902</datacite:identifier>
</datacite:alternateIdentifiers>
<datacite:relatedIdentifiers/>
<datacite:resourceType>Article contribution</datacite:resourceType>
<datacite:titles>
<datacite:title>The role of environmental factors in the spatial distribution of Japanese encephalitis in mainland China</datacite:title>
</datacite:titles>
<datacite:creators>
<datacite:creator>
<datacite:creatorName>Wang, Liya</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Hu, Wenbiao</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Soares Magalhaes, Ricardo J.</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Bi, Peng</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Ding, Fan</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Sun, Hailong</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Li, Shenlong</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Yin, Wenwu</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Wei, Lan</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Liu, Qiyong</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Haque, Ubydul</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Sun, Yansong</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Huang, Liuyu</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Tong, Shilu</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Clements, Archie C.A.</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Zhang, Wenyi</datacite:creatorName>
</datacite:creator>
<datacite:creator>
<datacite:creatorName>Li, Chengyi</datacite:creatorName>
</datacite:creator>
</datacite:creators>
<datacite:contributors/>
<datacite:descriptions>
<datacite:description descriptionType="Abstract">Japanese encephalitis (JE) is the most common cause of viral encephalitis and an important public health concern in the Asia-Pacific region, particularly in China where 50% of global cases are notified. To explore the association between environmental factors and human JE cases and identify the high risk areas for JE transmission in China, we used annual notified data on JE cases at the center of administrative township and environmental variables with a pixel resolution of 1. km. ×. 1. km from 2005 to 2011 to construct models using ecological niche modeling (ENM) approaches based on maximum entropy. These models were then validated by overlaying reported human JE case localities from 2006 to 2012 onto each prediction map. ENMs had good discriminatory ability with the area under the curve (AUC) of the receiver operating curve (ROC) of 0.82-0.91, and low extrinsic omission rate of 5.44-7.42%. Resulting maps showed JE being presented extensively throughout southwestern and central China, with local spatial variations in probability influenced by minimum temperatures, human population density, mean temperatures, and elevation, with contribution of 17.94%-38.37%, 15.47%-21.82%, 3.86%-21.22%, and 12.05%-16.02%, respectively. Approximately 60% of JE cases occurred in predicted high risk areas, which covered less than 6% of areas in mainland China. Our findings will help inform optimal geographical allocation of the limited resources available for JE prevention and control in China, find hidden high-risk areas, and increase the effectiveness of public health interventions against JE transmission.</datacite:description>
</datacite:descriptions>
<datacite:subjects>
<datacite:subject>Japanese encephalitis</datacite:subject>
<datacite:subject>Ecological niche model</datacite:subject>
<datacite:subject>MaxEnt</datacite:subject>
<datacite:subject>China</datacite:subject>
<datacite:subject>2300 Environmental Science</datacite:subject>
<datacite:subject classificationCode="950" subjectScheme="ddc">950</datacite:subject>
</datacite:subjects>
<datacite:publisher>Pergamon Press</datacite:publisher>
<datacite:publicationYear>2014</datacite:publicationYear>
<datacite:formats/>
<datacite:language>eng</datacite:language>
<oaf:accessrights/>
</datacite:resource>
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
<oaf:accessrights>UNKNOWN</oaf:accessrights>
<oaf:identifier identifierType="doi">10.1163/qwerty</oaf:identifier>
<oaf:identifier identifierType="doi">0.1163/18763308-90001038</oaf:identifier>
<oaf:identifier identifierType="doi">https://doi.org/10.1016/j.envint.2014.07.004</oaf:identifier>
<oaf:identifier identifierType="doi">https://doi.org/10.1080/09672567.2013.792375</oaf:identifier>
<oaf:identifier identifierType="doi">http://doi.org/10.1080/08673487.2012.812376</oaf:identifier>
<oaf:identifier identifierType="doi">http://dx.doi.org/10.1090/08673487.2012.812376</oaf:identifier>
<oaf:identifier identifierType="url">https://espace.library.uq.edu.au/view/UQ:336902</oaf:identifier>
<oaf:identifier identifierType="oai-original">ftunivqespace:oai:espace.library.uq.edu.au:UQ:336902</oaf:identifier>
<oaf:hostedBy name="The University of Queensland: UQ eSpace" id="opendoar____::575"/>
<oaf:collectedFrom name="Bielefeld Academic Search Engine (BASE)"
id="openaire____::base_search"/>
<oaf:dateAccepted>2014-12-01</oaf:dateAccepted>
<oaf:relation relClass="hasAuthorInstitution"
relType="resultOrganization"
subRelType="affiliation"
targetType="organization">ror_________::https://ror.org/00rqy9422</oaf:relation>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.89</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:aggregator"
classname="sysimport:crosswalk:aggregator"
schemeid="dnet:provenanceActions"
schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</metadata>
</record>

View File

@ -130,5 +130,10 @@
"value": [
"Pippo", "Foo"
]
},
{
"field": "typology",
"type": "string",
"value": "Government"
}
]

View File

@ -4,12 +4,13 @@ import eu.dnetlib.dhp.schema.sx.scholix.ScholixResource
import eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.junit.jupiter.api.Test
import org.junit.jupiter.api.{Disabled, Test}
import org.objenesis.strategy.StdInstantiatorStrategy
class ScholixGenerationTest {
@Test
@Disabled
def generateScholix(): Unit = {
val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()

View File

@ -937,7 +937,7 @@
<commons.logging.version>1.1.3</commons.logging.version>
<commons-validator.version>1.7</commons-validator.version>
<dateparser.version>1.0.7</dateparser.version>
<dhp-schemas.version>[7.0.1]</dhp-schemas.version>
<dhp-schemas.version>[8.0.1]</dhp-schemas.version>
<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
<dhp.guava.version>11.0.2</dhp.guava.version>