result pids (new xpaths + IS vocabularies)

This commit is contained in:
Michele Artini 2020-05-26 13:11:09 +02:00
parent 7a7272d9ec
commit a25598140a
4 changed files with 189 additions and 115 deletions

View File

@ -64,8 +64,10 @@ public abstract class AbstractMdRecordToOafMapper {
protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4"; protected static final String DATACITE_SCHEMA_KERNEL_4 = "http://datacite.org/schema/kernel-4";
protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3"; protected static final String DATACITE_SCHEMA_KERNEL_3 = "http://datacite.org/schema/kernel-3";
protected static final Qualifier ORCID_PID_TYPE = qualifier("ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES); protected static final Qualifier ORCID_PID_TYPE = qualifier(
protected static final Qualifier MAG_PID_TYPE = qualifier("MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES); "ORCID", "Open Researcher and Contributor ID", DNET_PID_TYPES, DNET_PID_TYPES);
protected static final Qualifier MAG_PID_TYPE = qualifier(
"MAGIdentifier", "Microsoft Academic Graph Identifier", DNET_PID_TYPES, DNET_PID_TYPES);
protected static final Map<String, String> nsContext = new HashMap<>(); protected static final Map<String, String> nsContext = new HashMap<>();
@ -79,7 +81,8 @@ public abstract class AbstractMdRecordToOafMapper {
nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3); nsContext.put("datacite", DATACITE_SCHEMA_KERNEL_3);
} }
protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier(
"main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
protected AbstractMdRecordToOafMapper(final Map<String, String> code2name) { protected AbstractMdRecordToOafMapper(final Map<String, String> code2name) {
this.code2name = code2name; this.code2name = code2name;
@ -93,15 +96,20 @@ public abstract class AbstractMdRecordToOafMapper {
.parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3)); .parseText(xml.replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3));
final String type = doc.valueOf("//dr:CobjCategory/@type"); final String type = doc.valueOf("//dr:CobjCategory/@type");
final KeyValue collectedFrom = getProvenanceDatasource(doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); final KeyValue collectedFrom = getProvenanceDatasource(
doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name");
if (collectedFrom == null) { return null; } if (collectedFrom == null) {
return null;
}
final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id"))
? collectedFrom ? collectedFrom
: getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name");
if (hostedBy == null) { return null; } if (hostedBy == null) {
return null;
}
final DataInfo info = prepareDataInfo(doc); final DataInfo info = prepareDataInfo(doc);
final long lastUpdateTimestamp = new Date().getTime(); final long lastUpdateTimestamp = new Date().getTime();
@ -116,7 +124,9 @@ public abstract class AbstractMdRecordToOafMapper {
final String dsId = doc.valueOf(xpathId); final String dsId = doc.valueOf(xpathId);
final String dsName = doc.valueOf(xpathName); final String dsName = doc.valueOf(xpathName);
if (StringUtils.isBlank(dsId) | StringUtils.isBlank(dsName)) { return null; } if (StringUtils.isBlank(dsId) | StringUtils.isBlank(dsName)) {
return null;
}
return keyValue(createOpenaireId(10, dsId, true), dsName); return keyValue(createOpenaireId(10, dsId, true), dsName);
} }
@ -201,9 +211,15 @@ public abstract class AbstractMdRecordToOafMapper {
final String projectId = createOpenaireId(40, originalId, true); final String projectId = createOpenaireId(40, originalId, true);
res res
.add(getRelation(docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, lastUpdateTimestamp)); .add(
getRelation(
docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info,
lastUpdateTimestamp));
res res
.add(getRelation(projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, lastUpdateTimestamp)); .add(
getRelation(
projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info,
lastUpdateTimestamp));
} }
} }
@ -398,7 +414,9 @@ public abstract class AbstractMdRecordToOafMapper {
final String sp = n.valueOf("@sp"); final String sp = n.valueOf("@sp");
final String vol = n.valueOf("@vol"); final String vol = n.valueOf("@vol");
final String edition = n.valueOf("@edition"); final String edition = n.valueOf("@edition");
if (StringUtils.isNotBlank(name)) { return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info); } if (StringUtils.isNotBlank(name)) {
return journal(name, issnPrinted, issnOnline, issnLinking, ep, iss, sp, vol, edition, null, null, info);
}
} }
return null; return null;
} }
@ -451,7 +469,10 @@ public abstract class AbstractMdRecordToOafMapper {
for (final Object o : node.selectNodes(xpath)) { for (final Object o : node.selectNodes(xpath)) {
final Node n = (Node) o; final Node n = (Node) o;
res res
.add(structuredProperty(n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), n.valueOf("@schemename"), info)); .add(
structuredProperty(
n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"),
n.valueOf("@schemename"), info));
} }
return res; return res;
} }
@ -459,7 +480,9 @@ public abstract class AbstractMdRecordToOafMapper {
protected OAIProvenance prepareOAIprovenance(final Document doc) { protected OAIProvenance prepareOAIprovenance(final Document doc) {
final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
if (n == null) { return null; } if (n == null) {
return null;
}
final String identifier = n.valueOf("./*[local-name()='identifier']"); final String identifier = n.valueOf("./*[local-name()='identifier']");
final String baseURL = n.valueOf("./*[local-name()='baseURL']"); final String baseURL = n.valueOf("./*[local-name()='baseURL']");
@ -474,7 +497,9 @@ public abstract class AbstractMdRecordToOafMapper {
protected DataInfo prepareDataInfo(final Document doc) { protected DataInfo prepareDataInfo(final Document doc) {
final Node n = doc.selectSingleNode("//oaf:datainfo"); final Node n = doc.selectSingleNode("//oaf:datainfo");
if (n == null) { return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9"); } if (n == null) {
return dataInfo(false, null, false, false, REPOSITORY_PROVENANCE_ACTIONS, "0.9");
}
final String paClassId = n.valueOf("./oaf:provenanceaction/@classid"); final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
final String paClassName = n.valueOf("./oaf:provenanceaction/@classname"); final String paClassName = n.valueOf("./oaf:provenanceaction/@classname");
@ -486,7 +511,9 @@ public abstract class AbstractMdRecordToOafMapper {
final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred")); final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred"));
final String trust = n.valueOf("./oaf:trust"); final String trust = n.valueOf("./oaf:trust");
return dataInfo(deletedbyinference, inferenceprovenance, inferred, false, qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust); return dataInfo(
deletedbyinference, inferenceprovenance, inferred, false,
qualifier(paClassId, paClassName, paSchemeId, paSchemeName), trust);
} }
protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) { protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {

View File

@ -56,7 +56,8 @@ public class GenerateEntitiesApplication {
public static void main(final String[] args) throws Exception { public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser( final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils IOUtils
.toString(GenerateEntitiesApplication.class .toString(
GenerateEntitiesApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json"))); .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json")));
parser.parseArgument(args); parser.parseArgument(args);
@ -105,7 +106,8 @@ public class GenerateEntitiesApplication {
for (final String sp : existingSourcePaths) { for (final String sp : existingSourcePaths) {
inputRdd = inputRdd inputRdd = inputRdd
.union(sc .union(
sc
.sequenceFile(sp, Text.class, Text.class) .sequenceFile(sp, Text.class, Text.class)
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) .map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
.map(k -> convertToListOaf(k._1(), k._2(), code2name)) .map(k -> convertToListOaf(k._1(), k._2(), code2name))
@ -117,7 +119,8 @@ public class GenerateEntitiesApplication {
.mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf)) .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf))
.reduceByKey((o1, o2) -> merge(o1, o2)) .reduceByKey((o1, o2) -> merge(o1, o2))
.map(Tuple2::_2) .map(Tuple2::_2)
.map(oaf -> oaf.getClass().getSimpleName().toLowerCase() .map(
oaf -> oaf.getClass().getSimpleName().toLowerCase()
+ "|" + "|"
+ OBJECT_MAPPER.writeValueAsString(oaf)) + OBJECT_MAPPER.writeValueAsString(oaf))
.saveAsTextFile(targetPath, GzipCodec.class); .saveAsTextFile(targetPath, GzipCodec.class);
@ -194,8 +197,10 @@ public class GenerateEntitiesApplication {
private static Map<String, String> loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException { private static Map<String, String> loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException {
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
final String xquery = final String xquery = IOUtils
IOUtils.toString(GenerateEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery")); .toString(
GenerateEntitiesApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery"));
final Map<String, String> map = new HashMap<>(); final Map<String, String> map = new HashMap<>();

View File

@ -130,7 +130,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
final Instance instance = new Instance(); final Instance instance = new Instance();
instance instance
.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); .setInstancetype(
prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE));
instance.setCollectedfrom(collectedfrom); instance.setCollectedfrom(collectedfrom);
instance.setHostedby(hostedby); instance.setHostedby(hostedby);
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
@ -146,7 +147,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
instance instance
.setUrl(nodes .setUrl(
nodes
.stream() .stream()
.filter(n -> StringUtils.isNotBlank(n.getText())) .filter(n -> StringUtils.isNotBlank(n.getText()))
.map(n -> n.getText().trim()) .map(n -> n.getText().trim())
@ -277,9 +279,15 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
final String otherId = createOpenaireId(50, originalId, false); final String otherId = createOpenaireId(50, originalId, false);
res res
.add(getRelation(docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, lastUpdateTimestamp)); .add(
getRelation(
docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info,
lastUpdateTimestamp));
res res
.add(getRelation(otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, lastUpdateTimestamp)); .add(
getRelation(
otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info,
lastUpdateTimestamp));
} }
} }
return res; return res;
@ -292,6 +300,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) { protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
return prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info); return prepareListStructProps(
doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info);
} }
} }

View File

@ -120,7 +120,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
final Instance instance = new Instance(); final Instance instance = new Instance();
instance instance
.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE)); .setInstancetype(
prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE));
instance.setCollectedfrom(collectedfrom); instance.setCollectedfrom(collectedfrom);
instance.setHostedby(hostedby); instance.setHostedby(hostedby);
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
@ -169,7 +170,10 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
&& !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Updated")
&& !dateType.equalsIgnoreCase("Available")) { && !dateType.equalsIgnoreCase("Available")) {
res res
.add(structuredProperty(((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE, info)); .add(
structuredProperty(
((Node) o).getText(), "UNKNOWN", "UNKNOWN", DNET_DATA_CITE_DATE, DNET_DATA_CITE_DATE,
info));
} }
} }
return res; return res;
@ -221,14 +225,16 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
protected List<Field<String>> prepareOtherResearchProductContactGroups( protected List<Field<String>> prepareOtherResearchProductContactGroups(
final Document doc, final Document doc,
final DataInfo info) { final DataInfo info) {
return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info); return prepareListFields(
doc, "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", info);
} }
@Override @Override
protected List<Field<String>> prepareOtherResearchProductContactPersons( protected List<Field<String>> prepareOtherResearchProductContactPersons(
final Document doc, final Document doc,
final DataInfo info) { final DataInfo info) {
return prepareListFields(doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info); return prepareListFields(
doc, "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", info);
} }
@Override @Override
@ -254,7 +260,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
protected List<Field<String>> prepareSoftwareDocumentationUrls( protected List<Field<String>> prepareSoftwareDocumentationUrls(
final Document doc, final Document doc,
final DataInfo info) { final DataInfo info) {
return prepareListFields(doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); return prepareListFields(
doc, "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info);
} }
// DATASETS // DATASETS
@ -328,16 +335,29 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
if (type.equalsIgnoreCase("IsSupplementTo")) { if (type.equalsIgnoreCase("IsSupplementTo")) {
res res
.add(getRelation(docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info, lastUpdateTimestamp)); .add(
getRelation(
docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info,
lastUpdateTimestamp));
res res
.add(getRelation(otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info, lastUpdateTimestamp)); .add(
getRelation(
otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info,
lastUpdateTimestamp));
} else if (type.equals("IsPartOf")) { } else if (type.equals("IsPartOf")) {
res res
.add(getRelation(docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info, lastUpdateTimestamp)); .add(
getRelation(
docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info,
lastUpdateTimestamp));
res res
.add(getRelation(otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info, lastUpdateTimestamp)); .add(
} else {} getRelation(
otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info,
lastUpdateTimestamp));
} else {
}
} }
} }
return res; return res;
@ -345,15 +365,28 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE, DNET_DATA_CITE_RESOURCE); return prepareQualifier(
doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE,
DNET_DATA_CITE_RESOURCE);
} }
@Override @Override
protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) { protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
final List<StructuredProperty> res = new ArrayList<>(); final List<StructuredProperty> res = new ArrayList<>();
res.addAll(prepareListStructProps(doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); res
res.addAll(prepareListStructProps(doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); .addAll(
res.addAll(prepareListStructProps(doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL']", "@alternateIdentifierType", "dnet:pid_types", "dnet:pid_types", info)); prepareListStructProps(
doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
res
.addAll(
prepareListStructProps(
doc, "//datacite:identifier[@identifierType != 'URL']", "@identifierType", "dnet:pid_types",
"dnet:pid_types", info));
res
.addAll(
prepareListStructProps(
doc, "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL']",
"@alternateIdentifierType", "dnet:pid_types", "dnet:pid_types", info));
return res; return res;
} }