refined mapping for the extraction of the original resource type

This commit is contained in:
Claudio Atzori 2024-01-11 16:28:26 +01:00
parent 62104790ae
commit 2753044d13
2 changed files with 42 additions and 7 deletions

View File

@ -11,6 +11,7 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.Element;
@ -27,6 +28,15 @@ import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
/**
 * dc:type values of the info:eu-repo vocabulary that qualify the version of a publication rather than its
 * resource type. They are filtered out when looking for the original resource type of a record.
 */
private static final Set<String> DC_TYPE_PUBLICATION_VERSION = new HashSet<>();

static {
	// 'draft' is the remaining version term of the same vocabulary: it must be skipped like the others
	DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/draft");
	DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/submittedVersion");
	DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/acceptedVersion");
	DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/publishedVersion");
	DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/updatedVersion");
}
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
final boolean forceOrginalId) {
super(vocs, invisible, shouldHashId, forceOrginalId);
@ -192,24 +202,40 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
/**
* The Dublin Core element dc:type can be repeated, but we need to base our mapping on a single value
* So this method tries to give precedence to the COAR resource type, when available. Otherwise, it looks for the
* openaire's info:eu-repo type, and as last resort picks the 1st type text available
* openaire's info:eu-repo type, but excluding the following
*
* info:eu-repo/semantics/draft
* info:eu-repo/semantics/submittedVersion
* info:eu-repo/semantics/acceptedVersion
* info:eu-repo/semantics/publishedVersion
* info:eu-repo/semantics/updatedVersion
*
* Then, it picks the 1st dc:type text available and, in case there is no dc:type element, as last resort it tries
* to extract the type from the dr:CobjCategory element
*
* Examples:
*
* <dc:type>http://purl.org/coar/resource_type/c_5794</dc:type>
* <dc:type>info:eu-repo/semantics/article</dc:type>
* <dc:type>Conference article</dc:type>
* <dr:CobjCategory type="publication">0006</dr:CobjCategory>
*
* @param doc the input document
* @return the chosen resource type
*/
@Override
protected String findOriginalType(Document doc) {
return (String) doc
final String dcType = (String) doc
.selectNodes("//dc:type")
.stream()
.map(o -> "" + ((Node) o).getText().trim())
.filter(t -> !DC_TYPE_PUBLICATION_VERSION.contains(t))
.sorted(new OriginalTypeComparator())
.findFirst()
.orElse(null);
final String drCobjCategory = doc.valueOf("//dr:CobjCategory/text()");
return ObjectUtils.firstNonNull(dcType, drCobjCategory);
}
@Override

View File

@ -221,27 +221,36 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
}
/**
* The Datacite element
* Extracts the resource type from The Datacite element
*
* <datacite:resourceType xs:anyURI="http://purl.org/coar/resource_type/c_6501">journal article</datacite:resourceType>
* <datacite:resourceType
* anyURI="http://purl.org/coar/resource_type/c_6501"
* uri="http://purl.org/coar/resource_type/c_6501"
* resourceTypeGeneral="Dataset">journal article</datacite:resourceType>
*
* @param doc the input document
* @return the chosen resource type
*/
@Override
protected String findOriginalType(Document doc) {
return Optional
final String resourceType = Optional
.ofNullable(
(Element) doc
.selectSingleNode(
"//*[local-name()='metadata']/*[local-name() = 'resource']/*[local-name() = 'resourceType']"))
.map(element -> {
final String resourceTypeURI = element.attributeValue("anyURI");
final String resourceTypeURI = element.attributeValue("uri");
final String resourceTypeAnyURI = element.attributeValue("anyURI");
final String resourceTypeTxt = element.getText();
final String resourceTypeGeneral = element.attributeValue("resourceTypeGeneral");
return ObjectUtils.firstNonNull(resourceTypeURI, resourceTypeTxt);
return ObjectUtils
.firstNonNull(resourceTypeURI, resourceTypeAnyURI, resourceTypeTxt, resourceTypeGeneral);
})
.orElse(null);
final String drCobjCategory = doc.valueOf("//dr:CobjCategory/text()");
return ObjectUtils.firstNonNull(resourceType, drCobjCategory);
}
@Override