refined mapping for the extraction of the original resource type

This commit is contained in:
Claudio Atzori 2024-01-11 16:28:26 +01:00
parent 62104790ae
commit 2753044d13
2 changed files with 42 additions and 7 deletions

View File

@ -11,6 +11,7 @@ import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.Element; import org.dom4j.Element;
@ -27,6 +28,15 @@ import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
public class OafToOafMapper extends AbstractMdRecordToOafMapper { public class OafToOafMapper extends AbstractMdRecordToOafMapper {
/**
 * dc:type values in the info:eu-repo/semantics namespace that are skipped by findOriginalType when it
 * selects the original resource type among the dc:type elements.
 *
 * NOTE(review): info:eu-repo/semantics/draft is not part of this set; confirm whether it should be skipped too.
 */
private static final Set<String> DC_TYPE_PUBLICATION_VERSION = new HashSet<>();

static {
	DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/submittedVersion");
	DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/acceptedVersion");
	DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/publishedVersion");
	DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/updatedVersion");
}
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId, public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
final boolean forceOrginalId) { final boolean forceOrginalId) {
super(vocs, invisible, shouldHashId, forceOrginalId); super(vocs, invisible, shouldHashId, forceOrginalId);
@ -192,24 +202,40 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
/** /**
* The Dublin Core element dc:type can be repeated, but we need to base our mapping on a single value * The Dublin Core element dc:type can be repeated, but we need to base our mapping on a single value
* So this method tries to give precedence to the COAR resource type, when available. Otherwise, it looks for the * So this method tries to give precedence to the COAR resource type, when available. Otherwise, it looks for the
* openaire's info:eu-repo type, and as last resort picks the 1st type text available * openaire's info:eu-repo type, but excluding the following
*
* info:eu-repo/semantics/draft
* info:eu-repo/semantics/submittedVersion
* info:eu-repo/semantics/acceptedVersion
* info:eu-repo/semantics/publishedVersion
* info:eu-repo/semantics/updatedVersion
*
* Then, it picks the 1st dc:type text available and, in case there is no dc:type element, as last resort it tries
* to extract the type from the dr:CobjCategory element
*
* Examples:
* *
* <dc:type>http://purl.org/coar/resource_type/c_5794</dc:type> * <dc:type>http://purl.org/coar/resource_type/c_5794</dc:type>
* <dc:type>info:eu-repo/semantics/article</dc:type> * <dc:type>info:eu-repo/semantics/article</dc:type>
* <dc:type>Conference article</dc:type> * <dc:type>Conference article</dc:type>
* <dr:CobjCategory type="publication">0006</dr:CobjCategory>
* *
* @param doc the input document * @param doc the input document
* @return the chosen resource type * @return the chosen resource type
*/ */
@Override @Override
protected String findOriginalType(Document doc) { protected String findOriginalType(Document doc) {
return (String) doc final String dcType = (String) doc
.selectNodes("//dc:type") .selectNodes("//dc:type")
.stream() .stream()
.map(o -> "" + ((Node) o).getText().trim()) .map(o -> "" + ((Node) o).getText().trim())
.filter(t -> !DC_TYPE_PUBLICATION_VERSION.contains(t))
.sorted(new OriginalTypeComparator()) .sorted(new OriginalTypeComparator())
.findFirst() .findFirst()
.orElse(null); .orElse(null);
final String drCobjCategory = doc.valueOf("//dr:CobjCategory/text()");
return ObjectUtils.firstNonNull(dcType, drCobjCategory);
} }
@Override @Override

View File

@ -221,27 +221,36 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
} }
/** /**
* The Datacite element * Extracts the resource type from The Datacite element
* *
* <datacite:resourceType xs:anyURI="http://purl.org/coar/resource_type/c_6501">journal article</datacite:resourceType> * <datacite:resourceType
* anyURI="http://purl.org/coar/resource_type/c_6501"
* uri="http://purl.org/coar/resource_type/c_6501"
* resourceTypeGeneral="Dataset">journal article</datacite:resourceType>
* *
* @param doc the input document * @param doc the input document
* @return the chosen resource type * @return the chosen resource type
*/ */
@Override @Override
protected String findOriginalType(Document doc) { protected String findOriginalType(Document doc) {
return Optional final String resourceType = Optional
.ofNullable( .ofNullable(
(Element) doc (Element) doc
.selectSingleNode( .selectSingleNode(
"//*[local-name()='metadata']/*[local-name() = 'resource']/*[local-name() = 'resourceType']")) "//*[local-name()='metadata']/*[local-name() = 'resource']/*[local-name() = 'resourceType']"))
.map(element -> { .map(element -> {
final String resourceTypeURI = element.attributeValue("anyURI"); final String resourceTypeURI = element.attributeValue("uri");
final String resourceTypeAnyURI = element.attributeValue("anyURI");
final String resourceTypeTxt = element.getText(); final String resourceTypeTxt = element.getText();
final String resourceTypeGeneral = element.attributeValue("resourceTypeGeneral");
return ObjectUtils.firstNonNull(resourceTypeURI, resourceTypeTxt); return ObjectUtils
.firstNonNull(resourceTypeURI, resourceTypeAnyURI, resourceTypeTxt, resourceTypeGeneral);
}) })
.orElse(null); .orElse(null);
final String drCobjCategory = doc.valueOf("//dr:CobjCategory/text()");
return ObjectUtils.firstNonNull(resourceType, drCobjCategory);
} }
@Override @Override