forked from D-Net/dnet-hadoop
Merge pull request 'refined mapping for the extraction of the original resource type' (#374) from resource_types into beta
Reviewed-on: D-Net/dnet-hadoop#374
This commit is contained in:
commit
c67467723b
|
@ -11,6 +11,7 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.ObjectUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Element;
|
||||
|
@ -27,6 +28,15 @@ import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
|
|||
|
||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
||||
/**
 * dc:type values from the info:eu-repo vocabulary that describe the <em>version</em> of a publication
 * (draft, submitted, accepted, published, updated) rather than its resource type. They must never be
 * picked as the original resource type of a record.
 *
 * The set is built once and exposed as an unmodifiable view, so it cannot be altered after class
 * initialisation.
 */
private static final Set<String> DC_TYPE_PUBLICATION_VERSION;

static {
	final Set<String> versions = new HashSet<>();
	// "draft" is the fifth term of the info:eu-repo version vocabulary and has to be excluded
	// together with the other four, otherwise it can be mistaken for a resource type.
	versions.add("info:eu-repo/semantics/draft");
	versions.add("info:eu-repo/semantics/submittedVersion");
	versions.add("info:eu-repo/semantics/acceptedVersion");
	versions.add("info:eu-repo/semantics/publishedVersion");
	versions.add("info:eu-repo/semantics/updatedVersion");
	// fully qualified on purpose: java.util.Collections may not be among the file's imports
	DC_TYPE_PUBLICATION_VERSION = java.util.Collections.unmodifiableSet(versions);
}
|
||||
|
||||
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
|
||||
final boolean forceOrginalId) {
|
||||
super(vocs, invisible, shouldHashId, forceOrginalId);
|
||||
|
@ -192,24 +202,40 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
/**
|
||||
* The Dublin Core element dc:type can be repeated, but we need to base our mapping on a single value
|
||||
* So this method tries to give precedence to the COAR resource type, when available. Otherwise, it looks for the
|
||||
* openaire's info:eu-repo type, and as last resort picks the 1st type text available
|
||||
* openaire's info:eu-repo type, but excluding the following
|
||||
*
|
||||
* info:eu-repo/semantics/draft
|
||||
* info:eu-repo/semantics/submittedVersion
|
||||
* info:eu-repo/semantics/acceptedVersion
|
||||
* info:eu-repo/semantics/publishedVersion
|
||||
* info:eu-repo/semantics/updatedVersion
|
||||
*
|
||||
* Then, it picks the 1st dc:type text available and, in case there is no dc:type element, as last resort it tries
|
||||
* to extract the type from the dr:CobjCategory element
|
||||
*
|
||||
* Examples:
|
||||
*
|
||||
* <dc:type>http://purl.org/coar/resource_type/c_5794</dc:type>
|
||||
* <dc:type>info:eu-repo/semantics/article</dc:type>
|
||||
* <dc:type>Conference article</dc:type>
|
||||
* <dr:CobjCategory type="publication">0006</dr:CobjCategory>
|
||||
*
|
||||
* @param doc the input document
|
||||
* @return the chosen resource type
|
||||
*/
|
||||
@Override
|
||||
protected String findOriginalType(Document doc) {
|
||||
return (String) doc
|
||||
final String dcType = (String) doc
|
||||
.selectNodes("//dc:type")
|
||||
.stream()
|
||||
.map(o -> "" + ((Node) o).getText().trim())
|
||||
.filter(t -> !DC_TYPE_PUBLICATION_VERSION.contains(t))
|
||||
.sorted(new OriginalTypeComparator())
|
||||
.findFirst()
|
||||
.orElse(null);
|
||||
|
||||
final String drCobjCategory = doc.valueOf("//dr:CobjCategory/text()");
|
||||
return ObjectUtils.firstNonNull(dcType, drCobjCategory);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -221,27 +221,36 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
|||
}
|
||||
|
||||
/**
|
||||
* The Datacite element
|
||||
* Extracts the resource type from The Datacite element
|
||||
*
|
||||
* <datacite:resourceType xs:anyURI="http://purl.org/coar/resource_type/c_6501">journal article</datacite:resourceType>
|
||||
* <datacite:resourceType
|
||||
* anyURI="http://purl.org/coar/resource_type/c_6501"
|
||||
* uri="http://purl.org/coar/resource_type/c_6501"
|
||||
* resourceTypeGeneral="Dataset">journal article</datacite:resourceType>
|
||||
*
|
||||
* @param doc the input document
|
||||
* @return the chosen resource type
|
||||
*/
|
||||
@Override
|
||||
protected String findOriginalType(Document doc) {
|
||||
return Optional
|
||||
final String resourceType = Optional
|
||||
.ofNullable(
|
||||
(Element) doc
|
||||
.selectSingleNode(
|
||||
"//*[local-name()='metadata']/*[local-name() = 'resource']/*[local-name() = 'resourceType']"))
|
||||
.map(element -> {
|
||||
final String resourceTypeURI = element.attributeValue("anyURI");
|
||||
final String resourceTypeURI = element.attributeValue("uri");
|
||||
final String resourceTypeAnyURI = element.attributeValue("anyURI");
|
||||
final String resourceTypeTxt = element.getText();
|
||||
final String resourceTypeGeneral = element.attributeValue("resourceTypeGeneral");
|
||||
|
||||
return ObjectUtils.firstNonNull(resourceTypeURI, resourceTypeTxt);
|
||||
return ObjectUtils
|
||||
.firstNonNull(resourceTypeURI, resourceTypeAnyURI, resourceTypeTxt, resourceTypeGeneral);
|
||||
})
|
||||
.orElse(null);
|
||||
|
||||
final String drCobjCategory = doc.valueOf("//dr:CobjCategory/text()");
|
||||
return ObjectUtils.firstNonNull(resourceType, drCobjCategory);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
Loading…
Reference in New Issue