forked from D-Net/dnet-hadoop
Merge pull request 'refined mapping for the extraction of the original resource type' (#374) from resource_types into beta
Reviewed-on: D-Net/dnet-hadoop#374
This commit is contained in:
commit
c67467723b
|
@ -11,6 +11,7 @@ import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.ObjectUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
import org.dom4j.Element;
|
import org.dom4j.Element;
|
||||||
|
@ -27,6 +28,15 @@ import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
|
||||||
|
|
||||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
|
/**
 * dc:type values of the info:eu-repo vocabulary that describe the version of a publication rather than its
 * resource type. They are ignored when choosing the original resource type of a record.
 */
private static final Set<String> DC_TYPE_PUBLICATION_VERSION = new HashSet<>();

static {
	// "draft" belongs to the same info:eu-repo version vocabulary and is listed among the excluded values
	// in the documentation of findOriginalType
	DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/draft");
	DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/submittedVersion");
	DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/acceptedVersion");
	DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/publishedVersion");
	DC_TYPE_PUBLICATION_VERSION.add("info:eu-repo/semantics/updatedVersion");
}
|
||||||
|
|
||||||
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
|
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId,
|
||||||
final boolean forceOrginalId) {
|
final boolean forceOrginalId) {
|
||||||
super(vocs, invisible, shouldHashId, forceOrginalId);
|
super(vocs, invisible, shouldHashId, forceOrginalId);
|
||||||
|
@ -192,24 +202,40 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
/**
|
/**
|
||||||
* The Dublin Core element dc:type can be repeated, but we need to base our mapping on a single value
|
* The Dublin Core element dc:type can be repeated, but we need to base our mapping on a single value
|
||||||
* So this method tries to give precedence to the COAR resource type, when available. Otherwise, it looks for the
|
* So this method tries to give precedence to the COAR resource type, when available. Otherwise, it looks for the
|
||||||
* openaire's info:eu-repo type, and as last resort picks the 1st type text available
|
* openaire's info:eu-repo type, but excluding the following
|
||||||
|
*
|
||||||
|
* info:eu-repo/semantics/draft
|
||||||
|
* info:eu-repo/semantics/submittedVersion
|
||||||
|
* info:eu-repo/semantics/acceptedVersion
|
||||||
|
* info:eu-repo/semantics/publishedVersion
|
||||||
|
* info:eu-repo/semantics/updatedVersion
|
||||||
|
*
|
||||||
|
* Then, it picks the 1st dc:type text available and, in case there is no dc:type element, as last resort it tries
|
||||||
|
* to extract the type from the dr:CobjCategory element
|
||||||
|
*
|
||||||
|
* Examples:
|
||||||
*
|
*
|
||||||
* <dc:type>http://purl.org/coar/resource_type/c_5794</dc:type>
|
* <dc:type>http://purl.org/coar/resource_type/c_5794</dc:type>
|
||||||
* <dc:type>info:eu-repo/semantics/article</dc:type>
|
* <dc:type>info:eu-repo/semantics/article</dc:type>
|
||||||
* <dc:type>Conference article</dc:type>
|
* <dc:type>Conference article</dc:type>
|
||||||
|
* <dr:CobjCategory type="publication">0006</dr:CobjCategory>
|
||||||
*
|
*
|
||||||
* @param doc the input document
|
* @param doc the input document
|
||||||
* @return the chosen resource type
|
* @return the chosen resource type
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
protected String findOriginalType(Document doc) {
|
protected String findOriginalType(Document doc) {
|
||||||
return (String) doc
|
final String dcType = (String) doc
|
||||||
.selectNodes("//dc:type")
|
.selectNodes("//dc:type")
|
||||||
.stream()
|
.stream()
|
||||||
.map(o -> "" + ((Node) o).getText().trim())
|
.map(o -> "" + ((Node) o).getText().trim())
|
||||||
|
.filter(t -> !DC_TYPE_PUBLICATION_VERSION.contains(t))
|
||||||
.sorted(new OriginalTypeComparator())
|
.sorted(new OriginalTypeComparator())
|
||||||
.findFirst()
|
.findFirst()
|
||||||
.orElse(null);
|
.orElse(null);
|
||||||
|
|
||||||
|
final String drCobjCategory = doc.valueOf("//dr:CobjCategory/text()");
|
||||||
|
return ObjectUtils.firstNonNull(dcType, drCobjCategory);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -221,27 +221,36 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The Datacite element
|
* Extracts the resource type from The Datacite element
|
||||||
*
|
*
|
||||||
* <datacite:resourceType xs:anyURI="http://purl.org/coar/resource_type/c_6501">journal article</datacite:resourceType>
|
* <datacite:resourceType
|
||||||
|
* anyURI="http://purl.org/coar/resource_type/c_6501"
|
||||||
|
* uri="http://purl.org/coar/resource_type/c_6501"
|
||||||
|
* resourceTypeGeneral="Dataset">journal article</datacite:resourceType>
|
||||||
*
|
*
|
||||||
* @param doc the input document
|
* @param doc the input document
|
||||||
* @return the chosen resource type
|
* @return the chosen resource type
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
protected String findOriginalType(Document doc) {
|
protected String findOriginalType(Document doc) {
|
||||||
return Optional
|
final String resourceType = Optional
|
||||||
.ofNullable(
|
.ofNullable(
|
||||||
(Element) doc
|
(Element) doc
|
||||||
.selectSingleNode(
|
.selectSingleNode(
|
||||||
"//*[local-name()='metadata']/*[local-name() = 'resource']/*[local-name() = 'resourceType']"))
|
"//*[local-name()='metadata']/*[local-name() = 'resource']/*[local-name() = 'resourceType']"))
|
||||||
.map(element -> {
|
.map(element -> {
|
||||||
final String resourceTypeURI = element.attributeValue("anyURI");
|
final String resourceTypeURI = element.attributeValue("uri");
|
||||||
|
final String resourceTypeAnyURI = element.attributeValue("anyURI");
|
||||||
final String resourceTypeTxt = element.getText();
|
final String resourceTypeTxt = element.getText();
|
||||||
|
final String resourceTypeGeneral = element.attributeValue("resourceTypeGeneral");
|
||||||
|
|
||||||
return ObjectUtils.firstNonNull(resourceTypeURI, resourceTypeTxt);
|
return ObjectUtils
|
||||||
|
.firstNonNull(resourceTypeURI, resourceTypeAnyURI, resourceTypeTxt, resourceTypeGeneral);
|
||||||
})
|
})
|
||||||
.orElse(null);
|
.orElse(null);
|
||||||
|
|
||||||
|
final String drCobjCategory = doc.valueOf("//dr:CobjCategory/text()");
|
||||||
|
return ObjectUtils.firstNonNull(resourceType, drCobjCategory);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
Loading…
Reference in New Issue