package eu.dnetlib.dhp.oa.graph.raw;

import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.Node;

import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;

/**
 * Mapper that extracts OAF model values from an XML record by evaluating XPath expressions
 * (mostly in the {@code datacite:}, {@code oaf:} and {@code dr:} namespaces) against a dom4j
 * {@link Document}.
 *
 * <p>Methods whose information has no counterpart in the record return an empty list or
 * {@code null}, as flagged by the "Not present in ODF" comments.
 */
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {

    /**
     * Prefix prepended to DOI values in order to build a resolvable URL.
     * NOTE: the identifier is misspelled ("PREIFX") but it is public, so the name is kept as is.
     */
    public static final String HTTP_DX_DOI_PREIFX = "http://dx.doi.org/";

    /**
     * Creates the mapper, forwarding both arguments to the superclass constructor.
     *
     * @param vocs the vocabularies handed to the superclass
     * @param invisible the flag handed to the superclass
     */
    public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
        super(vocs, invisible);
    }

    /** Reads the titles from {@code //datacite:title}, qualified with {@code MAIN_TITLE_QUALIFIER}. */
    @Override
    protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
        return prepareListStructProps(doc, "//datacite:title", MAIN_TITLE_QUALIFIER, info);
    }

    /**
     * Builds one {@link Author} per {@code //datacite:creator} element, in document order.
     *
     * <p>Given name and family name are read from the record; when one of them is blank and the
     * {@link PacePerson} parsed from the full name reports itself as accurate, the normalised
     * value computed by {@link PacePerson} is used instead. When the full name is blank it is
     * rebuilt as {@code "surname, name"}. The rank is the 1-based position of the creator.
     *
     * @param doc the record
     * @param info the data info attached to the generated fields
     * @return the list of authors, possibly empty
     */
    @Override
    protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
        final List<Author> res = new ArrayList<>();
        int pos = 1;
        for (final Object o : doc.selectNodes("//datacite:creator")) {
            final Node n = (Node) o;
            final Author author = new Author();

            final String fullname = n.valueOf("./datacite:creatorName");
            author.setFullname(fullname);

            final PacePerson pp = new PacePerson(fullname, false);
            // Evaluated once, unconditionally (the original used the non-short-circuit '&').
            final boolean accurate = pp.isAccurate();

            final String name = n.valueOf("./datacite:givenName");
            if (StringUtils.isBlank(name) && accurate) {
                author.setName(pp.getNormalisedFirstName());
            } else {
                author.setName(name);
            }

            final String surname = n.valueOf("./datacite:familyName");
            if (StringUtils.isBlank(surname) && accurate) {
                author.setSurname(pp.getNormalisedSurname());
            } else {
                author.setSurname(surname);
            }

            if (StringUtils.isBlank(author.getFullname())) {
                author.setFullname(String.format("%s, %s", author.getSurname(), author.getName()));
            }

            author.setAffiliation(prepareListFields(n, "./datacite:affiliation", info));
            author.setPid(preparePids(n, info));
            author.setRank(pos++);
            res.add(author);
        }
        return res;
    }

    /**
     * Extracts the author identifiers from the {@code ./datacite:nameIdentifier} children of a node.
     *
     * <p>The scheme ({@code @nameIdentifierScheme}) is trimmed, upper-cased and stripped of spaces
     * and underscores before being matched. Only schemes starting with {@code ORCID} or
     * {@code MAGID} are kept; for ORCID the {@code http(s)://orcid.org/} prefix is removed from
     * the value.
     *
     * @param n the creator node
     * @param info the data info attached to the generated properties
     * @return the recognised identifiers, possibly empty
     */
    private List<StructuredProperty> preparePids(final Node n, final DataInfo info) {
        final List<StructuredProperty> res = new ArrayList<>();
        for (final Object o : n.selectNodes("./datacite:nameIdentifier")) {
            final Node idNode = (Node) o;
            final String id = idNode.getText();
            // Locale.ROOT keeps the case mapping independent from the JVM default locale
            // (e.g. the Turkish dotted/dotless 'i'), so "orcid" always becomes "ORCID".
            final String type = idNode
                .valueOf("./@nameIdentifierScheme")
                .trim()
                .toUpperCase(Locale.ROOT)
                .replace(" ", "")
                .replace("_", "");

            if (type.startsWith("ORCID")) {
                // Literal replacement: with replaceAll the '.' would act as a regex wildcard.
                final String cleanedId = id
                    .replace("http://orcid.org/", "")
                    .replace("https://orcid.org/", "");
                res.add(structuredProperty(cleanedId, ORCID_PID_TYPE, info));
            } else if (type.startsWith("MAGID")) {
                res.add(structuredProperty(id, MAG_PID_TYPE, info));
            }
        }
        return res;
    }

    /**
     * Builds the single {@link Instance} describing the record.
     *
     * <p>The URLs are the distinct, trimmed values of the (alternate) identifiers of type
     * {@code URL} and {@code landingPage}, plus the (alternate) identifiers of type {@code DOI}
     * prefixed with {@link #HTTP_DX_DOI_PREIFX}. The URL list is set only when at least one URL
     * was found.
     *
     * @param doc the record
     * @param info the data info attached to the generated fields
     * @param collectedfrom the value set as "collected from"
     * @param hostedby the value set as "hosted by"
     * @return a list containing exactly one instance
     */
    @Override
    protected List<Instance> prepareInstances(
        final Document doc,
        final DataInfo info,
        final KeyValue collectedfrom,
        final KeyValue hostedby) {

        final Instance instance = new Instance();
        instance.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE));
        instance.setCollectedfrom(collectedfrom);
        instance.setHostedby(hostedby);
        instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
        instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
        instance.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
        instance.setLicense(field(doc.valueOf("//oaf:license"), info));
        instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS));
        instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
        instance
            .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));

        final Set<String> url = new HashSet<>();
        collectUrls(doc, "//datacite:alternateIdentifier[@alternateIdentifierType='URL']", "", url);
        collectUrls(doc, "//datacite:alternateIdentifier[@alternateIdentifierType='landingPage']", "", url);
        collectUrls(doc, "//datacite:identifier[@identifierType='URL']", "", url);
        collectUrls(doc, "//datacite:identifier[@identifierType='landingPage']", "", url);
        collectUrls(
            doc, "//datacite:alternateIdentifier[@alternateIdentifierType='DOI']", HTTP_DX_DOI_PREIFX, url);
        collectUrls(doc, "//datacite:identifier[@identifierType='DOI']", HTTP_DX_DOI_PREIFX, url);

        if (!url.isEmpty()) {
            instance.setUrl(new ArrayList<>());
            instance.getUrl().addAll(url);
        }
        return Arrays.asList(instance);
    }

    /** Adds to {@code urls} the trimmed text of every node matching {@code xpath}, prepended with {@code prefix}. */
    private static void collectUrls(
        final Document doc,
        final String xpath,
        final String prefix,
        final Set<String> urls) {
        for (final Object o : doc.selectNodes(xpath)) {
            urls.add(prefix + ((Node) o).getText().trim());
        }
    }

    /** Always returns an empty, mutable list. */
    @Override
    protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
        return new ArrayList<>(); // Not present in ODF ???
    }

    /**
     * Collects the {@code //datacite:date} values that are not covered by a dedicated date type.
     *
     * <p>A date is kept when its {@code @dateType} is blank or is none of {@code Accepted},
     * {@code Issued}, {@code Updated}, {@code Available} (case-insensitive). Every kept date is
     * classified as {@code UNKNOWN} in the {@code DNET_DATA_CITE_DATE} scheme.
     *
     * @param doc the record
     * @param info the data info attached to the generated properties
     * @return the relevant dates, possibly empty
     */
    @Override
    protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
        final List<StructuredProperty> res = new ArrayList<>();
        for (final Object o : doc.selectNodes("//datacite:date")) {
            final Node dateNode = (Node) o;
            final String dateType = dateNode.valueOf("@dateType");
            // '||' is required here: when chained with '&&' after isBlank(..) the type checks are
            // always true (a blank string equals none of the names) and typed dates are never kept.
            final boolean handledElsewhere = dateType.equalsIgnoreCase("Accepted")
                || dateType.equalsIgnoreCase("Issued")
                || dateType.equalsIgnoreCase("Updated")
                || dateType.equalsIgnoreCase("Available");
            if (StringUtils.isBlank(dateType) || !handledElsewhere) {
                res
                    .add(
                        structuredProperty(
                            dateNode.getText(),
                            "UNKNOWN",
                            "UNKNOWN",
                            DNET_DATA_CITE_DATE,
                            DNET_DATA_CITE_DATE,
                            info));
            }
        }
        return res;
    }

    /** Always returns an empty, mutable list. */
    @Override
    protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
        return new ArrayList<>(); // Not present in ODF ???
    }

    /** Reads the contributors from {@code //datacite:contributorName}. */
    @Override
    protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
        return prepareListFields(doc, "//datacite:contributorName", info);
    }

    /** Reads the formats from {@code //datacite:format}. */
    @Override
    protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
        return prepareListFields(doc, "//datacite:format", info);
    }

    /** Reads the publisher from {@code //datacite:publisher}. */
    @Override
    protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
        return prepareField(doc, "//datacite:publisher", info);
    }

    /**
     * Reads the descriptions of type {@code Abstract}, truncating each value to
     * {@code ModelHardLimits.MAX_ABSTRACT_LENGTH} characters.
     */
    @Override
    protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
        return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info)
            .stream()
            .map(d -> {
                d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH));
                return d;
            })
            .collect(Collectors.toList());
    }

    /** Reads the subjects from {@code //datacite:subject}. */
    @Override
    protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
        return prepareListStructProps(doc, "//datacite:subject", info);
    }

    /** Reads the language from {@code //datacite:language}, resolved against {@code DNET_LANGUAGES}. */
    @Override
    protected Qualifier prepareLanguages(final Document doc) {
        return prepareQualifier(doc, "//datacite:language", DNET_LANGUAGES);
    }

    /** Always returns an empty, mutable list. */
    @Override
    protected List<Field<String>> prepareOtherResearchProductTools(
        final Document doc,
        final DataInfo info) {
        return new ArrayList<>(); // Not present in ODF ???
    }

    /** Reads the names of the contributors whose {@code @contributorType} is {@code ContactGroup}. */
    @Override
    protected List<Field<String>> prepareOtherResearchProductContactGroups(
        final Document doc,
        final DataInfo info) {
        return prepareListFields(
            doc,
            "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName",
            info);
    }

    /** Reads the names of the contributors whose {@code @contributorType} is {@code ContactPerson}. */
    @Override
    protected List<Field<String>> prepareOtherResearchProductContactPersons(
        final Document doc,
        final DataInfo info) {
        return prepareListFields(
            doc,
            "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName",
            info);
    }

    /** Reads {@code //datacite:format} as a qualifier of the {@code dnet:programming_languages} scheme. */
    @Override
    protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
        return prepareQualifier(doc, "//datacite:format", "dnet:programming_languages");
    }

    /** Always returns {@code null}. */
    @Override
    protected Field<String> prepareSoftwareCodeRepositoryUrl(
        final Document doc,
        final DataInfo info) {
        return null; // Not present in ODF ???
    }

    /** Always returns an empty, mutable list. */
    @Override
    protected List<StructuredProperty> prepareSoftwareLicenses(
        final Document doc,
        final DataInfo info) {
        return new ArrayList<>(); // Not present in ODF ???
    }

    /** Reads the related identifiers of type {@code URL} whose relation is {@code IsDocumentedBy}. */
    @Override
    protected List<Field<String>> prepareSoftwareDocumentationUrls(
        final Document doc,
        final DataInfo info) {
        return prepareListFields(
            doc,
            "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']",
            info);
    }

    // DATASETS

    /**
     * Builds one {@link GeoLocation} per {@code //datacite:geoLocation} element, copying its box,
     * place and point children.
     */
    @Override
    protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
        final List<GeoLocation> res = new ArrayList<>();
        for (final Object o : doc.selectNodes("//datacite:geoLocation")) {
            final Node geo = (Node) o;
            final GeoLocation loc = new GeoLocation();
            loc.setBox(geo.valueOf("./datacite:geoLocationBox"));
            loc.setPlace(geo.valueOf("./datacite:geoLocationPlace"));
            loc.setPoint(geo.valueOf("./datacite:geoLocationPoint"));
            res.add(loc);
        }
        return res;
    }

    /** Always returns {@code null}. */
    @Override
    protected Field<String> prepareDatasetMetadataVersionNumber(
        final Document doc,
        final DataInfo info) {
        return null; // Not present in ODF ???
    }

    /** Reads the date whose {@code @dateType} is {@code Updated}. */
    @Override
    protected Field<String> prepareDatasetLastMetadataUpdate(
        final Document doc,
        final DataInfo info) {
        return prepareField(doc, "//datacite:date[@dateType='Updated']", info);
    }

    /** Reads the version from {@code //datacite:version}. */
    @Override
    protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
        return prepareField(doc, "//datacite:version", info);
    }

    /** Reads the size from {@code //datacite:size}. */
    @Override
    protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
        return prepareField(doc, "//datacite:size", info);
    }

    /** Always returns {@code null}. */
    @Override
    protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
        return null; // Not present in ODF ???
    }

    /** Reads the date whose {@code @dateType} is {@code Issued}. */
    @Override
    protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
        return prepareField(doc, "//datacite:date[@dateType='Issued']", info);
    }

    /**
     * Builds the relations declared through related identifiers of type {@code OPENAIRE}.
     *
     * <p>For each non-blank related identifier, a pair of relations (direct and inverse) is
     * produced when {@code @relationType} matches, ignoring case, {@code IS_SUPPLEMENT_TO} or
     * {@code IS_PART_OF}. Other relation types are currently ignored.
     *
     * @param doc the record
     * @param entity the entity the relations refer to; its id is used as one end of each relation
     * @return the generated relations, possibly empty
     */
    @Override
    protected List<Oaf> addOtherResultRels(
        final Document doc,
        final OafEntity entity) {

        final String docId = entity.getId();
        final List<Oaf> res = new ArrayList<>();

        for (final Object o : doc.selectNodes("//datacite:relatedIdentifier[@relatedIdentifierType='OPENAIRE']")) {
            final Node related = (Node) o;
            final String originalId = related.getText();
            if (StringUtils.isBlank(originalId)) {
                continue;
            }

            final String otherId = createOpenaireId(50, originalId, false);
            final String type = related.valueOf("@relationType");

            if (type.equalsIgnoreCase(IS_SUPPLEMENT_TO)) {
                res.add(getRelation(docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, entity));
                res.add(getRelation(otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, entity));
            } else if (type.equalsIgnoreCase(IS_PART_OF)) {
                res.add(getRelation(docId, otherId, RESULT_RESULT, PART, IS_PART_OF, entity));
                res.add(getRelation(otherId, docId, RESULT_RESULT, PART, HAS_PARTS, entity));
            } else {
                // TODO catch more semantics
            }
        }
        return res;
    }

    /** Reads the {@code resourceType} element found under {@code resource}, whatever its namespace. */
    @Override
    protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
        return prepareQualifier(
            doc,
            "//*[local-name() = 'resource']//*[local-name() = 'resourceType']",
            DNET_DATA_CITE_RESOURCE);
    }

    /**
     * Collects the persistent identifiers of the record.
     *
     * <p>Identifiers are read from {@code //oaf:identifier} and from the {@code datacite}
     * (alternate) identifiers whose type is neither {@code URL} nor {@code landingPage}; they are
     * gathered in a set and each one is passed through {@code CleaningFunctions.normalizePidValue}.
     *
     * @param doc the record
     * @param info the data info attached to the generated properties
     * @return the normalised identifiers, in no particular order
     */
    @Override
    protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
        final Set<StructuredProperty> res = new HashSet<>();
        res
            .addAll(
                prepareListStructPropsWithValidQualifier(
                    doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info));
        res
            .addAll(
                prepareListStructPropsWithValidQualifier(
                    doc,
                    "//datacite:identifier[@identifierType != 'URL' and @identifierType != 'landingPage']",
                    "@identifierType",
                    DNET_PID_TYPES,
                    info));
        res
            .addAll(
                prepareListStructPropsWithValidQualifier(
                    doc,
                    "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL' and @alternateIdentifierType != 'landingPage']",
                    "@alternateIdentifierType",
                    DNET_PID_TYPES,
                    info));

        return res
            .stream()
            .map(CleaningFunctions::normalizePidValue)
            .collect(Collectors.toList());
    }
}