package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URL; import java.net.URLDecoder; import java.util.*; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; import org.apache.commons.validator.routines.UrlValidator; import org.dom4j.Document; import org.dom4j.Element; import org.dom4j.Node; import com.google.common.collect.Lists; import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; public class OdfToOafMapper extends AbstractMdRecordToOafMapper { public static final String HTTP_DOI_PREIFX = "https://doi.org/"; public static final String HTTP_HANDLE_PREIFX = "https://hdl.handle.net/"; public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId, final boolean forceOrginalId) { super(vocs, invisible, shouldHashId, forceOrginalId); } public OdfToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) { super(vocs, invisible, shouldHashId); } @Override protected List prepareTitles(final Document doc, final DataInfo info) { final List title = Lists.newArrayList(); final String xpath = "//*[local-name()='titles']/*[local-name()='title']|//*[local-name()='resource']/*[local-name()='title']"; for (Object o : doc.selectNodes(xpath)) { Element e = (Element) o; final String titleValue = e.getTextTrim(); final String titleType = e.attributeValue("titleType"); if (StringUtils.isNotBlank(titleType)) { title .add( structuredProperty( titleValue, titleType, titleType, DNET_DATACITE_TITLE, DNET_DATACITE_TITLE, info)); } else { title.add(structuredProperty(titleValue, MAIN_TITLE_QUALIFIER, info)); } } return title; } @Override protected List prepareAuthors(final Document doc, final DataInfo info) { final List res = new ArrayList<>(); int pos = 1; for (final Object o : doc.selectNodes("//*[local-name()='creator']")) { final Node n = (Node) o; final Author author = new Author(); final String fullname = n.valueOf("./*[local-name()='creatorName']"); final String name = n.valueOf("./*[local-name()='givenName']"); final String surname = n.valueOf("./*[local-name()='familyName']"); if (StringUtils.isNotBlank(fullname) || StringUtils.isNotBlank(name) || StringUtils.isNotBlank(surname)) { author.setFullname(fullname); final PacePerson pp = new PacePerson(fullname, false); if (StringUtils.isBlank(name) & pp.isAccurate()) { author.setName(pp.getNormalisedFirstName()); } else { author.setName(name); } if (StringUtils.isBlank(surname) & pp.isAccurate()) { author.setSurname(pp.getNormalisedSurname()); } else { author.setSurname(surname); } if (StringUtils.isBlank(author.getFullname())) { author.setFullname(String.format("%s, %s", author.getSurname(), author.getName())); } author.setAffiliation(prepareListFields(n, "./*[local-name()='affiliation']", info)); author.setPid(preparePids(n, info)); author.setRank(pos++); res.add(author); } } return res; } private List preparePids(final Node n, final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : n.selectNodes("./*[local-name()='nameIdentifier']")) { final String id = ((Node) o).getText(); final String type = ((Node) o) .valueOf("./@nameIdentifierScheme") .trim() .toUpperCase() .replace(" ", "") .replace("_", ""); if (type.toLowerCase().startsWith(ORCID)) { final String cleanedId = id.replace("http://orcid.org/", "").replace("https://orcid.org/", ""); res.add(structuredProperty(cleanedId, ORCID_PID_TYPE, info)); } else if (type.startsWith("MAGID")) { res.add(structuredProperty(id, MAG_PID_TYPE, info)); } } return res; } @Override protected List prepareInstances( final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) { final Instance instance = new Instance(); instance .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); final List alternateIdentifier = prepareResultPids(doc, info); final List pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom); final Set pids = pid.stream().collect(Collectors.toCollection(HashSet::new)); instance .setAlternateIdentifier( alternateIdentifier.stream().filter(i -> !pids.contains(i)).collect(Collectors.toList())); instance.setPid(pid); instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); final String distributionlocation = doc.valueOf("//oaf:distributionlocation"); instance.setDistributionlocation(StringUtils.isNotBlank(distributionlocation) ? distributionlocation : null); instance .setAccessright(prepareAccessRight(doc, "//oaf:accessrights", DNET_ACCESS_MODES)); instance.setLicense(field(doc.valueOf("//oaf:license"), info)); instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS)); instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); instance .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); final Set url = new HashSet<>(); for (final Object o : doc .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='URL']")) { url.add(trimAndDecodeUrl(((Node) o).getText().trim())); } for (final Object o : doc .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='landingPage']")) { url.add(trimAndDecodeUrl(((Node) o).getText().trim())); } for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='URL']")) { url.add(trimAndDecodeUrl(((Node) o).getText().trim())); } for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) { url.add(trimAndDecodeUrl(((Node) o).getText().trim())); } Set validUrl = validateUrl(url); if (validUrl.stream().noneMatch(s -> s.contains("doi.org"))) { for (final Object o : doc .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) { validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim()); } for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) { validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim()); } } if (validUrl.stream().noneMatch(s -> s.contains("hdl.handle.net"))) { for (final Object o : doc .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) { validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim()); } for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) { validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim()); } } if (!validUrl.isEmpty()) { instance.setUrl(new ArrayList<>()); instance.getUrl().addAll(validUrl); } return Arrays.asList(instance); } protected String trimAndDecodeUrl(String url) { try { return URLDecoder.decode(url.trim(), "UTF-8"); } catch (Throwable t) { return url; } } @Override protected List> prepareSources(final Document doc, final DataInfo info) { return new ArrayList<>(); // Not present in ODF ??? } @Override protected List prepareRelevantDates(final Document doc, final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : doc.selectNodes("//*[local-name()='date']")) { final String dateType = ((Node) o).valueOf("@dateType"); if (StringUtils.isBlank(dateType) || (!dateType.equalsIgnoreCase("Accepted") && !dateType.equalsIgnoreCase("Issued") && !dateType.equalsIgnoreCase("Updated") && !dateType.equalsIgnoreCase("Available"))) { res .add( structuredProperty( ((Node) o).getText(), UNKNOWN, UNKNOWN, DNET_DATACITE_DATE, DNET_DATACITE_DATE, info)); } else { res .add( structuredProperty( ((Node) o).getText(), dateType, dateType, DNET_DATACITE_DATE, DNET_DATACITE_DATE, info)); } } return res; } @Override protected List> prepareCoverages(final Document doc, final DataInfo info) { return new ArrayList<>(); // Not present in ODF ??? } @Override protected List> prepareContributors(final Document doc, final DataInfo info) { return prepareListFields(doc, "//*[local-name()='contributorName']", info); } @Override protected List> prepareFormats(final Document doc, final DataInfo info) { return prepareListFields(doc, "//*[local-name()='format']", info); } @Override protected Field preparePublisher(final Document doc, final DataInfo info) { return prepareField(doc, "//*[local-name()='publisher']", info); } @Override protected List> prepareDescriptions(final Document doc, final DataInfo info) { return prepareListFields(doc, "//*[local-name()='description' and ./@descriptionType='Abstract']", info); } @Override protected List prepareSubjects(final Document doc, final DataInfo info) { return prepareSubjectList(doc, "//*[local-name()='subject']", info); } @Override protected Qualifier prepareLanguages(final Document doc) { return prepareQualifier(doc, "//*[local-name()='language']", DNET_LANGUAGES); } @Override protected List> prepareOtherResearchProductTools( final Document doc, final DataInfo info) { return new ArrayList<>(); // Not present in ODF ??? } @Override protected List> prepareOtherResearchProductContactGroups( final Document doc, final DataInfo info) { return prepareListFields( doc, "//*[local-name()='contributor' and ./@contributorType='ContactGroup']/*[local-name()='contributorName']", info); } @Override protected List> prepareOtherResearchProductContactPersons( final Document doc, final DataInfo info) { return prepareListFields( doc, "//*[local-name()='contributor' and ./@contributorType='ContactPerson']/*[local-name()='contributorName']", info); } @Override protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { return prepareQualifier(doc, "//*[local-name()='format']", DNET_PROGRAMMING_LANGUAGES); } @Override protected Field prepareSoftwareCodeRepositoryUrl( final Document doc, final DataInfo info) { return null; // Not present in ODF ??? } @Override protected List prepareSoftwareLicenses( final Document doc, final DataInfo info) { return new ArrayList<>(); // Not present in ODF ??? } @Override protected List> prepareSoftwareDocumentationUrls( final Document doc, final DataInfo info) { return prepareListFields( doc, "//*[local-name()='relatedIdentifier' and ./@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", info); } // DATASETS @Override protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { final List res = new ArrayList<>(); for (final Object o : doc.selectNodes("//*[local-name()='geoLocation']")) { final GeoLocation loc = new GeoLocation(); loc.setBox(((Node) o).valueOf("./*[local-name()='geoLocationBox']")); loc.setPlace(((Node) o).valueOf("./*[local-name()='geoLocationPlace']")); loc.setPoint(((Node) o).valueOf("./*[local-name()='geoLocationPoint']")); res.add(loc); } return res; } @Override protected Field prepareDatasetMetadataVersionNumber( final Document doc, final DataInfo info) { return null; // Not present in ODF ??? } @Override protected Field prepareDatasetLastMetadataUpdate( final Document doc, final DataInfo info) { return prepareField(doc, "//*[local-name()='date' and ./@dateType='Updated']", info); } @Override protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { return prepareField(doc, "//*[local-name()='version']", info); } @Override protected Field prepareDatasetSize(final Document doc, final DataInfo info) { return prepareField(doc, "//*[local-name()='size']", info); } @Override protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { return null; // Not present in ODF ??? } @Override protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { return prepareField(doc, "//*[local-name()='date' and ./@dateType='Issued']", info); } @Override protected List addOtherResultRels( final Document doc, final OafEntity entity) { final String docId = entity.getId(); final List res = new ArrayList<>(); /* /* https://w3id.org/ro-id/13c54585-362e-4925-a785-08afb591fa0d/resources/b4be0f3e-41d7-471f-b34e-f0bd54ff5698 https://w3id.org/ro-id/13c54585-362e-4925-a785-08afb591fa0d/resources/5d6e575b-ef84-417a-9d76-61c6702f7cb2 https://w3id.org/ro-id/13c54585-362e-4925-a785-08afb591fa0d/resources/35e01545-8c6d-49bd-ab98-5c152df69934 We could extend it to create the relationships targeting w3id, dois, pmcids and other pid types for which we know how to build the target openaire identifier "blindly". for (final Object o : doc .selectNodes("//*[local-name()='relatedIdentifier']")) { final String originalId = ((Node) o).getText(); if (StringUtils.isNotBlank(originalId)) { final String otherId = createOpenaireId(50, originalId, false); final String type = ((Node) o).valueOf("@relationType"); switch(type){ case IS_SUPPLEMENT_TO: break; case SUPPLEMENT: break; case IS_PART_OF: break; case HAS_PART: break; } */ for (final Object o : doc .selectNodes("//*[local-name()='relatedIdentifier' and ./@relatedIdentifierType='OPENAIRE']")) { final String originalId = ((Node) o).getText(); if (StringUtils.isNotBlank(originalId)) { final String otherId = createOpenaireId(50, originalId, false); final String type = ((Node) o).valueOf("@relationType"); if (type.equalsIgnoreCase(IS_SUPPLEMENT_TO)) { res .add( getRelation( docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, entity)); res .add( getRelation( otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, entity)); } else if (type.equalsIgnoreCase(IS_PART_OF)) { res .add( getRelation( docId, otherId, RESULT_RESULT, PART, IS_PART_OF, entity)); res .add( getRelation( otherId, docId, RESULT_RESULT, PART, HAS_PART, entity)); } else { // TODO catch more semantics } } } return res; } @Override protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { return prepareQualifier( doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", DNET_DATA_CITE_RESOURCE); } @Override protected List prepareResultPids(final Document doc, final DataInfo info) { final Set res = new HashSet<>(); res .addAll( prepareListStructPropsWithValidQualifier( doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info)); res .addAll( prepareListStructPropsWithValidQualifier( doc, "//*[local-name()='identifier' and ./@identifierType != 'URL' and ./@identifierType != 'landingPage']", "@identifierType", DNET_PID_TYPES, info)); res .addAll( prepareListStructPropsWithValidQualifier( doc, "//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType != 'URL' and ./@alternateIdentifierType != 'landingPage']", "@alternateIdentifierType", DNET_PID_TYPES, info)); return res .stream() .map(CleaningFunctions::normalizePidValue) .collect(Collectors.toList()); } }