package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.Element; import org.dom4j.Node; import com.google.common.collect.Lists; import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits; public class OafToOafMapper extends AbstractMdRecordToOafMapper { public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId, final boolean forceOrginalId) { super(vocs, invisible, shouldHashId, forceOrginalId); } public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible, final boolean shouldHashId) { super(vocs, invisible, shouldHashId); } @Override protected List prepareAuthors(final Document doc, final DataInfo info) { final List res = new ArrayList<>(); int pos = 1; for (final Object o : doc.selectNodes("//dc:creator")) { final Element e = (Element) o; final Author author = new Author(); author.setFullname(e.getText()); author.setRank(pos++); final PacePerson p = new PacePerson(e.getText(), false); if (p.isAccurate()) { author.setName(p.getNormalisedFirstName()); author.setSurname(p.getNormalisedSurname()); } final String pid = e.valueOf("./@nameIdentifier"); final String type = e .valueOf("./@nameIdentifierScheme") .trim() .toUpperCase() .replace(" ", "") .replace("_", ""); author.setPid(new ArrayList<>()); if (StringUtils.isNotBlank(pid)) { if (type.toLowerCase().startsWith(ORCID)) { final String cleanedId = pid .replaceAll("http://orcid.org/", "") .replaceAll("https://orcid.org/", ""); author.getPid().add(authorPid(cleanedId, ORCID_PID_TYPE, info)); } else if (type.startsWith("MAGID")) { author.getPid().add(authorPid(pid, MAG_PID_TYPE, info)); } } res.add(author); } return res; } @Override protected Qualifier prepareLanguages(final Document doc) { return prepareQualifier(doc, "//dc:language", DNET_LANGUAGES); } @Override protected List prepareSubjects(final Document doc, final DataInfo info) { return prepareSubjectList(doc, "//dc:subject", info); } @Override protected List prepareTitles(final Document doc) { return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER); } @Override protected List prepareDescriptions(final Document doc) { return prepareListFields(doc, "//dc:description") .stream() .map(d -> StringUtils.left(d, ModelHardLimits.MAX_ABSTRACT_LENGTH)) .collect(Collectors.toList()); } @Override protected Publisher preparePublisher(final Document doc) { return publisher(doc.valueOf("//dc:publisher")); } @Override protected List prepareFormats(final Document doc) { return prepareListFields(doc, "//dc:format"); } @Override protected List prepareContributors(final Document doc) { return prepareListFields(doc, "//dc:contributor"); } @Override protected List prepareCoverages(final Document doc) { return prepareListFields(doc, "//dc:coverage"); } @Override protected List prepareInstances( final Document doc, final DataInfo info, final KeyValue collectedfrom, final KeyValue hostedby) { final Instance instance = new Instance(); instance .setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE)); instance.setCollectedfrom(collectedfrom); instance.setHostedby(hostedby); final List alternateIdentifier = prepareResultPids(doc, info); final List pid = IdentifierFactory.getPids(alternateIdentifier, collectedfrom); final Set pids = pid.stream().collect(Collectors.toCollection(HashSet::new)); instance .setAlternateIdentifier( alternateIdentifier.stream().filter(i -> !pids.contains(i)).collect(Collectors.toList())); instance.setPid(pid); instance.setDateofacceptance(doc.valueOf("//oaf:dateAccepted")); instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); instance .setAccessright(prepareAccessRight(doc, "//oaf:accessrights", DNET_ACCESS_MODES)); instance.setLicense(license(doc.valueOf("//oaf:license"))); instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS)); instance .setProcessingchargeamount(doc.valueOf("//oaf:processingchargeamount")); instance .setProcessingchargecurrency(doc.valueOf("//oaf:processingchargeamount/@currency")); final List nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); final List url = nodes .stream() .filter(n -> StringUtils.isNotBlank(n.getText())) .map(n -> n.getText().trim()) .filter(u -> u.startsWith("http")) .map(s -> { try { return URLDecoder.decode(s, "UTF-8"); } catch (Throwable t) { return s; } }) .distinct() .collect(Collectors.toCollection(ArrayList::new)); final Set validUrl = validateUrl(url); if (!validUrl.isEmpty()) { instance.setUrl(new ArrayList<>()); instance.getUrl().addAll(validUrl); } return Lists.newArrayList(instance); } @Override protected List prepareSources(final Document doc) { return prepareListFields(doc, "//dc:source"); } @Override protected List prepareRelevantDates(final Document doc) { return new ArrayList<>(); // NOT PRESENT IN OAF } // SOFTWARES @Override protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc) { return null; // NOT PRESENT IN OAF } @Override protected String prepareSoftwareCodeRepositoryUrl( final Document doc) { return null; // NOT PRESENT IN OAF } @Override protected List prepareSoftwareDocumentationUrls(final Document doc) { return new ArrayList<>(); // NOT PRESENT IN OAF } // DATASETS @Override protected List prepareDatasetGeoLocations(final Document doc) { return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected String prepareDatasetMetadataVersionNumber(final Document doc) { return null; // NOT PRESENT IN OAF } @Override protected String prepareDatasetLastMetadataUpdate(final Document doc) { return null; // NOT PRESENT IN OAF } @Override protected String prepareDatasetVersion(final Document doc) { return null; // NOT PRESENT IN OAF } @Override protected String prepareDatasetSize(final Document doc) { return null; // NOT PRESENT IN OAF } @Override protected String prepareDatasetDevice(final Document doc) { return null; // NOT PRESENT IN OAF } @Override protected String prepareDatasetStorageDate(final Document doc) { return null; // NOT PRESENT IN OAF } // OTHER PRODUCTS @Override protected List prepareOtherResearchProductTools(final Document doc) { return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected List prepareOtherResearchProductContactGroups(final Document doc) { return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected List prepareOtherResearchProductContactPersons(final Document doc) { return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected List addOtherResultRels( final Document doc, final Entity entity) { final String docId = entity.getId(); final List res = new ArrayList<>(); for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) { final String originalId = ((Node) o).getText(); if (StringUtils.isNotBlank(originalId)) { final String otherId = createOpenaireId(50, originalId, false); res .add( getRelation( docId, otherId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, entity)); res .add( getRelation( otherId, docId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, entity)); } } return res; } @Override protected Qualifier prepareResourceType(final Document doc) { return null; // NOT PRESENT IN OAF } @Override protected List prepareResultPids(final Document doc, final DataInfo info) { return prepareListStructPropsWithValidQualifier( doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info) .stream() .map(CleaningFunctions::normalizePidValue) .collect(Collectors.toList()); } }