2020-04-28 11:23:29 +02:00
|
|
|
|
2020-04-10 17:53:07 +02:00
|
|
|
package eu.dnetlib.dhp.oa.graph.raw;
|
2020-03-02 16:12:14 +01:00
|
|
|
|
2020-06-12 10:58:02 +02:00
|
|
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
2020-11-13 10:05:12 +01:00
|
|
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
|
|
|
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
|
|
|
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
|
2020-05-22 12:25:01 +02:00
|
|
|
|
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.List;
|
2020-05-08 09:43:26 +02:00
|
|
|
import java.util.stream.Collectors;
|
2020-04-28 11:23:29 +02:00
|
|
|
|
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
|
import org.dom4j.Document;
|
2020-05-15 12:26:16 +02:00
|
|
|
import org.dom4j.Element;
|
2020-04-28 11:23:29 +02:00
|
|
|
import org.dom4j.Node;
|
|
|
|
|
2020-05-06 13:20:02 +02:00
|
|
|
import com.google.common.collect.Lists;
|
|
|
|
|
2020-05-25 10:35:39 +02:00
|
|
|
import eu.dnetlib.dhp.common.PacePerson;
|
2020-05-27 11:34:13 +02:00
|
|
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
2020-05-22 12:25:01 +02:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.Instance;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
2020-02-04 15:25:47 +01:00
|
|
|
|
2020-03-02 16:12:14 +01:00
|
|
|
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
2020-02-05 15:35:40 +01:00
|
|
|
|
2020-06-10 10:04:00 +02:00
|
|
|
public OafToOafMapper(final VocabularyGroup vocs, final boolean invisible) {
|
|
|
|
super(vocs, invisible);
|
2020-04-28 11:23:29 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
|
|
|
|
final List<Author> res = new ArrayList<>();
|
|
|
|
int pos = 1;
|
|
|
|
for (final Object o : doc.selectNodes("//dc:creator")) {
|
2020-05-15 12:26:16 +02:00
|
|
|
final Element e = (Element) o;
|
2020-04-28 11:23:29 +02:00
|
|
|
final Author author = new Author();
|
2020-05-15 12:26:16 +02:00
|
|
|
author.setFullname(e.getText());
|
2020-04-28 11:23:29 +02:00
|
|
|
author.setRank(pos++);
|
2020-05-15 12:26:16 +02:00
|
|
|
final PacePerson p = new PacePerson(e.getText(), false);
|
2020-04-28 11:23:29 +02:00
|
|
|
if (p.isAccurate()) {
|
|
|
|
author.setName(p.getNormalisedFirstName());
|
|
|
|
author.setSurname(p.getNormalisedSurname());
|
|
|
|
}
|
2020-05-15 12:26:16 +02:00
|
|
|
|
2020-05-22 12:25:01 +02:00
|
|
|
final String pid = e.valueOf("./@nameIdentifier");
|
2020-05-22 12:34:00 +02:00
|
|
|
final String type = e
|
|
|
|
.valueOf("./@nameIdentifierScheme")
|
2020-05-22 12:25:01 +02:00
|
|
|
.trim()
|
|
|
|
.toUpperCase()
|
|
|
|
.replaceAll(" ", "")
|
|
|
|
.replaceAll("_", "");
|
2020-05-15 12:26:16 +02:00
|
|
|
|
2020-05-15 17:06:01 +02:00
|
|
|
author.setPid(new ArrayList<>());
|
2020-05-22 12:25:01 +02:00
|
|
|
|
|
|
|
if (StringUtils.isNotBlank(pid)) {
|
|
|
|
if (type.startsWith("ORCID")) {
|
2020-05-22 12:34:00 +02:00
|
|
|
final String cleanedId = pid
|
|
|
|
.replaceAll("http://orcid.org/", "")
|
|
|
|
.replaceAll("https://orcid.org/", "");
|
2020-05-22 12:25:01 +02:00
|
|
|
author.getPid().add(structuredProperty(cleanedId, ORCID_PID_TYPE, info));
|
|
|
|
} else if (type.startsWith("MAGID")) {
|
|
|
|
author.getPid().add(structuredProperty(pid, MAG_PID_TYPE, info));
|
|
|
|
}
|
2020-05-15 12:26:16 +02:00
|
|
|
}
|
|
|
|
|
2020-04-28 11:23:29 +02:00
|
|
|
res.add(author);
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected Qualifier prepareLanguages(final Document doc) {
|
2020-05-27 11:34:13 +02:00
|
|
|
return prepareQualifier(doc, "//dc:language", DNET_LANGUAGES);
|
2020-04-28 11:23:29 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<StructuredProperty> prepareSubjects(final Document doc, final DataInfo info) {
|
|
|
|
return prepareListStructProps(doc, "//dc:subject", info);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<StructuredProperty> prepareTitles(final Document doc, final DataInfo info) {
|
|
|
|
return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
|
|
|
return prepareListFields(doc, "//dc:description", info);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected Field<String> preparePublisher(final Document doc, final DataInfo info) {
|
|
|
|
return prepareField(doc, "//dc:publisher", info);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<Field<String>> prepareFormats(final Document doc, final DataInfo info) {
|
|
|
|
return prepareListFields(doc, "//dc:format", info);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<Field<String>> prepareContributors(final Document doc, final DataInfo info) {
|
|
|
|
return prepareListFields(doc, "//dc:contributor", info);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<Field<String>> prepareCoverages(final Document doc, final DataInfo info) {
|
|
|
|
return prepareListFields(doc, "//dc:coverage", info);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<Instance> prepareInstances(
|
|
|
|
final Document doc,
|
|
|
|
final DataInfo info,
|
|
|
|
final KeyValue collectedfrom,
|
|
|
|
final KeyValue hostedby) {
|
2020-05-06 13:20:02 +02:00
|
|
|
|
|
|
|
final Instance instance = new Instance();
|
|
|
|
instance
|
2020-05-27 11:34:13 +02:00
|
|
|
.setInstancetype(prepareQualifier(doc, "//dr:CobjCategory", DNET_PUBLICATION_RESOURCE));
|
2020-05-06 13:20:02 +02:00
|
|
|
instance.setCollectedfrom(collectedfrom);
|
|
|
|
instance.setHostedby(hostedby);
|
|
|
|
instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info));
|
|
|
|
instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation"));
|
|
|
|
instance
|
2020-05-27 11:34:13 +02:00
|
|
|
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
|
2020-05-06 13:20:02 +02:00
|
|
|
instance.setLicense(field(doc.valueOf("//oaf:license"), info));
|
2020-06-09 19:52:53 +02:00
|
|
|
instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS));
|
2020-05-06 13:20:02 +02:00
|
|
|
instance
|
2020-05-22 12:25:01 +02:00
|
|
|
.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
|
2020-05-06 13:20:02 +02:00
|
|
|
instance
|
2020-05-22 12:25:01 +02:00
|
|
|
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));
|
2020-05-06 13:20:02 +02:00
|
|
|
|
2020-05-22 12:25:01 +02:00
|
|
|
final List<Node> nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier"));
|
2020-05-08 09:43:26 +02:00
|
|
|
instance
|
2020-05-26 13:11:09 +02:00
|
|
|
.setUrl(
|
|
|
|
nodes
|
|
|
|
.stream()
|
|
|
|
.filter(n -> StringUtils.isNotBlank(n.getText()))
|
|
|
|
.map(n -> n.getText().trim())
|
|
|
|
.filter(u -> u.startsWith("http"))
|
|
|
|
.distinct()
|
|
|
|
.collect(Collectors.toCollection(ArrayList::new)));
|
2020-05-08 09:43:26 +02:00
|
|
|
|
2020-05-06 13:20:02 +02:00
|
|
|
return Lists.newArrayList(instance);
|
2020-04-28 11:23:29 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<Field<String>> prepareSources(final Document doc, final DataInfo info) {
|
|
|
|
return prepareListFields(doc, "//dc:source", info);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
|
|
|
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
|
|
|
}
|
|
|
|
|
|
|
|
// SOFTWARES
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
|
|
|
|
return null; // NOT PRESENT IN OAF
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected Field<String> prepareSoftwareCodeRepositoryUrl(
|
2020-05-22 12:25:01 +02:00
|
|
|
final Document doc,
|
|
|
|
final DataInfo info) {
|
2020-04-28 11:23:29 +02:00
|
|
|
return null; // NOT PRESENT IN OAF
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<StructuredProperty> prepareSoftwareLicenses(
|
2020-05-22 12:25:01 +02:00
|
|
|
final Document doc,
|
|
|
|
final DataInfo info) {
|
2020-04-28 11:23:29 +02:00
|
|
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<Field<String>> prepareSoftwareDocumentationUrls(
|
2020-05-22 12:25:01 +02:00
|
|
|
final Document doc,
|
|
|
|
final DataInfo info) {
|
2020-04-28 11:23:29 +02:00
|
|
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
|
|
|
}
|
|
|
|
|
|
|
|
// DATASETS
|
|
|
|
@Override
|
|
|
|
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
|
|
|
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected Field<String> prepareDatasetMetadataVersionNumber(
|
2020-05-22 12:25:01 +02:00
|
|
|
final Document doc,
|
|
|
|
final DataInfo info) {
|
2020-04-28 11:23:29 +02:00
|
|
|
return null; // NOT PRESENT IN OAF
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected Field<String> prepareDatasetLastMetadataUpdate(
|
2020-05-22 12:25:01 +02:00
|
|
|
final Document doc,
|
|
|
|
final DataInfo info) {
|
2020-04-28 11:23:29 +02:00
|
|
|
return null; // NOT PRESENT IN OAF
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
|
|
|
|
return null; // NOT PRESENT IN OAF
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
|
|
|
|
return null; // NOT PRESENT IN OAF
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
|
|
|
|
return null; // NOT PRESENT IN OAF
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
|
|
|
|
return null; // NOT PRESENT IN OAF
|
|
|
|
}
|
|
|
|
|
|
|
|
// OTHER PRODUCTS
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<Field<String>> prepareOtherResearchProductTools(
|
2020-05-22 12:25:01 +02:00
|
|
|
final Document doc,
|
|
|
|
final DataInfo info) {
|
2020-04-28 11:23:29 +02:00
|
|
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<Field<String>> prepareOtherResearchProductContactGroups(
|
2020-05-22 12:25:01 +02:00
|
|
|
final Document doc,
|
|
|
|
final DataInfo info) {
|
2020-04-28 11:23:29 +02:00
|
|
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<Field<String>> prepareOtherResearchProductContactPersons(
|
2020-05-22 12:25:01 +02:00
|
|
|
final Document doc,
|
|
|
|
final DataInfo info) {
|
2020-04-28 11:23:29 +02:00
|
|
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<Oaf> addOtherResultRels(
|
|
|
|
final Document doc,
|
|
|
|
final KeyValue collectedFrom,
|
|
|
|
final DataInfo info,
|
|
|
|
final long lastUpdateTimestamp) {
|
|
|
|
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
|
|
|
|
|
|
|
|
final List<Oaf> res = new ArrayList<>();
|
|
|
|
|
|
|
|
for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) {
|
|
|
|
|
|
|
|
final String originalId = ((Node) o).getText();
|
|
|
|
|
|
|
|
if (StringUtils.isNotBlank(originalId)) {
|
|
|
|
|
|
|
|
final String otherId = createOpenaireId(50, originalId, false);
|
|
|
|
|
2020-05-06 13:20:02 +02:00
|
|
|
res
|
2020-05-26 13:11:09 +02:00
|
|
|
.add(
|
|
|
|
getRelation(
|
2020-06-12 10:58:02 +02:00
|
|
|
docId, otherId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, collectedFrom, info,
|
2020-05-26 13:11:09 +02:00
|
|
|
lastUpdateTimestamp));
|
2020-05-06 13:20:02 +02:00
|
|
|
res
|
2020-05-26 13:11:09 +02:00
|
|
|
.add(
|
|
|
|
getRelation(
|
2020-06-12 10:58:02 +02:00
|
|
|
otherId, docId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, collectedFrom, info,
|
2020-05-26 13:11:09 +02:00
|
|
|
lastUpdateTimestamp));
|
2020-04-28 11:23:29 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
|
|
|
|
return null; // NOT PRESENT IN OAF
|
|
|
|
}
|
2020-05-26 13:06:55 +02:00
|
|
|
|
|
|
|
@Override
|
|
|
|
protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
|
2020-05-27 11:34:13 +02:00
|
|
|
return prepareListStructPropsWithValidQualifier(
|
|
|
|
doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info);
|
2020-05-26 13:06:55 +02:00
|
|
|
}
|
2020-02-04 15:25:47 +01:00
|
|
|
}
|