From 95740767e07cb6360797f48c73a0dc124ec2d903 Mon Sep 17 00:00:00 2001 From: Michele Artini Date: Mon, 10 Feb 2020 16:04:06 +0100 Subject: [PATCH] Ready for tests --- .../dhp/migration/AbstractMongoExecutor.java | 72 +++++-- .../dhp/migration/OafMigrationExecutor.java | 116 ++++++------ .../dhp/migration/OdfMigrationExecutor.java | 54 +++++- .../dhp/migration/pace/PacePerson.java | 176 ++++++++++++++++++ .../dhp/migration/pace/name_particles.txt | 7 + 5 files changed, 346 insertions(+), 79 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/pace/PacePerson.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/pace/name_particles.txt diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java index 1fa70dded..b2792e292 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/AbstractMongoExecutor.java @@ -30,6 +30,7 @@ import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; @@ -164,14 +165,56 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { } if (!oafs.isEmpty()) { - addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO - addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO - addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO + oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp)); + oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp)); } return oafs; } + private List addProjectRels(final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + + final List res = new ArrayList<>(); + + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier")); + + for (final Object o : doc.selectNodes("//oaf:projectid")) { + final String projectId = createOpenaireId(40, ((Node) o).getText()); + + final Relation r1 = new Relation(); + r1.setRelType("resultProject"); + r1.setSubRelType("outcome"); + r1.setRelClass("isProducedBy"); + r1.setSource(docId); + r1.setTarget(projectId); + r1.setCollectedFrom(Arrays.asList(collectedFrom)); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r1); + + final Relation r2 = new Relation(); + r2.setRelType("resultProject"); + r2.setSubRelType("outcome"); + r2.setRelClass("produces"); + r2.setSource(projectId); + r2.setTarget(docId); + r2.setCollectedFrom(Arrays.asList(collectedFrom)); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r2); + } + + return res; + } + + protected abstract List addOtherResultRels(final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp); + private void populateResultFields(final Result r, final Document doc, final KeyValue collectedFrom, @@ -199,19 +242,21 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { r.setPublisher(preparePublisher(doc, info)); r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); r.setSource(prepareSources(doc, info)); - r.setFulltext(null); // NOT PRESENT IN MDSTORES + r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES r.setFormat(prepareFormats(doc, info)); r.setContributor(prepareContributors(doc, info)); - r.setResourcetype(null); // TODO + r.setResourcetype(prepareResourceType(doc, info)); r.setCoverage(prepareCoverages(doc, info)); - r.setRefereed(null); // TODO - r.setContext(null); // TODO - r.setExternalReference(null); // TODO + r.setRefereed(null); // NOT PRESENT IN MDSTORES + r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES + r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy)); - r.setProcessingchargeamount(null); // TODO - r.setProcessingchargecurrency(null); // TODO + r.setProcessingchargeamount(null); // NOT PRESENT IN MDSTORES + r.setProcessingchargecurrency(null); // NOT PRESENT IN MDSTORES } + protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); + protected abstract List prepareInstances(Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); protected abstract List> prepareSources(Document doc, DataInfo info); @@ -264,13 +309,6 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor { protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); - abstract protected void addRelations(final List oafs, - final Document doc, - final String type, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp); - private Journal prepareJournal(final Document doc, final DataInfo info) { final Node n = doc.selectSingleNode("//oaf:journal"); if (n != null) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java index 6dcfae71f..75360943c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OafMigrationExecutor.java @@ -10,6 +10,7 @@ import org.apache.commons.logging.LogFactory; import org.dom4j.Document; import org.dom4j.Node; +import eu.dnetlib.dhp.migration.pace.PacePerson; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Field; @@ -37,29 +38,6 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); } - @Override - protected void addRelations(final List oafs, - final Document doc, - final String type, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - for (final Object o : doc.selectNodes("//")) { // TODO - final Node n = (Node) o; - final Relation r = new Relation(); - r.setRelType(null); // TODO - r.setSubRelType(null); // TODO - r.setRelClass(null); // TODO - r.setSource(null); // TODO - r.setTarget(null); // TODO - r.setCollectedFrom(Arrays.asList(collectedFrom)); - r.setDataInfo(info); - r.setLastupdatetimestamp(lastUpdateTimestamp); - oafs.add(r); - } - - } - @Override protected List prepareAuthors(final Document doc, final DataInfo info) { final List res = new ArrayList<>(); @@ -69,6 +47,11 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { final Author author = new Author(); author.setFullname(n.getText()); author.setRank(pos++); + final PacePerson p = new PacePerson(n.getText(), false); + if (p.isAccurate()) { + author.setName(p.getNormalisedFirstName()); + author.setSurname(p.getNormalisedSurname()); + } res.add(author); } return res; @@ -142,97 +125,124 @@ public class OafMigrationExecutor extends AbstractMongoExecutor { @Override protected List prepareRelevantDates(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // NOT PRESENT IN OAF } // SOFTWARES @Override protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // NOT PRESENT IN OAF } @Override protected Field prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // NOT PRESENT IN OAF } @Override protected List prepareSoftwareLicenses(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected List> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // NOT PRESENT IN OAF } // DATASETS @Override protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected Field prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // NOT PRESENT IN OAF } @Override protected Field prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // NOT PRESENT IN OAF } @Override protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // NOT PRESENT IN OAF } @Override protected Field prepareDatasetSize(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // NOT PRESENT IN OAF } @Override protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // NOT PRESENT IN OAF } @Override protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // NOT PRESENT IN OAF } // OTHER PRODUCTS @Override protected List> prepareOtherResearchProductTools(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected List> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // NOT PRESENT IN OAF } @Override protected List> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return new ArrayList<>(); // NOT PRESENT IN OAF + } + + @Override + protected List addOtherResultRels(final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier")); + + final List res = new ArrayList<>(); + + for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) { + final String otherId = createOpenaireId(50, ((Node) o).getText()); + + final Relation r1 = new Relation(); + r1.setRelType("resultResult"); + r1.setSubRelType("publicationDataset"); + r1.setRelClass("isRelatedTo"); + r1.setSource(docId); + r1.setTarget(otherId); + r1.setCollectedFrom(Arrays.asList(collectedFrom)); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r1); + + final Relation r2 = new Relation(); + r2.setRelType("resultResult"); + r2.setSubRelType("publicationDataset"); + r2.setRelClass("isRelatedTo"); + r2.setSource(otherId); + r2.setTarget(docId); + r2.setCollectedFrom(Arrays.asList(collectedFrom)); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r2); + } + return res; + } + + @Override + protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java index 5e9c70ae5..b1dbfcdf4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/OdfMigrationExecutor.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.migration; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map; @@ -18,6 +19,7 @@ import eu.dnetlib.dhp.schema.oaf.Instance; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; public class OdfMigrationExecutor extends AbstractMongoExecutor { @@ -190,8 +192,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { @Override protected Field prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) { - // TODO Auto-generated method stub - return null; + return null; // Not present in ODF ??? } @Override @@ -220,14 +221,49 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor { } @Override - protected void addRelations(final List oafs, - final Document doc, - final String type, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - // TODO Auto-generated method stub + protected List addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) { + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier")); + + final List res = new ArrayList<>(); + + for (final Object o : doc.selectNodes("//*[local-name() = 'resource']//*[local-name()='relatedIdentifier' and ./@relatedIdentifierType='OPENAIRE']")) { + final String otherId = createOpenaireId(50, ((Node) o).getText()); + final String type = ((Node) o).valueOf("@relationType"); + + if (type.equals("IsSupplementTo")) { + res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "supplement", "isSupplementTo")); + res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "supplement", "isSupplementedBy")); + } else if (type.equals("IsPartOf")) { + res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf")); + res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts")); + } else {} + } + return res; + } + + private Relation prepareOtherResultRel(final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp, + final String source, + final String target, + final String subRelType, + final String relClass) { + final Relation r = new Relation(); + r.setRelType("resultResult"); + r.setSubRelType(subRelType); + r.setRelClass(relClass); + r.setSource(source); + r.setTarget(target); + r.setCollectedFrom(Arrays.asList(collectedFrom)); + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + return r; + } + + @Override + protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { + return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", "dnet:dataCite_resource", "dnet:dataCite_resource"); } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/pace/PacePerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/pace/PacePerson.java new file mode 100644 index 000000000..927f5641b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/migration/pace/PacePerson.java @@ -0,0 +1,176 @@ +package eu.dnetlib.dhp.migration.pace; + +import java.nio.charset.Charset; +import java.text.Normalizer; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.text.WordUtils; + +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.hash.Hashing; + +public class PacePerson { + + private static final String UTF8 = "UTF-8"; + private List name = Lists.newArrayList(); + private List surname = Lists.newArrayList(); + private List fullname = Lists.newArrayList(); + private final String original; + + private static Set particles = null; + + public static final String capitalize(final String s) { + return WordUtils.capitalize(s.toLowerCase(), ' ', '-'); + } + + public static final String dotAbbreviations(final String s) { + return s.length() == 1 ? s + "." : s; + } + + public static Set loadFromClasspath(final String classpath) { + final Set h = new HashSet<>(); + try { + for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) { + h.add(s); + } + } catch (final Throwable e) { + return new HashSet<>(); + } + return h; + } + + public PacePerson(String s, final boolean aggressive) { + original = s; + s = Normalizer.normalize(s, Normalizer.Form.NFD); + s = s.replaceAll("\\(.+\\)", ""); + s = s.replaceAll("\\[.+\\]", ""); + s = s.replaceAll("\\{.+\\}", ""); + s = s.replaceAll("\\s+-\\s+", "-"); + s = s.replaceAll("[\\p{Punct}&&[^,-]]", " "); + s = s.replaceAll("\\d", " "); + s = s.replaceAll("\\n", " "); + s = s.replaceAll("\\.", " "); + s = s.replaceAll("\\s+", " "); + + if (aggressive) { + s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", ""); + // s = s.replaceAll("[\\W&&[^,-]]", ""); + } + + if (s.contains(",")) { + final String[] arr = s.split(","); + if (arr.length == 1) { + fullname = splitTerms(arr[0]); + } else if (arr.length > 1) { + surname = splitTerms(arr[0]); + name = splitTerms(arr[1]); + fullname.addAll(surname); + fullname.addAll(name); + } + } else { + fullname = splitTerms(s); + + int lastInitialPosition = fullname.size(); + boolean hasSurnameInUpperCase = false; + + for (int i = 0; i < fullname.size(); i++) { + final String term = fullname.get(i); + if (term.length() == 1) { + lastInitialPosition = i; + } else if (term.equals(term.toUpperCase())) { + hasSurnameInUpperCase = true; + } + } + + if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini + name = fullname.subList(0, lastInitialPosition + 1); + surname = fullname.subList(lastInitialPosition + 1, fullname.size()); + } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI + for (final String term : fullname) { + if (term.length() > 1 && term.equals(term.toUpperCase())) { + surname.add(term); + } else { + name.add(term); + } + } + } + } + } + + private List splitTerms(final String s) { + if (particles == null) { + particles = loadFromClasspath("/eu/dnetlib/dhp/migration/pace/name_particles.txt"); + } + + final List list = Lists.newArrayList(); + for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) { + if (!particles.contains(part.toLowerCase())) { + list.add(part); + } + } + return list; + } + + public List getName() { + return name; + } + + public String getNameString() { + return Joiner.on(" ").join(getName()); + } + + public List getSurname() { + return surname; + } + + public List getFullname() { + return fullname; + } + + public String getOriginal() { + return original; + } + + public String hash() { + return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString(); + } + + public String getNormalisedFirstName() { + return Joiner.on(" ").join(getCapitalFirstnames()); + } + + public String getNormalisedSurname() { + return Joiner.on(" ").join(getCapitalSurname()); + } + + public String getSurnameString() { + return Joiner.on(" ").join(getSurname()); + } + + public String getNormalisedFullname() { + return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname); + } + + public List getCapitalFirstnames() { + return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize)); + } + + public List getCapitalSurname() { + return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize)); + } + + public List getNameWithAbbreviations() { + return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations)); + } + + public boolean isAccurate() { + return name != null && surname != null && !name.isEmpty() && !surname.isEmpty(); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/pace/name_particles.txt b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/pace/name_particles.txt new file mode 100644 index 000000000..dae37c9dc --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/migration/pace/name_particles.txt @@ -0,0 +1,7 @@ +van +der +de +dell +sig +mr +mrs