master #2

Merged
sandro.labruzzo merged 16 commits from michele.artini/dnet-hadoop:master into master 2020-02-17 10:43:09 +01:00
5 changed files with 346 additions and 79 deletions
Showing only changes of commit 95740767e0 - Show all commits

View File

@ -30,6 +30,7 @@ import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
@ -164,14 +165,56 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
} }
if (!oafs.isEmpty()) { if (!oafs.isEmpty()) {
addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp));
addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp));
addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
} }
return oafs; return oafs;
} }
/**
 * Builds the result/project relations of the record held in {@code doc}.
 *
 * For every {@code //oaf:projectid} node two relations of type
 * {@code resultProject}/{@code outcome} are emitted, in this order:
 * {@code isProducedBy} (result to project) and {@code produces} (project to result).
 *
 * @param doc the parsed record
 * @param collectedFrom provenance attached to every relation
 * @param info data info attached to every relation
 * @param lastUpdateTimestamp timestamp attached to every relation
 * @return the generated relations, empty when the record has no project id
 */
private List<Oaf> addProjectRels(final Document doc,
    final KeyValue collectedFrom,
    final DataInfo info,
    final long lastUpdateTimestamp) {

    final List<Oaf> rels = new ArrayList<>();
    final String resultId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));

    for (final Object projectNode : doc.selectNodes("//oaf:projectid")) {
        final String projectId = createOpenaireId(40, ((Node) projectNode).getText());

        // { relClass, source, target } for the direct and the inverse relation
        final String[][] directions = {
            { "isProducedBy", resultId, projectId },
            { "produces", projectId, resultId }
        };

        for (final String[] direction : directions) {
            final Relation rel = new Relation();
            rel.setRelType("resultProject");
            rel.setSubRelType("outcome");
            rel.setRelClass(direction[0]);
            rel.setSource(direction[1]);
            rel.setTarget(direction[2]);
            rel.setCollectedFrom(Arrays.asList(collectedFrom));
            rel.setDataInfo(info);
            rel.setLastupdatetimestamp(lastUpdateTimestamp);
            rels.add(rel);
        }
    }

    return rels;
}
/**
 * Format-specific hook that builds the relations between the record held in
 * {@code doc} and other results.
 *
 * The caller appends the returned list to the Oaf objects generated for the
 * record via {@code addAll}, so implementations must not return {@code null}.
 *
 * @param doc the parsed record
 * @param collectedFrom provenance to attach to the relations
 * @param info data info to attach to the relations
 * @param lastUpdateTimestamp timestamp to attach to the relations
 * @return the relations found in the record
 */
protected abstract List<Oaf> addOtherResultRels(final Document doc,
final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp);
private void populateResultFields(final Result r, private void populateResultFields(final Result r,
final Document doc, final Document doc,
final KeyValue collectedFrom, final KeyValue collectedFrom,
@ -199,19 +242,21 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
r.setPublisher(preparePublisher(doc, info)); r.setPublisher(preparePublisher(doc, info));
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
r.setSource(prepareSources(doc, info)); r.setSource(prepareSources(doc, info));
r.setFulltext(null); // NOT PRESENT IN MDSTORES r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setFormat(prepareFormats(doc, info)); r.setFormat(prepareFormats(doc, info));
r.setContributor(prepareContributors(doc, info)); r.setContributor(prepareContributors(doc, info));
r.setResourcetype(null); // TODO r.setResourcetype(prepareResourceType(doc, info));
r.setCoverage(prepareCoverages(doc, info)); r.setCoverage(prepareCoverages(doc, info));
r.setRefereed(null); // TODO r.setRefereed(null); // NOT PRESENT IN MDSTORES
r.setContext(null); // TODO r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setExternalReference(null); // TODO r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy)); r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy));
r.setProcessingchargeamount(null); // TODO r.setProcessingchargeamount(null); // NOT PRESENT IN MDSTORES
r.setProcessingchargecurrency(null); // TODO r.setProcessingchargecurrency(null); // NOT PRESENT IN MDSTORES
} }
/**
 * Format-specific hook that extracts the resource type of the record held in
 * {@code doc}; the value is stored on the result through {@code setResourcetype}.
 *
 * @param doc the parsed record
 * @param info data info of the record
 * @return the resource type qualifier; an implementation may return {@code null}
 *         when the format does not carry this information
 */
protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
protected abstract List<Instance> prepareInstances(Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); protected abstract List<Instance> prepareInstances(Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby);
protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info); protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);
@ -264,13 +309,6 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info); protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
abstract protected void addRelations(final List<Oaf> oafs,
final Document doc,
final String type,
final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp);
private Journal prepareJournal(final Document doc, final DataInfo info) { private Journal prepareJournal(final Document doc, final DataInfo info) {
final Node n = doc.selectSingleNode("//oaf:journal"); final Node n = doc.selectSingleNode("//oaf:journal");
if (n != null) { if (n != null) {

View File

@ -10,6 +10,7 @@ import org.apache.commons.logging.LogFactory;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.Node; import org.dom4j.Node;
import eu.dnetlib.dhp.migration.pace.PacePerson;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field; import eu.dnetlib.dhp.schema.oaf.Field;
@ -37,29 +38,6 @@ public class OafMigrationExecutor extends AbstractMongoExecutor {
nsContext.put("dc", "http://purl.org/dc/elements/1.1/"); nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
} }
@Override
protected void addRelations(final List<Oaf> oafs,
final Document doc,
final String type,
final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp) {
for (final Object o : doc.selectNodes("//")) { // TODO
final Node n = (Node) o;
final Relation r = new Relation();
r.setRelType(null); // TODO
r.setSubRelType(null); // TODO
r.setRelClass(null); // TODO
r.setSource(null); // TODO
r.setTarget(null); // TODO
r.setCollectedFrom(Arrays.asList(collectedFrom));
r.setDataInfo(info);
r.setLastupdatetimestamp(lastUpdateTimestamp);
oafs.add(r);
}
}
@Override @Override
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) { protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
final List<Author> res = new ArrayList<>(); final List<Author> res = new ArrayList<>();
@ -69,6 +47,11 @@ public class OafMigrationExecutor extends AbstractMongoExecutor {
final Author author = new Author(); final Author author = new Author();
author.setFullname(n.getText()); author.setFullname(n.getText());
author.setRank(pos++); author.setRank(pos++);
final PacePerson p = new PacePerson(n.getText(), false);
if (p.isAccurate()) {
author.setName(p.getNormalisedFirstName());
author.setSurname(p.getNormalisedSurname());
}
res.add(author); res.add(author);
} }
return res; return res;
@ -142,97 +125,124 @@ public class OafMigrationExecutor extends AbstractMongoExecutor {
@Override @Override
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) { protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return new ArrayList<>(); // NOT PRESENT IN OAF
return null;
} }
// SOFTWARES // SOFTWARES
@Override @Override
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return null; // NOT PRESENT IN OAF
return null;
} }
@Override @Override
protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) { protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return null; // NOT PRESENT IN OAF
return null;
} }
@Override @Override
protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) { protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return new ArrayList<>(); // NOT PRESENT IN OAF
return null;
} }
@Override @Override
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) { protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return new ArrayList<>(); // NOT PRESENT IN OAF
return null;
} }
// DATASETS // DATASETS
@Override @Override
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) { protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return new ArrayList<>(); // NOT PRESENT IN OAF
return null;
} }
@Override @Override
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) { protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return null; // NOT PRESENT IN OAF
return null;
} }
@Override @Override
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) { protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return null; // NOT PRESENT IN OAF
return null;
} }
@Override @Override
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) { protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return null; // NOT PRESENT IN OAF
return null;
} }
@Override @Override
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) { protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return null; // NOT PRESENT IN OAF
return null;
} }
@Override @Override
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) { protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return null; // NOT PRESENT IN OAF
return null;
} }
@Override @Override
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) { protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return null; // NOT PRESENT IN OAF
return null;
} }
// OTHER PRODUCTS // OTHER PRODUCTS
@Override @Override
protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) { protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return new ArrayList<>(); // NOT PRESENT IN OAF
return null;
} }
@Override @Override
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) { protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return new ArrayList<>(); // NOT PRESENT IN OAF
return null;
} }
@Override @Override
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) { protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return new ArrayList<>(); // NOT PRESENT IN OAF
return null; }
/**
 * Builds the publication/dataset relations of an OAF record.
 *
 * For every {@code relatedDataset} element (matched by local name) two
 * {@code resultResult}/{@code publicationDataset} relations with class
 * {@code isRelatedTo} are emitted: first from this record to the related one,
 * then the other way round.
 */
@Override
protected List<Oaf> addOtherResultRels(final Document doc,
    final KeyValue collectedFrom,
    final DataInfo info,
    final long lastUpdateTimestamp) {

    final String thisId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
    final List<Oaf> rels = new ArrayList<>();

    for (final Object relatedNode : doc.selectNodes("//*[local-name()='relatedDataset']")) {
        final String relatedId = createOpenaireId(50, ((Node) relatedNode).getText());

        // { source, target } for the direct and the inverse relation
        final String[][] endpoints = {
            { thisId, relatedId },
            { relatedId, thisId }
        };

        for (final String[] pair : endpoints) {
            final Relation rel = new Relation();
            rel.setRelType("resultResult");
            rel.setSubRelType("publicationDataset");
            rel.setRelClass("isRelatedTo");
            rel.setSource(pair[0]);
            rel.setTarget(pair[1]);
            rel.setCollectedFrom(Arrays.asList(collectedFrom));
            rel.setDataInfo(info);
            rel.setLastupdatetimestamp(lastUpdateTimestamp);
            rels.add(rel);
        }
    }

    return rels;
}
/**
 * OAF implementation of the resource type hook: always yields {@code null}
 * because, as noted below, this information is not present in OAF records.
 */
@Override
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
return null; // NOT PRESENT IN OAF
} }
} }

View File

@ -1,6 +1,7 @@
package eu.dnetlib.dhp.migration; package eu.dnetlib.dhp.migration;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -18,6 +19,7 @@ import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OdfMigrationExecutor extends AbstractMongoExecutor { public class OdfMigrationExecutor extends AbstractMongoExecutor {
@ -190,8 +192,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
@Override @Override
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) { protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
// TODO Auto-generated method stub return null; // Not present in ODF ???
return null;
} }
@Override @Override
@ -220,14 +221,49 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
} }
@Override @Override
protected void addRelations(final List<Oaf> oafs, protected List<Oaf> addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {
final Document doc,
final String type,
final KeyValue collectedFrom,
final DataInfo info,
final long lastUpdateTimestamp) {
// TODO Auto-generated method stub
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
final List<Oaf> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//*[local-name() = 'resource']//*[local-name()='relatedIdentifier' and ./@relatedIdentifierType='OPENAIRE']")) {
final String otherId = createOpenaireId(50, ((Node) o).getText());
final String type = ((Node) o).valueOf("@relationType");
if (type.equals("IsSupplementTo")) {
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "supplement", "isSupplementTo"));
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "supplement", "isSupplementedBy"));
} else if (type.equals("IsPartOf")) {
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf"));
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts"));
} else {}
}
return res;
}
/**
 * Creates one {@code resultResult} relation going from {@code source} to
 * {@code target}.
 *
 * @param collectedFrom provenance, wrapped in a single-element list
 * @param info data info attached to the relation
 * @param lastUpdateTimestamp timestamp attached to the relation
 * @param source identifier of the relation source
 * @param target identifier of the relation target
 * @param subRelType sub-type of the relation
 * @param relClass class of the relation
 * @return the populated relation
 */
private Relation prepareOtherResultRel(final KeyValue collectedFrom,
    final DataInfo info,
    final long lastUpdateTimestamp,
    final String source,
    final String target,
    final String subRelType,
    final String relClass) {

    final Relation rel = new Relation();

    // endpoints
    rel.setSource(source);
    rel.setTarget(target);

    // semantics
    rel.setRelType("resultResult");
    rel.setSubRelType(subRelType);
    rel.setRelClass(relClass);

    // provenance
    rel.setCollectedFrom(Arrays.asList(collectedFrom));
    rel.setDataInfo(info);
    rel.setLastupdatetimestamp(lastUpdateTimestamp);

    return rel;
}
/**
 * ODF implementation of the resource type hook: delegates to
 * {@code prepareQualifier} using the XPath of the {@code resourceType} element
 * nested in {@code resource} (both matched by local name).
 *
 * NOTE(review): the two {@code "dnet:dataCite_resource"} arguments are presumably
 * the scheme id and scheme name of the qualifier - confirm against
 * {@code prepareQualifier}.
 */
@Override
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", "dnet:dataCite_resource", "dnet:dataCite_resource");
} }
} }

View File

@ -0,0 +1,176 @@
package eu.dnetlib.dhp.migration.pace;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.text.Normalizer;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.text.WordUtils;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.hash.Hashing;
/**
 * Splits a free-text personal name into first-name, surname and full-name terms.
 *
 * The input is cleaned (bracketed text, digits and most punctuation removed) and then
 * split on spaces; terms listed in the name-particles resource are dropped. Name and
 * surname are recognised in three cases:
 * <ul>
 * <li>the text contains a comma: the part before the first comma is the surname, the
 * part between the first and the second comma is the name;</li>
 * <li>a one-character term (an initial) is followed by further terms: everything up to
 * and including the last initial is the name, the rest is the surname;</li>
 * <li>otherwise, if at least one multi-character term is entirely upper case: such terms
 * form the surname, all remaining terms the name.</li>
 * </ul>
 * When none of the cases applies only the full name is populated and
 * {@link #isAccurate()} returns {@code false}.
 */
public class PacePerson {

    private static final String UTF8 = "UTF-8";

    /** Classpath location of the terms that are dropped while splitting a name. */
    private static final String PARTICLES_RESOURCE = "/eu/dnetlib/dhp/migration/pace/name_particles.txt";

    /**
     * Name particles, loaded once during class initialisation. The previous lazy,
     * unsynchronised assignment from an instance method was a data race when instances
     * were created from several threads.
     */
    private static final Set<String> PARTICLES = loadFromClasspath(PARTICLES_RESOURCE);

    private final List<String> name = Lists.newArrayList();

    private final List<String> surname = Lists.newArrayList();

    private final List<String> fullname = Lists.newArrayList();

    private final String original;

    /**
     * Lower-cases {@code s} and capitalises the first letter of every word, where words
     * are delimited by a space or a hyphen.
     *
     * @param s the text to capitalise
     * @return the capitalised text
     */
    public static final String capitalize(final String s) {
        // Locale.ROOT keeps the result independent of the JVM default locale
        // (e.g. "I" must not become a dotless i under a Turkish locale).
        return WordUtils.capitalize(s.toLowerCase(Locale.ROOT), ' ', '-');
    }

    /**
     * Appends a dot to one-character strings, turning an initial into an abbreviation.
     *
     * @param s the term
     * @return {@code s + "."} when {@code s} has length 1, otherwise {@code s} unchanged
     */
    public static final String dotAbbreviations(final String s) {
        return s.length() == 1 ? s + "." : s;
    }

    /**
     * Reads a classpath resource as UTF-8 and returns its lines as a set.
     *
     * Loading is best-effort: a {@code null} path, a missing resource or a read error
     * yields an empty set instead of an exception.
     *
     * @param classpath resource path, resolved relative to this class
     * @return the distinct lines of the resource, or an empty set if it cannot be read
     */
    public static Set<String> loadFromClasspath(final String classpath) {
        final Set<String> lines = new HashSet<>();
        if (classpath == null) {
            return lines;
        }
        // try-with-resources closes the stream; a null resource is simply skipped on close
        try (InputStream in = PacePerson.class.getResourceAsStream(classpath)) {
            if (in == null) {
                return lines; // resource not found
            }
            lines.addAll(IOUtils.readLines(in, UTF8));
        } catch (final IOException ignored) {
            // deliberate best-effort: an unreadable resource means "no entries"
            return new HashSet<>();
        }
        return lines;
    }

    /**
     * Parses {@code s} into name, surname and full-name terms.
     *
     * @param s the raw name; {@code null} is treated as an empty name
     * @param aggressive when {@code true}, combining diacritical marks produced by the
     *        NFD normalisation are removed as well
     */
    public PacePerson(String s, final boolean aggressive) {
        original = s;
        if (s == null) {
            return; // nothing to parse, all term lists stay empty
        }

        s = Normalizer.normalize(s, Normalizer.Form.NFD);

        // drop bracketed text, then reduce everything but ',' and '-' to blanks
        s = s.replaceAll("\\(.+\\)", "");
        s = s.replaceAll("\\[.+\\]", "");
        s = s.replaceAll("\\{.+\\}", "");
        s = s.replaceAll("\\s+-\\s+", "-");
        s = s.replaceAll("[\\p{Punct}&&[^,-]]", " ");
        s = s.replaceAll("\\d", " ");
        s = s.replaceAll("\\n", " ");
        s = s.replaceAll("\\.", " ");
        s = s.replaceAll("\\s+", " ");

        if (aggressive) {
            s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", "");
        }

        if (s.contains(",")) {
            parseCommaSeparated(s);
        } else {
            parseFreeForm(s);
        }
    }

    /** Handles "surname, name": only the first two comma-separated parts are used. */
    private void parseCommaSeparated(final String s) {
        final String[] arr = s.split(",");
        if (arr.length == 1) {
            fullname.addAll(splitTerms(arr[0]));
        } else if (arr.length > 1) {
            surname.addAll(splitTerms(arr[0]));
            name.addAll(splitTerms(arr[1]));
            fullname.addAll(surname);
            fullname.addAll(name);
        }
    }

    /** Handles names without a comma, using initials or upper-case terms as hints. */
    private void parseFreeForm(final String s) {
        fullname.addAll(splitTerms(s));

        int lastInitialPosition = fullname.size();
        boolean hasSurnameInUpperCase = false;
        for (int i = 0; i < fullname.size(); i++) {
            final String term = fullname.get(i);
            if (term.length() == 1) {
                lastInitialPosition = i;
            } else if (term.equals(term.toUpperCase(Locale.ROOT))) {
                hasSurnameInUpperCase = true;
            }
        }

        if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
            // copies rather than subList views, so the three lists stay independent
            name.addAll(fullname.subList(0, lastInitialPosition + 1));
            surname.addAll(fullname.subList(lastInitialPosition + 1, fullname.size()));
        } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
            for (final String term : fullname) {
                if (term.length() > 1 && term.equals(term.toUpperCase(Locale.ROOT))) {
                    surname.add(term);
                } else {
                    name.add(term);
                }
            }
        }
    }

    /** Splits {@code s} on spaces, skipping empty tokens and name particles. */
    private List<String> splitTerms(final String s) {
        final List<String> list = Lists.newArrayList();
        for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
            if (!PARTICLES.contains(part.toLowerCase(Locale.ROOT))) {
                list.add(part);
            }
        }
        return list;
    }

    /** @return the first-name terms, empty when they could not be identified */
    public List<String> getName() {
        return name;
    }

    /** @return the first-name terms joined by a space */
    public String getNameString() {
        return Joiner.on(" ").join(getName());
    }

    /** @return the surname terms, empty when they could not be identified */
    public List<String> getSurname() {
        return surname;
    }

    /** @return all the terms of the name */
    public List<String> getFullname() {
        return fullname;
    }

    /** @return the string passed to the constructor, unmodified */
    public String getOriginal() {
        return original;
    }

    /** @return the murmur3 128-bit hash of {@link #getNormalisedFullname()} encoded as UTF-8 */
    public String hash() {
        return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString();
    }

    /** @return the capitalised first names (initials dotted) joined by a space */
    public String getNormalisedFirstName() {
        return Joiner.on(" ").join(getCapitalFirstnames());
    }

    /** @return the capitalised surname terms joined by a space */
    public String getNormalisedSurname() {
        return Joiner.on(" ").join(getCapitalSurname());
    }

    /** @return the surname terms joined by a space */
    public String getSurnameString() {
        return Joiner.on(" ").join(getSurname());
    }

    /**
     * @return {@code "Surname, Name"} when both parts were identified, otherwise the
     *         full-name terms joined by a space
     */
    public String getNormalisedFullname() {
        return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname);
    }

    /** @return the first-name terms with initials dotted and every term capitalised */
    public List<String> getCapitalFirstnames() {
        return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize));
    }

    /** @return the surname terms, capitalised */
    public List<String> getCapitalSurname() {
        return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize));
    }

    /** @return the first-name terms with one-character terms turned into dotted initials */
    public List<String> getNameWithAbbreviations() {
        return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations));
    }

    /** @return {@code true} when both a name and a surname were identified */
    public boolean isAccurate() {
        return !name.isEmpty() && !surname.isEmpty();
    }
}

View File

@ -0,0 +1,7 @@
van
der
de
dell
sig
mr
mrs