forked from D-Net/dnet-hadoop
Ready for tests
This commit is contained in:
parent
181e8498d4
commit
95740767e0
|
@ -30,6 +30,7 @@ import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
@ -164,14 +165,56 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!oafs.isEmpty()) {
|
if (!oafs.isEmpty()) {
|
||||||
addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
|
oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp));
|
||||||
addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
|
oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp));
|
||||||
addRelations(oafs, doc, "TYPE", collectedFrom, info, lastUpdateTimestamp); // TODO
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return oafs;
|
return oafs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private List<Oaf> addProjectRels(final Document doc,
|
||||||
|
final KeyValue collectedFrom,
|
||||||
|
final DataInfo info,
|
||||||
|
final long lastUpdateTimestamp) {
|
||||||
|
|
||||||
|
final List<Oaf> res = new ArrayList<>();
|
||||||
|
|
||||||
|
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
|
||||||
|
|
||||||
|
for (final Object o : doc.selectNodes("//oaf:projectid")) {
|
||||||
|
final String projectId = createOpenaireId(40, ((Node) o).getText());
|
||||||
|
|
||||||
|
final Relation r1 = new Relation();
|
||||||
|
r1.setRelType("resultProject");
|
||||||
|
r1.setSubRelType("outcome");
|
||||||
|
r1.setRelClass("isProducedBy");
|
||||||
|
r1.setSource(docId);
|
||||||
|
r1.setTarget(projectId);
|
||||||
|
r1.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||||
|
r1.setDataInfo(info);
|
||||||
|
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||||
|
res.add(r1);
|
||||||
|
|
||||||
|
final Relation r2 = new Relation();
|
||||||
|
r2.setRelType("resultProject");
|
||||||
|
r2.setSubRelType("outcome");
|
||||||
|
r2.setRelClass("produces");
|
||||||
|
r2.setSource(projectId);
|
||||||
|
r2.setTarget(docId);
|
||||||
|
r2.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||||
|
r2.setDataInfo(info);
|
||||||
|
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||||
|
res.add(r2);
|
||||||
|
}
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected abstract List<Oaf> addOtherResultRels(final Document doc,
|
||||||
|
final KeyValue collectedFrom,
|
||||||
|
final DataInfo info,
|
||||||
|
final long lastUpdateTimestamp);
|
||||||
|
|
||||||
private void populateResultFields(final Result r,
|
private void populateResultFields(final Result r,
|
||||||
final Document doc,
|
final Document doc,
|
||||||
final KeyValue collectedFrom,
|
final KeyValue collectedFrom,
|
||||||
|
@ -199,19 +242,21 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
||||||
r.setPublisher(preparePublisher(doc, info));
|
r.setPublisher(preparePublisher(doc, info));
|
||||||
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
|
r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
|
||||||
r.setSource(prepareSources(doc, info));
|
r.setSource(prepareSources(doc, info));
|
||||||
r.setFulltext(null); // NOT PRESENT IN MDSTORES
|
r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||||
r.setFormat(prepareFormats(doc, info));
|
r.setFormat(prepareFormats(doc, info));
|
||||||
r.setContributor(prepareContributors(doc, info));
|
r.setContributor(prepareContributors(doc, info));
|
||||||
r.setResourcetype(null); // TODO
|
r.setResourcetype(prepareResourceType(doc, info));
|
||||||
r.setCoverage(prepareCoverages(doc, info));
|
r.setCoverage(prepareCoverages(doc, info));
|
||||||
r.setRefereed(null); // TODO
|
r.setRefereed(null); // NOT PRESENT IN MDSTORES
|
||||||
r.setContext(null); // TODO
|
r.setContext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||||
r.setExternalReference(null); // TODO
|
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||||
r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy));
|
r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy));
|
||||||
r.setProcessingchargeamount(null); // TODO
|
r.setProcessingchargeamount(null); // NOT PRESENT IN MDSTORES
|
||||||
r.setProcessingchargecurrency(null); // TODO
|
r.setProcessingchargecurrency(null); // NOT PRESENT IN MDSTORES
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
|
||||||
|
|
||||||
protected abstract List<Instance> prepareInstances(Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby);
|
protected abstract List<Instance> prepareInstances(Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby);
|
||||||
|
|
||||||
protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);
|
protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);
|
||||||
|
@ -264,13 +309,6 @@ public abstract class AbstractMongoExecutor extends AbstractMigrationExecutor {
|
||||||
|
|
||||||
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
|
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
|
||||||
|
|
||||||
abstract protected void addRelations(final List<Oaf> oafs,
|
|
||||||
final Document doc,
|
|
||||||
final String type,
|
|
||||||
final KeyValue collectedFrom,
|
|
||||||
final DataInfo info,
|
|
||||||
final long lastUpdateTimestamp);
|
|
||||||
|
|
||||||
private Journal prepareJournal(final Document doc, final DataInfo info) {
|
private Journal prepareJournal(final Document doc, final DataInfo info) {
|
||||||
final Node n = doc.selectSingleNode("//oaf:journal");
|
final Node n = doc.selectSingleNode("//oaf:journal");
|
||||||
if (n != null) {
|
if (n != null) {
|
||||||
|
|
|
@ -10,6 +10,7 @@ import org.apache.commons.logging.LogFactory;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
import org.dom4j.Node;
|
import org.dom4j.Node;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.migration.pace.PacePerson;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
|
@ -37,29 +38,6 @@ public class OafMigrationExecutor extends AbstractMongoExecutor {
|
||||||
nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
|
nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
protected void addRelations(final List<Oaf> oafs,
|
|
||||||
final Document doc,
|
|
||||||
final String type,
|
|
||||||
final KeyValue collectedFrom,
|
|
||||||
final DataInfo info,
|
|
||||||
final long lastUpdateTimestamp) {
|
|
||||||
for (final Object o : doc.selectNodes("//")) { // TODO
|
|
||||||
final Node n = (Node) o;
|
|
||||||
final Relation r = new Relation();
|
|
||||||
r.setRelType(null); // TODO
|
|
||||||
r.setSubRelType(null); // TODO
|
|
||||||
r.setRelClass(null); // TODO
|
|
||||||
r.setSource(null); // TODO
|
|
||||||
r.setTarget(null); // TODO
|
|
||||||
r.setCollectedFrom(Arrays.asList(collectedFrom));
|
|
||||||
r.setDataInfo(info);
|
|
||||||
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
|
||||||
oafs.add(r);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
|
protected List<Author> prepareAuthors(final Document doc, final DataInfo info) {
|
||||||
final List<Author> res = new ArrayList<>();
|
final List<Author> res = new ArrayList<>();
|
||||||
|
@ -69,6 +47,11 @@ public class OafMigrationExecutor extends AbstractMongoExecutor {
|
||||||
final Author author = new Author();
|
final Author author = new Author();
|
||||||
author.setFullname(n.getText());
|
author.setFullname(n.getText());
|
||||||
author.setRank(pos++);
|
author.setRank(pos++);
|
||||||
|
final PacePerson p = new PacePerson(n.getText(), false);
|
||||||
|
if (p.isAccurate()) {
|
||||||
|
author.setName(p.getNormalisedFirstName());
|
||||||
|
author.setSurname(p.getNormalisedSurname());
|
||||||
|
}
|
||||||
res.add(author);
|
res.add(author);
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
|
@ -142,97 +125,124 @@ public class OafMigrationExecutor extends AbstractMongoExecutor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
|
protected List<StructuredProperty> prepareRelevantDates(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// SOFTWARES
|
// SOFTWARES
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
|
protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return null; // NOT PRESENT IN OAF
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
|
protected Field<String> prepareSoftwareCodeRepositoryUrl(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return null; // NOT PRESENT IN OAF
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
|
protected List<StructuredProperty> prepareSoftwareLicenses(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
|
protected List<Field<String>> prepareSoftwareDocumentationUrls(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// DATASETS
|
// DATASETS
|
||||||
@Override
|
@Override
|
||||||
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
|
protected List<GeoLocation> prepareDatasetGeoLocations(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
|
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return null; // NOT PRESENT IN OAF
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
|
protected Field<String> prepareDatasetLastMetadataUpdate(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return null; // NOT PRESENT IN OAF
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
|
protected Field<String> prepareDatasetVersion(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return null; // NOT PRESENT IN OAF
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
|
protected Field<String> prepareDatasetSize(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return null; // NOT PRESENT IN OAF
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
|
protected Field<String> prepareDatasetDevice(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return null; // NOT PRESENT IN OAF
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
|
protected Field<String> prepareDatasetStorageDate(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return null; // NOT PRESENT IN OAF
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// OTHER PRODUCTS
|
// OTHER PRODUCTS
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
|
protected List<Field<String>> prepareOtherResearchProductTools(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
|
protected List<Field<String>> prepareOtherResearchProductContactGroups(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
|
protected List<Field<String>> prepareOtherResearchProductContactPersons(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return new ArrayList<>(); // NOT PRESENT IN OAF
|
||||||
return null;
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected List<Oaf> addOtherResultRels(final Document doc,
|
||||||
|
final KeyValue collectedFrom,
|
||||||
|
final DataInfo info,
|
||||||
|
final long lastUpdateTimestamp) {
|
||||||
|
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
|
||||||
|
|
||||||
|
final List<Oaf> res = new ArrayList<>();
|
||||||
|
|
||||||
|
for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) {
|
||||||
|
final String otherId = createOpenaireId(50, ((Node) o).getText());
|
||||||
|
|
||||||
|
final Relation r1 = new Relation();
|
||||||
|
r1.setRelType("resultResult");
|
||||||
|
r1.setSubRelType("publicationDataset");
|
||||||
|
r1.setRelClass("isRelatedTo");
|
||||||
|
r1.setSource(docId);
|
||||||
|
r1.setTarget(otherId);
|
||||||
|
r1.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||||
|
r1.setDataInfo(info);
|
||||||
|
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||||
|
res.add(r1);
|
||||||
|
|
||||||
|
final Relation r2 = new Relation();
|
||||||
|
r2.setRelType("resultResult");
|
||||||
|
r2.setSubRelType("publicationDataset");
|
||||||
|
r2.setRelClass("isRelatedTo");
|
||||||
|
r2.setSource(otherId);
|
||||||
|
r2.setTarget(docId);
|
||||||
|
r2.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||||
|
r2.setDataInfo(info);
|
||||||
|
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||||
|
res.add(r2);
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
|
||||||
|
return null; // NOT PRESENT IN OAF
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package eu.dnetlib.dhp.migration;
|
package eu.dnetlib.dhp.migration;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
@ -18,6 +19,7 @@ import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
||||||
|
@ -190,8 +192,7 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
|
protected Field<String> prepareDatasetMetadataVersionNumber(final Document doc, final DataInfo info) {
|
||||||
// TODO Auto-generated method stub
|
return null; // Not present in ODF ???
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -220,14 +221,49 @@ public class OdfMigrationExecutor extends AbstractMongoExecutor {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void addRelations(final List<Oaf> oafs,
|
protected List<Oaf> addOtherResultRels(final Document doc, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) {
|
||||||
final Document doc,
|
|
||||||
final String type,
|
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"));
|
||||||
final KeyValue collectedFrom,
|
|
||||||
|
final List<Oaf> res = new ArrayList<>();
|
||||||
|
|
||||||
|
for (final Object o : doc.selectNodes("//*[local-name() = 'resource']//*[local-name()='relatedIdentifier' and ./@relatedIdentifierType='OPENAIRE']")) {
|
||||||
|
final String otherId = createOpenaireId(50, ((Node) o).getText());
|
||||||
|
final String type = ((Node) o).valueOf("@relationType");
|
||||||
|
|
||||||
|
if (type.equals("IsSupplementTo")) {
|
||||||
|
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "supplement", "isSupplementTo"));
|
||||||
|
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "supplement", "isSupplementedBy"));
|
||||||
|
} else if (type.equals("IsPartOf")) {
|
||||||
|
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf"));
|
||||||
|
res.add(prepareOtherResultRel(collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts"));
|
||||||
|
} else {}
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
private Relation prepareOtherResultRel(final KeyValue collectedFrom,
|
||||||
final DataInfo info,
|
final DataInfo info,
|
||||||
final long lastUpdateTimestamp) {
|
final long lastUpdateTimestamp,
|
||||||
// TODO Auto-generated method stub
|
final String source,
|
||||||
|
final String target,
|
||||||
|
final String subRelType,
|
||||||
|
final String relClass) {
|
||||||
|
final Relation r = new Relation();
|
||||||
|
r.setRelType("resultResult");
|
||||||
|
r.setSubRelType(subRelType);
|
||||||
|
r.setRelClass(relClass);
|
||||||
|
r.setSource(source);
|
||||||
|
r.setTarget(target);
|
||||||
|
r.setCollectedFrom(Arrays.asList(collectedFrom));
|
||||||
|
r.setDataInfo(info);
|
||||||
|
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Qualifier prepareResourceType(final Document doc, final DataInfo info) {
|
||||||
|
return prepareQualifier(doc, "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", "dnet:dataCite_resource", "dnet:dataCite_resource");
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,176 @@
|
||||||
|
package eu.dnetlib.dhp.migration.pace;
|
||||||
|
|
||||||
|
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.text.Normalizer;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.text.WordUtils;

import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.hash.Hashing;
|
||||||
|
|
||||||
|
public class PacePerson {
|
||||||
|
|
||||||
|
private static final String UTF8 = "UTF-8";
|
||||||
|
private List<String> name = Lists.newArrayList();
|
||||||
|
private List<String> surname = Lists.newArrayList();
|
||||||
|
private List<String> fullname = Lists.newArrayList();
|
||||||
|
private final String original;
|
||||||
|
|
||||||
|
private static Set<String> particles = null;
|
||||||
|
|
||||||
|
public static final String capitalize(final String s) {
|
||||||
|
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
|
||||||
|
}
|
||||||
|
|
||||||
|
public static final String dotAbbreviations(final String s) {
|
||||||
|
return s.length() == 1 ? s + "." : s;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Set<String> loadFromClasspath(final String classpath) {
|
||||||
|
final Set<String> h = new HashSet<>();
|
||||||
|
try {
|
||||||
|
for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) {
|
||||||
|
h.add(s);
|
||||||
|
}
|
||||||
|
} catch (final Throwable e) {
|
||||||
|
return new HashSet<>();
|
||||||
|
}
|
||||||
|
return h;
|
||||||
|
}
|
||||||
|
|
||||||
|
public PacePerson(String s, final boolean aggressive) {
|
||||||
|
original = s;
|
||||||
|
s = Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||||
|
s = s.replaceAll("\\(.+\\)", "");
|
||||||
|
s = s.replaceAll("\\[.+\\]", "");
|
||||||
|
s = s.replaceAll("\\{.+\\}", "");
|
||||||
|
s = s.replaceAll("\\s+-\\s+", "-");
|
||||||
|
s = s.replaceAll("[\\p{Punct}&&[^,-]]", " ");
|
||||||
|
s = s.replaceAll("\\d", " ");
|
||||||
|
s = s.replaceAll("\\n", " ");
|
||||||
|
s = s.replaceAll("\\.", " ");
|
||||||
|
s = s.replaceAll("\\s+", " ");
|
||||||
|
|
||||||
|
if (aggressive) {
|
||||||
|
s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", "");
|
||||||
|
// s = s.replaceAll("[\\W&&[^,-]]", "");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (s.contains(",")) {
|
||||||
|
final String[] arr = s.split(",");
|
||||||
|
if (arr.length == 1) {
|
||||||
|
fullname = splitTerms(arr[0]);
|
||||||
|
} else if (arr.length > 1) {
|
||||||
|
surname = splitTerms(arr[0]);
|
||||||
|
name = splitTerms(arr[1]);
|
||||||
|
fullname.addAll(surname);
|
||||||
|
fullname.addAll(name);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
fullname = splitTerms(s);
|
||||||
|
|
||||||
|
int lastInitialPosition = fullname.size();
|
||||||
|
boolean hasSurnameInUpperCase = false;
|
||||||
|
|
||||||
|
for (int i = 0; i < fullname.size(); i++) {
|
||||||
|
final String term = fullname.get(i);
|
||||||
|
if (term.length() == 1) {
|
||||||
|
lastInitialPosition = i;
|
||||||
|
} else if (term.equals(term.toUpperCase())) {
|
||||||
|
hasSurnameInUpperCase = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
|
||||||
|
name = fullname.subList(0, lastInitialPosition + 1);
|
||||||
|
surname = fullname.subList(lastInitialPosition + 1, fullname.size());
|
||||||
|
} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
|
||||||
|
for (final String term : fullname) {
|
||||||
|
if (term.length() > 1 && term.equals(term.toUpperCase())) {
|
||||||
|
surname.add(term);
|
||||||
|
} else {
|
||||||
|
name.add(term);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> splitTerms(final String s) {
|
||||||
|
if (particles == null) {
|
||||||
|
particles = loadFromClasspath("/eu/dnetlib/dhp/migration/pace/name_particles.txt");
|
||||||
|
}
|
||||||
|
|
||||||
|
final List<String> list = Lists.newArrayList();
|
||||||
|
for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
|
||||||
|
if (!particles.contains(part.toLowerCase())) {
|
||||||
|
list.add(part);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return list;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getName() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getNameString() {
|
||||||
|
return Joiner.on(" ").join(getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getSurname() {
|
||||||
|
return surname;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getFullname() {
|
||||||
|
return fullname;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getOriginal() {
|
||||||
|
return original;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String hash() {
|
||||||
|
return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getNormalisedFirstName() {
|
||||||
|
return Joiner.on(" ").join(getCapitalFirstnames());
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getNormalisedSurname() {
|
||||||
|
return Joiner.on(" ").join(getCapitalSurname());
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getSurnameString() {
|
||||||
|
return Joiner.on(" ").join(getSurname());
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getNormalisedFullname() {
|
||||||
|
return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname);
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getCapitalFirstnames() {
|
||||||
|
return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize));
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getCapitalSurname() {
|
||||||
|
return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize));
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<String> getNameWithAbbreviations() {
|
||||||
|
return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations));
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isAccurate() {
|
||||||
|
return name != null && surname != null && !name.isEmpty() && !surname.isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,7 @@
|
||||||
|
van
|
||||||
|
der
|
||||||
|
de
|
||||||
|
dell
|
||||||
|
sig
|
||||||
|
mr
|
||||||
|
mrs
|
Loading…
Reference in New Issue