add xslt, personname cleaner
parent
511c0521e5
commit
3b694074ff
@ -0,0 +1,206 @@
|
||||
|
||||
package eu.dnetlib.dhp.transformation.xslt;
|
||||
|
||||
import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI;
|
||||
|
||||
import java.io.Serializable;
|
||||
// import java.nio.charset.Charset;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.Normalizer;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.hash.Hashing;
|
||||
|
||||
import eu.dnetlib.dhp.transformation.xslt.utils.Capitalize;
|
||||
import eu.dnetlib.dhp.transformation.xslt.utils.DotAbbreviations;
|
||||
import net.sf.saxon.s9api.ExtensionFunction;
|
||||
import net.sf.saxon.s9api.ItemType;
|
||||
import net.sf.saxon.s9api.OccurrenceIndicator;
|
||||
import net.sf.saxon.s9api.QName;
|
||||
import net.sf.saxon.s9api.SaxonApiException;
|
||||
import net.sf.saxon.s9api.SequenceType;
|
||||
import net.sf.saxon.s9api.XdmValue;
|
||||
|
||||
//import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
//import eu.dnetlib.pace.util.Capitalise;
|
||||
//import eu.dnetlib.pace.util.DotAbbreviations;
|
||||
|
||||
public class PersonCleaner implements ExtensionFunction, Serializable {
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 1L;
|
||||
private List<String> firstname = Lists.newArrayList();
|
||||
private List<String> surname = Lists.newArrayList();
|
||||
private List<String> fullname = Lists.newArrayList();
|
||||
|
||||
private static Set<String> particles = null;
|
||||
|
||||
public PersonCleaner() {
|
||||
|
||||
}
|
||||
|
||||
public String normalize(String s) {
|
||||
s = Normalizer.normalize(s, Normalizer.Form.NFD); // was NFD
|
||||
s = s.replaceAll("\\(.+\\)", "");
|
||||
s = s.replaceAll("\\[.+\\]", "");
|
||||
s = s.replaceAll("\\{.+\\}", "");
|
||||
s = s.replaceAll("\\s+-\\s+", "-");
|
||||
|
||||
// s = s.replaceAll("[\\W&&[^,-]]", " ");
|
||||
|
||||
// System.out.println("class Person: s: " + s);
|
||||
|
||||
// s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", " ");
|
||||
s = s.replaceAll("[\\p{Punct}&&[^-,]]", " ");
|
||||
s = s.replace("\\d", " ");
|
||||
s = s.replace("\\n", " ");
|
||||
s = s.replace("\\.", " ");
|
||||
s = s.replaceAll("\\s+", " ");
|
||||
|
||||
if (s.contains(",")) {
|
||||
// System.out.println("class Person: s: " + s);
|
||||
|
||||
String[] arr = s.split(",");
|
||||
if (arr.length == 1) {
|
||||
|
||||
fullname = splitTerms(arr[0]);
|
||||
} else if (arr.length > 1) {
|
||||
surname = splitTerms(arr[0]);
|
||||
firstname = splitTermsFirstName(arr[1]);
|
||||
// System.out.println("class Person: surname: " + surname);
|
||||
// System.out.println("class Person: firstname: " + firstname);
|
||||
|
||||
fullname.addAll(surname);
|
||||
fullname.addAll(firstname);
|
||||
}
|
||||
} else {
|
||||
fullname = splitTerms(s);
|
||||
|
||||
int lastInitialPosition = fullname.size();
|
||||
boolean hasSurnameInUpperCase = false;
|
||||
|
||||
for (int i = 0; i < fullname.size(); i++) {
|
||||
String term = fullname.get(i);
|
||||
if (term.length() == 1) {
|
||||
lastInitialPosition = i;
|
||||
} else if (term.equals(term.toUpperCase())) {
|
||||
hasSurnameInUpperCase = true;
|
||||
}
|
||||
}
|
||||
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
|
||||
firstname = fullname.subList(0, lastInitialPosition + 1);
|
||||
System.out.println("name: " + firstname);
|
||||
surname = fullname.subList(lastInitialPosition + 1, fullname.size());
|
||||
} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
|
||||
for (String term : fullname) {
|
||||
if (term.length() > 1 && term.equals(term.toUpperCase())) {
|
||||
surname.add(term);
|
||||
} else {
|
||||
firstname.add(term);
|
||||
}
|
||||
}
|
||||
} else if (lastInitialPosition == fullname.size()) {
|
||||
surname = fullname.subList(lastInitialPosition - 1, fullname.size());
|
||||
firstname = fullname.subList(0, lastInitialPosition - 1);
|
||||
}
|
||||
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private List<String> splitTermsFirstName(String s) {
|
||||
List<String> list = Lists.newArrayList();
|
||||
for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
|
||||
if (s.trim().matches("\\p{Lu}{2,3}")) {
|
||||
String[] parts = s.trim().split("(?=\\p{Lu})"); // (Unicode UpperCase)
|
||||
for (String p : parts) {
|
||||
if (p.length() > 0)
|
||||
list.add(p);
|
||||
}
|
||||
} else {
|
||||
list.add(part);
|
||||
}
|
||||
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
private List<String> splitTerms(String s) {
|
||||
if (particles == null) {
|
||||
// particles = NGramUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
|
||||
}
|
||||
|
||||
List<String> list = Lists.newArrayList();
|
||||
for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
|
||||
// if (!particles.contains(part.toLowerCase())) {
|
||||
list.add(part);
|
||||
|
||||
// }
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
public List<String> getFirstname() {
|
||||
return firstname;
|
||||
}
|
||||
|
||||
public List<String> getSurname() {
|
||||
return surname;
|
||||
}
|
||||
|
||||
public List<String> getFullname() {
|
||||
return fullname;
|
||||
}
|
||||
|
||||
public String hash() {
|
||||
return Hashing.murmur3_128().hashString(getNormalisedFullname(), StandardCharsets.UTF_8).toString();
|
||||
}
|
||||
|
||||
public String getNormalisedFullname() {
|
||||
return isAccurate() ? Joiner.on(" ").join(getSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations())
|
||||
: Joiner.on(" ").join(fullname);
|
||||
// return isAccurate() ?
|
||||
// Joiner.on(" ").join(getCapitalSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations()) :
|
||||
// Joiner.on(" ").join(fullname);
|
||||
}
|
||||
|
||||
public List<String> getCapitalSurname() {
|
||||
return Lists.newArrayList(Iterables.transform(surname, new Capitalize()));
|
||||
}
|
||||
|
||||
public List<String> getNameWithAbbreviations() {
|
||||
return Lists.newArrayList(Iterables.transform(firstname, new DotAbbreviations()));
|
||||
}
|
||||
|
||||
public boolean isAccurate() {
|
||||
return (firstname != null && surname != null && !firstname.isEmpty() && !surname.isEmpty());
|
||||
}
|
||||
|
||||
@Override
|
||||
public QName getName() {
|
||||
return new QName(QNAME_BASE_URI + "/person", "person");
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType getResultType() {
|
||||
return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SequenceType[] getArgumentTypes() {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public XdmValue call(XdmValue[] arguments) throws SaxonApiException {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
}
|
@ -0,0 +1,14 @@
|
||||
|
||||
package eu.dnetlib.dhp.transformation.xslt.utils;
|
||||
|
||||
// import org.apache.commons.text.WordUtils;
|
||||
// import org.apache.commons.text.WordUtils;
|
||||
import com.google.common.base.Function;
|
||||
|
||||
public class Capitalize implements Function<String, String> {
|
||||
|
||||
@Override
|
||||
public String apply(String s) {
|
||||
return org.apache.commons.lang3.text.WordUtils.capitalize(s.toLowerCase());
|
||||
}
|
||||
}
|
@ -0,0 +1,12 @@
|
||||
|
||||
package eu.dnetlib.dhp.transformation.xslt.utils;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
|
||||
public class DotAbbreviations implements Function<String, String> {
|
||||
|
||||
@Override
|
||||
public String apply(String s) {
|
||||
return s.length() == 1 ? s + "." : s;
|
||||
}
|
||||
}
|
@ -0,0 +1,60 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<oai:record xmlns="http://namespace.openaire.eu/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
|
||||
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<oai:header>
|
||||
<dri:objIdentifier>_____OmicsDI::0000337c02d1b51030675d69407655da</dri:objIdentifier>
|
||||
<dri:recordIdentifier>PRJNA78295</dri:recordIdentifier>
|
||||
<dri:dateOfCollection>2020-10-31T15:31:30.725Z</dri:dateOfCollection>
|
||||
<oaf:datasourceprefix>_____OmicsDI</oaf:datasourceprefix>
|
||||
</oai:header>
|
||||
<oai:metadata>
|
||||
<datasets xmlns="">
|
||||
<connectionsCountScaled>0.235294117647059</connectionsCountScaled>
|
||||
<reanalysisCount>0</reanalysisCount>
|
||||
<keywords>null</keywords>
|
||||
<citationsCountScaled>0.0</citationsCountScaled>
|
||||
<viewsCount>0</viewsCount>
|
||||
<description>Sedimentitalea nanhaiensis DSM 24252 Genome sequencing and assembly</description>
|
||||
<downloadCountScaled>8.20101314054644E-5</downloadCountScaled>
|
||||
<source>omics_ena_project</source>
|
||||
<title>Sedimentitalea nanhaiensis DSM 24252</title>
|
||||
<connectionsCount>14</connectionsCount>
|
||||
<citationsCount>0</citationsCount>
|
||||
<score>null</score>
|
||||
<omicsType>Genomics</omicsType>
|
||||
<reanalysisCountScaled>0.0</reanalysisCountScaled>
|
||||
<organisms>
|
||||
<acc>571166</acc>
|
||||
<name>Sedimentitalea nanhaiensis DSM 24252</name>
|
||||
</organisms>
|
||||
<viewsCountScaled>0.0</viewsCountScaled>
|
||||
<claimable>false</claimable>
|
||||
<id>PRJNA78295</id>
|
||||
<publicationDate>null</publicationDate>
|
||||
<downloadCount>13</downloadCount>
|
||||
</datasets>
|
||||
</oai:metadata>
|
||||
<about xmlns="">
|
||||
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||
<originDescription altered="true" harvestDate="2020-10-31T15:31:30.725Z">
|
||||
<baseURL>https%3A%2F%2Fwww.omicsdi.org%2Fws%2Fdataset%2Fsearch</baseURL>
|
||||
<identifier/>
|
||||
<datestamp/>
|
||||
<metadataNamespace/>
|
||||
</originDescription>
|
||||
</provenance>
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.9</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction classid="sysimport:crosswalk:datasetarchive"
|
||||
classname="sysimport:crosswalk:datasetarchive"
|
||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
</about>
|
||||
</oai:record>
|
Loading…
Reference in New Issue