Merge pull request 'author name parsing' (#220) from author_name_particles into beta

Reviewed-on: #220
This commit is contained in:
Claudio Atzori 2022-06-27 09:37:27 +02:00
commit cba9c2b7cc
4 changed files with 162 additions and 29 deletions

View File

@ -1,18 +1,18 @@
package eu.dnetlib.dhp.common; package eu.dnetlib.dhp.common;
import java.io.IOException;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.HashSet; import java.util.*;
import java.util.List; import java.util.stream.Collectors;
import java.util.Set;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.text.WordUtils; import org.apache.commons.lang3.text.WordUtils;
import com.ctc.wstx.dtd.LargePrefixedNameSet;
import com.google.common.base.Joiner; import com.google.common.base.Joiner;
import com.google.common.base.Splitter; import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.hash.Hashing; import com.google.common.hash.Hashing;
@ -29,7 +29,19 @@ public class PacePerson {
private List<String> fullname = Lists.newArrayList(); private List<String> fullname = Lists.newArrayList();
private final String original; private final String original;
private static Set<String> particles = null; private static Set<String> particles;
static {
try {
particles = new HashSet<>(IOUtils
.readLines(
PacePerson.class
.getResourceAsStream(
"/eu/dnetlib/dhp/common/name_particles.txt")));
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/** /**
* Capitalizes a string * Capitalizes a string
@ -37,29 +49,20 @@ public class PacePerson {
* @param s the string to capitalize * @param s the string to capitalize
* @return the input string with capital letter * @return the input string with capital letter
*/ */
public static final String capitalize(final String s) { public static String capitalize(final String s) {
if (particles.contains(s)) {
return s;
}
return WordUtils.capitalize(s.toLowerCase(), ' ', '-'); return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
} }
/** /**
* Adds a dot to a string with length equals to 1 * Adds a dot to a string with length equals to 1
*/ */
public static final String dotAbbreviations(final String s) { public static String dotAbbreviations(final String s) {
return s.length() == 1 ? s + "." : s; return s.length() == 1 ? s + "." : s;
} }
public static Set<String> loadFromClasspath(final String classpath) {
final Set<String> h = new HashSet<>();
try {
for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) {
h.add(s);
}
} catch (final Throwable e) {
return new HashSet<>();
}
return h;
}
/** /**
* The constructor of the class. It fills the fields of the class basing on the input fullname. * The constructor of the class. It fills the fields of the class basing on the input fullname.
* *
@ -128,10 +131,6 @@ public class PacePerson {
} }
private List<String> splitTerms(final String s) { private List<String> splitTerms(final String s) {
if (particles == null) {
particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt");
}
final List<String> list = Lists.newArrayList(); final List<String> list = Lists.newArrayList();
for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) { for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
if (!particles.contains(part.toLowerCase())) { if (!particles.contains(part.toLowerCase())) {
@ -187,17 +186,36 @@ public class PacePerson {
} }
public List<String> getCapitalFirstnames() { public List<String> getCapitalFirstnames() {
return Lists return Optional
.newArrayList( .ofNullable(getNameWithAbbreviations())
Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize)); .map(
name -> name
.stream()
.map(PacePerson::capitalize)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
} }
public List<String> getCapitalSurname() { public List<String> getCapitalSurname() {
return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize)); return Optional
.ofNullable(getSurname())
.map(
surname -> surname
.stream()
.map(PacePerson::capitalize)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
} }
public List<String> getNameWithAbbreviations() { public List<String> getNameWithAbbreviations() {
return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations)); return Optional
.ofNullable(getName())
.map(
name -> name
.stream()
.map(PacePerson::dotAbbreviations)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
} }
public boolean isAccurate() { public boolean isAccurate() {

View File

@ -762,6 +762,51 @@ class MappersTest {
assertFalse(p_cleaned.getTitle().isEmpty()); assertFalse(p_cleaned.getTitle().isEmpty());
} }
/**
 * Maps the odf_zenodo.xml fixture and verifies the author name handling:
 * the creator whose name has no comma keeps only the fullname (name and surname stay blank),
 * while the creator whose name contains a comma gets name, surname and fullname populated.
 */
@Test
void testZenodo() throws IOException, DocumentException {
	// decode the fixture explicitly as UTF-8 (its XML declaration says so) rather than
	// relying on the platform default charset
	final String xml = IOUtils
		.toString(
			Objects.requireNonNull(getClass().getResourceAsStream("odf_zenodo.xml")),
			java.nio.charset.StandardCharsets.UTF_8);
	final List<Oaf> list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml);

	System.out.println("***************");
	System.out.println(new ObjectMapper().writeValueAsString(list));
	System.out.println("***************");

	final Publication p = (Publication) list.get(0);
	assertValidId(p.getId());
	assertValidId(p.getCollectedfrom().get(0).getKey());

	assertNotNull(p.getTitle());
	assertFalse(p.getTitle().isEmpty());
	assertEquals(1, p.getTitle().size());
	assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));

	assertNotNull(p.getAuthor());
	assertEquals(2, p.getAuthor().size());

	// creatorName "Anne van Weerden": no comma, so only the fullname is expected
	final Author withoutComma = findAuthorByPid(p, "0000-0003-3272-8007");
	assertTrue(StringUtils.isBlank(withoutComma.getSurname()));
	assertTrue(StringUtils.isBlank(withoutComma.getName()));
	assertEquals("Anne van Weerden", withoutComma.getFullname());

	// creatorName "Anne van, Weerden": the comma allows name and surname to be split
	final Author withComma = findAuthorByPid(p, "0000-0003-3272-8008");
	assertFalse(StringUtils.isBlank(withComma.getSurname()));
	assertFalse(StringUtils.isBlank(withComma.getName()));
	assertFalse(StringUtils.isBlank(withComma.getFullname()));
}

/**
 * Returns the first author of the publication carrying the given pid value.
 *
 * @param p the publication whose authors are searched
 * @param pid the pid value to look for
 * @return the matching author, never null
 * @throws AssertionError when no author carries the given pid
 */
private static Author findAuthorByPid(final Publication p, final String pid) {
	return p
		.getAuthor()
		.stream()
		.filter(a -> a.getPid().stream().anyMatch(pi -> pi.getValue().equals(pid)))
		.findFirst()
		.orElseThrow(() -> new AssertionError("no author found with pid " + pid));
}
@Test @Test
void testOdfFromHdfs() throws IOException, DocumentException { void testOdfFromHdfs() throws IOException, DocumentException {
final String xml = IOUtils final String xml = IOUtils

View File

@ -0,0 +1,69 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:datacite="http://datacite.org/schema/kernel-3"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri">
<header xmlns="http://www.openarchives.org/OAI/2.0/">
<identifier>oai:zenodo.org:3406824</identifier>
<datestamp>2020-01-20T16:45:20Z</datestamp>
<setSpec>openaire</setSpec>
<dr:dateOfTransformation>2022-06-07T10:21:24.06Z</dr:dateOfTransformation>
<dri:objIdentifier>test________::92fe3efa47883b2f3401e6a4bd92e9d7</dri:objIdentifier>
<dri:dateOfCollection>2020-05-21T05:26:15.93Z</dri:dateOfCollection>
<dri:dateOfTransformation>2020-08-01T11:06:26.977Z</dri:dateOfTransformation>
</header>
<metadata>
<resource xmlns="http://datacite.org/schema/kernel-4"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.1/metadata.xsd">
<identifier identifierType="DOI">10.5281/zenodo.3406824</identifier>
<alternateIdentifiers xmlns="http://datacite.org/schema/kernel-3">
<alternateIdentifier alternateIdentifierType="URL">http://dx.doi.org/10.5281/zenodo.3406824</alternateIdentifier>
</alternateIdentifiers>
<creators>
<creator>
<creatorName>Anne van Weerden</creatorName>
<nameIdentifier nameIdentifierScheme="ORCID" schemeURI="http://orcid.org/">0000-0003-3272-8007</nameIdentifier>
<affiliation>Utrecht University Library</affiliation>
</creator>
<creator>
<creatorName>Anne van, Weerden</creatorName>
<nameIdentifier nameIdentifierScheme="ORCID" schemeURI="http://orcid.org/">0000-0003-3272-8008</nameIdentifier>
<affiliation>Utrecht University Library</affiliation>
</creator>
</creators>
<titles>
<title>Helen Bayly and Catherine Disney as influences in the life of Sir William Rowan Hamilton</title>
</titles>
<publisher>Zenodo</publisher>
<publicationYear>2018</publicationYear>
<subjects>
<subject>Sir William Rowan Hamilton, Lady Helena Maria Hamilton Bayly, Catherine Disney, Ireland, history, biography, nineteenth century</subject>
</subjects>
<dates>
<date dateType="Issued">2018-12-28</date>
</dates>
<language>en</language>
<resourceType resourceTypeGeneral="JournalArticle"/>
<relatedIdentifiers>
<relatedIdentifier relatedIdentifierType="DOI" relationType="IsVersionOf">10.5281/zenodo.3406823</relatedIdentifier>
</relatedIdentifiers>
<rightsList>
<rights rightsURI="https://creativecommons.org/licenses/by/4.0/legalcode">Creative Commons Attribution 4.0 International</rights>
<rights rightsURI="info:eu-repo/semantics/openAccess">Open Access</rights>
</rightsList>
<descriptions>
<description descriptionType="Abstract"><p>In the 1880s Robert Graves published a biography about Sir William Rowan Hamilton (1805-1865), to which in a 1980 biography Thomas Hankins added further information. From these biographies a picture emerged of a man who was unhappily married because he had lost the love of his life, which raised the question how such an unhappy man could produce so much beautiful mathematics. In this article it is stated that a main cause for the unhappy picture is that Graves ignored the influence on one another of Hamilton and his wife Helen Bayly, and Hankins that of Hamilton and his first and lost love Catherine Disney. It is then shown that if these influences are taken into account a very different view on Hamilton;s private life arises, in which he was happily married to a wife who enabled him to work as he needed to.</p></description>
</descriptions>
</resource>
<oaf:identifier identifierType="doi">10.5281/zenodo.3406824</oaf:identifier>
<dr:CobjCategory type="publication">0001</dr:CobjCategory>
<oaf:dateAccepted>2018-12-28</oaf:dateAccepted>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:license>https://creativecommons.org/licenses/by/4.0/legalcode</oaf:license>
<oaf:language>eng</oaf:language>
<oaf:hostedBy name="ZENODO" id="opendoar____::2659"/>
<oaf:collectedFrom name="ZENODO" id="opendoar____::2659"/>
</metadata>
</record>