From b295a40d9cd88ad8c88c4a387eb10baa58502f64 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 16 Jun 2022 12:20:43 +0200 Subject: [PATCH 1/2] restored use of name_particles when parsing author names --- .../eu/dnetlib/dhp/common/PacePerson.java | 74 ++++++++++++------- .../eu/dnetlib/dhp/common}/name_particles.txt | 2 +- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 45 +++++++++++ .../dnetlib/dhp/oa/graph/raw/odf_zenodo.xml | 69 +++++++++++++++++ 4 files changed, 161 insertions(+), 29 deletions(-) rename {dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/pace => dhp-common/src/main/resources/eu/dnetlib/dhp/common}/name_particles.txt (85%) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_zenodo.xml diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java index 91c6c1825..fac9a7565 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java @@ -1,18 +1,18 @@ package eu.dnetlib.dhp.common; +import java.io.IOException; import java.nio.charset.StandardCharsets; import java.text.Normalizer; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import java.util.*; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.text.WordUtils; +import com.ctc.wstx.dtd.LargePrefixedNameSet; import com.google.common.base.Joiner; import com.google.common.base.Splitter; -import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.hash.Hashing; @@ -29,7 +29,19 @@ public class PacePerson { private List fullname = Lists.newArrayList(); private final String original; - private static Set particles = null; + private static Set particles; + + static { + try { + particles = new HashSet<>(IOUtils + .readLines( + 
PacePerson.class + .getResourceAsStream( + "/eu/dnetlib/dhp/common/name_particles.txt"))); + } catch (IOException e) { + throw new RuntimeException(e); + } + } /** * Capitalizes a string @@ -37,29 +49,20 @@ public class PacePerson { * @param s the string to capitalize * @return the input string with capital letter */ - public static final String capitalize(final String s) { + public static String capitalize(final String s) { + if (particles.contains(s)) { + return s; + } return WordUtils.capitalize(s.toLowerCase(), ' ', '-'); } /** * Adds a dot to a string with length equals to 1 */ - public static final String dotAbbreviations(final String s) { + public static String dotAbbreviations(final String s) { return s.length() == 1 ? s + "." : s; } - public static Set loadFromClasspath(final String classpath) { - final Set h = new HashSet<>(); - try { - for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) { - h.add(s); - } - } catch (final Throwable e) { - return new HashSet<>(); - } - return h; - } - /** * The constructor of the class. It fills the fields of the class basing on the input fullname. 
* @@ -128,10 +131,6 @@ public class PacePerson { } private List splitTerms(final String s) { - if (particles == null) { - particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt"); - } - final List list = Lists.newArrayList(); for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) { if (!particles.contains(part.toLowerCase())) { @@ -187,17 +186,36 @@ public class PacePerson { } public List getCapitalFirstnames() { - return Lists - .newArrayList( - Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize)); + return Optional + .ofNullable(getNameWithAbbreviations()) + .map( + name -> name + .stream() + .map(PacePerson::capitalize) + .collect(Collectors.toList())) + .orElse(new ArrayList<>()); } public List getCapitalSurname() { - return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize)); + return Optional + .ofNullable(getSurname()) + .map( + surname -> surname + .stream() + .map(PacePerson::capitalize) + .collect(Collectors.toList())) + .orElse(new ArrayList<>()); } public List getNameWithAbbreviations() { - return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations)); + return Optional + .ofNullable(getName()) + .map( + name -> name + .stream() + .map(PacePerson::dotAbbreviations) + .collect(Collectors.toList())) + .orElse(new ArrayList<>()); } public boolean isAccurate() { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt b/dhp-common/src/main/resources/eu/dnetlib/dhp/common/name_particles.txt similarity index 85% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt rename to dhp-common/src/main/resources/eu/dnetlib/dhp/common/name_particles.txt index dae37c9dc..d21610522 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt +++ 
b/dhp-common/src/main/resources/eu/dnetlib/dhp/common/name_particles.txt @@ -4,4 +4,4 @@ de dell sig mr -mrs +mrs \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index dd69bae85..f5cb86bfd 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -762,6 +762,51 @@ class MappersTest { assertFalse(p_cleaned.getTitle().isEmpty()); } + @Test + void testZenodo() throws IOException, DocumentException { + final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_zenodo.xml"))); + final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); + + System.out.println("***************"); + System.out.println(new ObjectMapper().writeValueAsString(list)); + System.out.println("***************"); + + final Publication p = (Publication) list.get(0); + assertValidId(p.getId()); + assertValidId(p.getCollectedfrom().get(0).getKey()); + + assertNotNull(p.getTitle()); + assertFalse(p.getTitle().isEmpty()); + assertEquals(1, p.getTitle().size()); + assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + + assertNotNull(p.getAuthor()); + assertEquals(2, p.getAuthor().size()); + + Author author = p + .getAuthor() + .stream() + .filter(a -> a.getPid().stream().anyMatch(pi -> pi.getValue().equals("0000-0003-3272-8007"))) + .findFirst() + .get(); + assertNotNull(author); + assertTrue(StringUtils.isBlank(author.getSurname())); + assertTrue(StringUtils.isBlank(author.getName())); + assertEquals("Anne van Weerden", author.getFullname()); + + author = p + .getAuthor() + .stream() + .filter(a -> a.getPid().stream().anyMatch(pi -> pi.getValue().equals("0000-0003-3272-8008"))) + .findFirst() + .get(); + 
assertNotNull(author); + assertFalse(StringUtils.isBlank(author.getSurname())); + assertFalse(StringUtils.isBlank(author.getName())); + assertFalse(StringUtils.isBlank(author.getFullname())); + + } + @Test void testOdfFromHdfs() throws IOException, DocumentException { final String xml = IOUtils diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_zenodo.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_zenodo.xml new file mode 100644 index 000000000..0fc568e56 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_zenodo.xml @@ -0,0 +1,69 @@ + +> +
+ oai:zenodo.org:3406824 + 2020-01-20T16:45:20Z + openaire + 2022-06-07T10:21:24.06Z + test________::92fe3efa47883b2f3401e6a4bd92e9d7 + 2020-05-21T05:26:15.93Z + 2020-08-01T11:06:26.977Z +
+ + + 10.5281/zenodo.3406824 + + http://dx.doi.org/10.5281/zenodo.3406824 + + + + Anne van Weerden + 0000-0003-3272-8007 + Utrecht University Library + + + Anne van, Weerden + 0000-0003-3272-8008 + Utrecht University Library + + + + Helen Bayly and Catherine Disney as influences in the life of Sir William Rowan Hamilton + + Zenodo + 2018 + + Sir William Rowan Hamilton, Lady Helena Maria Hamilton Bayly, Catherine Disney, Ireland, history, biography, nineteenth century + + + 2018-12-28 + + en + + + 10.5281/zenodo.3406823 + + + Creative Commons Attribution 4.0 International + Open Access + + +

In the 1880s Robert Graves published a biography about Sir William Rowan Hamilton (1805-1865), to which in a 1980 biography Thomas Hankins added further information. From these biographies a picture emerged of a man who was unhappily married because he had lost the love of his life, which raised the question how such an unhappy man could produce so much beautiful mathematics. In this article it is stated that a main cause for the unhappy picture is that Graves ignored the influence on one another of Hamilton and his wife Helen Bayly, and Hankins that of Hamilton and his first and lost love Catherine Disney. It is then shown that if these influences are taken into account a very different view on Hamilton's private life arises, in which he was happily married to a wife who enabled him to work as he needed to.

+
+
+ 10.5281/zenodo.3406824 + 0001 + 2018-12-28 + OPEN + https://creativecommons.org/licenses/by/4.0/legalcode + eng + + +
+
\ No newline at end of file From 316b0fd73c9b71cbde067b1d0d4a62b64da2d76e Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 27 Jun 2022 09:36:51 +0200 Subject: [PATCH 2/2] added 'von' to the name particles file --- .../src/main/resources/eu/dnetlib/dhp/common/name_particles.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/dhp-common/src/main/resources/eu/dnetlib/dhp/common/name_particles.txt b/dhp-common/src/main/resources/eu/dnetlib/dhp/common/name_particles.txt index d21610522..07cf06a98 100644 --- a/dhp-common/src/main/resources/eu/dnetlib/dhp/common/name_particles.txt +++ b/dhp-common/src/main/resources/eu/dnetlib/dhp/common/name_particles.txt @@ -1,4 +1,5 @@ van +von der de dell