enrichment steps #38
|
@ -16,6 +16,13 @@ import com.google.common.collect.Iterables;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.hash.Hashing;
|
import com.google.common.hash.Hashing;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* PacePerson tries to derive information from the fullname string of an author.
|
||||||
|
* Such information includes the names, the surnames and the fullname split into terms. It also provides an additional field for
|
||||||
|
* the original data.
|
||||||
|
* Deriving the names and the surnames is not always possible. When it is impossible to determine which terms are the
|
||||||
|
* names and the surnames, the lists are empty.
|
||||||
|
* */
|
||||||
public class PacePerson {
|
public class PacePerson {
|
||||||
|
|
||||||
private static final String UTF8 = "UTF-8";
|
private static final String UTF8 = "UTF-8";
|
||||||
|
@ -26,10 +33,19 @@ public class PacePerson {
|
||||||
|
|
||||||
private static Set<String> particles = null;
|
private static Set<String> particles = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Capitalizes a string
|
||||||
|
*
|
||||||
|
* @param s the string to capitalize
|
||||||
|
* @return the lowercased input string with the first letter of each space- or hyphen-delimited word capitalized
|
||||||
|
* */
|
||||||
public static final String capitalize(final String s) {
|
public static final String capitalize(final String s) {
|
||||||
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
|
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Appends a dot to a string whose length is exactly 1; any other string is returned unchanged
|
||||||
|
* */
|
||||||
public static final String dotAbbreviations(final String s) {
|
public static final String dotAbbreviations(final String s) {
|
||||||
return s.length() == 1 ? s + "." : s;
|
return s.length() == 1 ? s + "." : s;
|
||||||
}
|
}
|
||||||
|
@ -46,6 +62,12 @@ public class PacePerson {
|
||||||
return h;
|
return h;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The constructor of the class. It fills the fields of the class based on the input fullname.
|
||||||
|
*
|
||||||
|
* @param s the input string (fullname of the author)
|
||||||
|
* @param aggressive set the string normalization type
|
||||||
|
* */
|
||||||
public PacePerson(String s, final boolean aggressive) {
|
public PacePerson(String s, final boolean aggressive) {
|
||||||
original = s;
|
original = s;
|
||||||
s = Normalizer.normalize(s, Normalizer.Form.NFD);
|
s = Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||||
|
@ -64,6 +86,7 @@ public class PacePerson {
|
||||||
// s = s.replaceAll("[\\W&&[^,-]]", "");
|
// s = s.replaceAll("[\\W&&[^,-]]", "");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//if the string contains a comma, it can derive surname and name by splitting on it
|
||||||
if (s.contains(",")) {
|
if (s.contains(",")) {
|
||||||
final String[] arr = s.split(",");
|
final String[] arr = s.split(",");
|
||||||
if (arr.length == 1) {
|
if (arr.length == 1) {
|
||||||
|
@ -74,21 +97,23 @@ public class PacePerson {
|
||||||
fullname.addAll(surname);
|
fullname.addAll(surname);
|
||||||
fullname.addAll(name);
|
fullname.addAll(name);
|
||||||
}
|
}
|
||||||
} else {
|
} else { //otherwise, it should rely on CAPS terms and short terms
|
||||||
fullname = splitTerms(s);
|
fullname = splitTerms(s);
|
||||||
|
|
||||||
int lastInitialPosition = fullname.size();
|
int lastInitialPosition = fullname.size();
|
||||||
boolean hasSurnameInUpperCase = false;
|
boolean hasSurnameInUpperCase = false;
|
||||||
|
|
||||||
|
//computes lastInitialPosition and hasSurnameInUpperCase
|
||||||
for (int i = 0; i < fullname.size(); i++) {
|
for (int i = 0; i < fullname.size(); i++) {
|
||||||
final String term = fullname.get(i);
|
final String term = fullname.get(i);
|
||||||
if (term.length() == 1) {
|
if (term.length() == 1) {
|
||||||
lastInitialPosition = i;
|
lastInitialPosition = i; //index of the latest single-character term (an initial) seen so far
|
||||||
} else if (term.equals(term.toUpperCase())) {
|
} else if (term.equals(term.toUpperCase())) {
|
||||||
hasSurnameInUpperCase = true;
|
hasSurnameInUpperCase = true; //if one of the words is CAPS
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//manages particular cases of fullnames
|
||||||
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
|
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
|
||||||
name = fullname.subList(0, lastInitialPosition + 1);
|
name = fullname.subList(0, lastInitialPosition + 1);
|
||||||
surname = fullname.subList(lastInitialPosition + 1, fullname.size());
|
surname = fullname.subList(lastInitialPosition + 1, fullname.size());
|
||||||
|
|
|
@ -0,0 +1,26 @@
|
||||||
|
package eu.dnetlib.dhp.common;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
public class PacePersonTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void pacePersonTest1(){
|
||||||
|
|
||||||
|
PacePerson p = new PacePerson("Artini, Michele", false);
|
||||||
|
assertEquals("Artini",p.getSurnameString());
|
||||||
|
assertEquals("Michele", p.getNameString());
|
||||||
|
assertEquals("Artini, Michele", p.getNormalisedFullname());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void pacePersonTest2(){
|
||||||
|
PacePerson p = new PacePerson("Michele G. Artini", false);
|
||||||
|
assertEquals("Artini, Michele G.", p.getNormalisedFullname());
|
||||||
|
assertEquals("Michele G", p.getNameString());
|
||||||
|
assertEquals("Artini", p.getSurnameString());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in New Issue