forked from D-Net/dnet-hadoop
Implement the author dedup configuration and the "lnfi" (last name, first initial) clustering function
This commit is contained in:
parent
14f6346676
commit
6a6c266dde
|
@ -0,0 +1,77 @@
|
||||||
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
import eu.dnetlib.pace.model.Person;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
@ClusteringClass("lnfi")
|
||||||
|
public class LastNameFirstInitial extends AbstractClusteringFunction{
|
||||||
|
|
||||||
|
private boolean DEFAULT_AGGRESSIVE = true;
|
||||||
|
|
||||||
|
public LastNameFirstInitial(final Map<String, Integer> params) {
|
||||||
|
super(params);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Collection<String> apply(Config conf, List<Field> fields) {
|
||||||
|
return fields.stream().filter(f -> !f.isEmpty())
|
||||||
|
.map(Field::stringValue)
|
||||||
|
.map(this::normalize)
|
||||||
|
.map(s -> doApply(conf, s))
|
||||||
|
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||||
|
.flatMap(c -> c.stream())
|
||||||
|
.filter(StringUtils::isNotBlank)
|
||||||
|
.collect(Collectors.toCollection(HashSet::new));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected String normalize(final String s) {
|
||||||
|
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||||
|
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
|
||||||
|
.replaceAll("[^ \\w]+", "")
|
||||||
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||||
|
.replaceAll("(\\p{Punct})+", " ")
|
||||||
|
.replaceAll("(\\d)+", " ")
|
||||||
|
.replaceAll("(\\n)+", " ")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Collection<String> doApply(final Config conf, final String s) {
|
||||||
|
|
||||||
|
final List<String> res = Lists.newArrayList();
|
||||||
|
|
||||||
|
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
|
||||||
|
|
||||||
|
Person p = new Person(s, aggressive);
|
||||||
|
|
||||||
|
if (p.isAccurate()) {
|
||||||
|
String lastName = p.getNormalisedSurname().toLowerCase();
|
||||||
|
String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1);
|
||||||
|
|
||||||
|
res.add(firstInitial.concat(lastName));
|
||||||
|
}
|
||||||
|
else { // is not accurate, meaning it has no defined name and surname
|
||||||
|
List<String> fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
|
||||||
|
if (fullname.size() == 1) {
|
||||||
|
res.add(p.getNormalisedFullname().toLowerCase());
|
||||||
|
}
|
||||||
|
else if (fullname.size() == 2) {
|
||||||
|
res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase());
|
||||||
|
res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase());
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase());
|
||||||
|
res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
}
|
|
@ -43,7 +43,7 @@ public class Person {
|
||||||
// s = s.replaceAll("[\\W&&[^,-]]", "");
|
// s = s.replaceAll("[\\W&&[^,-]]", "");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (s.contains(",")) {
|
if (s.contains(",")) { //if the name contains a comma it is easy derivable the name and the surname
|
||||||
final String[] arr = s.split(",");
|
final String[] arr = s.split(",");
|
||||||
if (arr.length == 1) {
|
if (arr.length == 1) {
|
||||||
fullname = splitTerms(arr[0]);
|
fullname = splitTerms(arr[0]);
|
||||||
|
|
|
@ -237,4 +237,13 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLastNameFirstInitial(){
|
||||||
|
|
||||||
|
final ClusteringFunction cf = new LastNameFirstInitial(params);
|
||||||
|
final String s = "LI Yonghong";
|
||||||
|
System.out.println("s = " + s);
|
||||||
|
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
|
@ -30,6 +30,11 @@ public class UtilTest {
|
||||||
|
|
||||||
assertEquals("kennedy", p.getSurnameString());
|
assertEquals("kennedy", p.getSurnameString());
|
||||||
assertEquals("j f", p.getNameString());
|
assertEquals("j f", p.getNameString());
|
||||||
|
|
||||||
|
p = new Person("Guan-Hua Du", false);
|
||||||
|
|
||||||
|
System.out.println("surname = " + p.getSurnameString());
|
||||||
|
System.out.println("name = " + p.getNameString());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue