forked from D-Net/dnet-hadoop
implementation of author dedup configuration and lnfi clustering function
This commit is contained in:
parent
14f6346676
commit
6a6c266dde
|
@ -0,0 +1,77 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ClusteringClass("lnfi")
|
||||
public class LastNameFirstInitial extends AbstractClusteringFunction{
|
||||
|
||||
private boolean DEFAULT_AGGRESSIVE = true;
|
||||
|
||||
public LastNameFirstInitial(final Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(Config conf, List<Field> fields) {
|
||||
return fields.stream().filter(f -> !f.isEmpty())
|
||||
.map(Field::stringValue)
|
||||
.map(this::normalize)
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String normalize(final String s) {
|
||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
|
||||
|
||||
Person p = new Person(s, aggressive);
|
||||
|
||||
if (p.isAccurate()) {
|
||||
String lastName = p.getNormalisedSurname().toLowerCase();
|
||||
String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1);
|
||||
|
||||
res.add(firstInitial.concat(lastName));
|
||||
}
|
||||
else { // is not accurate, meaning it has no defined name and surname
|
||||
List<String> fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
|
||||
if (fullname.size() == 1) {
|
||||
res.add(p.getNormalisedFullname().toLowerCase());
|
||||
}
|
||||
else if (fullname.size() == 2) {
|
||||
res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase());
|
||||
res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase());
|
||||
}
|
||||
else {
|
||||
res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase());
|
||||
res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase());
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
}
|
|
@ -43,7 +43,7 @@ public class Person {
|
|||
// s = s.replaceAll("[\\W&&[^,-]]", "");
|
||||
}
|
||||
|
||||
if (s.contains(",")) {
|
||||
if (s.contains(",")) { //if the name contains a comma it is easy derivable the name and the surname
|
||||
final String[] arr = s.split(",");
|
||||
if (arr.length == 1) {
|
||||
fullname = splitTerms(arr[0]);
|
||||
|
|
|
@ -237,4 +237,13 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLastNameFirstInitial(){
|
||||
|
||||
final ClusteringFunction cf = new LastNameFirstInitial(params);
|
||||
final String s = "LI Yonghong";
|
||||
System.out.println("s = " + s);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
|
||||
}
|
||||
|
||||
}
|
|
@ -30,6 +30,11 @@ public class UtilTest {
|
|||
|
||||
assertEquals("kennedy", p.getSurnameString());
|
||||
assertEquals("j f", p.getNameString());
|
||||
|
||||
p = new Person("Guan-Hua Du", false);
|
||||
|
||||
System.out.println("surname = " + p.getSurnameString());
|
||||
System.out.println("name = " + p.getNameString());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue