2023-07-06 10:28:53 +02:00
|
|
|
|
2018-10-02 10:37:54 +02:00
|
|
|
package eu.dnetlib.pace.clustering;
|
|
|
|
|
2023-07-06 10:28:53 +02:00
|
|
|
import java.util.Collection;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.Map;
|
|
|
|
import java.util.Set;
|
|
|
|
|
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
|
|
2018-12-12 16:30:03 +01:00
|
|
|
import com.google.common.collect.Sets;
|
2023-07-06 10:28:53 +02:00
|
|
|
|
2018-12-12 16:30:03 +01:00
|
|
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
2019-10-08 14:53:52 +02:00
|
|
|
import eu.dnetlib.pace.config.Config;
|
2018-12-12 16:30:03 +01:00
|
|
|
import eu.dnetlib.pace.model.Field;
|
|
|
|
import eu.dnetlib.pace.model.Person;
|
2018-10-02 10:37:54 +02:00
|
|
|
|
2018-12-12 16:30:03 +01:00
|
|
|
@ClusteringClass("personClustering")
|
2018-10-02 10:37:54 +02:00
|
|
|
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
|
|
|
|
|
|
|
private Map<String, Integer> params;
|
|
|
|
|
|
|
|
private static final int MAX_TOKENS = 5;
|
|
|
|
|
|
|
|
public PersonClustering(final Map<String, Integer> params) {
|
|
|
|
this.params = params;
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
2019-10-08 14:53:52 +02:00
|
|
|
public Collection<String> apply(final Config conf, final List<Field> fields) {
|
2018-10-02 10:37:54 +02:00
|
|
|
final Set<String> hashes = Sets.newHashSet();
|
|
|
|
|
|
|
|
for (final Field f : fields) {
|
|
|
|
|
2018-12-12 16:30:03 +01:00
|
|
|
final Person person = new Person(f.stringValue(), false);
|
2018-10-02 10:37:54 +02:00
|
|
|
|
2023-07-06 10:28:53 +02:00
|
|
|
if (StringUtils.isNotBlank(person.getNormalisedFirstName())
|
|
|
|
&& StringUtils.isNotBlank(person.getNormalisedSurname())) {
|
2018-12-12 16:30:03 +01:00
|
|
|
hashes.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase());
|
2018-10-02 10:37:54 +02:00
|
|
|
} else {
|
2018-12-12 16:30:03 +01:00
|
|
|
for (final String token1 : tokens(f.stringValue(), MAX_TOKENS)) {
|
|
|
|
for (final String token2 : tokens(f.stringValue(), MAX_TOKENS)) {
|
2018-10-02 10:37:54 +02:00
|
|
|
if (!token1.equals(token2)) {
|
|
|
|
hashes.add(firstLC(token1) + token2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return hashes;
|
|
|
|
}
|
|
|
|
|
2018-12-12 16:30:03 +01:00
|
|
|
// @Override
|
|
|
|
// public Collection<String> apply(final List<Field> fields) {
|
|
|
|
// final Set<String> hashes = Sets.newHashSet();
|
|
|
|
//
|
|
|
|
// for (final Field f : fields) {
|
|
|
|
//
|
|
|
|
// final GTAuthor gta = GTAuthor.fromOafJson(f.stringValue());
|
|
|
|
//
|
|
|
|
// final Author a = gta.getAuthor();
|
|
|
|
//
|
|
|
|
// if (StringUtils.isNotBlank(a.getFirstname()) && StringUtils.isNotBlank(a.getSecondnames())) {
|
|
|
|
// hashes.add(firstLC(a.getFirstname()) + a.getSecondnames().toLowerCase());
|
|
|
|
// } else {
|
|
|
|
// for (final String token1 : tokens(f.stringValue(), MAX_TOKENS)) {
|
|
|
|
// for (final String token2 : tokens(f.stringValue(), MAX_TOKENS)) {
|
|
|
|
// if (!token1.equals(token2)) {
|
|
|
|
// hashes.add(firstLC(token1) + token2);
|
|
|
|
// }
|
|
|
|
// }
|
|
|
|
// }
|
|
|
|
// }
|
|
|
|
// }
|
|
|
|
//
|
|
|
|
// return hashes;
|
|
|
|
// }
|
2018-10-02 10:37:54 +02:00
|
|
|
|
|
|
|
@Override
|
|
|
|
public Map<String, Integer> getParams() {
|
|
|
|
return params;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|