forked from D-Net/dnet-hadoop
Import dnet-pace-core module from dnet-dedup repository
This commit is contained in:
commit
3b35db5fbd
|
@ -0,0 +1,77 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-dedup</artifactId>
|
||||
<version>4.1.13-SNAPSHOT</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
<artifactId>dnet-pace-core</artifactId>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>edu.cmu</groupId>
|
||||
<artifactId>secondstring</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.guava</groupId>
|
||||
<artifactId>guava</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.code.gson</groupId>
|
||||
<artifactId>gson</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-lang3</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.antlr</groupId>
|
||||
<artifactId>stringtemplate</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-logging</groupId>
|
||||
<artifactId>commons-logging</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.reflections</groupId>
|
||||
<artifactId>reflections</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.fasterxml.jackson.core</groupId>
|
||||
<artifactId>jackson-databind</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.commons</groupId>
|
||||
<artifactId>commons-math3</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.ibm.icu</groupId>
|
||||
<artifactId>icu4j</artifactId>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
</project>
|
|
@ -0,0 +1,44 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
|
||||
|
||||
protected Map<String, Integer> params;
|
||||
|
||||
public AbstractClusteringFunction(final Map<String, Integer> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
protected abstract Collection<String> doApply(Config conf, String s);
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(Config conf, List<Field> fields) {
|
||||
return fields.stream().filter(f -> !f.isEmpty())
|
||||
.map(Field::stringValue)
|
||||
.map(this::normalize)
|
||||
.map(s -> filterAllStopWords(s))
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
public Map<String, Integer> getParams() {
|
||||
return params;
|
||||
}
|
||||
|
||||
protected Integer param(String name) {
|
||||
return params.get(name);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("acronyms")
|
||||
public class Acronyms extends AbstractClusteringFunction {
|
||||
|
||||
public Acronyms(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
|
||||
}
|
||||
|
||||
private Set<String> extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) {
|
||||
|
||||
final Set<String> acronyms = Sets.newLinkedHashSet();
|
||||
|
||||
for (int i = 0; i < maxAcronyms; i++) {
|
||||
|
||||
final StringTokenizer st = new StringTokenizer(s);
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
while (st.hasMoreTokens()) {
|
||||
final String token = st.nextToken();
|
||||
if (sb.length() > maxLen) {
|
||||
break;
|
||||
}
|
||||
if (token.length() > 1 && i < token.length()) {
|
||||
sb.append(token.charAt(i));
|
||||
}
|
||||
}
|
||||
String acronym = sb.toString();
|
||||
if (acronym.length() > minLen) {
|
||||
acronyms.add(acronym);
|
||||
}
|
||||
}
|
||||
return acronyms;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Document;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldListImpl;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
|
||||
|
||||
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {
|
||||
Document filtered = filter(a, conf.blacklists());
|
||||
return combine(filtered, conf);
|
||||
}
|
||||
|
||||
private static MapDocument filter(final MapDocument a, final Map<String, List<Pattern>> blacklists) {
|
||||
if (blacklists == null || blacklists.isEmpty()) {
|
||||
return a;
|
||||
}
|
||||
|
||||
final Map<String, Field> filtered = Maps.newHashMap(a.getFieldMap());
|
||||
|
||||
for (final Entry<String, List<Pattern>> e : blacklists.entrySet()) {
|
||||
Field fields = a.getFieldMap().get(e.getKey());
|
||||
if (fields != null) {
|
||||
final FieldListImpl fl = new FieldListImpl();
|
||||
|
||||
for (Field f : fields) {
|
||||
if (!isBlackListed(f.stringValue(), e.getValue())) {
|
||||
fl.add(f);
|
||||
}
|
||||
}
|
||||
|
||||
filtered.put(e.getKey(), fl);
|
||||
}
|
||||
}
|
||||
|
||||
return new MapDocument(a.getIdentifier(), filtered);
|
||||
}
|
||||
|
||||
private static boolean isBlackListed(String value, List<Pattern> blacklist) {
|
||||
for (Pattern pattern : blacklist) {
|
||||
if (pattern.matcher(value).matches()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,13 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
@Retention(RetentionPolicy.RUNTIME)
|
||||
@Target(ElementType.TYPE)
|
||||
public @interface ClusteringClass {
|
||||
|
||||
public String value();
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.Document;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldValueImpl;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
public class ClusteringCombiner {
|
||||
|
||||
private static String SEPARATOR = ":";
|
||||
private static String COLLAPSE_ON= "collapseOn";
|
||||
|
||||
public static Collection<String> combine(final Document a, final Config conf) {
|
||||
final Collection<String> res = Sets.newLinkedHashSet();
|
||||
for (final ClusteringDef cd : conf.clusterings()) {
|
||||
for (final String fieldName : cd.getFields()) {
|
||||
String prefix = getPrefix(cd, fieldName);
|
||||
|
||||
Field values = a.values(fieldName);
|
||||
List<Field> fields = new ArrayList<>();
|
||||
|
||||
if (values instanceof FieldValueImpl) {
|
||||
fields.add(values);
|
||||
}
|
||||
else {
|
||||
fields.addAll((List<Field>) values);
|
||||
}
|
||||
|
||||
res.addAll(
|
||||
cd.clusteringFunction()
|
||||
.apply(conf, fields)
|
||||
.stream()
|
||||
.map(k -> prefix + SEPARATOR +k)
|
||||
.collect(Collectors.toList())
|
||||
);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
private static String getPrefix(ClusteringDef cd, String fieldName) {
|
||||
return cd.getName()+ SEPARATOR +
|
||||
cd.getParams().keySet()
|
||||
.stream()
|
||||
.filter(k -> k.contains(COLLAPSE_ON))
|
||||
.findFirst()
|
||||
.map(k -> StringUtils.substringAfter(k, SEPARATOR))
|
||||
.orElse(fieldName);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
public interface ClusteringFunction {
|
||||
|
||||
public Collection<String> apply(Config config, List<Field> fields);
|
||||
|
||||
public Map<String, Integer> getParams();
|
||||
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("immutablefieldvalue")
|
||||
public class ImmutableFieldValue extends AbstractClusteringFunction {
|
||||
|
||||
public ImmutableFieldValue(final Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
res.add(s);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ClusteringClass("keywordsclustering")
|
||||
public class KeywordsClustering extends AbstractClusteringFunction {
|
||||
|
||||
public KeywordsClustering(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, String s) {
|
||||
|
||||
//takes city codes and keywords codes without duplicates
|
||||
Set<String> keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
|
||||
Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4));
|
||||
|
||||
//list of combination to return as result
|
||||
final Collection<String> combinations = new LinkedHashSet<String>();
|
||||
|
||||
for (String keyword: keywordsToCodes(keywords, conf.translationMap())){
|
||||
for (String city: citiesToCodes(cities)) {
|
||||
combinations.add(keyword+"-"+city);
|
||||
if (combinations.size()>=params.getOrDefault("max", 2)) {
|
||||
return combinations;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return combinations;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(final Config conf, List<Field> fields) {
|
||||
return fields.stream().filter(f -> !f.isEmpty())
|
||||
.map(Field::stringValue)
|
||||
.map(this::cleanup)
|
||||
.map(this::normalize)
|
||||
.map(s -> filterAllStopWords(s))
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,77 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ClusteringClass("lnfi")
|
||||
public class LastNameFirstInitial extends AbstractClusteringFunction{
|
||||
|
||||
private boolean DEFAULT_AGGRESSIVE = true;
|
||||
|
||||
public LastNameFirstInitial(final Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(Config conf, List<Field> fields) {
|
||||
return fields.stream().filter(f -> !f.isEmpty())
|
||||
.map(Field::stringValue)
|
||||
.map(this::normalize)
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String normalize(final String s) {
|
||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
|
||||
|
||||
Person p = new Person(s, aggressive);
|
||||
|
||||
if (p.isAccurate()) {
|
||||
String lastName = p.getNormalisedSurname().toLowerCase();
|
||||
String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1);
|
||||
|
||||
res.add(firstInitial.concat(lastName));
|
||||
}
|
||||
else { // is not accurate, meaning it has no defined name and surname
|
||||
List<String> fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
|
||||
if (fullname.size() == 1) {
|
||||
res.add(p.getNormalisedFullname().toLowerCase());
|
||||
}
|
||||
else if (fullname.size() == 2) {
|
||||
res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase());
|
||||
res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase());
|
||||
}
|
||||
else {
|
||||
res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase());
|
||||
res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase());
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
@ClusteringClass("lowercase")
|
||||
public class LowercaseClustering extends AbstractClusteringFunction {
|
||||
|
||||
public LowercaseClustering(final Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(Config conf, List<Field> fields) {
|
||||
Collection<String> c = Sets.newLinkedHashSet();
|
||||
for(Field f : fields) {
|
||||
c.addAll(doApply(conf, f.stringValue()));
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
if(StringUtils.isBlank(s)) {
|
||||
return Lists.newArrayList();
|
||||
}
|
||||
return Lists.newArrayList(s.toLowerCase().trim());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
|
||||
public class NGramUtils extends AbstractPaceFunctions {
|
||||
|
||||
private static final int SIZE = 100;
|
||||
|
||||
private static Set<String> stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||
|
||||
public static String cleanupForOrdering(String s) {
|
||||
NGramUtils utils = new NGramUtils();
|
||||
return (utils.filterStopWords(utils.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE)).substring(0, SIZE).replaceAll(" ", "");
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("ngrampairs")
|
||||
public class NgramPairs extends Ngrams {
|
||||
|
||||
public NgramPairs(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max"));
|
||||
}
|
||||
|
||||
protected Collection<String> ngramPairs(final List<String> ngrams, int maxNgrams) {
|
||||
Collection<String> res = Lists.newArrayList();
|
||||
int j = 0;
|
||||
for (int i = 0; i < ngrams.size() && res.size() < maxNgrams; i++) {
|
||||
if (++j >= ngrams.size()) {
|
||||
break;
|
||||
}
|
||||
res.add(ngrams.get(i) + ngrams.get(j));
|
||||
//System.out.println("-- " + concatNgrams);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
@ClusteringClass("ngrams")
|
||||
public class Ngrams extends AbstractClusteringFunction {
|
||||
|
||||
public Ngrams(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
|
||||
}
|
||||
|
||||
protected Collection<String> getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) {
|
||||
|
||||
final Collection<String> ngrams = new LinkedHashSet<String>();
|
||||
final StringTokenizer st = new StringTokenizer(s);
|
||||
|
||||
while (st.hasMoreTokens()) {
|
||||
final String token = st.nextToken();
|
||||
if (!token.isEmpty()) {
|
||||
|
||||
for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) {
|
||||
String ngram = (token + " ").substring(i, ngramLen + i).trim();
|
||||
if (ngrams.size() >= max) {
|
||||
return ngrams;
|
||||
}
|
||||
if (ngram.length() >= minNgramLen) {
|
||||
ngrams.add(ngram);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
//System.out.println(ngrams + " n: " + ngrams.size());
|
||||
return ngrams;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,81 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
@ClusteringClass("personClustering")
|
||||
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
||||
|
||||
private Map<String, Integer> params;
|
||||
|
||||
private static final int MAX_TOKENS = 5;
|
||||
|
||||
public PersonClustering(final Map<String, Integer> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(final Config conf, final List<Field> fields) {
|
||||
final Set<String> hashes = Sets.newHashSet();
|
||||
|
||||
for (final Field f : fields) {
|
||||
|
||||
final Person person = new Person(f.stringValue(), false);
|
||||
|
||||
if (StringUtils.isNotBlank(person.getNormalisedFirstName()) && StringUtils.isNotBlank(person.getNormalisedSurname())) {
|
||||
hashes.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase());
|
||||
} else {
|
||||
for (final String token1 : tokens(f.stringValue(), MAX_TOKENS)) {
|
||||
for (final String token2 : tokens(f.stringValue(), MAX_TOKENS)) {
|
||||
if (!token1.equals(token2)) {
|
||||
hashes.add(firstLC(token1) + token2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return hashes;
|
||||
}
|
||||
|
||||
// @Override
|
||||
// public Collection<String> apply(final List<Field> fields) {
|
||||
// final Set<String> hashes = Sets.newHashSet();
|
||||
//
|
||||
// for (final Field f : fields) {
|
||||
//
|
||||
// final GTAuthor gta = GTAuthor.fromOafJson(f.stringValue());
|
||||
//
|
||||
// final Author a = gta.getAuthor();
|
||||
//
|
||||
// if (StringUtils.isNotBlank(a.getFirstname()) && StringUtils.isNotBlank(a.getSecondnames())) {
|
||||
// hashes.add(firstLC(a.getFirstname()) + a.getSecondnames().toLowerCase());
|
||||
// } else {
|
||||
// for (final String token1 : tokens(f.stringValue(), MAX_TOKENS)) {
|
||||
// for (final String token2 : tokens(f.stringValue(), MAX_TOKENS)) {
|
||||
// if (!token1.equals(token2)) {
|
||||
// hashes.add(firstLC(token1) + token2);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return hashes;
|
||||
// }
|
||||
|
||||
@Override
|
||||
public Map<String, Integer> getParams() {
|
||||
return params;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
|
||||
@ClusteringClass("personHash")
|
||||
public class PersonHash extends AbstractClusteringFunction {
|
||||
|
||||
private boolean DEFAULT_AGGRESSIVE = false;
|
||||
|
||||
public PersonHash(final Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
|
||||
|
||||
res.add(new Person(s, aggressive).hash());
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
|
||||
public class RandomClusteringFunction extends AbstractClusteringFunction {
|
||||
|
||||
public RandomClusteringFunction(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, String s) {
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("sortedngrampairs")
|
||||
public class SortedNgramPairs extends NgramPairs {
|
||||
|
||||
public SortedNgramPairs(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
|
||||
final List<String> tokens = Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().trimResults().split(s));
|
||||
|
||||
Collections.sort(tokens);
|
||||
|
||||
return ngramPairs(Lists.newArrayList(getNgrams(Joiner.on(" ").join(tokens), param("ngramLen"), param("max") * 2, 1, 2)), param("max"));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import org.apache.commons.lang3.RandomStringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
@ClusteringClass("spacetrimmingfieldvalue")
|
||||
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
|
||||
|
||||
public SpaceTrimmingFieldValue(final Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("suffixprefix")
|
||||
public class SuffixPrefix extends AbstractClusteringFunction {
|
||||
|
||||
public SuffixPrefix(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return suffixPrefix(s, param("len"), param("max"));
|
||||
}
|
||||
|
||||
private Collection<String> suffixPrefix(String s, int len, int max) {
|
||||
final Set<String> bigrams = Sets.newLinkedHashSet();
|
||||
int i = 0;
|
||||
while (++i < s.length() && bigrams.size() < max) {
|
||||
int j = s.indexOf(" ", i);
|
||||
|
||||
int offset = j + len + 1 < s.length() ? j + len + 1 : s.length();
|
||||
|
||||
if (j - len > 0) {
|
||||
String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
|
||||
if (bigram.length() >= 4) {
|
||||
bigrams.add(bigram);
|
||||
}
|
||||
}
|
||||
}
|
||||
return bigrams;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ClusteringClass("urlclustering")
|
||||
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
||||
|
||||
protected Map<String, Integer> params;
|
||||
|
||||
public UrlClustering(final Map<String, Integer> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(final Config conf, List<Field> fields) {
|
||||
try {
|
||||
return fields.stream()
|
||||
.filter(f -> !f.isEmpty())
|
||||
.map(Field::stringValue)
|
||||
.map(this::asUrl)
|
||||
.map(URL::getHost)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
catch (IllegalStateException e){
|
||||
return new HashSet<>();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Integer> getParams() {
|
||||
return null;
|
||||
}
|
||||
|
||||
private URL asUrl(String value) {
|
||||
try {
|
||||
return new URL(value);
|
||||
} catch (MalformedURLException e) {
|
||||
// should not happen as checked by pace typing
|
||||
throw new IllegalStateException("invalid URL: " + value);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,90 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ClusteringClass("wordsStatsSuffixPrefixChain")
|
||||
public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
|
||||
|
||||
public WordsStatsSuffixPrefixChain(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return suffixPrefixChain(s, param("mod"));
|
||||
}
|
||||
|
||||
private Collection<String> suffixPrefixChain(String s, int mod) {
|
||||
|
||||
//create the list of words from the string (remove short words)
|
||||
List<String> wordsList =
|
||||
Arrays.stream(s.split(" "))
|
||||
.filter(si -> si.length() > 3)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
final int words = wordsList.size();
|
||||
final int letters = s.length();
|
||||
|
||||
//create the prefix: number of words + number of letters/mod
|
||||
String prefix = words + "-" + letters/mod + "-";
|
||||
|
||||
return doSuffixPrefixChain(wordsList, prefix);
|
||||
|
||||
}
|
||||
|
||||
private Collection<String> doSuffixPrefixChain(List<String> wordsList, String prefix) {
|
||||
|
||||
Set<String> set = Sets.newLinkedHashSet();
|
||||
switch(wordsList.size()){
|
||||
case 0:
|
||||
case 1:
|
||||
break;
|
||||
case 2:
|
||||
set.add(
|
||||
prefix +
|
||||
suffix(wordsList.get(0), 3) +
|
||||
prefix(wordsList.get(1), 3)
|
||||
);
|
||||
|
||||
set.add(
|
||||
prefix +
|
||||
prefix(wordsList.get(0), 3) +
|
||||
suffix(wordsList.get(1), 3)
|
||||
);
|
||||
|
||||
break;
|
||||
default:
|
||||
set.add(
|
||||
prefix +
|
||||
suffix(wordsList.get(0), 3) +
|
||||
prefix(wordsList.get(1), 3) +
|
||||
suffix(wordsList.get(2), 3)
|
||||
);
|
||||
|
||||
set.add(
|
||||
prefix +
|
||||
prefix(wordsList.get(0), 3) +
|
||||
suffix(wordsList.get(1), 3) +
|
||||
prefix(wordsList.get(2), 3)
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
return set;
|
||||
|
||||
}
|
||||
|
||||
|
||||
private String suffix(String s, int len) {
|
||||
return s.substring(s.length()-len);
|
||||
}
|
||||
|
||||
private String prefix(String s, int len) {
|
||||
return s.substring(0, len);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,57 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("wordssuffixprefix")
|
||||
public class WordsSuffixPrefix extends AbstractClusteringFunction {
|
||||
|
||||
public WordsSuffixPrefix(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return suffixPrefix(s, param("len"), param("max"));
|
||||
}
|
||||
|
||||
private Collection<String> suffixPrefix(String s, int len, int max) {
|
||||
|
||||
final int words = s.split(" ").length;
|
||||
|
||||
// adjust the token length according to the number of words
|
||||
switch (words) {
|
||||
case 1:
|
||||
return Sets.newLinkedHashSet();
|
||||
case 2:
|
||||
return doSuffixPrefix(s, len+2, max, words);
|
||||
case 3:
|
||||
return doSuffixPrefix(s, len+1, max, words);
|
||||
default:
|
||||
return doSuffixPrefix(s, len, max, words);
|
||||
}
|
||||
}
|
||||
|
||||
private Collection<String> doSuffixPrefix(String s, int len, int max, int words) {
|
||||
final Set<String> bigrams = Sets.newLinkedHashSet();
|
||||
int i = 0;
|
||||
while (++i < s.length() && bigrams.size() < max) {
|
||||
int j = s.indexOf(" ", i);
|
||||
|
||||
int offset = j + len + 1 < s.length() ? j + len + 1 : s.length();
|
||||
|
||||
if (j - len > 0) {
|
||||
String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
|
||||
if (bigram.length() >= 4) {
|
||||
bigrams.add(words+bigram);
|
||||
}
|
||||
}
|
||||
}
|
||||
return bigrams;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,357 @@
|
|||
package eu.dnetlib.pace.common;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.model.FieldListImpl;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Set of common functions for the framework
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
public abstract class AbstractPaceFunctions {
|
||||
|
||||
//city map to be used when translating the city names into codes
|
||||
private static Map<String, String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||
|
||||
//list of stopwords in different languages
|
||||
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
|
||||
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
|
||||
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
|
||||
protected static Set<String> stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
|
||||
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
||||
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
||||
|
||||
//transliterator
|
||||
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
//blacklist of ngrams: to avoid generic keys
|
||||
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
||||
|
||||
//html regex for normalization
|
||||
public final String HTML_REGEX = "<[^>]*>";
|
||||
|
||||
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
|
||||
//doi prefix for normalization
|
||||
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||
|
||||
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
|
||||
|
||||
private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
||||
|
||||
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
|
||||
|
||||
protected String concat(final List<String> l) {
|
||||
return Joiner.on(" ").skipNulls().join(l);
|
||||
}
|
||||
|
||||
protected String cleanup(final String s) {
|
||||
|
||||
final String s1 = s.replaceAll(HTML_REGEX, "");
|
||||
final String s2 = unicodeNormalization(s1.toLowerCase());
|
||||
final String s3 = nfd(s2);
|
||||
final String s4 = fixXML(s3);
|
||||
final String s5 = s4.replaceAll("([0-9]+)", " $1 ");
|
||||
final String s6 = transliterate(s5);
|
||||
final String s7 = fixAliases(s6);
|
||||
final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
|
||||
final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
|
||||
final String s10 = s9.replaceAll("\\n", " ");
|
||||
final String s11 = s10.replaceAll("(?m)\\s+", " ");
|
||||
final String s12 = s11.trim();
|
||||
return s12;
|
||||
}
|
||||
|
||||
protected String fixXML(final String a){
|
||||
|
||||
return a.replaceAll("–", " ")
|
||||
.replaceAll("&", " ")
|
||||
.replaceAll(""", " ")
|
||||
.replaceAll("−", " ");
|
||||
}
|
||||
|
||||
protected boolean checkNumbers(final String a, final String b) {
|
||||
final String numbersA = getNumbers(a);
|
||||
final String numbersB = getNumbers(b);
|
||||
final String romansA = getRomans(a);
|
||||
final String romansB = getRomans(b);
|
||||
return !numbersA.equals(numbersB) || !romansA.equals(romansB);
|
||||
}
|
||||
|
||||
protected String getRomans(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final String t : s.split(" ")) {
|
||||
sb.append(isRoman(t) ? t : "");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected boolean isRoman(final String s) {
|
||||
return s.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop").equals("qwertyuiop");
|
||||
}
|
||||
|
||||
protected String getNumbers(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final String t : s.split(" ")) {
|
||||
sb.append(isNumber(t) ? t : "");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public boolean isNumber(String strNum) {
|
||||
if (strNum == null) {
|
||||
return false;
|
||||
}
|
||||
return numberPattern.matcher(strNum).matches();
|
||||
}
|
||||
|
||||
protected static String fixAliases(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
s.chars().forEach(ch -> {
|
||||
final int i = StringUtils.indexOf(aliases_from, ch);
|
||||
sb.append(i >= 0 ? aliases_to.charAt(i) : (char)ch);
|
||||
});
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected static String transliterate(final String s) {
|
||||
try {
|
||||
return transliterator.transliterate(s);
|
||||
}
|
||||
catch(Exception e) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
protected String removeSymbols(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
s.chars().forEach(ch -> {
|
||||
sb.append(StringUtils.contains(alpha, ch) ? (char)ch : ' ');
|
||||
});
|
||||
|
||||
return sb.toString().replaceAll("\\s+", " ");
|
||||
}
|
||||
|
||||
protected String getFirstValue(final Field values) {
|
||||
return (values != null) && !Iterables.isEmpty(values) ? Iterables.getFirst(values, EMPTY_FIELD).stringValue() : "";
|
||||
}
|
||||
|
||||
protected boolean notNull(final String s) {
|
||||
return s != null;
|
||||
}
|
||||
|
||||
protected String normalize(final String s) {
|
||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
public String nfd(final String s) {
|
||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
|
||||
public String utf8(final String s) {
|
||||
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
|
||||
return new String(bytes, StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
public String unicodeNormalization(final String s) {
|
||||
|
||||
Matcher m = hexUnicodePattern.matcher(s);
|
||||
StringBuffer buf = new StringBuffer(s.length());
|
||||
while (m.find()) {
|
||||
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
|
||||
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
|
||||
}
|
||||
m.appendTail(buf);
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
protected String filterStopWords(final String s, final Set<String> stopwords) {
|
||||
final StringTokenizer st = new StringTokenizer(s);
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
while (st.hasMoreTokens()) {
|
||||
final String token = st.nextToken();
|
||||
if (!stopwords.contains(token)) {
|
||||
sb.append(token);
|
||||
sb.append(" ");
|
||||
}
|
||||
}
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
public String filterAllStopWords(String s) {
|
||||
|
||||
s = filterStopWords(s, stopwords_en);
|
||||
s = filterStopWords(s, stopwords_de);
|
||||
s = filterStopWords(s, stopwords_it);
|
||||
s = filterStopWords(s, stopwords_fr);
|
||||
s = filterStopWords(s, stopwords_pt);
|
||||
s = filterStopWords(s, stopwords_es);
|
||||
s = filterStopWords(s, stopwords_gr);
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
|
||||
final Set<String> newset = Sets.newLinkedHashSet();
|
||||
for (final String s : set) {
|
||||
if (!ngramBlacklist.contains(s)) {
|
||||
newset.add(s);
|
||||
}
|
||||
}
|
||||
return newset;
|
||||
}
|
||||
|
||||
public static Set<String> loadFromClasspath(final String classpath) {
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
final Set<String> h = Sets.newHashSet();
|
||||
try {
|
||||
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
||||
h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return Sets.newHashSet();
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
final Map<String, String> m = new HashMap<>();
|
||||
try {
|
||||
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
||||
//string is like this: code;word1;word2;word3
|
||||
String[] line = s.split(";");
|
||||
String value = line[0];
|
||||
for (int i = 1; i < line.length; i++) {
|
||||
m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value);
|
||||
}
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return new HashMap<>();
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
public String removeKeywords(String s, Set<String> keywords) {
|
||||
|
||||
s = " " + s + " ";
|
||||
for (String k : keywords) {
|
||||
s = s.replaceAll(k.toLowerCase(), "");
|
||||
}
|
||||
|
||||
return s.trim();
|
||||
}
|
||||
|
||||
public double commonElementsPercentage(Set<String> s1, Set<String> s2) {
|
||||
|
||||
double longer = Math.max(s1.size(), s2.size());
|
||||
return (double) s1.stream().filter(s2::contains).count() / longer;
|
||||
}
|
||||
|
||||
//convert the set of keywords to codes
|
||||
public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||
return toCodes(keywords, translationMap);
|
||||
}
|
||||
|
||||
public Set<String> citiesToCodes(Set<String> keywords) {
|
||||
return toCodes(keywords, cityMap);
|
||||
}
|
||||
|
||||
protected String firstLC(final String s) {
|
||||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||
}
|
||||
|
||||
protected Iterable<String> tokens(final String s, final int maxTokens) {
|
||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||
}
|
||||
|
||||
public String normalizePid(String pid) {
|
||||
return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
|
||||
}
|
||||
|
||||
//get the list of keywords into the input string
|
||||
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize) {
|
||||
|
||||
String s = s1;
|
||||
|
||||
List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));
|
||||
|
||||
Set<String> codes = new HashSet<>();
|
||||
|
||||
if (tokens.size() < windowSize)
|
||||
windowSize = tokens.size();
|
||||
|
||||
int length = windowSize;
|
||||
|
||||
while (length != 0) {
|
||||
|
||||
for (int i = 0; i <= tokens.size() - length; i++) {
|
||||
String candidate = concat(tokens.subList(i, i + length));
|
||||
if (translationMap.containsKey(candidate)) {
|
||||
codes.add(candidate);
|
||||
s = s.replace(candidate, "").trim();
|
||||
}
|
||||
}
|
||||
|
||||
tokens = Arrays.asList(s.split(" "));
|
||||
length -= 1;
|
||||
}
|
||||
|
||||
return codes;
|
||||
}
|
||||
|
||||
public Set<String> getCities(String s1, int windowSize) {
|
||||
return getKeywords(s1, cityMap, windowSize);
|
||||
}
|
||||
|
||||
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
|
||||
final StringWriter sw = new StringWriter();
|
||||
try {
|
||||
IOUtils.copy(clazz.getResourceAsStream(filename), sw, StandardCharsets.UTF_8);
|
||||
return sw.toString();
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,60 @@
|
|||
package eu.dnetlib.pace.config;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
|
||||
/**
|
||||
* Interface for PACE configuration bean.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
public interface Config {
|
||||
|
||||
/**
|
||||
* Field configuration definitions.
|
||||
*
|
||||
* @return the list of definitions
|
||||
*/
|
||||
public List<FieldDef> model();
|
||||
|
||||
/**
|
||||
* Decision Tree definition
|
||||
*
|
||||
* @return the map representing the decision tree
|
||||
*/
|
||||
public Map<String, TreeNodeDef> decisionTree();
|
||||
|
||||
/**
|
||||
* Field configuration definitions.
|
||||
*
|
||||
* @return the list of definitions
|
||||
*/
|
||||
public Map<String, FieldDef> modelMap();
|
||||
|
||||
/**
|
||||
* Clusterings.
|
||||
*
|
||||
* @return the list
|
||||
*/
|
||||
public List<ClusteringDef> clusterings();
|
||||
|
||||
/**
|
||||
* Blacklists.
|
||||
*
|
||||
* @return the map
|
||||
*/
|
||||
public Map<String, List<Pattern>> blacklists();
|
||||
|
||||
|
||||
/**
|
||||
* Translation map.
|
||||
*
|
||||
* @return the map
|
||||
* */
|
||||
public Map<String, String> translationMap();
|
||||
}
|
|
@ -0,0 +1,163 @@
|
|||
package eu.dnetlib.pace.config;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Maps;
|
||||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.antlr.stringtemplate.StringTemplate;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
|
||||
|
||||
public class DedupConfig implements Config, Serializable {
|
||||
|
||||
private static final Log log = LogFactory.getLog(DedupConfig.class);
|
||||
|
||||
private static String CONFIG_TEMPLATE = "dedupConfig.st";
|
||||
|
||||
private PaceConfig pace;
|
||||
|
||||
private WfConfig wf;
|
||||
|
||||
@JsonIgnore
|
||||
private Map<String, List<Pattern>> blacklists;
|
||||
|
||||
private static Map<String, String> defaults = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
defaults.put("dedupRun", "001");
|
||||
defaults.put("entityType", "result");
|
||||
defaults.put("subEntityType", "resulttype");
|
||||
defaults.put("subEntityValue", "publication");
|
||||
defaults.put("orderField", "title");
|
||||
defaults.put("queueMaxSize", "2000");
|
||||
defaults.put("groupMaxSize", "10");
|
||||
defaults.put("slidingWindowSize", "200");
|
||||
defaults.put("rootBuilder", "result");
|
||||
defaults.put("includeChildren", "true");
|
||||
defaults.put("maxIterations", "20");
|
||||
defaults.put("idPath", "$.id");
|
||||
}
|
||||
|
||||
public DedupConfig() {}
|
||||
|
||||
public static DedupConfig load(final String json) {
|
||||
|
||||
final DedupConfig config;
|
||||
try {
|
||||
config = new ObjectMapper().readValue(json, DedupConfig.class);
|
||||
config.getPace().initModel();
|
||||
config.getPace().initTranslationMap();
|
||||
|
||||
config.blacklists = config.getPace().getBlacklists().entrySet()
|
||||
.stream()
|
||||
.collect(Collectors.toMap(e -> e.getKey(),
|
||||
e ->e.getValue().stream().filter(s -> !StringUtils.isBlank(s)).map(Pattern::compile).collect(Collectors.toList()) ));
|
||||
|
||||
return config;
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Error in parsing configuration json", e);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static DedupConfig loadDefault() throws IOException {
|
||||
return loadDefault(new HashMap<String, String>());
|
||||
}
|
||||
|
||||
public static DedupConfig loadDefault(final Map<String, String> params) throws IOException {
|
||||
|
||||
final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE));
|
||||
|
||||
for (final Entry<String, String> e : defaults.entrySet()) {
|
||||
template.setAttribute(e.getKey(), e.getValue());
|
||||
}
|
||||
for (final Entry<String, String> e : params.entrySet()) {
|
||||
if (template.getAttribute(e.getKey()) != null) {
|
||||
template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue());
|
||||
} else {
|
||||
template.setAttribute(e.getKey(), e.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
final String json = template.toString();
|
||||
return load(json);
|
||||
}
|
||||
|
||||
private String readFromClasspath(final String resource) throws IOException {
|
||||
return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
public PaceConfig getPace() {
|
||||
return pace;
|
||||
}
|
||||
|
||||
public void setPace(final PaceConfig pace) {
|
||||
this.pace = pace;
|
||||
}
|
||||
|
||||
public WfConfig getWf() {
|
||||
return wf;
|
||||
}
|
||||
|
||||
public void setWf(final WfConfig wf) {
|
||||
this.wf = wf;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("unable to serialise configuration", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, TreeNodeDef> decisionTree(){
|
||||
return getPace().getDecisionTree();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<FieldDef> model() {
|
||||
return getPace().getModel();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, FieldDef> modelMap() {
|
||||
return getPace().getModelMap();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<ClusteringDef> clusterings() {
|
||||
return getPace().getClustering();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, List<Pattern>> blacklists() {
|
||||
return blacklists;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> translationMap() {
|
||||
return getPace().translationMap();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,110 @@
|
|||
package eu.dnetlib.pace.config;
|
||||
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
import eu.dnetlib.pace.util.PaceResolver;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class PaceConfig extends AbstractPaceFunctions implements Serializable {
|
||||
|
||||
private List<FieldDef> model;
|
||||
|
||||
private List<ClusteringDef> clustering;
|
||||
private Map<String, TreeNodeDef> decisionTree;
|
||||
|
||||
private Map<String, List<String>> blacklists;
|
||||
private Map<String, List<String>> synonyms;
|
||||
|
||||
@JsonIgnore
|
||||
private Map<String, String> translationMap;
|
||||
|
||||
@JsonIgnore
|
||||
private Map<String, FieldDef> modelMap;
|
||||
|
||||
@JsonIgnore
|
||||
public static PaceResolver resolver = new PaceResolver();
|
||||
|
||||
public PaceConfig() {}
|
||||
|
||||
public void initModel() {
|
||||
modelMap = Maps.newHashMap();
|
||||
for (FieldDef fd : getModel()) {
|
||||
modelMap.put(fd.getName(), fd);
|
||||
}
|
||||
}
|
||||
|
||||
public void initTranslationMap(){
|
||||
translationMap = Maps.newHashMap();
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
for (String key : synonyms.keySet()) {
|
||||
for (String term : synonyms.get(key)){
|
||||
translationMap.put(
|
||||
fixAliases(transliterator.transliterate(term.toLowerCase())),
|
||||
key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Map<String, String> translationMap(){
|
||||
return translationMap;
|
||||
}
|
||||
|
||||
public List<FieldDef> getModel() {
|
||||
return model;
|
||||
}
|
||||
|
||||
public void setModel(final List<FieldDef> model) {
|
||||
this.model = model;
|
||||
}
|
||||
|
||||
public List<ClusteringDef> getClustering() {
|
||||
return clustering;
|
||||
}
|
||||
|
||||
public void setClustering(final List<ClusteringDef> clustering) {
|
||||
this.clustering = clustering;
|
||||
}
|
||||
|
||||
public Map<String, TreeNodeDef> getDecisionTree() {
|
||||
return decisionTree;
|
||||
}
|
||||
|
||||
public void setDecisionTree(Map<String, TreeNodeDef> decisionTree) {
|
||||
this.decisionTree = decisionTree;
|
||||
}
|
||||
|
||||
public Map<String, List<String>> getBlacklists() {
|
||||
return blacklists;
|
||||
}
|
||||
|
||||
public void setBlacklists(final Map<String, List<String>> blacklists) {
|
||||
this.blacklists = blacklists;
|
||||
}
|
||||
|
||||
public Map<String, List<String>> getSynonyms() {
|
||||
return synonyms;
|
||||
}
|
||||
|
||||
public void setSynonyms(Map<String, List<String>> synonyms) {
|
||||
this.synonyms = synonyms;
|
||||
}
|
||||
|
||||
public Map<String, FieldDef> getModelMap() {
|
||||
return modelMap;
|
||||
}
|
||||
|
||||
public void setModelMap(final Map<String, FieldDef> modelMap) {
|
||||
this.modelMap = modelMap;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
package eu.dnetlib.pace.config;
|
||||
|
||||
public enum Type {
|
||||
String, Int, List, JSON, URL, StringConcat, DoubleArray
|
||||
}
|
|
@ -0,0 +1,292 @@
|
|||
package eu.dnetlib.pace.config;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
|
||||
public class WfConfig implements Serializable {
|
||||
|
||||
/**
|
||||
* Entity type.
|
||||
*/
|
||||
private String entityType = "";
|
||||
|
||||
/**
|
||||
* Sub-Entity type refers to one of fields declared in the model. See eu.dnetlib.pace.config.PaceConfig.modelMap
|
||||
*/
|
||||
private String subEntityType = "";
|
||||
|
||||
/**
|
||||
* Sub-Entity value declares a value for subTypes to be considered.
|
||||
*/
|
||||
private String subEntityValue = "";
|
||||
|
||||
/**
|
||||
* Field name used to sort the values in the reducer phase.
|
||||
*/
|
||||
private String orderField = "";
|
||||
|
||||
/**
|
||||
* Column Families involved in the relations redirection.
|
||||
*/
|
||||
private List<String> rootBuilder = Lists.newArrayList();
|
||||
|
||||
/**
|
||||
* Set of datasource namespace prefixes that won't be deduplicated.
|
||||
*/
|
||||
private Set<String> skipList = Sets.newHashSet();
|
||||
|
||||
/**
|
||||
* Subprefix used to build the root id, allows multiple dedup runs.
|
||||
*/
|
||||
private String dedupRun = "";
|
||||
|
||||
/**
|
||||
* Similarity threshold.
|
||||
*/
|
||||
private double threshold = 0;
|
||||
|
||||
/** The queue max size. */
|
||||
private int queueMaxSize = 2000;
|
||||
|
||||
/** The group max size. */
|
||||
private int groupMaxSize;
|
||||
|
||||
/** The sliding window size. */
|
||||
private int slidingWindowSize;
|
||||
|
||||
/** The configuration id. */
|
||||
private String configurationId;
|
||||
|
||||
/** The include children. */
|
||||
private boolean includeChildren;
|
||||
|
||||
/** Default maximum number of allowed children. */
|
||||
private final static int MAX_CHILDREN = 10;
|
||||
|
||||
/** Maximum number of allowed children. */
|
||||
private int maxChildren = MAX_CHILDREN;
|
||||
|
||||
|
||||
/** Default maximum number of iterations. */
|
||||
private final static int MAX_ITERATIONS = 20;
|
||||
|
||||
/** Maximum number of iterations */
|
||||
private int maxIterations = MAX_ITERATIONS;
|
||||
|
||||
/** The Jquery path to retrieve the identifier */
|
||||
private String idPath = "$.id";
|
||||
|
||||
public WfConfig() {}
|
||||
|
||||
/**
|
||||
* Instantiates a new dedup config.
|
||||
*
|
||||
* @param entityType
|
||||
* the entity type
|
||||
* @param orderField
|
||||
* the order field
|
||||
* @param rootBuilder
|
||||
* the root builder families
|
||||
* @param dedupRun
|
||||
* the dedup run
|
||||
* @param skipList
|
||||
* the skip list
|
||||
* @param queueMaxSize
|
||||
* the queue max size
|
||||
* @param groupMaxSize
|
||||
* the group max size
|
||||
* @param slidingWindowSize
|
||||
* the sliding window size
|
||||
* @param includeChildren
|
||||
* allows the children to be included in the representative records or not.
|
||||
* @param maxIterations
|
||||
* the maximum number of iterations
|
||||
* @param idPath
|
||||
* the path for the id of the entity
|
||||
*/
|
||||
public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder, final String dedupRun,
|
||||
final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations, final String idPath) {
|
||||
super();
|
||||
this.entityType = entityType;
|
||||
this.orderField = orderField;
|
||||
this.rootBuilder = rootBuilder;
|
||||
this.dedupRun = cleanupStringNumber(dedupRun);
|
||||
this.skipList = skipList;
|
||||
this.queueMaxSize = queueMaxSize;
|
||||
this.groupMaxSize = groupMaxSize;
|
||||
this.slidingWindowSize = slidingWindowSize;
|
||||
this.includeChildren = includeChildren;
|
||||
this.maxIterations = maxIterations;
|
||||
this.idPath = idPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cleanup string number.
|
||||
*
|
||||
* @param s
|
||||
* the s
|
||||
* @return the string
|
||||
*/
|
||||
private String cleanupStringNumber(final String s) {
|
||||
return s.contains("'") ? s.replaceAll("'", "") : s;
|
||||
}
|
||||
|
||||
public boolean hasSubType() {
|
||||
return StringUtils.isNotBlank(getSubEntityType()) && StringUtils.isNotBlank(getSubEntityValue());
|
||||
}
|
||||
|
||||
public String getEntityType() {
|
||||
return entityType;
|
||||
}
|
||||
|
||||
public void setEntityType(final String entityType) {
|
||||
this.entityType = entityType;
|
||||
}
|
||||
|
||||
public String getSubEntityType() {
|
||||
return subEntityType;
|
||||
}
|
||||
|
||||
public void setSubEntityType(final String subEntityType) {
|
||||
this.subEntityType = subEntityType;
|
||||
}
|
||||
|
||||
public String getSubEntityValue() {
|
||||
return subEntityValue;
|
||||
}
|
||||
|
||||
public void setSubEntityValue(final String subEntityValue) {
|
||||
this.subEntityValue = subEntityValue;
|
||||
}
|
||||
|
||||
public String getOrderField() {
|
||||
return orderField;
|
||||
}
|
||||
|
||||
public void setOrderField(final String orderField) {
|
||||
this.orderField = orderField;
|
||||
}
|
||||
|
||||
public List<String> getRootBuilder() {
|
||||
return rootBuilder;
|
||||
}
|
||||
|
||||
public void setRootBuilder(final List<String> rootBuilder) {
|
||||
this.rootBuilder = rootBuilder;
|
||||
}
|
||||
|
||||
public Set<String> getSkipList() {
|
||||
return skipList != null ? skipList : new HashSet<String>();
|
||||
}
|
||||
|
||||
public void setSkipList(final Set<String> skipList) {
|
||||
this.skipList = skipList;
|
||||
}
|
||||
|
||||
public String getDedupRun() {
|
||||
return dedupRun;
|
||||
}
|
||||
|
||||
public void setDedupRun(final String dedupRun) {
|
||||
this.dedupRun = dedupRun;
|
||||
}
|
||||
|
||||
public double getThreshold() {
|
||||
return threshold;
|
||||
}
|
||||
|
||||
public void setThreshold(final double threshold) {
|
||||
this.threshold = threshold;
|
||||
}
|
||||
|
||||
public int getQueueMaxSize() {
|
||||
return queueMaxSize;
|
||||
}
|
||||
|
||||
public void setQueueMaxSize(final int queueMaxSize) {
|
||||
this.queueMaxSize = queueMaxSize;
|
||||
}
|
||||
|
||||
public int getGroupMaxSize() {
|
||||
return groupMaxSize;
|
||||
}
|
||||
|
||||
public void setGroupMaxSize(final int groupMaxSize) {
|
||||
this.groupMaxSize = groupMaxSize;
|
||||
}
|
||||
|
||||
public int getSlidingWindowSize() {
|
||||
return slidingWindowSize;
|
||||
}
|
||||
|
||||
public void setSlidingWindowSize(final int slidingWindowSize) {
|
||||
this.slidingWindowSize = slidingWindowSize;
|
||||
}
|
||||
|
||||
public String getConfigurationId() {
|
||||
return configurationId;
|
||||
}
|
||||
|
||||
public void setConfigurationId(final String configurationId) {
|
||||
this.configurationId = configurationId;
|
||||
}
|
||||
|
||||
public boolean isIncludeChildren() {
|
||||
return includeChildren;
|
||||
}
|
||||
|
||||
public void setIncludeChildren(final boolean includeChildren) {
|
||||
this.includeChildren = includeChildren;
|
||||
}
|
||||
|
||||
public int getMaxChildren() {
|
||||
return maxChildren;
|
||||
}
|
||||
|
||||
public void setMaxChildren(final int maxChildren) {
|
||||
this.maxChildren = maxChildren;
|
||||
}
|
||||
|
||||
|
||||
public int getMaxIterations() {
|
||||
return maxIterations;
|
||||
}
|
||||
|
||||
public void setMaxIterations(int maxIterations) {
|
||||
this.maxIterations = maxIterations;
|
||||
}
|
||||
|
||||
public String getIdPath() {
|
||||
return idPath;
|
||||
}
|
||||
|
||||
public void setIdPath(String idPath) {
|
||||
this.idPath = idPath;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.lang.Object#toString()
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("unable to serialise " + this.getClass().getName(), e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,74 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
|
||||
/**
|
||||
* The Class AbstractField.
|
||||
*/
|
||||
public abstract class AbstractField implements Field {
|
||||
|
||||
/** The type. */
|
||||
protected Type type = Type.String;
|
||||
|
||||
/** The name. */
|
||||
protected String name;
|
||||
|
||||
/**
|
||||
* Instantiates a new abstract field.
|
||||
*/
|
||||
protected AbstractField() {}
|
||||
|
||||
/**
|
||||
* Instantiates a new abstract field.
|
||||
*
|
||||
* @param type
|
||||
* the type
|
||||
* @param name
|
||||
* the name
|
||||
*/
|
||||
protected AbstractField(final Type type, final String name) {
|
||||
this.type = type;
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.model.Field#getName()
|
||||
*/
|
||||
@Override
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.model.Field#getType()
|
||||
*/
|
||||
@Override
|
||||
public Type getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.model.Field#setName(java.lang.String)
|
||||
*/
|
||||
@Override
|
||||
public void setName(final String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.model.Field#setType(eu.dnetlib.pace.config.Type)
|
||||
*/
|
||||
@Override
|
||||
public void setType(final Type type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.pace.clustering.ClusteringFunction;
|
||||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
||||
public class ClusteringDef implements Serializable {
|
||||
|
||||
private String name;
|
||||
|
||||
private List<String> fields;
|
||||
|
||||
private Map<String, Integer> params;
|
||||
|
||||
public ClusteringDef() {}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(final String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public ClusteringFunction clusteringFunction() {
|
||||
return PaceConfig.resolver.getClusteringFunction(getName(), params);
|
||||
}
|
||||
|
||||
public List<String> getFields() {
|
||||
return fields;
|
||||
}
|
||||
|
||||
public void setFields(final List<String> fields) {
|
||||
this.fields = fields;
|
||||
}
|
||||
|
||||
public Map<String, Integer> getParams() {
|
||||
return params;
|
||||
}
|
||||
|
||||
public void setParams(final Map<String, Integer> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("unable to serialise " + this.getClass().getName(), e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* The Interface Document. Models the common operations available on a Pace Document.
|
||||
*/
|
||||
public interface Document {
|
||||
|
||||
/**
|
||||
* Gets the identifier.
|
||||
*
|
||||
* @return the identifier
|
||||
*/
|
||||
String getIdentifier();
|
||||
|
||||
/**
|
||||
* Fields.
|
||||
*
|
||||
* @return the iterable
|
||||
*/
|
||||
Iterable<Field> fields();
|
||||
|
||||
/**
|
||||
* Values.
|
||||
*
|
||||
* @param name
|
||||
* the name
|
||||
* @return the field list
|
||||
*/
|
||||
Field values(String name);
|
||||
|
||||
/**
|
||||
* Field names.
|
||||
*
|
||||
* @return the sets the
|
||||
*/
|
||||
Set<String> fieldNames();
|
||||
}
|
|
@ -0,0 +1,56 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* The Interface Field.
|
||||
*/
|
||||
public interface Field extends Iterable<Field>, Serializable {
|
||||
|
||||
/**
|
||||
* Gets the name.
|
||||
*
|
||||
* @return the name
|
||||
*/
|
||||
public String getName();
|
||||
|
||||
/**
|
||||
* Sets the name.
|
||||
*
|
||||
* @param name
|
||||
* the new name
|
||||
*/
|
||||
public void setName(String name);
|
||||
|
||||
/**
|
||||
* Gets the type.
|
||||
*
|
||||
* @return the type
|
||||
*/
|
||||
public Type getType();
|
||||
|
||||
/**
|
||||
* Sets the type.
|
||||
*
|
||||
* @param type
|
||||
* the new type
|
||||
*/
|
||||
public void setType(Type type);
|
||||
|
||||
/**
|
||||
* Checks if is empty.
|
||||
*
|
||||
* @return true, if is empty
|
||||
*/
|
||||
public boolean isEmpty();
|
||||
|
||||
/**
|
||||
* String value.
|
||||
*
|
||||
* @return the string
|
||||
*/
|
||||
public String stringValue();
|
||||
|
||||
}
|
|
@ -0,0 +1,114 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm.
|
||||
*/
|
||||
public class FieldDef implements Serializable {
|
||||
|
||||
public final static String PATH_SEPARATOR = "/";
|
||||
|
||||
private String name;
|
||||
|
||||
private String path;
|
||||
|
||||
private Type type;
|
||||
|
||||
private boolean overrideMatch;
|
||||
|
||||
/**
|
||||
* Sets maximum size for the repeatable fields in the model. -1 for unbounded size.
|
||||
*/
|
||||
private int size = -1;
|
||||
|
||||
/**
|
||||
* Sets maximum length for field values in the model. -1 for unbounded length.
|
||||
*/
|
||||
private int length = -1;
|
||||
|
||||
public FieldDef() {}
|
||||
|
||||
// def apply(s: String): Field[A]
|
||||
public Field apply(final Type type, final String s) {
|
||||
switch (type) {
|
||||
case Int:
|
||||
return new FieldValueImpl(type, name, Integer.parseInt(s));
|
||||
case String:
|
||||
return new FieldValueImpl(type, name, s);
|
||||
case List:
|
||||
return new FieldListImpl(name, type);
|
||||
default:
|
||||
throw new IllegalArgumentException("Casting not implemented for type " + type);
|
||||
}
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public String getPath() {
|
||||
return path;
|
||||
}
|
||||
|
||||
public List<String> getPathList() {
|
||||
return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath()));
|
||||
}
|
||||
|
||||
public Type getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(final Type type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public boolean isOverrideMatch() {
|
||||
return overrideMatch;
|
||||
}
|
||||
|
||||
public void setOverrideMatch(final boolean overrideMatch) {
|
||||
this.overrideMatch = overrideMatch;
|
||||
}
|
||||
|
||||
public int getSize() {
|
||||
return size;
|
||||
}
|
||||
|
||||
public void setSize(int size) {
|
||||
this.size = size;
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
return length;
|
||||
}
|
||||
|
||||
public void setLength(int length) {
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public void setPath(String path) {
|
||||
this.path = path;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (JsonProcessingException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Interface FieldList.
|
||||
*/
|
||||
public interface FieldList extends List<Field>, Field {
|
||||
|
||||
/**
|
||||
* String list.
|
||||
*
|
||||
* @return the list
|
||||
*/
|
||||
public List<String> stringList();
|
||||
|
||||
/**
|
||||
* Double[] Array
|
||||
*
|
||||
* @return the double[] array
|
||||
*/
|
||||
public double[] doubleArray();
|
||||
|
||||
}
|
|
@ -0,0 +1,338 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
|
||||
/**
|
||||
* The Class FieldListImpl.
|
||||
*/
|
||||
public class FieldListImpl extends AbstractField implements FieldList {
|
||||
|
||||
/** The fields. */
|
||||
private List<Field> fields;
|
||||
|
||||
/**
|
||||
* Instantiates a new field list impl.
|
||||
*/
|
||||
public FieldListImpl() {
|
||||
fields = Lists.newArrayList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new field list impl.
|
||||
*
|
||||
* @param name
|
||||
* the name
|
||||
*/
|
||||
public FieldListImpl(final String name, final Type type) {
|
||||
super(type, name);
|
||||
fields = Lists.newArrayList();
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#add(java.lang.Object)
|
||||
*/
|
||||
@Override
|
||||
public boolean add(final Field f) {
|
||||
return fields.add(f);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#add(int, java.lang.Object)
|
||||
*/
|
||||
@Override
|
||||
public void add(final int i, final Field f) {
|
||||
fields.add(i, f);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#addAll(java.util.Collection)
|
||||
*/
|
||||
@Override
|
||||
public boolean addAll(final Collection<? extends Field> f) {
|
||||
return fields.addAll(f);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#addAll(int, java.util.Collection)
|
||||
*/
|
||||
@Override
|
||||
public boolean addAll(final int i, final Collection<? extends Field> f) {
|
||||
return fields.addAll(i, f);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#clear()
|
||||
*/
|
||||
@Override
|
||||
public void clear() {
|
||||
fields.clear();
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#contains(java.lang.Object)
|
||||
*/
|
||||
@Override
|
||||
public boolean contains(final Object o) {
|
||||
return fields.contains(o);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#containsAll(java.util.Collection)
|
||||
*/
|
||||
@Override
|
||||
public boolean containsAll(final Collection<?> f) {
|
||||
return fields.containsAll(f);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#get(int)
|
||||
*/
|
||||
@Override
|
||||
public Field get(final int i) {
|
||||
return fields.get(i);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#indexOf(java.lang.Object)
|
||||
*/
|
||||
@Override
|
||||
public int indexOf(final Object o) {
|
||||
return fields.indexOf(o);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.model.Field#isEmpty()
|
||||
*/
|
||||
@Override
|
||||
public boolean isEmpty() {
|
||||
return Iterables.all(fields, f -> f.isEmpty());
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.lang.Iterable#iterator()
|
||||
*/
|
||||
@Override
|
||||
public Iterator<Field> iterator() {
|
||||
return fields.iterator();
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#lastIndexOf(java.lang.Object)
|
||||
*/
|
||||
@Override
|
||||
public int lastIndexOf(final Object o) {
|
||||
return fields.lastIndexOf(o);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#listIterator()
|
||||
*/
|
||||
@Override
|
||||
public ListIterator<Field> listIterator() {
|
||||
return fields.listIterator();
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#listIterator(int)
|
||||
*/
|
||||
@Override
|
||||
public ListIterator<Field> listIterator(final int i) {
|
||||
return fields.listIterator(i);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#remove(java.lang.Object)
|
||||
*/
|
||||
@Override
|
||||
public boolean remove(final Object o) {
|
||||
return fields.remove(o);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#remove(int)
|
||||
*/
|
||||
@Override
|
||||
public Field remove(final int i) {
|
||||
return fields.remove(i);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#removeAll(java.util.Collection)
|
||||
*/
|
||||
@Override
|
||||
public boolean removeAll(final Collection<?> f) {
|
||||
return fields.removeAll(f);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#retainAll(java.util.Collection)
|
||||
*/
|
||||
@Override
|
||||
public boolean retainAll(final Collection<?> f) {
|
||||
return fields.retainAll(f);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#set(int, java.lang.Object)
|
||||
*/
|
||||
@Override
|
||||
public Field set(final int i, final Field f) {
|
||||
return fields.set(i, f);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#size()
|
||||
*/
|
||||
@Override
|
||||
public int size() {
|
||||
return fields.size();
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#subList(int, int)
|
||||
*/
|
||||
@Override
|
||||
public List<Field> subList(final int from, final int to) {
|
||||
return fields.subList(from, to);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#toArray()
|
||||
*/
|
||||
@Override
|
||||
public Object[] toArray() {
|
||||
return fields.toArray();
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.List#toArray(java.lang.Object[])
|
||||
*/
|
||||
@Override
|
||||
public <T> T[] toArray(final T[] t) {
|
||||
return fields.toArray(t);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.model.Field#stringValue()
|
||||
*/
|
||||
@Override
|
||||
public String stringValue() {
|
||||
switch (getType()) {
|
||||
|
||||
case List:
|
||||
case Int:
|
||||
case String:
|
||||
return Joiner.on(" ").join(stringList());
|
||||
case JSON:
|
||||
String json;
|
||||
try {
|
||||
json = new ObjectMapper().writeValueAsString(this);
|
||||
} catch (JsonProcessingException e) {
|
||||
json = null;
|
||||
}
|
||||
return json;
|
||||
default:
|
||||
throw new IllegalArgumentException("Unknown type: " + getType().toString());
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.model.FieldList#stringList()
|
||||
*/
|
||||
@Override
|
||||
public List<String> stringList() {
|
||||
return Lists.newArrayList(Iterables.transform(fields, getValuesTransformer()));
|
||||
}
|
||||
|
||||
private Function<Field, String> getValuesTransformer() {
|
||||
return new Function<Field, String>() {
|
||||
|
||||
@Override
|
||||
public String apply(final Field f) {
|
||||
return f.stringValue();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public double[] doubleArray() {
|
||||
return Lists.newArrayList(Iterables.transform(fields, getDouble())).stream().mapToDouble(d-> d).toArray();
|
||||
}
|
||||
|
||||
private Function<Field,Double> getDouble() {
|
||||
|
||||
return new Function<Field, Double>() {
|
||||
@Override
|
||||
public Double apply(final Field f) {
|
||||
return Double.parseDouble(f.stringValue());
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return stringList().toString();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
/**
|
||||
* The Interface FieldValue.
|
||||
*/
|
||||
public interface FieldValue extends Field {
|
||||
|
||||
/**
|
||||
* Gets the value.
|
||||
*
|
||||
* @return the value
|
||||
*/
|
||||
public Object getValue();
|
||||
|
||||
/**
|
||||
* Sets the value.
|
||||
*
|
||||
* @param value
|
||||
* the new value
|
||||
*/
|
||||
public void setValue(final Object value);
|
||||
|
||||
public double[] doubleArrayValue();
|
||||
|
||||
}
|
|
@ -0,0 +1,136 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
/**
|
||||
* The Class FieldValueImpl.
|
||||
*/
|
||||
public class FieldValueImpl extends AbstractField implements FieldValue {
|
||||
|
||||
/** The value. */
|
||||
private Object value = null;
|
||||
|
||||
/**
|
||||
* Instantiates a new field value impl.
|
||||
*/
|
||||
public FieldValueImpl() {}
|
||||
|
||||
/**
|
||||
* Instantiates a new field value impl.
|
||||
*
|
||||
* @param type
|
||||
* the type
|
||||
* @param name
|
||||
* the name
|
||||
* @param value
|
||||
* the value
|
||||
*/
|
||||
public FieldValueImpl(final Type type, final String name, final Object value) {
|
||||
super(type, name);
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.model.Field#isEmpty()
|
||||
*/
|
||||
@Override
|
||||
public boolean isEmpty() {
|
||||
if (value == null) return false;
|
||||
|
||||
switch (type) {
|
||||
case String:
|
||||
case JSON:
|
||||
return value.toString().isEmpty();
|
||||
case List:
|
||||
try {
|
||||
List<?> list = (List<?>) value;
|
||||
return list.isEmpty() || ((FieldValueImpl) list.get(0)).isEmpty();
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(value.toString());
|
||||
}
|
||||
case URL:
|
||||
String str = value.toString();
|
||||
return StringUtils.isBlank(str) || !isValidURL(str);
|
||||
case DoubleArray:
|
||||
return doubleArrayValue().length==0;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isValidURL(final String s) {
|
||||
try {
|
||||
new URL(s);
|
||||
return true;
|
||||
} catch (MalformedURLException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.model.FieldValue#getValue()
|
||||
*/
|
||||
@Override
|
||||
public Object getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.model.FieldValue#setValue(java.lang.Object)
|
||||
*/
|
||||
@Override
|
||||
public void setValue(final Object value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.model.Field#stringValue()
|
||||
*/
|
||||
@Override
|
||||
// @SuppressWarnings("unchecked")
|
||||
public String stringValue() {
|
||||
return String.valueOf(getValue());
|
||||
// switch (getType()) {
|
||||
//
|
||||
// case Int:
|
||||
// return String.valueOf(getValue());
|
||||
// case List:
|
||||
// return Joiner.on(" ").join((List<String>) getValue());
|
||||
// case String:
|
||||
// return (String) getValue();
|
||||
// default:
|
||||
// throw new IllegalArgumentException("Unknown type: " + getType().toString());
|
||||
// }
|
||||
}
|
||||
|
||||
public double[] doubleArrayValue() {
|
||||
return (double[])getValue();
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.lang.Iterable#iterator()
|
||||
*/
|
||||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public Iterator<Field> iterator() {
|
||||
return Collections.singleton((Field) this).iterator();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,147 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
/**
|
||||
* The Class MapDocument.
|
||||
*/
|
||||
public class MapDocument implements Document, Serializable {
|
||||
|
||||
/** The identifier. */
|
||||
private String identifier;
|
||||
|
||||
/** The field map. */
|
||||
private Map<String, Field> fieldMap;
|
||||
|
||||
/**
|
||||
* Instantiates a new map document.
|
||||
*/
|
||||
public MapDocument() {
|
||||
identifier = null;
|
||||
fieldMap = Maps.newHashMap();
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new map document.
|
||||
*
|
||||
* @param identifier
|
||||
* the identifier
|
||||
* @param fieldMap
|
||||
* the field map
|
||||
*/
|
||||
public MapDocument(final String identifier, final Map<String, Field> fieldMap) {
|
||||
this.setIdentifier(identifier);
|
||||
this.fieldMap = fieldMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new map document.
|
||||
*
|
||||
* @param identifier
|
||||
* the identifier
|
||||
* @param data
|
||||
* the data
|
||||
*/
|
||||
public MapDocument(final String identifier, final byte[] data) {
|
||||
final MapDocument doc = MapDocumentSerializer.decode(data);
|
||||
|
||||
this.fieldMap = doc.fieldMap;
|
||||
this.identifier = doc.identifier;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.model.document.Document#fields()
|
||||
*/
|
||||
@Override
|
||||
public Iterable<Field> fields() {
|
||||
return Lists.newArrayList(Iterables.concat(fieldMap.values()));
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.model.document.Document#values(java.lang.String)
|
||||
*/
|
||||
@Override
|
||||
public Field values(final String name) {
|
||||
return fieldMap.get(name);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.model.document.Document#fieldNames()
|
||||
*/
|
||||
@Override
|
||||
public Set<String> fieldNames() {
|
||||
return fieldMap.keySet();
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.lang.Object#toString()
|
||||
*/
|
||||
@Override
|
||||
public String toString() {
|
||||
return MapDocumentSerializer.toString(this);
|
||||
// return String.format("Document(%s)", fieldMap.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* To byte array.
|
||||
*
|
||||
* @return the byte[]
|
||||
*/
|
||||
public byte[] toByteArray() {
|
||||
return MapDocumentSerializer.toByteArray(this);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.model.document.Document#getIdentifier()
|
||||
*/
|
||||
@Override
|
||||
public String getIdentifier() {
|
||||
return identifier;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the identifier.
|
||||
*
|
||||
* @param identifier
|
||||
* the new identifier
|
||||
*/
|
||||
public void setIdentifier(final String identifier) {
|
||||
this.identifier = identifier;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the field map.
|
||||
*
|
||||
* @return the field map
|
||||
*/
|
||||
public Map<String, Field> getFieldMap() {
|
||||
return fieldMap;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the field map.
|
||||
*
|
||||
* @param fieldMap
|
||||
* the field map
|
||||
*/
|
||||
public void setFieldMap(final Map<String, Field> fieldMap) {
|
||||
this.fieldMap = fieldMap;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
|
||||
/**
|
||||
* The Class MapDocumentComparator.
|
||||
*/
|
||||
public class MapDocumentComparator implements Comparator<Document> {
|
||||
|
||||
/** The comparator field. */
|
||||
private String comparatorField;
|
||||
|
||||
private final FieldList emptyField = new FieldListImpl();
|
||||
|
||||
/**
|
||||
* Instantiates a new map document comparator.
|
||||
*
|
||||
* @param comparatorField
|
||||
* the comparator field
|
||||
*/
|
||||
public MapDocumentComparator(final String comparatorField) {
|
||||
this.comparatorField = comparatorField;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
|
||||
*/
|
||||
@Override
|
||||
public int compare(final Document d1, final Document d2) {
|
||||
|
||||
if (d1.values(comparatorField).isEmpty() || d2.values(comparatorField).isEmpty()) return 0;
|
||||
|
||||
final String o1 = Iterables.getFirst(d1.values(comparatorField), emptyField).stringValue();
|
||||
final String o2 = Iterables.getFirst(d2.values(comparatorField), emptyField).stringValue();
|
||||
|
||||
if ((o1 == null) || (o2 == null)) return 0;
|
||||
|
||||
final String to1 = NGramUtils.cleanupForOrdering(o1);
|
||||
final String to2 = NGramUtils.cleanupForOrdering(o2);
|
||||
|
||||
return to1.compareTo(to2);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,101 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.lang.reflect.Type;
|
||||
|
||||
import com.google.gson.GsonBuilder;
|
||||
import com.google.gson.InstanceCreator;
|
||||
import com.google.gson.JsonDeserializationContext;
|
||||
import com.google.gson.JsonDeserializer;
|
||||
import com.google.gson.JsonElement;
|
||||
import com.google.gson.JsonObject;
|
||||
import com.google.gson.JsonParseException;
|
||||
|
||||
/**
|
||||
* The Class MapDocumentSerializer.
|
||||
*/
|
||||
public class MapDocumentSerializer implements InstanceCreator<MapDocument> {
|
||||
|
||||
@Override
|
||||
public MapDocument createInstance(final Type type) {
|
||||
return new MapDocument();
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode.
|
||||
*
|
||||
* @param s
|
||||
* the String
|
||||
* @return the map document
|
||||
*/
|
||||
public static MapDocument decode(final String s) {
|
||||
final GsonBuilder gson = new GsonBuilder();
|
||||
|
||||
gson.registerTypeAdapter(Field.class, new JsonDeserializer<Field>() {
|
||||
|
||||
@Override
|
||||
public Field deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException {
|
||||
final FieldListImpl fl = new FieldListImpl();
|
||||
if (json.isJsonObject()) {
|
||||
|
||||
fl.add(handleJsonObject(json.getAsJsonObject()));
|
||||
|
||||
} else if (json.isJsonArray()) {
|
||||
|
||||
for (final JsonElement e : json.getAsJsonArray()) {
|
||||
if (e.isJsonObject()) {
|
||||
fl.add(handleJsonObject(e.getAsJsonObject()));
|
||||
}
|
||||
}
|
||||
}
|
||||
return fl;
|
||||
}
|
||||
|
||||
private Field handleJsonObject(final JsonObject o) {
|
||||
final FieldListImpl fl = new FieldListImpl();
|
||||
final String name = o.get("name").getAsString();
|
||||
final String type = o.get("type").getAsString();
|
||||
final String value = o.get("value").getAsString();
|
||||
fl.add(new FieldValueImpl(eu.dnetlib.pace.config.Type.valueOf(type), name, value));
|
||||
return fl;
|
||||
}
|
||||
});
|
||||
|
||||
return gson.create().fromJson(s, MapDocument.class);
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode.
|
||||
*
|
||||
* @param bytes
|
||||
* the bytes
|
||||
* @return the map document
|
||||
*/
|
||||
public static MapDocument decode(final byte[] bytes) {
|
||||
return decode(new String(bytes));
|
||||
}
|
||||
|
||||
/**
|
||||
* To string.
|
||||
*
|
||||
* @param doc
|
||||
* the doc
|
||||
* @return the string
|
||||
*/
|
||||
public static String toString(final MapDocument doc) {
|
||||
final GsonBuilder b = new GsonBuilder();
|
||||
return b.setPrettyPrinting().create().toJson(doc);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* To byte array.
|
||||
*
|
||||
* @param doc
|
||||
* the doc
|
||||
* @return the byte[]
|
||||
*/
|
||||
public static byte[] toByteArray(final MapDocument doc) {
|
||||
return toString(doc).getBytes();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,155 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.Normalizer;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.hash.Hashing;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.util.Capitalise;
|
||||
import eu.dnetlib.pace.util.DotAbbreviations;
|
||||
|
||||
public class Person {
|
||||
|
||||
private static final String UTF8 = "UTF-8";
|
||||
private List<String> name = Lists.newArrayList();
|
||||
private List<String> surname = Lists.newArrayList();
|
||||
private List<String> fullname = Lists.newArrayList();
|
||||
private final String original;
|
||||
|
||||
private static Set<String> particles = null;
|
||||
|
||||
public Person(String s, final boolean aggressive) {
|
||||
original = s;
|
||||
s = Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
s = s.replaceAll("\\(.+\\)", "");
|
||||
s = s.replaceAll("\\[.+\\]", "");
|
||||
s = s.replaceAll("\\{.+\\}", "");
|
||||
s = s.replaceAll("\\s+-\\s+", "-");
|
||||
s = s.replaceAll("[\\p{Punct}&&[^,-]]", " ");
|
||||
s = s.replaceAll("\\d", " ");
|
||||
s = s.replaceAll("\\n", " ");
|
||||
s = s.replaceAll("\\.", " ");
|
||||
s = s.replaceAll("\\s+", " ");
|
||||
|
||||
if (aggressive) {
|
||||
s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", "");
|
||||
// s = s.replaceAll("[\\W&&[^,-]]", "");
|
||||
}
|
||||
|
||||
if (s.contains(",")) { //if the name contains a comma it is easy derivable the name and the surname
|
||||
final String[] arr = s.split(",");
|
||||
if (arr.length == 1) {
|
||||
fullname = splitTerms(arr[0]);
|
||||
} else if (arr.length > 1) {
|
||||
surname = splitTerms(arr[0]);
|
||||
name = splitTerms(arr[1]);
|
||||
fullname.addAll(surname);
|
||||
fullname.addAll(name);
|
||||
}
|
||||
} else {
|
||||
fullname = splitTerms(s);
|
||||
|
||||
int lastInitialPosition = fullname.size();
|
||||
boolean hasSurnameInUpperCase = false;
|
||||
|
||||
for (int i = 0; i < fullname.size(); i++) {
|
||||
final String term = fullname.get(i);
|
||||
if (term.length() == 1) {
|
||||
lastInitialPosition = i;
|
||||
} else if (term.equals(term.toUpperCase())) {
|
||||
hasSurnameInUpperCase = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (lastInitialPosition < (fullname.size() - 1)) { // Case: Michele G. Artini
|
||||
name = fullname.subList(0, lastInitialPosition + 1);
|
||||
surname = fullname.subList(lastInitialPosition + 1, fullname.size());
|
||||
} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
|
||||
for (final String term : fullname) {
|
||||
if ((term.length() > 1) && term.equals(term.toUpperCase())) {
|
||||
surname.add(term);
|
||||
} else {
|
||||
name.add(term);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> splitTerms(final String s) {
|
||||
if (particles == null) {
|
||||
particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
|
||||
}
|
||||
|
||||
final List<String> list = Lists.newArrayList();
|
||||
for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
|
||||
if (!particles.contains(part.toLowerCase())) {
|
||||
list.add(part);
|
||||
}
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
public List<String> getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public String getNameString() {
|
||||
return Joiner.on(" ").join(getName());
|
||||
}
|
||||
|
||||
public List<String> getSurname() {
|
||||
return surname;
|
||||
}
|
||||
|
||||
public List<String> getFullname() {
|
||||
return fullname;
|
||||
}
|
||||
|
||||
public String getOriginal() {
|
||||
return original;
|
||||
}
|
||||
|
||||
public String hash() {
|
||||
return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString();
|
||||
}
|
||||
|
||||
public String getNormalisedFirstName() {
|
||||
return Joiner.on(" ").join(getCapitalFirstnames());
|
||||
}
|
||||
|
||||
public String getNormalisedSurname() {
|
||||
return Joiner.on(" ").join(getCapitalSurname());
|
||||
}
|
||||
|
||||
public String getSurnameString() {
|
||||
return Joiner.on(" ").join(getSurname());
|
||||
}
|
||||
|
||||
public String getNormalisedFullname() {
|
||||
return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname);
|
||||
}
|
||||
|
||||
public List<String> getCapitalFirstnames() {
|
||||
return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), new Capitalise()));
|
||||
}
|
||||
|
||||
public List<String> getCapitalSurname() {
|
||||
return Lists.newArrayList(Iterables.transform(surname, new Capitalise()));
|
||||
}
|
||||
|
||||
public List<String> getNameWithAbbreviations() {
|
||||
return Lists.newArrayList(Iterables.transform(name, new DotAbbreviations()));
|
||||
}
|
||||
|
||||
public boolean isAccurate() {
|
||||
return ((name != null) && (surname != null) && !name.isEmpty() && !surname.isEmpty());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,118 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
public class PersonComparatorUtils {
|
||||
|
||||
private static final int MAX_FULLNAME_LENGTH = 50;
|
||||
|
||||
public static Set<String> getNgramsForPerson(String fullname) {
|
||||
|
||||
Set<String> set = Sets.newHashSet();
|
||||
|
||||
if (fullname.length() > MAX_FULLNAME_LENGTH) {
|
||||
return set;
|
||||
}
|
||||
|
||||
Person p = new Person(fullname, true);
|
||||
|
||||
if (p.isAccurate()) {
|
||||
for (String name : p.getName()) {
|
||||
for (String surname : p.getSurname()) {
|
||||
set.add((name.charAt(0) + "_" + surname).toLowerCase());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
List<String> list = p.getFullname();
|
||||
for (int i = 0; i < list.size(); i++) {
|
||||
if (list.get(i).length() > 1) {
|
||||
for (int j = 0; j < list.size(); j++) {
|
||||
if (i != j) {
|
||||
set.add((list.get(j).charAt(0) + "_" + list.get(i)).toLowerCase());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return set;
|
||||
}
|
||||
|
||||
public static boolean areSimilar(String s1, String s2) {
|
||||
Person p1 = new Person(s1, true);
|
||||
Person p2 = new Person(s2, true);
|
||||
|
||||
if (p1.isAccurate() && p2.isAccurate()) {
|
||||
return verifyNames(p1.getName(), p2.getName()) && verifySurnames(p1.getSurname(), p2.getSurname());
|
||||
} else {
|
||||
return verifyFullnames(p1.getFullname(), p2.getFullname());
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean verifyNames(List<String> list1, List<String> list2) {
|
||||
return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2))
|
||||
&& verifySimilarity(extractInitials(list1), extractInitials(list2));
|
||||
}
|
||||
|
||||
private static boolean verifySurnames(List<String> list1, List<String> list2) {
|
||||
if (list1.size() != list2.size()) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < list1.size(); i++) {
|
||||
if (!list1.get(i).equalsIgnoreCase(list2.get(i))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private static boolean verifyFullnames(List<String> list1, List<String> list2) {
|
||||
Collections.sort(list1);
|
||||
Collections.sort(list2);
|
||||
return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2))
|
||||
&& verifySimilarity(extractInitials(list1), extractInitials(list2));
|
||||
}
|
||||
|
||||
private static List<String> extractExtendedNames(List<String> list) {
|
||||
ArrayList<String> res = Lists.newArrayList();
|
||||
for (String s : list) {
|
||||
if (s.length() > 1) {
|
||||
res.add(s.toLowerCase());
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
private static List<String> extractInitials(List<String> list) {
|
||||
ArrayList<String> res = Lists.newArrayList();
|
||||
for (String s : list) {
|
||||
res.add(s.substring(0, 1).toLowerCase());
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
private static boolean verifySimilarity(List<String> list1, List<String> list2) {
|
||||
if (list1.size() > list2.size()) {
|
||||
return verifySimilarity(list2, list1);
|
||||
}
|
||||
|
||||
// NB: List2 is greater than list1 (or equal)
|
||||
int pos = -1;
|
||||
for (String s : list1) {
|
||||
int curr = list2.indexOf(s);
|
||||
if (curr > pos) {
|
||||
list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm"
|
||||
pos = curr;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,42 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("alwaysMatch")
|
||||
public class AlwaysMatch extends AbstractComparator {
|
||||
|
||||
public AlwaysMatch(final Map<String, String> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public AlwaysMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,154 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@ComparatorClass("authorsMatch")
|
||||
public class AuthorsMatch extends AbstractComparator {
|
||||
|
||||
Map<String, String> params;
|
||||
|
||||
private double SURNAME_THRESHOLD;
|
||||
private double NAME_THRESHOLD;
|
||||
private double FULLNAME_THRESHOLD;
|
||||
private String MODE; //full or surname
|
||||
private int SIZE_THRESHOLD;
|
||||
private String TYPE; //count or percentage
|
||||
private int common;
|
||||
|
||||
public AuthorsMatch(Map<String, String> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
this.params = params;
|
||||
|
||||
MODE = params.getOrDefault("mode", "full");
|
||||
SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
|
||||
NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
|
||||
FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
|
||||
SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
|
||||
TYPE = params.getOrDefault("type", "percentage");
|
||||
common = 0;
|
||||
}
|
||||
|
||||
protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
|
||||
if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD)
|
||||
return 1.0;
|
||||
|
||||
List<Person> aList = ((FieldList) a).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
||||
List<Person> bList = ((FieldList) b).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
||||
|
||||
common = 0;
|
||||
//compare each element of List1 with each element of List2
|
||||
for (Person p1 : aList)
|
||||
|
||||
for (Person p2 : bList) {
|
||||
|
||||
//both persons are inaccurate
|
||||
if (!p1.isAccurate() && !p2.isAccurate()) {
|
||||
//compare just normalized fullnames
|
||||
String fullname1 = normalization(p1.getNormalisedFullname().isEmpty()? p1.getOriginal() : p1.getNormalisedFullname());
|
||||
String fullname2 = normalization(p2.getNormalisedFullname().isEmpty()? p2.getOriginal() : p2.getNormalisedFullname());
|
||||
|
||||
if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) {
|
||||
common += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//one person is inaccurate
|
||||
if (p1.isAccurate() ^ p2.isAccurate()) {
|
||||
//prepare data
|
||||
//data for the accurate person
|
||||
String name = normalization(p1.isAccurate()? p1.getNormalisedFirstName() : p2.getNormalisedFirstName());
|
||||
String surname = normalization(p1.isAccurate()? p1.getNormalisedSurname() : p2.getNormalisedSurname());
|
||||
|
||||
//data for the inaccurate person
|
||||
String fullname = normalization(
|
||||
p1.isAccurate() ? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname()) : (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname())
|
||||
);
|
||||
|
||||
if (fullname.contains(surname)) {
|
||||
if (MODE.equals("full")) {
|
||||
if (fullname.contains(name)) {
|
||||
common += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else { //MODE equals "surname"
|
||||
common += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//both persons are accurate
|
||||
if (p1.isAccurate() && p2.isAccurate()) {
|
||||
|
||||
if (compareSurname(p1, p2)) {
|
||||
if (MODE.equals("full")) {
|
||||
if(compareFirstname(p1, p2)) {
|
||||
common += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else { //MODE equals "surname"
|
||||
common += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//normalization factor to compute the score
|
||||
int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
|
||||
|
||||
if(TYPE.equals("percentage")) {
|
||||
return (double) common / normFactor;
|
||||
}
|
||||
else {
|
||||
return (double) common;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean compareSurname(Person p1, Person p2) {
|
||||
return ssalgo.score(normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD;
|
||||
}
|
||||
|
||||
public boolean compareFirstname(Person p1, Person p2) {
|
||||
|
||||
if(p1.getNormalisedFirstName().length()<=2 || p2.getNormalisedFirstName().length()<=2) {
|
||||
if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName())))
|
||||
return true;
|
||||
}
|
||||
|
||||
return ssalgo.score(normalization(p1.getNormalisedFirstName()), normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD;
|
||||
}
|
||||
|
||||
public String normalization(String s) {
|
||||
return normalize(utf8(cleanup(s)));
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
@ComparatorClass("cityMatch")
|
||||
public class CityMatch extends AbstractComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
public CityMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
Set<String> codes1 = citiesToCodes(cities1);
|
||||
Set<String> codes2 = citiesToCodes(cities2);
|
||||
|
||||
//if no cities are detected, the comparator gives 1.0
|
||||
if (codes1.isEmpty() && codes2.isEmpty())
|
||||
return 1.0;
|
||||
else {
|
||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||
return -1; //undefined if one of the two has no cities
|
||||
return commonElementsPercentage(codes1, codes2);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.model.FieldValueImpl;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ComparatorClass("cosineSimilarity")
|
||||
public class CosineSimilarity extends AbstractComparator {
|
||||
|
||||
Map<String, String> params;
|
||||
|
||||
public CosineSimilarity(Map<String,String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
|
||||
double[] aVector = ((FieldValueImpl) a).doubleArrayValue();
|
||||
double[] bVector = ((FieldValueImpl) b).doubleArrayValue();
|
||||
|
||||
return cosineSimilarity(aVector, bVector);
|
||||
}
|
||||
|
||||
double cosineSimilarity(double[] a, double[] b) {
|
||||
double dotProduct = 0;
|
||||
double normASum = 0;
|
||||
double normBSum = 0;
|
||||
|
||||
for(int i = 0; i < a.length; i ++) {
|
||||
dotProduct += a[i] * b[i];
|
||||
normASum += a[i] * a[i];
|
||||
normBSum += b[i] * b[i];
|
||||
}
|
||||
|
||||
double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum);
|
||||
return dotProduct / eucledianDist;
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
/**
|
||||
* The Class ExactMatch.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ComparatorClass("doiExactMatch")
|
||||
public class DoiExactMatch extends ExactMatchIgnoreCase {
|
||||
|
||||
public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||
|
||||
public DoiExactMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getValue(final Field f) {
|
||||
return super.getValue(f).replaceAll(PREFIX, "");
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("domainExactMatch")
|
||||
public class DomainExactMatch extends ExactMatchIgnoreCase {
|
||||
|
||||
public DomainExactMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String getValue(final Field f) {
|
||||
|
||||
try {
|
||||
return asUrl(super.getValue(f)).getHost();
|
||||
} catch (MalformedURLException e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
private URL asUrl(final String value) throws MalformedURLException {
|
||||
return new URL(value);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,42 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("exactMatch")
|
||||
public class ExactMatch extends AbstractComparator {
|
||||
|
||||
public ExactMatch(Map<String, String> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public ExactMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1.0; //return -1 if a field is missing
|
||||
}
|
||||
return a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("exactMatchIgnoreCase")
|
||||
public class ExactMatchIgnoreCase extends AbstractComparator {
|
||||
|
||||
public ExactMatchIgnoreCase(Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(Field a, Field b, final Config conf) {
|
||||
|
||||
final String fa = getValue(a);
|
||||
final String fb = getValue(b);
|
||||
|
||||
if (fa.isEmpty() || fb.isEmpty())
|
||||
return -1;
|
||||
|
||||
return fa.equalsIgnoreCase(fb) ? 1 : 0;
|
||||
}
|
||||
|
||||
protected String getValue(final Field f) {
|
||||
return getFirstValue(f);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,84 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ComparatorClass("instanceTypeMatch")
|
||||
public class InstanceTypeMatch extends AbstractComparator {
|
||||
|
||||
final Map<String, String> translationMap = new HashMap<>();
|
||||
|
||||
public InstanceTypeMatch(Map<String, String> params){
|
||||
super(params);
|
||||
|
||||
//jolly types
|
||||
translationMap.put("Conference object", "*");
|
||||
translationMap.put("Other literature type", "*");
|
||||
translationMap.put("Unknown", "*");
|
||||
|
||||
//article types
|
||||
translationMap.put("Article", "Article");
|
||||
translationMap.put("Data Paper", "Article");
|
||||
translationMap.put("Software Paper", "Article");
|
||||
translationMap.put("Preprint", "Article");
|
||||
|
||||
//thesis types
|
||||
translationMap.put("Thesis", "Thesis");
|
||||
translationMap.put("Master thesis", "Thesis");
|
||||
translationMap.put("Bachelor thesis", "Thesis");
|
||||
translationMap.put("Doctoral thesis", "Thesis");
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
|
||||
if (a == null || b == null) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
final List<String> sa = ((FieldList) a).stringList();
|
||||
final List<String> sb = ((FieldList) b).stringList();
|
||||
|
||||
if (sa.isEmpty() || sb.isEmpty()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
final Set<String> ca = sa.stream().map(this::translate).collect(Collectors.toSet());
|
||||
final Set<String> cb = sb.stream().map(this::translate).collect(Collectors.toSet());
|
||||
|
||||
//if at least one is a jolly type, it must produce a match
|
||||
if (ca.contains("*") || cb.contains("*"))
|
||||
return 1.0;
|
||||
|
||||
int incommon = Sets.intersection(ca, cb).size();
|
||||
|
||||
//if at least one is in common, it must produce a match
|
||||
return incommon >= 1 ? 1 : 0;
|
||||
}
|
||||
|
||||
public String translate(String term){
|
||||
return translationMap.getOrDefault(term, term);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||
@ComparatorClass("jaroWinkler")
|
||||
public class JaroWinkler extends AbstractComparator {
|
||||
|
||||
public JaroWinkler(Map<String, String> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public JaroWinkler(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinkler(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
return normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,72 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
@ComparatorClass("jaroWinklerNormalizedName")
|
||||
public class JaroWinklerNormalizedName extends AbstractComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
public JaroWinklerNormalizedName(Map<String, String> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public JaroWinklerNormalizedName(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
ca = removeKeywords(ca, keywords1);
|
||||
ca = removeKeywords(ca, cities1);
|
||||
cb = removeKeywords(cb, keywords2);
|
||||
cb = removeKeywords(cb, cities2);
|
||||
|
||||
ca = ca.replaceAll("[ ]{2,}", " ");
|
||||
cb = cb.replaceAll("[ ]{2,}", " ");
|
||||
|
||||
if (ca.isEmpty() && cb.isEmpty())
|
||||
return 1.0;
|
||||
else
|
||||
return normalize(ssalgo.score(ca,cb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||
@ComparatorClass("jaroWinklerTitle")
|
||||
public class JaroWinklerTitle extends AbstractComparator {
|
||||
|
||||
public JaroWinklerTitle(Map<String, String> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public JaroWinklerTitle(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
boolean check = checkNumbers(ca, cb);
|
||||
return check ? 0.5 : normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,77 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ComparatorClass("jsonListMatch")
|
||||
public class JsonListMatch extends AbstractComparator {
|
||||
|
||||
private static final Log log = LogFactory.getLog(JsonListMatch.class);
|
||||
private Map<String, String> params;
|
||||
|
||||
private String MODE; //"percentage" or "count"
|
||||
|
||||
public JsonListMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
|
||||
MODE = params.getOrDefault("mode", "percentage");
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
|
||||
final List<String> sa = ((FieldList) a).stringList();
|
||||
final List<String> sb = ((FieldList) b).stringList();
|
||||
|
||||
if (sa.isEmpty() || sb.isEmpty()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
|
||||
int incommon = Sets.intersection(ca, cb).size();
|
||||
int simDiff = Sets.symmetricDifference(ca, cb).size();
|
||||
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
if (MODE.equals("percentage"))
|
||||
return (double)incommon / (incommon + simDiff);
|
||||
else
|
||||
return incommon;
|
||||
|
||||
}
|
||||
|
||||
//converts every json into a comparable string basing on parameters
|
||||
private String toComparableString(String json){
|
||||
|
||||
StringBuilder st = new StringBuilder(); //to build the string used for comparisons basing on the jpath into parameters
|
||||
|
||||
//for each path in the param list
|
||||
for (String key: params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
|
||||
String path = params.get(key);
|
||||
String value = MapDocumentUtil.getJPathString(path, json);
|
||||
if (value == null || value.isEmpty())
|
||||
value = "";
|
||||
st.append( value + "::");
|
||||
}
|
||||
|
||||
st.setLength(st.length()-2);
|
||||
return st.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
@ComparatorClass("keywordMatch")
|
||||
public class KeywordMatch extends AbstractComparator {
|
||||
|
||||
Map<String, String> params;
|
||||
|
||||
public KeywordMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
Set<String> codes1 = toCodes(keywords1, conf.translationMap());
|
||||
Set<String> codes2 = toCodes(keywords2, conf.translationMap());
|
||||
|
||||
//if no cities are detected, the comparator gives 1.0
|
||||
if (codes1.isEmpty() && codes2.isEmpty())
|
||||
return 1.0;
|
||||
else {
|
||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||
return -1.0; //undefined if one of the two has no keywords
|
||||
return commonElementsPercentage(codes1, codes2);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("level2JaroWinkler")
|
||||
public class Level2JaroWinkler extends AbstractComparator {
|
||||
|
||||
public Level2JaroWinkler(Map<String, String> params){
|
||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
public Level2JaroWinkler(double w) {
|
||||
super(w, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
protected Level2JaroWinkler(double w, AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("level2JaroWinklerTitle")
|
||||
public class Level2JaroWinklerTitle extends AbstractComparator {
|
||||
|
||||
public Level2JaroWinklerTitle(Map<String,String> params){
|
||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
public Level2JaroWinklerTitle(final double w) {
|
||||
super(w, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
protected Level2JaroWinklerTitle(final double w, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
final String ca = cleanup(a);
|
||||
final String cb = cleanup(b);
|
||||
|
||||
final boolean check = checkNumbers(ca, cb);
|
||||
|
||||
if (check) return 0.5;
|
||||
|
||||
return ssalgo.score(ca, cb);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("level2Levenstein")
|
||||
public class Level2Levenstein extends AbstractComparator {
|
||||
|
||||
public Level2Levenstein(Map<String,String> params){
|
||||
super(params, new com.wcohen.ss.Level2Levenstein());
|
||||
}
|
||||
|
||||
public Level2Levenstein(double w) {
|
||||
super(w, new com.wcohen.ss.Level2Levenstein());
|
||||
}
|
||||
|
||||
protected Level2Levenstein(double w, AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("levenstein")
|
||||
public class Levenstein extends AbstractComparator {
|
||||
|
||||
public Levenstein(Map<String,String> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public Levenstein(double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
protected Levenstein(double w, AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,56 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("levensteinTitle")
|
||||
public class LevensteinTitle extends AbstractComparator {
|
||||
|
||||
private static final Log log = LogFactory.getLog(LevensteinTitle.class);
|
||||
|
||||
public LevensteinTitle(Map<String,String> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public LevensteinTitle(final double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
protected LevensteinTitle(final double w, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
final String ca = cleanup(a);
|
||||
final String cb = cleanup(b);
|
||||
|
||||
final boolean check = checkNumbers(ca, cb);
|
||||
|
||||
if (check) return 0.5;
|
||||
|
||||
return normalize(ssalgo.score(ca, cb), ca.length(), cb.length());
|
||||
}
|
||||
|
||||
private double normalize(final double score, final int la, final int lb) {
|
||||
return 1 - (Math.abs(score) / Math.max(la, lb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,57 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Compared compare between two titles, ignoring version numbers. Suitable for Software entities.
|
||||
*/
|
||||
@ComparatorClass("levensteinTitleIgnoreVersion")
|
||||
public class LevensteinTitleIgnoreVersion extends AbstractComparator {
|
||||
|
||||
public LevensteinTitleIgnoreVersion(Map<String,String> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public LevensteinTitleIgnoreVersion(final double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
protected LevensteinTitleIgnoreVersion(final double w, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = ca.replaceAll("\\d", "").replaceAll(getRomans(ca), "").trim();
|
||||
cb = cb.replaceAll("\\d", "").replaceAll(getRomans(cb), "").trim();
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
return normalize(ssalgo.score(ca, cb), ca.length(), cb.length());
|
||||
}
|
||||
|
||||
private double normalize(final double score, final int la, final int lb) {
|
||||
return 1 - (Math.abs(score) / Math.max(la, lb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,74 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* The Class Contains match
|
||||
*
|
||||
* @author miconis
|
||||
* */
|
||||
@ComparatorClass("listContainsMatch")
|
||||
public class ListContainsMatch extends AbstractComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
private boolean CASE_SENSITIVE;
|
||||
private String STRING;
|
||||
private String AGGREGATOR;
|
||||
|
||||
public ListContainsMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
|
||||
//read parameters
|
||||
CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
|
||||
STRING = params.get("string");
|
||||
AGGREGATOR = params.get("bool");
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
|
||||
List<String> sa = ((FieldList) a).stringList();
|
||||
List<String> sb = ((FieldList) b).stringList();
|
||||
|
||||
if (sa.isEmpty() || sb.isEmpty()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (!CASE_SENSITIVE) {
|
||||
sa = sa.stream().map(String::toLowerCase).collect(Collectors.toList());
|
||||
sb = sb.stream().map(String::toLowerCase).collect(Collectors.toList());
|
||||
STRING = STRING.toLowerCase();
|
||||
}
|
||||
|
||||
switch(AGGREGATOR) {
|
||||
case "AND":
|
||||
if(sa.contains(STRING) && sb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
case "OR":
|
||||
if(sa.contains(STRING) || sb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
case "XOR":
|
||||
if(sa.contains(STRING) ^ sb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
default:
|
||||
return 0.0;
|
||||
}
|
||||
return 0.0;
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("mustBeDifferent")
|
||||
public class MustBeDifferent extends AbstractComparator {
|
||||
|
||||
public MustBeDifferent(Map<String,String> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public MustBeDifferent(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected MustBeDifferent(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
return !a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.Comparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Not all fields of a document need to partecipate in the compare measure. We model those fields as having a
|
||||
* NullDistanceAlgo.
|
||||
*/
|
||||
@ComparatorClass("null")
|
||||
public class NullDistanceAlgo implements Comparator {
|
||||
|
||||
public NullDistanceAlgo(Map<String, String> params){
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(Field a, Field b, Config config) {
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("numbersComparator")
|
||||
public class NumbersComparator extends AbstractComparator {
|
||||
|
||||
Map<String, String> params;
|
||||
|
||||
public NumbersComparator(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, Config conf) {
|
||||
|
||||
//extracts numbers from the field
|
||||
String numbers1 = getNumbers(nfd(a));
|
||||
String numbers2 = getNumbers(nfd(b));
|
||||
|
||||
if (numbers1.isEmpty() || numbers2.isEmpty())
|
||||
return -1.0;
|
||||
|
||||
int n1 = Integer.parseInt(numbers1);
|
||||
int n2 = Integer.parseInt(numbers2);
|
||||
|
||||
return Math.abs(n1 - n2);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("numbersMatch")
|
||||
public class NumbersMatch extends AbstractComparator {
|
||||
|
||||
|
||||
public NumbersMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, Config conf) {
|
||||
|
||||
//extracts numbers from the field
|
||||
String numbers1 = getNumbers(nfd(a));
|
||||
String numbers2 = getNumbers(nfd(b));
|
||||
|
||||
if (numbers1.isEmpty() && numbers2.isEmpty())
|
||||
return 1.0;
|
||||
|
||||
if (numbers1.isEmpty() || numbers2.isEmpty())
|
||||
return -1.0;
|
||||
|
||||
if (numbers1.equals(numbers2))
|
||||
return 1.0;
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("romansMatch")
|
||||
public class RomansMatch extends AbstractComparator {
|
||||
|
||||
|
||||
public RomansMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, Config conf) {
|
||||
|
||||
//extracts romans from the field
|
||||
String romans1 = getRomans(nfd(a));
|
||||
String romans2 = getRomans(nfd(b));
|
||||
|
||||
if (romans1.isEmpty() && romans2.isEmpty())
|
||||
return 1.0;
|
||||
|
||||
if (romans1.isEmpty() || romans2.isEmpty())
|
||||
return -1.0;
|
||||
|
||||
if (romans1.equals(romans2))
|
||||
return 1.0;
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
/**
|
||||
* Returns true if the number of values in the fields is the same.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ComparatorClass("sizeMatch")
|
||||
public class SizeMatch extends AbstractComparator {
|
||||
|
||||
/**
|
||||
* Instantiates a new size match.
|
||||
*
|
||||
* @param params
|
||||
* the parameters
|
||||
*/
|
||||
public SizeMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
|
||||
return Iterables.size(a) == Iterables.size(b) ? 1 : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if is empty.
|
||||
*
|
||||
* @param a
|
||||
* the a
|
||||
* @return true, if is empty
|
||||
*/
|
||||
protected boolean isEmpty(final Iterable<?> a) {
|
||||
return (a == null) || Iterables.isEmpty(a);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class SortedJaroWinkler.
|
||||
*/
|
||||
@ComparatorClass("sortedJaroWinkler")
|
||||
public class SortedJaroWinkler extends AbstractSortedComparator {
|
||||
|
||||
public SortedJaroWinkler(Map<String,String> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
*/
|
||||
public SortedJaroWinkler(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected SortedJaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class SortedJaroWinkler.
|
||||
*/
|
||||
@ComparatorClass("sortedLevel2JaroWinkler")
|
||||
public class SortedLevel2JaroWinkler extends AbstractSortedComparator {
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
*/
|
||||
public SortedLevel2JaroWinkler(final double weight) {
|
||||
super(weight, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
public SortedLevel2JaroWinkler(final Map<String, String> params){
|
||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted jaro winkler.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected SortedLevel2JaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class Contains match
|
||||
*
|
||||
* @author miconis
|
||||
* */
|
||||
@ComparatorClass("stringContainsMatch")
|
||||
public class StringContainsMatch extends AbstractComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
private boolean CASE_SENSITIVE;
|
||||
private String STRING;
|
||||
private String AGGREGATOR;
|
||||
|
||||
public StringContainsMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
|
||||
//read parameters
|
||||
CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
|
||||
STRING = params.get("string");
|
||||
AGGREGATOR = params.get("aggregator");
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
String ca = a;
|
||||
String cb = b;
|
||||
if (!CASE_SENSITIVE) {
|
||||
ca = a.toLowerCase();
|
||||
cb = b.toLowerCase();
|
||||
STRING = STRING.toLowerCase();
|
||||
}
|
||||
|
||||
if (AGGREGATOR != null) {
|
||||
switch (AGGREGATOR) {
|
||||
case "AND":
|
||||
if (ca.contains(STRING) && cb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
case "OR":
|
||||
if (ca.contains(STRING) || cb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
case "XOR":
|
||||
if (ca.contains(STRING) ^ cb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
default:
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
@ComparatorClass("stringListMatch")
|
||||
public class StringListMatch extends AbstractComparator {
|
||||
|
||||
private static final Log log = LogFactory.getLog(StringListMatch.class);
|
||||
private Map<String, String> params;
|
||||
|
||||
final private String TYPE; //percentage or count
|
||||
|
||||
public StringListMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
|
||||
TYPE = params.getOrDefault("type", "percentage");
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
|
||||
final Set<String> pa = new HashSet<>(((FieldList) a).stringList());
|
||||
final Set<String> pb = new HashSet<>(((FieldList) b).stringList());
|
||||
|
||||
if (pa.isEmpty() || pb.isEmpty()) {
|
||||
return -1; //return undefined if one of the two lists is empty
|
||||
}
|
||||
|
||||
int incommon = Sets.intersection(pa, pb).size();
|
||||
int simDiff = Sets.symmetricDifference(pa, pb).size();
|
||||
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
if(TYPE.equals("percentage"))
|
||||
return (double)incommon / (incommon + simDiff);
|
||||
else
|
||||
return incommon;
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,100 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class SubStringLevenstein.
|
||||
*/
|
||||
@ComparatorClass("subStringLevenstein")
|
||||
public class SubStringLevenstein extends AbstractComparator {
|
||||
|
||||
/** The limit. */
|
||||
protected int limit;
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w
|
||||
* the w
|
||||
*/
|
||||
public SubStringLevenstein(final double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public SubStringLevenstein(Map<String, String> params){
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
this.limit = Integer.parseInt(params.getOrDefault("limit", "1"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w
|
||||
* the w
|
||||
* @param limit
|
||||
* the limit
|
||||
*/
|
||||
public SubStringLevenstein(final double w, final int limit) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
this.limit = limit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w
|
||||
* the w
|
||||
* @param limit
|
||||
* the limit
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
this.limit = limit;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
public double distance(final Field a, final Field b, final Config conf) {
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
|
||||
return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit), conf);
|
||||
|
||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
/**
|
||||
* Returns true if the titles in the given documents contains the same numbers, false otherwise.
|
||||
*
|
||||
* @author claudio
|
||||
*
|
||||
*/
|
||||
@ComparatorClass("titleVersionMatch")
|
||||
public class TitleVersionMatch extends AbstractComparator {
|
||||
|
||||
public TitleVersionMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
final String valueA = getFirstValue(a);
|
||||
final String valueB = getFirstValue(b);
|
||||
|
||||
if (valueA.isEmpty() || valueB.isEmpty())
|
||||
return -1;
|
||||
|
||||
return notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 1 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName() + ":" + super.toString();
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("urlMatcher")
|
||||
public class UrlMatcher extends Levenstein {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
public UrlMatcher(Map<String, String> params){
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public UrlMatcher(double weight, Map<String, String> params) {
|
||||
super(weight);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public void setParams(Map<String, String> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(Field a, Field b, final Config conf) {
|
||||
final URL urlA = asUrl(getFirstValue(a));
|
||||
final URL urlB = asUrl(getFirstValue(b));
|
||||
|
||||
if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
Double hostW = Double.parseDouble(params.getOrDefault("host", "0.5"));
|
||||
Double pathW = Double.parseDouble(params.getOrDefault("path", "0.5"));
|
||||
|
||||
if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) {
|
||||
return hostW * 0.5;
|
||||
}
|
||||
|
||||
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf);
|
||||
}
|
||||
|
||||
private URL asUrl(final String value) {
|
||||
try {
|
||||
return new URL(value);
|
||||
} catch (MalformedURLException e) {
|
||||
// should not happen as checked by pace typing
|
||||
throw new IllegalStateException("invalid URL: " + value);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Returns true if the year of the date field in the given documents are the same, false when any of the two is invalid or it's missing.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ComparatorClass("yearMatch")
|
||||
public class YearMatch extends AbstractComparator {
|
||||
|
||||
private int limit = 4;
|
||||
|
||||
public YearMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
final String valueA = getNumbers(getFirstValue(a));
|
||||
final String valueB = getNumbers(getFirstValue(b));
|
||||
|
||||
if (valueA.isEmpty() || valueB.isEmpty())
|
||||
return -1;
|
||||
|
||||
final boolean lengthMatch = checkLength(valueA) && checkLength(valueB);
|
||||
final boolean onemissing = valueA.isEmpty() || valueB.isEmpty();
|
||||
|
||||
return lengthMatch && valueA.equals(valueB) || onemissing ? 1 : 0;
|
||||
}
|
||||
|
||||
protected boolean checkLength(final String s) {
|
||||
return s.length() == limit;
|
||||
}
|
||||
|
||||
protected String getFirstValue(final Field value) {
|
||||
return (value != null) && !value.isEmpty() ? StringUtils.left(value.stringValue(), limit) : "";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName() + ":" + super.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,124 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public abstract class AbstractComparator extends AbstractPaceFunctions implements Comparator {
|
||||
|
||||
/** The ssalgo. */
|
||||
protected AbstractStringDistance ssalgo;
|
||||
|
||||
/** The weight. */
|
||||
protected double weight = 0.0;
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
protected AbstractComparator(Map<String, String> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
protected AbstractComparator(Map<String, String> params, final AbstractStringDistance ssalgo){
|
||||
this.params = params;
|
||||
this.weight = 1.0;
|
||||
this.ssalgo = ssalgo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new second string compare algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) {
|
||||
this.ssalgo = ssalgo;
|
||||
this.weight = weight;
|
||||
}
|
||||
|
||||
protected AbstractComparator(final AbstractStringDistance ssalgo){
|
||||
this.ssalgo = ssalgo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize.
|
||||
*
|
||||
* @param d
|
||||
* the d
|
||||
* @return the double
|
||||
*/
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
/**
|
||||
* Distance.
|
||||
*
|
||||
* @param a
|
||||
* the a
|
||||
* @param b
|
||||
* the b
|
||||
* @return the double
|
||||
*/
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1; //return -1 if a field is missing
|
||||
}
|
||||
double score = ssalgo.score(a, b);
|
||||
return normalize(score);
|
||||
}
|
||||
|
||||
/**
|
||||
* Distance.
|
||||
*
|
||||
* @param a
|
||||
* the a
|
||||
* @param b
|
||||
* the b
|
||||
* @return the double
|
||||
*/
|
||||
protected double distance(final List<String> a, final List<String> b, final Config conf) {
|
||||
return distance(concat(a), concat(b), conf);
|
||||
}
|
||||
|
||||
public double distance(final Field a, final Field b, final Config conf) {
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf);
|
||||
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf);
|
||||
|
||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf);
|
||||
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf);
|
||||
|
||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* To list.
|
||||
*
|
||||
* @param list
|
||||
* the list
|
||||
* @return the list
|
||||
*/
|
||||
protected List<String> toList(final Field list) {
|
||||
return ((FieldList) list).stringList();
|
||||
}
|
||||
|
||||
public double getWeight(){
|
||||
return this.weight;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public abstract class AbstractSortedComparator extends AbstractComparator {
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted second string compare algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
protected AbstractSortedComparator(final Map<String, String> params, final AbstractStringDistance ssalgo){
|
||||
super(Double.parseDouble(params.get("weight")), ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> toList(final Field list) {
|
||||
FieldList fl = (FieldList) list;
|
||||
List<String> values = Lists.newArrayList(fl.stringList());
|
||||
Collections.sort(values);
|
||||
return values;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,24 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
public enum AggType {
|
||||
|
||||
W_MEAN, //weighted mean
|
||||
AVG, //average
|
||||
SUM,
|
||||
MAX,
|
||||
MIN,
|
||||
AND, //used for necessary conditions
|
||||
OR; //used for sufficient conditions
|
||||
|
||||
public static AggType getEnum(String value) {
|
||||
|
||||
try {
|
||||
return AggType.valueOf(value);
|
||||
}
|
||||
catch (IllegalArgumentException e) {
|
||||
throw new PaceException("Undefined aggregation type", e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
public interface Comparator {
|
||||
|
||||
/*
|
||||
* return : -1 -> can't decide (i.e. missing field)
|
||||
* >0 -> similarity degree (depends on the algorithm)
|
||||
* */
|
||||
public double compare(Field a, Field b, Config conf);
|
||||
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
@Retention(RetentionPolicy.RUNTIME)
|
||||
@Target(ElementType.TYPE)
|
||||
public @interface ComparatorClass {
|
||||
|
||||
public String value();
|
||||
}
|
|
@ -0,0 +1,82 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The class that defines the configuration of each field in the decision tree.
|
||||
* */
|
||||
public class FieldConf implements Serializable {
|
||||
|
||||
private String field; //name of the field on which apply the comparator
|
||||
private String comparator; //comparator name
|
||||
private double weight = 1.0; //weight for the field (to be used in the aggregation)
|
||||
private Map<String,String> params; //parameters
|
||||
|
||||
private boolean countIfUndefined;
|
||||
|
||||
public boolean isCountIfUndefined() {
|
||||
return countIfUndefined;
|
||||
}
|
||||
|
||||
public void setCountIfUndefined(boolean countIfUndefined) {
|
||||
this.countIfUndefined = countIfUndefined;
|
||||
}
|
||||
|
||||
public FieldConf() {
|
||||
}
|
||||
|
||||
public FieldConf(String field, String comparator, double weight, Map<String, String> params, boolean countIfUndefined) {
|
||||
this.field = field;
|
||||
this.comparator = comparator;
|
||||
this.weight = weight;
|
||||
this.params = params;
|
||||
this.countIfUndefined = countIfUndefined;
|
||||
}
|
||||
|
||||
public String getField() {
|
||||
return field;
|
||||
}
|
||||
|
||||
public void setField(String field) {
|
||||
this.field = field;
|
||||
}
|
||||
|
||||
public String getComparator() {
|
||||
return comparator;
|
||||
}
|
||||
|
||||
public void setComparator(String comparator) {
|
||||
this.comparator = comparator;
|
||||
}
|
||||
|
||||
public double getWeight() {
|
||||
return weight;
|
||||
}
|
||||
|
||||
public void setWeight(double weight) {
|
||||
this.weight = weight;
|
||||
}
|
||||
|
||||
public Map<String, String> getParams() {
|
||||
return params;
|
||||
}
|
||||
|
||||
public void setParams(Map<String, String> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,89 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* The class that contains the result of each comparison in the decision tree
|
||||
* */
|
||||
public class FieldStats implements Serializable {
|
||||
|
||||
private double weight; //weight for the field (to be used in the aggregation)
|
||||
private double threshold; //threshold for the field (to be used in some kind of aggregations)
|
||||
private double result; //the result of the comparison
|
||||
private Field a;
|
||||
private Field b;
|
||||
|
||||
private boolean countIfUndefined;
|
||||
|
||||
public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Field a, Field b) {
|
||||
this.weight = weight;
|
||||
this.threshold = threshold;
|
||||
this.result = result;
|
||||
this.countIfUndefined = countIfUndefined;
|
||||
this.a = a;
|
||||
this.b = b;
|
||||
}
|
||||
|
||||
public double getThreshold() {
|
||||
return threshold;
|
||||
}
|
||||
|
||||
public void setThreshold(double threshold) {
|
||||
this.threshold = threshold;
|
||||
}
|
||||
|
||||
public double getWeight() {
|
||||
return weight;
|
||||
}
|
||||
|
||||
public void setWeight(double weight) {
|
||||
this.weight = weight;
|
||||
}
|
||||
|
||||
public double getResult() {
|
||||
return result;
|
||||
}
|
||||
|
||||
public void setResult(double result) {
|
||||
this.result = result;
|
||||
}
|
||||
|
||||
public boolean isCountIfUndefined() {
|
||||
return countIfUndefined;
|
||||
}
|
||||
|
||||
public void setCountIfUndefined(boolean countIfUndefined) {
|
||||
this.countIfUndefined = countIfUndefined;
|
||||
}
|
||||
|
||||
public Field getA() {
|
||||
return a;
|
||||
}
|
||||
|
||||
public void setA(Field a) {
|
||||
this.a = a;
|
||||
}
|
||||
|
||||
public Field getB() {
|
||||
return b;
|
||||
}
|
||||
|
||||
public void setB(Field b) {
|
||||
this.b = b;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(){
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
public enum MatchType {
|
||||
|
||||
MATCH,
|
||||
NO_MATCH,
|
||||
UNDEFINED;
|
||||
|
||||
public static MatchType parse(String value) {
|
||||
|
||||
try {
|
||||
return MatchType.valueOf(value);
|
||||
}
|
||||
catch (IllegalArgumentException e) {
|
||||
return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,150 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.io.StringWriter;
|
||||
import java.util.List;
|
||||
|
||||
public class TreeNodeDef implements Serializable {
|
||||
|
||||
final static String CROSS_COMPARE = "crossCompare";
|
||||
|
||||
private List<FieldConf> fields;
|
||||
private AggType aggregation;
|
||||
|
||||
private double threshold;
|
||||
|
||||
private String positive;
|
||||
private String negative;
|
||||
private String undefined;
|
||||
|
||||
boolean ignoreUndefined;
|
||||
|
||||
public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreUndefined) {
|
||||
this.fields = fields;
|
||||
this.aggregation = aggregation;
|
||||
this.threshold = threshold;
|
||||
this.positive = positive;
|
||||
this.negative = negative;
|
||||
this.undefined = undefined;
|
||||
this.ignoreUndefined = ignoreUndefined;
|
||||
}
|
||||
|
||||
public TreeNodeDef() {}
|
||||
|
||||
//function for the evaluation of the node
|
||||
public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) {
|
||||
|
||||
TreeNodeStats stats = new TreeNodeStats();
|
||||
|
||||
//for each field in the node, it computes the
|
||||
for (FieldConf fieldConf : fields) {
|
||||
|
||||
double weight = fieldConf.getWeight();
|
||||
|
||||
double result;
|
||||
|
||||
//if the param specifies a cross comparison (i.e. compare elements from different fields), compute the result for both sides and return the maximum
|
||||
if(fieldConf.getParams().keySet().stream().anyMatch(k -> k.contains(CROSS_COMPARE))) {
|
||||
String crossField = fieldConf.getParams().get(CROSS_COMPARE);
|
||||
double result1 = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(crossField), conf);
|
||||
double result2 = comparator(fieldConf).compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||
result = Math.max(result1,result2);
|
||||
}
|
||||
else {
|
||||
result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||
}
|
||||
|
||||
stats.addFieldStats(
|
||||
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
|
||||
new FieldStats(
|
||||
weight,
|
||||
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")),
|
||||
result,
|
||||
fieldConf.isCountIfUndefined(),
|
||||
doc1.getFieldMap().get(fieldConf.getField()),
|
||||
doc2.getFieldMap().get(fieldConf.getField())
|
||||
));
|
||||
}
|
||||
|
||||
return stats;
|
||||
}
|
||||
|
||||
private Comparator comparator(final FieldConf field){
|
||||
|
||||
return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams());
|
||||
}
|
||||
|
||||
public List<FieldConf> getFields() {
|
||||
return fields;
|
||||
}
|
||||
|
||||
public void setFields(List<FieldConf> fields) {
|
||||
this.fields = fields;
|
||||
}
|
||||
|
||||
public AggType getAggregation() {
|
||||
return aggregation;
|
||||
}
|
||||
|
||||
public void setAggregation(AggType aggregation) {
|
||||
this.aggregation = aggregation;
|
||||
}
|
||||
|
||||
public double getThreshold() {
|
||||
return threshold;
|
||||
}
|
||||
|
||||
public void setThreshold(double threshold) {
|
||||
this.threshold = threshold;
|
||||
}
|
||||
|
||||
public String getPositive() {
|
||||
return positive;
|
||||
}
|
||||
|
||||
public void setPositive(String positive) {
|
||||
this.positive = positive;
|
||||
}
|
||||
|
||||
public String getNegative() {
|
||||
return negative;
|
||||
}
|
||||
|
||||
public void setNegative(String negative) {
|
||||
this.negative = negative;
|
||||
}
|
||||
|
||||
public String getUndefined() {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
public void setUndefined(String undefined) {
|
||||
this.undefined = undefined;
|
||||
}
|
||||
|
||||
public boolean isIgnoreUndefined() {
|
||||
return ignoreUndefined;
|
||||
}
|
||||
|
||||
public void setIgnoreUndefined(boolean ignoreUndefined) {
|
||||
this.ignoreUndefined = ignoreUndefined;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,134 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class TreeNodeStats implements Serializable {
|
||||
|
||||
private Map<String, FieldStats> results; //this is an accumulator for the results of the node
|
||||
|
||||
public TreeNodeStats(){
|
||||
this.results = new HashMap<>();
|
||||
}
|
||||
|
||||
public Map<String, FieldStats> getResults() {
|
||||
return results;
|
||||
}
|
||||
|
||||
public void addFieldStats(String id, FieldStats fieldStats){
|
||||
this.results.put(id, fieldStats);
|
||||
}
|
||||
|
||||
public int fieldsCount(){
|
||||
return this.results.size();
|
||||
}
|
||||
|
||||
public int undefinedCount(){
|
||||
int undefinedCount = 0;
|
||||
for(FieldStats fs: this.results.values()){
|
||||
if(fs.getResult() == -1)
|
||||
undefinedCount ++;
|
||||
}
|
||||
return undefinedCount;
|
||||
}
|
||||
|
||||
public double scoreSum(){
|
||||
double scoreSum = 0.0;
|
||||
for(FieldStats fs: this.results.values()){
|
||||
if(fs.getResult()>=0.0) {
|
||||
scoreSum += fs.getResult();
|
||||
}
|
||||
}
|
||||
return scoreSum;
|
||||
}
|
||||
|
||||
//return the sum of the weights without considering the fields with countIfMissing=false && result=-1
|
||||
public double weightSum(){
|
||||
double weightSum = 0.0;
|
||||
for(FieldStats fs: this.results.values()){
|
||||
if(fs.getResult()>=0.0 || (fs.getResult()<0.0 && fs.isCountIfUndefined())) {
|
||||
weightSum += fs.getWeight();
|
||||
}
|
||||
}
|
||||
return weightSum;
|
||||
}
|
||||
|
||||
public double weightedScoreSum(){
|
||||
double weightedScoreSum = 0.0;
|
||||
for(FieldStats fs: this.results.values()){
|
||||
if(fs.getResult()>=0.0) {
|
||||
weightedScoreSum += fs.getResult()*fs.getWeight();
|
||||
}
|
||||
}
|
||||
return weightedScoreSum;
|
||||
}
|
||||
|
||||
public double max(){
|
||||
double max = -1.0;
|
||||
for(FieldStats fs: this.results.values()){
|
||||
if(fs.getResult()>max)
|
||||
max = fs.getResult();
|
||||
}
|
||||
return max;
|
||||
}
|
||||
|
||||
public double min(){
|
||||
double min = 100.0; //random high value
|
||||
for(FieldStats fs: this.results.values()){
|
||||
if(fs.getResult()<min) {
|
||||
if (fs.getResult()>=0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
|
||||
min = fs.getResult();
|
||||
}
|
||||
}
|
||||
return min;
|
||||
}
|
||||
|
||||
//if at least one is true, return 1.0
|
||||
public double or(){
|
||||
for (FieldStats fieldStats : this.results.values()) {
|
||||
if (fieldStats.getResult() >= fieldStats.getThreshold())
|
||||
return 1.0;
|
||||
}
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
//if at least one is false, return 0.0
|
||||
public double and() {
|
||||
for (FieldStats fieldStats : this.results.values()) {
|
||||
|
||||
if (fieldStats.getResult() == -1) {
|
||||
if (fieldStats.isCountIfUndefined())
|
||||
return 0.0;
|
||||
}
|
||||
else {
|
||||
if (fieldStats.getResult() < fieldStats.getThreshold())
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
}
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
public double getFinalScore(AggType aggregation){
|
||||
|
||||
switch (aggregation){
|
||||
case AVG:
|
||||
return scoreSum()/fieldsCount();
|
||||
case SUM:
|
||||
return scoreSum();
|
||||
case MAX:
|
||||
return max();
|
||||
case MIN:
|
||||
return min();
|
||||
case W_MEAN:
|
||||
return weightedScoreSum()/weightSum();
|
||||
case OR:
|
||||
return or();
|
||||
case AND:
|
||||
return and();
|
||||
default:
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,93 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
|
||||
/**
|
||||
* The compare between two documents is given by the weighted mean of the field distances
|
||||
*/
|
||||
public class TreeProcessor{
|
||||
|
||||
private static final Log log = LogFactory.getLog(TreeProcessor.class);
|
||||
|
||||
private Config config;
|
||||
|
||||
public TreeProcessor(final Config config) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
public boolean compare(final MapDocument a, final MapDocument b) {
|
||||
//evaluate the decision tree
|
||||
return evaluateTree(a, b).getResult() == MatchType.MATCH;
|
||||
}
|
||||
|
||||
public TreeStats evaluateTree(final MapDocument doc1, final MapDocument doc2){
|
||||
|
||||
TreeStats treeStats = new TreeStats();
|
||||
|
||||
String current = "start";
|
||||
|
||||
while (MatchType.parse(current)==MatchType.UNDEFINED) {
|
||||
|
||||
TreeNodeDef currentNode = config.decisionTree().get(current);
|
||||
//throw an exception if the node doesn't exist
|
||||
if (currentNode == null)
|
||||
throw new PaceException("Missing tree node: " + current);
|
||||
|
||||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||
treeStats.addNodeStats(current, stats);
|
||||
|
||||
//if ignoreUndefined=false the miss is considered as undefined
|
||||
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) {
|
||||
current = currentNode.getUndefined();
|
||||
}
|
||||
//if ignoreUndefined=true the miss is ignored and the score computed anyway
|
||||
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
||||
current = currentNode.getPositive();
|
||||
}
|
||||
else {
|
||||
current = currentNode.getNegative();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
treeStats.setResult(MatchType.parse(current));
|
||||
return treeStats;
|
||||
}
|
||||
|
||||
public double computeScore(final MapDocument doc1, final MapDocument doc2) {
|
||||
String current = "start";
|
||||
double score = 0.0;
|
||||
|
||||
while (MatchType.parse(current)==MatchType.UNDEFINED) {
|
||||
|
||||
TreeNodeDef currentNode = config.decisionTree().get(current);
|
||||
//throw an exception if the node doesn't exist
|
||||
if (currentNode == null)
|
||||
throw new PaceException("The Tree Node doesn't exist: " + current);
|
||||
|
||||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||
|
||||
score = stats.getFinalScore(currentNode.getAggregation());
|
||||
//if ignoreUndefined=false the miss is considered as undefined
|
||||
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) {
|
||||
current = currentNode.getUndefined();
|
||||
}
|
||||
//if ignoreUndefined=true the miss is ignored and the score computed anyway
|
||||
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
||||
current = currentNode.getPositive();
|
||||
}
|
||||
else {
|
||||
current = currentNode.getNegative();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class TreeStats {
|
||||
|
||||
//<layer_id, <field:comparator, result>>
|
||||
Map<String, TreeNodeStats> stats;
|
||||
MatchType result;
|
||||
|
||||
public TreeStats(){
|
||||
this.stats = new HashMap<>();
|
||||
this.result = MatchType.NO_MATCH;
|
||||
}
|
||||
|
||||
public MatchType getResult(){
|
||||
return this.result;
|
||||
}
|
||||
|
||||
public void setResult(MatchType result){
|
||||
this.result = result;
|
||||
}
|
||||
|
||||
public Map<String, TreeNodeStats> getStats() {
|
||||
return stats;
|
||||
}
|
||||
|
||||
public void setStats(Map<String, TreeNodeStats> stats) {
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
public void addNodeStats(String layerID, TreeNodeStats treeNodeStats){
|
||||
this.stats.put(layerID, treeNodeStats);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(){
|
||||
try {
|
||||
return new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,192 @@
|
|||
package eu.dnetlib.pace.util;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.config.WfConfig;
|
||||
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.model.MapDocumentComparator;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class BlockProcessor {
|
||||
|
||||
public static final List<String> accumulators= new ArrayList<>();
|
||||
|
||||
private static final Log log = LogFactory.getLog(BlockProcessor.class);
|
||||
|
||||
private DedupConfig dedupConf;
|
||||
|
||||
public static void constructAccumulator( final DedupConfig dedupConf) {
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1"));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "skip list"));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));
|
||||
}
|
||||
|
||||
public BlockProcessor(DedupConfig dedupConf) {
|
||||
this.dedupConf = dedupConf;
|
||||
}
|
||||
|
||||
public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context) {
|
||||
if (documents.size() > 1) {
|
||||
// log.info("reducing key: '" + key + "' records: " + q.size());
|
||||
process(prepare(documents), context);
|
||||
|
||||
} else {
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
|
||||
}
|
||||
}
|
||||
|
||||
public void process(final String key, final Iterable<MapDocument> documents, final Reporter context) {
|
||||
|
||||
final Queue<MapDocument> q = prepare(documents);
|
||||
|
||||
if (q.size() > 1) {
|
||||
// log.info("reducing key: '" + key + "' records: " + q.size());
|
||||
process(simplifyQueue(q, key, context), context);
|
||||
|
||||
} else {
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
|
||||
}
|
||||
}
|
||||
|
||||
private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) {
|
||||
final Queue<MapDocument> queue = new PriorityQueue<>(100, new MapDocumentComparator(dedupConf.getWf().getOrderField()));
|
||||
|
||||
final Set<String> seen = new HashSet<String>();
|
||||
final int queueMaxSize = dedupConf.getWf().getQueueMaxSize();
|
||||
|
||||
documents.forEach(doc -> {
|
||||
if (queue.size() <= queueMaxSize) {
|
||||
final String id = doc.getIdentifier();
|
||||
|
||||
if (!seen.contains(id)) {
|
||||
seen.add(id);
|
||||
queue.add(doc);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return queue;
|
||||
}
|
||||
|
||||
private Queue<MapDocument> simplifyQueue(final Queue<MapDocument> queue, final String ngram, final Reporter context) {
|
||||
final Queue<MapDocument> q = new LinkedList<>();
|
||||
|
||||
String fieldRef = "";
|
||||
final List<MapDocument> tempResults = Lists.newArrayList();
|
||||
|
||||
while (!queue.isEmpty()) {
|
||||
final MapDocument result = queue.remove();
|
||||
|
||||
final String orderFieldName = dedupConf.getWf().getOrderField();
|
||||
final Field orderFieldValue = result.values(orderFieldName);
|
||||
if (!orderFieldValue.isEmpty()) {
|
||||
final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue());
|
||||
if (field.equals(fieldRef)) {
|
||||
tempResults.add(result);
|
||||
} else {
|
||||
populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);
|
||||
tempResults.clear();
|
||||
tempResults.add(result);
|
||||
fieldRef = field;
|
||||
}
|
||||
} else {
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1);
|
||||
}
|
||||
}
|
||||
populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);
|
||||
|
||||
return q;
|
||||
}
|
||||
|
||||
private void populateSimplifiedQueue(final Queue<MapDocument> q,
|
||||
final List<MapDocument> tempResults,
|
||||
final Reporter context,
|
||||
final String fieldRef,
|
||||
final String ngram) {
|
||||
WfConfig wf = dedupConf.getWf();
|
||||
if (tempResults.size() < wf.getGroupMaxSize()) {
|
||||
q.addAll(tempResults);
|
||||
} else {
|
||||
context.incrementCounter(wf.getEntityType(), String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), tempResults.size());
|
||||
// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
|
||||
}
|
||||
}
|
||||
|
||||
private void process(final Queue<MapDocument> queue, final Reporter context) {
|
||||
|
||||
while (!queue.isEmpty()) {
|
||||
|
||||
final MapDocument pivot = queue.remove();
|
||||
final String idPivot = pivot.getIdentifier();
|
||||
|
||||
WfConfig wf = dedupConf.getWf();
|
||||
final Field fieldsPivot = pivot.values(wf.getOrderField());
|
||||
final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? "" : fieldsPivot.stringValue();
|
||||
|
||||
if (fieldPivot != null) {
|
||||
int i = 0;
|
||||
for (final MapDocument curr : queue) {
|
||||
final String idCurr = curr.getIdentifier();
|
||||
|
||||
if (mustSkip(idCurr)) {
|
||||
|
||||
context.incrementCounter(wf.getEntityType(), "skip list", 1);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
if (i > wf.getSlidingWindowSize()) {
|
||||
break;
|
||||
}
|
||||
|
||||
final Field fieldsCurr = curr.values(wf.getOrderField());
|
||||
final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null : fieldsCurr.stringValue();
|
||||
|
||||
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
|
||||
|
||||
final TreeProcessor treeProcessor = new TreeProcessor(dedupConf);
|
||||
|
||||
emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
||||
|
||||
if (result) {
|
||||
writeSimilarity(context, idPivot, idCurr);
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
|
||||
} else {
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean mustSkip(final String idPivot) {
|
||||
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
|
||||
}
|
||||
|
||||
private String getNsPrefix(final String id) {
|
||||
return StringUtils.substringBetween(id, "|", "::");
|
||||
}
|
||||
|
||||
private void writeSimilarity(final Reporter context, final String from, final String to) {
|
||||
final String type = dedupConf.getWf().getEntityType();
|
||||
|
||||
context.emit(type, from, to);
|
||||
context.emit(type, to, from);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,248 @@
|
|||
package eu.dnetlib.pace.util;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.config.WfConfig;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.model.MapDocumentComparator;
|
||||
import eu.dnetlib.pace.tree.*;
|
||||
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class BlockProcessorForTesting {
|
||||
|
||||
public static final List<String> accumulators= new ArrayList<>();
|
||||
|
||||
private static final Log log = LogFactory.getLog(eu.dnetlib.pace.util.BlockProcessorForTesting.class);
|
||||
|
||||
private DedupConfig dedupConf;
|
||||
|
||||
public static void constructAccumulator( final DedupConfig dedupConf) {
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1"));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "skip list"));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));
|
||||
}
|
||||
|
||||
public BlockProcessorForTesting(DedupConfig dedupConf) {
|
||||
this.dedupConf = dedupConf;
|
||||
}
|
||||
|
||||
public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context, boolean useTree, boolean noMatch) {
|
||||
if (documents.size() > 1) {
|
||||
// log.info("reducing key: '" + key + "' records: " + q.size());
|
||||
process(prepare(documents), context, useTree, noMatch);
|
||||
|
||||
} else {
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
|
||||
}
|
||||
}
|
||||
|
||||
public void process(final String key, final Iterable<MapDocument> documents, final Reporter context, boolean useTree, boolean noMatch) {
|
||||
|
||||
final Queue<MapDocument> q = prepare(documents);
|
||||
|
||||
if (q.size() > 1) {
|
||||
// log.info("reducing key: '" + key + "' records: " + q.size());
|
||||
process(simplifyQueue(q, key, context), context, useTree, noMatch);
|
||||
|
||||
} else {
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
|
||||
}
|
||||
}
|
||||
|
||||
private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) {
|
||||
final Queue<MapDocument> queue = new PriorityQueue<>(100, new MapDocumentComparator(dedupConf.getWf().getOrderField()));
|
||||
|
||||
final Set<String> seen = new HashSet<String>();
|
||||
final int queueMaxSize = dedupConf.getWf().getQueueMaxSize();
|
||||
|
||||
documents.forEach(doc -> {
|
||||
if (queue.size() <= queueMaxSize) {
|
||||
final String id = doc.getIdentifier();
|
||||
|
||||
if (!seen.contains(id)) {
|
||||
seen.add(id);
|
||||
queue.add(doc);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return queue;
|
||||
}
|
||||
|
||||
private Queue<MapDocument> simplifyQueue(final Queue<MapDocument> queue, final String ngram, final Reporter context) {
|
||||
final Queue<MapDocument> q = new LinkedList<>();
|
||||
|
||||
String fieldRef = "";
|
||||
final List<MapDocument> tempResults = Lists.newArrayList();
|
||||
|
||||
while (!queue.isEmpty()) {
|
||||
final MapDocument result = queue.remove();
|
||||
|
||||
final String orderFieldName = dedupConf.getWf().getOrderField();
|
||||
final Field orderFieldValue = result.values(orderFieldName);
|
||||
if (!orderFieldValue.isEmpty()) {
|
||||
final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue());
|
||||
if (field.equals(fieldRef)) {
|
||||
tempResults.add(result);
|
||||
} else {
|
||||
populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);
|
||||
tempResults.clear();
|
||||
tempResults.add(result);
|
||||
fieldRef = field;
|
||||
}
|
||||
} else {
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1);
|
||||
}
|
||||
}
|
||||
populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);
|
||||
|
||||
return q;
|
||||
}
|
||||
|
||||
private void populateSimplifiedQueue(final Queue<MapDocument> q,
|
||||
final List<MapDocument> tempResults,
|
||||
final Reporter context,
|
||||
final String fieldRef,
|
||||
final String ngram) {
|
||||
WfConfig wf = dedupConf.getWf();
|
||||
if (tempResults.size() < wf.getGroupMaxSize()) {
|
||||
q.addAll(tempResults);
|
||||
} else {
|
||||
context.incrementCounter(wf.getEntityType(), String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), tempResults.size());
|
||||
// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
|
||||
}
|
||||
}
|
||||
|
||||
private void process(final Queue<MapDocument> queue, final Reporter context, boolean useTree, boolean noMatch) {
|
||||
|
||||
while (!queue.isEmpty()) {
|
||||
|
||||
final MapDocument pivot = queue.remove();
|
||||
final String idPivot = pivot.getIdentifier();
|
||||
|
||||
WfConfig wf = dedupConf.getWf();
|
||||
final Field fieldsPivot = pivot.values(wf.getOrderField());
|
||||
final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? "" : fieldsPivot.stringValue();
|
||||
|
||||
if (fieldPivot != null) {
|
||||
int i = 0;
|
||||
for (final MapDocument curr : queue) {
|
||||
final String idCurr = curr.getIdentifier();
|
||||
|
||||
if (mustSkip(idCurr)) {
|
||||
|
||||
context.incrementCounter(wf.getEntityType(), "skip list", 1);
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
if (i > wf.getSlidingWindowSize()) {
|
||||
break;
|
||||
}
|
||||
|
||||
final Field fieldsCurr = curr.values(wf.getOrderField());
|
||||
final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null : fieldsCurr.stringValue();
|
||||
|
||||
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
|
||||
|
||||
//draws no match relations (test purpose)
|
||||
if (noMatch) {
|
||||
emitOutput(!new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
|
||||
}
|
||||
else {
|
||||
//use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications)
|
||||
if (useTree)
|
||||
emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
|
||||
else
|
||||
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
|
||||
}
|
||||
// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) {
|
||||
// emitOutput(true, idPivot, idCurr, context);
|
||||
// }
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected static boolean compareInstanceType(MapDocument a, MapDocument b, DedupConfig conf) {
|
||||
Map<String, String> params = new HashMap<>();
|
||||
InstanceTypeMatch instanceTypeMatch = new InstanceTypeMatch(params);
|
||||
double compare = instanceTypeMatch.compare(a.getFieldMap().get("instance"), b.getFieldMap().get("instance"), conf);
|
||||
return compare>=1.0;
|
||||
}
|
||||
|
||||
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
|
||||
//if the score gives 1, the publications are equivalent
|
||||
Map<String, String> params = new HashMap<>();
|
||||
params.put("jpath_value", "$.value");
|
||||
params.put("jpath_classid", "$.qualifier.classid");
|
||||
params.put("mode", "count");
|
||||
|
||||
double score = 0.0;
|
||||
|
||||
//levenstein title
|
||||
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
|
||||
if(levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) {
|
||||
score += 0.2;
|
||||
}
|
||||
|
||||
//pid
|
||||
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
||||
if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) {
|
||||
score += 0.5;
|
||||
}
|
||||
|
||||
//title version
|
||||
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
||||
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
||||
if(result1<0 || result1>=1.0) {
|
||||
score += 0.1;
|
||||
}
|
||||
|
||||
//authors match
|
||||
params.remove("mode");
|
||||
AuthorsMatch authorsMatch = new AuthorsMatch(params);
|
||||
double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
|
||||
if(result2 <0|| result2>=0.6) {
|
||||
score += 0.2;
|
||||
}
|
||||
|
||||
return score>=0.5;
|
||||
}
|
||||
|
||||
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
||||
|
||||
if (result) {
|
||||
writeSimilarity(context, idPivot, idCurr);
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
|
||||
} else {
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean mustSkip(final String idPivot) {
|
||||
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
|
||||
}
|
||||
|
||||
private String getNsPrefix(final String id) {
|
||||
return StringUtils.substringBetween(id, "|", "::");
|
||||
}
|
||||
|
||||
private void writeSimilarity(final Reporter context, final String from, final String to) {
|
||||
final String type = dedupConf.getWf().getEntityType();
|
||||
|
||||
context.emit(type, from, to);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,15 @@
|
|||
package eu.dnetlib.pace.util;
|
||||
|
||||
|
||||
import com.google.common.base.Function;
|
||||
import org.apache.commons.lang3.text.WordUtils;
|
||||
|
||||
public class Capitalise implements Function<String, String> {
|
||||
|
||||
private final char[] DELIM = {' ', '-'};
|
||||
|
||||
@Override
|
||||
public String apply(final String s) {
|
||||
return WordUtils.capitalize(s.toLowerCase(), DELIM);
|
||||
}
|
||||
};
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,10 @@
|
|||
package eu.dnetlib.pace.util;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
|
||||
public class DotAbbreviations implements Function<String, String> {
|
||||
@Override
|
||||
public String apply(String s) {
|
||||
return s.length() == 1 ? s + "." : s;
|
||||
}
|
||||
};
|
|
@ -0,0 +1,166 @@
|
|||
package eu.dnetlib.pace.util;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.jayway.jsonpath.Configuration;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
import com.jayway.jsonpath.Option;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.*;
|
||||
import net.minidev.json.JSONArray;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
import java.util.*;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class MapDocumentUtil {
|
||||
|
||||
public static final String URL_REGEX = "^(http|https|ftp)\\://.*";
|
||||
public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX);
|
||||
|
||||
public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) {
|
||||
MapDocument m = new MapDocument();
|
||||
m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json));
|
||||
Map<String, Field> stringField = new HashMap<>();
|
||||
conf.getPace().getModel().forEach(fdef -> {
|
||||
switch (fdef.getType()) {
|
||||
case String:
|
||||
case Int:
|
||||
stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), truncateValue(getJPathString(fdef.getPath(), json), fdef.getLength())));
|
||||
break;
|
||||
case URL:
|
||||
String uv = getJPathString(fdef.getPath(), json);
|
||||
if (!urlFilter.test(uv)) uv = "";
|
||||
stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), uv));
|
||||
break;
|
||||
case List:
|
||||
case JSON:
|
||||
FieldListImpl fi = new FieldListImpl(fdef.getName(), fdef.getType());
|
||||
truncateList(getJPathList(fdef.getPath(), json, fdef.getType()), fdef.getSize())
|
||||
.stream()
|
||||
.map(item -> new FieldValueImpl(Type.String, fdef.getName(), item))
|
||||
.forEach(fi::add);
|
||||
stringField.put(fdef.getName(), fi);
|
||||
break;
|
||||
case DoubleArray:
|
||||
stringField.put(
|
||||
fdef.getName(),
|
||||
new FieldValueImpl(Type.DoubleArray,
|
||||
fdef.getName(),
|
||||
getJPathArray(fdef.getPath(), json))
|
||||
);
|
||||
break;
|
||||
case StringConcat:
|
||||
String[] jpaths = fdef.getPath().split("\\|\\|\\|");
|
||||
stringField.put(
|
||||
fdef.getName(),
|
||||
new FieldValueImpl(Type.String,
|
||||
fdef.getName(),
|
||||
truncateValue(Arrays.stream(jpaths).map(jpath -> getJPathString(jpath, json)).collect(Collectors.joining(" ")),
|
||||
fdef.getLength())
|
||||
)
|
||||
);
|
||||
break;
|
||||
}
|
||||
});
|
||||
m.setFieldMap(stringField);
|
||||
return m;
|
||||
}
|
||||
|
||||
public static List<String> getJPathList(String path, String json, Type type) {
|
||||
if (type == Type.List)
|
||||
return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path);
|
||||
Object jresult;
|
||||
List<String> result = new ArrayList<>();
|
||||
try {
|
||||
jresult = JsonPath.read(json, path);
|
||||
} catch (Throwable e) {
|
||||
return result;
|
||||
}
|
||||
if (jresult instanceof JSONArray) {
|
||||
|
||||
((JSONArray) jresult).forEach(it -> {
|
||||
|
||||
try {
|
||||
result.add(new ObjectMapper().writeValueAsString(it));
|
||||
} catch (JsonProcessingException e) {
|
||||
|
||||
}
|
||||
}
|
||||
);
|
||||
return result;
|
||||
}
|
||||
|
||||
if (jresult instanceof LinkedHashMap) {
|
||||
try {
|
||||
result.add(new ObjectMapper().writeValueAsString(jresult));
|
||||
} catch (JsonProcessingException e) {
|
||||
|
||||
}
|
||||
return result;
|
||||
}
|
||||
if (jresult instanceof String) {
|
||||
result.add((String) jresult);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public static String getJPathString(final String jsonPath, final String json) {
|
||||
try {
|
||||
Object o = JsonPath.read(json, jsonPath);
|
||||
if (o instanceof String)
|
||||
return (String)o;
|
||||
if (o instanceof JSONArray && ((JSONArray)o).size()>0)
|
||||
return (String)((JSONArray)o).get(0);
|
||||
return "";
|
||||
} catch (Exception e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
public static double[] getJPathArray(final String jsonPath, final String json) {
|
||||
try {
|
||||
Object o = JsonPath.read(json, jsonPath);
|
||||
if (o instanceof double[])
|
||||
return (double[]) o;
|
||||
if (o instanceof JSONArray) {
|
||||
Object[] objects = ((JSONArray) o).toArray();
|
||||
double[] array = new double[objects.length];
|
||||
for (int i = 0; i < objects.length; i++) {
|
||||
if (objects[i] instanceof BigDecimal)
|
||||
array[i] = ((BigDecimal)objects[i]).doubleValue();
|
||||
else
|
||||
array[i] = (double) objects[i];
|
||||
}
|
||||
return array;
|
||||
}
|
||||
return new double[0];
|
||||
}
|
||||
catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
return new double[0];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public static String truncateValue(String value, int length) {
|
||||
if (value == null)
|
||||
return "";
|
||||
|
||||
if (length == -1 || length > value.length())
|
||||
return value;
|
||||
|
||||
return value.substring(0, length);
|
||||
}
|
||||
|
||||
public static List<String> truncateList(List<String> list, int size) {
|
||||
if (size == -1 || size > list.size())
|
||||
return list;
|
||||
|
||||
return list.subList(0, size);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
package eu.dnetlib.pace.util;
|
||||
|
||||
public class PaceException extends RuntimeException {
|
||||
|
||||
public PaceException(String s, Throwable e){
|
||||
super(s, e);
|
||||
}
|
||||
|
||||
public PaceException(String s){
|
||||
super(s);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
package eu.dnetlib.pace.util;
|
||||
|
||||
import eu.dnetlib.pace.clustering.ClusteringClass;
|
||||
import eu.dnetlib.pace.clustering.ClusteringFunction;
|
||||
import eu.dnetlib.pace.tree.support.Comparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.reflections.Reflections;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class PaceResolver implements Serializable {
|
||||
|
||||
public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering");
|
||||
public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree");
|
||||
|
||||
private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
|
||||
private final Map<String, Class<Comparator>> comparators;
|
||||
|
||||
public PaceResolver() {
|
||||
|
||||
this.clusteringFunctions = CLUSTERING_RESOLVER.getTypesAnnotatedWith(ClusteringClass.class).stream()
|
||||
.filter(ClusteringFunction.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>)cl));
|
||||
|
||||
this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream()
|
||||
.filter(Comparator.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>)cl));
|
||||
}
|
||||
|
||||
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
|
||||
try {
|
||||
return clusteringFunctions.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
|
||||
throw new PaceException(name + " not found ", e);
|
||||
}
|
||||
}
|
||||
|
||||
public Comparator getComparator(String name, Map<String, String> params) throws PaceException {
|
||||
try {
|
||||
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) {
|
||||
throw new PaceException(name + " not found ", e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
package eu.dnetlib.pace.util;
|
||||
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public interface Reporter extends Serializable {
|
||||
|
||||
void incrementCounter(String counterGroup, String counterName, long delta);
|
||||
|
||||
void emit(String type, String from, String to);
|
||||
}
|
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue