diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml
new file mode 100644
index 0000000000..567420bb30
--- /dev/null
+++ b/dnet-pace-core/pom.xml
@@ -0,0 +1,77 @@
+
+
+
+ 4.0.0
+
+
+ eu.dnetlib
+ dnet-dedup
+ 4.1.13-SNAPSHOT
+ ../pom.xml
+
+
+ dnet-pace-core
+ jar
+
+
+
+ edu.cmu
+ secondstring
+
+
+ com.google.guava
+ guava
+
+
+ com.google.code.gson
+ gson
+
+
+ org.apache.commons
+ commons-lang3
+
+
+ commons-io
+ commons-io
+
+
+
+ org.antlr
+ stringtemplate
+
+
+ commons-logging
+ commons-logging
+
+
+
+ org.junit.jupiter
+ junit-jupiter
+ test
+
+
+ org.reflections
+ reflections
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+
+
+ org.apache.commons
+ commons-math3
+
+
+
+ com.jayway.jsonpath
+ json-path
+
+
+
+ com.ibm.icu
+ icu4j
+
+
+
+
+
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
new file mode 100644
index 0000000000..01f146120b
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
@@ -0,0 +1,44 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Field;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * Base class for clustering functions that derive their keys from the string value of each field.
+ * Subclasses supply the per-value key extraction in doApply(); this class runs the shared
+ * pre- and post-processing around it and exposes the configured parameters.
+ */
+public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
+
+ // Configuration values of this function, looked up by name through param().
+ protected Map params;
+
+ public AbstractClusteringFunction(final Map params) {
+ this.params = params;
+ }
+
+ /**
+ * Extracts the clustering keys for a single field value. When invoked through apply(),
+ * the value has already been normalized and had its stop words filtered.
+ */
+ protected abstract Collection doApply(Config conf, String s);
+
+ /**
+ * Computes the clustering keys for the given fields: empty fields are skipped, each remaining
+ * value is normalized, filtered with filterAllStopWords and handed to doApply(); the resulting
+ * keys are filtered against ngramBlacklist, flattened, stripped of blank entries and collected
+ * into a HashSet (so duplicates collapse and no ordering is kept).
+ */
+ @Override
+ public Collection apply(Config conf, List fields) {
+ return fields.stream().filter(f -> !f.isEmpty())
+ .map(Field::stringValue)
+ .map(this::normalize)
+ .map(s -> filterAllStopWords(s))
+ .map(s -> doApply(conf, s))
+ .map(c -> filterBlacklisted(c, ngramBlacklist))
+ // doApply() results are dereferenced here, so implementations must not return null
+ .flatMap(c -> c.stream())
+ .filter(StringUtils::isNotBlank)
+ .collect(Collectors.toCollection(HashSet::new));
+ }
+
+ public Map getParams() {
+ return params;
+ }
+
+ /**
+ * Returns the parameter stored under the given name, as returned by Map.get (null when absent).
+ * NOTE(review): callers that pass the result to an int parameter will fail with a
+ * NullPointerException on unboxing when the parameter is missing - confirm configs always set it.
+ */
+ protected Integer param(String name) {
+ return params.get(name);
+ }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
new file mode 100644
index 0000000000..d3008332db
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
@@ -0,0 +1,49 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.Map;
+import java.util.Set;
+import java.util.StringTokenizer;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
+
+@ClusteringClass("acronyms")
+public class Acronyms extends AbstractClusteringFunction {
+
+ public Acronyms(Map params) {
+ super(params);
+ }
+
+ @Override
+ protected Collection doApply(Config conf, String s) {
+ return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
+ }
+
+ private Set extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) {
+
+ final Set acronyms = Sets.newLinkedHashSet();
+
+ for (int i = 0; i < maxAcronyms; i++) {
+
+ final StringTokenizer st = new StringTokenizer(s);
+ final StringBuilder sb = new StringBuilder();
+
+ while (st.hasMoreTokens()) {
+ final String token = st.nextToken();
+ if (sb.length() > maxLen) {
+ break;
+ }
+ if (token.length() > 1 && i < token.length()) {
+ sb.append(token.charAt(i));
+ }
+ }
+ String acronym = sb.toString();
+ if (acronym.length() > minLen) {
+ acronyms.add(acronym);
+ }
+ }
+ return acronyms;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java
new file mode 100644
index 0000000000..79a264a494
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java
@@ -0,0 +1,59 @@
+package eu.dnetlib.pace.clustering;
+
+import com.google.common.collect.Maps;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Document;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldListImpl;
+import eu.dnetlib.pace.model.MapDocument;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.regex.Pattern;
+
+/**
+ * Variant of ClusteringCombiner that first removes blacklisted field values from the document
+ * and then computes the clustering keys on what is left.
+ */
+public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
+
+ /**
+ * Filters the document against the blacklists declared in the configuration and combines
+ * the clustering keys of the filtered document.
+ */
+ public static Collection filterAndCombine(final MapDocument a, final Config conf) {
+ Document filtered = filter(a, conf.blacklists());
+ return combine(filtered, conf);
+ }
+
+ /**
+ * Returns a copy of the document in which, for every blacklisted field name present in the
+ * document, the values matching one of the patterns are dropped. When there are no blacklists
+ * the input document itself is returned.
+ */
+ private static MapDocument filter(final MapDocument a, final Map> blacklists) {
+ if (blacklists == null || blacklists.isEmpty()) {
+ return a;
+ }
+
+ // start from a copy of the field map so that untouched fields are carried over as they are
+ final Map filtered = Maps.newHashMap(a.getFieldMap());
+
+ for (final Entry> e : blacklists.entrySet()) {
+ Field fields = a.getFieldMap().get(e.getKey());
+ if (fields != null) {
+ final FieldListImpl fl = new FieldListImpl();
+
+ for (Field f : fields) {
+ if (!isBlackListed(f.stringValue(), e.getValue())) {
+ fl.add(f);
+ }
+ }
+
+ // the (possibly empty) filtered list replaces the original entry
+ filtered.put(e.getKey(), fl);
+ }
+ }
+
+ return new MapDocument(a.getIdentifier(), filtered);
+ }
+
+ /**
+ * Tells whether the value matches at least one pattern of the blacklist.
+ * Matcher.matches() is used, so a pattern has to match the whole value.
+ */
+ private static boolean isBlackListed(String value, List blacklist) {
+ for (Pattern pattern : blacklist) {
+ if (pattern.matcher(value).matches()) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+}
+
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java
new file mode 100644
index 0000000000..e677671716
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java
@@ -0,0 +1,13 @@
+package eu.dnetlib.pace.clustering;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+/**
+ * Type-level annotation, retained at runtime, that attaches a string name to a class.
+ * NOTE(review): presumably the name under which a clustering function is looked up - confirm
+ * against the code that reads this annotation.
+ */
+@Target(ElementType.TYPE)
+@Retention(RetentionPolicy.RUNTIME)
+public @interface ClusteringClass {
+
+    /** The name associated with the annotated class. */
+    String value();
+}
\ No newline at end of file
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java
new file mode 100644
index 0000000000..0374762898
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java
@@ -0,0 +1,60 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.ClusteringDef;
+import eu.dnetlib.pace.model.Document;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldValueImpl;
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * Computes the clustering keys of a document by running every configured clustering function
+ * on the fields it is declared for and prefixing each key with the clustering name and field.
+ */
+public class ClusteringCombiner {
+
+    // Constants: declared final so they cannot be reassigned at runtime.
+    private static final String SEPARATOR = ":";
+    private static final String COLLAPSE_ON = "collapseOn";
+
+    /**
+     * Returns the clustering keys of the document, in insertion order and without duplicates.
+     * Each key has the form {@code <prefix>:<key>}, where the prefix comes from getPrefix().
+     */
+    public static Collection combine(final Document a, final Config conf) {
+        final Collection res = Sets.newLinkedHashSet();
+        for (final ClusteringDef cd : conf.clusterings()) {
+            for (final String fieldName : cd.getFields()) {
+                final String prefix = getPrefix(cd, fieldName);
+
+                final Field values = a.values(fieldName);
+                final List fields = new ArrayList<>();
+
+                // a single value is wrapped as-is; anything else is treated as a list of fields
+                if (values instanceof FieldValueImpl) {
+                    fields.add(values);
+                } else {
+                    fields.addAll((List) values);
+                }
+
+                res.addAll(
+                    cd.clusteringFunction()
+                        .apply(conf, fields)
+                        .stream()
+                        .map(k -> prefix + SEPARATOR + k)
+                        .collect(Collectors.toList())
+                );
+            }
+        }
+        return res;
+    }
+
+    /**
+     * Builds the key prefix {@code <clustering name>:<suffix>}. The suffix is the field name,
+     * unless a parameter key containing "collapseOn" exists: then the part of the first such
+     * key that follows the separator is used instead.
+     */
+    private static String getPrefix(ClusteringDef cd, String fieldName) {
+        return cd.getName() + SEPARATOR +
+            cd.getParams().keySet()
+                .stream()
+                .filter(k -> k.contains(COLLAPSE_ON))
+                .findFirst()
+                .map(k -> StringUtils.substringAfter(k, SEPARATOR))
+                .orElse(fieldName);
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
new file mode 100644
index 0000000000..0554d27a19
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
@@ -0,0 +1,16 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Field;
+
+/**
+ * Contract of a clustering function: it turns a list of fields into a collection of keys
+ * and exposes the parameters it was configured with.
+ */
+public interface ClusteringFunction {
+
+    /** Computes the clustering keys for the given fields under the given configuration. */
+    Collection apply(Config config, List fields);
+
+    /** Returns the parameters of this function. */
+    Map getParams();
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
new file mode 100644
index 0000000000..7f342f69c4
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
@@ -0,0 +1,26 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Config;
+
+/**
+ * Clustering function whose only key is the value it receives.
+ */
+@ClusteringClass("immutablefieldvalue")
+public class ImmutableFieldValue extends AbstractClusteringFunction {
+
+    public ImmutableFieldValue(final Map params) {
+        super(params);
+    }
+
+    /** Returns a new mutable list holding {@code s} as its single element. */
+    @Override
+    protected Collection doApply(final Config conf, final String s) {
+        return Lists.newArrayList(s);
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java
new file mode 100644
index 0000000000..e67275c4e5
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java
@@ -0,0 +1,53 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Field;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Clustering function that builds keys of the form "keywordCode-cityCode" from the keywords
+ * and the cities recognised in the field value.
+ */
+@ClusteringClass("keywordsclustering")
+public class KeywordsClustering extends AbstractClusteringFunction {
+
+ public KeywordsClustering(Map params) {
+ super(params);
+ }
+
+ /**
+ * Combines every keyword code with every city code found in s, stopping as soon as the
+ * number of combinations reaches the "max" parameter (2 when not configured). Keyword and
+ * city detection use the "windowSize" parameter (4 when not configured).
+ */
+ @Override
+ protected Collection doApply(final Config conf, String s) {
+
+ //takes city codes and keywords codes without duplicates
+ Set keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
+ Set cities = getCities(s, params.getOrDefault("windowSize", 4));
+
+ //list of combination to return as result
+ final Collection combinations = new LinkedHashSet();
+
+ for (String keyword: keywordsToCodes(keywords, conf.translationMap())){
+ for (String city: citiesToCodes(cities)) {
+ combinations.add(keyword+"-"+city);
+ // early exit: the cap is checked after every insertion
+ if (combinations.size()>=params.getOrDefault("max", 2)) {
+ return combinations;
+ }
+ }
+ }
+
+ return combinations;
+ }
+
+ /**
+ * Same pipeline as the inherited apply(), with an additional cleanup step applied to the
+ * raw field value before normalization.
+ */
+ @Override
+ public Collection apply(final Config conf, List fields) {
+ return fields.stream().filter(f -> !f.isEmpty())
+ .map(Field::stringValue)
+ .map(this::cleanup)
+ .map(this::normalize)
+ .map(s -> filterAllStopWords(s))
+ .map(s -> doApply(conf, s))
+ .map(c -> filterBlacklisted(c, ngramBlacklist))
+ .flatMap(c -> c.stream())
+ .filter(StringUtils::isNotBlank)
+ .collect(Collectors.toCollection(HashSet::new));
+ }
+}
\ No newline at end of file
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
new file mode 100644
index 0000000000..7f86854c2a
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
@@ -0,0 +1,77 @@
+package eu.dnetlib.pace.clustering;
+
+import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.Person;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Clustering function for person names: the key is the first initial followed by the last name.
+ */
+@ClusteringClass("lnfi")
+public class LastNameFirstInitial extends AbstractClusteringFunction{
+
+    private boolean DEFAULT_AGGRESSIVE = true;
+
+    public LastNameFirstInitial(final Map params) {
+        super(params);
+    }
+
+    /**
+     * Same pipeline as the inherited one, except that stop words are not filtered out of the
+     * value before the keys are extracted.
+     */
+    @Override
+    public Collection apply(Config conf, List fields) {
+        return fields.stream()
+            .filter(f -> !f.isEmpty())
+            .map(Field::stringValue)
+            .map(this::normalize)
+            .map(s -> doApply(conf, s))
+            .map(c -> filterBlacklisted(c, ngramBlacklist))
+            .flatMap(c -> c.stream())
+            .filter(StringUtils::isNotBlank)
+            .collect(Collectors.toCollection(HashSet::new));
+    }
+
+    /**
+     * Normalizes a name: unicode normalization, transliteration and alias fixing, followed by
+     * a sequence of regex-based clean-up passes and a final trim.
+     */
+    @Override
+    protected String normalize(final String s) {
+        String text = fixAliases(transliterate(nfd(unicodeNormalization(s))));
+        // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
+        text = text.replaceAll("[^ \\w]+", "");
+        text = text.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "");
+        text = text.replaceAll("(\\p{Punct})+", " ");
+        text = text.replaceAll("(\\d)+", " ");
+        text = text.replaceAll("(\\n)+", " ");
+        return text.trim();
+    }
+
+    /**
+     * Builds the keys for one name. An accurate person yields a single key (first initial +
+     * surname). Otherwise the full name is split on spaces: a single token is used as it is,
+     * while two or more tokens yield both "initial of first + last" and "initial of last + first".
+     */
+    @Override
+    protected Collection doApply(final Config conf, final String s) {
+
+        final List keys = Lists.newArrayList();
+
+        final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
+
+        final Person person = new Person(s, aggressive);
+
+        if (person.isAccurate()) {
+            final String lastName = person.getNormalisedSurname().toLowerCase();
+            final String firstInitial = person.getNormalisedFirstName().toLowerCase().substring(0,1);
+            keys.add(firstInitial.concat(lastName));
+            return keys;
+        }
+
+        // is not accurate, meaning it has no defined name and surname
+        final String[] parts = person.getNormalisedFullname().split(" ");
+        if (parts.length == 1) {
+            keys.add(person.getNormalisedFullname().toLowerCase());
+            return keys;
+        }
+
+        // with exactly two tokens the last token is the second one, so one rule covers both cases
+        final String first = parts[0];
+        final String last = parts[parts.length - 1];
+        keys.add(first.substring(0,1).concat(last).toLowerCase());
+        keys.add(last.substring(0,1).concat(first).toLowerCase());
+
+        return keys;
+    }
+}
\ No newline at end of file
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
new file mode 100644
index 0000000000..309650f73a
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
@@ -0,0 +1,36 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Field;
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * Clustering function whose key is the lower-cased, trimmed field value.
+ */
+@ClusteringClass("lowercase")
+public class LowercaseClustering extends AbstractClusteringFunction {
+
+    public LowercaseClustering(final Map params) {
+        super(params);
+    }
+
+    /**
+     * Collects the keys of all the fields, in insertion order and without duplicates.
+     * The values are passed to doApply() as they are, without the inherited normalization.
+     */
+    @Override
+    public Collection apply(Config conf, List fields) {
+        final Collection keys = Sets.newLinkedHashSet();
+        fields.forEach(field -> keys.addAll(doApply(conf, field.stringValue())));
+        return keys;
+    }
+
+    /** Returns an empty list for a blank value, otherwise a list with its lower-cased, trimmed form. */
+    @Override
+    protected Collection doApply(final Config conf, final String s) {
+        return StringUtils.isBlank(s)
+            ? Lists.newArrayList()
+            : Lists.newArrayList(s.toLowerCase().trim());
+    }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java
new file mode 100644
index 0000000000..2391685b05
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java
@@ -0,0 +1,20 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Set;
+
+import org.apache.commons.lang3.StringUtils;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+
+/**
+ * Helper producing a fixed-size, space-free string usable as an ordering key.
+ */
+public class NGramUtils extends AbstractPaceFunctions {
+
+    private static final int SIZE = 100;
+
+    private static Set stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
+
+    /**
+     * Normalizes {@code s}, removes the English stop words, pads the result with SIZE spaces,
+     * keeps the first SIZE characters and finally removes the spaces.
+     */
+    public static String cleanupForOrdering(String s) {
+        final NGramUtils helper = new NGramUtils();
+        // padding guarantees that the substring below never runs past the end
+        final String padded = helper.filterStopWords(helper.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE);
+        final String window = padded.substring(0, SIZE);
+        return window.replaceAll(" ", "");
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
new file mode 100644
index 0000000000..baa30d7471
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
@@ -0,0 +1,36 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Config;
+
+/**
+ * Clustering function whose keys are the concatenations of consecutive ngrams.
+ */
+@ClusteringClass("ngrampairs")
+public class NgramPairs extends Ngrams {
+
+    public NgramPairs(Map params) {
+        super(params);
+    }
+
+    /**
+     * Extracts up to twice "max" ngrams of length "ngramLen" (one per token, at least two
+     * characters long) and pairs them up, returning at most "max" pairs.
+     */
+    @Override
+    protected Collection doApply(Config conf, String s) {
+        return ngramPairs(
+            Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)),
+            param("max"));
+    }
+
+    /**
+     * Concatenates each ngram with the one that follows it, producing at most
+     * {@code maxNgrams} pairs.
+     */
+    protected Collection ngramPairs(final List ngrams, int maxNgrams) {
+        final Collection pairs = Lists.newArrayList();
+        // right always points to the element following left
+        for (int left = 0, right = 1; right < ngrams.size() && pairs.size() < maxNgrams; left++, right++) {
+            pairs.add(ngrams.get(left) + ngrams.get(right));
+        }
+        return pairs;
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
new file mode 100644
index 0000000000..214b1451fa
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
@@ -0,0 +1,43 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.config.Config;
+
+import java.util.*;
+
+@ClusteringClass("ngrams")
+public class Ngrams extends AbstractClusteringFunction {
+
+ public Ngrams(Map params) {
+ super(params);
+ }
+
+ @Override
+ protected Collection doApply(Config conf, String s) {
+ return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
+ }
+
+ protected Collection getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) {
+
+ final Collection ngrams = new LinkedHashSet();
+ final StringTokenizer st = new StringTokenizer(s);
+
+ while (st.hasMoreTokens()) {
+ final String token = st.nextToken();
+ if (!token.isEmpty()) {
+
+ for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) {
+ String ngram = (token + " ").substring(i, ngramLen + i).trim();
+ if (ngrams.size() >= max) {
+ return ngrams;
+ }
+ if (ngram.length() >= minNgramLen) {
+ ngrams.add(ngram);
+ }
+ }
+ }
+ }
+ //System.out.println(ngrams + " n: " + ngrams.size());
+ return ngrams;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
new file mode 100644
index 0000000000..db8d90bcee
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
@@ -0,0 +1,81 @@
+package eu.dnetlib.pace.clustering;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.Person;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Clustering function for person names producing "first letter + surname" style keys.
+ */
+@ClusteringClass("personClustering")
+public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
+
+    private Map params;
+
+    private static final int MAX_TOKENS = 5;
+
+    public PersonClustering(final Map params) {
+        this.params = params;
+    }
+
+    /**
+     * For each field: when both the normalised first name and surname are available, the key
+     * is firstLC(first name) followed by the lower-cased surname. Otherwise every ordered pair
+     * of different tokens (limited to MAX_TOKENS tokens) contributes firstLC(token1) + token2.
+     */
+    @Override
+    public Collection apply(final Config conf, final List fields) {
+        final Set hashes = Sets.newHashSet();
+
+        for (final Field f : fields) {
+
+            final Person person = new Person(f.stringValue(), false);
+
+            final boolean incomplete = StringUtils.isBlank(person.getNormalisedFirstName())
+                || StringUtils.isBlank(person.getNormalisedSurname());
+
+            if (incomplete) {
+                // fall back to combining the raw tokens of the value
+                for (final String token1 : tokens(f.stringValue(), MAX_TOKENS)) {
+                    for (final String token2 : tokens(f.stringValue(), MAX_TOKENS)) {
+                        if (!token1.equals(token2)) {
+                            hashes.add(firstLC(token1) + token2);
+                        }
+                    }
+                }
+                continue;
+            }
+
+            hashes.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase());
+        }
+
+        return hashes;
+    }
+
+    @Override
+    public Map getParams() {
+        return params;
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
new file mode 100644
index 0000000000..f6c4fe07f1
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
@@ -0,0 +1,32 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Person;
+
+@ClusteringClass("personHash")
+public class PersonHash extends AbstractClusteringFunction {
+
+ private boolean DEFAULT_AGGRESSIVE = false;
+
+ public PersonHash(final Map params) {
+ super(params);
+ }
+
+ @Override
+ protected Collection doApply(final Config conf, final String s) {
+ final List res = Lists.newArrayList();
+
+ final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
+
+ res.add(new Person(s, aggressive).hash());
+
+ return res;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
new file mode 100644
index 0000000000..86a2e4e4f0
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
@@ -0,0 +1,19 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.config.Config;
+
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * Placeholder clustering function that produces no keys.
+ */
+public class RandomClusteringFunction extends AbstractClusteringFunction {
+
+    public RandomClusteringFunction(Map params) {
+        super(params);
+    }
+
+    /**
+     * Returns an empty, mutable collection. The inherited apply() pipeline dereferences the
+     * result of this method, so returning null here would fail with a NullPointerException.
+     */
+    @Override
+    protected Collection doApply(final Config conf, String s) {
+        return new java.util.ArrayList<>();
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
new file mode 100644
index 0000000000..55b203d7a2
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
@@ -0,0 +1,27 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.*;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Config;
+
+/**
+ * Variant of NgramPairs that sorts the tokens of the value before extracting the ngram pairs.
+ */
+@ClusteringClass("sortedngrampairs")
+public class SortedNgramPairs extends NgramPairs {
+
+    public SortedNgramPairs(Map params) {
+        super(params);
+    }
+
+    /**
+     * Splits {@code s} on spaces (dropping empty tokens), sorts the tokens, joins them back
+     * with single spaces and computes the ngram pairs of the resulting string.
+     */
+    @Override
+    protected Collection doApply(Config conf, String s) {
+
+        final Splitter bySpace = Splitter.on(" ").omitEmptyStrings().trimResults();
+        final List words = Lists.newArrayList(bySpace.split(s));
+
+        Collections.sort(words);
+
+        final String reordered = Joiner.on(" ").join(words);
+
+        return ngramPairs(
+            Lists.newArrayList(getNgrams(reordered, param("ngramLen"), param("max") * 2, 1, 2)),
+            param("max"));
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
new file mode 100644
index 0000000000..50cea4db3c
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
@@ -0,0 +1,29 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import eu.dnetlib.pace.config.Config;
+import org.apache.commons.lang3.RandomStringUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Clustering function whose key is the lower-cased value with all the whitespace removed.
+ */
+@ClusteringClass("spacetrimmingfieldvalue")
+public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
+
+    public SpaceTrimmingFieldValue(final Map params) {
+        super(params);
+    }
+
+    /**
+     * Returns a single-element list: a random string of "randomLength" characters when the
+     * value is blank, otherwise the lower-cased value without whitespace.
+     */
+    @Override
+    protected Collection doApply(final Config conf, final String s) {
+        final List keys = Lists.newArrayList();
+
+        if (StringUtils.isBlank(s)) {
+            keys.add(RandomStringUtils.random(getParams().get("randomLength")));
+        } else {
+            keys.add(s.toLowerCase().replaceAll("\\s+", ""));
+        }
+
+        return keys;
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
new file mode 100644
index 0000000000..fa1f643621
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
@@ -0,0 +1,40 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
+
+/**
+ * Clustering function whose keys join the end of a word with the beginning of the next one.
+ */
+@ClusteringClass("suffixprefix")
+public class SuffixPrefix extends AbstractClusteringFunction {
+
+    public SuffixPrefix(Map params) {
+        super(params);
+    }
+
+    /** Extracts the keys using the "len" and "max" parameters. */
+    @Override
+    protected Collection doApply(Config conf, String s) {
+        return suffixPrefix(s, param("len"), param("max"));
+    }
+
+    /**
+     * Scans {@code s} from index 1: for each position, the next space is located and the
+     * {@code len} characters before it plus up to {@code len} characters after it are joined
+     * (spaces removed). Candidates shorter than 4 characters are discarded. At most
+     * {@code max} distinct keys are returned, in insertion order.
+     */
+    private Collection suffixPrefix(String s, int len, int max) {
+        final Set found = Sets.newLinkedHashSet();
+        for (int from = 1; from < s.length() && found.size() < max; from++) {
+            final int space = s.indexOf(" ", from);
+
+            // not enough characters before the space (or no space at all)
+            if (space - len <= 0) {
+                continue;
+            }
+
+            final int end = Math.min(space + len + 1, s.length());
+            final String candidate = s.substring(space - len, end).replaceAll(" ", "").trim();
+            if (candidate.length() >= 4) {
+                found.add(candidate);
+            }
+        }
+        return found;
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
new file mode 100644
index 0000000000..feb60a221d
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
@@ -0,0 +1,54 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Field;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+@ClusteringClass("urlclustering")
+public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
+
+ protected Map params;
+
+ public UrlClustering(final Map params) {
+ this.params = params;
+ }
+
+ @Override
+ public Collection apply(final Config conf, List fields) {
+ try {
+ return fields.stream()
+ .filter(f -> !f.isEmpty())
+ .map(Field::stringValue)
+ .map(this::asUrl)
+ .map(URL::getHost)
+ .collect(Collectors.toCollection(HashSet::new));
+ }
+ catch (IllegalStateException e){
+ return new HashSet<>();
+ }
+ }
+
+ @Override
+ public Map getParams() {
+ return null;
+ }
+
+ private URL asUrl(String value) {
+ try {
+ return new URL(value);
+ } catch (MalformedURLException e) {
+ // should not happen as checked by pace typing
+ throw new IllegalStateException("invalid URL: " + value);
+ }
+ }
+
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java
new file mode 100644
index 0000000000..6fa2668fa3
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java
@@ -0,0 +1,90 @@
+package eu.dnetlib.pace.clustering;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Clustering function whose keys combine simple statistics of the value (number of words,
+ * number of letters divided by "mod") with a chain of 3-character word suffixes and prefixes.
+ */
+@ClusteringClass("wordsStatsSuffixPrefixChain")
+public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
+
+    public WordsStatsSuffixPrefixChain(Map params) {
+        super(params);
+    }
+
+    /** Extracts the keys using the "mod" parameter. */
+    @Override
+    protected Collection doApply(Config conf, String s) {
+        return suffixPrefixChain(s, param("mod"));
+    }
+
+    /**
+     * Keeps the words longer than 3 characters and prefixes every key with
+     * "<number of kept words>-<length of s / mod>-".
+     */
+    private Collection suffixPrefixChain(String s, int mod) {
+
+        //create the list of words from the string (remove short words)
+        final List wordsList = Arrays.stream(s.split(" "))
+            .filter(si -> si.length() > 3)
+            .collect(Collectors.toList());
+
+        //create the prefix: number of words + number of letters/mod
+        final String prefix = wordsList.size() + "-" + s.length() / mod + "-";
+
+        return doSuffixPrefixChain(wordsList, prefix);
+    }
+
+    /**
+     * With fewer than two words no key is produced. Otherwise two keys are built from the
+     * first two words (suffix+prefix and prefix+suffix); when a third word exists, its suffix
+     * and prefix respectively are appended to them.
+     */
+    private Collection doSuffixPrefixChain(List wordsList, String prefix) {
+
+        final Set keys = Sets.newLinkedHashSet();
+        final int count = wordsList.size();
+        if (count < 2) {
+            return keys;
+        }
+
+        String suffixFirst = prefix + suffix(wordsList.get(0), 3) + prefix(wordsList.get(1), 3);
+        String prefixFirst = prefix + prefix(wordsList.get(0), 3) + suffix(wordsList.get(1), 3);
+
+        if (count > 2) {
+            suffixFirst = suffixFirst + suffix(wordsList.get(2), 3);
+            prefixFirst = prefixFirst + prefix(wordsList.get(2), 3);
+        }
+
+        keys.add(suffixFirst);
+        keys.add(prefixFirst);
+        return keys;
+    }
+
+    /** Returns the last {@code len} characters of {@code s}. */
+    private String suffix(String s, int len) {
+        final int start = s.length() - len;
+        return s.substring(start);
+    }
+
+    /** Returns the first {@code len} characters of {@code s}. */
+    private String prefix(String s, int len) {
+        return s.substring(0, len);
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java
new file mode 100644
index 0000000000..1e94b34d26
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java
@@ -0,0 +1,57 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
+
+/**
+ * Clustering function that builds keys from the characters surrounding each space of the
+ * input: the tail of the word before the space joined with the head of the word after it.
+ * Every key is prefixed with the number of space-separated words of the input.
+ */
+@ClusteringClass("wordssuffixprefix")
+public class WordsSuffixPrefix extends AbstractClusteringFunction {
+
+    // NOTE(review): the type arguments of params are not visible in this file; left raw on purpose.
+    public WordsSuffixPrefix(Map params) {
+        super(params);
+    }
+
+    @Override
+    protected Collection<String> doApply(Config conf, String s) {
+        return suffixPrefix(s, param("len"), param("max"));
+    }
+
+    /**
+     * Chooses the token length from the number of words: short strings get longer tokens,
+     * single-word strings get no key at all.
+     *
+     * @param s   the string to cluster
+     * @param len base number of characters taken on each side of a space
+     * @param max maximum number of keys to generate
+     */
+    private Collection<String> suffixPrefix(String s, int len, int max) {
+
+        final int words = s.split(" ").length;
+
+        // adjust the token length according to the number of words
+        switch (words) {
+        case 1:
+            return Sets.newLinkedHashSet();
+        case 2:
+            return doSuffixPrefix(s, len + 2, max, words);
+        case 3:
+            return doSuffixPrefix(s, len + 1, max, words);
+        default:
+            return doSuffixPrefix(s, len, max, words);
+        }
+    }
+
+    /**
+     * Walks the spaces of {@code s} (from index 1 onwards) and, for each one that has more
+     * than {@code len} characters before it, emits the {@code len} characters on both sides
+     * with blanks removed. Tokens shorter than 4 characters are discarded.
+     */
+    private Collection<String> doSuffixPrefix(String s, int len, int max, int words) {
+        final Set<String> bigrams = Sets.newLinkedHashSet();
+
+        // Jump from space to space: scanning every character position would only recompute
+        // the token of the same upcoming space over and over.
+        for (int j = s.indexOf(' ', 1); j >= 0 && bigrams.size() < max; j = s.indexOf(' ', j + 1)) {
+            if (j - len > 0) {
+                final int end = Math.min(j + len + 1, s.length());
+                final String bigram = s.substring(j - len, end).replace(" ", "").trim();
+                if (bigram.length() >= 4) {
+                    bigrams.add(words + bigram);
+                }
+            }
+        }
+        return bigrams;
+    }
+
+}
\ No newline at end of file
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
new file mode 100644
index 0000000000..bfe9f6220b
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@@ -0,0 +1,357 @@
+package eu.dnetlib.pace.common;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Sets;
+import com.ibm.icu.text.Transliterator;
+import eu.dnetlib.pace.clustering.NGramUtils;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldList;
+import eu.dnetlib.pace.model.FieldListImpl;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import java.io.IOException;
+import java.io.StringWriter;
+import java.nio.charset.StandardCharsets;
+import java.text.Normalizer;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Set of common functions for the framework
+ *
+ * @author claudio
+ */
+public abstract class AbstractPaceFunctions {
+
+ // city map used when translating city names into codes (keys are lower-cased and transliterated)
+ private static Map<String, String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
+
+ // stopwords of the supported languages, transliterated when loaded
+ protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
+ protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
+ protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
+ protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
+ protected static Set<String> stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
+ protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
+ protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
+
+ // shared transliterator; it is initialised AFTER the resources above, which is why the
+ // loaders create their own instance instead of using this field
+ protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
+
+ // blacklist of ngrams: to avoid generic keys
+ protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
+
+ //html regex for normalization
+ public final String HTML_REGEX = "<[^>]*>";
+
+ private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
+ private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
+ private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
+
+ //doi prefix for normalization
+ public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
+
+ private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
+
+ private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
+
+ protected final static FieldList EMPTY_FIELD = new FieldListImpl();
+
+ /** Joins the elements of {@code l} with a single space, skipping {@code null} entries. */
+ protected String concat(final List l) {
+     final Joiner spaceJoiner = Joiner.on(" ").skipNulls();
+     return spaceJoiner.join(l);
+ }
+
+ /**
+  * Normalizes a free-text value into lower-case ASCII words separated by single spaces.
+  * The steps are applied strictly in the order below; later steps rely on earlier ones.
+  *
+  * @param s the raw value, must not be {@code null}
+  * @return the cleaned string, trimmed
+  */
+ protected String cleanup(final String s) {
+
+ // strip anything that looks like an HTML/XML tag
+ final String s1 = s.replaceAll(HTML_REGEX, "");
+ // lower-case, then decode escaped unicode sequences
+ final String s2 = unicodeNormalization(s1.toLowerCase());
+ // canonical decomposition (base character + combining marks)
+ final String s3 = nfd(s2);
+ final String s4 = fixXML(s3);
+ // isolate every run of digits with surrounding blanks
+ final String s5 = s4.replaceAll("([0-9]+)", " $1 ");
+ final String s6 = transliterate(s5);
+ final String s7 = fixAliases(s6);
+ // drop whatever is still outside ASCII
+ final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
+ // punctuation and newlines become blanks
+ final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
+ final String s10 = s9.replaceAll("\\n", " ");
+ // collapse whitespace runs
+ final String s11 = s10.replaceAll("(?m)\\s+", " ");
+ final String s12 = s11.trim();
+ return s12;
+ }
+
+ /**
+  * Replaces the XML/HTML entities that commonly survive in harvested text
+  * ({@code &ndash;}, {@code &amp;}, {@code &quot;}, {@code &minus;}) with a blank.
+  *
+  * @param a the string to fix, must not be {@code null}
+  * @return the string with the entities replaced by single spaces
+  */
+ protected String fixXML(final String a) {
+
+     // literal replacement: the entities contain no regex metacharacters worth interpreting
+     return a.replace("&ndash;", " ")
+             .replace("&amp;", " ")
+             .replace("&quot;", " ")
+             .replace("&minus;", " ");
+ }
+
+ /**
+  * Compares the numeric content of two strings.
+  *
+  * @return {@code true} when the arabic numbers or the roman numerals found in {@code a}
+  *         DIFFER from those found in {@code b}; {@code false} when both agree
+  */
+ protected boolean checkNumbers(final String a, final String b) {
+     final boolean sameNumbers = getNumbers(a).equals(getNumbers(b));
+     final boolean sameRomans = getRomans(a).equals(getRomans(b));
+     return !(sameNumbers && sameRomans);
+ }
+
+ /** Concatenates, without separator, the space-separated tokens of {@code s} accepted by {@link #isRoman(String)}. */
+ protected String getRomans(final String s) {
+     return Arrays.stream(s.split(" "))
+             .filter(this::isRoman)
+             .collect(Collectors.joining());
+ }
+
+ /**
+  * Tells whether {@code s} is an upper-case roman numeral (up to MMMM...).
+  * The empty string is accepted because every group of the pattern is optional.
+  *
+  * @param s the token to test, must not be {@code null}
+  */
+ protected boolean isRoman(final String s) {
+     // match directly instead of comparing against a replacement sentinel, which
+     // wrongly accepted a token equal to the sentinel itself
+     return s.matches("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$");
+ }
+
+ /** Concatenates, without separator, the space-separated tokens of {@code s} accepted by {@link #isNumber(String)}. */
+ protected String getNumbers(final String s) {
+     return Arrays.stream(s.split(" "))
+             .filter(this::isNumber)
+             .collect(Collectors.joining());
+ }
+
+ /**
+  * Tells whether the whole string is an optionally signed integer or decimal number.
+  *
+  * @param strNum the candidate, may be {@code null}
+  * @return {@code false} for {@code null} or non-numeric input
+  */
+ public boolean isNumber(String strNum) {
+     return strNum != null && numberPattern.matcher(strNum).matches();
+ }
+
+ /**
+  * Replaces every character listed in {@code aliases_from} with the character at the
+  * same position of {@code aliases_to}; all other characters are copied unchanged.
+  */
+ protected static String fixAliases(final String s) {
+     final StringBuilder out = new StringBuilder();
+
+     for (int pos = 0; pos < s.length(); pos++) {
+         final char current = s.charAt(pos);
+         final int aliasIdx = StringUtils.indexOf(aliases_from, current);
+         out.append(aliasIdx < 0 ? current : aliases_to.charAt(aliasIdx));
+     }
+
+     return out.toString();
+ }
+
+ /**
+  * Transliterates {@code s} with the shared "Any-Eng" transliterator.
+  * Best effort: if the transliteration fails for any reason the input is returned as is.
+  */
+ protected static String transliterate(final String s) {
+     String result;
+     try {
+         result = transliterator.transliterate(s);
+     } catch (Exception e) {
+         // deliberate fallback: keep the original text rather than failing the whole normalization
+         result = s;
+     }
+     return result;
+ }
+
+ /**
+  * Replaces every character that is not an ASCII letter, digit or blank with a blank,
+  * then collapses whitespace runs into a single space.
+  */
+ protected String removeSymbols(final String s) {
+     final StringBuilder kept = new StringBuilder();
+
+     for (final char c : s.toCharArray()) {
+         kept.append(StringUtils.contains(alpha, c) ? c : ' ');
+     }
+
+     return kept.toString().replaceAll("\\s+", " ");
+ }
+
+ /**
+  * Returns the string value of the first element of {@code values},
+  * or the empty string when {@code values} is {@code null} or has no element.
+  */
+ protected String getFirstValue(final Field values) {
+     if (values == null || Iterables.isEmpty(values)) {
+         return "";
+     }
+     return Iterables.getFirst(values, EMPTY_FIELD).stringValue();
+ }
+
+ /** Returns {@code true} when {@code s} is not {@code null} (an empty string counts as not null). */
+ protected boolean notNull(final String s) {
+ return s != null;
+ }
+
+ /**
+  * Normalizes a value for clustering: decodes escaped unicode sequences, decomposes and
+  * transliterates it, maps aliases, lower-cases it and strips symbols and digits.
+  * <p>
+  * NOTE(review): the first regex already removes everything that is neither a blank nor
+  * a word character, so the diacritics and newline replacements look like they have
+  * nothing left to match, and the punctuation one only the underscore - confirm before relying on them.
+  *
+  * @param s the raw value, must not be {@code null}
+  * @return the normalized, trimmed string
+  */
+ protected String normalize(final String s) {
+ return fixAliases(transliterate(nfd(unicodeNormalization(s))))
+ .toLowerCase()
+ // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
+ .replaceAll("[^ \\w]+", "")
+ .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
+ .replaceAll("(\\p{Punct})+", " ")
+ .replaceAll("(\\d)+", " ")
+ .replaceAll("(\\n)+", " ")
+ .trim();
+ }
+
+ /** Applies Unicode canonical decomposition (NFD) to {@code s}. */
+ public String nfd(final String s) {
+ return Normalizer.normalize(s, Normalizer.Form.NFD);
+ }
+
+ /**
+  * Encodes {@code s} to UTF-8 bytes and decodes it back.
+  * NOTE(review): for well-formed strings this looks like a no-op round trip; presumably
+  * it is only meant to sanitize malformed surrogate sequences - confirm with callers.
+  */
+ public String utf8(final String s) {
+ byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
+ return new String(bytes, StandardCharsets.UTF_8);
+ }
+
+ /**
+  * Decodes textual unicode escapes (a backslash, the letter u and four hexadecimal
+  * digits) found in {@code s} into the character they denote.
+  *
+  * @param s the string possibly containing escaped sequences, must not be {@code null}
+  * @return the string with every escape replaced; other text is copied unchanged
+  */
+ public String unicodeNormalization(final String s) {
+
+ Matcher m = hexUnicodePattern.matcher(s);
+ StringBuffer buf = new StringBuffer(s.length());
+ while (m.find()) {
+ // group(1) holds the four hex digits
+ String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
+ // quote the replacement so that a decoded '$' or backslash is taken literally
+ m.appendReplacement(buf, Matcher.quoteReplacement(ch));
+ }
+ m.appendTail(buf);
+ return buf.toString();
+ }
+
+ /**
+  * Removes from {@code s} every whitespace-separated token contained in {@code stopwords};
+  * the surviving tokens are re-joined with single spaces.
+  */
+ protected String filterStopWords(final String s, final Set stopwords) {
+     final StringJoiner kept = new StringJoiner(" ");
+     final StringTokenizer tokenizer = new StringTokenizer(s);
+     while (tokenizer.hasMoreTokens()) {
+         final String token = tokenizer.nextToken();
+         if (!stopwords.contains(token)) {
+             kept.add(token);
+         }
+     }
+     return kept.toString().trim();
+ }
+
+ /**
+  * Removes the stopwords of every supported language from {@code s}, applying the
+  * language lists one after the other (en, de, it, fr, pt, es, gr).
+  */
+ public String filterAllStopWords(String s) {
+
+     String filtered = s;
+     for (final Set<String> stopwords : Arrays.asList(
+             stopwords_en, stopwords_de, stopwords_it, stopwords_fr, stopwords_pt, stopwords_es, stopwords_gr)) {
+         filtered = filterStopWords(filtered, stopwords);
+     }
+
+     return filtered;
+ }
+
+ /**
+  * Returns the elements of {@code set} that are not blacklisted, preserving their iteration order.
+  *
+  * @param set            the candidate keys
+  * @param ngramBlacklist the keys to discard
+  * @return a new insertion-ordered set; the input collection is not modified
+  */
+ protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
+     return set.stream()
+             .filter(s -> !ngramBlacklist.contains(s))
+             .collect(Collectors.toCollection(LinkedHashSet::new));
+ }
+
+ /**
+  * Loads a classpath resource as a set of lines, transliterating each line and mapping its aliases.
+  * Best effort: any failure (missing resource included) yields an empty set.
+  *
+  * @param classpath absolute classpath location of the resource
+  * @return the loaded entries, or an empty set when the resource cannot be read
+  */
+ public static Set<String> loadFromClasspath(final String classpath) {
+
+     // own instance: this method runs from static initialisers declared before the shared transliterator field
+     final Transliterator transliterator = Transliterator.getInstance("Any-Eng");
+
+     final Set<String> h = Sets.newHashSet();
+     // try-with-resources so the resource stream is always released
+     try (java.io.InputStream in = NGramUtils.class.getResourceAsStream(classpath)) {
+         for (final String s : IOUtils.readLines(in, StandardCharsets.UTF_8)) {
+             h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords
+         }
+     } catch (final Throwable e) {
+         // deliberate: class initialisation must not fail because of a missing/unreadable list
+         return Sets.newHashSet();
+     }
+     return h;
+ }
+
+ /**
+  * Loads a classpath resource whose lines look like {@code code;word1;word2;word3} and
+  * returns a map from each (lower-cased, transliterated) word to its code.
+  * Best effort: any failure (missing resource included) yields an empty map.
+  *
+  * @param classpath absolute classpath location of the resource
+  * @return the word-to-code map, or an empty map when the resource cannot be read
+  */
+ public static Map<String, String> loadMapFromClasspath(final String classpath) {
+
+     // own instance: this method runs from a static initialiser declared before the shared transliterator field
+     final Transliterator transliterator = Transliterator.getInstance("Any-Eng");
+
+     final Map<String, String> m = new HashMap<>();
+     // try-with-resources so the resource stream is always released
+     try (java.io.InputStream in = AbstractPaceFunctions.class.getResourceAsStream(classpath)) {
+         for (final String s : IOUtils.readLines(in, StandardCharsets.UTF_8)) {
+             //string is like this: code;word1;word2;word3
+             final String[] line = s.split(";");
+             final String value = line[0];
+             for (int i = 1; i < line.length; i++) {
+                 m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value);
+             }
+         }
+     } catch (final Throwable e) {
+         // deliberate: class initialisation must not fail because of a missing/unreadable map
+         return new HashMap<>();
+     }
+     return m;
+ }
+
+ /**
+  * Removes every occurrence of the given keywords (lower-cased) from {@code s}.
+  * Keywords are matched literally, so characters such as {@code .} or {@code +} have no special meaning.
+  *
+  * @param s        the string to clean
+  * @param keywords the keywords to remove
+  * @return the cleaned string, trimmed
+  */
+ public String removeKeywords(String s, Set<String> keywords) {
+
+     String cleaned = " " + s + " ";
+     for (final String k : keywords) {
+         // literal replacement: a keyword must never be interpreted as a regular expression
+         cleaned = cleaned.replace(k.toLowerCase(), "");
+     }
+
+     return cleaned.trim();
+ }
+
+ /**
+  * Ratio between the number of elements of {@code s1} also contained in {@code s2} and
+  * the size of the larger set. When both sets are empty the division is 0/0 and the
+  * result is {@code NaN}.
+  */
+ public double commonElementsPercentage(Set s1, Set s2) {
+
+     final long shared = s1.stream().filter(s2::contains).count();
+     final int largest = Math.max(s1.size(), s2.size());
+     return (double) shared / largest;
+ }
+
+ /**
+  * Converts a set of keywords into the set of their codes.
+  * A keyword missing from {@code translationMap} contributes a {@code null} element.
+  *
+  * @param keywords       the keywords to translate
+  * @param translationMap keyword-to-code map
+  * @return the distinct codes
+  */
+ public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
+     return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
+ }
+
+ /** Translates {@code keywords} into codes through {@code translationMap}; plain delegate to {@code toCodes}. */
+ public Set keywordsToCodes(Set keywords, Map translationMap) {
+ return toCodes(keywords, translationMap);
+ }
+
+ /** Translates city names into codes using the city map loaded from the classpath. */
+ public Set citiesToCodes(Set keywords) {
+ return toCodes(keywords, cityMap);
+ }
+
+ /** Returns the first character of {@code s}, lower-cased, as a string. */
+ protected String firstLC(final String s) {
+     final String head = StringUtils.substring(s, 0, 1);
+     return head.toLowerCase();
+ }
+
+ /** Splits {@code s} on blanks (empty tokens dropped, results trimmed) and keeps at most {@code maxTokens} tokens. */
+ protected Iterable tokens(final String s, final int maxTokens) {
+     final Splitter bySpace = Splitter.on(" ").omitEmptyStrings().trimResults();
+     return Iterables.limit(bySpace.split(s), maxTokens);
+ }
+
+ /**
+  * Normalizes a persistent identifier: lower-cases it and strips the DOI resolver/scheme prefixes.
+  *
+  * @param pid the identifier, must not be {@code null}
+  */
+ public String normalizePid(String pid) {
+     // Locale.ROOT: the default locale must not influence the result (e.g. the Turkish dotless i
+     // would otherwise prevent "DOI:" from matching the prefix pattern)
+     return pid.toLowerCase(Locale.ROOT).replaceAll(DOI_PREFIX, "");
+ }
+
+ /**
+  * Extracts from the input string the keywords that are keys of {@code translationMap}.
+  * Sequences of {@code windowSize} consecutive tokens are tried first, then shorter and
+  * shorter ones; every keyword found is removed from the text before the next pass.
+  *
+  * @param s1             the string to analyse (matching is done on its lower-cased form)
+  * @param translationMap map whose keys are the known keywords
+  * @param windowSize     maximum number of tokens forming a keyword; values below 1 yield no keyword
+  * @return the keywords found (the matched text, not the mapped codes)
+  */
+ public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize) {
+
+     // lower-case once, so that every pass (not only the first) works on the same text
+     String s = s1.toLowerCase();
+
+     List<String> tokens = Arrays.asList(s.split(" "));
+
+     final Set<String> codes = new HashSet<>();
+
+     // the window cannot be wider than the number of tokens
+     int length = Math.min(windowSize, tokens.size());
+
+     while (length > 0) {
+
+         for (int i = 0; i <= tokens.size() - length; i++) {
+             final String candidate = concat(tokens.subList(i, i + length));
+             if (translationMap.containsKey(candidate)) {
+                 codes.add(candidate);
+                 s = s.replace(candidate, "").trim();
+             }
+         }
+
+         // re-tokenize what is left before trying a narrower window
+         tokens = Arrays.asList(s.split(" "));
+         length -= 1;
+     }
+
+     return codes;
+ }
+
+ /** Extracts from {@code s1} the city names known to the city map, using windows of up to {@code windowSize} tokens. */
+ public Set getCities(String s1, int windowSize) {
+ return getKeywords(s1, cityMap, windowSize);
+ }
+
+ /**
+  * Reads a classpath resource, resolved against {@code clazz}, as a UTF-8 string.
+  *
+  * @param filename the resource name
+  * @param clazz    the class used to resolve the resource
+  * @return the content of the resource
+  * @throws RuntimeException when the resource does not exist or cannot be read
+  */
+ public static String readFromClasspath(final String filename, final Class<?> clazz) {
+     // try-with-resources so the resource stream is always released
+     try (java.io.InputStream in = clazz.getResourceAsStream(filename)) {
+         if (in == null) {
+             throw new RuntimeException("cannot load resource from classpath: " + filename);
+         }
+         final StringWriter sw = new StringWriter();
+         IOUtils.copy(in, sw, StandardCharsets.UTF_8);
+         return sw.toString();
+     } catch (final IOException e) {
+         // keep the cause: it tells why the resource could not be read
+         throw new RuntimeException("cannot load resource from classpath: " + filename, e);
+     }
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
new file mode 100644
index 0000000000..6b44f4ebd4
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
@@ -0,0 +1,60 @@
+package eu.dnetlib.pace.config;
+
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import eu.dnetlib.pace.model.ClusteringDef;
+import eu.dnetlib.pace.model.FieldDef;
+import eu.dnetlib.pace.tree.support.TreeNodeDef;
+
+/**
+ * Interface for PACE configuration bean.
+ *
+ * @author claudio
+ */
+public interface Config {
+
+    /**
+     * Field configuration definitions.
+     *
+     * @return the list of definitions
+     */
+    public List<FieldDef> model();
+
+    /**
+     * Decision Tree definition
+     *
+     * @return the map representing the decision tree
+     */
+    public Map<String, TreeNodeDef> decisionTree();
+
+    /**
+     * Field configuration definitions, indexed by field name.
+     *
+     * @return the map from field name to its definition
+     */
+    public Map<String, FieldDef> modelMap();
+
+    /**
+     * Clusterings.
+     *
+     * @return the list
+     */
+    public List<ClusteringDef> clusterings();
+
+    /**
+     * Blacklists: for each key, the compiled patterns of the blacklisted values.
+     *
+     * @return the map
+     */
+    public Map<String, List<Pattern>> blacklists();
+
+
+    /**
+     * Translation map.
+     *
+     * @return the map
+     */
+    public Map<String, String> translationMap();
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
new file mode 100644
index 0000000000..a377b087fe
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
@@ -0,0 +1,163 @@
+package eu.dnetlib.pace.config;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Maps;
+import eu.dnetlib.pace.model.ClusteringDef;
+import eu.dnetlib.pace.model.FieldDef;
+import eu.dnetlib.pace.util.PaceException;
+import org.antlr.stringtemplate.StringTemplate;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+
+import eu.dnetlib.pace.tree.support.TreeNodeDef;
+
+
+/**
+ * Deduplication configuration: the PACE model/clustering section plus the workflow section,
+ * loaded from JSON. Blacklist expressions are compiled once at load time.
+ */
+public class DedupConfig implements Config, Serializable {
+
+    private static final Log log = LogFactory.getLog(DedupConfig.class);
+
+    /** Template of the default configuration, resolved relative to this class. */
+    private static final String CONFIG_TEMPLATE = "dedupConfig.st";
+
+    private PaceConfig pace;
+
+    private WfConfig wf;
+
+    /** Compiled blacklist patterns, derived from {@code pace.getBlacklists()} by {@link #load(String)}. */
+    @JsonIgnore
+    private Map<String, List<Pattern>> blacklists;
+
+    /** Default values of the template attributes. */
+    private static final Map<String, String> defaults = Maps.newHashMap();
+
+    static {
+        defaults.put("dedupRun", "001");
+        defaults.put("entityType", "result");
+        defaults.put("subEntityType", "resulttype");
+        defaults.put("subEntityValue", "publication");
+        defaults.put("orderField", "title");
+        defaults.put("queueMaxSize", "2000");
+        defaults.put("groupMaxSize", "10");
+        defaults.put("slidingWindowSize", "200");
+        defaults.put("rootBuilder", "result");
+        defaults.put("includeChildren", "true");
+        defaults.put("maxIterations", "20");
+        defaults.put("idPath", "$.id");
+    }
+
+    public DedupConfig() {}
+
+    /**
+     * Parses a JSON configuration and initialises the derived structures
+     * (model map, translation map, compiled blacklists).
+     *
+     * @param json the configuration
+     * @return the configuration bean
+     * @throws PaceException when the JSON cannot be parsed
+     */
+    public static DedupConfig load(final String json) {
+
+        final DedupConfig config;
+        try {
+            config = new ObjectMapper().readValue(json, DedupConfig.class);
+            config.getPace().initModel();
+            config.getPace().initTranslationMap();
+
+            final Map<String, List<String>> expressions = config.getPace().getBlacklists();
+            if (expressions == null) {
+                // a configuration without blacklists is legal: expose an empty map
+                config.blacklists = new HashMap<>();
+            } else {
+                config.blacklists = expressions.entrySet()
+                        .stream()
+                        .collect(Collectors.toMap(
+                                Entry::getKey,
+                                e -> e.getValue().stream()
+                                        .filter(StringUtils::isNotBlank)
+                                        .map(Pattern::compile)
+                                        .collect(Collectors.toList())));
+            }
+
+            return config;
+        } catch (IOException e) {
+            throw new PaceException("Error in parsing configuration json", e);
+        }
+
+    }
+
+    /** Loads the default configuration, i.e. the template filled with the default attribute values. */
+    public static DedupConfig loadDefault() throws IOException {
+        return loadDefault(new HashMap<>());
+    }
+
+    /**
+     * Loads the default configuration template, overriding/adding the given attributes.
+     *
+     * @param params attribute values replacing (or extending) the defaults
+     * @throws IOException when the template cannot be read
+     */
+    public static DedupConfig loadDefault(final Map<String, String> params) throws IOException {
+
+        final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE));
+
+        for (final Entry<String, String> e : defaults.entrySet()) {
+            template.setAttribute(e.getKey(), e.getValue());
+        }
+        for (final Entry<String, String> e : params.entrySet()) {
+            if (template.getAttribute(e.getKey()) != null) {
+                // replace the default in place: setAttribute would append a second value
+                template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue());
+            } else {
+                template.setAttribute(e.getKey(), e.getValue());
+            }
+        }
+
+        final String json = template.toString();
+        return load(json);
+    }
+
+    private String readFromClasspath(final String resource) throws IOException {
+        final java.net.URL url = getClass().getResource(resource);
+        if (url == null) {
+            throw new IOException("cannot find resource on classpath: " + resource);
+        }
+        return IOUtils.toString(url, StandardCharsets.UTF_8);
+    }
+
+    public PaceConfig getPace() {
+        return pace;
+    }
+
+    public void setPace(final PaceConfig pace) {
+        this.pace = pace;
+    }
+
+    public WfConfig getWf() {
+        return wf;
+    }
+
+    public void setWf(final WfConfig wf) {
+        this.wf = wf;
+    }
+
+    /** Serialises this configuration as JSON. */
+    @Override
+    public String toString() {
+        try {
+            return new ObjectMapper().writeValueAsString(this);
+        } catch (IOException e) {
+            throw new PaceException("unable to serialise configuration", e);
+        }
+    }
+
+    @Override
+    public Map<String, TreeNodeDef> decisionTree() {
+        return getPace().getDecisionTree();
+    }
+
+    @Override
+    public List<FieldDef> model() {
+        return getPace().getModel();
+    }
+
+    @Override
+    public Map<String, FieldDef> modelMap() {
+        return getPace().getModelMap();
+    }
+
+    @Override
+    public List<ClusteringDef> clusterings() {
+        return getPace().getClustering();
+    }
+
+    @Override
+    public Map<String, List<Pattern>> blacklists() {
+        return blacklists;
+    }
+
+    @Override
+    public Map<String, String> translationMap() {
+        return getPace().translationMap();
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
new file mode 100644
index 0000000000..5ae2edfb01
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
@@ -0,0 +1,110 @@
+package eu.dnetlib.pace.config;
+
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.google.common.collect.Maps;
+import com.ibm.icu.text.Transliterator;
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.model.ClusteringDef;
+import eu.dnetlib.pace.model.FieldDef;
+import eu.dnetlib.pace.tree.support.TreeNodeDef;
+import eu.dnetlib.pace.util.PaceResolver;
+
+import java.io.Serializable;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * PACE section of the deduplication configuration: field model, clustering functions,
+ * decision tree, blacklists and synonyms, plus the structures derived from them.
+ */
+public class PaceConfig extends AbstractPaceFunctions implements Serializable {
+
+    private List<FieldDef> model;
+
+    private List<ClusteringDef> clustering;
+    private Map<String, TreeNodeDef> decisionTree;
+
+    private Map<String, List<String>> blacklists;
+    private Map<String, List<String>> synonyms;
+
+    /** Normalized synonym term to synonym key; built by {@link #initTranslationMap()}. */
+    @JsonIgnore
+    private Map<String, String> translationMap;
+
+    /** Field name to field definition; built by {@link #initModel()}. */
+    @JsonIgnore
+    private Map<String, FieldDef> modelMap;
+
+    @JsonIgnore
+    public static PaceResolver resolver = new PaceResolver();
+
+    public PaceConfig() {}
+
+    /** Indexes the field definitions of the model by name. */
+    public void initModel() {
+        modelMap = Maps.newHashMap();
+        for (FieldDef fd : getModel()) {
+            modelMap.put(fd.getName(), fd);
+        }
+    }
+
+    /**
+     * Builds the translation map from the synonyms: every term is lower-cased,
+     * transliterated and alias-fixed, then mapped to the key it belongs to.
+     * A configuration without synonyms yields an empty map.
+     */
+    public void initTranslationMap() {
+        translationMap = Maps.newHashMap();
+        if (synonyms == null) {
+            return;
+        }
+
+        final Transliterator anyToEng = Transliterator.getInstance("Any-Eng");
+        for (final Map.Entry<String, List<String>> entry : synonyms.entrySet()) {
+            for (final String term : entry.getValue()) {
+                translationMap.put(
+                        fixAliases(anyToEng.transliterate(term.toLowerCase())),
+                        entry.getKey());
+            }
+        }
+    }
+
+    public Map<String, String> translationMap() {
+        return translationMap;
+    }
+
+    public List<FieldDef> getModel() {
+        return model;
+    }
+
+    public void setModel(final List<FieldDef> model) {
+        this.model = model;
+    }
+
+    public List<ClusteringDef> getClustering() {
+        return clustering;
+    }
+
+    public void setClustering(final List<ClusteringDef> clustering) {
+        this.clustering = clustering;
+    }
+
+    public Map<String, TreeNodeDef> getDecisionTree() {
+        return decisionTree;
+    }
+
+    public void setDecisionTree(Map<String, TreeNodeDef> decisionTree) {
+        this.decisionTree = decisionTree;
+    }
+
+    public Map<String, List<String>> getBlacklists() {
+        return blacklists;
+    }
+
+    public void setBlacklists(final Map<String, List<String>> blacklists) {
+        this.blacklists = blacklists;
+    }
+
+    public Map<String, List<String>> getSynonyms() {
+        return synonyms;
+    }
+
+    public void setSynonyms(Map<String, List<String>> synonyms) {
+        this.synonyms = synonyms;
+    }
+
+    public Map<String, FieldDef> getModelMap() {
+        return modelMap;
+    }
+
+    public void setModelMap(final Map<String, FieldDef> modelMap) {
+        this.modelMap = modelMap;
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java
new file mode 100644
index 0000000000..20981c4279
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java
@@ -0,0 +1,5 @@
+package eu.dnetlib.pace.config;
+
+/**
+ * Data types a model field can be declared with; exposed through {@code Field#getType()}.
+ */
+public enum Type {
+ String, Int, List, JSON, URL, StringConcat, DoubleArray
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java
new file mode 100644
index 0000000000..78fc18a134
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java
@@ -0,0 +1,292 @@
+package eu.dnetlib.pace.config;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.util.PaceException;
+import org.apache.commons.lang3.StringUtils;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+
+public class WfConfig implements Serializable {
+
+ /**
+ * Entity type.
+ */
+ private String entityType = "";
+
+ /**
+ * Sub-Entity type refers to one of fields declared in the model. See eu.dnetlib.pace.config.PaceConfig.modelMap
+ */
+ private String subEntityType = "";
+
+ /**
+ * Sub-Entity value declares a value for subTypes to be considered.
+ */
+ private String subEntityValue = "";
+
+ /**
+ * Field name used to sort the values in the reducer phase.
+ */
+ private String orderField = "";
+
+ /**
+ * Column Families involved in the relations redirection.
+ */
+ private List rootBuilder = Lists.newArrayList();
+
+ /**
+ * Set of datasource namespace prefixes that won't be deduplicated.
+ */
+ private Set skipList = Sets.newHashSet();
+
+ /**
+ * Subprefix used to build the root id, allows multiple dedup runs.
+ */
+ private String dedupRun = "";
+
+ /**
+ * Similarity threshold.
+ */
+ private double threshold = 0;
+
+ /** The queue max size. */
+ private int queueMaxSize = 2000;
+
+ /** The group max size. */
+ private int groupMaxSize;
+
+ /** The sliding window size. */
+ private int slidingWindowSize;
+
+ /** The configuration id. */
+ private String configurationId;
+
+ /** The include children. */
+ private boolean includeChildren;
+
+ /** Default maximum number of allowed children. */
+ private final static int MAX_CHILDREN = 10;
+
+ /** Maximum number of allowed children. */
+ private int maxChildren = MAX_CHILDREN;
+
+
+ /** Default maximum number of iterations. */
+ private final static int MAX_ITERATIONS = 20;
+
+ /** Maximum number of iterations */
+ private int maxIterations = MAX_ITERATIONS;
+
+ /** The JSONPath expression used to retrieve the identifier */
+ private String idPath = "$.id";
+
+ public WfConfig() {}
+
+ /**
+ * Instantiates a new workflow configuration. Fields without a matching parameter
+ * keep their default value.
+ *
+ * @param entityType
+ * the entity type
+ * @param orderField
+ * the order field
+ * @param rootBuilder
+ * the root builder families
+ * @param dedupRun
+ * the dedup run; single quotes are stripped from the value
+ * @param skipList
+ * the skip list
+ * @param queueMaxSize
+ * the queue max size
+ * @param groupMaxSize
+ * the group max size
+ * @param slidingWindowSize
+ * the sliding window size
+ * @param includeChildren
+ * allows the children to be included in the representative records or not.
+ * @param maxIterations
+ * the maximum number of iterations
+ * @param idPath
+ * the path for the id of the entity
+ */
+ public WfConfig(final String entityType, final String orderField, final List rootBuilder, final String dedupRun,
+ final Set skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations, final String idPath) {
+ super();
+ this.entityType = entityType;
+ this.orderField = orderField;
+ this.rootBuilder = rootBuilder;
+ this.dedupRun = cleanupStringNumber(dedupRun);
+ this.skipList = skipList;
+ this.queueMaxSize = queueMaxSize;
+ this.groupMaxSize = groupMaxSize;
+ this.slidingWindowSize = slidingWindowSize;
+ this.includeChildren = includeChildren;
+ this.maxIterations = maxIterations;
+ this.idPath = idPath;
+ }
+
+ /**
+  * Strips every single quote from a value.
+  *
+  * @param s
+  *            the value to clean, must not be {@code null}
+  * @return the value without single quotes
+  */
+ private String cleanupStringNumber(final String s) {
+     return s.replace("'", "");
+ }
+
+ public boolean hasSubType() {
+ return StringUtils.isNotBlank(getSubEntityType()) && StringUtils.isNotBlank(getSubEntityValue());
+ }
+
+ public String getEntityType() {
+ return entityType;
+ }
+
+ public void setEntityType(final String entityType) {
+ this.entityType = entityType;
+ }
+
+ public String getSubEntityType() {
+ return subEntityType;
+ }
+
+ public void setSubEntityType(final String subEntityType) {
+ this.subEntityType = subEntityType;
+ }
+
+ public String getSubEntityValue() {
+ return subEntityValue;
+ }
+
+ public void setSubEntityValue(final String subEntityValue) {
+ this.subEntityValue = subEntityValue;
+ }
+
+ public String getOrderField() {
+ return orderField;
+ }
+
+ public void setOrderField(final String orderField) {
+ this.orderField = orderField;
+ }
+
+ public List getRootBuilder() {
+ return rootBuilder;
+ }
+
+ public void setRootBuilder(final List rootBuilder) {
+ this.rootBuilder = rootBuilder;
+ }
+
+ public Set getSkipList() {
+ return skipList != null ? skipList : new HashSet();
+ }
+
+ public void setSkipList(final Set skipList) {
+ this.skipList = skipList;
+ }
+
+ public String getDedupRun() {
+ return dedupRun;
+ }
+
+ public void setDedupRun(final String dedupRun) {
+ this.dedupRun = dedupRun;
+ }
+
+ public double getThreshold() {
+ return threshold;
+ }
+
+ public void setThreshold(final double threshold) {
+ this.threshold = threshold;
+ }
+
+ public int getQueueMaxSize() {
+ return queueMaxSize;
+ }
+
+ public void setQueueMaxSize(final int queueMaxSize) {
+ this.queueMaxSize = queueMaxSize;
+ }
+
+ public int getGroupMaxSize() {
+ return groupMaxSize;
+ }
+
+ public void setGroupMaxSize(final int groupMaxSize) {
+ this.groupMaxSize = groupMaxSize;
+ }
+
+ public int getSlidingWindowSize() {
+ return slidingWindowSize;
+ }
+
+ public void setSlidingWindowSize(final int slidingWindowSize) {
+ this.slidingWindowSize = slidingWindowSize;
+ }
+
+ public String getConfigurationId() {
+ return configurationId;
+ }
+
+ public void setConfigurationId(final String configurationId) {
+ this.configurationId = configurationId;
+ }
+
+ public boolean isIncludeChildren() {
+ return includeChildren;
+ }
+
+ public void setIncludeChildren(final boolean includeChildren) {
+ this.includeChildren = includeChildren;
+ }
+
+ public int getMaxChildren() {
+ return maxChildren;
+ }
+
+ public void setMaxChildren(final int maxChildren) {
+ this.maxChildren = maxChildren;
+ }
+
+
+ public int getMaxIterations() {
+ return maxIterations;
+ }
+
+ public void setMaxIterations(int maxIterations) {
+ this.maxIterations = maxIterations;
+ }
+
+ public String getIdPath() {
+ return idPath;
+ }
+
+ public void setIdPath(String idPath) {
+ this.idPath = idPath;
+
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#toString()
+ */
+ @Override
+ public String toString() {
+ try {
+ return new ObjectMapper().writeValueAsString(this);
+ } catch (IOException e) {
+ throw new PaceException("unable to serialise " + this.getClass().getName(), e);
+ }
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java
new file mode 100644
index 0000000000..b418b75bbd
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java
@@ -0,0 +1,74 @@
+package eu.dnetlib.pace.model;
+
+import eu.dnetlib.pace.config.Type;
+
+/**
+ * Base implementation of {@link Field} holding the two properties shared by all
+ * fields: the name and the type (which defaults to {@code Type.String}).
+ */
+public abstract class AbstractField implements Field {
+
+ /** The type of the field; {@code Type.String} unless set otherwise. */
+ protected Type type = Type.String;
+
+ /** The name of the field. */
+ protected String name;
+
+ /**
+ * Instantiates a new abstract field with the default type and no name.
+ */
+ protected AbstractField() {}
+
+ /**
+ * Instantiates a new abstract field.
+ *
+ * @param type
+ * the type
+ * @param name
+ * the name
+ */
+ protected AbstractField(final Type type, final String name) {
+ this.type = type;
+ this.name = name;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.Field#getName()
+ */
+ @Override
+ public String getName() {
+ return name;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.Field#getType()
+ */
+ @Override
+ public Type getType() {
+ return type;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.Field#setName(java.lang.String)
+ */
+ @Override
+ public void setName(final String name) {
+ this.name = name;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.Field#setType(eu.dnetlib.pace.config.Type)
+ */
+ @Override
+ public void setType(final Type type) {
+ this.type = type;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java
new file mode 100644
index 0000000000..c15885ecf8
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java
@@ -0,0 +1,61 @@
+package eu.dnetlib.pace.model;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.pace.clustering.ClusteringFunction;
+import eu.dnetlib.pace.config.PaceConfig;
+import eu.dnetlib.pace.util.PaceException;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.List;
+import java.util.Map;
+
+
+/**
+ * Definition of a clustering step: the name of the clustering function, the fields it
+ * applies to and the parameters handed to the function.
+ */
+public class ClusteringDef implements Serializable {
+
+ // name under which the clustering function is looked up
+ private String name;
+
+ // fields the clustering applies to
+ private List fields;
+
+ // parameters passed to the clustering function when it is resolved
+ private Map params;
+
+ public ClusteringDef() {}
+
+ public String getName() {
+ return name;
+ }
+
+ public void setName(final String name) {
+ this.name = name;
+ }
+
+ /**
+ * Resolves the clustering function registered under {@link #getName()} through
+ * {@code PaceConfig.resolver}, passing it the configured parameters.
+ */
+ public ClusteringFunction clusteringFunction() {
+ return PaceConfig.resolver.getClusteringFunction(getName(), params);
+ }
+
+ public List getFields() {
+ return fields;
+ }
+
+ public void setFields(final List fields) {
+ this.fields = fields;
+ }
+
+ public Map getParams() {
+ return params;
+ }
+
+ public void setParams(final Map params) {
+ this.params = params;
+ }
+
+ /**
+ * Serialises this definition as JSON.
+ *
+ * @throws PaceException when the serialisation fails
+ */
+ @Override
+ public String toString() {
+ try {
+ return new ObjectMapper().writeValueAsString(this);
+ } catch (IOException e) {
+ throw new PaceException("unable to serialise " + this.getClass().getName(), e);
+ }
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java
new file mode 100644
index 0000000000..fcacadd6fc
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java
@@ -0,0 +1,39 @@
+package eu.dnetlib.pace.model;
+
+import java.util.Set;
+
+/**
+ * Models the common read operations available on a Pace document.
+ */
+public interface Document {
+
+    /**
+     * Gets the identifier.
+     *
+     * @return the document identifier
+     */
+    String getIdentifier();
+
+    /**
+     * Gets the fields of the document.
+     *
+     * @return an iterable over the document's fields
+     */
+    Iterable<Field> fields();
+
+    /**
+     * Gets the field stored under the given name.
+     *
+     * @param name
+     *            the field name
+     * @return the field associated with {@code name}
+     */
+    Field values(String name);
+
+    /**
+     * Gets the names of the fields held by the document.
+     *
+     * @return the set of field names
+     */
+    Set<String> fieldNames();
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java
new file mode 100644
index 0000000000..4b7a73e31b
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java
@@ -0,0 +1,56 @@
+package eu.dnetlib.pace.model;
+
+import eu.dnetlib.pace.config.Type;
+
+import java.io.Serializable;
+
+/**
+ * A named, typed value of a Pace document. A field is iterable over fields and serializable.
+ */
+public interface Field extends Iterable<Field>, Serializable {
+
+    /**
+     * Gets the name.
+     *
+     * @return the name
+     */
+    String getName();
+
+    /**
+     * Sets the name.
+     *
+     * @param name
+     *            the new name
+     */
+    void setName(String name);
+
+    /**
+     * Gets the type.
+     *
+     * @return the type
+     */
+    Type getType();
+
+    /**
+     * Sets the type.
+     *
+     * @param type
+     *            the new type
+     */
+    void setType(Type type);
+
+    /**
+     * Checks whether the field is empty.
+     *
+     * @return true, if the field is empty
+     */
+    boolean isEmpty();
+
+    /**
+     * Gets the string representation of the field value.
+     *
+     * @return the value as a string
+     */
+    String stringValue();
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
new file mode 100644
index 0000000000..f7831edaaa
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
@@ -0,0 +1,114 @@
+package eu.dnetlib.pace.model;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Type;
+
+import java.io.Serializable;
+import java.util.List;
+
+/**
+ * The schema is composed of field definitions (FieldDef). Each definition carries a name, a path, a type and
+ * optional bounds on the size and length of the values.
+ */
+public class FieldDef implements Serializable {
+
+    /** Separator used to split {@link #getPath()} into its segments. */
+    public final static String PATH_SEPARATOR = "/";
+
+    private String name;
+
+    private String path;
+
+    private Type type;
+
+    private boolean overrideMatch;
+
+    /**
+     * Sets maximum size for the repeatable fields in the model. -1 for unbounded size.
+     */
+    private int size = -1;
+
+    /**
+     * Sets maximum length for field values in the model. -1 for unbounded length.
+     */
+    private int length = -1;
+
+    /** Creates an empty definition; the properties are populated through the setters. */
+    public FieldDef() {}
+
+    /**
+     * Builds a field named after this definition from a raw string.
+     *
+     * @param type
+     *            the type of field to create (this argument, not the type of the definition, drives the switch)
+     * @param s
+     *            the raw value; parsed as an integer for {@code Int}, wrapped as-is for {@code String} and not
+     *            used for {@code List}, which yields an empty list field
+     * @return the new field
+     * @throws NumberFormatException
+     *             if {@code type} is {@code Int} and {@code s} is not a parsable integer
+     * @throws IllegalArgumentException
+     *             for any other type
+     */
+    public Field apply(final Type type, final String s) {
+        switch (type) {
+        case Int:
+            return new FieldValueImpl(type, name, Integer.parseInt(s));
+        case String:
+            return new FieldValueImpl(type, name, s);
+        case List:
+            return new FieldListImpl(name, type);
+        default:
+            throw new IllegalArgumentException("Casting not implemented for type " + type);
+        }
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public String getPath() {
+        return path;
+    }
+
+    /**
+     * Splits the path on {@link #PATH_SEPARATOR}.
+     *
+     * @return a new list holding the path segments
+     */
+    public List<String> getPathList() {
+        return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath()));
+    }
+
+    public Type getType() {
+        return type;
+    }
+
+    public void setType(final Type type) {
+        this.type = type;
+    }
+
+    public boolean isOverrideMatch() {
+        return overrideMatch;
+    }
+
+    public void setOverrideMatch(final boolean overrideMatch) {
+        this.overrideMatch = overrideMatch;
+    }
+
+    public int getSize() {
+        return size;
+    }
+
+    public void setSize(int size) {
+        this.size = size;
+    }
+
+    public int getLength() {
+        return length;
+    }
+
+    public void setLength(int length) {
+        this.length = length;
+    }
+
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    public void setPath(String path) {
+        this.path = path;
+    }
+
+    /**
+     * Renders this definition as JSON; falls back to a plain description when the JSON serialisation fails.
+     *
+     * @return a non-null textual representation
+     */
+    @Override
+    public String toString() {
+        try {
+            return new ObjectMapper().writeValueAsString(this);
+        } catch (JsonProcessingException e) {
+            // toString() must not return null: callers concatenate and log the result
+            return "FieldDef[name=" + name + ", path=" + path + ", type=" + type + "]";
+        }
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java
new file mode 100644
index 0000000000..b1f5422b7b
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java
@@ -0,0 +1,24 @@
+package eu.dnetlib.pace.model;
+
+import java.util.List;
+
+/**
+ * A field that is itself a list of fields.
+ */
+public interface FieldList extends List<Field>, Field {
+
+    /**
+     * Gets the string values of the contained fields.
+     *
+     * @return the list of string values
+     */
+    List<String> stringList();
+
+    /**
+     * Gets the values of the contained fields as doubles.
+     *
+     * @return the double[] array
+     */
+    double[] doubleArray();
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java
new file mode 100644
index 0000000000..635178b835
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java
@@ -0,0 +1,338 @@
+package eu.dnetlib.pace.model;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.base.Function;
+import com.google.common.base.Joiner;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Type;
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.ListIterator;
+
+/**
+ * {@link FieldList} implementation backed by an in-memory list; every {@link List} operation is delegated to it.
+ */
+public class FieldListImpl extends AbstractField implements FieldList {
+
+    /** The wrapped fields. */
+    private List<Field> fields;
+
+    /**
+     * Instantiates a new, empty field list.
+     */
+    public FieldListImpl() {
+        fields = Lists.newArrayList();
+    }
+
+    /**
+     * Instantiates a new, empty field list.
+     *
+     * @param name
+     *            the name
+     * @param type
+     *            the type
+     */
+    public FieldListImpl(final String name, final Type type) {
+        super(type, name);
+        fields = Lists.newArrayList();
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public boolean add(final Field f) {
+        return fields.add(f);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public void add(final int i, final Field f) {
+        fields.add(i, f);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public boolean addAll(final Collection<? extends Field> f) {
+        return fields.addAll(f);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public boolean addAll(final int i, final Collection<? extends Field> f) {
+        return fields.addAll(i, f);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public void clear() {
+        fields.clear();
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public boolean contains(final Object o) {
+        return fields.contains(o);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public boolean containsAll(final Collection<?> f) {
+        return fields.containsAll(f);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public Field get(final int i) {
+        return fields.get(i);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public int indexOf(final Object o) {
+        return fields.indexOf(o);
+    }
+
+    /**
+     * Checks whether every contained field is empty; a list without elements is therefore empty too.
+     *
+     * @return true, if no contained field is non-empty
+     */
+    @Override
+    public boolean isEmpty() {
+        return fields.stream().allMatch(Field::isEmpty);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public Iterator<Field> iterator() {
+        return fields.iterator();
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public int lastIndexOf(final Object o) {
+        return fields.lastIndexOf(o);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public ListIterator<Field> listIterator() {
+        return fields.listIterator();
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public ListIterator<Field> listIterator(final int i) {
+        return fields.listIterator(i);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public boolean remove(final Object o) {
+        return fields.remove(o);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public Field remove(final int i) {
+        return fields.remove(i);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public boolean removeAll(final Collection<?> f) {
+        return fields.removeAll(f);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public boolean retainAll(final Collection<?> f) {
+        return fields.retainAll(f);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public Field set(final int i, final Field f) {
+        return fields.set(i, f);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public int size() {
+        return fields.size();
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public List<Field> subList(final int from, final int to) {
+        return fields.subList(from, to);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public Object[] toArray() {
+        return fields.toArray();
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public <T> T[] toArray(final T[] t) {
+        return fields.toArray(t);
+    }
+
+    /**
+     * Renders the list as a single string: the space-joined string values for the {@code List}, {@code Int} and
+     * {@code String} types, the Jackson JSON serialisation of this list for {@code JSON}.
+     *
+     * @return the string value; {@code null} when the JSON serialisation fails
+     * @throws IllegalArgumentException
+     *             for any other type
+     */
+    @Override
+    public String stringValue() {
+        switch (getType()) {
+        case List:
+        case Int:
+        case String:
+            return Joiner.on(" ").join(stringList());
+        case JSON:
+            try {
+                return new ObjectMapper().writeValueAsString(this);
+            } catch (JsonProcessingException e) {
+                // best effort: a value that cannot be serialised is reported as a missing (null) value
+                return null;
+            }
+        default:
+            throw new IllegalArgumentException("Unknown type: " + getType().toString());
+        }
+    }
+
+    /**
+     * Collects the string value of every contained field, in list order.
+     *
+     * @return a new list with the string values
+     */
+    @Override
+    public List<String> stringList() {
+        return Lists.newArrayList(Iterables.transform(fields, Field::stringValue));
+    }
+
+    /**
+     * Parses the string value of every contained field as a double, in list order.
+     *
+     * @return the parsed values
+     * @throws NumberFormatException
+     *             if a string value is not a parsable double
+     */
+    @Override
+    public double[] doubleArray() {
+        return fields.stream().mapToDouble(f -> Double.parseDouble(f.stringValue())).toArray();
+    }
+
+    /**
+     * @return the string form of {@link #stringList()}
+     */
+    @Override
+    public String toString() {
+        return stringList().toString();
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java
new file mode 100644
index 0000000000..ebe474363a
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java
@@ -0,0 +1,25 @@
+package eu.dnetlib.pace.model;
+
+/**
+ * A {@link Field} wrapping a single value.
+ */
+public interface FieldValue extends Field {
+
+    /**
+     * Returns the wrapped value.
+     *
+     * @return the value
+     */
+    Object getValue();
+
+    /**
+     * Replaces the wrapped value.
+     *
+     * @param value
+     *            the new value
+     */
+    void setValue(Object value);
+
+    /**
+     * Returns the wrapped value as an array of doubles.
+     *
+     * @return the value as a {@code double[]}
+     */
+    double[] doubleArrayValue();
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java
new file mode 100644
index 0000000000..a235315d97
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java
@@ -0,0 +1,136 @@
+package eu.dnetlib.pace.model;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import eu.dnetlib.pace.config.Type;
+import org.apache.commons.lang3.StringUtils;
+
+/**
+ * {@link FieldValue} implementation holding a single, untyped value.
+ */
+public class FieldValueImpl extends AbstractField implements FieldValue {
+
+    /** The value. */
+    private Object value = null;
+
+    /**
+     * Instantiates a new field value without name, type or value.
+     */
+    public FieldValueImpl() {}
+
+    /**
+     * Instantiates a new field value.
+     *
+     * @param type
+     *            the type
+     * @param name
+     *            the name
+     * @param value
+     *            the value
+     */
+    public FieldValueImpl(final Type type, final String name, final Object value) {
+        super(type, name);
+        this.value = value;
+    }
+
+    /**
+     * Checks whether the field carries no usable value. A {@code null} value is always empty; otherwise the
+     * check depends on the type: empty string for {@code String} and {@code JSON}, empty list or empty first
+     * element for {@code List}, blank or malformed URL for {@code URL}, zero-length array for
+     * {@code DoubleArray}. Every other type is reported as empty.
+     *
+     * @return true, if the field is empty
+     * @throws RuntimeException
+     *             if the type is {@code List} but the value cannot be inspected as a list of field values
+     */
+    @Override
+    public boolean isEmpty() {
+        // a missing value is empty; otherwise stringValue() would hand the literal "null" to the callers
+        if (value == null) return true;
+
+        switch (type) {
+        case String:
+        case JSON:
+            return value.toString().isEmpty();
+        case List:
+            try {
+                final List<?> list = (List<?>) value;
+                return list.isEmpty() || ((FieldValueImpl) list.get(0)).isEmpty();
+            } catch (Exception e) {
+                // keep the original failure as the cause so that it can be diagnosed
+                throw new RuntimeException(value.toString(), e);
+            }
+        case URL:
+            final String str = value.toString();
+            return StringUtils.isBlank(str) || !isValidURL(str);
+        case DoubleArray:
+            return doubleArrayValue().length == 0;
+        default:
+            // NOTE(review): this also reports Int values as empty - confirm that is intended
+            return true;
+        }
+    }
+
+    /** Tells whether the string can be parsed by {@link URL}. */
+    private boolean isValidURL(final String s) {
+        try {
+            new URL(s);
+            return true;
+        } catch (MalformedURLException e) {
+            return false;
+        }
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public Object getValue() {
+        return value;
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public void setValue(final Object value) {
+        this.value = value;
+    }
+
+    /**
+     * Renders the value through {@link String#valueOf(Object)}, whatever the type of the field.
+     *
+     * @return the string form of the value
+     */
+    @Override
+    public String stringValue() {
+        return String.valueOf(getValue());
+    }
+
+    /**
+     * Casts the value to {@code double[]}.
+     *
+     * @return the value as an array of doubles
+     * @throws ClassCastException
+     *             if the value is not a {@code double[]}
+     */
+    @Override
+    public double[] doubleArrayValue() {
+        return (double[]) getValue();
+    }
+
+    /**
+     * Iterates over this field only.
+     *
+     * @return an iterator over a singleton holding this instance
+     */
+    @Override
+    public Iterator<Field> iterator() {
+        return Collections.singleton((Field) this).iterator();
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java
new file mode 100644
index 0000000000..77b7c120e1
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java
@@ -0,0 +1,147 @@
+package eu.dnetlib.pace.model;
+
+import java.io.Serializable;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+/**
+ * {@link Document} implementation keeping its fields in a map keyed by field name.
+ */
+public class MapDocument implements Document, Serializable {
+
+    /** The identifier. */
+    private String identifier;
+
+    /** The field map, keyed by field name. */
+    private Map<String, Field> fieldMap;
+
+    /**
+     * Instantiates a new map document without identifier and with an empty field map.
+     */
+    public MapDocument() {
+        identifier = null;
+        fieldMap = Maps.newHashMap();
+    }
+
+    /**
+     * Instantiates a new map document.
+     *
+     * @param identifier
+     *            the identifier
+     * @param fieldMap
+     *            the field map; stored as-is, not copied
+     */
+    public MapDocument(final String identifier, final Map<String, Field> fieldMap) {
+        this.setIdentifier(identifier);
+        this.fieldMap = fieldMap;
+    }
+
+    /**
+     * Instantiates a new map document from its serialised form.
+     *
+     * @param identifier
+     *            the identifier
+     * @param data
+     *            the serialised document, decoded through {@link MapDocumentSerializer#decode(byte[])}
+     */
+    public MapDocument(final String identifier, final byte[] data) {
+        final MapDocument doc = MapDocumentSerializer.decode(data);
+
+        this.fieldMap = doc.fieldMap;
+        // NOTE(review): the identifier argument is not used, the decoded one wins - confirm that is intended
+        this.identifier = doc.identifier;
+    }
+
+    /**
+     * Flattens the fields held by the map values into a new list.
+     *
+     * @return the fields obtained by iterating over every value of the field map
+     */
+    @Override
+    public Iterable<Field> fields() {
+        return Lists.newArrayList(Iterables.concat(fieldMap.values()));
+    }
+
+    /**
+     * Looks the field up in the field map.
+     *
+     * @param name
+     *            the field name
+     * @return the field stored under {@code name}, {@code null} when the map has no such entry
+     */
+    @Override
+    public Field values(final String name) {
+        return fieldMap.get(name);
+    }
+
+    /**
+     * @return the key set of the field map
+     */
+    @Override
+    public Set<String> fieldNames() {
+        return fieldMap.keySet();
+    }
+
+    /**
+     * @return the representation produced by {@link MapDocumentSerializer#toString(MapDocument)}
+     */
+    @Override
+    public String toString() {
+        return MapDocumentSerializer.toString(this);
+    }
+
+    /**
+     * Serialises the document through {@link MapDocumentSerializer#toByteArray(MapDocument)}.
+     *
+     * @return the byte[]
+     */
+    public byte[] toByteArray() {
+        return MapDocumentSerializer.toByteArray(this);
+    }
+
+    /** {@inheritDoc} */
+    @Override
+    public String getIdentifier() {
+        return identifier;
+    }
+
+    /**
+     * Sets the identifier.
+     *
+     * @param identifier
+     *            the new identifier
+     */
+    public void setIdentifier(final String identifier) {
+        this.identifier = identifier;
+    }
+
+    /**
+     * Gets the field map.
+     *
+     * @return the field map itself, not a copy
+     */
+    public Map<String, Field> getFieldMap() {
+        return fieldMap;
+    }
+
+    /**
+     * Sets the field map.
+     *
+     * @param fieldMap
+     *            the field map
+     */
+    public void setFieldMap(final Map<String, Field> fieldMap) {
+        this.fieldMap = fieldMap;
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java
new file mode 100644
index 0000000000..7217b2b59a
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java
@@ -0,0 +1,50 @@
+package eu.dnetlib.pace.model;
+
+import java.util.Comparator;
+
+import com.google.common.collect.Iterables;
+
+import eu.dnetlib.pace.clustering.NGramUtils;
+
+/**
+ * Orders documents by the cleaned-up string value of the first element of a given field.
+ */
+public class MapDocumentComparator implements Comparator<Document> {
+
+    /** The name of the field the documents are compared on. */
+    private String comparatorField;
+
+    /** Default handed to {@code Iterables.getFirst} when a field has no element. */
+    private final FieldList emptyField = new FieldListImpl();
+
+    /**
+     * Instantiates a new map document comparator.
+     *
+     * @param comparatorField
+     *            the comparator field
+     */
+    public MapDocumentComparator(final String comparatorField) {
+        this.comparatorField = comparatorField;
+    }
+
+    /**
+     * Compares two documents on the comparator field. The documents are reported as equal (0) when either of
+     * them lacks the field, has it empty, or yields a {@code null} string value; otherwise the string values are
+     * passed through {@code NGramUtils.cleanupForOrdering} and compared lexicographically.
+     *
+     * @param d1
+     *            the first document
+     * @param d2
+     *            the second document
+     * @return the comparison result
+     */
+    @Override
+    public int compare(final Document d1, final Document d2) {
+
+        final Field f1 = d1.values(comparatorField);
+        final Field f2 = d2.values(comparatorField);
+
+        // a document that does not carry the field at all is handled like one carrying it empty
+        if ((f1 == null) || (f2 == null) || f1.isEmpty() || f2.isEmpty()) return 0;
+
+        final String o1 = Iterables.getFirst(f1, emptyField).stringValue();
+        final String o2 = Iterables.getFirst(f2, emptyField).stringValue();
+
+        if ((o1 == null) || (o2 == null)) return 0;
+
+        final String to1 = NGramUtils.cleanupForOrdering(o1);
+        final String to2 = NGramUtils.cleanupForOrdering(o2);
+
+        return to1.compareTo(to2);
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java
new file mode 100644
index 0000000000..e5b3522dfb
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java
@@ -0,0 +1,101 @@
+package eu.dnetlib.pace.model;
+
+import java.lang.reflect.Type;
+
+import com.google.gson.GsonBuilder;
+import com.google.gson.InstanceCreator;
+import com.google.gson.JsonDeserializationContext;
+import com.google.gson.JsonDeserializer;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonParseException;
+
+/**
+ * Gson-based conversion of {@link MapDocument} instances from and to JSON strings and byte arrays. Byte arrays
+ * are always read and written as UTF-8.
+ */
+public class MapDocumentSerializer implements InstanceCreator<MapDocument> {
+
+    /**
+     * @return a new, empty {@link MapDocument}
+     */
+    @Override
+    public MapDocument createInstance(final Type type) {
+        return new MapDocument();
+    }
+
+    /**
+     * Decodes a document from its JSON form. Every {@link Field} is read as a {@link FieldListImpl}: a JSON
+     * object contributes one entry, a JSON array one entry per contained JSON object (other array elements are
+     * skipped), anything else yields an empty list.
+     *
+     * @param s
+     *            the JSON string
+     * @return the map document
+     */
+    public static MapDocument decode(final String s) {
+        final GsonBuilder gson = new GsonBuilder();
+
+        gson.registerTypeAdapter(Field.class, new JsonDeserializer<Field>() {
+
+            @Override
+            public Field deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException {
+                final FieldListImpl fl = new FieldListImpl();
+                if (json.isJsonObject()) {
+
+                    fl.add(handleJsonObject(json.getAsJsonObject()));
+
+                } else if (json.isJsonArray()) {
+
+                    for (final JsonElement e : json.getAsJsonArray()) {
+                        if (e.isJsonObject()) {
+                            fl.add(handleJsonObject(e.getAsJsonObject()));
+                        }
+                    }
+                }
+                return fl;
+            }
+
+            // wraps the "name", "type" and "value" members of the object into a one-element field list
+            private Field handleJsonObject(final JsonObject o) {
+                final FieldListImpl fl = new FieldListImpl();
+                final String name = o.get("name").getAsString();
+                final String type = o.get("type").getAsString();
+                final String value = o.get("value").getAsString();
+                fl.add(new FieldValueImpl(eu.dnetlib.pace.config.Type.valueOf(type), name, value));
+                return fl;
+            }
+        });
+
+        return gson.create().fromJson(s, MapDocument.class);
+    }
+
+    /**
+     * Decodes a document from the UTF-8 bytes of its JSON form.
+     *
+     * @param bytes
+     *            the bytes
+     * @return the map document
+     */
+    public static MapDocument decode(final byte[] bytes) {
+        // explicit charset: the platform default would make the result depend on the JVM configuration
+        return decode(new String(bytes, java.nio.charset.StandardCharsets.UTF_8));
+    }
+
+    /**
+     * Serialises the document as pretty-printed JSON.
+     *
+     * @param doc
+     *            the doc
+     * @return the string
+     */
+    public static String toString(final MapDocument doc) {
+        final GsonBuilder b = new GsonBuilder();
+        return b.setPrettyPrinting().create().toJson(doc);
+
+    }
+
+    /**
+     * Serialises the document as the UTF-8 bytes of its pretty-printed JSON form.
+     *
+     * @param doc
+     *            the doc
+     * @return the byte[]
+     */
+    public static byte[] toByteArray(final MapDocument doc) {
+        // same charset as decode(byte[]) so that the round trip is lossless on every platform
+        return toString(doc).getBytes(java.nio.charset.StandardCharsets.UTF_8);
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
new file mode 100644
index 0000000000..543b1bdfe7
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
@@ -0,0 +1,155 @@
+package eu.dnetlib.pace.model;
+
+import java.nio.charset.Charset;
+import java.text.Normalizer;
+import java.util.List;
+import java.util.Set;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.hash.Hashing;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.util.Capitalise;
+import eu.dnetlib.pace.util.DotAbbreviations;
+
+/**
+ * Splits a raw person name into name, surname and full name terms.
+ */
+public class Person {
+
+    private static final String UTF8 = "UTF-8";
+    private List<String> name = Lists.newArrayList();
+    private List<String> surname = Lists.newArrayList();
+    private List<String> fullname = Lists.newArrayList();
+    private final String original;
+
+    /** Name particles, loaded from the classpath the first time a name is split. */
+    private static Set<String> particles = null;
+
+    /**
+     * Parses a raw person name. The string is NFD-normalised; bracketed parts, digits and punctuation other than
+     * comma and hyphen are dropped and whitespace is collapsed. When a comma is present, the text before it is
+     * the surname and the text after it the name. Otherwise the terms are split on the last one-letter term
+     * (the initials and what precedes them form the name) or, failing that, on upper-case terms (which form the
+     * surname). When no rule applies only the full name is populated.
+     *
+     * @param s
+     *            the raw name
+     * @param aggressive
+     *            whether combining diacritical marks are removed too
+     */
+    public Person(String s, final boolean aggressive) {
+        original = s;
+        s = Normalizer.normalize(s, Normalizer.Form.NFD);
+        s = s.replaceAll("\\(.+\\)", "");
+        s = s.replaceAll("\\[.+\\]", "");
+        s = s.replaceAll("\\{.+\\}", "");
+        s = s.replaceAll("\\s+-\\s+", "-");
+        s = s.replaceAll("[\\p{Punct}&&[^,-]]", " ");
+        s = s.replaceAll("\\d", " ");
+        s = s.replaceAll("\\n", " ");
+        s = s.replaceAll("\\.", " ");
+        s = s.replaceAll("\\s+", " ");
+
+        if (aggressive) {
+            s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", "");
+        }
+
+        if (s.contains(",")) { // with a comma, name and surname are easily derived
+            final String[] arr = s.split(",");
+            if (arr.length == 1) {
+                fullname = splitTerms(arr[0]);
+            } else if (arr.length > 1) {
+                surname = splitTerms(arr[0]);
+                name = splitTerms(arr[1]);
+                fullname.addAll(surname);
+                fullname.addAll(name);
+            }
+        } else {
+            fullname = splitTerms(s);
+
+            int lastInitialPosition = fullname.size();
+            boolean hasSurnameInUpperCase = false;
+
+            for (int i = 0; i < fullname.size(); i++) {
+                final String term = fullname.get(i);
+                if (term.length() == 1) {
+                    lastInitialPosition = i;
+                } else if (term.equals(term.toUpperCase())) {
+                    hasSurnameInUpperCase = true;
+                }
+            }
+
+            if (lastInitialPosition < (fullname.size() - 1)) { // Case: Michele G. Artini
+                // copies, not subList views: a view breaks as soon as fullname is structurally modified
+                name = Lists.newArrayList(fullname.subList(0, lastInitialPosition + 1));
+                surname = Lists.newArrayList(fullname.subList(lastInitialPosition + 1, fullname.size()));
+            } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
+                for (final String term : fullname) {
+                    if ((term.length() > 1) && term.equals(term.toUpperCase())) {
+                        surname.add(term);
+                    } else {
+                        name.add(term);
+                    }
+                }
+            }
+        }
+    }
+
+    /** Splits on single spaces, dropping empty tokens and the tokens listed as name particles. */
+    private List<String> splitTerms(final String s) {
+        if (particles == null) {
+            particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
+        }
+
+        final List<String> list = Lists.newArrayList();
+        for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
+            if (!particles.contains(part.toLowerCase())) {
+                list.add(part);
+            }
+        }
+        return list;
+    }
+
+    public List<String> getName() {
+        return name;
+    }
+
+    public String getNameString() {
+        return Joiner.on(" ").join(getName());
+    }
+
+    public List<String> getSurname() {
+        return surname;
+    }
+
+    public List<String> getFullname() {
+        return fullname;
+    }
+
+    /**
+     * @return the string the person was built from, before any cleaning
+     */
+    public String getOriginal() {
+        return original;
+    }
+
+    /**
+     * @return the murmur3 128-bit hash of {@link #getNormalisedFullname()}, encoded as UTF-8
+     */
+    public String hash() {
+        return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString();
+    }
+
+    public String getNormalisedFirstName() {
+        return Joiner.on(" ").join(getCapitalFirstnames());
+    }
+
+    public String getNormalisedSurname() {
+        return Joiner.on(" ").join(getCapitalSurname());
+    }
+
+    public String getSurnameString() {
+        return Joiner.on(" ").join(getSurname());
+    }
+
+    /**
+     * @return "surname, first name" in normalised form when the person is accurate, the space-joined full name
+     *         terms otherwise
+     */
+    public String getNormalisedFullname() {
+        return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname);
+    }
+
+    public List<String> getCapitalFirstnames() {
+        return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), new Capitalise()));
+    }
+
+    public List<String> getCapitalSurname() {
+        return Lists.newArrayList(Iterables.transform(surname, new Capitalise()));
+    }
+
+    public List<String> getNameWithAbbreviations() {
+        return Lists.newArrayList(Iterables.transform(name, new DotAbbreviations()));
+    }
+
+    /**
+     * @return true, if both a name and a surname were recognised
+     */
+    public boolean isAccurate() {
+        return ((name != null) && (surname != null) && !name.isEmpty() && !surname.isEmpty());
+    }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java
new file mode 100644
index 0000000000..a900a6082f
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java
@@ -0,0 +1,118 @@
+package eu.dnetlib.pace.model;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+/**
+ * Static helpers to derive blocking keys from person names and to tell whether two names are similar.
+ */
+public class PersonComparatorUtils {
+
+    /** Full names longer than this are not processed by {@link #getNgramsForPerson(String)}. */
+    private static final int MAX_FULLNAME_LENGTH = 50;
+
+    /**
+     * Builds lower-case keys of the form {@code <initial>_<term>} for a person name. When name and surname are
+     * both recognised, every name initial is combined with every surname; otherwise every term longer than one
+     * character is combined with the initial of every other term.
+     *
+     * @param fullname
+     *            the raw name
+     * @return the keys; empty when the name is longer than {@value #MAX_FULLNAME_LENGTH} characters
+     */
+    public static Set<String> getNgramsForPerson(String fullname) {
+
+        Set<String> set = Sets.newHashSet();
+
+        if (fullname.length() > MAX_FULLNAME_LENGTH) {
+            return set;
+        }
+
+        Person p = new Person(fullname, true);
+
+        if (p.isAccurate()) {
+            for (String name : p.getName()) {
+                for (String surname : p.getSurname()) {
+                    set.add((name.charAt(0) + "_" + surname).toLowerCase());
+                }
+            }
+        } else {
+            List<String> list = p.getFullname();
+            for (int i = 0; i < list.size(); i++) {
+                if (list.get(i).length() > 1) {
+                    for (int j = 0; j < list.size(); j++) {
+                        if (i != j) {
+                            set.add((list.get(j).charAt(0) + "_" + list.get(i)).toLowerCase());
+                        }
+                    }
+                }
+            }
+        }
+
+        return set;
+    }
+
+    /**
+     * Tells whether two raw names are similar. When both are split into name and surname, names and surnames
+     * are verified separately; otherwise the full name terms are verified.
+     *
+     * @param s1
+     *            the first raw name
+     * @param s2
+     *            the second raw name
+     * @return true, if the names are similar
+     */
+    public static boolean areSimilar(String s1, String s2) {
+        Person p1 = new Person(s1, true);
+        Person p2 = new Person(s2, true);
+
+        if (p1.isAccurate() && p2.isAccurate()) {
+            return verifyNames(p1.getName(), p2.getName()) && verifySurnames(p1.getSurname(), p2.getSurname());
+        } else {
+            return verifyFullnames(p1.getFullname(), p2.getFullname());
+        }
+    }
+
+    /** Both the extended names and the initials have to be similar. */
+    private static boolean verifyNames(List<String> list1, List<String> list2) {
+        return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2))
+                && verifySimilarity(extractInitials(list1), extractInitials(list2));
+    }
+
+    /** Surnames match when they have the same number of terms and the terms are pairwise equal, ignoring case. */
+    private static boolean verifySurnames(List<String> list1, List<String> list2) {
+        if (list1.size() != list2.size()) {
+            return false;
+        }
+        for (int i = 0; i < list1.size(); i++) {
+            if (!list1.get(i).equalsIgnoreCase(list2.get(i))) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /** Same check as verifyNames, applied to the sorted terms. */
+    private static boolean verifyFullnames(List<String> list1, List<String> list2) {
+        // sort copies: the arguments are the internal lists of the Person instances and must stay untouched
+        final List<String> sorted1 = Lists.newArrayList(list1);
+        final List<String> sorted2 = Lists.newArrayList(list2);
+        Collections.sort(sorted1);
+        Collections.sort(sorted2);
+        return verifySimilarity(extractExtendedNames(sorted1), extractExtendedNames(sorted2))
+                && verifySimilarity(extractInitials(sorted1), extractInitials(sorted2));
+    }
+
+    /** Returns, lower-cased, the terms longer than one character. */
+    private static List<String> extractExtendedNames(List<String> list) {
+        ArrayList<String> res = Lists.newArrayList();
+        for (String s : list) {
+            if (s.length() > 1) {
+                res.add(s.toLowerCase());
+            }
+        }
+        return res;
+    }
+
+    /** Returns, lower-cased, the first character of every term. */
+    private static List<String> extractInitials(List<String> list) {
+        ArrayList<String> res = Lists.newArrayList();
+        for (String s : list) {
+            res.add(s.substring(0, 1).toLowerCase());
+        }
+        return res;
+    }
+
+    /**
+     * Checks that every element of the shorter list occurs in the longer one, at strictly increasing positions.
+     * The longer list is modified: matched elements are overwritten, so callers pass throw-away lists.
+     */
+    private static boolean verifySimilarity(List<String> list1, List<String> list2) {
+        if (list1.size() > list2.size()) {
+            return verifySimilarity(list2, list1);
+        }
+
+        // NB: List2 is greater than list1 (or equal)
+        int pos = -1;
+        for (String s : list1) {
+            int curr = list2.indexOf(s);
+            if (curr > pos) {
+                list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm"
+                pos = curr;
+            } else {
+                return false;
+            }
+        }
+        return true;
+    }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java
new file mode 100644
index 0000000000..2fb8eb97c1
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java
@@ -0,0 +1,42 @@
+package eu.dnetlib.pace.tree;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.Map;
+
+/**
+ * Comparator registered as "alwaysMatch": every pair of fields compares with the constant score 1.0.
+ */
+@ComparatorClass("alwaysMatch")
+public class AlwaysMatch extends AbstractComparator {
+
+    /**
+     * Creates the comparator from a parameter map.
+     *
+     * @param params
+     *            the parameters, forwarded to the superclass
+     */
+    public AlwaysMatch(final Map params){
+        super(params, new com.wcohen.ss.JaroWinkler());
+    }
+
+    /**
+     * Creates the comparator with an explicit weight.
+     *
+     * @param weight
+     *            the weight, forwarded to the superclass
+     */
+    public AlwaysMatch(final double weight) {
+        super(weight, new com.wcohen.ss.JaroWinkler());
+    }
+
+    /**
+     * Creates the comparator with an explicit weight and string distance.
+     *
+     * @param weight
+     *            the weight, forwarded to the superclass
+     * @param ssalgo
+     *            the string distance, forwarded to the superclass
+     */
+    protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
+        super(weight, ssalgo);
+    }
+
+    /**
+     * @return the weight held by the superclass
+     */
+    @Override
+    public double getWeight() {
+        return super.weight;
+    }
+
+    /**
+     * Ignores its arguments.
+     *
+     * @return always 1.0
+     */
+    @Override
+    public double compare(final Field a, final Field b, final Config conf) {
+        final double alwaysMatching = 1.0;
+        return alwaysMatching;
+    }
+
+    /**
+     * Identity: the score is returned unchanged.
+     */
+    @Override
+    protected double normalize(final double d) {
+        return d;
+    }
+
+}
+
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
new file mode 100644
index 0000000000..33f86d85d6
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
@@ -0,0 +1,154 @@
+package eu.dnetlib.pace.tree;
+
+import com.google.common.collect.Iterables;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldList;
+import eu.dnetlib.pace.model.Person;
+import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+import com.wcohen.ss.AbstractStringDistance;
+
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+ /**
+  * Compares two author lists and measures how many authors of the first one are found in the second.
+  * Each author string is wrapped in a Person. Two persons are paired by fullname similarity when neither
+  * is accurate, by surname (plus first name when mode is "full") when both are accurate, and by substring
+  * containment of the accurate name parts in the other fullname when only one of them is accurate.
+  * Parameters read from the configuration map (with their defaults): mode ("full"), surname_th (0.95),
+  * name_th (0.95), fullname_th (0.9), size_th (20) and type ("percentage").
+  */
+ @ComparatorClass("authorsMatch")
+ public class AuthorsMatch extends AbstractComparator {
+
+     Map<String, String> params;
+
+     private double SURNAME_THRESHOLD;
+     private double NAME_THRESHOLD;
+     private double FULLNAME_THRESHOLD;
+     private String MODE; //full or surname
+     private int SIZE_THRESHOLD;
+     private String TYPE; //count or percentage
+     private int common; //paired authors counted by the latest compare() call
+
+     /**
+      * Builds the comparator from its configuration map; missing keys fall back to the defaults.
+      * @param params comparator parameters (mode, surname_th, name_th, fullname_th, size_th, type)
+      */
+     public AuthorsMatch(Map<String, String> params){
+         super(params, new com.wcohen.ss.JaroWinkler());
+         this.params = params;
+         MODE = params.getOrDefault("mode", "full");
+         SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
+         NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
+         FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
+         SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
+         TYPE = params.getOrDefault("type", "percentage");
+         common = 0;
+     }
+
+     /**
+      * Builds the comparator with an explicit weight and string distance.
+      * NOTE(review): thresholds, mode and type are not assigned here and keep their Java defaults (0 / null).
+      */
+     protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
+         super(w, ssalgo);
+     }
+
+     /**
+      * Scores the overlap between the author lists held by the two fields.
+      *
+      * @param a first author field; cast to FieldList
+      * @param b second author field; cast to FieldList
+      * @param conf configuration, not read by this method
+      * @return -1 if either field is empty, 1.0 if either list holds more than size_th authors; otherwise
+      *         the number of paired authors, divided by the normalization factor when type is "percentage"
+      */
+     @Override
+     public double compare(final Field a, final Field b, final Config conf) {
+         if (a.isEmpty() || b.isEmpty()) {
+             return -1;
+         }
+         if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD) {
+             return 1.0;
+         }
+
+         List<Person> aList = toPersons((FieldList) a);
+         List<Person> bList = toPersons((FieldList) b);
+
+         // NOTE(review): the counter lives in a field and one author of b can be paired with several authors of a,
+         // so common may exceed bList.size() and, for lists of different size, the percentage may exceed 1.
+         common = 0;
+         for (Person p1 : aList) {
+             for (Person p2 : bList) {
+                 if (samePerson(p1, p2)) {
+                     common += 1;
+                     break; // the first author of b that pairs with p1 is enough
+                 }
+             }
+         }
+
+         //normalization factor to compute the score
+         final int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
+         return TYPE.equals("percentage") ? (double) common / normFactor : (double) common;
+     }
+
+     /** Wraps every author string of the list in a Person, built with its second argument set to false. */
+     private List<Person> toPersons(FieldList authors) {
+         return authors.stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
+     }
+
+     /** Decides whether two persons pair up, choosing the check from the isAccurate() flag of each one. */
+     private boolean samePerson(Person p1, Person p2) {
+         final boolean accurate1 = p1.isAccurate();
+         final boolean accurate2 = p2.isAccurate();
+         //both persons are inaccurate: compare just normalized fullnames
+         if (!accurate1 && !accurate2) {
+             return ssalgo.score(fullnameOf(p1), fullnameOf(p2)) > FULLNAME_THRESHOLD;
+         }
+
+         //both persons are accurate: surnames must match, first names too unless MODE is not "full"
+         if (accurate1 && accurate2) {
+             return compareSurname(p1, p2) && (!MODE.equals("full") || compareFirstname(p1, p2));
+         }
+
+         //one person is inaccurate: look for the accurate name parts inside the inaccurate fullname
+         final Person accurate = accurate1 ? p1 : p2;
+         final Person inaccurate = accurate1 ? p2 : p1;
+         final String name = normalization(accurate.getNormalisedFirstName());
+         final String surname = normalization(accurate.getNormalisedSurname());
+         final String fullname = fullnameOf(inaccurate);
+         return fullname.contains(surname) && (!MODE.equals("full") || fullname.contains(name));
+     }
+
+     /** Returns the normalization of the person's normalised fullname, or of the original string when that is empty. */
+     private String fullnameOf(Person p) {
+         return normalization(p.getNormalisedFullname().isEmpty() ? p.getOriginal() : p.getNormalisedFullname());
+     }
+
+     /** Returns true when the ssalgo score of the two normalized surnames is above the surname threshold. */
+     public boolean compareSurname(Person p1, Person p2) {
+         return ssalgo.score(normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD;
+     }
+
+     /** Returns true when a first name of at most 2 chars shares its firstLC value with the other, or the score is above the name threshold. */
+     public boolean compareFirstname(Person p1, Person p2) {
+         final String first1 = p1.getNormalisedFirstName();
+         final String first2 = p2.getNormalisedFirstName();
+         if ((first1.length() <= 2 || first2.length() <= 2) && firstLC(first1).equals(firstLC(first2))) {
+             return true;
+         }
+         return ssalgo.score(normalization(first1), normalization(first2)) > NAME_THRESHOLD;
+     }
+
+     /** Applies cleanup, utf8 and normalize, in this order, to the given string. */
+     public String normalization(String s) {
+         return normalize(utf8(cleanup(s)));
+     }
+ }
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java
new file mode 100644
index 0000000000..8ff818e07a
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java
@@ -0,0 +1,47 @@
+package eu.dnetlib.pace.tree;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.Map;
+import java.util.Set;
+
+@ComparatorClass("cityMatch")
+public class CityMatch extends AbstractComparator {
+
+ private Map params;
+
+ public CityMatch(Map params) { // builds the comparator from its configuration map
+ super(params); // the same map is handed to the parent comparator
+ this.params = params; // kept locally: distance() reads the "windowSize" entry from it
+ }
+
+ @Override
+ public double distance(final String a, final String b, final Config conf) {
+
+ String ca = cleanup(a);
+ String cb = cleanup(b);
+
+ ca = normalize(ca);
+ cb = normalize(cb);
+
+ ca = filterAllStopWords(ca);
+ cb = filterAllStopWords(cb);
+
+ Set cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
+ Set cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
+
+ Set codes1 = citiesToCodes(cities1);
+ Set