diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index 6df11f4ea..6e656034b 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -143,8 +143,8 @@
- eu.dnetlib
- dnet-pace-core
+ eu.dnetlib.dhp
+ dhp-pace-core
diff --git a/dhp-pace-core/pom.xml b/dhp-pace-core/pom.xml
new file mode 100644
index 000000000..b66976ea6
--- /dev/null
+++ b/dhp-pace-core/pom.xml
@@ -0,0 +1,91 @@
+
+
+
+ 4.0.0
+
+
+ eu.dnetlib.dhp
+ dhp
+ 1.2.5-SNAPSHOT
+ ../pom.xml
+
+
+ eu.dnetlib.dhp
+ dhp-pace-core
+ 1.2.5-SNAPSHOT
+ jar
+
+
+
+ edu.cmu
+ secondstring
+
+
+ com.google.guava
+ guava
+
+
+ com.google.code.gson
+ gson
+
+
+ org.apache.commons
+ commons-lang3
+
+
+ commons-io
+ commons-io
+
+
+
+ org.antlr
+ stringtemplate
+
+
+ commons-logging
+ commons-logging
+
+
+
+ org.junit.jupiter
+ junit-jupiter
+ test
+
+
+ org.reflections
+ reflections
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+
+
+ org.apache.commons
+ commons-math3
+
+
+
+ com.jayway.jsonpath
+ json-path
+
+
+
+ com.ibm.icu
+ icu4j
+
+
+
+ org.antlr
+ stringtemplate
+ 3.2.1
+
+
+ org.apache.spark
+ spark-catalyst_2.11
+ 2.4.0.cloudera2
+ compile
+
+
+
+
+
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
new file mode 100644
index 000000000..b7a70d607
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
@@ -0,0 +1,42 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Config;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * Base class for clustering (blocking) functions: normalizes each non-empty input
+ * field, removes stopwords, delegates key generation to the subclass via doApply,
+ * then filters out blacklisted ngrams and blank keys.
+ *
+ * NOTE(review): generic type parameters were stripped by extraction; from
+ * param(String) returning Integer, params presumably is a Map of String to Integer
+ * -- confirm against the original sources.
+ */
+public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
+
+ // configuration for the concrete function (e.g. "max", "ngramLen", "minLen")
+ protected Map params;
+
+ public AbstractClusteringFunction(final Map params) {
+ this.params = params;
+ }
+
+ // subclasses produce the clustering keys for one normalized field value
+ protected abstract Collection doApply(Config conf, String s);
+
+ @Override
+ public Collection apply(Config conf, List fields) {
+ return fields.stream().filter(f -> !f.isEmpty())
+ .map(this::normalize)
+ .map(s -> filterAllStopWords(s))
+ .map(s -> doApply(conf, s))
+ .map(c -> filterBlacklisted(c, ngramBlacklist))
+ .flatMap(c -> c.stream())
+ .filter(StringUtils::isNotBlank)
+ .collect(Collectors.toCollection(HashSet::new)); // deduplicated, unordered result
+ }
+
+ public Map getParams() {
+ return params;
+ }
+
+ // convenience accessor for a single numeric configuration parameter
+ protected Integer param(String name) {
+ return params.get(name);
+ }
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
new file mode 100644
index 000000000..d3008332d
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
@@ -0,0 +1,49 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.Map;
+import java.util.Set;
+import java.util.StringTokenizer;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
+
+/**
+ * Clustering function "acronyms": for i = 0 .. max-1, builds an acronym from the
+ * i-th character of every token of the input string, keeping only acronyms longer
+ * than minLen and stopping accumulation once maxLen is exceeded.
+ */
+@ClusteringClass("acronyms")
+public class Acronyms extends AbstractClusteringFunction {
+
+ public Acronyms(Map params) {
+ super(params);
+ }
+
+ @Override
+ protected Collection doApply(Config conf, String s) {
+ return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
+ }
+
+ // insertion-ordered set: one candidate acronym per character offset i
+ private Set extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) {
+
+ final Set acronyms = Sets.newLinkedHashSet();
+
+ for (int i = 0; i < maxAcronyms; i++) {
+
+ final StringTokenizer st = new StringTokenizer(s);
+ final StringBuilder sb = new StringBuilder();
+
+ while (st.hasMoreTokens()) {
+ final String token = st.nextToken();
+ if (sb.length() > maxLen) {
+ break; // acronym already exceeds the maximum length, stop appending
+ }
+ // skip 1-char tokens and tokens shorter than the current offset
+ if (token.length() > 1 && i < token.length()) {
+ sb.append(token.charAt(i));
+ }
+ }
+ String acronym = sb.toString();
+ // NOTE(review): strictly greater-than, so an acronym of exactly minLen chars is discarded -- confirm intended
+ if (acronym.length() > minLen) {
+ acronyms.add(acronym);
+ }
+ }
+ return acronyms;
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java
new file mode 100644
index 000000000..e67767171
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java
@@ -0,0 +1,13 @@
+package eu.dnetlib.pace.clustering;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+/**
+ * Marks a clustering-function implementation and assigns the symbolic name
+ * (value) under which it is discovered and referenced from configuration.
+ * Retained at runtime so it can be read reflectively.
+ */
+@Retention(RetentionPolicy.RUNTIME)
+@Target(ElementType.TYPE)
+public @interface ClusteringClass {
+
+ // the registry key for this clustering function, e.g. "ngrams", "acronyms"
+ public String value();
+}
\ No newline at end of file
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
new file mode 100644
index 000000000..4660d2b6c
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
@@ -0,0 +1,15 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.config.Config;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Contract for clustering (blocking) functions: map a record's field values to a
+ * collection of clustering keys used to group candidate duplicates for comparison.
+ */
+public interface ClusteringFunction {
+
+ // produces the clustering keys for the given field values
+ public Collection apply(Config config, List fields);
+
+ // exposes the configuration parameters this instance was built with
+ public Map getParams();
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
new file mode 100644
index 000000000..7f342f69c
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
@@ -0,0 +1,26 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Config;
+
+/**
+ * Clustering function "immutablefieldvalue": the clustering key is the field
+ * value itself, unchanged (beyond the normalization applied by the base class).
+ */
+@ClusteringClass("immutablefieldvalue")
+public class ImmutableFieldValue extends AbstractClusteringFunction {
+
+ public ImmutableFieldValue(final Map params) {
+ super(params);
+ }
+
+ @Override
+ protected Collection doApply(final Config conf, final String s) {
+ final List res = Lists.newArrayList();
+
+ // single key: the (already normalized) input string verbatim
+ res.add(s);
+
+ return res;
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java
new file mode 100644
index 000000000..73ba221c3
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java
@@ -0,0 +1,50 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.config.Config;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Clustering function "keywordsclustering": extracts keyword codes and city codes
+ * from the input and emits "keyword-city" combination keys, capped at the "max"
+ * parameter (default 2). The "windowSize" parameter (default 4) drives extraction.
+ */
+@ClusteringClass("keywordsclustering")
+public class KeywordsClustering extends AbstractClusteringFunction {
+
+ public KeywordsClustering(Map params) {
+ super(params);
+ }
+
+ @Override
+ protected Collection doApply(final Config conf, String s) {
+
+ //takes city codes and keywords codes without duplicates
+ Set keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
+ Set cities = getCities(s, params.getOrDefault("windowSize", 4));
+
+ //list of combination to return as result
+ final Collection combinations = new LinkedHashSet();
+
+ for (String keyword: keywordsToCodes(keywords, conf.translationMap())){
+ for (String city: citiesToCodes(cities)) {
+ combinations.add(keyword+"-"+city);
+ // stop early once the configured maximum number of keys is reached
+ if (combinations.size()>=params.getOrDefault("max", 2)) {
+ return combinations;
+ }
+ }
+ }
+
+ return combinations;
+ }
+
+ // overrides the base pipeline to add a cleanup step before normalization
+ @Override
+ public Collection apply(final Config conf, List fields) {
+ return fields.stream().filter(f -> !f.isEmpty())
+ .map(this::cleanup)
+ .map(this::normalize)
+ .map(s -> filterAllStopWords(s))
+ .map(s -> doApply(conf, s))
+ .map(c -> filterBlacklisted(c, ngramBlacklist))
+ .flatMap(c -> c.stream())
+ .filter(StringUtils::isNotBlank)
+ .collect(Collectors.toCollection(HashSet::new));
+ }
+}
\ No newline at end of file
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
new file mode 100644
index 000000000..fa45ac909
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
@@ -0,0 +1,75 @@
+package eu.dnetlib.pace.clustering;
+
+import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Person;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Clustering function "lnfi" (last name + first initial): parses the field as a
+ * person name and emits keys of the form firstInitial + lastName. When the parse
+ * is inaccurate, falls back to initial/word combinations of the full name.
+ */
+@ClusteringClass("lnfi")
+public class LastNameFirstInitial extends AbstractClusteringFunction{
+
+ // default for the "aggressive" Person-parsing flag when not configured
+ private boolean DEFAULT_AGGRESSIVE = true;
+
+ public LastNameFirstInitial(final Map params) {
+ super(params);
+ }
+
+ // overrides the base pipeline: no stopword filtering for person names
+ @Override
+ public Collection apply(Config conf, List fields) {
+ return fields.stream().filter(f -> !f.isEmpty())
+ .map(this::normalize)
+ .map(s -> doApply(conf, s))
+ .map(c -> filterBlacklisted(c, ngramBlacklist))
+ .flatMap(c -> c.stream())
+ .filter(StringUtils::isNotBlank)
+ .collect(Collectors.toCollection(HashSet::new));
+ }
+
+ // name-specific normalization: transliterate, strip diacritics/punctuation/digits
+ @Override
+ protected String normalize(final String s) {
+ return fixAliases(transliterate(nfd(unicodeNormalization(s))))
+ // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
+ .replaceAll("[^ \\w]+", "")
+ .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
+ .replaceAll("(\\p{Punct})+", " ")
+ .replaceAll("(\\d)+", " ")
+ .replaceAll("(\\n)+", " ")
+ .trim();
+ }
+
+ @Override
+ protected Collection doApply(final Config conf, final String s) {
+
+ final List res = Lists.newArrayList();
+
+ final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
+
+ Person p = new Person(s, aggressive);
+
+ if (p.isAccurate()) {
+ // NOTE(review): substring(0,1) assumes a non-empty normalized first name when isAccurate() holds -- confirm Person's contract
+ String lastName = p.getNormalisedSurname().toLowerCase();
+ String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1);
+
+ res.add(firstInitial.concat(lastName));
+ }
+ else { // is not accurate, meaning it has no defined name and surname
+ List fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
+ if (fullname.size() == 1) {
+ res.add(p.getNormalisedFullname().toLowerCase());
+ }
+ else if (fullname.size() == 2) {
+ // both orderings: either word may be the surname
+ res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase());
+ res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase());
+ }
+ else {
+ // 3+ words: combine first and last word in both orders
+ res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase());
+ res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase());
+ }
+ }
+
+ return res;
+ }
+}
\ No newline at end of file
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
new file mode 100644
index 000000000..d50a95008
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
@@ -0,0 +1,35 @@
+package eu.dnetlib.pace.clustering;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Clustering function "lowercase": the key is the lowercased, trimmed field value.
+ * Overrides the base pipeline, so no normalization/stopword/blacklist filtering
+ * is applied here.
+ */
+@ClusteringClass("lowercase")
+public class LowercaseClustering extends AbstractClusteringFunction {
+
+ public LowercaseClustering(final Map params) {
+ super(params);
+ }
+
+ @Override
+ public Collection apply(Config conf, List fields) {
+ // insertion-ordered, deduplicated accumulation across all fields
+ Collection c = Sets.newLinkedHashSet();
+ for(String f : fields) {
+ c.addAll(doApply(conf, f));
+ }
+ return c;
+ }
+
+ @Override
+ protected Collection doApply(final Config conf, final String s) {
+ // blank input yields no keys rather than an empty-string key
+ if(StringUtils.isBlank(s)) {
+ return Lists.newArrayList();
+ }
+ return Lists.newArrayList(s.toLowerCase().trim());
+ }
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java
new file mode 100644
index 000000000..30d33629c
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java
@@ -0,0 +1,21 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Set;
+
+import org.apache.commons.lang3.StringUtils;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+
+/**
+ * Static helper for producing a canonical, space-free form of a string suitable
+ * for ordering: normalizes it, strips English stopwords, then removes spaces.
+ */
+public class NGramUtils extends AbstractPaceFunctions {
+ // singleton used to reach the instance-level normalize/filter helpers
+ static private final NGramUtils NGRAMUTILS = new NGramUtils();
+
+ // NOTE(review): SIZE is not referenced anywhere in this class as shown -- possibly used elsewhere or dead
+ private static final int SIZE = 100;
+
+ private static final Set stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
+
+ public static String cleanupForOrdering(String s) {
+ String result = NGRAMUTILS.filterStopWords(NGRAMUTILS.normalize(s), stopwords);
+ return result.isEmpty() ? result : result.replace(" ", "");
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
new file mode 100644
index 000000000..fd7c17ec3
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
@@ -0,0 +1,40 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Config;
+
+/**
+ * Clustering function "ngrampairs": extracts ngrams (one per token) and emits
+ * keys made of consecutive ngram pairs concatenated together, capped at "max".
+ */
+@ClusteringClass("ngrampairs")
+public class NgramPairs extends Ngrams {
+
+ public NgramPairs(Map params) {
+ super(params, false);
+ }
+
+ // sorted=true is used by the SortedNgramPairs subclass
+ public NgramPairs(Map params, boolean sorted) {
+ super(params, sorted);
+ }
+
+ @Override
+ protected Collection doApply(Config conf, String s) {
+ // collect up to max*2 ngrams so that max pairs can be formed from them
+ return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max"));
+ }
+
+ // pairs ngram i with ngram i+1; stops at maxNgrams pairs or end of list
+ protected Collection ngramPairs(final List ngrams, int maxNgrams) {
+ Collection res = Lists.newArrayList();
+ int j = 0;
+ for (int i = 0; i < ngrams.size() && res.size() < maxNgrams; i++) {
+ if (++j >= ngrams.size()) {
+ break;
+ }
+ res.add(ngrams.get(i) + ngrams.get(j));
+ //System.out.println("-- " + concatNgrams);
+ }
+ return res;
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
new file mode 100644
index 000000000..3af7e98e8
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
@@ -0,0 +1,51 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.config.Config;
+
+import java.util.*;
+
+/**
+ * Clustering function "ngrams": extracts fixed-length character ngrams from each
+ * token of the input, bounded by "max" total ngrams, "maxPerToken" ngrams per
+ * token and "minNgramLen" minimum ngram length. Optionally sorts the result.
+ */
+@ClusteringClass("ngrams")
+public class Ngrams extends AbstractClusteringFunction {
+
+ // when true the ngrams are kept in natural order (TreeSet), otherwise insertion order
+ private final boolean sorted;
+
+ public Ngrams(Map params) {
+ this(params, false);
+ }
+
+ public Ngrams(Map params, boolean sorted) {
+ super(params);
+ this.sorted = sorted;
+ }
+
+ @Override
+ protected Collection doApply(Config conf, String s) {
+ return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
+ }
+
+ protected Collection getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) {
+
+ final Collection ngrams = sorted ? new TreeSet<>() : new LinkedHashSet();
+ final StringTokenizer st = new StringTokenizer(s);
+
+ while (st.hasMoreTokens()) {
+ final String token = st.nextToken();
+ if (!token.isEmpty()) {
+ // sliding window of up to maxPerToken ngrams starting at offsets 0..maxPerToken-1
+ for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) {
+ String ngram = token.substring(i, Math.min(ngramLen + i, token.length())).trim();
+
+ if (ngram.length() >= minNgramLen) {
+ ngrams.add(ngram);
+
+ // early exit once the global cap is reached
+ if (ngrams.size() >= max) {
+ return ngrams;
+ }
+ }
+ }
+ }
+ }
+ //System.out.println(ngrams + " n: " + ngrams.size());
+ return ngrams;
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
new file mode 100644
index 000000000..a5bad2075
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
@@ -0,0 +1,80 @@
+package eu.dnetlib.pace.clustering;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Person;
+import org.apache.commons.lang3.StringUtils;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Clustering function "personClustering": keys a person record by lowercased
+ * first-initial + surname when the name parses cleanly; otherwise by all ordered
+ * pairs of distinct tokens (up to MAX_TOKENS) from the raw field.
+ * Implements ClusteringFunction directly, bypassing the base-class pipeline.
+ */
+@ClusteringClass("personClustering")
+public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
+
+ private Map params;
+
+ // limit on the number of tokens considered in the fallback pairing
+ private static final int MAX_TOKENS = 5;
+
+ public PersonClustering(final Map params) {
+ this.params = params;
+ }
+
+ @Override
+ public Collection apply(final Config conf, final List fields) {
+ final Set hashes = Sets.newHashSet();
+
+ for (final String f : fields) {
+
+ final Person person = new Person(f, false);
+
+ if (StringUtils.isNotBlank(person.getNormalisedFirstName()) && StringUtils.isNotBlank(person.getNormalisedSurname())) {
+ hashes.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase());
+ } else {
+ // fallback: pair every token with every other token (order-sensitive)
+ for (final String token1 : tokens(f, MAX_TOKENS)) {
+ for (final String token2 : tokens(f, MAX_TOKENS)) {
+ if (!token1.equals(token2)) {
+ hashes.add(firstLC(token1) + token2);
+ }
+ }
+ }
+ }
+ }
+
+ return hashes;
+ }
+
+// @Override
+// public Collection apply(final List fields) {
+// final Set hashes = Sets.newHashSet();
+//
+// for (final Field f : fields) {
+//
+// final GTAuthor gta = GTAuthor.fromOafJson(f.stringValue());
+//
+// final Author a = gta.getAuthor();
+//
+// if (StringUtils.isNotBlank(a.getFirstname()) && StringUtils.isNotBlank(a.getSecondnames())) {
+// hashes.add(firstLC(a.getFirstname()) + a.getSecondnames().toLowerCase());
+// } else {
+// for (final String token1 : tokens(f.stringValue(), MAX_TOKENS)) {
+// for (final String token2 : tokens(f.stringValue(), MAX_TOKENS)) {
+// if (!token1.equals(token2)) {
+// hashes.add(firstLC(token1) + token2);
+// }
+// }
+// }
+// }
+// }
+//
+// return hashes;
+// }
+
+ @Override
+ public Map getParams() {
+ return params;
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
new file mode 100644
index 000000000..f6c4fe07f
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
@@ -0,0 +1,32 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Person;
+
+/**
+ * Clustering function "personHash": the single key is the hash of the field
+ * parsed as a Person. The "aggressive" parameter controls the parsing mode
+ * (defaults to false here, unlike LastNameFirstInitial).
+ */
+@ClusteringClass("personHash")
+public class PersonHash extends AbstractClusteringFunction {
+
+ // default for the "aggressive" Person-parsing flag when not configured
+ private boolean DEFAULT_AGGRESSIVE = false;
+
+ public PersonHash(final Map params) {
+ super(params);
+ }
+
+ @Override
+ protected Collection doApply(final Config conf, final String s) {
+ final List res = Lists.newArrayList();
+
+ final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
+
+ res.add(new Person(s, aggressive).hash());
+
+ return res;
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
new file mode 100644
index 000000000..86a2e4e4f
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
@@ -0,0 +1,19 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.config.Config;
+
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * Placeholder clustering function with no @ClusteringClass annotation, so it is
+ * presumably not registered for runtime lookup.
+ *
+ * NOTE(review): doApply returns null; the base-class apply() would fail when
+ * streaming over a null collection -- confirm this class is never invoked.
+ */
+public class RandomClusteringFunction extends AbstractClusteringFunction {
+
+ public RandomClusteringFunction(Map params) {
+ super(params);
+ }
+
+ @Override
+ protected Collection doApply(final Config conf, String s) {
+ return null;
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
new file mode 100644
index 000000000..77c2c0155
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
@@ -0,0 +1,17 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.*;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Config;
+
+/**
+ * Clustering function "sortedngrampairs": same as NgramPairs but with the ngrams
+ * kept in natural (sorted) order before pairing, via the sorted=true superclass flag.
+ */
+@ClusteringClass("sortedngrampairs")
+public class SortedNgramPairs extends NgramPairs {
+
+ public SortedNgramPairs(Map params) {
+ super(params, true);
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
new file mode 100644
index 000000000..50cea4db3
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
@@ -0,0 +1,29 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import eu.dnetlib.pace.config.Config;
+import org.apache.commons.lang3.RandomStringUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Clustering function "spacetrimmingfieldvalue": the key is the lowercased field
+ * value with all whitespace removed. Blank fields get a random key of length
+ * "randomLength" so that empty values do not all cluster together.
+ */
+@ClusteringClass("spacetrimmingfieldvalue")
+public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
+
+ public SpaceTrimmingFieldValue(final Map params) {
+ super(params);
+ }
+
+ @Override
+ protected Collection doApply(final Config conf, final String s) {
+ final List res = Lists.newArrayList();
+
+ res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));
+
+ return res;
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
new file mode 100644
index 000000000..fa1f64362
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
@@ -0,0 +1,40 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
+
+/**
+ * Clustering function "suffixprefix": at each space in the string, emits a key
+ * made of the "len"-char suffix of the preceding word joined with the "len"-char
+ * prefix of the following word, up to "max" keys. Keys shorter than 4 chars are dropped.
+ */
+@ClusteringClass("suffixprefix")
+public class SuffixPrefix extends AbstractClusteringFunction {
+
+ public SuffixPrefix(Map params) {
+ super(params);
+ }
+
+ @Override
+ protected Collection doApply(Config conf, String s) {
+ return suffixPrefix(s, param("len"), param("max"));
+ }
+
+ private Collection suffixPrefix(String s, int len, int max) {
+ final Set bigrams = Sets.newLinkedHashSet();
+ int i = 0;
+ while (++i < s.length() && bigrams.size() < max) {
+ // j: position of the next space at or after i (boundary between two words)
+ int j = s.indexOf(" ", i);
+
+ // end of the prefix taken from the word after the space, clamped to the string length
+ int offset = j + len + 1 < s.length() ? j + len + 1 : s.length();
+
+ if (j - len > 0) {
+ String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
+ if (bigram.length() >= 4) {
+ bigrams.add(bigram);
+ }
+ }
+ }
+ return bigrams;
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
new file mode 100644
index 000000000..235cec101
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
@@ -0,0 +1,52 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Config;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * Clustering function "urlclustering": keys each field by the host part of the
+ * URL it contains. If any field is not a valid URL the whole apply() returns an
+ * empty set (the IllegalStateException from asUrl aborts the stream).
+ */
+@ClusteringClass("urlclustering")
+public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
+
+ protected Map params;
+
+ public UrlClustering(final Map params) {
+ this.params = params;
+ }
+
+ @Override
+ public Collection apply(final Config conf, List fields) {
+ try {
+ return fields.stream()
+ .filter(f -> !f.isEmpty())
+ .map(this::asUrl)
+ .map(URL::getHost)
+ .collect(Collectors.toCollection(HashSet::new));
+ }
+ catch (IllegalStateException e){
+ // any malformed URL in the batch yields no keys at all
+ return new HashSet<>();
+ }
+ }
+
+ // NOTE(review): returns null instead of the params field (unlike the other functions) -- potential NPE for callers; confirm intended
+ @Override
+ public Map getParams() {
+ return null;
+ }
+
+ private URL asUrl(String value) {
+ try {
+ return new URL(value);
+ } catch (MalformedURLException e) {
+ // should not happen as checked by pace typing
+ throw new IllegalStateException("invalid URL: " + value);
+ }
+ }
+
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java
new file mode 100644
index 000000000..6fa2668fa
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java
@@ -0,0 +1,90 @@
+package eu.dnetlib.pace.clustering;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * Clustering function "wordsStatsSuffixPrefixChain": builds keys from word/letter
+ * statistics plus alternating 3-char suffix/prefix chains over the first words.
+ * Key format: wordCount-letterCount/mod-chain.
+ */
+@ClusteringClass("wordsStatsSuffixPrefixChain")
+public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
+
+ public WordsStatsSuffixPrefixChain(Map params) {
+ super(params);
+ }
+
+ @Override
+ protected Collection doApply(Config conf, String s) {
+ return suffixPrefixChain(s, param("mod"));
+ }
+
+ private Collection suffixPrefixChain(String s, int mod) {
+
+ //create the list of words from the string (remove short words)
+ List wordsList =
+ Arrays.stream(s.split(" "))
+ .filter(si -> si.length() > 3)
+ .collect(Collectors.toList());
+
+ final int words = wordsList.size();
+ final int letters = s.length();
+
+ //create the prefix: number of words + number of letters/mod
+ String prefix = words + "-" + letters/mod + "-";
+
+ return doSuffixPrefixChain(wordsList, prefix);
+
+ }
+
+ // emits two keys per input: suffix/prefix chain and its prefix/suffix mirror
+ private Collection doSuffixPrefixChain(List wordsList, String prefix) {
+
+ Set set = Sets.newLinkedHashSet();
+ switch(wordsList.size()){
+ case 0:
+ case 1:
+ // fewer than two usable words: no keys
+ break;
+ case 2:
+ set.add(
+ prefix +
+ suffix(wordsList.get(0), 3) +
+ prefix(wordsList.get(1), 3)
+ );
+
+ set.add(
+ prefix +
+ prefix(wordsList.get(0), 3) +
+ suffix(wordsList.get(1), 3)
+ );
+
+ break;
+ default:
+ set.add(
+ prefix +
+ suffix(wordsList.get(0), 3) +
+ prefix(wordsList.get(1), 3) +
+ suffix(wordsList.get(2), 3)
+ );
+
+ set.add(
+ prefix +
+ prefix(wordsList.get(0), 3) +
+ suffix(wordsList.get(1), 3) +
+ prefix(wordsList.get(2), 3)
+ );
+ break;
+ }
+
+ return set;
+
+ }
+
+
+ // last len characters of s (words are pre-filtered to length > 3, so len=3 is safe)
+ private String suffix(String s, int len) {
+ return s.substring(s.length()-len);
+ }
+
+ // first len characters of s
+ private String prefix(String s, int len) {
+ return s.substring(0, len);
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java
new file mode 100644
index 000000000..1e94b34d2
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java
@@ -0,0 +1,57 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
+
+/**
+ * Clustering function "wordssuffixprefix": like "suffixprefix" but prepends the
+ * word count to each key and widens the token length for short inputs
+ * (2 words: len+2, 3 words: len+1). Single-word inputs yield no keys.
+ */
+@ClusteringClass("wordssuffixprefix")
+public class WordsSuffixPrefix extends AbstractClusteringFunction {
+
+ public WordsSuffixPrefix(Map params) {
+ super(params);
+ }
+
+ @Override
+ protected Collection doApply(Config conf, String s) {
+ return suffixPrefix(s, param("len"), param("max"));
+ }
+
+ private Collection suffixPrefix(String s, int len, int max) {
+
+ final int words = s.split(" ").length;
+
+ // adjust the token length according to the number of words
+ switch (words) {
+ case 1:
+ return Sets.newLinkedHashSet();
+ case 2:
+ return doSuffixPrefix(s, len+2, max, words);
+ case 3:
+ return doSuffixPrefix(s, len+1, max, words);
+ default:
+ return doSuffixPrefix(s, len, max, words);
+ }
+ }
+
+ // same scan as SuffixPrefix.suffixPrefix, with the word count prefixed to each key
+ private Collection doSuffixPrefix(String s, int len, int max, int words) {
+ final Set bigrams = Sets.newLinkedHashSet();
+ int i = 0;
+ while (++i < s.length() && bigrams.size() < max) {
+ // j: next word boundary at or after i
+ int j = s.indexOf(" ", i);
+
+ int offset = j + len + 1 < s.length() ? j + len + 1 : s.length();
+
+ if (j - len > 0) {
+ String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
+ if (bigram.length() >= 4) {
+ bigrams.add(words+bigram);
+ }
+ }
+ }
+ return bigrams;
+ }
+
+}
\ No newline at end of file
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
new file mode 100644
index 000000000..3b80bfcd1
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@@ -0,0 +1,346 @@
+package eu.dnetlib.pace.common;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import com.ibm.icu.text.Transliterator;
+import eu.dnetlib.pace.clustering.NGramUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import java.io.IOException;
+import java.io.StringWriter;
+import java.nio.charset.StandardCharsets;
+import java.text.Normalizer;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+/**
+ * Set of common functions for the framework: string cleanup/normalization,
+ * transliteration, stopword filtering, and classpath-backed lookup tables
+ * (city map, stopword lists, ngram blacklist).
+ *
+ * @author claudio
+ */
+public abstract class AbstractPaceFunctions {
+
+ //city map to be used when translating the city names into codes
+ private static Map cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
+
+ //list of stopwords in different languages
+ protected static Set stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
+ protected static Set stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
+ protected static Set stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
+ protected static Set stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
+ protected static Set stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
+ protected static Set stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
+ protected static Set stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
+
+ //transliterator
+ protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
+
+ //blacklist of ngrams: to avoid generic keys
+ protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
+
+ //html regex for normalization
+ public final String HTML_REGEX = "<[^>]*>";
+
+ // allowed character set kept by removeSymbols
+ private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
+ // character-by-character alias table: each char in aliases_from maps to the
+ // char at the same index in aliases_to (superscripts/subscripts/diacritics -> ASCII)
+ private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
+ private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
+
+ //doi prefix for normalization
+ public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
+
+ private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
+
+ // matches \\uXXXX escape sequences for unicodeNormalization
+ private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
+
+ // joins the list items with a single space, skipping nulls
+ protected String concat(final List l) {
+ return Joiner.on(" ").skipNulls().join(l);
+ }
+
+ // full cleanup pipeline: strip HTML, lowercase, resolve unicode escapes,
+ // NFD-decompose, fix XML entities, isolate digit runs, transliterate,
+ // map aliases, drop non-ASCII, replace punctuation/newlines, collapse spaces
+ protected String cleanup(final String s) {
+
+ final String s1 = s.replaceAll(HTML_REGEX, "");
+ final String s2 = unicodeNormalization(s1.toLowerCase());
+ final String s3 = nfd(s2);
+ final String s4 = fixXML(s3);
+ final String s5 = s4.replaceAll("([0-9]+)", " $1 ");
+ final String s6 = transliterate(s5);
+ final String s7 = fixAliases(s6);
+ final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
+ final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
+ final String s10 = s9.replaceAll("\\n", " ");
+ final String s11 = s10.replaceAll("(?m)\\s+", " ");
+ final String s12 = s11.trim();
+ return s12;
+ }
+
+ // replaces residual XML entities with spaces
+ // NOTE(review): the entity names (e.g. &amp;, &quot;) appear to have been
+ // stripped from this patch text — verify the literals against the original file
+ protected String fixXML(final String a){
+
+ return a.replaceAll("–", " ")
+ .replaceAll("&", " ")
+ .replaceAll(""", " ")
+ .replaceAll("−", " ");
+ }
+
+ // true when the two strings differ in their arabic numbers or roman numerals
+ protected boolean checkNumbers(final String a, final String b) {
+ final String numbersA = getNumbers(a);
+ final String numbersB = getNumbers(b);
+ final String romansA = getRomans(a);
+ final String romansB = getRomans(b);
+ return !numbersA.equals(numbersB) || !romansA.equals(romansB);
+ }
+
+ // concatenation of all tokens that look like roman numerals
+ protected String getRomans(final String s) {
+ final StringBuilder sb = new StringBuilder();
+ for (final String t : s.split(" ")) {
+ sb.append(isRoman(t) ? t : "");
+ }
+ return sb.toString();
+ }
+
+ // roman-numeral test: the whole token must match the roman pattern
+ // (replaceAll with a sentinel, then compare — the empty string also matches)
+ protected boolean isRoman(final String s) {
+ return s.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop").equals("qwertyuiop");
+ }
+
+ // concatenation of all numeric tokens
+ protected String getNumbers(final String s) {
+ final StringBuilder sb = new StringBuilder();
+ for (final String t : s.split(" ")) {
+ sb.append(isNumber(t) ? t : "");
+ }
+ return sb.toString();
+ }
+
+ // true when the token is an optionally-signed integer or decimal
+ public boolean isNumber(String strNum) {
+ if (strNum == null) {
+ return false;
+ }
+ return numberPattern.matcher(strNum).matches();
+ }
+
+ // maps each character through the aliases_from/aliases_to table
+ protected static String fixAliases(final String s) {
+ final StringBuilder sb = new StringBuilder();
+ for (final char ch : Lists.charactersOf(s)) {
+ final int i = StringUtils.indexOf(aliases_from, ch);
+ sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
+ }
+ return sb.toString();
+ }
+
+ // ICU Any-Eng transliteration; on any failure the input is returned unchanged
+ protected static String transliterate(final String s) {
+ try {
+ return transliterator.transliterate(s);
+ }
+ catch(Exception e) {
+ return s;
+ }
+ }
+
+ // keeps only [a-zA-Z0-9 ]; everything else becomes a space, then spaces collapse
+ protected String removeSymbols(final String s) {
+ final StringBuilder sb = new StringBuilder();
+
+ for (final char ch : Lists.charactersOf(s)) {
+ sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
+ }
+ return sb.toString().replaceAll("\\s+", " ");
+ }
+
+ protected boolean notNull(final String s) {
+ return s != null;
+ }
+
+ // lighter normalization than cleanup(): transliterate, drop diacritics,
+ // punctuation, digits and newlines, lowercase and trim
+ protected String normalize(final String s) {
+ return fixAliases(transliterate(nfd(unicodeNormalization(s))))
+ .toLowerCase()
+ // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
+ .replaceAll("[^ \\w]+", "")
+ .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
+ .replaceAll("(\\p{Punct})+", " ")
+ .replaceAll("(\\d)+", " ")
+ .replaceAll("(\\n)+", " ")
+ .trim();
+ }
+
+ // canonical decomposition (NFD) of the string
+ public String nfd(final String s) {
+ return Normalizer.normalize(s, Normalizer.Form.NFD);
+ }
+
+ // round-trips the string through UTF-8 bytes
+ public String utf8(final String s) {
+ byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
+ return new String(bytes, StandardCharsets.UTF_8);
+ }
+
+ // replaces literal \\uXXXX escape sequences with the corresponding character
+ public String unicodeNormalization(final String s) {
+
+ Matcher m = hexUnicodePattern.matcher(s);
+ StringBuffer buf = new StringBuffer(s.length());
+ while (m.find()) {
+ String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
+ m.appendReplacement(buf, Matcher.quoteReplacement(ch));
+ }
+ m.appendTail(buf);
+ return buf.toString();
+ }
+
+ // drops every whitespace-delimited token contained in the stopword set
+ protected String filterStopWords(final String s, final Set stopwords) {
+ final StringTokenizer st = new StringTokenizer(s);
+ final StringBuilder sb = new StringBuilder();
+ while (st.hasMoreTokens()) {
+ final String token = st.nextToken();
+ if (!stopwords.contains(token)) {
+ sb.append(token);
+ sb.append(" ");
+ }
+ }
+ return sb.toString().trim();
+ }
+
+ // applies every language-specific stopword list in sequence
+ public String filterAllStopWords(String s) {
+
+ s = filterStopWords(s, stopwords_en);
+ s = filterStopWords(s, stopwords_de);
+ s = filterStopWords(s, stopwords_it);
+ s = filterStopWords(s, stopwords_fr);
+ s = filterStopWords(s, stopwords_pt);
+ s = filterStopWords(s, stopwords_es);
+ s = filterStopWords(s, stopwords_gr);
+
+ return s;
+ }
+
+ // returns a copy of the collection without the blacklisted ngrams, preserving order
+ protected Collection filterBlacklisted(final Collection set, final Set ngramBlacklist) {
+ final Set newset = Sets.newLinkedHashSet();
+ for (final String s : set) {
+ if (!ngramBlacklist.contains(s)) {
+ newset.add(s);
+ }
+ }
+ return newset;
+ }
+
+ // loads a classpath resource as a set of transliterated lines;
+ // any failure yields an empty set (best-effort by design)
+ public static Set loadFromClasspath(final String classpath) {
+
+ Transliterator transliterator = Transliterator.getInstance("Any-Eng");
+
+ final Set h = Sets.newHashSet();
+ try {
+ for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
+ h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords
+ }
+ } catch (final Throwable e) {
+ return Sets.newHashSet();
+ }
+ return h;
+ }
+
+ // loads a semicolon-separated classpath resource into a word->code map;
+ // any failure yields an empty map (best-effort by design)
+ public static Map loadMapFromClasspath(final String classpath) {
+
+ Transliterator transliterator = Transliterator.getInstance("Any-Eng");
+
+ final Map m = new HashMap<>();
+ try {
+ for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
+ //string is like this: code;word1;word2;word3
+ String[] line = s.split(";");
+ String value = line[0];
+ for (int i = 1; i < line.length; i++) {
+ m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value);
+ }
+ }
+ } catch (final Throwable e) {
+ return new HashMap<>();
+ }
+ return m;
+ }
+
+ // removes every keyword occurrence from s (keywords are used as regexes here)
+ public String removeKeywords(String s, Set keywords) {
+
+ s = " " + s + " ";
+ for (String k : keywords) {
+ s = s.replaceAll(k.toLowerCase(), "");
+ }
+
+ return s.trim();
+ }
+
+ // |s1 ∩ s2| divided by the size of the larger set
+ public double commonElementsPercentage(Set s1, Set s2) {
+
+ double longer = Math.max(s1.size(), s2.size());
+ return (double) s1.stream().filter(s2::contains).count() / longer;
+ }
+
+ //convert the set of keywords to codes
+ public Set toCodes(Set keywords, Map translationMap) {
+ return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
+ }
+
+ public Set keywordsToCodes(Set keywords, Map translationMap) {
+ return toCodes(keywords, translationMap);
+ }
+
+ public Set citiesToCodes(Set keywords) {
+ return toCodes(keywords, cityMap);
+ }
+
+ // first character of s, lowercased (empty string stays empty)
+ protected String firstLC(final String s) {
+ return StringUtils.substring(s, 0, 1).toLowerCase();
+ }
+
+ // at most maxTokens space-separated tokens of s, trimmed, empties skipped
+ protected Iterable tokens(final String s, final int maxTokens) {
+ return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
+ }
+
+ // lowercases the pid and strips the DOI url/scheme prefix
+ public String normalizePid(String pid) {
+ return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
+ }
+
+ //get the list of keywords into the input string
+ // sliding-window search: tries windows of decreasing length, collecting every
+ // candidate found in the translation map and removing it from the working string
+ public Set getKeywords(String s1, Map translationMap, int windowSize) {
+
+ String s = s1;
+
+ List tokens = Arrays.asList(s.toLowerCase().split(" "));
+
+ Set codes = new HashSet<>();
+
+ if (tokens.size() < windowSize)
+ windowSize = tokens.size();
+
+ int length = windowSize;
+
+ while (length != 0) {
+
+ for (int i = 0; i <= tokens.size() - length; i++) {
+ String candidate = concat(tokens.subList(i, i + length));
+ if (translationMap.containsKey(candidate)) {
+ codes.add(candidate);
+ s = s.replace(candidate, "").trim();
+ }
+ }
+
+ // re-tokenize after removals, then shrink the window
+ tokens = Arrays.asList(s.split(" "));
+ length -= 1;
+ }
+
+ return codes;
+ }
+
+ // keyword extraction specialized on the city map
+ public Set getCities(String s1, int windowSize) {
+ return getKeywords(s1, cityMap, windowSize);
+ }
+
+ // reads a classpath resource into a string; wraps IO failures as RuntimeException
+ public static String readFromClasspath(final String filename, final Class clazz) {
+ final StringWriter sw = new StringWriter();
+ try {
+ IOUtils.copy(clazz.getResourceAsStream(filename), sw);
+ return sw.toString();
+ } catch (final IOException e) {
+ throw new RuntimeException("cannot load resource from classpath: " + filename);
+ }
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
new file mode 100644
index 000000000..0db0270fb
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
@@ -0,0 +1,54 @@
+package eu.dnetlib.pace.config;
+
+import java.util.List;
+import java.util.Map;
+import java.util.function.Predicate;
+import java.util.regex.Pattern;
+
+import eu.dnetlib.pace.model.ClusteringDef;
+import eu.dnetlib.pace.model.FieldDef;
+import eu.dnetlib.pace.tree.support.TreeNodeDef;
+
+/**
+ * Interface for PACE configuration bean.
+ *
+ * Exposes the dedup model (fields, decision tree, clusterings) together with
+ * the blacklists and the synonym translation map.
+ *
+ * @author claudio
+ */
+public interface Config {
+
+ /**
+ * Field configuration definitions.
+ *
+ * @return the list of definitions
+ */
+ public List model();
+
+ /**
+ * Decision Tree definition
+ *
+ * @return the map representing the decision tree
+ */
+ public Map decisionTree();
+
+ /**
+ * Clusterings.
+ *
+ * @return the list
+ */
+ public List clusterings();
+
+ /**
+ * Blacklists.
+ *
+ * @return the map
+ */
+ public Map> blacklists();
+
+
+ /**
+ * Translation map.
+ *
+ * @return the map
+ * */
+ public Map translationMap();
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
new file mode 100644
index 000000000..63fc96aef
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
@@ -0,0 +1,163 @@
+package eu.dnetlib.pace.config;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Maps;
+import eu.dnetlib.pace.model.ClusteringDef;
+import eu.dnetlib.pace.model.FieldDef;
+import eu.dnetlib.pace.util.PaceException;
+import org.antlr.stringtemplate.StringTemplate;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.nio.charset.StandardCharsets;
+import java.util.AbstractMap;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.function.Predicate;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+import java.util.stream.Collectors;
+
+
+import eu.dnetlib.pace.tree.support.TreeNodeDef;
+
+
+/**
+ * Top-level dedup configuration: PACE settings (pace) plus workflow settings (wf).
+ * Deserialized from JSON via load(), or built from the dedupConfig.st template
+ * with defaults via loadDefault().
+ */
+public class DedupConfig implements Config, Serializable {
+ // StringTemplate resource used by loadDefault()
+ private static String CONFIG_TEMPLATE = "dedupConfig.st";
+
+ private PaceConfig pace;
+
+ private WfConfig wf;
+
+ // compiled blacklist predicates, derived from pace.getBlacklists() in load()
+ @JsonIgnore
+ private Map> blacklists;
+
+ // default attribute values applied to the configuration template
+ private static Map defaults = Maps.newHashMap();
+
+ static {
+ defaults.put("dedupRun", "001");
+ defaults.put("entityType", "result");
+ defaults.put("subEntityType", "resulttype");
+ defaults.put("subEntityValue", "publication");
+ defaults.put("orderField", "title");
+ defaults.put("queueMaxSize", "2000");
+ defaults.put("groupMaxSize", "10");
+ defaults.put("slidingWindowSize", "200");
+ defaults.put("rootBuilder", "result");
+ defaults.put("includeChildren", "true");
+ defaults.put("maxIterations", "20");
+ defaults.put("idPath", "$.id");
+ }
+
+ public DedupConfig() {
+ }
+
+ /**
+ * Parses a JSON configuration, initializes the model and translation maps,
+ * and compiles each blacklist into a serializable match predicate
+ * (blank entries are skipped).
+ *
+ * @throws PaceException on JSON or regex errors
+ */
+ public static DedupConfig load(final String json) {
+
+ final DedupConfig config;
+ try {
+ config = new ObjectMapper().readValue(json, DedupConfig.class);
+ config.getPace().initModel();
+ config.getPace().initTranslationMap();
+
+ config.blacklists = config.getPace().getBlacklists().entrySet()
+ .stream()
+ .map(e -> new AbstractMap.SimpleEntry>(e.getKey(), e.getValue().stream().filter(s -> !StringUtils.isBlank(s)).map(Pattern::compile).collect(Collectors.toList())))
+ .collect(Collectors.toMap(e -> e.getKey(),
+ e -> (Predicate & Serializable) s -> e.getValue().stream().filter(p -> p.matcher(s).matches()).findFirst().isPresent()))
+
+ ;
+
+ return config;
+ } catch (IOException |
+ PatternSyntaxException e) {
+ throw new PaceException("Error in parsing configuration json", e);
+ }
+
+ }
+
+ // builds the default configuration with no parameter overrides
+ public static DedupConfig loadDefault() throws IOException {
+ return loadDefault(new HashMap());
+ }
+
+ /**
+ * Renders the configuration template with the defaults, overridden/extended
+ * by the given params, and parses the resulting JSON.
+ */
+ public static DedupConfig loadDefault(final Map params) throws IOException {
+
+ final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE));
+
+ for (final Entry e : defaults.entrySet()) {
+ template.setAttribute(e.getKey(), e.getValue());
+ }
+ for (final Entry e : params.entrySet()) {
+ if (template.getAttribute(e.getKey()) != null) {
+ template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue());
+ } else {
+ template.setAttribute(e.getKey(), e.getValue());
+ }
+ }
+
+ final String json = template.toString();
+ return load(json);
+ }
+
+ private String readFromClasspath(final String resource) throws IOException {
+ return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8);
+ }
+
+ public PaceConfig getPace() {
+ return pace;
+ }
+
+ public void setPace(final PaceConfig pace) {
+ this.pace = pace;
+ }
+
+ public WfConfig getWf() {
+ return wf;
+ }
+
+ public void setWf(final WfConfig wf) {
+ this.wf = wf;
+ }
+
+ // serializes the configuration back to JSON
+ @Override
+ public String toString() {
+ try {
+ return new ObjectMapper().writeValueAsString(this);
+ } catch (IOException e) {
+ throw new PaceException("unable to serialise configuration", e);
+ }
+ }
+
+ // Config implementation: delegates to the nested PaceConfig
+ @Override
+ public Map decisionTree() {
+ return getPace().getDecisionTree();
+ }
+
+ @Override
+ public List model() {
+ return getPace().getModel();
+ }
+
+ @Override
+ public List clusterings() {
+ return getPace().getClustering();
+ }
+
+ @Override
+ public Map> blacklists() {
+ return blacklists;
+ }
+
+ @Override
+ public Map translationMap() {
+ return getPace().translationMap();
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
new file mode 100644
index 000000000..dc87a1b06
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
@@ -0,0 +1,105 @@
+package eu.dnetlib.pace.config;
+
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.google.common.collect.Maps;
+import com.ibm.icu.text.Transliterator;
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.model.ClusteringDef;
+import eu.dnetlib.pace.model.FieldDef;
+import eu.dnetlib.pace.tree.support.TreeNodeDef;
+import eu.dnetlib.pace.util.PaceResolver;
+
+import java.io.Serializable;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * PACE-specific part of the dedup configuration: the field model, the
+ * clustering definitions, the decision tree, blacklists and synonyms.
+ * initModel() and initTranslationMap() must be called after deserialization
+ * (see DedupConfig.load) to populate the derived lookup maps.
+ */
+public class PaceConfig extends AbstractPaceFunctions implements Serializable {
+
+ private List model;
+
+ private List clustering;
+ private Map decisionTree;
+
+ private Map> blacklists;
+ private Map> synonyms;
+
+ // derived from synonyms by initTranslationMap(); not serialized
+ @JsonIgnore
+ private Map translationMap;
+
+ public Map getModelMap() {
+ return modelMap;
+ }
+
+ // field-name -> FieldDef index, derived from model by initModel(); not serialized
+ @JsonIgnore
+ private Map modelMap;
+
+ // global registry used to resolve clustering/comparator names into instances
+ @JsonIgnore
+ public static PaceResolver resolver = new PaceResolver();
+
+ public PaceConfig() {}
+
+ // builds the name -> FieldDef map from the declared model
+ public void initModel() {
+ modelMap = Maps.newHashMap();
+ for (FieldDef fd : getModel()) {
+ modelMap.put(fd.getName(), fd);
+ }
+ }
+
+ // builds the term -> key map from the synonyms, normalizing each term
+ // with Any-Eng transliteration and alias fixing (same treatment applied
+ // to input strings by AbstractPaceFunctions)
+ public void initTranslationMap(){
+ translationMap = Maps.newHashMap();
+
+ Transliterator transliterator = Transliterator.getInstance("Any-Eng");
+ for (String key : synonyms.keySet()) {
+ for (String term : synonyms.get(key)){
+ translationMap.put(
+ fixAliases(transliterator.transliterate(term.toLowerCase())),
+ key);
+ }
+ }
+ }
+
+ public Map translationMap(){
+ return translationMap;
+ }
+
+ public List getModel() {
+ return model;
+ }
+
+ public void setModel(final List model) {
+ this.model = model;
+ }
+
+ public List getClustering() {
+ return clustering;
+ }
+
+ public void setClustering(final List clustering) {
+ this.clustering = clustering;
+ }
+
+ public Map getDecisionTree() {
+ return decisionTree;
+ }
+
+ public void setDecisionTree(Map decisionTree) {
+ this.decisionTree = decisionTree;
+ }
+
+ public Map> getBlacklists() {
+ return blacklists;
+ }
+
+ public void setBlacklists(final Map> blacklists) {
+ this.blacklists = blacklists;
+ }
+
+ public Map> getSynonyms() {
+ return synonyms;
+ }
+
+ public void setSynonyms(Map> synonyms) {
+ this.synonyms = synonyms;
+ }
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java
new file mode 100644
index 000000000..20981c427
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java
@@ -0,0 +1,5 @@
+package eu.dnetlib.pace.config;
+
+/**
+ * Data types a field in the dedup model (FieldDef) can declare.
+ */
+public enum Type {
+ String, Int, List, JSON, URL, StringConcat, DoubleArray
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java
new file mode 100644
index 000000000..78fc18a13
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java
@@ -0,0 +1,292 @@
+package eu.dnetlib.pace.config;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.util.PaceException;
+import org.apache.commons.lang3.StringUtils;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+
+/**
+ * Workflow-level dedup settings: entity/sub-entity selection, ordering,
+ * queue/group/window sizing, iteration limits and the JSON path used to
+ * extract entity identifiers.
+ */
+public class WfConfig implements Serializable {
+
+ /**
+ * Entity type.
+ */
+ private String entityType = "";
+
+ /**
+ * Sub-Entity type refers to one of fields declared in the model. See eu.dnetlib.pace.config.PaceConfig.modelMap
+ */
+ private String subEntityType = "";
+
+ /**
+ * Sub-Entity value declares a value for subTypes to be considered.
+ */
+ private String subEntityValue = "";
+
+ /**
+ * Field name used to sort the values in the reducer phase.
+ */
+ private String orderField = "";
+
+ /**
+ * Column Families involved in the relations redirection.
+ */
+ private List rootBuilder = Lists.newArrayList();
+
+ /**
+ * Set of datasource namespace prefixes that won't be deduplicated.
+ */
+ private Set skipList = Sets.newHashSet();
+
+ /**
+ * Subprefix used to build the root id, allows multiple dedup runs.
+ */
+ private String dedupRun = "";
+
+ /**
+ * Similarity threshold.
+ */
+ private double threshold = 0;
+
+ /** The queue max size. */
+ private int queueMaxSize = 2000;
+
+ /** The group max size. */
+ private int groupMaxSize;
+
+ /** The sliding window size. */
+ private int slidingWindowSize;
+
+ /** The configuration id. */
+ private String configurationId;
+
+ /** The include children. */
+ private boolean includeChildren;
+
+ /** Default maximum number of allowed children. */
+ private final static int MAX_CHILDREN = 10;
+
+ /** Maximum number of allowed children. */
+ private int maxChildren = MAX_CHILDREN;
+
+
+ /** Default maximum number of iterations. */
+ private final static int MAX_ITERATIONS = 20;
+
+ /** Maximum number of iterations */
+ private int maxIterations = MAX_ITERATIONS;
+
+ /** The Jquery path to retrieve the identifier */
+ private String idPath = "$.id";
+
+ public WfConfig() {}
+
+ /**
+ * Instantiates a new dedup config.
+ *
+ * @param entityType
+ * the entity type
+ * @param orderField
+ * the order field
+ * @param rootBuilder
+ * the root builder families
+ * @param dedupRun
+ * the dedup run
+ * @param skipList
+ * the skip list
+ * @param queueMaxSize
+ * the queue max size
+ * @param groupMaxSize
+ * the group max size
+ * @param slidingWindowSize
+ * the sliding window size
+ * @param includeChildren
+ * allows the children to be included in the representative records or not.
+ * @param maxIterations
+ * the maximum number of iterations
+ * @param idPath
+ * the path for the id of the entity
+ */
+ public WfConfig(final String entityType, final String orderField, final List rootBuilder, final String dedupRun,
+ final Set skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations, final String idPath) {
+ super();
+ this.entityType = entityType;
+ this.orderField = orderField;
+ this.rootBuilder = rootBuilder;
+ this.dedupRun = cleanupStringNumber(dedupRun);
+ this.skipList = skipList;
+ this.queueMaxSize = queueMaxSize;
+ this.groupMaxSize = groupMaxSize;
+ this.slidingWindowSize = slidingWindowSize;
+ this.includeChildren = includeChildren;
+ this.maxIterations = maxIterations;
+ this.idPath = idPath;
+ }
+
+ /**
+ * Cleanup string number: strips single quotes from the dedup run value.
+ *
+ * @param s
+ * the s
+ * @return the string
+ */
+ private String cleanupStringNumber(final String s) {
+ return s.contains("'") ? s.replaceAll("'", "") : s;
+ }
+
+ // true when both sub-entity type and value are configured
+ public boolean hasSubType() {
+ return StringUtils.isNotBlank(getSubEntityType()) && StringUtils.isNotBlank(getSubEntityValue());
+ }
+
+ public String getEntityType() {
+ return entityType;
+ }
+
+ public void setEntityType(final String entityType) {
+ this.entityType = entityType;
+ }
+
+ public String getSubEntityType() {
+ return subEntityType;
+ }
+
+ public void setSubEntityType(final String subEntityType) {
+ this.subEntityType = subEntityType;
+ }
+
+ public String getSubEntityValue() {
+ return subEntityValue;
+ }
+
+ public void setSubEntityValue(final String subEntityValue) {
+ this.subEntityValue = subEntityValue;
+ }
+
+ public String getOrderField() {
+ return orderField;
+ }
+
+ public void setOrderField(final String orderField) {
+ this.orderField = orderField;
+ }
+
+ public List getRootBuilder() {
+ return rootBuilder;
+ }
+
+ public void setRootBuilder(final List rootBuilder) {
+ this.rootBuilder = rootBuilder;
+ }
+
+ // never returns null: falls back to an empty set
+ public Set getSkipList() {
+ return skipList != null ? skipList : new HashSet();
+ }
+
+ public void setSkipList(final Set skipList) {
+ this.skipList = skipList;
+ }
+
+ public String getDedupRun() {
+ return dedupRun;
+ }
+
+ public void setDedupRun(final String dedupRun) {
+ this.dedupRun = dedupRun;
+ }
+
+ public double getThreshold() {
+ return threshold;
+ }
+
+ public void setThreshold(final double threshold) {
+ this.threshold = threshold;
+ }
+
+ public int getQueueMaxSize() {
+ return queueMaxSize;
+ }
+
+ public void setQueueMaxSize(final int queueMaxSize) {
+ this.queueMaxSize = queueMaxSize;
+ }
+
+ public int getGroupMaxSize() {
+ return groupMaxSize;
+ }
+
+ public void setGroupMaxSize(final int groupMaxSize) {
+ this.groupMaxSize = groupMaxSize;
+ }
+
+ public int getSlidingWindowSize() {
+ return slidingWindowSize;
+ }
+
+ public void setSlidingWindowSize(final int slidingWindowSize) {
+ this.slidingWindowSize = slidingWindowSize;
+ }
+
+ public String getConfigurationId() {
+ return configurationId;
+ }
+
+ public void setConfigurationId(final String configurationId) {
+ this.configurationId = configurationId;
+ }
+
+ public boolean isIncludeChildren() {
+ return includeChildren;
+ }
+
+ public void setIncludeChildren(final boolean includeChildren) {
+ this.includeChildren = includeChildren;
+ }
+
+ public int getMaxChildren() {
+ return maxChildren;
+ }
+
+ public void setMaxChildren(final int maxChildren) {
+ this.maxChildren = maxChildren;
+ }
+
+
+ public int getMaxIterations() {
+ return maxIterations;
+ }
+
+ public void setMaxIterations(int maxIterations) {
+ this.maxIterations = maxIterations;
+ }
+
+ public String getIdPath() {
+ return idPath;
+ }
+
+ public void setIdPath(String idPath) {
+ this.idPath = idPath;
+
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#toString()
+ */
+ @Override
+ public String toString() {
+ try {
+ return new ObjectMapper().writeValueAsString(this);
+ } catch (IOException e) {
+ throw new PaceException("unable to serialise " + this.getClass().getName(), e);
+ }
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java
new file mode 100644
index 000000000..c15885ecf
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java
@@ -0,0 +1,61 @@
+package eu.dnetlib.pace.model;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.pace.clustering.ClusteringFunction;
+import eu.dnetlib.pace.config.PaceConfig;
+import eu.dnetlib.pace.util.PaceException;
+
+import java.io.IOException;
+import java.io.Serializable;
+import java.util.List;
+import java.util.Map;
+
+
+/**
+ * Declares one clustering step of the dedup configuration: the registered
+ * clustering-function name, the model fields it applies to, and its parameters.
+ */
+public class ClusteringDef implements Serializable {
+
+ // name under which the function is registered (see @ClusteringClass)
+ private String name;
+
+ private List fields;
+
+ private Map params;
+
+ public ClusteringDef() {}
+
+ public String getName() {
+ return name;
+ }
+
+ public void setName(final String name) {
+ this.name = name;
+ }
+
+ // resolves this definition into a ClusteringFunction instance via the
+ // global PaceResolver registry
+ public ClusteringFunction clusteringFunction() {
+ return PaceConfig.resolver.getClusteringFunction(getName(), params);
+ }
+
+ public List getFields() {
+ return fields;
+ }
+
+ public void setFields(final List fields) {
+ this.fields = fields;
+ }
+
+ public Map getParams() {
+ return params;
+ }
+
+ public void setParams(final Map params) {
+ this.params = params;
+ }
+
+ // JSON representation of the definition
+ @Override
+ public String toString() {
+ try {
+ return new ObjectMapper().writeValueAsString(this);
+ } catch (IOException e) {
+ throw new PaceException("unable to serialise " + this.getClass().getName(), e);
+ }
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
new file mode 100644
index 000000000..196ac7248
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
@@ -0,0 +1,100 @@
+package eu.dnetlib.pace.model;
+
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Type;
+
+import java.io.Serializable;
+import java.util.List;
+
+/**
+ * The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm.
+ */
+public class FieldDef implements Serializable {
+
+ // separator used to split `path` into its components (see getPathList)
+ public final static String PATH_SEPARATOR = "/";
+
+ private String name;
+
+ // location of the field value within the source record
+ private String path;
+
+ private Type type;
+
+ private boolean overrideMatch;
+
+ /**
+ * Sets maximum size for the repeatable fields in the model. -1 for unbounded size.
+ */
+ private int size = -1;
+
+ /**
+ * Sets maximum length for field values in the model. -1 for unbounded length.
+ */
+ private int length = -1;
+
+ public FieldDef() {}
+
+ public String getName() {
+ return name;
+ }
+
+ public String getPath() {
+ return path;
+ }
+
+ // path split on PATH_SEPARATOR into its components
+ public List getPathList() {
+ return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath()));
+ }
+
+ public Type getType() {
+ return type;
+ }
+
+ public void setType(final Type type) {
+ this.type = type;
+ }
+
+ public boolean isOverrideMatch() {
+ return overrideMatch;
+ }
+
+ public void setOverrideMatch(final boolean overrideMatch) {
+ this.overrideMatch = overrideMatch;
+ }
+
+ public int getSize() {
+ return size;
+ }
+
+ public void setSize(int size) {
+ this.size = size;
+ }
+
+ public int getLength() {
+ return length;
+ }
+
+ public void setLength(int length) {
+ this.length = length;
+ }
+
+ public void setName(String name) {
+ this.name = name;
+ }
+
+ public void setPath(String path) {
+ this.path = path;
+ }
+
+ // JSON representation; returns null if serialization fails (best-effort)
+ @Override
+ public String toString() {
+ try {
+ return new ObjectMapper().writeValueAsString(this);
+ } catch (JsonProcessingException e) {
+ return null;
+ }
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
new file mode 100644
index 000000000..543b1bdfe
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
@@ -0,0 +1,155 @@
+package eu.dnetlib.pace.model;
+
+import java.nio.charset.Charset;
+import java.text.Normalizer;
+import java.util.List;
+import java.util.Set;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.hash.Hashing;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.util.Capitalise;
+import eu.dnetlib.pace.util.DotAbbreviations;
+
+public class Person {
+
+ private static final String UTF8 = "UTF-8";
+ private List name = Lists.newArrayList();
+ private List surname = Lists.newArrayList();
+ private List fullname = Lists.newArrayList();
+ private final String original;
+
+ private static Set particles = null;
+
+ public Person(String s, final boolean aggressive) {
+ original = s;
+ s = Normalizer.normalize(s, Normalizer.Form.NFD);
+ s = s.replaceAll("\\(.+\\)", "");
+ s = s.replaceAll("\\[.+\\]", "");
+ s = s.replaceAll("\\{.+\\}", "");
+ s = s.replaceAll("\\s+-\\s+", "-");
+ s = s.replaceAll("[\\p{Punct}&&[^,-]]", " ");
+ s = s.replaceAll("\\d", " ");
+ s = s.replaceAll("\\n", " ");
+ s = s.replaceAll("\\.", " ");
+ s = s.replaceAll("\\s+", " ");
+
+ if (aggressive) {
+ s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", "");
+ // s = s.replaceAll("[\\W&&[^,-]]", "");
+ }
+
+ if (s.contains(",")) { //if the name contains a comma it is easy derivable the name and the surname
+ final String[] arr = s.split(",");
+ if (arr.length == 1) {
+ fullname = splitTerms(arr[0]);
+ } else if (arr.length > 1) {
+ surname = splitTerms(arr[0]);
+ name = splitTerms(arr[1]);
+ fullname.addAll(surname);
+ fullname.addAll(name);
+ }
+ } else {
+ fullname = splitTerms(s);
+
+ int lastInitialPosition = fullname.size();
+ boolean hasSurnameInUpperCase = false;
+
+ for (int i = 0; i < fullname.size(); i++) {
+ final String term = fullname.get(i);
+ if (term.length() == 1) {
+ lastInitialPosition = i;
+ } else if (term.equals(term.toUpperCase())) {
+ hasSurnameInUpperCase = true;
+ }
+ }
+
+ if (lastInitialPosition < (fullname.size() - 1)) { // Case: Michele G. Artini
+ name = fullname.subList(0, lastInitialPosition + 1);
+ surname = fullname.subList(lastInitialPosition + 1, fullname.size());
+ } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
+ for (final String term : fullname) {
+ if ((term.length() > 1) && term.equals(term.toUpperCase())) {
+ surname.add(term);
+ } else {
+ name.add(term);
+ }
+ }
+ }
+ }
+ }
+
+ private List splitTerms(final String s) {
+ if (particles == null) {
+ particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
+ }
+
+ final List list = Lists.newArrayList();
+ for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
+ if (!particles.contains(part.toLowerCase())) {
+ list.add(part);
+ }
+ }
+ return list;
+ }
+
+ public List getName() {
+ return name;
+ }
+
+ public String getNameString() {
+ return Joiner.on(" ").join(getName());
+ }
+
+ public List getSurname() {
+ return surname;
+ }
+
+ public List getFullname() {
+ return fullname;
+ }
+
+ public String getOriginal() {
+ return original;
+ }
+
+ public String hash() {
+ return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString();
+ }
+
+ public String getNormalisedFirstName() {
+ return Joiner.on(" ").join(getCapitalFirstnames());
+ }
+
+ public String getNormalisedSurname() {
+ return Joiner.on(" ").join(getCapitalSurname());
+ }
+
+ public String getSurnameString() {
+ return Joiner.on(" ").join(getSurname());
+ }
+
+ public String getNormalisedFullname() {
+ return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname);
+ }
+
+ public List getCapitalFirstnames() {
+ return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), new Capitalise()));
+ }
+
+ public List getCapitalSurname() {
+ return Lists.newArrayList(Iterables.transform(surname, new Capitalise()));
+ }
+
+ public List getNameWithAbbreviations() {
+ return Lists.newArrayList(Iterables.transform(name, new DotAbbreviations()));
+ }
+
+ public boolean isAccurate() {
+ return ((name != null) && (surname != null) && !name.isEmpty() && !surname.isEmpty());
+ }
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java
new file mode 100644
index 000000000..a900a6082
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java
@@ -0,0 +1,118 @@
+package eu.dnetlib.pace.model;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+public class PersonComparatorUtils {
+
+ private static final int MAX_FULLNAME_LENGTH = 50;
+
+ public static Set getNgramsForPerson(String fullname) {
+
+ Set set = Sets.newHashSet();
+
+ if (fullname.length() > MAX_FULLNAME_LENGTH) {
+ return set;
+ }
+
+ Person p = new Person(fullname, true);
+
+ if (p.isAccurate()) {
+ for (String name : p.getName()) {
+ for (String surname : p.getSurname()) {
+ set.add((name.charAt(0) + "_" + surname).toLowerCase());
+ }
+ }
+ } else {
+ List list = p.getFullname();
+ for (int i = 0; i < list.size(); i++) {
+ if (list.get(i).length() > 1) {
+ for (int j = 0; j < list.size(); j++) {
+ if (i != j) {
+ set.add((list.get(j).charAt(0) + "_" + list.get(i)).toLowerCase());
+ }
+ }
+ }
+ }
+ }
+
+ return set;
+ }
+
+ public static boolean areSimilar(String s1, String s2) {
+ Person p1 = new Person(s1, true);
+ Person p2 = new Person(s2, true);
+
+ if (p1.isAccurate() && p2.isAccurate()) {
+ return verifyNames(p1.getName(), p2.getName()) && verifySurnames(p1.getSurname(), p2.getSurname());
+ } else {
+ return verifyFullnames(p1.getFullname(), p2.getFullname());
+ }
+ }
+
+ private static boolean verifyNames(List list1, List list2) {
+ return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2))
+ && verifySimilarity(extractInitials(list1), extractInitials(list2));
+ }
+
+ private static boolean verifySurnames(List list1, List list2) {
+ if (list1.size() != list2.size()) {
+ return false;
+ }
+ for (int i = 0; i < list1.size(); i++) {
+ if (!list1.get(i).equalsIgnoreCase(list2.get(i))) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private static boolean verifyFullnames(List list1, List list2) {
+ Collections.sort(list1);
+ Collections.sort(list2);
+ return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2))
+ && verifySimilarity(extractInitials(list1), extractInitials(list2));
+ }
+
+ private static List extractExtendedNames(List list) {
+ ArrayList res = Lists.newArrayList();
+ for (String s : list) {
+ if (s.length() > 1) {
+ res.add(s.toLowerCase());
+ }
+ }
+ return res;
+ }
+
+ private static List extractInitials(List list) {
+ ArrayList res = Lists.newArrayList();
+ for (String s : list) {
+ res.add(s.substring(0, 1).toLowerCase());
+ }
+ return res;
+ }
+
+ private static boolean verifySimilarity(List list1, List list2) {
+ if (list1.size() > list2.size()) {
+ return verifySimilarity(list2, list1);
+ }
+
+ // NB: List2 is greater than list1 (or equal)
+ int pos = -1;
+ for (String s : list1) {
+ int curr = list2.indexOf(s);
+ if (curr > pos) {
+ list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm"
+ pos = curr;
+ } else {
+ return false;
+ }
+ }
+ return true;
+ }
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java
new file mode 100644
index 000000000..3926b2897
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java
@@ -0,0 +1,54 @@
+package eu.dnetlib.pace.model;
+
+import eu.dnetlib.pace.clustering.NGramUtils;
+import org.apache.spark.sql.Row;
+
+import java.util.Comparator;
+
+/**
+ * The Class MapDocumentComparator.
+ */
+public class RowDataOrderingComparator implements Comparator {
+
+ /** The comparator field. */
+ private int comparatorField;
+
+ /**
+ * Instantiates a new map document comparator.
+ *
+ * @param comparatorField
+ * the comparator field
+ */
+ public RowDataOrderingComparator(final int comparatorField) {
+ this.comparatorField = comparatorField;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
+ */
+ @Override
+ public int compare(final Row d1, final Row d2) {
+ if (d1 == null)
+ return d2==null ? 0: -1;
+ else if (d2 == null) {
+ return 1;
+ }
+
+ final String o1 = d1.getString(comparatorField);
+ final String o2 = d2.getString(comparatorField);
+
+ if (o1 == null)
+ return o2==null ? 0: -1;
+ else if (o2 == null) {
+ return 1;
+ }
+
+ final String to1 = NGramUtils.cleanupForOrdering(o1);
+ final String to2 = NGramUtils.cleanupForOrdering(o2);
+
+ return to1.compareTo(to2);
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java
new file mode 100644
index 000000000..aaac36ad7
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java
@@ -0,0 +1,41 @@
+package eu.dnetlib.pace.tree;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.Map;
+
+@ComparatorClass("alwaysMatch")
+public class AlwaysMatch extends AbstractComparator {
+
+ public AlwaysMatch(final Map params){
+ super(params, new com.wcohen.ss.JaroWinkler());
+ }
+
+ public AlwaysMatch(final double weight) {
+ super(weight, new com.wcohen.ss.JaroWinkler());
+ }
+
+ protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
+
+ @Override
+ public double compare(final Object a, final Object b, final Config conf) {
+ return 1.0;
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(final double d) {
+ return d;
+ }
+
+}
+
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
new file mode 100644
index 000000000..eedc7f562
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
@@ -0,0 +1,148 @@
+package eu.dnetlib.pace.tree;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Person;
+import eu.dnetlib.pace.tree.support.AbstractListComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+@ComparatorClass("authorsMatch")
+public class AuthorsMatch extends AbstractListComparator {
+
+ Map params;
+
+ private double SURNAME_THRESHOLD;
+ private double NAME_THRESHOLD;
+ private double FULLNAME_THRESHOLD;
+ private String MODE; //full or surname
+ private int SIZE_THRESHOLD;
+ private String TYPE; //count or percentage
+ private int common;
+
+ public AuthorsMatch(Map params){
+ super(params, new com.wcohen.ss.JaroWinkler());
+ this.params = params;
+
+ MODE = params.getOrDefault("mode", "full");
+ SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
+ NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
+ FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
+ SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
+ TYPE = params.getOrDefault("type", "percentage");
+ common = 0;
+ }
+
+ protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
+ super(w, ssalgo);
+ }
+
+ @Override
+ public double compare(final List a, final List b, final Config conf) {
+
+ if (a.isEmpty() || b.isEmpty())
+ return -1;
+
+ if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
+ return 1.0;
+
+ List aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
+ List bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
+
+ common = 0;
+ //compare each element of List1 with each element of List2
+ for (Person p1 : aList)
+
+ for (Person p2 : bList) {
+
+ //both persons are inaccurate
+ if (!p1.isAccurate() && !p2.isAccurate()) {
+ //compare just normalized fullnames
+ String fullname1 = normalization(p1.getNormalisedFullname().isEmpty()? p1.getOriginal() : p1.getNormalisedFullname());
+ String fullname2 = normalization(p2.getNormalisedFullname().isEmpty()? p2.getOriginal() : p2.getNormalisedFullname());
+
+ if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) {
+ common += 1;
+ break;
+ }
+ }
+
+ //one person is inaccurate
+ if (p1.isAccurate() ^ p2.isAccurate()) {
+ //prepare data
+ //data for the accurate person
+ String name = normalization(p1.isAccurate()? p1.getNormalisedFirstName() : p2.getNormalisedFirstName());
+ String surname = normalization(p1.isAccurate()? p1.getNormalisedSurname() : p2.getNormalisedSurname());
+
+ //data for the inaccurate person
+ String fullname = normalization(
+ p1.isAccurate() ? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname()) : (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname())
+ );
+
+ if (fullname.contains(surname)) {
+ if (MODE.equals("full")) {
+ if (fullname.contains(name)) {
+ common += 1;
+ break;
+ }
+ }
+ else { //MODE equals "surname"
+ common += 1;
+ break;
+ }
+ }
+ }
+
+ //both persons are accurate
+ if (p1.isAccurate() && p2.isAccurate()) {
+
+ if (compareSurname(p1, p2)) {
+ if (MODE.equals("full")) {
+ if(compareFirstname(p1, p2)) {
+ common += 1;
+ break;
+ }
+ }
+ else { //MODE equals "surname"
+ common += 1;
+ break;
+ }
+ }
+
+ }
+
+ }
+
+ //normalization factor to compute the score
+ int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
+
+ if(TYPE.equals("percentage")) {
+ return (double) common / normFactor;
+ }
+ else {
+ return (double) common;
+ }
+ }
+
+ public boolean compareSurname(Person p1, Person p2) {
+ return ssalgo.score(normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD;
+ }
+
+ public boolean compareFirstname(Person p1, Person p2) {
+
+ if(p1.getNormalisedFirstName().length()<=2 || p2.getNormalisedFirstName().length()<=2) {
+ if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName())))
+ return true;
+ }
+
+ return ssalgo.score(normalization(p1.getNormalisedFirstName()), normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD;
+ }
+
+ public String normalization(String s) {
+ return normalize(utf8(cleanup(s)));
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java
new file mode 100644
index 000000000..a51d07eb7
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java
@@ -0,0 +1,47 @@
+package eu.dnetlib.pace.tree;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.Map;
+import java.util.Set;
+
+@ComparatorClass("cityMatch")
+public class CityMatch extends AbstractStringComparator {
+
+ private Map params;
+
+ public CityMatch(Map params) {
+ super(params);
+ this.params = params;
+ }
+
+ @Override
+ public double distance(final String a, final String b, final Config conf) {
+
+ String ca = cleanup(a);
+ String cb = cleanup(b);
+
+ ca = normalize(ca);
+ cb = normalize(cb);
+
+ ca = filterAllStopWords(ca);
+ cb = filterAllStopWords(cb);
+
+ Set cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
+ Set cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
+
+ Set codes1 = citiesToCodes(cities1);
+ Set codes2 = citiesToCodes(cities2);
+
+ //if no cities are detected, the comparator gives 1.0
+ if (codes1.isEmpty() && codes2.isEmpty())
+ return 1.0;
+ else {
+ if (codes1.isEmpty() ^ codes2.isEmpty())
+ return -1; //undefined if one of the two has no cities
+ return commonElementsPercentage(codes1, codes2);
+ }
+ }
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java
new file mode 100644
index 000000000..59e5dd346
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java
@@ -0,0 +1,47 @@
+package eu.dnetlib.pace.tree;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.Map;
+
+@ComparatorClass("cosineSimilarity")
+public class CosineSimilarity extends AbstractComparator {
+
+ Map params;
+
+ public CosineSimilarity(Map params) {
+ super(params);
+ }
+
+ @Override
+ public double compare(Object a, Object b, Config config) {
+ return compare((double[])a, (double[])b, config);
+ }
+
+ public double compare(final double[] a, final double[] b, final Config conf) {
+
+ if (a.length == 0 || b.length == 0)
+ return -1;
+
+ return cosineSimilarity(a, b);
+ }
+
+ double cosineSimilarity(double[] a, double[] b) {
+ double dotProduct = 0;
+ double normASum = 0;
+ double normBSum = 0;
+
+ for(int i = 0; i < a.length; i ++) {
+ dotProduct += a[i] * b[i];
+ normASum += a[i] * a[i];
+ normBSum += b[i] * b[i];
+ }
+
+ double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum);
+ return dotProduct / eucledianDist;
+ }
+
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java
new file mode 100644
index 000000000..429882450
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java
@@ -0,0 +1,26 @@
+package eu.dnetlib.pace.tree;
+
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.Map;
+
+/**
+ * The Class ExactMatch.
+ *
+ * @author claudio
+ */
+@ComparatorClass("doiExactMatch")
+public class DoiExactMatch extends ExactMatchIgnoreCase {
+
+ public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
+
+ public DoiExactMatch(final Map params) {
+ super(params);
+ }
+
+ @Override
+ protected String toString(final Object f) {
+ return super.toString(f).replaceAll(PREFIX, "");
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java
new file mode 100644
index 000000000..2e99595e0
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java
@@ -0,0 +1,29 @@
+package eu.dnetlib.pace.tree;
+
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Map;
+
+@ComparatorClass("domainExactMatch")
+public class DomainExactMatch extends ExactMatchIgnoreCase {
+
+ public DomainExactMatch(final Map params) {
+ super(params);
+ }
+
+ @Override
+ protected String toString(final Object f) {
+
+ try {
+ return asUrl(super.toString(f)).getHost();
+ } catch (MalformedURLException e) {
+ return "";
+ }
+ }
+
+ private URL asUrl(final String value) throws MalformedURLException {
+ return new URL(value);
+ }
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java
new file mode 100644
index 000000000..08fca05c9
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java
@@ -0,0 +1,42 @@
+package eu.dnetlib.pace.tree;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.Map;
+
+@ComparatorClass("exactMatch")
+public class ExactMatch extends AbstractStringComparator {
+
+ public ExactMatch(Map params){
+ super(params, new com.wcohen.ss.JaroWinkler());
+ }
+
+ public ExactMatch(final double weight) {
+ super(weight, new com.wcohen.ss.JaroWinkler());
+ }
+
+ protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
+
+ @Override
+ public double distance(final String a, final String b, final Config conf) {
+ if (a.isEmpty() || b.isEmpty()) {
+ return -1.0; //return -1 if a field is missing
+ }
+ return a.equals(b) ? 1.0 : 0;
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(final double d) {
+ return d;
+ }
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java
new file mode 100644
index 000000000..b6b4d1af4
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java
@@ -0,0 +1,30 @@
+package eu.dnetlib.pace.tree;
+
+import com.google.common.base.Joiner;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.List;
+import java.util.Map;
+
+@ComparatorClass("exactMatchIgnoreCase")
+public class ExactMatchIgnoreCase extends AbstractStringComparator {
+
+ public ExactMatchIgnoreCase(Map params) {
+ super(params);
+ }
+
+ @Override
+ public double compare(String a, String b, final Config conf) {
+
+ if (a.isEmpty() || b.isEmpty())
+ return -1;
+
+ return a.equalsIgnoreCase(b) ? 1 : 0;
+ }
+
+ protected String toString(final Object object) {
+ return toFirstString(object);
+ }
+}
\ No newline at end of file
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java
new file mode 100644
index 000000000..074b82a19
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java
@@ -0,0 +1,80 @@
+package eu.dnetlib.pace.tree;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractListComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+@ComparatorClass("instanceTypeMatch")
+public class InstanceTypeMatch extends AbstractListComparator {
+
+ final Map translationMap = new HashMap<>();
+
+ public InstanceTypeMatch(Map params){
+ super(params);
+
+ //jolly types
+ translationMap.put("Conference object", "*");
+ translationMap.put("Other literature type", "*");
+ translationMap.put("Unknown", "*");
+
+ //article types
+ translationMap.put("Article", "Article");
+ translationMap.put("Data Paper", "Article");
+ translationMap.put("Software Paper", "Article");
+ translationMap.put("Preprint", "Article");
+
+ //thesis types
+ translationMap.put("Thesis", "Thesis");
+ translationMap.put("Master thesis", "Thesis");
+ translationMap.put("Bachelor thesis", "Thesis");
+ translationMap.put("Doctoral thesis", "Thesis");
+ }
+
+
+ @Override
+ public double compare(final List a, final List b, final Config conf) {
+
+ if (a == null || b == null) {
+ return -1;
+ }
+
+
+ if (a.isEmpty() || b.isEmpty()) {
+ return -1;
+ }
+
+ final Set ca = a.stream().map(this::translate).collect(Collectors.toSet());
+ final Set cb = b.stream().map(this::translate).collect(Collectors.toSet());
+
+ //if at least one is a jolly type, it must produce a match
+ if (ca.contains("*") || cb.contains("*"))
+ return 1.0;
+
+ int incommon = Sets.intersection(ca, cb).size();
+
+ //if at least one is in common, it must produce a match
+ return incommon >= 1 ? 1 : 0;
+ }
+
+ public String translate(String term){
+ return translationMap.getOrDefault(term, term);
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(final double d) {
+ return d;
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java
new file mode 100644
index 000000000..e151edaaf
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java
@@ -0,0 +1,44 @@
+package eu.dnetlib.pace.tree;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.Map;
+
+//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
+@ComparatorClass("jaroWinkler")
+public class JaroWinkler extends AbstractStringComparator {
+
+ public JaroWinkler(Map params){
+ super(params, new com.wcohen.ss.JaroWinkler());
+ }
+
+ public JaroWinkler(double weight) {
+ super(weight, new com.wcohen.ss.JaroWinkler());
+ }
+
+ protected JaroWinkler(double weight, AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
+
+ @Override
+ public double distance(String a, String b, final Config conf) {
+ String ca = cleanup(a);
+ String cb = cleanup(b);
+
+ return normalize(ssalgo.score(ca, cb));
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(double d) {
+ return d;
+ }
+
+}
\ No newline at end of file
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java
new file mode 100644
index 000000000..3f122cd62
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java
@@ -0,0 +1,70 @@
+package eu.dnetlib.pace.tree;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.Map;
+import java.util.Set;
+
+@ComparatorClass("jaroWinklerNormalizedName")
+public class JaroWinklerNormalizedName extends AbstractStringComparator {
+
+ private Map params;
+
+ public JaroWinklerNormalizedName(Map params){
+ super(params, new com.wcohen.ss.JaroWinkler());
+ this.params = params;
+ }
+
+ public JaroWinklerNormalizedName(double weight) {
+ super(weight, new com.wcohen.ss.JaroWinkler());
+ }
+
+ protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
+
+ @Override
+ public double distance(String a, String b, final Config conf) {
+ String ca = cleanup(a);
+ String cb = cleanup(b);
+
+ ca = normalize(ca);
+ cb = normalize(cb);
+
+ ca = filterAllStopWords(ca);
+ cb = filterAllStopWords(cb);
+
+ Set keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
+ Set keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
+
+ Set cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
+ Set cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
+
+ ca = removeKeywords(ca, keywords1);
+ ca = removeKeywords(ca, cities1);
+ cb = removeKeywords(cb, keywords2);
+ cb = removeKeywords(cb, cities2);
+
+ ca = ca.replaceAll("[ ]{2,}", " ");
+ cb = cb.replaceAll("[ ]{2,}", " ");
+
+ if (ca.isEmpty() && cb.isEmpty())
+ return 1.0;
+ else
+ return normalize(ssalgo.score(ca,cb));
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(double d) {
+ return d;
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java
new file mode 100644
index 000000000..a98778ac9
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java
@@ -0,0 +1,45 @@
+package eu.dnetlib.pace.tree;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.Map;
+
+//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
+@ComparatorClass("jaroWinklerTitle")
+public class JaroWinklerTitle extends AbstractStringComparator {
+
+ public JaroWinklerTitle(Map params){
+ super(params, new com.wcohen.ss.JaroWinkler());
+ }
+
+ public JaroWinklerTitle(double weight) {
+ super(weight, new com.wcohen.ss.JaroWinkler());
+ }
+
+ protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
+
+ @Override
+ public double distance(String a, String b, final Config conf) {
+ String ca = cleanup(a);
+ String cb = cleanup(b);
+
+ boolean check = checkNumbers(ca, cb);
+ return check ? 0.5 : normalize(ssalgo.score(ca, cb));
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(double d) {
+ return d;
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java
new file mode 100644
index 000000000..16145b47a
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java
@@ -0,0 +1,72 @@
+package eu.dnetlib.pace.tree;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractListComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+@ComparatorClass("jsonListMatch")
+public class JsonListMatch extends AbstractListComparator {
+
+ private static final Log log = LogFactory.getLog(JsonListMatch.class);
+ private Map params;
+
+ private String MODE; //"percentage" or "count"
+
+ public JsonListMatch(final Map params) {
+ super(params);
+ this.params = params;
+
+ MODE = params.getOrDefault("mode", "percentage");
+ }
+
+ @Override
+ public double compare(final List sa, final List sb, final Config conf) {
+ if (sa.isEmpty() || sb.isEmpty()) {
+ return -1;
+ }
+
+ final Set ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
+ final Set cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
+
+ int incommon = Sets.intersection(ca, cb).size();
+ int simDiff = Sets.symmetricDifference(ca, cb).size();
+
+ if (incommon + simDiff == 0) {
+ return 0.0;
+ }
+
+ if (MODE.equals("percentage"))
+ return (double)incommon / (incommon + simDiff);
+ else
+ return incommon;
+
+ }
+
+ //converts every json into a comparable string basing on parameters
+ private String toComparableString(String json){
+
+ StringBuilder st = new StringBuilder(); //to build the string used for comparisons basing on the jpath into parameters
+
+ //for each path in the param list
+ for (String key: params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
+ String path = params.get(key);
+ String value = MapDocumentUtil.getJPathString(path, json);
+ if (value == null || value.isEmpty())
+ value = "";
+ st.append(value);
+ st.append("::");
+ }
+
+ st.setLength(st.length()-2);
+ return st.toString();
+ }
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java
new file mode 100644
index 000000000..8c4e6e50f
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java
@@ -0,0 +1,47 @@
+package eu.dnetlib.pace.tree;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.Map;
+import java.util.Set;
+
+@ComparatorClass("keywordMatch")
+public class KeywordMatch extends AbstractStringComparator {
+
+ Map params;
+
+ public KeywordMatch(Map params) {
+ super(params);
+ this.params = params;
+ }
+
+ @Override
+ public double distance(final String a, final String b, final Config conf) {
+
+ String ca = cleanup(a);
+ String cb = cleanup(b);
+
+ ca = normalize(ca);
+ cb = normalize(cb);
+
+ ca = filterAllStopWords(ca);
+ cb = filterAllStopWords(cb);
+
+ Set keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
+ Set