diff --git a/.gitignore b/.gitignore
index 0770081..0c12c6c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,10 @@
# Mobile Tools for Java (J2ME)
.mtj.tmp/
+
+
+*target
+
# Package Files #
*.jar
*.war
@@ -21,6 +25,7 @@
*.idea
+*.iml
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml
new file mode 100644
index 0000000..8e0e0f7
--- /dev/null
+++ b/dnet-pace-core/pom.xml
@@ -0,0 +1,70 @@
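+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+    <parent>
+        <groupId>eu.dnetlib</groupId>
+        <artifactId>dnet45-parent</artifactId>
+        <version>1.0.0</version>
+        <relativePath />
+    </parent>
+    <modelVersion>4.0.0</modelVersion>
+    <groupId>eu.dnetlib</groupId>
+    <artifactId>dnet-pace-core</artifactId>
+    <packaging>jar</packaging>
+    <version>2.6.8-SNAPSHOT</version>
+    <scm>
+        <developerConnection>scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-pace-core/trunk</developerConnection>
+    </scm>
+    <dependencies>
+        <dependency>
+            <groupId>edu.cmu</groupId>
+            <artifactId>secondstring</artifactId>
+            <version>1.0.0</version>
+        </dependency>
+        <dependency>
+            <groupId>com.google.guava</groupId>
+            <artifactId>guava</artifactId>
+            <version>${google.guava.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>com.google.code.gson</groupId>
+            <artifactId>gson</artifactId>
+            <version>${google.gson.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>commons-lang</groupId>
+            <artifactId>commons-lang</artifactId>
+            <version>${commons.lang.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>commons-io</groupId>
+            <artifactId>commons-io</artifactId>
+            <version>${commons.io.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>commons-collections</groupId>
+            <artifactId>commons-collections</artifactId>
+            <version>${commons.collections.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>com.googlecode.protobuf-java-format</groupId>
+            <artifactId>protobuf-java-format</artifactId>
+            <version>1.2</version>
+        </dependency>
+        <dependency>
+            <groupId>org.antlr</groupId>
+            <artifactId>stringtemplate</artifactId>
+            <version>3.2</version>
+        </dependency>
+        <dependency>
+            <groupId>commons-logging</groupId>
+            <artifactId>commons-logging</artifactId>
+            <version>${commons.logging.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>${junit.version}</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+</project>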
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
new file mode 100644
index 0000000..6f29f22
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
@@ -0,0 +1,44 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.*;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.model.Field;
+import org.apache.commons.lang.StringUtils;
+/** Template for clustering functions: apply() prepares every non-empty field value and delegates key generation to doApply(). */
+public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
+
+ protected Map params; // named settings of the concrete function, read through param()
+
+ public AbstractClusteringFunction(final Map params) {
+ this.params = params;
+ }
+ // Builds the keys for one value; apply() passes the value already normalized and without stop words.
+ protected abstract Collection doApply(String s);
+ // Collects the keys of all fields into one HashSet, so duplicates collapse and no order is kept.
+ @Override
+ public Collection apply(List fields) {
+ return fields.stream().filter(f -> !f.isEmpty())
+ .map(Field::stringValue)
+ .map(this::normalize)
+ .map(s -> filterStopWords(s, stopwords))
+ .map(this::doApply) // must not return null: filterBlacklisted iterates over the result
+ .map(c -> filterBlacklisted(c, ngramBlacklist))
+ .flatMap(c -> c.stream())
+ .filter(StringUtils::isNotBlank)
+ .collect(Collectors.toCollection(HashSet::new));
+ }
+
+ public Map getParams() {
+ return params;
+ }
+ // Looks up a single setting; yields whatever params.get(name) yields (null for an unknown name on a standard Map).
+ protected Integer param(String name) {
+ return params.get(name);
+ }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
new file mode 100644
index 0000000..1897e6a
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
@@ -0,0 +1,47 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.Map;
+import java.util.Set;
+import java.util.StringTokenizer;
+
+import com.google.common.collect.Sets;
+/** Clustering function that builds acronym-like keys: key i concatenates the character at position i of the tokens of the input. */
+public class Acronyms extends AbstractClusteringFunction {
+
+ public Acronyms(Map params) {
+ super(params);
+ }
+ // Reads "max", "minLen" and "maxLen" from the params; unboxing fails with a NullPointerException if param() returns null.
+ @Override
+ protected Collection doApply(String s) {
+ return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
+ }
+ // Makes up to maxAcronyms passes over the whitespace-separated tokens of s; results keep insertion order (LinkedHashSet).
+ private Set extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) {
+
+ final Set acronyms = Sets.newLinkedHashSet();
+
+ for (int i = 0; i < maxAcronyms; i++) {
+ // pass i uses the character at index i of each token
+ final StringTokenizer st = new StringTokenizer(s);
+ final StringBuilder sb = new StringBuilder();
+
+ while (st.hasMoreTokens()) {
+ final String token = st.nextToken();
+ if (sb.length() > maxLen) { // tested before appending, so a key can reach maxLen + 1 characters
+ break;
+ }
+ if (token.length() > 1 && i < token.length()) { // one-character tokens and tokens shorter than i + 1 are skipped
+ sb.append(token.charAt(i));
+ }
+ }
+ String acronym = sb.toString();
+ if (acronym.length() > minLen) { // strictly longer than minLen
+ acronyms.add(acronym);
+ }
+ }
+ return acronyms;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java
new file mode 100644
index 0000000..b007853
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java
@@ -0,0 +1,59 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Document;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldListImpl;
+import eu.dnetlib.pace.model.MapDocument;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+/** ClusteringCombiner variant that removes blacklisted field values from a document before the clustering keys are computed. */
+public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
+
+ private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class); // not used in the code shown
+ // Filters a with the blacklists, then delegates to ClusteringCombiner.combine.
+ public static Collection filterAndCombine(final MapDocument a, final Config conf, final Map> blacklists) {
+
+ final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, blacklists);
+ return combine(filtered, conf);
+ }
+ // Returns a new MapDocument whose fields keep only the values accepted by FieldFilter; a null blacklists map disables filtering.
+ private MapDocument filter(final MapDocument a, final Map> blacklists) {
+ final Map filtered = Maps.newHashMap(a.getFieldMap());
+ if (blacklists != null) {
+ for (final Entry e : filtered.entrySet()) {
+
+ final FieldListImpl fl = new FieldListImpl();
+ fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists))));
+ filtered.put(e.getKey(), fl); // key already present: only the value is replaced, the key set under iteration is unchanged
+ }
+ }
+ return new MapDocument(a.getIdentifier(), filtered);
+ }
+
+ /**
+ * Tells whether value fully matches (String.matches) one of the regexes blacklisted for fieldName. Not called within this class as shown.
+ * @param fieldName name of the field, used as key into blacklists
+ * @param value the field value to test
+ * @param blacklists regexes per field name; must not be null here
+ * @return true if the field matches, false otherwise
+ */
+ protected boolean regexMatches(final String fieldName, final String value, final Map> blacklists) {
+ if (blacklists.containsKey(fieldName)) {
+ for (final String regex : blacklists.get(fieldName)) {
+ if (value.matches(regex)) return true;
+ }
+ }
+ return false;
+ }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java
new file mode 100644
index 0000000..7257540
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java
@@ -0,0 +1,5 @@
+package eu.dnetlib.pace.clustering;
+/** Identifiers of the clustering strategies. NOTE(review): presumably mapped to ClusteringFunction implementations elsewhere - confirm. */
+public enum Clustering {
+ acronyms, ngrams, ngrampairs, sortedngrampairs, suffixprefix, spacetrimmingfieldvalue, immutablefieldvalue, personhash, personclustering, lowercase, urlclustering
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java
new file mode 100644
index 0000000..d688705
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java
@@ -0,0 +1,29 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.ClusteringDef;
+import eu.dnetlib.pace.model.Document;
+import eu.dnetlib.pace.model.Field;
+/** Computes the clustering keys of a document by applying every configured clustering definition to the fields it names. */
+public class ClusteringCombiner {
+
+ public static Collection combine(final Document a, final Config conf) {
+ return new ClusteringCombiner().doCombine(a, conf.clusterings());
+ }
+ // Union of the keys over all definitions and fields, kept in first-seen order (LinkedHashSet).
+ private Collection doCombine(final Document a, final List defs) {
+ final Collection res = Sets.newLinkedHashSet();
+ for (final ClusteringDef cd : defs) {
+ for (final String fieldName : cd.getFields()) {
+ final Field values = a.values(fieldName);
+ res.addAll(cd.getClusteringFunction().apply((List) values)); // NOTE(review): assumes the Field returned by values() is also a List - confirm
+ }
+ }
+ return res;
+ }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
new file mode 100644
index 0000000..4fe1b59
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
@@ -0,0 +1,15 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import eu.dnetlib.pace.model.Field;
+/** Contract of a clustering function: turns the values of a field into a collection of clustering keys. */
+public interface ClusteringFunction {
+ // Returns the keys computed from the given field values.
+ public Collection apply(List fields);
+ // Returns the parameters the function was configured with.
+ public Map getParams();
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java
new file mode 100644
index 0000000..7ede4c2
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java
@@ -0,0 +1,48 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.base.Predicate;
+
+import eu.dnetlib.pace.model.Field;
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+/** Guava predicate accepting a {@link Field} only when its string value matches none of the regexes blacklisted for the field name. */
+public class FieldFilter implements Predicate {
+
+    private static final Log log = LogFactory.getLog(FieldFilter.class);
+
+    private Map> blacklists;
+
+    private String fieldName;
+
+    public FieldFilter(final String fieldName, final Map> blacklists) {
+        this.fieldName = fieldName;
+        this.blacklists = blacklists;
+    }
+
+    @Override
+    public boolean apply(final Field f) {
+        return !regexMatches(fieldName, f.stringValue(), blacklists);
+    }
+
+    /**
+     * Tells whether value fully matches (String.matches) one of the regexes blacklisted for fieldName.
+     * @param fieldName name of the field, used as key into blacklists
+     * @param value the field value to test
+     * @param blacklists regexes per field name; blank entries are skipped and never match
+     * @return true if the field matches, false otherwise
+     */
+    protected boolean regexMatches(final String fieldName, final String value, final Map> blacklists) {
+        if (blacklists.containsKey(fieldName)) {
+            final Iterable regexes = blacklists.get(fieldName);
+            for (final String regex : regexes) {
+                if (StringUtils.isBlank(regex)) continue; // skip this entry only; returning here would disable every regex listed after it
+                if (value.matches(regex)) return true;
+            }
+        }
+        return false;
+    }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
new file mode 100644
index 0000000..988476d
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
@@ -0,0 +1,24 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Lists;
+/** Clustering function that emits the value it receives as the single key. */
+public class ImmutableFieldValue extends AbstractClusteringFunction {
+
+ public ImmutableFieldValue(final Map params) {
+ super(params);
+ }
+ // Wraps s, unchanged, in a one-element list; s arrives here through the inherited apply(), which normalizes it first.
+ @Override
+ protected Collection doApply(final String s) {
+ final List res = Lists.newArrayList();
+
+ res.add(s);
+
+ return res;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
new file mode 100644
index 0000000..6d00992
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
@@ -0,0 +1,34 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.model.Field;
+import org.apache.commons.lang.StringUtils;
+/** Clustering function whose key is the trimmed, lower-cased field value. */
+public class LowercaseClustering extends AbstractClusteringFunction {
+
+ public LowercaseClustering(final Map params) {
+ super(params);
+ }
+ // Replaces the inherited pipeline: values go straight to doApply, without normalization, stop-word or blacklist filtering.
+ @Override
+ public Collection apply(List fields) {
+ Collection c = Sets.newLinkedHashSet();
+ for(Field f : fields) {
+ c.addAll(doApply(f.stringValue()));
+ }
+ return c;
+ }
+ // A blank value yields no key; any other value yields exactly one.
+ @Override
+ protected Collection doApply(final String s) {
+ if(StringUtils.isBlank(s)) {
+ return Lists.newArrayList();
+ }
+ return Lists.newArrayList(s.toLowerCase().trim());
+ }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java
new file mode 100644
index 0000000..aeb790f
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java
@@ -0,0 +1,20 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+/** Helper that derives a compact string of bounded length from a value; the method name suggests it is meant as an ordering key. */
+public class NGramUtils extends AbstractPaceFunctions {
+
+ private static final int SIZE = 100; // cut-off length, applied before the spaces are removed
+
+ private static Set stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); // hides the inherited field, which loads the same resource
+ // Normalizes s, drops stop words, pads with SIZE spaces, truncates to SIZE characters and finally removes every space.
+ public static String cleanupForOrdering(String s) {
+ NGramUtils utils = new NGramUtils(); // an instance is needed only to reach the protected, non-static helpers
+ return (utils.filterStopWords(utils.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE)).substring(0, SIZE).replaceAll(" ", "");
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
new file mode 100644
index 0000000..3cffa4d
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
@@ -0,0 +1,33 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Lists;
+/** Clustering function whose keys are concatenations of two consecutive n-grams. */
+public class NgramPairs extends Ngrams {
+
+ public NgramPairs(Map params) {
+ super(params);
+ }
+ // Requests up to 2 * max n-grams (at most one per token, at least 2 characters long) and pairs them up.
+ @Override
+ protected Collection doApply(String s) {
+ return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max"));
+ }
+ // Concatenates ngrams[i] with ngrams[i + 1] for i = 0, 1, ... until maxNgrams pairs exist or the list ends.
+ protected Collection ngramPairs(final List ngrams, int maxNgrams) {
+ Collection res = Lists.newArrayList();
+ int j = 0;
+ for (int i = 0; i < ngrams.size() && res.size() < maxNgrams; i++) {
+ if (++j >= ngrams.size()) { // j is always i + 1
+ break;
+ }
+ res.add(ngrams.get(i) + ngrams.get(j));
+ //System.out.println("-- " + concatNgrams);
+ }
+ return res;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
new file mode 100644
index 0000000..aaba9af
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
@@ -0,0 +1,43 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.StringTokenizer;
+/** Clustering function whose keys are fixed-length character n-grams taken from the leading positions of each token. */
+public class Ngrams extends AbstractClusteringFunction {
+
+ public Ngrams(Map params) {
+ super(params);
+ }
+ // Reads "ngramLen", "max", "maxPerToken" and "minNgramLen" from the params.
+ @Override
+ protected Collection doApply(String s) {
+ return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
+ }
+ // Slides a window of ngramLen characters over the first maxPerToken offsets of every token; returns once max distinct n-grams exist.
+ protected Collection getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) {
+
+ final Collection ngrams = new LinkedHashSet();
+ final StringTokenizer st = new StringTokenizer(s);
+
+ while (st.hasMoreTokens()) {
+ final String token = st.nextToken();
+ if (!token.isEmpty()) {
+ // tokens shorter than ngramLen never enter the loop below and so produce nothing
+ for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) {
+ String ngram = (token + " ").substring(i, ngramLen + i).trim();
+ if (ngrams.size() >= max) {
+ return ngrams;
+ }
+ if (ngram.length() >= minNgramLen) {
+ ngrams.add(ngram);
+ }
+ }
+ }
+ }
+ //System.out.println(ngrams + " n: " + ngrams.size());
+ return ngrams;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
new file mode 100644
index 0000000..d717077
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
@@ -0,0 +1,69 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import eu.dnetlib.pace.model.FieldList;
+import eu.dnetlib.pace.model.FieldValue;
+import org.apache.commons.lang.StringUtils;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.gt.Author;
+import eu.dnetlib.pace.model.gt.GTAuthor;
+/** Clustering function for author fields: keys combine the lower-cased first letter of one name part with another name part. */
+public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
+
+ private Map params;
+
+ private static final int MAX_TOKENS = 5; // at most this many tokens of a full name are combined
+
+ public PersonClustering(final Map params) {
+ this.params = params;
+ }
+ // Parses each field value with GTAuthor.fromOafJson and collects the keys of all authors into one set.
+ @Override
+ public Collection apply(final List fields) {
+ final Set hashes = Sets.newHashSet();
+
+ for (final Field f : fields) {
+
+ final GTAuthor gta = GTAuthor.fromOafJson(f.stringValue());
+
+ final Author a = gta.getAuthor();
+ if (a.isWellFormed()) {
+ hashes.add(firstLC(a.getFirstname()) + a.getSecondnames().toLowerCase());
+ } else { // fallback: every ordered pair of unequal tokens of the full name; token2 is appended as is, not lower-cased
+ for (final String token1 : tokens(a.getFullname())) {
+ for (final String token2 : tokens(a.getFullname())) {
+ if (!token1.equals(token2)) {
+ hashes.add(firstLC(token1) + token2);
+ }
+ }
+ }
+ }
+ }
+
+ return hashes;
+ }
+ // First character of s, lower-cased.
+ private String firstLC(final String s) {
+ return StringUtils.substring(s, 0, 1).toLowerCase();
+ }
+ // Non-empty, trimmed, space-separated tokens of s, limited to the first MAX_TOKENS.
+ private Iterable tokens(final String s) {
+ return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), MAX_TOKENS);
+ }
+
+ @Override
+ public Map getParams() {
+ return params;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
new file mode 100644
index 0000000..42d9d5b
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
@@ -0,0 +1,30 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.pace.model.Person;
+/** Clustering function whose single key is the hash computed by {@link Person} for the value. */
+public class PersonHash extends AbstractClusteringFunction {
+
+    private static final boolean DEFAULT_AGGRESSIVE = false; // used when the "aggressive" param is absent
+
+    public PersonHash(final Map params) {
+        super(params);
+    }
+    // Builds a Person from s (aggressive mode taken from the params, default false) and returns its hash as the only key.
+    @Override
+    protected Collection doApply(final String s) {
+        final List res = Lists.newArrayList();
+        // NOTE(review): the configured value is cast to Boolean; confirm that "aggressive" is really stored as a Boolean.
+        final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
+
+        res.add(new Person(s, aggressive).hash());
+
+        return res;
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
new file mode 100644
index 0000000..f012aac
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
@@ -0,0 +1,18 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.Map;
+/** Placeholder clustering function: it is not implemented yet and produces no keys. */
+public class RandomClusteringFunction extends AbstractClusteringFunction {
+
+    public RandomClusteringFunction(Map params) {
+        super(params);
+    }
+
+    @Override
+    protected Collection doApply(String s) {
+        // TODO implement; until then return an empty collection, because the inherited apply() iterates over the result
+        return java.util.Collections.emptyList();
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
new file mode 100644
index 0000000..56e6604
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
@@ -0,0 +1,28 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+/** NgramPairs variant that sorts the tokens of the value before building the pairs, so the keys do not depend on token order. */
+public class SortedNgramPairs extends NgramPairs {
+
+ public SortedNgramPairs(Map params) {
+ super(params);
+ }
+ // Splits s on spaces, sorts the tokens in natural String order, re-joins them and applies the same pairing as NgramPairs.doApply.
+ @Override
+ protected Collection doApply(String s) {
+
+ final List tokens = Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().trimResults().split(s));
+
+ Collections.sort(tokens);
+
+ return ngramPairs(Lists.newArrayList(getNgrams(Joiner.on(" ").join(tokens), param("ngramLen"), param("max") * 2, 1, 2)), param("max"));
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
new file mode 100644
index 0000000..19a51d4
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
@@ -0,0 +1,27 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang.RandomStringUtils;
+import org.apache.commons.lang.StringUtils;
+
+import com.google.common.collect.Lists;
+/** Clustering function whose key is the lower-cased value with all whitespace removed. */
+public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
+
+ public SpaceTrimmingFieldValue(final Map params) {
+ super(params);
+ }
+ // For a blank value the key is instead a random string of "randomLength" characters (RandomStringUtils.random).
+ @Override
+ protected Collection doApply(final String s) {
+ final List res = Lists.newArrayList();
+
+ res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));
+
+ return res;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
new file mode 100644
index 0000000..3ed336a
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
@@ -0,0 +1,38 @@
+package eu.dnetlib.pace.clustering;
+
+import java.util.Collection;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.collect.Sets;
+/** Clustering function whose keys join the characters found on either side of each space of the value. */
+public class SuffixPrefix extends AbstractClusteringFunction {
+
+    public SuffixPrefix(Map params) {
+        super(params);
+    }
+    // Reads "len" (characters taken on each side of a space, expected >= 0) and "max" (maximum number of keys).
+    @Override
+    protected Collection doApply(String s) {
+        return suffixPrefix(s, param("len"), param("max"));
+    }
+    // For each space at index j with more than len characters before it: s[j - len, j + len] without spaces, kept if >= 4 chars.
+    private Collection suffixPrefix(String s, int len, int max) {
+        final Set bigrams = Sets.newLinkedHashSet();
+        int i = 0;
+        while (++i < s.length() && bigrams.size() < max) {
+            final int j = s.indexOf(" ", i);
+            if (j < 0) break; // no space left: the remaining positions cannot produce a key
+            final int offset = Math.min(j + len + 1, s.length());
+            i = j; // every position up to j finds this same space, so resume right after it
+            if (j - len > 0) {
+                String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
+                if (bigram.length() >= 4) {
+                    bigrams.add(bigram);
+                }
+            }
+        }
+        return bigrams;
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
new file mode 100644
index 0000000..1962814
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
@@ -0,0 +1,46 @@
+package eu.dnetlib.pace.clustering;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.model.Field;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+/** Clustering function that keys every URL-valued field by the host of its URL. */
+public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
+
+    protected Map params;
+
+    public UrlClustering(final Map params) {
+        this.params = params;
+    }
+    // Empty fields are skipped; the hosts of the remaining URLs are collected into a HashSet (duplicates collapse, no order).
+    @Override
+    public Collection apply(List fields) {
+        return fields.stream()
+                .filter(f -> !f.isEmpty())
+                .map(Field::stringValue)
+                .map(this::asUrl)
+                .map(URL::getHost)
+                .collect(Collectors.toCollection(HashSet::new));
+    }
+    // Returns the params handed to the constructor (previously always null, although the field was stored).
+    @Override
+    public Map getParams() {
+        return params;
+    }
+    // Parses value as a URL; a malformed value is reported as IllegalStateException carrying the original cause.
+    private URL asUrl(final String value) {
+        try {
+            return new URL(value);
+        } catch (MalformedURLException e) {
+            // should not happen as checked by pace typing
+            throw new IllegalStateException("invalid URL: " + value, e);
+        }
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
new file mode 100644
index 0000000..9174bed
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@@ -0,0 +1,167 @@
+package eu.dnetlib.pace.common;
+
+import java.text.Normalizer;
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+import java.util.StringTokenizer;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.pace.clustering.NGramUtils;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldList;
+import eu.dnetlib.pace.model.FieldListImpl;
+
+/**
+ * Set of common functions
+ *
+ * @author claudio
+ *
+ */
+public abstract class AbstractPaceFunctions {
+
+ protected static Set stopwords = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
+
+ protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
+
+ private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
+ private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎";
+ private static final String aliases_to = "0123456789+-=()n0123456789+-=()";
+
+ protected final static FieldList EMPTY_FIELD = new FieldListImpl();
+
+ protected String concat(final List l) {
+ return Joiner.on(" ").skipNulls().join(l);
+ }
+
+ protected String cleanup(final String s) {
+ final String s1 = nfd(s);
+ final String s2 = fixAliases(s1);
+ final String s3 = s2.replaceAll("–", " ");
+ final String s4 = s3.replaceAll("&", " ");
+ final String s5 = s4.replaceAll(""", " ");
+ final String s6 = s5.replaceAll("−", " ");
+ final String s7 = s6.replaceAll("([0-9]+)", " $1 ");
+ final String s8 = s7.replaceAll("[^\\p{ASCII}]|\\p{Punct}", " ");
+ final String s9 = s8.replaceAll("\\n", " ");
+ final String s10 = s9.replaceAll("(?m)\\s+", " ");
+ final String s11 = s10.trim();
+ return s11;
+ }
+
+ protected String finalCleanup(final String s) {
+ return s.toLowerCase();
+ }
+
+ protected boolean checkNumbers(final String a, final String b) {
+ final String numbersA = getNumbers(a);
+ final String numbersB = getNumbers(b);
+ final String romansA = getRomans(a);
+ final String romansB = getRomans(b);
+ return !numbersA.equals(numbersB) || !romansA.equals(romansB);
+ }
+
+ protected String getRomans(final String s) {
+ final StringBuilder sb = new StringBuilder();
+ for (final String t : s.split(" ")) {
+ sb.append(isRoman(t) ? t : "");
+ }
+ return sb.toString();
+ }
+
+ protected boolean isRoman(final String s) {
+ return s.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop").equals("qwertyuiop");
+ }
+
+ protected String getNumbers(final String s) {
+ return s.replaceAll("\\D", "");
+ }
+
+ protected String fixAliases(final String s) {
+ final StringBuilder sb = new StringBuilder();
+ for (final char ch : Lists.charactersOf(s)) {
+ final int i = StringUtils.indexOf(aliases_from, ch);
+ sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
+ }
+ return sb.toString();
+ }
+
+ protected String removeSymbols(final String s) {
+ final StringBuilder sb = new StringBuilder();
+
+ for (final char ch : Lists.charactersOf(s)) {
+ sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
+ }
+ return sb.toString().replaceAll("\\s+", " ");
+ }
+
+ protected String getFirstValue(final Field values) {
+ return (values != null) && !Iterables.isEmpty(values) ? Iterables.getFirst(values, EMPTY_FIELD).stringValue() : "";
+ }
+
+ protected boolean notNull(final String s) {
+ return s != null;
+ }
+
+ // ///////////////////////
+
+ protected String normalize(final String s) {
+ return nfd(s).toLowerCase()
+ // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
+ .replaceAll("(\\W)+", " ")
+ .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
+ .replaceAll("(\\p{Punct})+", " ")
+ .replaceAll("(\\d)+", " ")
+ .replaceAll("(\\n)+", " ")
+ .trim();
+ }
+
+ private String nfd(final String s) {
+ return Normalizer.normalize(s, Normalizer.Form.NFD);
+ }
+
+ protected String filterStopWords(final String s, final Set stopwords) {
+ final StringTokenizer st = new StringTokenizer(s);
+ final StringBuilder sb = new StringBuilder();
+ while (st.hasMoreTokens()) {
+ final String token = st.nextToken();
+ if (!stopwords.contains(token)) {
+ sb.append(token);
+ sb.append(" ");
+ }
+ }
+ return sb.toString().trim();
+ }
+
+ protected Collection filterBlacklisted(final Collection set, final Set ngramBlacklist) {
+ final Set newset = Sets.newLinkedHashSet();
+ for (final String s : set) {
+ if (!ngramBlacklist.contains(s)) {
+ newset.add(s);
+ }
+ }
+ return newset;
+ }
+
+ // ////////////////////
+
+ public static Set loadFromClasspath(final String classpath) {
+ final Set h = Sets.newHashSet();
+ try {
+ for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
+ h.add(s);
+ }
+ } catch (final Throwable e) {
+ return Sets.newHashSet();
+ }
+ return h;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java
new file mode 100644
index 0000000..bbfac97
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java
@@ -0,0 +1,52 @@
+package eu.dnetlib.pace.condition;
+
+import java.util.List;
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Cond;
+import eu.dnetlib.pace.distance.eval.ConditionEval;
+import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
+import eu.dnetlib.pace.model.Document;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldDef;
+
+/**
+ * Base class for conditions evaluated field by field: it holds the field definitions to check and handles missing values.
+ *
+ * @author claudio
+ *
+ */
+public abstract class AbstractCondition extends AbstractPaceFunctions implements ConditionAlgo {
+
+ protected Cond cond;
+
+ protected List fields;
+
+ public AbstractCondition(final Cond cond, final List fields) {
+ this.cond = cond;
+ this.fields = fields;
+ }
+ // Evaluates the condition on the values of one field taken from the two documents.
+ protected abstract ConditionEval verify(FieldDef fd, Field a, Field b);
+ // Yields one ConditionEval per configured field, stored under the field name.
+ @Override
+ public ConditionEvalMap verify(final Document a, final Document b) {
+ final ConditionEvalMap res = new ConditionEvalMap();
+ for (final FieldDef fd : getFields()) {
+
+ final Field va = a.values(fd.getName());
+ final Field vb = b.values(fd.getName());
+ // a value missing on either side, for a field that ignores missing values, is recorded with 0 and not evaluated
+ if ((va.isEmpty() || vb.isEmpty()) && fd.isIgnoreMissing()) {
+ res.put(fd.getName(), new ConditionEval(cond, va, vb, 0));
+ } else {
+ res.put(fd.getName(), verify(fd, va, vb));
+ }
+ }
+ return res;
+ }
+
+ public List getFields() {
+ return fields;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java
new file mode 100644
index 0000000..f9ff2b6
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java
@@ -0,0 +1,25 @@
+package eu.dnetlib.pace.condition;
+
+import java.util.List;
+import eu.dnetlib.pace.config.Cond;
+import eu.dnetlib.pace.distance.eval.ConditionEval;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldDef;
+
+/**
+ * Default always true condition: every field evaluation is reported with the value 1.
+ *
+ * @author claudio
+ */
+public class AlwaysTrueCondition extends AbstractCondition {
+
+ public AlwaysTrueCondition(final Cond cond, final List fields) {
+ super(cond, fields);
+ }
+ // Ignores the content of a and b and always builds the ConditionEval with 1.
+ @Override
+ protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
+ return new ConditionEval(cond, a, b, 1);
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java
new file mode 100644
index 0000000..ceb7c73
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java
@@ -0,0 +1,27 @@
+package eu.dnetlib.pace.condition;
+
+import java.util.Map;
+
+import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
+import eu.dnetlib.pace.model.Document;
+
+/**
+ * A general condition that may or may not hold between two Documents.
+ *
+ * @author claudio
+ */
+public interface ConditionAlgo {
+
+    /**
+     * Verifies the condition on a pair of documents.
+     *
+     * @param a
+     *            the Document a
+     * @param b
+     *            the Document b
+     * @return the collected evaluations. An evaluation carries 0 when the condition cannot be verified
+     *         (ignoremissing = true), a positive int when the condition is verified, a negative int when it is not.
+     */
+    ConditionEvalMap verify(Document a, Document b);
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java
new file mode 100644
index 0000000..25b1a01
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java
@@ -0,0 +1,27 @@
+package eu.dnetlib.pace.condition;
+
+import java.util.List;
+
+import eu.dnetlib.pace.config.Cond;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldDef;
+
+/**
+ * Case-insensitive exact match specialised for DOI values: the markers matched by {@link #PREFIX} are stripped
+ * from the values before they are compared.
+ *
+ * @author claudio
+ */
+public class DoiExactMatch extends ExactMatchIgnoreCase {
+
+    /**
+     * Regular expression matching the {@code http://dx.doi.org/} and {@code doi:} markers.
+     * Declared static: it is a constant shared by all instances.
+     */
+    public static final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
+
+    /**
+     * @param cond
+     *            the condition identifier
+     * @param fields
+     *            the definitions of the fields to verify
+     */
+    public DoiExactMatch(final Cond cond, final List<FieldDef> fields) {
+        super(cond, fields);
+    }
+
+    /**
+     * Returns the value produced by the superclass with every occurrence of {@link #PREFIX} removed.
+     * The pattern is case-sensitive and not anchored to the start of the value.
+     */
+    @Override
+    protected String getValue(final Field f) {
+        return super.getValue(f).replaceAll(PREFIX, "");
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java
new file mode 100644
index 0000000..4f0f371
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java
@@ -0,0 +1,43 @@
+package eu.dnetlib.pace.condition;
+
+import java.util.List;
+
+import eu.dnetlib.pace.config.Cond;
+import eu.dnetlib.pace.distance.eval.ConditionEval;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldDef;
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * Condition verified when the values extracted from the two fields are exactly equal (case-sensitive).
+ *
+ * @author claudio
+ */
+public class ExactMatch extends AbstractCondition {
+
+    /**
+     * @param cond
+     *            the condition identifier
+     * @param fields
+     *            the definitions of the fields to verify
+     */
+    public ExactMatch(final Cond cond, final List<FieldDef> fields) {
+        super(cond, fields);
+    }
+
+    /**
+     * Compares the values returned by {@link #getValue(Field)} for the two fields.
+     *
+     * @return an evaluation carrying 0 when both values are blank, 1 when they are equal, -1 otherwise
+     */
+    @Override
+    protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
+
+        final String fa = getValue(a);
+        final String fb = getValue(b);
+
+        final int res;
+        if (StringUtils.isBlank(fa) && StringUtils.isBlank(fb)) {
+            res = 0;
+        } else {
+            // null-safe comparison: a null on one side only is a mismatch, not a NullPointerException
+            res = StringUtils.equals(fa, fb) ? 1 : -1;
+        }
+
+        return new ConditionEval(cond, a, b, res);
+    }
+
+    /**
+     * Extracts the value to compare from a field; subclasses may override to normalise it.
+     */
+    protected String getValue(final Field f) {
+        return getFirstValue(f);
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java
new file mode 100644
index 0000000..8baad5b
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java
@@ -0,0 +1,34 @@
+package eu.dnetlib.pace.condition;
+
+import java.util.List;
+
+import eu.dnetlib.pace.config.Cond;
+import eu.dnetlib.pace.distance.eval.ConditionEval;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldDef;
+
+/**
+ * Condition verified when the values extracted from the two fields are equal ignoring case.
+ *
+ * @author claudio
+ */
+public class ExactMatchIgnoreCase extends AbstractCondition {
+
+    /**
+     * @param cond
+     *            the condition identifier
+     * @param fields
+     *            the definitions of the fields to verify
+     */
+    public ExactMatchIgnoreCase(final Cond cond, final List<FieldDef> fields) {
+        super(cond, fields);
+    }
+
+    /**
+     * Compares the values returned by {@link #getValue(Field)} for the two fields, ignoring case.
+     *
+     * @return an evaluation carrying 1 when the values match, -1 otherwise
+     */
+    @Override
+    protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
+
+        final String fa = getValue(a);
+        final String fb = getValue(b);
+
+        // null-safe: two nulls match, a single null is a mismatch instead of a NullPointerException
+        final boolean same = (fa == null) ? (fb == null) : fa.equalsIgnoreCase(fb);
+
+        return new ConditionEval(cond, a, b, same ? 1 : -1);
+    }
+
+    /**
+     * Extracts the value to compare from a field; subclasses may override to normalise it.
+     */
+    protected String getValue(final Field f) {
+        return getFirstValue(f);
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java
new file mode 100644
index 0000000..bc99a4c
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java
@@ -0,0 +1,56 @@
+package eu.dnetlib.pace.condition;
+
+import java.util.List;
+
+import com.google.common.collect.Iterables;
+import eu.dnetlib.pace.config.Cond;
+import eu.dnetlib.pace.distance.eval.ConditionEval;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldDef;
+
+/**
+ * Condition verified when the values extracted from the two fields are different.
+ *
+ * @author claudio
+ */
+public class MustBeDifferent extends AbstractCondition {
+
+    /**
+     * Instantiates a new must-be-different condition.
+     *
+     * @param cond
+     *            the condition identifier
+     * @param fields
+     *            the fields
+     */
+    public MustBeDifferent(final Cond cond, final List<FieldDef> fields) {
+        super(cond, fields);
+    }
+
+    /**
+     * Compares the values returned by {@link #getValue(Field)} for the two fields.
+     *
+     * @return an evaluation carrying -1 when the values are equal, 1 when they differ
+     */
+    @Override
+    protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
+
+        final String fa = getValue(a);
+        final String fb = getValue(b);
+
+        // null-safe: two nulls count as equal, a single null as different
+        final boolean same = (fa == null) ? (fb == null) : fa.equals(fb);
+
+        return new ConditionEval(cond, a, b, same ? -1 : 1);
+    }
+
+    /**
+     * Extracts the value to compare from a field.
+     */
+    protected String getValue(final Field f) {
+        return getFirstValue(f);
+    }
+
+    /**
+     * Checks if is empty.
+     *
+     * @param a
+     *            the a
+     * @return true, if the iterable is null or has no elements
+     */
+    protected boolean isEmpty(final Iterable<?> a) {
+        return (a == null) || Iterables.isEmpty(a);
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java
new file mode 100644
index 0000000..4f9e042
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java
@@ -0,0 +1,52 @@
+package eu.dnetlib.pace.condition;
+
+import java.util.List;
+
+import eu.dnetlib.pace.config.Cond;
+import eu.dnetlib.pace.distance.eval.ConditionEval;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldDef;
+import eu.dnetlib.pace.model.FieldList;
+import eu.dnetlib.pace.model.adaptor.Pid;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * Condition comparing the persistent identifiers (pids) carried by two fields: identifiers of the same type
+ * are compared by value.
+ *
+ * @author claudio
+ */
+public class PidMatch extends AbstractCondition {
+
+    private static final Log log = LogFactory.getLog(PidMatch.class);
+
+    /**
+     * @param cond
+     *            the condition identifier
+     * @param fields
+     *            the definitions of the fields to verify
+     */
+    public PidMatch(final Cond cond, final List<FieldDef> fields) {
+        super(cond, fields);
+    }
+
+    /**
+     * Parses both fields into pids and, for every pair of pids sharing the same type (ignoring case),
+     * adds 1 to the result when their values are equal (ignoring case) and subtracts 1 otherwise.
+     * Pairs of different types do not contribute, so the result stays 0 when no types are shared.
+     *
+     * @param fd
+     *            the field definition
+     * @param a
+     *            the first field; must be a {@link FieldList}
+     * @param b
+     *            the second field; must be a {@link FieldList}
+     * @return an evaluation carrying the accumulated score
+     * @throws ClassCastException
+     *             when either field is not a {@link FieldList}
+     */
+    @Override
+    protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
+
+        final List<String> sa = ((FieldList) a).stringList();
+        final List<String> sb = ((FieldList) b).stringList();
+
+        final List<Pid> pal = Pid.fromOafJson(sa);
+        final List<Pid> pbl = Pid.fromOafJson(sb);
+
+        int result = 0;
+        for (final Pid pa : pal) {
+            final String ta = pa.getType();
+
+            for (final Pid pb : pbl) {
+                final String tb = pb.getType();
+
+                if (tb.equalsIgnoreCase(ta)) {
+                    result += pa.getValue().equalsIgnoreCase(pb.getValue()) ? 1 : -1;
+                }
+            }
+        }
+
+        return new ConditionEval(cond, a, b, result);
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java
new file mode 100644
index 0000000..ae6e940
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java
@@ -0,0 +1,56 @@
+package eu.dnetlib.pace.condition;
+
+import java.util.List;
+
+import com.google.common.collect.Iterables;
+
+import eu.dnetlib.pace.config.Cond;
+import eu.dnetlib.pace.distance.eval.ConditionEval;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldDef;
+
+/**
+ * Condition verified when the two fields hold the same number of values.
+ *
+ * @author claudio
+ */
+public class SizeMatch extends AbstractCondition {
+
+    /**
+     * Instantiates a new size match.
+     *
+     * @param cond
+     *            the condition identifier
+     * @param fields
+     *            the fields
+     */
+    public SizeMatch(final Cond cond, final List<FieldDef> fields) {
+        super(cond, fields);
+    }
+
+    /**
+     * Compares the number of elements of the two fields; two empty fields therefore match.
+     *
+     * @return an evaluation carrying 1 when the sizes are equal, -1 otherwise
+     */
+    @Override
+    protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
+        final boolean sameSize = Iterables.size(a) == Iterables.size(b);
+        return new ConditionEval(cond, a, b, sameSize ? 1 : -1);
+    }
+
+    /**
+     * Checks if is empty.
+     *
+     * @param a
+     *            the a
+     * @return true, if the iterable is null or has no elements
+     */
+    protected boolean isEmpty(final Iterable<?> a) {
+        return (a == null) || Iterables.isEmpty(a);
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java
new file mode 100644
index 0000000..41a617a
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java
@@ -0,0 +1,35 @@
+package eu.dnetlib.pace.condition;
+
+import java.util.List;
+
+import eu.dnetlib.pace.config.Cond;
+import eu.dnetlib.pace.distance.eval.ConditionEval;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldDef;
+
+/**
+ * Condition on the numbers appearing in the titles of the two documents.
+ * The outcome is positive when both first values pass {@code notNull} and {@code checkNumbers} reports
+ * false for them; negative in every other case.
+ *
+ * @author claudio
+ *
+ */
+public class TitleVersionMatch extends AbstractCondition {
+
+    public TitleVersionMatch(final Cond cond, final List fields) {
+        super(cond, fields);
+    }
+
+    /**
+     * Evaluates the condition on the first value of each field.
+     *
+     * @return an evaluation carrying 1 or -1
+     */
+    @Override
+    protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
+        final String titleA = getFirstValue(a);
+        final String titleB = getFirstValue(b);
+
+        final boolean verified = notNull(titleA) && notNull(titleB) && !checkNumbers(titleA, titleB);
+        final int outcome;
+        if (verified) {
+            outcome = 1;
+        } else {
+            outcome = -1;
+        }
+        return new ConditionEval(cond, a, b, outcome);
+    }
+
+    /**
+     * @return the simple class name followed by ":" and the inherited string representation
+     */
+    @Override
+    public String toString() {
+        final String simpleName = getClass().getSimpleName();
+        return simpleName + ":" + super.toString();
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java
new file mode 100644
index 0000000..8971842
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java
@@ -0,0 +1,59 @@
+package eu.dnetlib.pace.condition;
+
+import java.util.List;
+
+import eu.dnetlib.pace.config.Cond;
+import eu.dnetlib.pace.distance.eval.ConditionEval;
+import org.apache.commons.lang.StringUtils;
+
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldDef;
+
+/**
+ * Condition on the year of a date field. The outcome is positive when both documents yield the same
+ * year of {@value #LIMIT} characters, or when the year is missing in at least one of them; it is negative
+ * when both years are present but are different or do not have the expected length.
+ *
+ * @author claudio
+ */
+public class YearMatch extends AbstractCondition {
+
+    /** Number of leading characters of the field value that are considered to hold the year. */
+    private static final int LIMIT = 4;
+
+    /**
+     * @param cond
+     *            the condition identifier
+     * @param fields
+     *            the definitions of the fields to verify
+     */
+    public YearMatch(final Cond cond, final List<FieldDef> fields) {
+        super(cond, fields);
+    }
+
+    /**
+     * Compares the years extracted from the two fields.
+     * The year is obtained by applying {@code getNumbers} to the first {@value #LIMIT} characters of the value.
+     *
+     * @return an evaluation carrying 1 when the years match or one is missing, -1 otherwise
+     */
+    @Override
+    protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
+        final String valueA = getNumbers(getFirstValue(a));
+        final String valueB = getNumbers(getFirstValue(b));
+
+        final boolean lengthMatch = checkLength(valueA) && checkLength(valueB);
+        final boolean oneMissing = valueA.isEmpty() || valueB.isEmpty();
+
+        // explicit grouping: (valid and equal years) OR (at least one year missing)
+        final boolean verified = (lengthMatch && valueA.equals(valueB)) || oneMissing;
+
+        return new ConditionEval(cond, a, b, verified ? 1 : -1);
+    }
+
+    /**
+     * @return true when the string has exactly {@value #LIMIT} characters
+     */
+    protected boolean checkLength(final String s) {
+        return s.length() == LIMIT;
+    }
+
+    /**
+     * @return the first {@value #LIMIT} characters of the field's string value, or the empty string when
+     *         the field is null or empty
+     */
+    protected String getFirstValue(final Field value) {
+        return (value != null) && !value.isEmpty() ? StringUtils.left(value.stringValue(), LIMIT) : "";
+    }
+
+    @Override
+    public String toString() {
+        return getClass().getSimpleName() + ":" + super.toString();
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Algo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Algo.java
new file mode 100644
index 0000000..cb2e434
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Algo.java
@@ -0,0 +1,46 @@
+package eu.dnetlib.pace.config;
+
+/**
+ * Enumerates the distance algorithms.
+ * The constant names are part of the public API; do not rename or reorder them.
+ */
+public enum Algo {
+
+ /** Jaro-Winkler. */
+ JaroWinkler,
+ /** Jaro-Winkler for title matching. */
+ JaroWinklerTitle,
+ /** Levenshtein (the constant keeps the historical spelling "Levenstein"). */
+ Levenstein,
+ /** Levenshtein distance for title matching. */
+ LevensteinTitle,
+ /** Level2 Jaro-Winkler. */
+ Level2JaroWinkler,
+ /** Level2 Jaro-Winkler for title matching. */
+ Level2JaroWinklerTitle,
+ /** Level2 Levenshtein. */
+ Level2Levenstein,
+ /** Sub-string Levenshtein. */
+ SubStringLevenstein,
+ /** Year Levenshtein. */
+ YearLevenstein,
+ /** Sorted Jaro-Winkler. */
+ SortedJaroWinkler,
+ /** Sorted Level2 Jaro-Winkler. */
+ SortedLevel2JaroWinkler,
+ /** Compares two urls. */
+ urlMatcher,
+ /** Exact match algo. */
+ ExactMatch,
+ /**
+ * Returns 0 for equal strings, 1 for different strings.
+ */
+ MustBeDifferent,
+ /** Always returns 1.0 as distance. */
+ AlwaysMatch,
+ /** Person distances (this comment covers the three constants that follow). */
+ PersonCoAuthorSurnamesDistance,
+ PersonCoAnchorsDistance,
+ PersonDistance,
+ /** The Null algo. */
+ Null
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Cond.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Cond.java
new file mode 100644
index 0000000..b287fdd
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Cond.java
@@ -0,0 +1,28 @@
+package eu.dnetlib.pace.config;
+
+/**
+ * Enumerates the conditions that can be verified between two documents.
+ * The constant names are part of the public API; do not rename or reorder them.
+ */
+public enum Cond {
+
+ /** The year match. */
+ yearMatch,
+ /** The title version match. */
+ titleVersionMatch,
+ /** The size match: the fields hold the same number of values. */
+ sizeMatch,
+ /**
+ * Returns true if the field values are different.
+ */
+ mustBeDifferent,
+ /** The exact match. */
+ exactMatch,
+ /**
+ * The exact match ignoring case.
+ */
+ exactMatchIgnoreCase,
+ /** The exact match specialized to recognize DOI values. */
+ doiExactMatch,
+ /** The exact match that checks if pid type and value are the same. */
+ pidMatch
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
new file mode 100644
index 0000000..7498c23
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
@@ -0,0 +1,59 @@
+package eu.dnetlib.pace.config;
+
+import java.util.List;
+import java.util.Map;
+
+import eu.dnetlib.pace.condition.ConditionAlgo;
+import eu.dnetlib.pace.model.ClusteringDef;
+import eu.dnetlib.pace.model.FieldDef;
+
+/**
+ * Interface for PACE configuration bean.
+ * NOTE(review): the generic type arguments were reconstructed from the usages of these methods;
+ * confirm the element type of the blacklist values.
+ *
+ * @author claudio
+ */
+public interface Config {
+
+    /**
+     * Field configuration definitions.
+     *
+     * @return the list of definitions
+     */
+    public List<FieldDef> model();
+
+    /**
+     * Field configuration definitions, indexed by field name.
+     *
+     * @return the map of definitions
+     */
+    public Map<String, FieldDef> modelMap();
+
+    /**
+     * Strict Pre-Condition definitions.
+     *
+     * @return the list of conditions
+     */
+    public List<ConditionAlgo> strictConditions();
+
+    /**
+     * Pre-Condition definitions.
+     *
+     * @return the list of conditions
+     */
+    public List<ConditionAlgo> conditions();
+
+    /**
+     * Clusterings.
+     *
+     * @return the list
+     */
+    public List<ClusteringDef> clusterings();
+
+    /**
+     * Blacklists.
+     *
+     * @return the map
+     */
+    public Map<String, List<String>> blacklists();
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
new file mode 100644
index 0000000..5116f36
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
@@ -0,0 +1,131 @@
+package eu.dnetlib.pace.config;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.antlr.stringtemplate.StringTemplate;
+import org.apache.commons.io.IOUtils;
+
+import com.google.common.collect.Maps;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+
+import eu.dnetlib.pace.condition.ConditionAlgo;
+import eu.dnetlib.pace.model.ClusteringDef;
+import eu.dnetlib.pace.model.FieldDef;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+public class DedupConfig implements Config {
+
+ private static final Log log = LogFactory.getLog(DedupConfig.class);
+
+ private static String CONFIG_TEMPLATE = "dedupConfig.st";
+
+ private PaceConfig pace;
+
+ private WfConfig wf;
+
+ private static Map defaults = Maps.newHashMap();
+
+ static {
+ defaults.put("threshold", "0");
+ defaults.put("run", "001");
+ defaults.put("entityType", "result");
+ defaults.put("orderField", "title");
+ defaults.put("queueMaxSize", "2000");
+ defaults.put("groupMaxSize", "10");
+ defaults.put("slidingWindowSize", "200");
+ defaults.put("rootBuilder", "result");
+ defaults.put("includeChildren", "true");
+ }
+
+ public DedupConfig() {}
+
+ public static DedupConfig load(final String json) {
+
+ final DedupConfig config = new Gson().fromJson(json, DedupConfig.class);
+
+ config.getPace().initModel();
+
+ return config;
+ }
+
+ public static DedupConfig loadDefault() throws IOException {
+ return loadDefault(new HashMap());
+ }
+
+ public static DedupConfig loadDefault(final Map params) throws IOException {
+
+ final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE));
+
+ for (final Entry e : defaults.entrySet()) {
+ template.setAttribute(e.getKey(), e.getValue());
+ }
+ for (final Entry e : params.entrySet()) {
+ template.setAttribute(e.getKey(), e.getValue());
+ }
+
+ final String json = template.toString();
+ return load(json);
+ }
+
+ private String readFromClasspath(final String resource) throws IOException {
+ return IOUtils.toString(getClass().getResource(resource));
+ }
+
+ public PaceConfig getPace() {
+ return pace;
+ }
+
+ public void setPace(final PaceConfig pace) {
+ this.pace = pace;
+ }
+
+ public WfConfig getWf() {
+ return wf;
+ }
+
+ public void setWf(final WfConfig wf) {
+ this.wf = wf;
+ }
+
+ @Override
+ public String toString() {
+ return new GsonBuilder().setPrettyPrinting().create().toJson(this);
+ }
+
+ @Override
+ public List model() {
+ return getPace().getModel();
+ }
+
+ @Override
+ public Map modelMap() {
+ return getPace().getModelMap();
+ }
+
+ @Override
+ public List strictConditions() {
+ return getPace().getStrictConditionAlgos();
+ }
+
+ @Override
+ public List conditions() {
+ return getPace().getConditionAlgos();
+ }
+
+ @Override
+ public List clusterings() {
+ return getPace().getClustering();
+ }
+
+ @Override
+ public Map> blacklists() {
+ return getPace().getBlacklists();
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
new file mode 100644
index 0000000..ffc67e7
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
@@ -0,0 +1,104 @@
+package eu.dnetlib.pace.config;
+
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import eu.dnetlib.pace.condition.ConditionAlgo;
+import eu.dnetlib.pace.model.ClusteringDef;
+import eu.dnetlib.pace.model.CondDef;
+import eu.dnetlib.pace.model.FieldDef;
+import org.apache.commons.collections.CollectionUtils;
+
+/**
+ * PACE section of the configuration: field model, conditions, clustering definitions and blacklists.
+ * NOTE(review): the generic type arguments were reconstructed from the usages in this class;
+ * confirm the element type of the blacklist values.
+ */
+public class PaceConfig {
+
+    private List<FieldDef> model;
+    private List<CondDef> strictConditions;
+    private List<CondDef> conditions;
+    private List<ClusteringDef> clustering;
+    private Map<String, List<String>> blacklists;
+
+    /** Field definitions indexed by name; populated by {@link #initModel()}. */
+    private Map<String, FieldDef> modelMap;
+
+    public PaceConfig() {}
+
+    /**
+     * (Re)builds the name-to-definition index from the model. A missing model yields an empty index.
+     */
+    public void initModel() {
+        modelMap = Maps.newHashMap();
+        if (getModel() == null) {
+            return;
+        }
+        for (final FieldDef fd : getModel()) {
+            modelMap.put(fd.getName(), fd);
+        }
+    }
+
+    public List<FieldDef> getModel() {
+        return model;
+    }
+
+    public void setModel(final List<FieldDef> model) {
+        this.model = model;
+    }
+
+    public List<CondDef> getStrictConditions() {
+        return strictConditions;
+    }
+
+    public void setStrictConditions(final List<CondDef> strictConditions) {
+        this.strictConditions = strictConditions;
+    }
+
+    public List<CondDef> getConditions() {
+        return conditions;
+    }
+
+    /**
+     * @return the condition algorithms built from the condition definitions; empty when none are defined
+     */
+    public List<ConditionAlgo> getConditionAlgos() {
+        return asConditionAlgos(getConditions());
+    }
+
+    /**
+     * @return the condition algorithms built from the strict condition definitions; empty when none are defined
+     */
+    public List<ConditionAlgo> getStrictConditionAlgos() {
+        return asConditionAlgos(getStrictConditions());
+    }
+
+    public void setConditions(final List<CondDef> conditions) {
+        this.conditions = conditions;
+    }
+
+    public List<ClusteringDef> getClustering() {
+        return clustering;
+    }
+
+    public void setClustering(final List<ClusteringDef> clustering) {
+        this.clustering = clustering;
+    }
+
+    public Map<String, List<String>> getBlacklists() {
+        return blacklists;
+    }
+
+    public void setBlacklists(final Map<String, List<String>> blacklists) {
+        this.blacklists = blacklists;
+    }
+
+    public Map<String, FieldDef> getModelMap() {
+        return modelMap;
+    }
+
+    public void setModelMap(final Map<String, FieldDef> modelMap) {
+        this.modelMap = modelMap;
+    }
+
+    // helper
+
+    /**
+     * Turns each condition definition into its algorithm, bound to the model fields the definition names.
+     * The fields are passed in model order.
+     */
+    private List<ConditionAlgo> asConditionAlgos(final List<CondDef> defs) {
+        final List<ConditionAlgo> algos = Lists.newArrayList();
+        if (CollectionUtils.isEmpty(defs)) {
+            return algos;
+        }
+        for (final CondDef cd : defs) {
+            final List<FieldDef> fields = getModel().stream()
+                    .filter(fd -> cd.getFields().contains(fd.getName()))
+                    .collect(Collectors.toList());
+            algos.add(cd.getConditionAlgo(fields));
+        }
+        return algos;
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java
new file mode 100644
index 0000000..0f1f696
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java
@@ -0,0 +1,5 @@
+package eu.dnetlib.pace.config;
+
+/**
+ * Enumerates the supported value types.
+ * NOTE(review): presumably the type declared for a model field; confirm against the classes that use it.
+ */
+public enum Type {
+ String, Int, List, JSON, URL
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java
new file mode 100644
index 0000000..9e836eb
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java
@@ -0,0 +1,254 @@
+package eu.dnetlib.pace.config;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import com.google.gson.GsonBuilder;
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * Workflow section of the dedup configuration.
+ * NOTE(review): the element type of {@code rootBuilder} and {@code skipList} was reconstructed as String
+ * from their descriptions; confirm against the callers.
+ */
+public class WfConfig {
+
+    /**
+     * Entity type.
+     */
+    private String entityType = "";
+
+    /**
+     * Sub-Entity type refers to one of fields declared in the model. See eu.dnetlib.pace.config.PaceConfig.modelMap
+     */
+    private String subEntityType = "";
+
+    /**
+     * Sub-Entity value declares a value for subTypes to be considered.
+     */
+    private String subEntityValue = "";
+
+    /**
+     * Field name used to sort the values in the reducer phase.
+     */
+    private String orderField = "";
+
+    /**
+     * Column Families involved in the relations redirection.
+     */
+    private List<String> rootBuilder = Lists.newArrayList();
+
+    /**
+     * Set of datasource namespace prefixes that won't be deduplicated.
+     */
+    private Set<String> skipList = Sets.newHashSet();
+
+    /**
+     * Subprefix used to build the root id, allows multiple dedup runs.
+     */
+    private String dedupRun = "";
+
+    /**
+     * Similarity threshold.
+     */
+    private double threshold = 0;
+
+    /** The queue max size. */
+    private int queueMaxSize = 2000;
+
+    /** The group max size. */
+    private int groupMaxSize;
+
+    /** The sliding window size. */
+    private int slidingWindowSize;
+
+    /** The configuration id. */
+    private String configurationId;
+
+    /** The include children. */
+    private boolean includeChildren;
+
+    /** Default maximum number of allowed children. */
+    private final static int MAX_CHILDREN = 10;
+
+    /** Maximum number of allowed children. */
+    private int maxChildren = MAX_CHILDREN;
+
+    public WfConfig() {}
+
+    /**
+     * Instantiates a new workflow config.
+     *
+     * @param entityType
+     *            the entity type
+     * @param orderField
+     *            the order field
+     * @param rootBuilder
+     *            the root builder families
+     * @param dedupRun
+     *            the dedup run; single quotes are removed
+     * @param threshold
+     *            the threshold
+     * @param skipList
+     *            the skip list
+     * @param queueMaxSize
+     *            the queue max size
+     * @param groupMaxSize
+     *            the group max size
+     * @param slidingWindowSize
+     *            the sliding window size
+     * @param includeChildren
+     *            allows the children to be included in the representative records or not.
+     */
+    public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder, final String dedupRun,
+            final double threshold,
+            final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren) {
+        this.entityType = entityType;
+        this.orderField = orderField;
+        this.rootBuilder = rootBuilder;
+        this.dedupRun = cleanupStringNumber(dedupRun);
+        this.threshold = threshold;
+        this.skipList = skipList;
+        this.queueMaxSize = queueMaxSize;
+        this.groupMaxSize = groupMaxSize;
+        this.slidingWindowSize = slidingWindowSize;
+        this.includeChildren = includeChildren;
+    }
+
+    /**
+     * Cleanup string number: removes every single quote.
+     *
+     * @param s
+     *            the s, may be null
+     * @return the string without single quotes, or null when the input is null
+     */
+    private String cleanupStringNumber(final String s) {
+        // literal replacement: no regular expression is needed to drop a fixed character
+        return s == null ? null : s.replace("'", "");
+    }
+
+    /**
+     * @return true when both the sub-entity type and the sub-entity value are not blank
+     */
+    public boolean hasSubType() {
+        return StringUtils.isNotBlank(getSubEntityType()) && StringUtils.isNotBlank(getSubEntityValue());
+    }
+
+    public String getEntityType() {
+        return entityType;
+    }
+
+    public void setEntityType(final String entityType) {
+        this.entityType = entityType;
+    }
+
+    public String getSubEntityType() {
+        return subEntityType;
+    }
+
+    public void setSubEntityType(final String subEntityType) {
+        this.subEntityType = subEntityType;
+    }
+
+    public String getSubEntityValue() {
+        return subEntityValue;
+    }
+
+    public void setSubEntityValue(final String subEntityValue) {
+        this.subEntityValue = subEntityValue;
+    }
+
+    public String getOrderField() {
+        return orderField;
+    }
+
+    public void setOrderField(final String orderField) {
+        this.orderField = orderField;
+    }
+
+    public List<String> getRootBuilder() {
+        return rootBuilder;
+    }
+
+    public void setRootBuilder(final List<String> rootBuilder) {
+        this.rootBuilder = rootBuilder;
+    }
+
+    /**
+     * @return the skip list, or a new empty set when none is defined
+     */
+    public Set<String> getSkipList() {
+        return skipList != null ? skipList : new HashSet<String>();
+    }
+
+    public void setSkipList(final Set<String> skipList) {
+        this.skipList = skipList;
+    }
+
+    public String getDedupRun() {
+        return dedupRun;
+    }
+
+    public void setDedupRun(final String dedupRun) {
+        this.dedupRun = dedupRun;
+    }
+
+    public double getThreshold() {
+        return threshold;
+    }
+
+    public void setThreshold(final double threshold) {
+        this.threshold = threshold;
+    }
+
+    public int getQueueMaxSize() {
+        return queueMaxSize;
+    }
+
+    public void setQueueMaxSize(final int queueMaxSize) {
+        this.queueMaxSize = queueMaxSize;
+    }
+
+    public int getGroupMaxSize() {
+        return groupMaxSize;
+    }
+
+    public void setGroupMaxSize(final int groupMaxSize) {
+        this.groupMaxSize = groupMaxSize;
+    }
+
+    public int getSlidingWindowSize() {
+        return slidingWindowSize;
+    }
+
+    public void setSlidingWindowSize(final int slidingWindowSize) {
+        this.slidingWindowSize = slidingWindowSize;
+    }
+
+    public String getConfigurationId() {
+        return configurationId;
+    }
+
+    public void setConfigurationId(final String configurationId) {
+        this.configurationId = configurationId;
+    }
+
+    public boolean isIncludeChildren() {
+        return includeChildren;
+    }
+
+    public void setIncludeChildren(final boolean includeChildren) {
+        this.includeChildren = includeChildren;
+    }
+
+    public int getMaxChildren() {
+        return maxChildren;
+    }
+
+    public void setMaxChildren(final int maxChildren) {
+        this.maxChildren = maxChildren;
+    }
+
+    /**
+     * @return the pretty-printed JSON form of this configuration
+     * @see java.lang.Object#toString()
+     */
+    @Override
+    public String toString() {
+        return new GsonBuilder().setPrettyPrinting().create().toJson(this);
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java
new file mode 100644
index 0000000..f9d189f
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java
@@ -0,0 +1,15 @@
+package eu.dnetlib.pace.distance;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.distance.eval.ScoreResult;
+import eu.dnetlib.pace.model.Document;
+
+/**
+ * Base implementation of {@link Distance}: converts both objects into Documents and scores them.
+ *
+ * @param <A>
+ *            the type of the objects being compared
+ */
+public abstract class AbstractDistance<A> implements Distance<A> {
+
+    /**
+     * Converts an object into the Document representation used for scoring.
+     *
+     * @param a
+     *            the object to convert
+     * @return the corresponding Document
+     */
+    protected abstract Document toDocument(A a);
+
+    /**
+     * Scores the two objects with a new {@link DistanceScorer} built on the given configuration.
+     */
+    @Override
+    public ScoreResult between(final A a, final A b, final Config config) {
+        return new DistanceScorer(config).distance(toDocument(a), toDocument(b));
+    }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/ConfigurableDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/ConfigurableDistanceAlgo.java
new file mode 100644
index 0000000..b354f06
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/ConfigurableDistanceAlgo.java
@@ -0,0 +1,26 @@
+package eu.dnetlib.pace.distance;
+
+import java.util.Map;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+
+/**
+ * Base class for distance algorithms configured with a parameter map and a weight.
+ */
+public abstract class ConfigurableDistanceAlgo extends AbstractPaceFunctions {
+
+    /** Weight given at construction time. */
+    private final double weight;
+
+    /** Parameters given at construction time. */
+    private final Map params;
+
+    /**
+     * @param params
+     *            the algorithm parameters
+     * @param weight
+     *            the algorithm weight
+     */
+    public ConfigurableDistanceAlgo(final Map params, final double weight) {
+        this.weight = weight;
+        this.params = params;
+    }
+
+    /**
+     * @return the algorithm parameters
+     */
+    public Map getParams() {
+        return this.params;
+    }
+
+    /**
+     * @return the algorithm weight (the accessor keeps its historical, misspelled name)
+     */
+    public double getWeigth() {
+        return this.weight;
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java
new file mode 100644
index 0000000..93a6e75
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java
@@ -0,0 +1,9 @@
+package eu.dnetlib.pace.distance;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.distance.eval.ScoreResult;
+
+/**
+ * Computes the score between two objects of the same type.
+ *
+ * @param <A>
+ *            the type of the objects being compared
+ */
+public interface Distance<A> {
+
+    /**
+     * @param a
+     *            the first object
+     * @param b
+     *            the second object
+     * @param config
+     *            the configuration driving the comparison
+     * @return the score result
+     */
+    public ScoreResult between(A a, A b, Config config);
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java
new file mode 100644
index 0000000..e9d0095
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java
@@ -0,0 +1,15 @@
+package eu.dnetlib.pace.distance;
+
+import eu.dnetlib.pace.model.Field;
+
+/**
+ * A distance algo is configured on each field and knows how to compute the distance (0-1) between the
+ * fields of two objects.
+ */
+public interface DistanceAlgo {
+
+    /**
+     * @param a
+     *            the field of the first object
+     * @param b
+     *            the field of the second object
+     * @return the distance between the two fields
+     */
+    double distance(Field a, Field b);
+
+    /**
+     * @return the weight of this algo
+     */
+    double getWeight();
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java
new file mode 100644
index 0000000..0cbb6f4
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java
@@ -0,0 +1,101 @@
+package eu.dnetlib.pace.distance;
+
+import java.util.Collection;
+import java.util.List;
+
+import eu.dnetlib.pace.condition.ConditionAlgo;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
+import eu.dnetlib.pace.distance.eval.DistanceEval;
+import eu.dnetlib.pace.distance.eval.DistanceEvalMap;
+import eu.dnetlib.pace.distance.eval.ScoreResult;
+import eu.dnetlib.pace.model.Document;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldDef;
+
+/**
+ * The distance between two documents is given by the weighted mean of the field distances
+ */
+public class DistanceScorer {
+
+ private Config config;
+
+ public DistanceScorer(final Config config) {
+ this.config = config;
+ }
+
+ public ScoreResult distance(final Document a, final Document b) {
+ final ScoreResult sr = new ScoreResult();
+ // Evaluate both configured condition sets on the pair and store their outcomes in the result.
+ sr.setStrictConditions(verify(a, b, config.strictConditions()));
+ sr.setConditions(verify(a, b, config.conditions()));
+ // The map is seeded with the total weight of the model, which it uses as denominator of the weighted mean.
+ final DistanceEvalMap dMap = new DistanceEvalMap(sumWeights(config.model()));
+
+ for (final FieldDef fd : config.model()) {
+ // One evaluation per field definition of the model.
+ dMap.updateDistance(fieldDistance(a, b, fd));
+ }
+ sr.setDistances(dMap);
+ return sr;
+ }
+
+ private ConditionEvalMap verify(final Document a, final Document b, final List conditions) {
+ final ConditionEvalMap res = new ConditionEvalMap();
+
+ for (final ConditionAlgo cd : conditions) {
+ final ConditionEvalMap map = cd.verify(a, b);
+ res.mergeFrom(map);
+
+ // commented out shortcuts
+ /*
+ if (map.anyNegative()) {
+ return res;
+ }
+ */
+
+ //if (strict && (res < 0)) return -1;
+ //cond += verify;
+ }
+ return res;
+ }
+
+ private DistanceEval fieldDistance(final Document a, final Document b, final FieldDef fd) {
+     // Weighted evaluation of a single field; -1 flags a missing value that must be skipped.
+     final double weight = fd.getWeight();
+     final Field left = getValue(a, fd);
+     final Field right = getValue(b, fd);
+     final DistanceEval eval = new DistanceEval(fd, left, right);
+
+     // Zero-weight fields are returned untouched (optimization).
+     if (weight == 0) {
+         return eval;
+     }
+     // A missing value is either flagged with -1 or scored with the full weight.
+     if (left.isEmpty() || right.isEmpty()) {
+         eval.setDistance(fd.isIgnoreMissing() ? -1 : weight);
+         return eval;
+     }
+     // Values of different types cannot be compared.
+     if (!left.getType().equals(right.getType())) {
+         throw new IllegalArgumentException(String.format("Types are differents type: %s:%s - %s:%s", left, left.getType(), right, right.getType()));
+     }
+
+     // Comparable values: delegate to the field's algorithm and apply the weight.
+     eval.setDistance(weight * fd.getDistanceAlgo().distance(left, right));
+     return eval;
+ }
+
+ private Field getValue(final Document d, final FieldDef fd) {
+ return d.values(fd.getName());
+ }
+
+ private double sumWeights(final Collection fields) {
+ double sum = 0.0;
+ for (final FieldDef fd : fields) {
+ sum += fd.getWeight();
+ }
+ return sum;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java
new file mode 100644
index 0000000..7651479
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java
@@ -0,0 +1,12 @@
+package eu.dnetlib.pace.distance;
+
+import eu.dnetlib.pace.model.Document;
+
+public class PaceDocumentDistance extends AbstractDistance {
+
+ @Override
+ protected Document toDocument(Document a) {
+ return a;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java
new file mode 100644
index 0000000..8329604
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java
@@ -0,0 +1,100 @@
+package eu.dnetlib.pace.distance;
+
+import java.util.List;
+
+import com.wcohen.ss.AbstractStringDistance;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Type;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldList;
+
+/**
+ * For the rest of the fields delegate the distance measure to the second string library.
+ */
+public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions implements DistanceAlgo {
+
+ // val aliases = Map(('₁' to '₉') zip ('1' to '9'): _*) ++ Map(('⁴' to '⁹') zip ('4' to '9'): _*) ++ Map('¹' -> '1', '²' ->
+ // '2', * '³'
+ // -> '3')
+
+ /** The ssalgo. */
+ protected AbstractStringDistance ssalgo;
+
+ /** The weight. */
+ protected double weight = 0.0;
+
+ /**
+ * Instantiates a new second string distance algo.
+ *
+ * @param weight
+ * the weight
+ * @param ssalgo
+ * the ssalgo
+ */
+ protected SecondStringDistanceAlgo(final double weight, final AbstractStringDistance ssalgo) {
+ this.ssalgo = ssalgo;
+ this.weight = weight;
+ }
+
+ /**
+ * Normalize.
+ *
+ * @param d
+ * the d
+ * @return the double
+ */
+ protected abstract double normalize(double d);
+
+ /**
+ * Distance.
+ *
+ * @param a
+ * the a
+ * @param b
+ * the b
+ * @return the double
+ */
+ public double distance(final String a, final String b) {
+ double score = ssalgo.score(a, b);
+ return normalize(score);
+ }
+
+ /**
+ * Distance.
+ *
+ * @param a
+ * the a
+ * @param b
+ * the b
+ * @return the double
+ */
+ protected double distance(final List a, final List b) {
+ return distance(concat(a), concat(b));
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.distance.DistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
+ */
+ @Override
+ public double distance(final Field a, final Field b) {
+ if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue());
+ if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b));
+
+ throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
+ }
+
+ /**
+ * To list.
+ *
+ * @param list
+ * the list
+ * @return the list
+ */
+ protected List toList(final Field list) {
+ return ((FieldList) list).stringList();
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java
new file mode 100644
index 0000000..9044982
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java
@@ -0,0 +1,31 @@
+package eu.dnetlib.pace.distance.algo;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+
+public class AlwaysMatch extends SecondStringDistanceAlgo {
+
+ public AlwaysMatch(final double weight) {
+ super(weight, new com.wcohen.ss.JaroWinkler());
+ }
+
+ protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
+
+ @Override
+ public double distance(final String a, final String b) {
+ return 1.0;
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(final double d) {
+ return d;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java
new file mode 100644
index 0000000..ef95c02
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java
@@ -0,0 +1,31 @@
+package eu.dnetlib.pace.distance.algo;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+
+public class ExactMatch extends SecondStringDistanceAlgo {
+
+ public ExactMatch(final double weight) {
+ super(weight, new com.wcohen.ss.JaroWinkler());
+ }
+
+ protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
+
+ @Override
+ public double distance(final String a, final String b) {
+ return a.equals(b) ? 1.0 : 0;
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(final double d) {
+ return d;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java
new file mode 100644
index 0000000..87f6c4e
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java
@@ -0,0 +1,35 @@
+package eu.dnetlib.pace.distance.algo;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+
+//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
+public class JaroWinkler extends SecondStringDistanceAlgo {
+
+ public JaroWinkler(double weight) {
+ super(weight, new com.wcohen.ss.JaroWinkler());
+ }
+
+ protected JaroWinkler(double weight, AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
+
+ @Override
+ public double distance(String a, String b) {
+ String ca = cleanup(a);
+ String cb = cleanup(b);
+
+ return normalize(ssalgo.score(ca, cb));
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(double d) {
+ return d;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java
new file mode 100644
index 0000000..1419a07
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java
@@ -0,0 +1,36 @@
+package eu.dnetlib.pace.distance.algo;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+
+//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
+public class JaroWinklerTitle extends SecondStringDistanceAlgo {
+
+ public JaroWinklerTitle(double weight) {
+ super(weight, new com.wcohen.ss.JaroWinkler());
+ }
+
+ protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
+
+ @Override
+ public double distance(String a, String b) {
+ String ca = cleanup(a);
+ String cb = cleanup(b);
+
+ boolean check = checkNumbers(ca, cb);
+ return check ? 0.5 : normalize(ssalgo.score(ca, cb));
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(double d) {
+ return d;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java
new file mode 100644
index 0000000..3ad1cfa
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java
@@ -0,0 +1,26 @@
+package eu.dnetlib.pace.distance.algo;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+
+public class Level2JaroWinkler extends SecondStringDistanceAlgo {
+
+ public Level2JaroWinkler(double w) {
+ super(w, new com.wcohen.ss.Level2JaroWinkler());
+ }
+
+ protected Level2JaroWinkler(double w, AbstractStringDistance ssalgo) {
+ super(w, ssalgo);
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(double d) {
+ return d;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java
new file mode 100644
index 0000000..a1c3472
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java
@@ -0,0 +1,41 @@
+package eu.dnetlib.pace.distance.algo;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+
+public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo {
+
+ public Level2JaroWinklerTitle(final double w) {
+ super(w, new com.wcohen.ss.Level2JaroWinkler());
+ }
+
+ protected Level2JaroWinklerTitle(final double w, final AbstractStringDistance ssalgo) {
+ super(w, ssalgo);
+ }
+
+ @Override
+ public double distance(final String a, final String b) {
+     final String ca = cleanup(a);
+     final String cb = cleanup(b);
+
+     // When checkNumbers(...) returns true the pair gets the fixed score 0.5.
+     if (checkNumbers(ca, cb)) return 0.5;
+
+     final String cca = finalCleanup(ca);
+     final String ccb = finalCleanup(cb);
+
+     // Route the raw score through normalize(), as the base class and JaroWinklerTitle do.
+     return normalize(ssalgo.score(cca, ccb));
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(final double d) {
+ return d;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java
new file mode 100644
index 0000000..7a2b029
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java
@@ -0,0 +1,26 @@
+package eu.dnetlib.pace.distance.algo;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+
+public class Level2Levenstein extends SecondStringDistanceAlgo {
+
+ public Level2Levenstein(double w) {
+ super(w, new com.wcohen.ss.Level2Levenstein());
+ }
+
+ protected Level2Levenstein(double w, AbstractStringDistance ssalgo) {
+ super(w, ssalgo);
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(double d) {
+ return 1 / Math.pow(Math.abs(d) + 1, 0.1);
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java
new file mode 100644
index 0000000..9dfce83
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java
@@ -0,0 +1,26 @@
+package eu.dnetlib.pace.distance.algo;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+
+public class Levenstein extends SecondStringDistanceAlgo {
+
+ public Levenstein(double w) {
+ super(w, new com.wcohen.ss.Levenstein());
+ }
+
+ protected Levenstein(double w, AbstractStringDistance ssalgo) {
+ super(w, ssalgo);
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(double d) {
+ return 1 / Math.pow(Math.abs(d) + 1, 0.1);
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinDate.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinDate.java
new file mode 100644
index 0000000..5452955
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinDate.java
@@ -0,0 +1,25 @@
+package eu.dnetlib.pace.distance.algo;
+
+
+public class LevensteinDate extends Levenstein {
+
+
+ public LevensteinDate(double w) {
+ super(w);
+ }
+
+
+ @Override
+ public double distance(String a, String b) {
+
+ return 1.0;
+ }
+
+
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java
new file mode 100644
index 0000000..281de31
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java
@@ -0,0 +1,45 @@
+package eu.dnetlib.pace.distance.algo;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+
+public class LevensteinTitle extends SecondStringDistanceAlgo {
+
+ public LevensteinTitle(final double w) {
+ super(w, new com.wcohen.ss.Levenstein());
+ }
+
+ protected LevensteinTitle(final double w, final AbstractStringDistance ssalgo) {
+ super(w, ssalgo);
+ }
+
+ @Override
+ public double distance(final String a, final String b) {
+ final String ca = cleanup(a);
+ final String cb = cleanup(b);
+
+ final boolean check = checkNumbers(ca, cb);
+
+ if (check) return 0.5;
+
+ final String cca = finalCleanup(ca);
+ final String ccb = finalCleanup(cb);
+
+ return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
+ }
+
+ private double normalize(final double score, final int la, final int lb) {
+ return 1 - (Math.abs(score) / Math.max(la, lb)); // NOTE(review): la == lb == 0 with score 0 yields 0/0 = NaN — confirm two empty strings cannot reach this point
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(final double d) {
+ return 1 / Math.pow(Math.abs(d) + 1, 0.1);
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java
new file mode 100644
index 0000000..1177ed5
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java
@@ -0,0 +1,31 @@
+package eu.dnetlib.pace.distance.algo;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+
+public class MustBeDifferent extends SecondStringDistanceAlgo {
+
+ public MustBeDifferent(final double weight) {
+ super(weight, new com.wcohen.ss.JaroWinkler());
+ }
+
+ protected MustBeDifferent(final double weight, final AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
+
+ @Override
+ public double distance(final String a, final String b) {
+ return !a.equals(b) ? 1.0 : 0;
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ @Override
+ protected double normalize(final double d) {
+ return d;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java
new file mode 100644
index 0000000..8070a00
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java
@@ -0,0 +1,22 @@
+package eu.dnetlib.pace.distance.algo;
+
+import eu.dnetlib.pace.distance.DistanceAlgo;
+import eu.dnetlib.pace.model.Field;
+
+/**
+ * Not all fields of a document need to participate in the distance measure. We model those fields as having a
+ * NullDistanceAlgo.
+ */
+public class NullDistanceAlgo implements DistanceAlgo {
+
+ @Override
+ public double distance(Field a, Field b) {
+ return 0.0;
+ }
+
+ @Override
+ public double getWeight() {
+ return 0.0;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java
new file mode 100644
index 0000000..d834207
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java
@@ -0,0 +1,52 @@
+package eu.dnetlib.pace.distance.algo;
+
+import com.wcohen.ss.AbstractStringDistance;
+
+/**
+ * The Class SortedJaroWinkler.
+ */
+public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
+
+ /**
+ * Instantiates a new sorted jaro winkler.
+ *
+ * @param weight
+ * the weight
+ */
+ public SortedJaroWinkler(final double weight) {
+ super(weight, new com.wcohen.ss.JaroWinkler());
+ }
+
+ /**
+ * Instantiates a new sorted jaro winkler.
+ *
+ * @param weight
+ * the weight
+ * @param ssalgo
+ * the ssalgo
+ */
+ protected SortedJaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
+ */
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
+ */
+ @Override
+ protected double normalize(final double d) {
+ return d;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java
new file mode 100644
index 0000000..43ac190
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java
@@ -0,0 +1,52 @@
+package eu.dnetlib.pace.distance.algo;
+
+import com.wcohen.ss.AbstractStringDistance;
+
+/**
+ * The Class SortedLevel2JaroWinkler.
+ */
+public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
+
+ /**
+ * Instantiates a new sorted level2 jaro winkler.
+ *
+ * @param weight
+ * the weight
+ */
+ public SortedLevel2JaroWinkler(final double weight) {
+ super(weight, new com.wcohen.ss.Level2JaroWinkler());
+ }
+
+ /**
+ * Instantiates a new sorted level2 jaro winkler.
+ *
+ * @param weight
+ * the weight
+ * @param ssalgo
+ * the ssalgo
+ */
+ protected SortedLevel2JaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
+ */
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
+ */
+ @Override
+ protected double normalize(final double d) {
+ return d;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java
new file mode 100644
index 0000000..d47fbba
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java
@@ -0,0 +1,43 @@
+package eu.dnetlib.pace.distance.algo;
+
+import java.util.Collections;
+import java.util.List;
+
+import com.google.common.collect.Lists;
+import com.wcohen.ss.AbstractStringDistance;
+
+import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldList;
+
+/**
+ * Variant of SecondStringDistanceAlgo that sorts the values of a list field before they are compared.
+ */
+public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanceAlgo {
+
+ /**
+ * Instantiates a new sorted second string distance algo.
+ *
+ * @param weight
+ * the weight
+ * @param ssalgo
+ * the ssalgo
+ */
+ protected SortedSecondStringDistanceAlgo(final double weight, final AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field)
+ */
+ @Override
+ protected List toList(final Field list) {
+ FieldList fl = (FieldList) list;
+ List values = Lists.newArrayList(fl.stringList());
+ Collections.sort(values);
+ return values;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java
new file mode 100644
index 0000000..1fa358b
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java
@@ -0,0 +1,90 @@
+package eu.dnetlib.pace.distance.algo;
+
+import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+import org.apache.commons.lang.StringUtils;
+
+import com.wcohen.ss.AbstractStringDistance;
+
+import eu.dnetlib.pace.config.Type;
+import eu.dnetlib.pace.model.Field;
+
+/**
+ * The Class SubStringLevenstein.
+ */
+public class SubStringLevenstein extends SecondStringDistanceAlgo {
+
+ /** The limit. */
+ protected int limit;
+
+ /**
+ * Instantiates a new sub string levenstein without setting {@code limit}.
+ * NOTE(review): limit keeps its default 0 here, so distance(Field, Field) truncates both values to length 0 — confirm intended.
+ * @param w
+ * the weight, passed on to the superclass
+ */
+ public SubStringLevenstein(final double w) {
+ super(w, new com.wcohen.ss.Levenstein());
+ }
+
+ /**
+ * Instantiates a new sub string levenstein.
+ *
+ * @param w
+ * the w
+ * @param limit
+ * the limit
+ */
+ public SubStringLevenstein(final double w, final int limit) {
+ super(w, new com.wcohen.ss.Levenstein());
+ this.limit = limit;
+ }
+
+ /**
+ * Instantiates a new sub string levenstein.
+ *
+ * @param w
+ * the w
+ * @param limit
+ * the limit
+ * @param ssalgo
+ * the ssalgo
+ */
+ protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
+ super(w, ssalgo);
+ this.limit = limit;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
+ */
+ @Override
+ public double distance(final Field a, final Field b) {
+ if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
+ return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit));
+
+ throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight()
+ */
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double)
+ */
+ @Override
+ protected double normalize(final double d) {
+ return 1 / Math.pow(Math.abs(d) + 1, 0.1);
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java
new file mode 100644
index 0000000..46a438e
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java
@@ -0,0 +1,48 @@
+package eu.dnetlib.pace.distance.algo;
+
+import eu.dnetlib.pace.model.Field;
+import org.apache.commons.lang.StringUtils;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Map;
+
+public class UrlMatcher extends Levenstein {
+
+ private Map params;
+
+ public UrlMatcher(double weight, Map params) {
+ super(weight);
+ this.params = params;
+ }
+
+ @Override
+ public double distance(Field a, Field b) {
+     // Compares the values returned by getFirstValue(...) as URLs: host first, then path.
+     final URL left = asUrl(getFirstValue(a));
+     final URL right = asUrl(getFirstValue(b));
+
+     final boolean sameHost = left.getHost().equalsIgnoreCase(right.getHost());
+     if (!sameHost) return 0.0; // different hosts score 0
+
+     // Both weights are read from the "host" and "path" entries of params.
+     final double hostWeight = params.get("host").doubleValue();
+     final double pathWeight = params.get("path").doubleValue();
+
+     final boolean pathMissing = StringUtils.isBlank(left.getPath()) || StringUtils.isBlank(right.getPath());
+     if (pathMissing) return hostWeight * 0.5; // same host, but at least one blank path
+     // Same host and two non-blank paths: add the weighted string distance of the paths.
+     final double pathScore = super.distance(left.getPath(), right.getPath());
+     return hostWeight + pathWeight * pathScore;
+ }
+
+ private URL asUrl(final String value) {
+     try {
+         return new URL(value);
+     } catch (MalformedURLException e) {
+         // should not happen as checked by pace typing; the cause is kept for diagnosis
+         throw new IllegalStateException("invalid URL: " + value, e);
+     }
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/YearLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/YearLevenstein.java
new file mode 100644
index 0000000..4e9796c
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/YearLevenstein.java
@@ -0,0 +1,37 @@
+package eu.dnetlib.pace.distance.algo;
+
+
+public class YearLevenstein extends SubStringLevenstein {
+
+ public YearLevenstein(double w) {
+ super(w);
+ }
+
+ public YearLevenstein(double w, int limit) {
+ super(w, limit);
+ }
+
+ @Override
+ public double distance(String a, String b) {
+     // Only when both values pass checkLength(...) are they compared at all.
+     final boolean comparable = checkLength(a) && checkLength(b);
+     if (!comparable) {
+         // At least one value failed the length check: the pair scores 1.0.
+         return 1.0;
+     }
+
+     // Equal values score 1.0, different values 0.5.
+     final boolean same = a.equals(b);
+     return same ? 1.0 : 0.5;
+ }
+
+ protected boolean checkLength(String s) {
+ return getNumbers(s).length() == limit;
+ }
+
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java
new file mode 100644
index 0000000..49e526f
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java
@@ -0,0 +1,57 @@
+package eu.dnetlib.pace.distance.eval;
+
+import eu.dnetlib.pace.config.Cond;
+import eu.dnetlib.pace.model.Field;
+
+/**
+ * Created by claudio on 09/03/16.
+ */
+public class ConditionEval {
+
+ private Cond cond;
+
+ private Field a;
+
+ private Field b;
+
+ private int result;
+
+ public ConditionEval(final Cond cond, final Field a, final Field b, final int result) {
+ this.cond = cond;
+ this.a = a;
+ this.b = b;
+ this.result = result;
+ }
+
+ public Field getA() {
+ return a;
+ }
+
+ public void setA(final Field a) {
+ this.a = a;
+ }
+
+ public Field getB() {
+ return b;
+ }
+
+ public void setB(final Field b) {
+ this.b = b;
+ }
+
+ public int getResult() {
+ return result;
+ }
+
+ public void setResult(final int result) {
+ this.result = result;
+ }
+
+ public Cond getCond() {
+ return cond;
+ }
+
+ public void setCond(final Cond cond) {
+ this.cond = cond;
+ }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEvalMap.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEvalMap.java
new file mode 100644
index 0000000..a851596
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEvalMap.java
@@ -0,0 +1,38 @@
+package eu.dnetlib.pace.distance.eval;
+
+import java.util.HashMap;
+
+import com.google.common.base.Predicate;
+import com.google.common.collect.Iterables;
+
+/**
+ * Created by claudio on 09/03/16.
+ */
+public class ConditionEvalMap extends HashMap {
+
+
+ public ConditionEvalMap mergeFrom(ConditionEvalMap map) {
+ putAll(map);
+ return this;
+ }
+
+ public boolean anyNegative() {
+     // True as soon as one stored evaluation has a negative result; false for an empty map.
+     return values().stream().anyMatch(ec -> ec.getResult() < 0);
+ }
+
+ public boolean isZero() {
+ return result() == 0;
+ }
+
+ public int result() {
+ int res = 0;
+ for(ConditionEval ec : values()) {
+ final int verify = ec.getResult();
+ if (verify < 0) return -1;
+ res += verify;
+ }
+ return res;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java
new file mode 100644
index 0000000..a943d4c
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java
@@ -0,0 +1,57 @@
+package eu.dnetlib.pace.distance.eval;
+
+import eu.dnetlib.pace.config.Algo;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldDef;
+
+/**
+ * Created by claudio on 09/03/16.
+ */
+public class DistanceEval {
+
+ private FieldDef fieldDef;
+
+ private Field a;
+
+ private Field b;
+
+ private double distance = 0.0;
+
+ public DistanceEval(final FieldDef fieldDef, final Field a, final Field b) {
+ this.fieldDef = fieldDef;
+ this.a = a;
+ this.b = b;
+ }
+
+ public Field getA() {
+ return a;
+ }
+
+ public void setA(final Field a) {
+ this.a = a;
+ }
+
+ public Field getB() {
+ return b;
+ }
+
+ public void setB(final Field b) {
+ this.b = b;
+ }
+
+ public FieldDef getFieldDef() {
+ return fieldDef;
+ }
+
+ public void setFieldDef(final FieldDef fieldDef) {
+ this.fieldDef = fieldDef;
+ }
+
+ public double getDistance() {
+ return distance;
+ }
+
+ public void setDistance(final double distance) {
+ this.distance = distance;
+ }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEvalMap.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEvalMap.java
new file mode 100644
index 0000000..764e603
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEvalMap.java
@@ -0,0 +1,32 @@
+package eu.dnetlib.pace.distance.eval;
+
+import java.util.HashMap;
+
+/**
+ * Created by claudio on 10/03/16.
+ */
+public class DistanceEvalMap extends HashMap {
+
+ private double sumWeights;
+
+ private double sumDistances = 0.0;
+
+ public DistanceEvalMap(final double sumWeights) {
+ this.sumWeights = sumWeights;
+ }
+
+ public void updateDistance(final DistanceEval d) {
+ // Stores the evaluation under its field name; a distance that is not >= 0 removes the field's weight from the denominator instead of adding to the sum.
+ put(d.getFieldDef().getName(), d);
+ if (d.getDistance() >= 0) {
+ sumDistances += d.getDistance();
+ } else {
+ sumWeights -= d.getFieldDef().getWeight();
+ }
+ }
+
+ public double distance() {
+ return sumWeights == 0 ? 0 : sumDistances / sumWeights;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java
new file mode 100644
index 0000000..a61cf6e
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java
@@ -0,0 +1,55 @@
+package eu.dnetlib.pace.distance.eval;
+
+import com.google.gson.GsonBuilder;
+
+/**
+ * Created by claudio on 09/03/16.
+ */
+public class ScoreResult {
+
+ private ConditionEvalMap strictConditions;
+
+ private ConditionEvalMap conditions;
+
+ private DistanceEvalMap distances;
+
+ public double getScore() {
+ // A positive strict-condition result yields 1.0 outright; a negative condition result yields 0.0.
+ if (getStrictConditions().result() > 0) return 1.0;
+ // if (getStrictConditions().result() < 0) return 0.0;
+ if (getConditions().result() < 0) return 0.0;
+ // Otherwise the score is the aggregated distance.
+ return getDistances().distance();
+ }
+
+
+ public ConditionEvalMap getStrictConditions() {
+ return strictConditions;
+ }
+
+ public void setStrictConditions(final ConditionEvalMap strictConditions) {
+ this.strictConditions = strictConditions;
+ }
+
+ public ConditionEvalMap getConditions() {
+ return conditions;
+ }
+
+ public void setConditions(final ConditionEvalMap conditions) {
+ this.conditions = conditions;
+ }
+
+ public DistanceEvalMap getDistances() {
+ return distances;
+ }
+
+ public void setDistances(final DistanceEvalMap distances) {
+ this.distances = distances;
+ }
+
+ @Override
+ public String toString() {
+ final GsonBuilder b = new GsonBuilder();
+ return b.setPrettyPrinting().create().toJson(this);
+ }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java
new file mode 100644
index 0000000..b418b75
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java
@@ -0,0 +1,74 @@
+package eu.dnetlib.pace.model;
+
+import eu.dnetlib.pace.config.Type;
+
+/**
+ * The Class AbstractField.
+ */
+public abstract class AbstractField implements Field {
+
+ /** The type. */
+ protected Type type = Type.String;
+
+ /** The name. */
+ protected String name;
+
+ /**
+ * Instantiates a new abstract field.
+ */
+ protected AbstractField() {}
+
+ /**
+ * Instantiates a new abstract field.
+ *
+ * @param type
+ * the type
+ * @param name
+ * the name
+ */
+ protected AbstractField(final Type type, final String name) {
+ this.type = type;
+ this.name = name;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.Field#getName()
+ */
+ @Override
+ public String getName() {
+ return name;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.Field#getType()
+ */
+ @Override
+ public Type getType() {
+ return type;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.Field#setName(java.lang.String)
+ */
+ @Override
+ public void setName(final String name) {
+ this.name = name;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.Field#setType(eu.dnetlib.pace.config.Type)
+ */
+ @Override
+ public void setType(final Type type) {
+ this.type = type;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java
new file mode 100644
index 0000000..5909788
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java
@@ -0,0 +1,77 @@
+package eu.dnetlib.pace.model;
+
+import java.util.List;
+import java.util.Map;
+
+import com.google.gson.Gson;
+import eu.dnetlib.pace.clustering.*;
+
+public class ClusteringDef {
+
+ private Clustering name;
+
+ private List fields;
+
+ private Map params;
+
+ public ClusteringDef() {}
+
+ public Clustering getName() {
+ return name;
+ }
+
+ public void setName(final Clustering name) {
+ this.name = name;
+ }
+
+ public ClusteringFunction getClusteringFunction() {
+ switch (getName()) {
+ case acronyms:
+ return new Acronyms(getParams());
+ case ngrams:
+ return new Ngrams(getParams());
+ case ngrampairs:
+ return new NgramPairs(getParams());
+ case sortedngrampairs:
+ return new SortedNgramPairs(getParams());
+ case suffixprefix:
+ return new SuffixPrefix(getParams());
+ case spacetrimmingfieldvalue:
+ return new SpaceTrimmingFieldValue(getParams());
+ case immutablefieldvalue:
+ return new ImmutableFieldValue(getParams());
+ case personhash:
+ return new PersonHash(getParams());
+ case personclustering:
+ return new PersonClustering(getParams());
+ case lowercase:
+ return new LowercaseClustering(getParams());
+ case urlclustering:
+ return new UrlClustering(getParams());
+ default:
+ return new RandomClusteringFunction(getParams());
+ }
+ }
+
+ public List getFields() {
+ return fields;
+ }
+
+ public void setFields(final List fields) {
+ this.fields = fields;
+ }
+
+ public Map getParams() {
+ return params;
+ }
+
+ public void setParams(final Map params) {
+ this.params = params;
+ }
+
+ @Override
+ public String toString() {
+ return new Gson().toJson(this);
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java
new file mode 100644
index 0000000..33f30fa
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java
@@ -0,0 +1,61 @@
+package eu.dnetlib.pace.model;
+
+import java.util.List;
+
+import com.google.gson.Gson;
+import eu.dnetlib.pace.condition.*;
+import eu.dnetlib.pace.config.Cond;
+
+public class CondDef {
+
+ private Cond name;
+
+ private List fields;
+
+ public CondDef() {}
+
+ public ConditionAlgo getConditionAlgo(final List fields) {
+ switch (getName()) {
+ case yearMatch:
+ return new YearMatch(getName(), fields);
+ case titleVersionMatch:
+ return new TitleVersionMatch(getName(), fields);
+ case sizeMatch:
+ return new SizeMatch(getName(), fields);
+ case exactMatch:
+ return new ExactMatch(getName(), fields);
+ case mustBeDifferent:
+ return new MustBeDifferent(getName(), fields);
+ case exactMatchIgnoreCase:
+ return new ExactMatchIgnoreCase(getName(), fields);
+ case doiExactMatch:
+ return new DoiExactMatch(getName(), fields);
+ case pidMatch:
+ return new PidMatch(getName(), fields);
+ default:
+ return new AlwaysTrueCondition(getName(), fields);
+ }
+ }
+
+ public Cond getName() {
+ return name;
+ }
+
+ public void setName(final Cond name) {
+ this.name = name;
+ }
+
+ public List getFields() {
+ return fields;
+ }
+
+ public void setFields(final List fields) {
+ this.fields = fields;
+ }
+
+ @Override
+ public String toString() {
+ return new Gson().toJson(this);
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java
new file mode 100644
index 0000000..fcacadd
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java
@@ -0,0 +1,39 @@
+package eu.dnetlib.pace.model;
+
+import java.util.Set;
+
+/**
+ * The Interface Document. Models the common operations available on a Pace Document.
+ */
+public interface Document {
+
+ /**
+ * Gets the identifier.
+ *
+ * @return the identifier
+ */
+ String getIdentifier();
+
+ /**
+ * Gets all the fields of this document.
+ *
+ * @return the fields, as an iterable
+ */
+ Iterable<Field> fields();
+
+ /**
+ * Gets the field stored under the given name.
+ *
+ * @param name
+ * the field name
+ * @return the field; may be null when the document has no such field (implementation dependent)
+ */
+ Field values(String name);
+
+ /**
+ * Gets the names of the fields of this document.
+ *
+ * @return the set of field names
+ */
+ Set<String> fieldNames();
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java
new file mode 100644
index 0000000..9c97ce3
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java
@@ -0,0 +1,54 @@
+package eu.dnetlib.pace.model;
+
+import eu.dnetlib.pace.config.Type;
+
+/**
+ * The Interface Field. A named, typed value that can also be iterated as a sequence of fields.
+ */
+public interface Field extends Iterable<Field> {
+
+ /**
+ * Gets the name.
+ *
+ * @return the name
+ */
+ public String getName();
+
+ /**
+ * Sets the name.
+ *
+ * @param name
+ * the new name
+ */
+ public void setName(String name);
+
+ /**
+ * Gets the type.
+ *
+ * @return the type
+ */
+ public Type getType();
+
+ /**
+ * Sets the type.
+ *
+ * @param type
+ * the new type
+ */
+ public void setType(Type type);
+
+ /**
+ * Checks whether this field is considered empty; the exact rule is up to the implementation.
+ *
+ * @return true, if is empty
+ */
+ public boolean isEmpty();
+
+ /**
+ * Renders the content of this field as a string.
+ *
+ * @return the string form of the value(s)
+ */
+ public String stringValue();
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
new file mode 100644
index 0000000..776c203
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
@@ -0,0 +1,163 @@
+package eu.dnetlib.pace.model;
+
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import com.google.gson.Gson;
+import eu.dnetlib.pace.config.Algo;
+import eu.dnetlib.pace.config.Type;
+import eu.dnetlib.pace.distance.*;
+import eu.dnetlib.pace.distance.algo.*;
+
+/**
+ * The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm.
+ */
+public class FieldDef {
+
+ public final static String PATH_SEPARATOR = "/";
+
+ private Algo algo;
+
+ private String name;
+
+ private String path;
+
+ private boolean ignoreMissing;
+
+ private Type type;
+
+ private boolean overrideMatch;
+
+ private double weight;
+
+ private int limit = -1;
+
+ private Map params;
+
+ public FieldDef() {}
+
+ // Builds a Field named after this definition from s: Int is parsed, String is kept as is, List yields an empty list (s unused); any other type throws.
+ public Field apply(final Type type, final String s) {
+ switch (type) {
+ case Int:
+ return new FieldValueImpl(type, name, Integer.parseInt(s));
+ case String:
+ return new FieldValueImpl(type, name, s);
+ case List:
+ return new FieldListImpl(name, type);
+ default:
+ throw new IllegalArgumentException("Casting not implemented for type " + type);
+ }
+ }
+
+ public String getName() {
+ return name;
+ }
+
+ public String getPath() {
+ return path;
+ }
+ /** @return the segments of getPath() split on PATH_SEPARATOR, as a new mutable list */
+ public List<String> getPathList() {
+ return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath()));
+ }
+
+ public DistanceAlgo getDistanceAlgo() {
+ switch (getAlgo()) {
+ case JaroWinkler:
+ return new JaroWinkler(getWeight());
+ case JaroWinklerTitle:
+ return new JaroWinklerTitle(getWeight());
+ case Level2JaroWinkler:
+ return new Level2JaroWinkler(getWeight());
+ case Level2JaroWinklerTitle:
+ return new Level2JaroWinklerTitle(getWeight());
+ case Level2Levenstein:
+ return new Level2Levenstein(getWeight());
+ case Levenstein:
+ return new Levenstein(getWeight());
+ case LevensteinTitle:
+ return new LevensteinTitle(getWeight());
+ case SubStringLevenstein:
+ return new SubStringLevenstein(getWeight(), getLimit());
+ case YearLevenstein:
+ return new YearLevenstein(getWeight(), getLimit());
+ case SortedJaroWinkler:
+ return new SortedJaroWinkler(getWeight());
+ case SortedLevel2JaroWinkler:
+ return new SortedLevel2JaroWinkler(getWeight());
+ case urlMatcher:
+ return new UrlMatcher(getWeight(), getParams());
+ case ExactMatch:
+ return new ExactMatch(getWeight());
+ case MustBeDifferent:
+ return new MustBeDifferent(getWeight());
+ case AlwaysMatch:
+ return new AlwaysMatch(getWeight());
+ case Null:
+ return new NullDistanceAlgo();
+ default:
+ return new NullDistanceAlgo();
+ }
+ }
+
+ public boolean isIgnoreMissing() {
+ return ignoreMissing;
+ }
+
+ public Type getType() {
+ return type;
+ }
+
+ public void setType(final Type type) {
+ this.type = type;
+ }
+
+ public boolean isOverrideMatch() {
+ return overrideMatch;
+ }
+
+ public void setOverrideMatch(final boolean overrideMatch) {
+ this.overrideMatch = overrideMatch;
+ }
+
+ @Override
+ public String toString() {
+ return new Gson().toJson(this);
+ }
+
+ public double getWeight() {
+ return weight;
+ }
+
+ public void setWeight(final double weight) {
+ this.weight = weight;
+ }
+
+ public Algo getAlgo() {
+ return algo;
+ }
+
+ public void setAlgo(final Algo algo) {
+ this.algo = algo;
+ }
+
+ public int getLimit() {
+ return limit;
+ }
+
+ public void setLimit(final int limit) {
+ this.limit = limit;
+ }
+
+ public Map getParams() {
+ return params;
+ }
+
+ public void setParams(final Map params) {
+ this.params = params;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java
new file mode 100644
index 0000000..3b771fa
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java
@@ -0,0 +1,17 @@
+package eu.dnetlib.pace.model;
+
+import java.util.List;
+
+/**
+ * The Interface FieldList. A Field that is also a java.util.List of fields.
+ */
+public interface FieldList extends List<Field>, Field {
+
+ /**
+ * Renders the contained fields as strings.
+ *
+ * @return the list of string values
+ */
+ public List<String> stringList();
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java
new file mode 100644
index 0000000..17d0cae
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java
@@ -0,0 +1,327 @@
+package eu.dnetlib.pace.model;
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.ListIterator;
+
+import com.google.common.base.Function;
+import com.google.common.base.Joiner;
+import com.google.common.base.Predicate;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import com.google.gson.JsonObject;
+import eu.dnetlib.pace.config.Type;
+
+/**
+ * The Class FieldListImpl.
+ */
+public class FieldListImpl extends AbstractField implements FieldList {
+
+ /** The backing list that the List operations of this class delegate to. */
+ private List<Field> fields;
+
+ /**
+ * Instantiates a new field list impl.
+ */
+ public FieldListImpl() {
+ fields = Lists.newArrayList();
+ }
+
+ /**
+ * Instantiates a new field list impl.
+ *
+ * @param name
+ * the name
+ */
+ public FieldListImpl(final String name, final Type type) {
+ super(type, name);
+ fields = Lists.newArrayList();
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#add(java.lang.Object)
+ */
+ @Override
+ public boolean add(final Field f) {
+ return fields.add(f);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#add(int, java.lang.Object)
+ */
+ @Override
+ public void add(final int i, final Field f) {
+ fields.add(i, f);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#addAll(java.util.Collection)
+ */
+ @Override
+ public boolean addAll(final Collection<? extends Field> f) {
+ return fields.addAll(f);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#addAll(int, java.util.Collection)
+ */
+ @Override
+ public boolean addAll(final int i, final Collection<? extends Field> f) {
+ return fields.addAll(i, f);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#clear()
+ */
+ @Override
+ public void clear() {
+ fields.clear();
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#contains(java.lang.Object)
+ */
+ @Override
+ public boolean contains(final Object o) {
+ return fields.contains(o);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#containsAll(java.util.Collection)
+ */
+ @Override
+ public boolean containsAll(final Collection<?> f) {
+ return fields.containsAll(f);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#get(int)
+ */
+ @Override
+ public Field get(final int i) {
+ return fields.get(i);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#indexOf(java.lang.Object)
+ */
+ @Override
+ public int indexOf(final Object o) {
+ return fields.indexOf(o);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.Field#isEmpty()
+ */
+ @Override
+ public boolean isEmpty() {
+ return Iterables.all(fields, new Predicate<Field>() {
+ // Iterables.all is vacuously true for an empty backing list, so a list without elements reports empty.
+ @Override
+ public boolean apply(final Field f) {
+ return f.isEmpty();
+ }
+ });
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Iterable#iterator()
+ */
+ @Override
+ public Iterator<Field> iterator() {
+ return fields.iterator();
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#lastIndexOf(java.lang.Object)
+ */
+ @Override
+ public int lastIndexOf(final Object o) {
+ return fields.lastIndexOf(o);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#listIterator()
+ */
+ @Override
+ public ListIterator<Field> listIterator() {
+ return fields.listIterator();
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#listIterator(int)
+ */
+ @Override
+ public ListIterator<Field> listIterator(final int i) {
+ return fields.listIterator(i);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#remove(java.lang.Object)
+ */
+ @Override
+ public boolean remove(final Object o) {
+ return fields.remove(o);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#remove(int)
+ */
+ @Override
+ public Field remove(final int i) {
+ return fields.remove(i);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#removeAll(java.util.Collection)
+ */
+ @Override
+ public boolean removeAll(final Collection<?> f) {
+ return fields.removeAll(f);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#retainAll(java.util.Collection)
+ */
+ @Override
+ public boolean retainAll(final Collection<?> f) {
+ return fields.retainAll(f);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#set(int, java.lang.Object)
+ */
+ @Override
+ public Field set(final int i, final Field f) {
+ return fields.set(i, f);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#size()
+ */
+ @Override
+ public int size() {
+ return fields.size();
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#subList(int, int)
+ */
+ @Override
+ public List<Field> subList(final int from, final int to) {
+ return fields.subList(from, to);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#toArray()
+ */
+ @Override
+ public Object[] toArray() {
+ return fields.toArray();
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.util.List#toArray(java.lang.Object[])
+ */
+ @Override
+ public <T> T[] toArray(final T[] t) {
+ return fields.toArray(t);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.Field#stringValue()
+ */
+ @Override
+ public String stringValue() {
+ switch (getType()) {
+
+ case List:
+ case Int:
+ case String:
+ return Joiner.on(" ").join(stringList());
+ case JSON:
+ final String json = new Gson().toJson(stringList());
+ return json;
+ default:
+ throw new IllegalArgumentException("Unknown type: " + getType().toString());
+ }
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.FieldList#stringList()
+ */
+ @Override
+ public List<String> stringList() {
+ return Lists.newArrayList(Iterables.transform(fields, getValuesTransformer()));
+ }
+
+ private Function<Field, String> getValuesTransformer() {
+ return new Function<Field, String>() {
+ // Maps each contained field to its stringValue().
+ @Override
+ public String apply(final Field f) {
+ return f.stringValue();
+ }
+ };
+ }
+
+ @Override
+ public String toString() {
+ return stringList().toString();
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java
new file mode 100644
index 0000000..8613327
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java
@@ -0,0 +1,23 @@
+package eu.dnetlib.pace.model;
+
+/**
+ * The Interface FieldValue.
+ */
+public interface FieldValue extends Field {
+
+ /**
+ * Gets the value.
+ *
+ * @return the value
+ */
+ public Object getValue();
+
+ /**
+ * Sets the value.
+ *
+ * @param value
+ * the new value
+ */
+ public void setValue(final Object value);
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java
new file mode 100644
index 0000000..ea31ec3
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java
@@ -0,0 +1,126 @@
+package eu.dnetlib.pace.model;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Iterator;
+import java.util.List;
+
+import eu.dnetlib.pace.config.Type;
+import org.apache.commons.collections.iterators.SingletonIterator;
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * The Class FieldValueImpl.
+ */
+public class FieldValueImpl extends AbstractField implements FieldValue {
+
+ /** The value. */
+ private Object value = null;
+
+ /**
+ * Instantiates a new field value impl.
+ */
+ public FieldValueImpl() {}
+
+ /**
+ * Instantiates a new field value impl.
+ *
+ * @param type
+ * the type
+ * @param name
+ * the name
+ * @param value
+ * the value
+ */
+ public FieldValueImpl(final Type type, final String name, final Object value) {
+ super(type, name);
+ this.value = value;
+ }
+
+ /*
+ * Type-dependent emptiness test: String/JSON check the text, List checks the list and its first element,
+ * URL is empty when blank or not parseable by java.net.URL; every other type reports empty.
+ * @see eu.dnetlib.pace.model.Field#isEmpty()
+ */
+ @Override
+ public boolean isEmpty() {
+ if (value == null) return false;
+ // NOTE(review): a null value is reported as NOT empty and Int (default branch) as always empty -- confirm both are intended.
+ switch (type) {
+ case String:
+ case JSON:
+ return value.toString().isEmpty();
+ case List:
+ List<?> list = (List<?>) value;
+ return list.isEmpty() || ((FieldValueImpl) list.get(0)).isEmpty();
+ case URL:
+ String str = value.toString();
+ return StringUtils.isBlank(str) || !isValidURL(str);
+ default:
+ return true;
+ }
+ }
+
+ private boolean isValidURL(final String s) {
+ try {
+ new URL(s);
+ return true;
+ } catch (MalformedURLException e) {
+ return false;
+ }
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.FieldValue#getValue()
+ */
+ @Override
+ public Object getValue() {
+ return value;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.FieldValue#setValue(java.lang.Object)
+ */
+ @Override
+ public void setValue(final Object value) {
+ this.value = value;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.Field#stringValue()
+ */
+ @Override
+ // @SuppressWarnings("unchecked")
+ public String stringValue() {
+ return String.valueOf(getValue());
+ // switch (getType()) {
+ //
+ // case Int:
+ // return String.valueOf(getValue());
+ // case List:
+ // return Joiner.on(" ").join((List) getValue());
+ // case String:
+ // return (String) getValue();
+ // default:
+ // throw new IllegalArgumentException("Unknown type: " + getType().toString());
+ // }
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Iterable#iterator()
+ */
+ @Override
+ @SuppressWarnings("unchecked")
+ public Iterator<Field> iterator() {
+ return new SingletonIterator(this);
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java
new file mode 100644
index 0000000..74935de
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java
@@ -0,0 +1,146 @@
+package eu.dnetlib.pace.model;
+
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+/**
+ * The Class MapDocument.
+ */
+public class MapDocument implements Document {
+
+ /** The identifier. */
+ private String identifier;
+
+ /** The field map: field name to the field stored under that name. */
+ private Map<String, Field> fieldMap;
+
+ /**
+ * Instantiates a new map document.
+ */
+ public MapDocument() {
+ identifier = null;
+ fieldMap = Maps.newHashMap();
+ }
+
+ /**
+ * Instantiates a new map document.
+ *
+ * @param identifier
+ * the identifier
+ * @param fieldMap
+ * the field map
+ */
+ public MapDocument(final String identifier, final Map<String, Field> fieldMap) {
+ this.setIdentifier(identifier);
+ this.fieldMap = fieldMap;
+ }
+
+ /**
+ * Instantiates a new map document.
+ *
+ * @param identifier
+ * the identifier
+ * @param data
+ * the data
+ */
+ public MapDocument(final String identifier, final byte[] data) {
+ final MapDocument doc = MapDocumentSerializer.decode(data);
+ // NOTE(review): the identifier argument is never read; the id is taken from the decoded document -- confirm this is intended.
+ this.fieldMap = doc.fieldMap;
+ this.identifier = doc.identifier;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.document.Document#fields()
+ */
+ @Override
+ public Iterable<Field> fields() {
+ return Lists.newArrayList(Iterables.concat(fieldMap.values()));
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.document.Document#values(java.lang.String)
+ */
+ @Override
+ public Field values(final String name) {
+ return fieldMap.get(name);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.document.Document#fieldNames()
+ */
+ @Override
+ public Set<String> fieldNames() {
+ return fieldMap.keySet();
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see java.lang.Object#toString()
+ */
+ @Override
+ public String toString() {
+ return MapDocumentSerializer.toString(this);
+ // return String.format("Document(%s)", fieldMap.toString());
+ }
+
+ /**
+ * To byte array.
+ *
+ * @return the byte[]
+ */
+ public byte[] toByteArray() {
+ return MapDocumentSerializer.toByteArray(this);
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see eu.dnetlib.pace.model.document.Document#getIdentifier()
+ */
+ @Override
+ public String getIdentifier() {
+ return identifier;
+ }
+
+ /**
+ * Sets the identifier.
+ *
+ * @param identifier
+ * the new identifier
+ */
+ public void setIdentifier(final String identifier) {
+ this.identifier = identifier;
+ }
+
+ /**
+ * Gets the field map.
+ *
+ * @return the field map
+ */
+ public Map<String, Field> getFieldMap() {
+ return fieldMap;
+ }
+
+ /**
+ * Sets the field map.
+ *
+ * @param fieldMap
+ * the field map
+ */
+ public void setFieldMap(final Map<String, Field> fieldMap) {
+ this.fieldMap = fieldMap;
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java
new file mode 100644
index 0000000..7217b2b
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java
@@ -0,0 +1,50 @@
+package eu.dnetlib.pace.model;
+
+import java.util.Comparator;
+
+import com.google.common.collect.Iterables;
+
+import eu.dnetlib.pace.clustering.NGramUtils;
+
+/**
+ * Orders documents by the cleaned-up string value of the first element of one named field.
+ */
+public class MapDocumentComparator implements Comparator<Document> {
+
+ /** The comparator field. */
+ private String comparatorField;
+ // Fallback handed to Iterables.getFirst when a field has no elements.
+ private final FieldList emptyField = new FieldListImpl();
+
+ /**
+ * Instantiates a new map document comparator.
+ *
+ * @param comparatorField
+ * the name of the field whose value drives the ordering
+ */
+ public MapDocumentComparator(final String comparatorField) {
+ this.comparatorField = comparatorField;
+ }
+
+ /*
+ * Returns 0 when either document lacks the field or holds an empty one; otherwise compares the
+ * NGramUtils.cleanupForOrdering form of the first values.
+ * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
+ */
+ @Override
+ public int compare(final Document d1, final Document d2) {
+ final Field f1 = d1.values(comparatorField);
+ final Field f2 = d2.values(comparatorField);
+ // A missing (null) field is treated like an empty one instead of raising a NullPointerException.
+ if ((f1 == null) || (f2 == null) || f1.isEmpty() || f2.isEmpty()) return 0;
+ final String o1 = Iterables.getFirst(f1, emptyField).stringValue();
+ final String o2 = Iterables.getFirst(f2, emptyField).stringValue();
+ if ((o1 == null) || (o2 == null)) return 0;
+
+ final String to1 = NGramUtils.cleanupForOrdering(o1);
+ final String to2 = NGramUtils.cleanupForOrdering(o2);
+
+ return to1.compareTo(to2);
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java
new file mode 100644
index 0000000..e5b3522
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java
@@ -0,0 +1,101 @@
+package eu.dnetlib.pace.model;
+
+import java.lang.reflect.Type;
+
+import com.google.gson.GsonBuilder;
+import com.google.gson.InstanceCreator;
+import com.google.gson.JsonDeserializationContext;
+import com.google.gson.JsonDeserializer;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonParseException;
+
+/**
+ * The Class MapDocumentSerializer. Converts MapDocument instances to and from JSON text and UTF-8 bytes using Gson.
+ */
+public class MapDocumentSerializer implements InstanceCreator<MapDocument> {
+
+ @Override
+ public MapDocument createInstance(final Type type) {
+ return new MapDocument();
+ }
+
+ /**
+ * Decodes a document from its JSON text; each field JSON object becomes a one-element field list.
+ *
+ * @param s
+ * the JSON String
+ * @return the map document
+ */
+ public static MapDocument decode(final String s) {
+ final GsonBuilder gson = new GsonBuilder();
+
+ gson.registerTypeAdapter(Field.class, new JsonDeserializer<Field>() {
+
+ @Override
+ public Field deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException {
+ final FieldListImpl fl = new FieldListImpl();
+ if (json.isJsonObject()) {
+
+ fl.add(handleJsonObject(json.getAsJsonObject()));
+
+ } else if (json.isJsonArray()) {
+ // Array elements that are not JSON objects are skipped.
+ for (final JsonElement e : json.getAsJsonArray()) {
+ if (e.isJsonObject()) {
+ fl.add(handleJsonObject(e.getAsJsonObject()));
+ }
+ }
+ }
+ return fl;
+ }
+ // Reads the "name", "type" and "value" members and wraps the resulting value in its own list.
+ private Field handleJsonObject(final JsonObject o) {
+ final FieldListImpl fl = new FieldListImpl();
+ final String name = o.get("name").getAsString();
+ final String type = o.get("type").getAsString();
+ final String value = o.get("value").getAsString();
+ fl.add(new FieldValueImpl(eu.dnetlib.pace.config.Type.valueOf(type), name, value));
+ return fl;
+ }
+ });
+
+ return gson.create().fromJson(s, MapDocument.class);
+ }
+
+ /**
+ * Decodes a document from the UTF-8 bytes of its JSON text (the counterpart of toByteArray).
+ *
+ * @param bytes
+ * the UTF-8 encoded JSON
+ * @return the map document
+ */
+ public static MapDocument decode(final byte[] bytes) {
+ return decode(new String(bytes, java.nio.charset.StandardCharsets.UTF_8));
+ }
+
+ /**
+ * Serializes the document as pretty-printed JSON.
+ *
+ * @param doc
+ * the doc
+ * @return the JSON string
+ */
+ public static String toString(final MapDocument doc) {
+ final GsonBuilder b = new GsonBuilder();
+ return b.setPrettyPrinting().create().toJson(doc);
+
+ }
+
+ /**
+ * Serializes the document as the UTF-8 bytes of its JSON text, independent of the platform default charset.
+ *
+ * @param doc
+ * the doc
+ * @return the UTF-8 encoded JSON
+ */
+ public static byte[] toByteArray(final MapDocument doc) {
+ return toString(doc).getBytes(java.nio.charset.StandardCharsets.UTF_8);
+ }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
new file mode 100644
index 0000000..ec33406
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
@@ -0,0 +1,210 @@
+package eu.dnetlib.pace.model;
+
+import java.nio.charset.Charset;
+import java.text.Normalizer;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.hash.Hashing;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.util.Capitalise;
+import eu.dnetlib.pace.util.DotAbbreviations;
+
+/**
+ * Parses a free-text person name into name, surname and fullname terms.
+ * <p>
+ * The input is decomposed (NFD), stripped of bracketed text, digits and punctuation other than commas and
+ * hyphens, and split on spaces; terms found in the name-particles resource are dropped. A comma separates the
+ * surname from the name; without a comma the split is guessed from single-letter initials or upper-case terms.
+ */
+public class Person {
+
+    private static final String UTF8 = "UTF-8";
+
+    // Clean-up expressions, compiled once rather than on every construction.
+    private static final Pattern ROUND_BRACKETS = Pattern.compile("\\(.+\\)");
+    private static final Pattern SQUARE_BRACKETS = Pattern.compile("\\[.+\\]");
+    private static final Pattern CURLY_BRACKETS = Pattern.compile("\\{.+\\}");
+    private static final Pattern SPACED_HYPHEN = Pattern.compile("\\s+-\\s+");
+    private static final Pattern PUNCTUATION = Pattern.compile("[\\p{Punct}&&[^,-]]");
+    private static final Pattern DIGIT = Pattern.compile("\\d");
+    private static final Pattern NEWLINE = Pattern.compile("\\n");
+    private static final Pattern DOT = Pattern.compile("\\.");
+    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
+    private static final Pattern DIACRITICS = Pattern.compile("[\\p{InCombiningDiacriticalMarks}&&[^,-]]");
+
+    private List<String> name = Lists.newArrayList();
+    private List<String> surname = Lists.newArrayList();
+    private List<String> fullname = Lists.newArrayList();
+    private final String original;
+
+    private static Set<String> particles = null;
+
+    /**
+     * Normalises and parses a name.
+     *
+     * @param s
+     *            the raw name; a null value fails with a NullPointerException
+     * @param aggressive
+     *            when true, combining diacritical marks are removed as well
+     */
+    public Person(String s, final boolean aggressive) {
+        original = s;
+        s = Normalizer.normalize(s, Normalizer.Form.NFD);
+        s = ROUND_BRACKETS.matcher(s).replaceAll("");
+        s = SQUARE_BRACKETS.matcher(s).replaceAll("");
+        s = CURLY_BRACKETS.matcher(s).replaceAll("");
+        s = SPACED_HYPHEN.matcher(s).replaceAll("-");
+        s = PUNCTUATION.matcher(s).replaceAll(" ");
+        s = DIGIT.matcher(s).replaceAll(" ");
+        s = NEWLINE.matcher(s).replaceAll(" ");
+        s = DOT.matcher(s).replaceAll(" ");
+        s = WHITESPACE.matcher(s).replaceAll(" ");
+
+        if (aggressive) {
+            s = DIACRITICS.matcher(s).replaceAll("");
+        }
+
+        if (s.contains(",")) {
+            parseSurnameCommaName(s);
+        } else {
+            parseUnstructured(s);
+        }
+    }
+
+    /** Handles "surname, name" input; parts after the second comma are ignored. */
+    private void parseSurnameCommaName(final String s) {
+        final String[] arr = s.split(",");
+        if (arr.length == 1) {
+            fullname = splitTerms(arr[0]);
+        } else if (arr.length > 1) {
+            surname = splitTerms(arr[0]);
+            name = splitTerms(arr[1]);
+            fullname.addAll(surname);
+            fullname.addAll(name);
+        }
+    }
+
+    /** Handles input without a comma, guessing the name/surname split from initials or upper-case terms. */
+    private void parseUnstructured(final String s) {
+        fullname = splitTerms(s);
+
+        int lastInitialPosition = fullname.size();
+        boolean hasSurnameInUpperCase = false;
+
+        for (int i = 0; i < fullname.size(); i++) {
+            final String term = fullname.get(i);
+            if (term.length() == 1) {
+                lastInitialPosition = i;
+            } else if (term.equals(term.toUpperCase())) {
+                hasSurnameInUpperCase = true;
+            }
+        }
+
+        if (lastInitialPosition < (fullname.size() - 1)) { // Case: Michele G. Artini
+            // Copy the ranges: subList() only returns views backed by fullname, so a later change to fullname
+            // (e.g. sorting it) would corrupt or invalidate name and surname.
+            name = Lists.newArrayList(fullname.subList(0, lastInitialPosition + 1));
+            surname = Lists.newArrayList(fullname.subList(lastInitialPosition + 1, fullname.size()));
+        } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
+            for (final String term : fullname) {
+                if ((term.length() > 1) && term.equals(term.toUpperCase())) {
+                    surname.add(term);
+                } else {
+                    name.add(term);
+                }
+            }
+        }
+    }
+
+    /** Splits on single spaces, dropping empty tokens and (case-insensitively) the configured name particles. */
+    private List<String> splitTerms(final String s) {
+        if (particles == null) {
+            particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
+        }
+
+        final List<String> list = Lists.newArrayList();
+        for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
+            if (!particles.contains(part.toLowerCase())) {
+                list.add(part);
+            }
+        }
+        return list;
+    }
+
+    /** @return the first-name terms; empty when the name/surname split could not be determined */
+    public List<String> getName() {
+        return name;
+    }
+
+    /** @return the first-name terms joined by single spaces */
+    public String getNameString() {
+        return Joiner.on(" ").join(getName());
+    }
+
+    /** @return the surname terms; empty when the name/surname split could not be determined */
+    public List<String> getSurname() {
+        return surname;
+    }
+
+    /** @return every retained term of the name */
+    public List<String> getFullname() {
+        return fullname;
+    }
+
+    /** @return the string passed to the constructor, unmodified */
+    public String getOriginal() {
+        return original;
+    }
+
+    /** @return the murmur3-128 hash of {@link #getNormalisedFullname()} encoded as UTF-8 */
+    public String hash() {
+        return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString();
+    }
+
+    /** @return the terms of {@link #getCapitalFirstnames()} joined by single spaces */
+    public String getNormalisedFirstName() {
+        return Joiner.on(" ").join(getCapitalFirstnames());
+    }
+
+    /** @return the terms of {@link #getCapitalSurname()} joined by single spaces */
+    public String getNormalisedSurname() {
+        return Joiner.on(" ").join(getCapitalSurname());
+    }
+
+    /** @return the surname terms joined by single spaces */
+    public String getSurnameString() {
+        return Joiner.on(" ").join(getSurname());
+    }
+
+    /** @return normalised "surname, first name" when {@link #isAccurate()}, otherwise the fullname terms joined by spaces */
+    public String getNormalisedFullname() {
+        return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname);
+    }
+
+    /** @return the first-name terms passed through {@link DotAbbreviations} and then {@link Capitalise} */
+    public List<String> getCapitalFirstnames() {
+        return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), new Capitalise()));
+    }
+
+    /** @return the surname terms passed through {@link Capitalise} */
+    public List<String> getCapitalSurname() {
+        return Lists.newArrayList(Iterables.transform(surname, new Capitalise()));
+    }
+
+    /** @return the first-name terms passed through {@link DotAbbreviations} */
+    public List<String> getNameWithAbbreviations() {
+        return Lists.newArrayList(Iterables.transform(name, new DotAbbreviations()));
+    }
+
+    /** @return true when both a name and a surname were identified */
+    public boolean isAccurate() {
+        return ((name != null) && (surname != null) && !name.isEmpty() && !surname.isEmpty());
+    }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java
new file mode 100644
index 0000000..a900a60
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java
@@ -0,0 +1,155 @@
+package eu.dnetlib.pace.model;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+/**
+ * Static helpers that compare person names parsed with {@link Person}.
+ */
+public class PersonComparatorUtils {
+
+    private static final int MAX_FULLNAME_LENGTH = 50;
+
+    /**
+     * Builds lower-cased keys of the form {@code initial_term} for a person name.
+     * <p>
+     * When the name splits into name and surname, every name initial is paired with every surname; otherwise every
+     * term longer than one character is paired with the initial of each other term.
+     *
+     * @param fullname
+     *            the raw name
+     * @return the keys; empty when the name is longer than {@value #MAX_FULLNAME_LENGTH} characters
+     */
+    public static Set<String> getNgramsForPerson(String fullname) {
+
+        Set<String> set = Sets.newHashSet();
+
+        if (fullname.length() > MAX_FULLNAME_LENGTH) {
+            return set;
+        }
+
+        Person p = new Person(fullname, true);
+
+        if (p.isAccurate()) {
+            for (String name : p.getName()) {
+                for (String surname : p.getSurname()) {
+                    set.add((name.charAt(0) + "_" + surname).toLowerCase());
+                }
+            }
+        } else {
+            List<String> list = p.getFullname();
+            for (int i = 0; i < list.size(); i++) {
+                if (list.get(i).length() > 1) {
+                    for (int j = 0; j < list.size(); j++) {
+                        if (i != j) {
+                            set.add((list.get(j).charAt(0) + "_" + list.get(i)).toLowerCase());
+                        }
+                    }
+                }
+            }
+        }
+
+        return set;
+    }
+
+    /**
+     * Tells whether two raw names may denote the same person.
+     * <p>
+     * Names and surnames are compared separately when both inputs split cleanly; otherwise all terms are compared
+     * regardless of their order.
+     *
+     * @param s1
+     *            the first raw name
+     * @param s2
+     *            the second raw name
+     * @return true when the names are compatible
+     */
+    public static boolean areSimilar(String s1, String s2) {
+        Person p1 = new Person(s1, true);
+        Person p2 = new Person(s2, true);
+
+        if (p1.isAccurate() && p2.isAccurate()) {
+            return verifyNames(p1.getName(), p2.getName()) && verifySurnames(p1.getSurname(), p2.getSurname());
+        } else {
+            return verifyFullnames(p1.getFullname(), p2.getFullname());
+        }
+    }
+
+    /** Compares first names: the extended terms must be compatible and so must the initials. */
+    private static boolean verifyNames(List<String> list1, List<String> list2) {
+        return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2))
+                && verifySimilarity(extractInitials(list1), extractInitials(list2));
+    }
+
+    /** Compares surnames: same number of terms, pairwise equal ignoring case. */
+    private static boolean verifySurnames(List<String> list1, List<String> list2) {
+        if (list1.size() != list2.size()) {
+            return false;
+        }
+        for (int i = 0; i < list1.size(); i++) {
+            if (!list1.get(i).equalsIgnoreCase(list2.get(i))) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /** Compares all terms irrespective of their original order. */
+    private static boolean verifyFullnames(List<String> list1, List<String> list2) {
+        // Sort copies: the arguments are the lists owned by the Person instances and must not be reordered.
+        List<String> sorted1 = Lists.newArrayList(list1);
+        List<String> sorted2 = Lists.newArrayList(list2);
+        Collections.sort(sorted1);
+        Collections.sort(sorted2);
+        return verifySimilarity(extractExtendedNames(sorted1), extractExtendedNames(sorted2))
+                && verifySimilarity(extractInitials(sorted1), extractInitials(sorted2));
+    }
+
+    /** Returns the lower-cased terms that are longer than one character. */
+    private static List<String> extractExtendedNames(List<String> list) {
+        ArrayList<String> res = Lists.newArrayList();
+        for (String s : list) {
+            if (s.length() > 1) {
+                res.add(s.toLowerCase());
+            }
+        }
+        return res;
+    }
+
+    /** Returns the lower-cased first character of every term. */
+    private static List<String> extractInitials(List<String> list) {
+        ArrayList<String> res = Lists.newArrayList();
+        for (String s : list) {
+            res.add(s.substring(0, 1).toLowerCase());
+        }
+        return res;
+    }
+
+    /**
+     * Checks that every element of the shorter list occurs in the longer one at increasing positions.
+     * The longer list is modified: matched entries are overwritten with "*".
+     */
+    private static boolean verifySimilarity(List<String> list1, List<String> list2) {
+        if (list1.size() > list2.size()) {
+            return verifySimilarity(list2, list1);
+        }
+
+        // NB: List2 is greater than list1 (or equal)
+        int pos = -1;
+        for (String s : list1) {
+            int curr = list2.indexOf(s);
+            if (curr > pos) {
+                list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm"
+                pos = curr;
+            } else {
+                return false;
+            }
+        }
+        return true;
+    }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/Pid.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/Pid.java
new file mode 100644
index 0000000..3dd70f7
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/Pid.java
@@ -0,0 +1,68 @@
+package eu.dnetlib.pace.model.adaptor;
+
+import java.util.List;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.reflect.TypeToken;
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * A persistent identifier: a value plus the type read from the OAF qualifier class id.
+ *
+ * Created by claudio on 01/03/16.
+ */
+public class Pid {
+
+    private static final Log log = LogFactory.getLog(Pid.class);
+
+    /** Shared parser (Gson instances are thread-safe); built once instead of on every call. */
+    private static final Gson GSON = new GsonBuilder().registerTypeAdapter(Pid.class, new PidOafSerialiser()).create();
+
+    private String value;
+
+    private String type;
+
+    /**
+     * Parses a list of OAF JSON strings into pids using {@link PidOafSerialiser}.
+     *
+     * @param json
+     *            the JSON documents, one pid each
+     * @return a new list holding one pid per input string, in input order
+     */
+    public static List<Pid> fromOafJson(final List<String> json) {
+
+        // Formatting the whole list is wasted work unless debug logging is on.
+        if (log.isDebugEnabled()) {
+            log.debug(String.format("\nPid: %s", json));
+        }
+
+        return Lists.newArrayList(Iterables.transform(json, new Function<String, Pid>() {
+            @Override
+            public Pid apply(final String s) {
+                return GSON.fromJson(s, Pid.class);
+            }
+        }));
+    }
+
+    public String getType() {
+        return type;
+    }
+
+    public void setType(final String type) {
+        this.type = type;
+    }
+
+    public String getValue() {
+        return value;
+    }
+
+    public void setValue(final String value) {
+        this.value = value;
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/PidOafSerialiser.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/PidOafSerialiser.java
new file mode 100644
index 0000000..8acaee6
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/PidOafSerialiser.java
@@ -0,0 +1,66 @@
+package eu.dnetlib.pace.model.adaptor;
+
+import java.lang.reflect.Type;
+import java.util.List;
+
+import com.google.common.collect.Lists;
+import com.google.gson.*;
+import eu.dnetlib.pace.model.gt.GTAuthor;
+
+/**
+ * Gson deserialiser building a {@link Pid} from JSON of the form
+ * {@code {"value": ..., "qualifier": {"classid": ...}}}.
+ *
+ * Created by claudio on 01/03/16.
+ */
+public class PidOafSerialiser implements JsonDeserializer<Pid> {
+
+    private static final String VALUE = "value";
+
+    private static final String QUALIFIER = "qualifier";
+    private static final String CLASSID = "classid";
+
+    /**
+     * Builds a pid whose type is the qualifier class id and whose value is the "value" member.
+     *
+     * @throws IllegalArgumentException
+     *             when the value, the qualifier or its class id is missing
+     */
+    @Override
+    public Pid deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException {
+
+        final Pid pid = new Pid();
+
+        pid.setType(getType(json));
+        pid.setValue(getValue(json));
+
+        return pid;
+    }
+
+    /** Reads the mandatory "value" member, failing explicitly rather than with a NullPointerException. */
+    private String getValue(final JsonElement json) {
+        final JsonElement value = json.getAsJsonObject().get(VALUE);
+        if ((value == null) || value.isJsonNull()) {
+            throw new IllegalArgumentException("pid does not contain any value: " + json.toString());
+        }
+        return value.getAsString();
+    }
+
+    /** Reads the mandatory "qualifier.classid" member. */
+    private String getType(final JsonElement json) {
+
+        final JsonElement qualifier = json.getAsJsonObject().get(QUALIFIER);
+
+        // A null or non-object qualifier is as unusable as a missing one.
+        if ((qualifier == null) || !qualifier.isJsonObject()) {
+            throw new IllegalArgumentException("pid does not contain any type: " + json.toString());
+        }
+
+        final JsonElement classid = qualifier.getAsJsonObject().get(CLASSID);
+        if ((classid == null) || classid.isJsonNull()) {
+            throw new IllegalArgumentException("pid does not contain any type: " + json.toString());
+        }
+
+        return classid.getAsString();
+    }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Author.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Author.java
new file mode 100644
index 0000000..17bd49d
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Author.java
@@ -0,0 +1,148 @@
+package eu.dnetlib.pace.model.gt;
+
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+
+import com.google.common.collect.ComparisonChain;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Ordering;
+import com.google.common.collect.Sets;
+import com.google.gson.Gson;
+
+/**
+ * An author identified by an id, with its name parts, matches, co-authors and subjects map.
+ * Equality, hashing and ordering are all based on the id only.
+ */
+public class Author implements Comparable<Author> {
+
+    private String id;
+    private String fullname;
+    private String firstname;
+    private String secondnames;
+
+    private List matches = Lists.newArrayList();
+    private Set coauthors = Sets.newHashSet();
+    private SubjectsMap subjectsMap = new SubjectsMap();
+
+    public Author() {
+        super();
+    }
+
+    /**
+     * Copies the fields of another author. The matches, co-authors and subjects map are shared with the source,
+     * not duplicated.
+     *
+     * @param a
+     *            the author to copy from
+     */
+    public Author(final Author a) {
+        this.id = a.getId();
+        this.fullname = a.getFullname();
+        this.firstname = a.getFirstname();
+        this.secondnames = a.getSecondnames();
+
+        this.matches = a.getMatches();
+        this.coauthors = a.getCoauthors();
+        this.subjectsMap = a.getSubjectsMap();
+    }
+
+    /** @return true when the matches collection is non-null and non-empty */
+    public boolean hasMatches() {
+        return (getMatches() != null) && !getMatches().isEmpty();
+    }
+
+    /** @return true when the co-authors collection is non-null and non-empty */
+    public boolean hasCoauthors() {
+        return (getCoauthors() != null) && !getCoauthors().isEmpty();
+    }
+
+    /** @return true when both the first name and the second names are non-blank */
+    public boolean isWellFormed() {
+        return StringUtils.isNotBlank(getSecondnames()) && StringUtils.isNotBlank(getFirstname());
+    }
+
+    public String getId() {
+        return id;
+    }
+
+    public void setId(final String id) {
+        this.id = id;
+    }
+
+    public String getFullname() {
+        return fullname;
+    }
+
+    public void setFullname(final String fullname) {
+        this.fullname = fullname;
+    }
+
+    public String getFirstname() {
+        return firstname;
+    }
+
+    public void setFirstname(final String firstname) {
+        this.firstname = firstname;
+    }
+
+    public String getSecondnames() {
+        return secondnames;
+    }
+
+    public void setSecondnames(final String secondnames) {
+        this.secondnames = secondnames;
+    }
+
+    public List getMatches() {
+        return matches;
+    }
+
+    public void setMatches(final List matches) {
+        this.matches = matches;
+    }
+
+    public Set getCoauthors() {
+        return coauthors;
+    }
+
+    public void setCoauthors(final Set coauthors) {
+        this.coauthors = coauthors;
+    }
+
+    /** @return this author serialised as JSON */
+    @Override
+    public String toString() {
+        return new Gson().toJson(this);
+    }
+
+    /** Null-safe: an author without id hashes to 0, consistently with {@link #equals(Object)}. */
+    @Override
+    public int hashCode() {
+        return Objects.hashCode(getId());
+    }
+
+    /** Orders authors by id, placing null ids last. */
+    @Override
+    public int compareTo(final Author o) {
+        return ComparisonChain.start()
+                .compare(this.getId(), o.getId(), Ordering.natural().nullsLast())
+                .result();
+    }
+
+    /** Null-safe: two authors are equal when their ids are equal, including the case where both ids are null. */
+    @Override
+    public boolean equals(final Object o) {
+        return (o instanceof Author) && Objects.equals(getId(), ((Author) o).getId());
+    }
+
+    public SubjectsMap getSubjectsMap() {
+        return subjectsMap;
+    }
+
+    public void setSubjectsMap(final SubjectsMap subjectsMap) {
+        this.subjectsMap = subjectsMap;
+    }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/AuthorSet.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/AuthorSet.java
new file mode 100644
index 0000000..c3f2576
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/AuthorSet.java
@@ -0,0 +1,51 @@
+package eu.dnetlib.pace.model.gt;
+
+import com.google.gson.Gson;
+
+/**
+ * Pairs an identifier with a set of authors.
+ */
+public class AuthorSet {
+
+    private String id;
+    private Authors authors;
+
+    /**
+     * @param id
+     *            the identifier of the set
+     * @param authors
+     *            the authors in the set
+     */
+    public AuthorSet(final String id, final Authors authors) {
+        this.id = id;
+        this.authors = authors;
+    }
+
+    /** @return the identifier */
+    public String getId() {
+        return this.id;
+    }
+
+    /** @param id the new identifier */
+    public void setId(final String id) {
+        this.id = id;
+    }
+
+    /** @return the authors */
+    public Authors getAuthors() {
+        return this.authors;
+    }
+
+    /** @param authors the new authors */
+    public void setAuthors(final Authors authors) {
+        this.authors = authors;
+    }
+
+    /** @return this object serialised as JSON */
+    @Override
+    public String toString() {
+        final Gson gson = new Gson();
+        return gson.toJson(this);
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Authors.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Authors.java
new file mode 100644
index 0000000..e74c438
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Authors.java
@@ -0,0 +1,65 @@
+package eu.dnetlib.pace.model.gt;
+
+import java.util.Collection;
+import java.util.HashSet;
+
+import com.google.common.collect.ComparisonChain;
+import com.google.common.collect.Ordering;
+import com.google.common.collect.Sets;
+import com.google.gson.Gson;
+
+/**
+ * A set of authors, ordered by size when compared with another set.
+ */
+public class Authors extends HashSet<Author> implements Comparable<Authors> {
+
+    private static final long serialVersionUID = -6878376220805286142L;
+
+    public Authors() {
+        super();
+    }
+
+    public Authors(final Collection<Author> authors) {
+        super(authors);
+    }
+
+    public Authors(final Author author) {
+        super(Sets.newHashSet(author));
+    }
+
+    /** Compares the two sets by number of elements only. */
+    @Override
+    public int compareTo(final Authors a) {
+        return ComparisonChain.start()
+                .compare(this.size(), a.size(), Ordering.natural().nullsLast())
+                .result();
+    }
+
+    /** @return this set serialised as JSON */
+    @Override
+    public String toString() {
+        return new Gson().toJson(this);
+    }
+
+    /**
+     * Two sets are equal when they hold exactly the same authors.
+     * <p>
+     * A plain subset test (intersection size equal to this set's size) would make a subset "equal" to its superset
+     * but not vice versa, breaking the symmetry required of equals and disagreeing with {@link #hashCode()}.
+     */
+    @Override
+    public boolean equals(final Object o) {
+        return (o instanceof Authors) && super.equals(o);
+    }
+
+    /** @return the sum of the hash codes of the contained authors */
+    @Override
+    public int hashCode() {
+        int res = 0;
+        for (final Author a : this) {
+            res += a.hashCode();
+        }
+        return res;
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthor.java
new file mode 100644
index 0000000..d4ce32d
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthor.java
@@ -0,0 +1,77 @@
+package eu.dnetlib.pace.model.gt;
+
+import com.google.gson.Gson;
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * An author seen as co-author of another one, optionally carrying an anchor id.
+ * Identity is the id when it is non-blank, otherwise the fullname.
+ */
+public class CoAuthor extends Author {
+
+    private static final Log log = LogFactory.getLog(CoAuthor.class);
+    private String anchorId = null;
+
+    public CoAuthor() {
+        super();
+    }
+
+    public CoAuthor(final Author author) {
+        super(author);
+    }
+
+    /** @return true when the anchor id is non-blank */
+    public boolean hasAnchorId() {
+        return StringUtils.isNotBlank(getAnchorId());
+    }
+
+    public String getAnchorId() {
+        return anchorId;
+    }
+
+    public void setAnchorId(final String anchorId) {
+        this.anchorId = anchorId;
+    }
+
+    /** @return this co-author serialised as JSON */
+    @Override
+    public String toString() {
+        return new Gson().toJson(this);
+    }
+
+    /**
+     * Hashes the same key that {@link #equals(Object)} compares: the id when non-blank, otherwise the fullname
+     * (0 when that is null too).
+     */
+    @Override
+    public int hashCode() {
+        if (StringUtils.isNotBlank(getId())) {
+            return getId().hashCode();
+        }
+        return getFullname() != null ? getFullname().hashCode() : 0;
+    }
+
+    /**
+     * Compares by id when this co-author has a non-blank id, otherwise by fullname.
+     * <p>
+     * The type check comes first: written as one expression, {@code a && b ? x : y} parses as {@code (a && b) ? x : y},
+     * so a null or non-CoAuthor argument would reach the cast in the else branch and throw.
+     */
+    @Override
+    public boolean equals(final Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (!(o instanceof CoAuthor)) {
+            return false;
+        }
+        final CoAuthor other = (CoAuthor) o;
+        // NOTE(review): still asymmetric when only one side has an id but the fullnames match - confirm intent.
+        return StringUtils.isNotBlank(getId())
+                ? getId().equals(other.getId())
+                : StringUtils.equals(getFullname(), other.getFullname());
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthorSet.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthorSet.java
new file mode 100644
index 0000000..90898f6
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthorSet.java
@@ -0,0 +1,50 @@
+package eu.dnetlib.pace.model.gt;
+
+import com.google.gson.Gson;
+
+/**
+ * Holds an author together with its co-authors.
+ */
+public class CoAuthorSet {
+
+    private Author author;
+    private Authors coAuthors;
+
+    /**
+     * @param author
+     *            the author
+     * @param coAuthors
+     *            the co-authors of the author
+     */
+    public CoAuthorSet(final Author author, final Authors coAuthors) {
+        this.author = author;
+        this.coAuthors = coAuthors;
+    }
+
+    /** @return the author */
+    public Author getAuthor() {
+        return this.author;
+    }
+
+    /** @param author the new author */
+    public void setAuthor(final Author author) {
+        this.author = author;
+    }
+
+    /** @return the co-authors */
+    public Authors getCoAuthors() {
+        return this.coAuthors;
+    }
+
+    /** @param coAuthors the new co-authors */
+    public void setCoAuthors(final Authors coAuthors) {
+        this.coAuthors = coAuthors;
+    }
+
+    /** @return this object serialised as JSON */
+    @Override
+    public String toString() {
+        final Gson gson = new Gson();
+        return gson.toJson(this);
+    }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthorSetLite.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthorSetLite.java
new file mode 100644
index 0000000..a48e2d8
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthorSetLite.java
@@ -0,0 +1,40 @@
+package eu.dnetlib.pace.model.gt;
+
+import java.util.Set;
+
+import com.google.gson.Gson;
+
+public class CoAuthorSetLite {
+
+ private String id;
+
+ private Set coAuthors;
+
+ public CoAuthorSetLite(final String id, final Set coAuthors) {
+ super();
+ this.id = id;
+ this.coAuthors = coAuthors;
+ }
+
+ public Set getCoAuthors() {
+ return coAuthors;
+ }
+
+ public void setCoAuthors(final Set