diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml
index a7a83821e..6198bd81e 100644
--- a/dhp-common/pom.xml
+++ b/dhp-common/pom.xml
@@ -52,6 +52,8 @@
+ true
+ ${scala.binary.version}
${scala.version}
@@ -81,11 +83,11 @@
org.apache.spark
- spark-core_2.11
+ spark-core_${scala.binary.version}
org.apache.spark
- spark-sql_2.11
+ spark-sql_${scala.binary.version}
@@ -159,7 +161,7 @@
eu.dnetlib.dhp
- dhp-schemas
+ ${dhp-schemas.artifact}
diff --git a/dhp-pace-core/pom.xml b/dhp-pace-core/pom.xml
index 12174a5c5..fd7f44fc9 100644
--- a/dhp-pace-core/pom.xml
+++ b/dhp-pace-core/pom.xml
@@ -20,7 +20,7 @@
net.alchim31.maven
scala-maven-plugin
- 4.0.1
+ ${net.alchim31.maven.version}
scala-compile-first
@@ -39,8 +39,9 @@
+ true
+ ${scala.binary.version}
${scala.version}
- -target:jvm-1.8
@@ -68,7 +69,6 @@
commons-io
commons-io
-
org.antlr
stringtemplate
@@ -89,17 +89,22 @@
org.apache.commons
commons-math3
-
com.jayway.jsonpath
json-path
-
com.ibm.icu
icu4j
-
+
+ org.apache.spark
+ spark-core_${scala.binary.version}
+
+
+ org.apache.spark
+ spark-sql_${scala.binary.version}
+
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
index e984f5d18..3da8eb490 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
@@ -11,7 +11,6 @@ import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
@@ -24,11 +23,10 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
protected abstract Collection doApply(Config conf, String s);
@Override
- public Collection apply(Config conf, List fields) {
+ public Collection apply(Config conf, List fields) {
return fields
.stream()
.filter(f -> !f.isEmpty())
- .map(Field::stringValue)
.map(this::normalize)
.map(s -> filterAllStopWords(s))
.map(s -> doApply(conf, s))
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java
deleted file mode 100644
index f0e93b8ba..000000000
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java
+++ /dev/null
@@ -1,60 +0,0 @@
-
-package eu.dnetlib.pace.clustering;
-
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.regex.Pattern;
-
-import com.google.common.collect.Maps;
-
-import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Document;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldListImpl;
-import eu.dnetlib.pace.model.MapDocument;
-
-public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
-
- public static Collection filterAndCombine(final MapDocument a, final Config conf) {
- Document filtered = filter(a, conf.blacklists());
- return combine(filtered, conf);
- }
-
- private static MapDocument filter(final MapDocument a, final Map> blacklists) {
- if (blacklists == null || blacklists.isEmpty()) {
- return a;
- }
-
- final Map filtered = Maps.newHashMap(a.getFieldMap());
-
- for (final Entry> e : blacklists.entrySet()) {
- Field fields = a.getFieldMap().get(e.getKey());
- if (fields != null) {
- final FieldListImpl fl = new FieldListImpl();
-
- for (Field f : fields) {
- if (!isBlackListed(f.stringValue(), e.getValue())) {
- fl.add(f);
- }
- }
-
- filtered.put(e.getKey(), fl);
- }
- }
-
- return new MapDocument(a.getIdentifier(), filtered);
- }
-
- private static boolean isBlackListed(String value, List blacklist) {
- for (Pattern pattern : blacklist) {
- if (pattern.matcher(value).matches()) {
- return true;
- }
- }
-
- return false;
- }
-
-}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java
deleted file mode 100644
index 3a6f17e20..000000000
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java
+++ /dev/null
@@ -1,64 +0,0 @@
-
-package eu.dnetlib.pace.clustering;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.stream.Collectors;
-
-import org.apache.commons.lang3.StringUtils;
-
-import com.google.common.collect.Sets;
-
-import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.ClusteringDef;
-import eu.dnetlib.pace.model.Document;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldValueImpl;
-
-public class ClusteringCombiner {
-
- private static String SEPARATOR = ":";
- private static String COLLAPSE_ON = "collapseOn";
-
- public static Collection combine(final Document a, final Config conf) {
- final Collection res = Sets.newLinkedHashSet();
- for (final ClusteringDef cd : conf.clusterings()) {
- for (final String fieldName : cd.getFields()) {
- String prefix = getPrefix(cd, fieldName);
-
- Field values = a.values(fieldName);
- List fields = new ArrayList<>();
-
- if (values instanceof FieldValueImpl) {
- fields.add(values);
- } else {
- fields.addAll((List) values);
- }
-
- res
- .addAll(
- cd
- .clusteringFunction()
- .apply(conf, fields)
- .stream()
- .map(k -> prefix + SEPARATOR + k)
- .collect(Collectors.toList()));
- }
- }
- return res;
- }
-
- private static String getPrefix(ClusteringDef cd, String fieldName) {
- return cd.getName() + SEPARATOR +
- cd
- .getParams()
- .keySet()
- .stream()
- .filter(k -> k.contains(COLLAPSE_ON))
- .findFirst()
- .map(k -> StringUtils.substringAfter(k, SEPARATOR))
- .orElse(fieldName);
- }
-
-}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
index e72535160..8b7852418 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
@@ -6,11 +6,10 @@ import java.util.List;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
public interface ClusteringFunction {
- public Collection apply(Config config, List fields);
+ public Collection apply(Config config, List fields);
public Map getParams();
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java
index 60861aafd..38299adb4 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java
@@ -6,9 +6,7 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
-import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
@ClusteringClass("keywordsclustering")
public class KeywordsClustering extends AbstractClusteringFunction {
@@ -40,11 +38,10 @@ public class KeywordsClustering extends AbstractClusteringFunction {
}
@Override
- public Collection apply(final Config conf, List fields) {
+ public Collection apply(final Config conf, List fields) {
return fields
.stream()
.filter(f -> !f.isEmpty())
- .map(Field::stringValue)
.map(this::cleanup)
.map(this::normalize)
.map(s -> filterAllStopWords(s))
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
index dc6f8f775..5a385961a 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
@@ -9,7 +9,6 @@ import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.Person;
@ClusteringClass("lnfi")
@@ -22,11 +21,10 @@ public class LastNameFirstInitial extends AbstractClusteringFunction {
}
@Override
- public Collection apply(Config conf, List fields) {
+ public Collection apply(Config conf, List fields) {
return fields
.stream()
.filter(f -> !f.isEmpty())
- .map(Field::stringValue)
.map(this::normalize)
.map(s -> doApply(conf, s))
.map(c -> filterBlacklisted(c, ngramBlacklist))
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
index 403d187fa..a3a6c4881 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
@@ -11,7 +11,6 @@ import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
@ClusteringClass("lowercase")
public class LowercaseClustering extends AbstractClusteringFunction {
@@ -21,10 +20,10 @@ public class LowercaseClustering extends AbstractClusteringFunction {
}
@Override
- public Collection apply(Config conf, List fields) {
+ public Collection apply(Config conf, List fields) {
Collection c = Sets.newLinkedHashSet();
- for (Field f : fields) {
- c.addAll(doApply(conf, f.stringValue()));
+ for (String f : fields) {
+ c.addAll(doApply(conf, f));
}
return c;
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java
index aa12f1279..6ee80b86e 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java
@@ -8,15 +8,15 @@ import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
public class NGramUtils extends AbstractPaceFunctions {
+ static private final NGramUtils NGRAMUTILS = new NGramUtils();
private static final int SIZE = 100;
- private static Set stopwords = AbstractPaceFunctions
+ private static final Set stopwords = AbstractPaceFunctions
.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
public static String cleanupForOrdering(String s) {
- NGramUtils utils = new NGramUtils();
- return (utils.filterStopWords(utils.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE))
+ return (NGRAMUTILS.filterStopWords(NGRAMUTILS.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE))
.substring(0, SIZE)
.replaceAll(" ", "");
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
index 0656312c7..aa06aa408 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
@@ -2,7 +2,6 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
-import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -14,7 +13,11 @@ import eu.dnetlib.pace.config.Config;
public class NgramPairs extends Ngrams {
public NgramPairs(Map params) {
- super(params);
+ super(params, false);
+ }
+
+ public NgramPairs(Map params, boolean sorted) {
+ super(params, sorted);
}
@Override
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
index bcc10a869..96c305a16 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
@@ -8,8 +8,15 @@ import eu.dnetlib.pace.config.Config;
@ClusteringClass("ngrams")
public class Ngrams extends AbstractClusteringFunction {
+ private final boolean sorted;
+
public Ngrams(Map params) {
+ this(params, false);
+ }
+
+ public Ngrams(Map params, boolean sorted) {
super(params);
+ this.sorted = sorted;
}
@Override
@@ -19,20 +26,21 @@ public class Ngrams extends AbstractClusteringFunction {
protected Collection getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) {
- final Collection ngrams = new LinkedHashSet();
+ final Collection ngrams = sorted ? new TreeSet<>() : new LinkedHashSet();
final StringTokenizer st = new StringTokenizer(s);
while (st.hasMoreTokens()) {
final String token = st.nextToken();
if (!token.isEmpty()) {
-
for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) {
- String ngram = (token + " ").substring(i, ngramLen + i).trim();
- if (ngrams.size() >= max) {
- return ngrams;
- }
+ String ngram = token.substring(i, Math.min(ngramLen + i, token.length())).trim();
+
if (ngram.length() >= minNgramLen) {
ngrams.add(ngram);
+
+ if (ngrams.size() >= max) {
+ return ngrams;
+ }
}
}
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
index 83b92f22c..b4a04ce65 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
@@ -12,7 +12,6 @@ import com.google.common.collect.Sets;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.Person;
@ClusteringClass("personClustering")
@@ -27,19 +26,19 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
}
@Override
- public Collection apply(final Config conf, final List fields) {
+ public Collection apply(final Config conf, final List fields) {
final Set hashes = Sets.newHashSet();
- for (final Field f : fields) {
+ for (final String f : fields) {
- final Person person = new Person(f.stringValue(), false);
+ final Person person = new Person(f, false);
if (StringUtils.isNotBlank(person.getNormalisedFirstName())
&& StringUtils.isNotBlank(person.getNormalisedSurname())) {
hashes.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase());
} else {
- for (final String token1 : tokens(f.stringValue(), MAX_TOKENS)) {
- for (final String token2 : tokens(f.stringValue(), MAX_TOKENS)) {
+ for (final String token1 : tokens(f, MAX_TOKENS)) {
+ for (final String token2 : tokens(f, MAX_TOKENS)) {
if (!token1.equals(token2)) {
hashes.add(firstLC(token1) + token2);
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
index 1fc9f1747..b085ae26d 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
@@ -13,7 +13,7 @@ import eu.dnetlib.pace.config.Config;
public class SortedNgramPairs extends NgramPairs {
public SortedNgramPairs(Map params) {
- super(params);
+ super(params, false);
}
@Override
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
index 122e01179..5b267ad10 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
@@ -11,7 +11,6 @@ import java.util.stream.Collectors;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
@ClusteringClass("urlclustering")
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
@@ -23,12 +22,11 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu
}
@Override
- public Collection apply(final Config conf, List fields) {
+ public Collection apply(final Config conf, List fields) {
try {
return fields
.stream()
.filter(f -> !f.isEmpty())
- .map(Field::stringValue)
.map(this::asUrl)
.map(URL::getHost)
.collect(Collectors.toCollection(HashSet::new));
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
index 9902508b8..b440686de 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@@ -16,13 +16,11 @@ import org.apache.commons.lang3.StringUtils;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.clustering.NGramUtils;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldList;
-import eu.dnetlib.pace.model.FieldListImpl;
/**
* Set of common functions for the framework
@@ -51,28 +49,25 @@ public abstract class AbstractPaceFunctions {
protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
// html regex for normalization
- public final String HTML_REGEX = "<[^>]*>";
+ public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
// doi prefix for normalization
- public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
+ public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
- private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
+ private static Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
- private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
-
- protected final static FieldList EMPTY_FIELD = new FieldListImpl();
+ private static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
protected String concat(final List l) {
return Joiner.on(" ").skipNulls().join(l);
}
protected String cleanup(final String s) {
-
- final String s1 = s.replaceAll(HTML_REGEX, "");
+ final String s1 = HTML_REGEX.matcher(s).replaceAll("");
final String s2 = unicodeNormalization(s1.toLowerCase());
final String s3 = nfd(s2);
final String s4 = fixXML(s3);
@@ -162,11 +157,6 @@ public abstract class AbstractPaceFunctions {
return sb.toString().replaceAll("\\s+", " ");
}
- protected String getFirstValue(final Field values) {
- return (values != null) && !Iterables.isEmpty(values) ? Iterables.getFirst(values, EMPTY_FIELD).stringValue()
- : "";
- }
-
protected boolean notNull(final String s) {
return s != null;
}
@@ -316,7 +306,7 @@ public abstract class AbstractPaceFunctions {
}
public String normalizePid(String pid) {
- return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
+ return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
}
// get the list of keywords into the input string
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
index 0623b468f..4d823d129 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
@@ -3,7 +3,7 @@ package eu.dnetlib.pace.config;
import java.util.List;
import java.util.Map;
-import java.util.regex.Pattern;
+import java.util.function.Predicate;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
@@ -30,13 +30,6 @@ public interface Config {
*/
public Map decisionTree();
- /**
- * Field configuration definitions.
- *
- * @return the list of definitions
- */
- public Map modelMap();
-
/**
* Clusterings.
*
@@ -49,7 +42,7 @@ public interface Config {
*
* @return the map
*/
- public Map> blacklists();
+ public Map> blacklists();
/**
* Translation map.
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
index ee24ff476..ac0ef08e4 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
@@ -4,18 +4,19 @@ package eu.dnetlib.pace.config;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
+import java.util.AbstractMap;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
+import java.util.function.Predicate;
import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
import java.util.stream.Collectors;
import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -27,9 +28,6 @@ import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceException;
public class DedupConfig implements Config, Serializable {
-
- private static final Log log = LogFactory.getLog(DedupConfig.class);
-
private static String CONFIG_TEMPLATE = "dedupConfig.st";
private PaceConfig pace;
@@ -37,7 +35,7 @@ public class DedupConfig implements Config, Serializable {
private WfConfig wf;
@JsonIgnore
- private Map> blacklists;
+ private Map> blacklists;
private static Map defaults = Maps.newHashMap();
@@ -72,19 +70,29 @@ public class DedupConfig implements Config, Serializable {
.getBlacklists()
.entrySet()
.stream()
+ .map(
+ e -> new AbstractMap.SimpleEntry>(e.getKey(),
+ e
+ .getValue()
+ .stream()
+ .filter(s -> !StringUtils.isBlank(s))
+ .map(Pattern::compile)
+ .collect(Collectors.toList())))
.collect(
Collectors
.toMap(
e -> e.getKey(),
- e -> e
+ e -> (Predicate & Serializable) s -> e
.getValue()
.stream()
- .filter(s -> !StringUtils.isBlank(s))
- .map(Pattern::compile)
- .collect(Collectors.toList())));
+ .filter(p -> p.matcher(s).matches())
+ .findFirst()
+ .isPresent()))
+
+ ;
return config;
- } catch (IOException e) {
+ } catch (IOException | PatternSyntaxException e) {
throw new PaceException("Error in parsing configuration json", e);
}
@@ -152,18 +160,13 @@ public class DedupConfig implements Config, Serializable {
return getPace().getModel();
}
- @Override
- public Map modelMap() {
- return getPace().getModelMap();
- }
-
@Override
public List clusterings() {
return getPace().getClustering();
}
@Override
- public Map> blacklists() {
+ public Map> blacklists() {
return blacklists;
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
index b4afad9c8..f1bc49f4a 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
@@ -28,6 +28,10 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable {
@JsonIgnore
private Map translationMap;
+ public Map getModelMap() {
+ return modelMap;
+ }
+
@JsonIgnore
private Map modelMap;
@@ -101,13 +105,4 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable {
public void setSynonyms(Map> synonyms) {
this.synonyms = synonyms;
}
-
- public Map getModelMap() {
- return modelMap;
- }
-
- public void setModelMap(final Map modelMap) {
- this.modelMap = modelMap;
- }
-
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java
deleted file mode 100644
index c11d461ab..000000000
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java
+++ /dev/null
@@ -1,72 +0,0 @@
-
-package eu.dnetlib.pace.model;
-
-import eu.dnetlib.pace.config.Type;
-
-/**
- * The Class AbstractField.
- */
-public abstract class AbstractField implements Field {
-
- /** The type. */
- protected Type type = Type.String;
-
- /** The name. */
- protected String name;
-
- /**
- * Instantiates a new abstract field.
- */
- protected AbstractField() {
- }
-
- /**
- * Instantiates a new abstract field.
- *
- * @param type
- * the type
- * @param name
- * the name
- */
- protected AbstractField(final Type type, final String name) {
- this.type = type;
- this.name = name;
- }
-
- /*
- * (non-Javadoc)
- * @see eu.dnetlib.pace.model.Field#getName()
- */
- @Override
- public String getName() {
- return name;
- }
-
- /*
- * (non-Javadoc)
- * @see eu.dnetlib.pace.model.Field#getType()
- */
- @Override
- public Type getType() {
- return type;
- }
-
- /*
- * (non-Javadoc)
- * @see eu.dnetlib.pace.model.Field#setName(java.lang.String)
- */
- @Override
- public void setName(final String name) {
- this.name = name;
- }
-
- /*
- * (non-Javadoc)
- * @see eu.dnetlib.pace.model.Field#setType(eu.dnetlib.pace.config.Type)
- */
- @Override
- public void setType(final Type type) {
- this.type = type;
- }
-
-}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java
deleted file mode 100644
index d9c06d4e4..000000000
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java
+++ /dev/null
@@ -1,40 +0,0 @@
-
-package eu.dnetlib.pace.model;
-
-import java.util.Set;
-
-/**
- * The Interface Document. Models the common operations available on a Pace Document.
- */
-public interface Document {
-
- /**
- * Gets the identifier.
- *
- * @return the identifier
- */
- String getIdentifier();
-
- /**
- * Fields.
- *
- * @return the iterable
- */
- Iterable fields();
-
- /**
- * Values.
- *
- * @param name
- * the name
- * @return the field list
- */
- Field values(String name);
-
- /**
- * Field names.
- *
- * @return the sets the
- */
- Set fieldNames();
-}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java
deleted file mode 100644
index d5712cf2f..000000000
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java
+++ /dev/null
@@ -1,57 +0,0 @@
-
-package eu.dnetlib.pace.model;
-
-import java.io.Serializable;
-
-import eu.dnetlib.pace.config.Type;
-
-/**
- * The Interface Field.
- */
-public interface Field extends Iterable, Serializable {
-
- /**
- * Gets the name.
- *
- * @return the name
- */
- public String getName();
-
- /**
- * Sets the name.
- *
- * @param name
- * the new name
- */
- public void setName(String name);
-
- /**
- * Gets the type.
- *
- * @return the type
- */
- public Type getType();
-
- /**
- * Sets the type.
- *
- * @param type
- * the new type
- */
- public void setType(Type type);
-
- /**
- * Checks if is empty.
- *
- * @return true, if is empty
- */
- public boolean isEmpty();
-
- /**
- * String value.
- *
- * @return the string
- */
- public String stringValue();
-
-}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
index 8b123f2d5..f34545e6d 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
@@ -39,20 +39,6 @@ public class FieldDef implements Serializable {
public FieldDef() {
}
- // def apply(s: String): Field[A]
- public Field apply(final Type type, final String s) {
- switch (type) {
- case Int:
- return new FieldValueImpl(type, name, Integer.parseInt(s));
- case String:
- return new FieldValueImpl(type, name, s);
- case List:
- return new FieldListImpl(name, type);
- default:
- throw new IllegalArgumentException("Casting not implemented for type " + type);
- }
- }
-
public String getName() {
return name;
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java
deleted file mode 100644
index b47795d8b..000000000
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java
+++ /dev/null
@@ -1,25 +0,0 @@
-
-package eu.dnetlib.pace.model;
-
-import java.util.List;
-
-/**
- * The Interface FieldList.
- */
-public interface FieldList extends List, Field {
-
- /**
- * String list.
- *
- * @return the list
- */
- public List stringList();
-
- /**
- * Double[] Array
- *
- * @return the double[] array
- */
- public double[] doubleArray();
-
-}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java
deleted file mode 100644
index ca23a0bfc..000000000
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java
+++ /dev/null
@@ -1,315 +0,0 @@
-
-package eu.dnetlib.pace.model;
-
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.List;
-import java.util.ListIterator;
-
-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.google.common.base.Function;
-import com.google.common.base.Joiner;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-
-import eu.dnetlib.pace.config.Type;
-
-/**
- * The Class FieldListImpl.
- */
-public class FieldListImpl extends AbstractField implements FieldList {
-
- /** The fields. */
- private List fields;
-
- /**
- * Instantiates a new field list impl.
- */
- public FieldListImpl() {
- fields = Lists.newArrayList();
- }
-
- /**
- * Instantiates a new field list impl.
- *
- * @param name
- * the name
- */
- public FieldListImpl(final String name, final Type type) {
- super(type, name);
- fields = Lists.newArrayList();
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#add(java.lang.Object)
- */
- @Override
- public boolean add(final Field f) {
- return fields.add(f);
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#add(int, java.lang.Object)
- */
- @Override
- public void add(final int i, final Field f) {
- fields.add(i, f);
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#addAll(java.util.Collection)
- */
- @Override
- public boolean addAll(final Collection extends Field> f) {
- return fields.addAll(f);
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#addAll(int, java.util.Collection)
- */
- @Override
- public boolean addAll(final int i, final Collection extends Field> f) {
- return fields.addAll(i, f);
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#clear()
- */
- @Override
- public void clear() {
- fields.clear();
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#contains(java.lang.Object)
- */
- @Override
- public boolean contains(final Object o) {
- return fields.contains(o);
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#containsAll(java.util.Collection)
- */
- @Override
- public boolean containsAll(final Collection> f) {
- return fields.containsAll(f);
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#get(int)
- */
- @Override
- public Field get(final int i) {
- return fields.get(i);
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#indexOf(java.lang.Object)
- */
- @Override
- public int indexOf(final Object o) {
- return fields.indexOf(o);
- }
-
- /*
- * (non-Javadoc)
- * @see eu.dnetlib.pace.model.Field#isEmpty()
- */
- @Override
- public boolean isEmpty() {
- return Iterables.all(fields, f -> f.isEmpty());
- }
-
- /*
- * (non-Javadoc)
- * @see java.lang.Iterable#iterator()
- */
- @Override
- public Iterator iterator() {
- return fields.iterator();
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#lastIndexOf(java.lang.Object)
- */
- @Override
- public int lastIndexOf(final Object o) {
- return fields.lastIndexOf(o);
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#listIterator()
- */
- @Override
- public ListIterator listIterator() {
- return fields.listIterator();
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#listIterator(int)
- */
- @Override
- public ListIterator listIterator(final int i) {
- return fields.listIterator(i);
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#remove(java.lang.Object)
- */
- @Override
- public boolean remove(final Object o) {
- return fields.remove(o);
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#remove(int)
- */
- @Override
- public Field remove(final int i) {
- return fields.remove(i);
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#removeAll(java.util.Collection)
- */
- @Override
- public boolean removeAll(final Collection> f) {
- return fields.removeAll(f);
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#retainAll(java.util.Collection)
- */
- @Override
- public boolean retainAll(final Collection> f) {
- return fields.retainAll(f);
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#set(int, java.lang.Object)
- */
- @Override
- public Field set(final int i, final Field f) {
- return fields.set(i, f);
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#size()
- */
- @Override
- public int size() {
- return fields.size();
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#subList(int, int)
- */
- @Override
- public List subList(final int from, final int to) {
- return fields.subList(from, to);
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#toArray()
- */
- @Override
- public Object[] toArray() {
- return fields.toArray();
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.List#toArray(java.lang.Object[])
- */
- @Override
- public T[] toArray(final T[] t) {
- return fields.toArray(t);
- }
-
- /*
- * (non-Javadoc)
- * @see eu.dnetlib.pace.model.Field#stringValue()
- */
- @Override
- public String stringValue() {
- switch (getType()) {
-
- case List:
- case Int:
- case String:
- return Joiner.on(" ").join(stringList());
- case JSON:
- String json;
- try {
- json = new ObjectMapper().writeValueAsString(this);
- } catch (JsonProcessingException e) {
- json = null;
- }
- return json;
- default:
- throw new IllegalArgumentException("Unknown type: " + getType().toString());
- }
- }
-
- /*
- * (non-Javadoc)
- * @see eu.dnetlib.pace.model.FieldList#stringList()
- */
- @Override
- public List stringList() {
- return Lists.newArrayList(Iterables.transform(fields, getValuesTransformer()));
- }
-
- private Function getValuesTransformer() {
- return new Function() {
-
- @Override
- public String apply(final Field f) {
- return f.stringValue();
- }
- };
- }
-
- @Override
- public double[] doubleArray() {
- return Lists.newArrayList(Iterables.transform(fields, getDouble())).stream().mapToDouble(d -> d).toArray();
- }
-
- private Function getDouble() {
-
- return new Function() {
- @Override
- public Double apply(final Field f) {
- return Double.parseDouble(f.stringValue());
- }
- };
- }
-
- @Override
- public String toString() {
- return stringList().toString();
- }
-
-}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java
deleted file mode 100644
index b20f21a5c..000000000
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java
+++ /dev/null
@@ -1,26 +0,0 @@
-
-package eu.dnetlib.pace.model;
-
-/**
- * The Interface FieldValue.
- */
-public interface FieldValue extends Field {
-
- /**
- * Gets the value.
- *
- * @return the value
- */
- public Object getValue();
-
- /**
- * Sets the value.
- *
- * @param value
- * the new value
- */
- public void setValue(final Object value);
-
- public double[] doubleArrayValue();
-
-}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java
deleted file mode 100644
index eff54abfb..000000000
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java
+++ /dev/null
@@ -1,135 +0,0 @@
-
-package eu.dnetlib.pace.model;
-
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-
-import org.apache.commons.lang3.StringUtils;
-
-import eu.dnetlib.pace.config.Type;
-
-/**
- * The Class FieldValueImpl.
- */
-public class FieldValueImpl extends AbstractField implements FieldValue {
-
- /** The value. */
- private Object value = null;
-
- /**
- * Instantiates a new field value impl.
- */
- public FieldValueImpl() {
- }
-
- /**
- * Instantiates a new field value impl.
- *
- * @param type
- * the type
- * @param name
- * the name
- * @param value
- * the value
- */
- public FieldValueImpl(final Type type, final String name, final Object value) {
- super(type, name);
- this.value = value;
- }
-
- /*
- * (non-Javadoc)
- * @see eu.dnetlib.pace.model.Field#isEmpty()
- */
- @Override
- public boolean isEmpty() {
- if (value == null)
- return false;
-
- switch (type) {
- case String:
- case JSON:
- return value.toString().isEmpty();
- case List:
- try {
- List> list = (List>) value;
- return list.isEmpty() || ((FieldValueImpl) list.get(0)).isEmpty();
- } catch (Exception e) {
- throw new RuntimeException(value.toString());
- }
- case URL:
- String str = value.toString();
- return StringUtils.isBlank(str) || !isValidURL(str);
- case DoubleArray:
- return doubleArrayValue().length == 0;
- default:
- return true;
- }
- }
-
- private boolean isValidURL(final String s) {
- try {
- new URL(s);
- return true;
- } catch (MalformedURLException e) {
- return false;
- }
- }
-
- /*
- * (non-Javadoc)
- * @see eu.dnetlib.pace.model.FieldValue#getValue()
- */
- @Override
- public Object getValue() {
- return value;
- }
-
- /*
- * (non-Javadoc)
- * @see eu.dnetlib.pace.model.FieldValue#setValue(java.lang.Object)
- */
- @Override
- public void setValue(final Object value) {
- this.value = value;
- }
-
- /*
- * (non-Javadoc)
- * @see eu.dnetlib.pace.model.Field#stringValue()
- */
- @Override
- // @SuppressWarnings("unchecked")
- public String stringValue() {
- return String.valueOf(getValue());
- // switch (getType()) {
- //
- // case Int:
- // return String.valueOf(getValue());
- // case List:
- // return Joiner.on(" ").join((List) getValue());
- // case String:
- // return (String) getValue();
- // default:
- // throw new IllegalArgumentException("Unknown type: " + getType().toString());
- // }
- }
-
- public double[] doubleArrayValue() {
- return (double[]) getValue();
- }
-
- /*
- * (non-Javadoc)
- * @see java.lang.Iterable#iterator()
- */
- @Override
- @SuppressWarnings("unchecked")
- public Iterator iterator() {
- return Collections.singleton((Field) this).iterator();
- }
-
-}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java
deleted file mode 100644
index c2860ca3b..000000000
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java
+++ /dev/null
@@ -1,143 +0,0 @@
-
-package eu.dnetlib.pace.model;
-
-import java.io.Serializable;
-import java.util.Map;
-import java.util.Set;
-
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-/**
- * The Class MapDocument.
- */
-public class MapDocument implements Document, Serializable {
-
- /** The identifier. */
- private String identifier;
-
- /** The field map. */
- private Map fieldMap;
-
- /**
- * Instantiates a new map document.
- */
- public MapDocument() {
- identifier = null;
- fieldMap = Maps.newHashMap();
- }
-
- /**
- * Instantiates a new map document.
- *
- * @param identifier
- * the identifier
- * @param fieldMap
- * the field map
- */
- public MapDocument(final String identifier, final Map fieldMap) {
- this.setIdentifier(identifier);
- this.fieldMap = fieldMap;
- }
-
- /**
- * Instantiates a new map document.
- *
- * @param identifier
- * the identifier
- * @param data
- * the data
- */
- public MapDocument(final String identifier, final byte[] data) {
- final MapDocument doc = MapDocumentSerializer.decode(data);
-
- this.fieldMap = doc.fieldMap;
- this.identifier = doc.identifier;
- }
-
- /*
- * (non-Javadoc)
- * @see eu.dnetlib.pace.model.document.Document#fields()
- */
- @Override
- public Iterable fields() {
- return Lists.newArrayList(Iterables.concat(fieldMap.values()));
- }
-
- /*
- * (non-Javadoc)
- * @see eu.dnetlib.pace.model.document.Document#values(java.lang.String)
- */
- @Override
- public Field values(final String name) {
- return fieldMap.get(name);
- }
-
- /*
- * (non-Javadoc)
- * @see eu.dnetlib.pace.model.document.Document#fieldNames()
- */
- @Override
- public Set fieldNames() {
- return fieldMap.keySet();
- }
-
- /*
- * (non-Javadoc)
- * @see java.lang.Object#toString()
- */
- @Override
- public String toString() {
- return MapDocumentSerializer.toString(this);
- // return String.format("Document(%s)", fieldMap.toString());
- }
-
- /**
- * To byte array.
- *
- * @return the byte[]
- */
- public byte[] toByteArray() {
- return MapDocumentSerializer.toByteArray(this);
- }
-
- /*
- * (non-Javadoc)
- * @see eu.dnetlib.pace.model.document.Document#getIdentifier()
- */
- @Override
- public String getIdentifier() {
- return identifier;
- }
-
- /**
- * Sets the identifier.
- *
- * @param identifier
- * the new identifier
- */
- public void setIdentifier(final String identifier) {
- this.identifier = identifier;
- }
-
- /**
- * Gets the field map.
- *
- * @return the field map
- */
- public Map getFieldMap() {
- return fieldMap;
- }
-
- /**
- * Sets the field map.
- *
- * @param fieldMap
- * the field map
- */
- public void setFieldMap(final Map fieldMap) {
- this.fieldMap = fieldMap;
- }
-
-}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java
deleted file mode 100644
index a77dcbc0c..000000000
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java
+++ /dev/null
@@ -1,52 +0,0 @@
-
-package eu.dnetlib.pace.model;
-
-import java.util.Comparator;
-
-import com.google.common.collect.Iterables;
-
-import eu.dnetlib.pace.clustering.NGramUtils;
-
-/**
- * The Class MapDocumentComparator.
- */
-public class MapDocumentComparator implements Comparator {
-
- /** The comparator field. */
- private String comparatorField;
-
- private final FieldList emptyField = new FieldListImpl();
-
- /**
- * Instantiates a new map document comparator.
- *
- * @param comparatorField
- * the comparator field
- */
- public MapDocumentComparator(final String comparatorField) {
- this.comparatorField = comparatorField;
- }
-
- /*
- * (non-Javadoc)
- * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
- */
- @Override
- public int compare(final Document d1, final Document d2) {
-
- if (d1.values(comparatorField).isEmpty() || d2.values(comparatorField).isEmpty())
- return 0;
-
- final String o1 = Iterables.getFirst(d1.values(comparatorField), emptyField).stringValue();
- final String o2 = Iterables.getFirst(d2.values(comparatorField), emptyField).stringValue();
-
- if ((o1 == null) || (o2 == null))
- return 0;
-
- final String to1 = NGramUtils.cleanupForOrdering(o1);
- final String to2 = NGramUtils.cleanupForOrdering(o2);
-
- return to1.compareTo(to2);
- }
-
-}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java
deleted file mode 100644
index d71f780ad..000000000
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java
+++ /dev/null
@@ -1,103 +0,0 @@
-
-package eu.dnetlib.pace.model;
-
-import java.lang.reflect.Type;
-
-import com.google.gson.GsonBuilder;
-import com.google.gson.InstanceCreator;
-import com.google.gson.JsonDeserializationContext;
-import com.google.gson.JsonDeserializer;
-import com.google.gson.JsonElement;
-import com.google.gson.JsonObject;
-import com.google.gson.JsonParseException;
-
-/**
- * The Class MapDocumentSerializer.
- */
-public class MapDocumentSerializer implements InstanceCreator {
-
- @Override
- public MapDocument createInstance(final Type type) {
- return new MapDocument();
- }
-
- /**
- * Decode.
- *
- * @param s
- * the String
- * @return the map document
- */
- public static MapDocument decode(final String s) {
- final GsonBuilder gson = new GsonBuilder();
-
- gson.registerTypeAdapter(Field.class, new JsonDeserializer() {
-
- @Override
- public Field deserialize(final JsonElement json, final Type typeOfT,
- final JsonDeserializationContext context) throws JsonParseException {
- final FieldListImpl fl = new FieldListImpl();
- if (json.isJsonObject()) {
-
- fl.add(handleJsonObject(json.getAsJsonObject()));
-
- } else if (json.isJsonArray()) {
-
- for (final JsonElement e : json.getAsJsonArray()) {
- if (e.isJsonObject()) {
- fl.add(handleJsonObject(e.getAsJsonObject()));
- }
- }
- }
- return fl;
- }
-
- private Field handleJsonObject(final JsonObject o) {
- final FieldListImpl fl = new FieldListImpl();
- final String name = o.get("name").getAsString();
- final String type = o.get("type").getAsString();
- final String value = o.get("value").getAsString();
- fl.add(new FieldValueImpl(eu.dnetlib.pace.config.Type.valueOf(type), name, value));
- return fl;
- }
- });
-
- return gson.create().fromJson(s, MapDocument.class);
- }
-
- /**
- * Decode.
- *
- * @param bytes
- * the bytes
- * @return the map document
- */
- public static MapDocument decode(final byte[] bytes) {
- return decode(new String(bytes));
- }
-
- /**
- * To string.
- *
- * @param doc
- * the doc
- * @return the string
- */
- public static String toString(final MapDocument doc) {
- final GsonBuilder b = new GsonBuilder();
- return b.setPrettyPrinting().create().toJson(doc);
-
- }
-
- /**
- * To byte array.
- *
- * @param doc
- * the doc
- * @return the byte[]
- */
- public static byte[] toByteArray(final MapDocument doc) {
- return toString(doc).getBytes();
- }
-
-}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java
new file mode 100644
index 000000000..f0ded0570
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java
@@ -0,0 +1,65 @@
+
+package eu.dnetlib.pace.model;
+
+import java.util.Comparator;
+
+import org.apache.spark.sql.Row;
+
+import eu.dnetlib.pace.clustering.NGramUtils;
+
+/**
+ * The Class MapDocumentComparator.
+ */
+public class RowDataOrderingComparator implements Comparator {
+
+ /** The comparator field. */
+ private final int comparatorField;
+ private final int identityFieldPosition;
+
+ /**
+ * Instantiates a new map document comparator.
+ *
+ * @param comparatorField
+ * the comparator field
+ */
+ public RowDataOrderingComparator(final int comparatorField, int identityFieldPosition) {
+ this.comparatorField = comparatorField;
+ this.identityFieldPosition = identityFieldPosition;
+ }
+
+ /*
+ * (non-Javadoc)
+ * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
+ */
+ @Override
+ public int compare(final Row d1, final Row d2) {
+ if (d1 == null)
+ return d2 == null ? 0 : -1;
+ else if (d2 == null) {
+ return 1;
+ }
+
+ final String o1 = d1.getString(comparatorField);
+ final String o2 = d2.getString(comparatorField);
+
+ if (o1 == null)
+ return o2 == null ? 0 : -1;
+ else if (o2 == null) {
+ return 1;
+ }
+
+ final String to1 = NGramUtils.cleanupForOrdering(o1);
+ final String to2 = NGramUtils.cleanupForOrdering(o2);
+
+ int res = to1.compareTo(to2);
+ if (res == 0) {
+ res = o1.compareTo(o2);
+ if (res == 0) {
+ return d1.getString(identityFieldPosition).compareTo(d2.getString(identityFieldPosition));
+ }
+ }
+
+ return res;
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala
new file mode 100644
index 000000000..b3f56bcdb
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala
@@ -0,0 +1,131 @@
+package eu.dnetlib.pace.model
+
+import eu.dnetlib.pace.config.{DedupConfig, Type}
+import eu.dnetlib.pace.util.{BlockProcessor, SparkReporter}
+import org.apache.spark.SparkContext
+import org.apache.spark.sql.catalyst.expressions.Literal
+import org.apache.spark.sql.expressions._
+import org.apache.spark.sql.functions.{col, lit, udf}
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.{Column, Dataset, Row, functions}
+
+import java.util.function.Predicate
+import java.util.stream.Collectors
+import scala.collection.JavaConversions._
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+case class SparkDeduper(conf: DedupConfig) extends Serializable {
+
+ val model: SparkModel = SparkModel(conf)
+
+ val dedup: (Dataset[Row] => Dataset[Row]) = df => {
+ df.transform(filterAndCleanup)
+ .transform(generateClustersWithCollect)
+ .transform(processBlocks)
+ }
+
+
+ val filterAndCleanup: (Dataset[Row] => Dataset[Row]) = df => {
+ val df_with_filters = conf.getPace.getModel.asScala.foldLeft(df)((res, fdef) => {
+ if (conf.blacklists.containsKey(fdef.getName)) {
+ res.withColumn(
+ fdef.getName + "_filtered",
+ filterColumnUDF(fdef).apply(new Column(fdef.getName))
+ )
+ } else {
+ res
+ }
+ })
+
+ df_with_filters
+ }
+
+ def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = {
+ val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)
+
+ if (blacklist == null) {
+ throw new IllegalArgumentException("Column: " + fdef.getName + " does not have any filter")
+ } else {
+ fdef.getType match {
+ case Type.List | Type.JSON =>
+ udf[Array[String], Array[String]](values => {
+ values.filter((v: String) => !blacklist.test(v))
+ })
+
+ case _ =>
+ udf[String, String](v => {
+ if (blacklist.test(v)) ""
+ else v
+ })
+ }
+ }
+ }
+
+ val generateClustersWithCollect: (Dataset[Row] => Dataset[Row]) = df_with_filters => {
+ var df_with_clustering_keys: Dataset[Row] = null
+
+ for ((cd, idx) <- conf.clusterings().zipWithIndex) {
+ val inputColumns = cd.getFields().foldLeft(Seq[Column]())((acc, fName) => {
+ val column = if (conf.blacklists.containsKey(fName))
+ Seq(col(fName + "_filtered"))
+ else
+ Seq(col(fName))
+
+ acc ++ column
+ })
+
+ // Add 'key' column with the value generated by the given clustering definition
+ val ds: Dataset[Row] = df_with_filters
+ .withColumn("clustering", lit(cd.getName + "::" + idx))
+ .withColumn("key", functions.explode(clusterValuesUDF(cd).apply(functions.array(inputColumns: _*))))
+ // Add position column having the position of the row within the set of rows having the same key value ordered by the sorting value
+ .withColumn("position", functions.row_number().over(Window.partitionBy("key").orderBy(col(model.orderingFieldName), col(model.identifierFieldName))))
+
+ if (df_with_clustering_keys == null)
+ df_with_clustering_keys = ds
+ else
+ df_with_clustering_keys = df_with_clustering_keys.union(ds)
+ }
+
+ //TODO: analytics
+
+ val df_with_blocks = df_with_clustering_keys
+ // filter out rows with position exceeding the maxqueuesize parameter
+ .filter(col("position").leq(conf.getWf.getQueueMaxSize))
+ .groupBy("clustering", "key")
+ .agg(functions.collect_set(functions.struct(model.schema.fieldNames.map(col): _*)).as("block"))
+ .filter(functions.size(new Column("block")).gt(1))
+
+ df_with_blocks
+ }
+
+ def clusterValuesUDF(cd: ClusteringDef) = {
+ udf[mutable.WrappedArray[String], mutable.WrappedArray[Any]](values => {
+ values.flatMap(f => cd.clusteringFunction().apply(conf, Seq(f.toString).asJava).asScala)
+ })
+ }
+
+ val processBlocks: (Dataset[Row] => Dataset[Row]) = df => {
+ df.filter(functions.size(new Column("block")).geq(new Literal(2, DataTypes.IntegerType)))
+ .withColumn("relations", processBlock(df.sqlContext.sparkContext).apply(new Column("block")))
+ .select(functions.explode(new Column("relations")).as("relation"))
+ }
+
+ def processBlock(implicit sc: SparkContext) = {
+ val accumulators = SparkReporter.constructAccumulator(conf, sc)
+
+ udf[Array[(String, String)], mutable.WrappedArray[Row]](block => {
+ val reporter = new SparkReporter(accumulators)
+
+ val mapDocuments = block.asJava.stream()
+ .sorted(new RowDataOrderingComparator(model.orderingFieldPosition, model.identityFieldPosition))
+ .limit(conf.getWf.getQueueMaxSize)
+ .collect(Collectors.toList[Row]())
+
+ new BlockProcessor(conf, model.identityFieldPosition, model.orderingFieldPosition).processSortedRows(mapDocuments, reporter)
+
+ reporter.getRelations.asScala.toArray
+ }).asNondeterministic()
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
new file mode 100644
index 000000000..95325ace0
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
@@ -0,0 +1,108 @@
+package eu.dnetlib.pace.model
+
+import com.jayway.jsonpath.{Configuration, JsonPath}
+import eu.dnetlib.pace.config.{DedupConfig, Type}
+import eu.dnetlib.pace.util.MapDocumentUtil
+import org.apache.spark.sql.catalyst.encoders.RowEncoder
+import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
+import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
+import org.apache.spark.sql.{Dataset, Row}
+
+import java.util.regex.Pattern
+import scala.collection.JavaConverters._
+
+case class SparkModel(conf: DedupConfig) {
+ private val URL_REGEX: Pattern = Pattern.compile("^\\s*(http|https|ftp)\\://.*")
+
+ private val CONCAT_REGEX: Pattern = Pattern.compile("\\|\\|\\|")
+
+ val identifierFieldName = "identifier"
+
+ val orderingFieldName = if (!conf.getWf.getOrderField.isEmpty) conf.getWf.getOrderField else identifierFieldName
+
+ val schema: StructType = {
+ // create an implicit identifier field
+ val identifier = new FieldDef()
+ identifier.setName(identifierFieldName)
+ identifier.setType(Type.String)
+
+ // Construct a Spark StructType representing the schema of the model
+ (Seq(identifier) ++ conf.getPace.getModel.asScala)
+ .foldLeft(
+ new StructType()
+ )((resType, fieldDef) => {
+ resType.add(fieldDef.getType match {
+ case Type.List | Type.JSON =>
+ StructField(fieldDef.getName, DataTypes.createArrayType(DataTypes.StringType), true, Metadata.empty)
+ case Type.DoubleArray =>
+ StructField(fieldDef.getName, DataTypes.createArrayType(DataTypes.DoubleType), true, Metadata.empty)
+ case _ =>
+ StructField(fieldDef.getName, DataTypes.StringType, true, Metadata.empty)
+ })
+ })
+
+
+ }
+
+ val identityFieldPosition: Int = schema.fieldIndex(identifierFieldName)
+
+ val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
+
+ val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
+ df.map(r => rowFromJson(r))(RowEncoder(schema))
+ }
+
+ def rowFromJson(json: String): Row = {
+ val documentContext =
+ JsonPath.using(Configuration.defaultConfiguration.addOptions(com.jayway.jsonpath.Option.SUPPRESS_EXCEPTIONS)).parse(json)
+ val values = new Array[Any](schema.size)
+
+ values(identityFieldPosition) = MapDocumentUtil.getJPathString(conf.getWf.getIdPath, documentContext)
+
+ schema.fieldNames.zipWithIndex.foldLeft(values) {
+ case ((res, (fname, index))) => {
+ val fdef = conf.getPace.getModelMap.get(fname)
+
+ if (fdef != null) {
+ res(index) = fdef.getType match {
+ case Type.String | Type.Int =>
+ MapDocumentUtil.truncateValue(
+ MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
+ fdef.getLength
+ )
+
+ case Type.URL =>
+ var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
+ if (!URL_REGEX.matcher(uv).matches)
+ uv = ""
+ uv
+
+ case Type.List | Type.JSON =>
+ MapDocumentUtil.truncateList(
+ MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
+ fdef.getSize
+ ).toArray
+
+ case Type.StringConcat =>
+ val jpaths = CONCAT_REGEX.split(fdef.getPath)
+
+ MapDocumentUtil.truncateValue(
+ jpaths
+ .map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
+ .mkString(" "),
+ fdef.getLength
+ )
+
+ case Type.DoubleArray =>
+ MapDocumentUtil.getJPathArray(fdef.getPath, json)
+ }
+ }
+
+ res
+ }
+ }
+
+ new GenericRowWithSchema(values, schema)
+ }
+}
+
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java
index f53655a8e..4d31df5b3 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java
@@ -6,12 +6,11 @@ import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("alwaysMatch")
-public class AlwaysMatch extends AbstractComparator {
+public class AlwaysMatch extends AbstractComparator {
public AlwaysMatch(final Map params) {
super(params, new com.wcohen.ss.JaroWinkler());
@@ -26,7 +25,7 @@ public class AlwaysMatch extends AbstractComparator {
}
@Override
- public double compare(final Field a, final Field b, final Config conf) {
+ public double compare(final Object a, final Object b, final Config conf) {
return 1.0;
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
index 047e121e3..5c6939e60 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
@@ -1,25 +1,19 @@
package eu.dnetlib.pace.tree;
-import java.util.Comparator;
import java.util.List;
import java.util.Map;
-import java.util.function.Function;
import java.util.stream.Collectors;
-import java.util.stream.Stream;
-import com.google.common.collect.Iterables;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.Person;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("authorsMatch")
-public class AuthorsMatch extends AbstractComparator {
+public class AuthorsMatch extends AbstractListComparator {
Map params;
@@ -49,24 +43,16 @@ public class AuthorsMatch extends AbstractComparator {
}
@Override
- public double compare(final Field a, final Field b, final Config conf) {
+ public double compare(final List a, final List b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1;
- if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD)
+ if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
return 1.0;
- List aList = ((FieldList) a)
- .stringList()
- .stream()
- .map(author -> new Person(author, false))
- .collect(Collectors.toList());
- List bList = ((FieldList) b)
- .stringList()
- .stream()
- .map(author -> new Person(author, false))
- .collect(Collectors.toList());
+ List aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
+ List bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
common = 0;
// compare each element of List1 with each element of List2
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java
index f3da29e8e..1d898ad83 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java
@@ -5,11 +5,11 @@ import java.util.Map;
import java.util.Set;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("cityMatch")
-public class CityMatch extends AbstractComparator {
+public class CityMatch extends AbstractStringComparator {
private Map params;
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java
index 82d84794f..d255612ba 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java
@@ -1,21 +1,14 @@
package eu.dnetlib.pace.tree;
-import java.util.HashMap;
-import java.util.List;
import java.util.Map;
-import java.util.stream.Collectors;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldList;
-import eu.dnetlib.pace.model.FieldValueImpl;
-import eu.dnetlib.pace.model.Person;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("cosineSimilarity")
-public class CosineSimilarity extends AbstractComparator {
+public class CosineSimilarity extends AbstractComparator {
Map params;
@@ -24,15 +17,16 @@ public class CosineSimilarity extends AbstractComparator {
}
@Override
- public double compare(final Field a, final Field b, final Config conf) {
+ public double compare(Object a, Object b, Config config) {
+ return compare((double[]) a, (double[]) b, config);
+ }
- if (a.isEmpty() || b.isEmpty())
+ public double compare(final double[] a, final double[] b, final Config conf) {
+
+ if (a.length == 0 || b.length == 0)
return -1;
- double[] aVector = ((FieldValueImpl) a).doubleArrayValue();
- double[] bVector = ((FieldValueImpl) b).doubleArrayValue();
-
- return cosineSimilarity(aVector, bVector);
+ return cosineSimilarity(a, b);
}
double cosineSimilarity(double[] a, double[] b) {
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java
index 24f3dfc02..d3c5bc10d 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java
@@ -3,7 +3,6 @@ package eu.dnetlib.pace.tree;
import java.util.Map;
-import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
@@ -21,8 +20,8 @@ public class DoiExactMatch extends ExactMatchIgnoreCase {
}
@Override
- protected String getValue(final Field f) {
- return super.getValue(f).replaceAll(PREFIX, "");
+ protected String toString(final Object f) {
+ return super.toString(f).replaceAll(PREFIX, "");
}
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java
index efafe6573..c28274652 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java
@@ -5,7 +5,6 @@ import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
-import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("domainExactMatch")
@@ -16,10 +15,10 @@ public class DomainExactMatch extends ExactMatchIgnoreCase {
}
@Override
- protected String getValue(final Field f) {
+ protected String toString(final Object f) {
try {
- return asUrl(super.getValue(f)).getHost();
+ return asUrl(super.toString(f)).getHost();
} catch (MalformedURLException e) {
return "";
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java
index 85ce6744d..35357c553 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java
@@ -6,11 +6,11 @@ import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("exactMatch")
-public class ExactMatch extends AbstractComparator {
+public class ExactMatch extends AbstractStringComparator {
public ExactMatch(Map params) {
super(params, new com.wcohen.ss.JaroWinkler());
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java
index 307f02246..85c57ad40 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java
@@ -4,30 +4,26 @@ package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("exactMatchIgnoreCase")
-public class ExactMatchIgnoreCase extends AbstractComparator {
+public class ExactMatchIgnoreCase extends AbstractStringComparator {
public ExactMatchIgnoreCase(Map params) {
super(params);
}
@Override
- public double compare(Field a, Field b, final Config conf) {
+ public double compare(String a, String b, final Config conf) {
- final String fa = getValue(a);
- final String fb = getValue(b);
-
- if (fa.isEmpty() || fb.isEmpty())
+ if (a.isEmpty() || b.isEmpty())
return -1;
- return fa.equalsIgnoreCase(fb) ? 1 : 0;
+ return a.equalsIgnoreCase(b) ? 1 : 0;
}
- protected String getValue(final Field f) {
- return getFirstValue(f);
+ protected String toString(final Object object) {
+ return toFirstString(object);
}
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java
index bdef1225d..238cb16ce 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java
@@ -10,13 +10,11 @@ import java.util.stream.Collectors;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldList;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("instanceTypeMatch")
-public class InstanceTypeMatch extends AbstractComparator {
+public class InstanceTypeMatch extends AbstractListComparator {
final Map translationMap = new HashMap<>();
@@ -42,21 +40,18 @@ public class InstanceTypeMatch extends AbstractComparator {
}
@Override
- public double compare(final Field a, final Field b, final Config conf) {
+ public double compare(final List a, final List b, final Config conf) {
if (a == null || b == null) {
return -1;
}
- final List sa = ((FieldList) a).stringList();
- final List sb = ((FieldList) b).stringList();
-
- if (sa.isEmpty() || sb.isEmpty()) {
+ if (a.isEmpty() || b.isEmpty()) {
return -1;
}
- final Set ca = sa.stream().map(this::translate).collect(Collectors.toSet());
- final Set cb = sb.stream().map(this::translate).collect(Collectors.toSet());
+ final Set ca = a.stream().map(this::translate).collect(Collectors.toSet());
+ final Set cb = b.stream().map(this::translate).collect(Collectors.toSet());
// if at least one is a jolly type, it must produce a match
if (ca.contains("*") || cb.contains("*"))
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java
index 7511e5ec9..2cb411d26 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java
@@ -6,12 +6,12 @@ import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
@ComparatorClass("jaroWinkler")
-public class JaroWinkler extends AbstractComparator {
+public class JaroWinkler extends AbstractStringComparator {
public JaroWinkler(Map params) {
super(params, new com.wcohen.ss.JaroWinkler());
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java
index 4f4c68e47..576b9281d 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java
@@ -7,11 +7,11 @@ import java.util.Set;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("jaroWinklerNormalizedName")
-public class JaroWinklerNormalizedName extends AbstractComparator {
+public class JaroWinklerNormalizedName extends AbstractStringComparator {
private Map params;
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java
index d97d8d061..6ba7dd2a4 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java
@@ -6,12 +6,12 @@ import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
@ComparatorClass("jaroWinklerTitle")
-public class JaroWinklerTitle extends AbstractComparator {
+public class JaroWinklerTitle extends AbstractStringComparator {
public JaroWinklerTitle(Map params) {
super(params, new com.wcohen.ss.JaroWinkler());
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java
index e5c69a852..3897e37f8 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java
@@ -10,16 +10,18 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.common.collect.Sets;
+import com.jayway.jsonpath.Configuration;
+import com.jayway.jsonpath.DocumentContext;
+import com.jayway.jsonpath.JsonPath;
+import com.jayway.jsonpath.Option;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldList;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.util.MapDocumentUtil;
@ComparatorClass("jsonListMatch")
-public class JsonListMatch extends AbstractComparator {
+public class JsonListMatch extends AbstractListComparator {
private static final Log log = LogFactory.getLog(JsonListMatch.class);
private Map params;
@@ -34,11 +36,7 @@ public class JsonListMatch extends AbstractComparator {
}
@Override
- public double compare(final Field a, final Field b, final Config conf) {
-
- final List sa = ((FieldList) a).stringList();
- final List sb = ((FieldList) b).stringList();
-
+ public double compare(final List sa, final List sb, final Config conf) {
if (sa.isEmpty() || sb.isEmpty()) {
return -1;
}
@@ -65,14 +63,17 @@ public class JsonListMatch extends AbstractComparator {
StringBuilder st = new StringBuilder(); // to build the string used for comparisons basing on the jpath into
// parameters
-
+ final DocumentContext documentContext = JsonPath
+ .using(Configuration.defaultConfiguration().addOptions(Option.SUPPRESS_EXCEPTIONS))
+ .parse(json);
// for each path in the param list
for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
String path = params.get(key);
- String value = MapDocumentUtil.getJPathString(path, json);
+ String value = MapDocumentUtil.getJPathString(path, documentContext);
if (value == null || value.isEmpty())
value = "";
- st.append(value + "::");
+ st.append(value);
+ st.append("::");
}
st.setLength(st.length() - 2);
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java
index 0d69e5177..53acb4dc8 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java
@@ -5,11 +5,11 @@ import java.util.Map;
import java.util.Set;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("keywordMatch")
-public class KeywordMatch extends AbstractComparator {
+public class KeywordMatch extends AbstractStringComparator {
Map params;
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java
index d483049d7..970f975f6 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java
@@ -5,11 +5,11 @@ import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("level2JaroWinkler")
-public class Level2JaroWinkler extends AbstractComparator {
+public class Level2JaroWinkler extends AbstractStringComparator {
public Level2JaroWinkler(Map params) {
super(params, new com.wcohen.ss.Level2JaroWinkler());
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java
index a87a6079a..e351058f9 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java
@@ -6,11 +6,11 @@ import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("level2JaroWinklerTitle")
-public class Level2JaroWinklerTitle extends AbstractComparator {
+public class Level2JaroWinklerTitle extends AbstractStringComparator {
public Level2JaroWinklerTitle(Map params) {
super(params, new com.wcohen.ss.Level2JaroWinkler());
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java
index 5ac19ee2e..e66602e4f 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java
@@ -5,11 +5,11 @@ import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("level2Levenstein")
-public class Level2Levenstein extends AbstractComparator {
+public class Level2Levenstein extends AbstractStringComparator {
public Level2Levenstein(Map params) {
super(params, new com.wcohen.ss.Level2Levenstein());
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java
index 4072f52aa..0871f8176 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java
@@ -5,11 +5,11 @@ import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("levenstein")
-public class Levenstein extends AbstractComparator {
+public class Levenstein extends AbstractStringComparator {
public Levenstein(Map params) {
super(params, new com.wcohen.ss.Levenstein());
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java
index 896e93f09..877cb95ab 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java
@@ -9,11 +9,11 @@ import org.apache.commons.logging.LogFactory;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("levensteinTitle")
-public class LevensteinTitle extends AbstractComparator {
+public class LevensteinTitle extends AbstractStringComparator {
private static final Log log = LogFactory.getLog(LevensteinTitle.class);
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java
index 796edf49e..341c0a62b 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java
@@ -6,14 +6,14 @@ import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
* Compared compare between two titles, ignoring version numbers. Suitable for Software entities.
*/
@ComparatorClass("levensteinTitleIgnoreVersion")
-public class LevensteinTitleIgnoreVersion extends AbstractComparator {
+public class LevensteinTitleIgnoreVersion extends AbstractStringComparator {
public LevensteinTitleIgnoreVersion(Map params) {
super(params, new com.wcohen.ss.Levenstein());
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java
index 8abe37d96..059db8de5 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java
@@ -3,15 +3,10 @@ package eu.dnetlib.pace.tree;
import java.util.List;
import java.util.Map;
-import java.util.Set;
import java.util.stream.Collectors;
-import com.google.common.collect.Sets;
-
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldList;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
@@ -20,7 +15,7 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
* @author miconis
* */
@ComparatorClass("listContainsMatch")
-public class ListContainsMatch extends AbstractComparator {
+public class ListContainsMatch extends AbstractListComparator {
private Map params;
private boolean CASE_SENSITIVE;
@@ -38,11 +33,7 @@ public class ListContainsMatch extends AbstractComparator {
}
@Override
- public double compare(final Field a, final Field b, final Config conf) {
-
- List sa = ((FieldList) a).stringList();
- List sb = ((FieldList) b).stringList();
-
+ public double compare(List sa, List sb, Config conf) {
if (sa.isEmpty() || sb.isEmpty()) {
return -1;
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java
index ee4b58d9c..b9d62cf16 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java
@@ -6,11 +6,11 @@ import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("mustBeDifferent")
-public class MustBeDifferent extends AbstractComparator {
+public class MustBeDifferent extends AbstractStringComparator {
public MustBeDifferent(Map params) {
super(params, new com.wcohen.ss.Levenstein());
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java
index 8b400122f..3ae1dcde0 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java
@@ -4,7 +4,6 @@ package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@@ -13,13 +12,13 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
* NullDistanceAlgo.
*/
@ComparatorClass("null")
-public class NullDistanceAlgo implements Comparator {
+public class NullDistanceAlgo implements Comparator {
public NullDistanceAlgo(Map params) {
}
@Override
- public double compare(Field a, Field b, Config config) {
+ public double compare(Object a, Object b, Config config) {
return 0;
}
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java
index ebe25bab4..2c003a170 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java
@@ -4,11 +4,11 @@ package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("numbersComparator")
-public class NumbersComparator extends AbstractComparator {
+public class NumbersComparator extends AbstractStringComparator {
Map params;
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java
index 52f99d018..820436d2e 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java
@@ -4,11 +4,11 @@ package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("numbersMatch")
-public class NumbersMatch extends AbstractComparator {
+public class NumbersMatch extends AbstractStringComparator {
public NumbersMatch(Map params) {
super(params);
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java
index 08e4d5d84..a7c580973 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java
@@ -4,11 +4,11 @@ package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("romansMatch")
-public class RomansMatch extends AbstractComparator {
+public class RomansMatch extends AbstractStringComparator {
public RomansMatch(Map params) {
super(params);
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java
index 01cb3dd63..fb99ddb14 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java
@@ -4,11 +4,8 @@ package eu.dnetlib.pace.tree;
import java.util.List;
import java.util.Map;
-import com.google.common.collect.Iterables;
-
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
@@ -17,7 +14,7 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
* @author claudio
*/
@ComparatorClass("sizeMatch")
-public class SizeMatch extends AbstractComparator {
+public class SizeMatch extends AbstractListComparator {
/**
* Instantiates a new size match.
@@ -30,23 +27,12 @@ public class SizeMatch extends AbstractComparator {
}
@Override
- public double compare(final Field a, final Field b, final Config conf) {
+ public double compare(final List a, final List b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
- return -1;
+ return -1.0;
- return Iterables.size(a) == Iterables.size(b) ? 1 : 0;
- }
-
- /**
- * Checks if is empty.
- *
- * @param a
- * the a
- * @return true, if is empty
- */
- protected boolean isEmpty(final Iterable> a) {
- return (a == null) || Iterables.isEmpty(a);
+ return a.size() == b.size() ? 1.0 : 0.0;
}
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java
index cef6de504..bca417b60 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java
@@ -4,7 +4,7 @@ package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
@@ -13,7 +13,7 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
* @author miconis
* */
@ComparatorClass("stringContainsMatch")
-public class StringContainsMatch extends AbstractComparator {
+public class StringContainsMatch extends AbstractStringComparator {
private Map params;
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java
index c74deadc9..b4dbef3bb 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java
@@ -2,6 +2,7 @@
package eu.dnetlib.pace.tree;
import java.util.HashSet;
+import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -11,13 +12,11 @@ import org.apache.commons.logging.LogFactory;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldList;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("stringListMatch")
-public class StringListMatch extends AbstractComparator {
+public class StringListMatch extends AbstractListComparator {
private static final Log log = LogFactory.getLog(StringListMatch.class);
private Map params;
@@ -32,10 +31,10 @@ public class StringListMatch extends AbstractComparator {
}
@Override
- public double compare(final Field a, final Field b, final Config conf) {
+ public double compare(final List a, final List b, final Config conf) {
- final Set pa = new HashSet<>(((FieldList) a).stringList());
- final Set pb = new HashSet<>(((FieldList) b).stringList());
+ final Set pa = new HashSet<>(a);
+ final Set pb = new HashSet<>(b);
if (pa.isEmpty() || pb.isEmpty()) {
return -1; // return undefined if one of the two lists is empty
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java
index 23be3f752..3f8c40599 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java
@@ -8,25 +8,24 @@ import org.apache.commons.lang3.StringUtils;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.config.Type;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
* The Class SubStringLevenstein.
*/
@ComparatorClass("subStringLevenstein")
-public class SubStringLevenstein extends AbstractComparator {
+public class SubStringLevenstein extends AbstractStringComparator {
- /** The limit. */
+ /**
+ * The limit.
+ */
protected int limit;
/**
* Instantiates a new sub string levenstein.
- *
- * @param w
- * the w
+ *
+ * @param w the w
*/
public SubStringLevenstein(final double w) {
super(w, new com.wcohen.ss.Levenstein());
@@ -39,11 +38,9 @@ public class SubStringLevenstein extends AbstractComparator {
/**
* Instantiates a new sub string levenstein.
- *
- * @param w
- * the w
- * @param limit
- * the limit
+ *
+ * @param w the w
+ * @param limit the limit
*/
public SubStringLevenstein(final double w, final int limit) {
super(w, new com.wcohen.ss.Levenstein());
@@ -52,13 +49,10 @@ public class SubStringLevenstein extends AbstractComparator {
/**
* Instantiates a new sub string levenstein.
- *
- * @param w
- * the w
- * @param limit
- * the limit
- * @param ssalgo
- * the ssalgo
+ *
+ * @param w the w
+ * @param limit the limit
+ * @param ssalgo the ssalgo
*/
protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
super(w, ssalgo);
@@ -71,11 +65,8 @@ public class SubStringLevenstein extends AbstractComparator {
* eu.dnetlib.pace.model.Field)
*/
@Override
- public double distance(final Field a, final Field b, final Config conf) {
- if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
- return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit), conf);
-
- throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
+ public double distance(final String a, final String b, final Config conf) {
+ return distance(StringUtils.left(a, limit), StringUtils.left(b, limit), conf);
}
/*
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java
index db1faf9e2..8d99ac27f 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java
@@ -1,12 +1,10 @@
package eu.dnetlib.pace.tree;
-import java.util.List;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
@@ -16,17 +14,14 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
*
*/
@ComparatorClass("titleVersionMatch")
-public class TitleVersionMatch extends AbstractComparator {
+public class TitleVersionMatch extends AbstractStringComparator {
public TitleVersionMatch(final Map params) {
super(params);
}
@Override
- public double compare(final Field a, final Field b, final Config conf) {
- final String valueA = getFirstValue(a);
- final String valueB = getFirstValue(b);
-
+ public double compare(final String valueA, final String valueB, final Config conf) {
if (valueA.isEmpty() || valueB.isEmpty())
return -1;
@@ -38,4 +33,7 @@ public class TitleVersionMatch extends AbstractComparator {
return getClass().getSimpleName() + ":" + super.toString();
}
+ protected String toString(final Object object) {
+ return toFirstString(object);
+ }
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java
index f4f00a908..722236be6 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java
@@ -8,7 +8,6 @@ import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("urlMatcher")
@@ -31,9 +30,9 @@ public class UrlMatcher extends Levenstein {
}
@Override
- public double distance(Field a, Field b, final Config conf) {
- final URL urlA = asUrl(getFirstValue(a));
- final URL urlB = asUrl(getFirstValue(b));
+ public double distance(String a, String b, final Config conf) {
+ final URL urlA = asUrl(a);
+ final URL urlB = asUrl(b);
if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
return 0.0;
@@ -58,4 +57,7 @@ public class UrlMatcher extends Levenstein {
}
}
+ protected String toString(final Object object) {
+ return toFirstString(object);
+ }
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java
index 7ee8c8bad..95f796f6a 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java
@@ -6,8 +6,7 @@ import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
@@ -16,7 +15,7 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
* @author claudio
*/
@ComparatorClass("yearMatch")
-public class YearMatch extends AbstractComparator {
+public class YearMatch extends AbstractStringComparator {
private int limit = 4;
@@ -25,7 +24,7 @@ public class YearMatch extends AbstractComparator {
}
@Override
- public double compare(final Field a, final Field b, final Config conf) {
+ public double compare(final String a, final String b, final Config conf) {
final String valueA = getNumbers(getFirstValue(a));
final String valueB = getNumbers(getFirstValue(b));
@@ -42,8 +41,8 @@ public class YearMatch extends AbstractComparator {
return s.length() == limit;
}
- protected String getFirstValue(final Field value) {
- return (value != null) && !value.isEmpty() ? StringUtils.left(value.stringValue(), limit) : "";
+ protected String getFirstValue(final String value) {
+ return (value != null) && !value.isEmpty() ? StringUtils.left(value, limit) : "";
}
@Override
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java
index 3ecffb289..8a957c5e3 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java
@@ -4,15 +4,14 @@ package eu.dnetlib.pace.tree.support;
import java.util.List;
import java.util.Map;
+import com.google.common.base.Joiner;
+import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.config.Type;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldList;
-public abstract class AbstractComparator extends AbstractPaceFunctions implements Comparator {
+public abstract class AbstractComparator extends AbstractPaceFunctions implements Comparator {
/** The ssalgo. */
protected AbstractStringDistance ssalgo;
@@ -69,8 +68,8 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
* the b
* @return the double
*/
- public double distance(final String a, final String b, final Config conf) {
+ protected double distance(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) {
return -1; // return -1 if a field is missing
}
@@ -78,49 +77,50 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
return normalize(score);
}
- /**
- * Distance.
- *
- * @param a
- * the a
- * @param b
- * the b
- * @return the double
- */
- protected double distance(final List a, final List b, final Config conf) {
- return distance(concat(a), concat(b), conf);
- }
-
- public double distance(final Field a, final Field b, final Config conf) {
- if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
- return distance(a.stringValue(), b.stringValue(), conf);
- if (a.getType().equals(Type.List) && b.getType().equals(Type.List))
- return distance(toList(a), toList(b), conf);
-
- throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
- }
-
- @Override
- public double compare(final Field a, final Field b, final Config conf) {
+ protected double compare(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1;
- if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
- return distance(a.stringValue(), b.stringValue(), conf);
- if (a.getType().equals(Type.List) && b.getType().equals(Type.List))
- return distance(toList(a), toList(b), conf);
-
- throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
+ return distance(a, b, conf);
}
/**
- * To list.
+ * Convert the given argument to a List of Strings
*
- * @param list
- * the list
+ * @param object
+ * function argument
* @return the list
*/
- protected List toList(final Field list) {
- return ((FieldList) list).stringList();
+ protected List toList(final Object object) {
+ if (object instanceof List) {
+ return (List) object;
+ }
+
+ return Lists.newArrayList(object.toString());
+ }
+
+ /**
+ * Convert the given argument to a String
+ *
+ * @param object
+ * function argument
+ * @return the list
+ */
+ protected String toString(final Object object) {
+ if (object instanceof List) {
+ List l = (List) object;
+ return Joiner.on(" ").join(l);
+ }
+
+ return object.toString();
+ }
+
+ protected String toFirstString(final Object object) {
+ if (object instanceof List) {
+ List l = (List) object;
+ return l.isEmpty() ? "" : l.get(0);
+ }
+
+ return object.toString();
}
public double getWeight() {
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractListComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractListComparator.java
new file mode 100644
index 000000000..3f35350bd
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractListComparator.java
@@ -0,0 +1,39 @@
+
+package eu.dnetlib.pace.tree.support;
+
+import java.util.List;
+import java.util.Map;
+
+import com.wcohen.ss.AbstractStringDistance;
+
+import eu.dnetlib.pace.config.Config;
+
+abstract public class AbstractListComparator extends AbstractComparator> {
+ protected AbstractListComparator(Map params) {
+ super(params);
+ }
+
+ protected AbstractListComparator(Map params, AbstractStringDistance ssalgo) {
+ super(params, ssalgo);
+ }
+
+ protected AbstractListComparator(double weight, AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
+
+ protected AbstractListComparator(AbstractStringDistance ssalgo) {
+ super(ssalgo);
+ }
+
+ @Override
+ public double compare(Object a, Object b, Config conf) {
+ return compare(toList(a), toList(b), conf);
+ }
+
+ public double compare(final List a, final List b, final Config conf) {
+ if (a.isEmpty() || b.isEmpty())
+ return -1;
+
+ return distance(concat(a), concat(b), conf);
+ }
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java
index 8927f2e14..06c806b92 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java
@@ -8,10 +8,7 @@ import java.util.Map;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldList;
-
-public abstract class AbstractSortedComparator extends AbstractComparator {
+public abstract class AbstractSortedComparator extends AbstractListComparator {
/**
* Instantiates a new sorted second string compare algo.
@@ -30,11 +27,14 @@ public abstract class AbstractSortedComparator extends AbstractComparator {
}
@Override
- protected List toList(final Field list) {
- FieldList fl = (FieldList) list;
- List values = Lists.newArrayList(fl.stringList());
- Collections.sort(values);
- return values;
- }
+ protected List toList(final Object object) {
+ if (object instanceof List) {
+ List fl = (List) object;
+ List values = Lists.newArrayList(fl);
+ Collections.sort(values);
+ return values;
+ }
+ return Lists.newArrayList(object.toString());
+ }
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractStringComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractStringComparator.java
new file mode 100644
index 000000000..037ff6634
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractStringComparator.java
@@ -0,0 +1,46 @@
+
+package eu.dnetlib.pace.tree.support;
+
+import java.util.Map;
+
+import com.wcohen.ss.AbstractStringDistance;
+
+import eu.dnetlib.pace.config.Config;
+
+public abstract class AbstractStringComparator extends AbstractComparator {
+ protected AbstractStringComparator(Map params) {
+ super(params);
+ }
+
+ protected AbstractStringComparator(Map params, AbstractStringDistance ssalgo) {
+ super(params, ssalgo);
+ }
+
+ protected AbstractStringComparator(double weight, AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
+
+ protected AbstractStringComparator(AbstractStringDistance ssalgo) {
+ super(ssalgo);
+ }
+
+ public double distance(final String a, final String b, final Config conf) {
+ if (a.isEmpty() || b.isEmpty()) {
+ return -1; // return -1 if a field is missing
+ }
+ double score = ssalgo.score(a, b);
+ return normalize(score);
+ }
+
+ @Override
+ public double compare(Object a, Object b, Config conf) {
+ return compare(toString(a), toString(b), conf);
+ }
+
+ public double compare(final String a, final String b, final Config conf) {
+ if (a.isEmpty() || b.isEmpty())
+ return -1;
+ return distance(a, b, conf);
+ }
+
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java
index b11ca5429..15a39921b 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java
@@ -2,13 +2,11 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Field;
-public interface Comparator {
+public interface Comparator {
/*
* return : -1 -> can't decide (i.e. missing field) >0 -> similarity degree (depends on the algorithm)
*/
- public double compare(Field a, Field b, Config conf);
-
+ public double compare(Object a, Object b, Config conf);
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java
index 0d5c80f53..46e66378e 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java
@@ -6,7 +6,6 @@ import java.io.Serializable;
import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.util.PaceException;
/**
@@ -17,12 +16,12 @@ public class FieldStats implements Serializable {
private double weight; // weight for the field (to be used in the aggregation)
private double threshold; // threshold for the field (to be used in some kind of aggregations)
private double result; // the result of the comparison
- private Field a;
- private Field b;
+ private Object a;
+ private Object b;
private boolean countIfUndefined;
- public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Field a, Field b) {
+ public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Object a, Object b) {
this.weight = weight;
this.threshold = threshold;
this.result = result;
@@ -63,19 +62,19 @@ public class FieldStats implements Serializable {
this.countIfUndefined = countIfUndefined;
}
- public Field getA() {
+ public Object getA() {
return a;
}
- public void setA(Field a) {
+ public void setA(Object a) {
this.a = a;
}
- public Field getB() {
+ public Object getB() {
return b;
}
- public void setB(Field b) {
+ public void setB(Object b) {
this.b = b;
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java
index 8dff818e8..60559412d 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java
@@ -7,10 +7,19 @@ public enum MatchType {
public static MatchType parse(String value) {
- try {
- return MatchType.valueOf(value);
- } catch (IllegalArgumentException e) {
- return MatchType.UNDEFINED; // return UNDEFINED if the enum is not parsable
+ if (MATCH.name().equals(value)) {
+ return MATCH;
+ } else if (NO_MATCH.name().equals(value)) {
+ return NO_MATCH;
+ } else {
+ return UNDEFINED;
}
+
+// try {
+// return MatchType.valueOf(value);
+// }
+// catch (IllegalArgumentException e) {
+// return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable
+// }
}
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java
index a754f13cd..0973fdf1e 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java
@@ -3,17 +3,17 @@ package eu.dnetlib.pace.tree.support;
import java.io.IOException;
import java.io.Serializable;
-import java.io.StringWriter;
import java.util.List;
-import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.types.ArrayType;
+import org.apache.spark.sql.types.DataType;
+import org.apache.spark.sql.types.StringType;
-import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig;
-import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.PaceException;
public class TreeNodeDef implements Serializable {
@@ -46,31 +46,27 @@ public class TreeNodeDef implements Serializable {
}
// function for the evaluation of the node
- public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) {
+ public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
TreeNodeStats stats = new TreeNodeStats();
// for each field in the node, it computes the
for (FieldConf fieldConf : fields) {
-
double weight = fieldConf.getWeight();
-
double result;
+ Object value1 = getJavaValue(doc1, fieldConf.getField());
+ Object value2 = getJavaValue(doc2, fieldConf.getField());
+
// if the param specifies a cross comparison (i.e. compare elements from different fields), compute the
// result for both sides and return the maximum
- if (fieldConf.getParams().keySet().stream().anyMatch(k -> k.contains(CROSS_COMPARE))) {
- String crossField = fieldConf.getParams().get(CROSS_COMPARE);
- double result1 = comparator(fieldConf)
- .compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(crossField), conf);
- double result2 = comparator(fieldConf)
- .compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf);
+ String crossField = fieldConf.getParams().get(CROSS_COMPARE);
+ if (crossField != null) {
+ double result1 = comparator(fieldConf).compare(value1, getJavaValue(doc2, crossField), conf);
+ double result2 = comparator(fieldConf).compare(getJavaValue(doc1, crossField), value2, conf);
result = Math.max(result1, result2);
} else {
- result = comparator(fieldConf)
- .compare(
- doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()),
- conf);
+ result = comparator(fieldConf).compare(value1, value2, conf);
}
stats
@@ -81,13 +77,27 @@ public class TreeNodeDef implements Serializable {
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")),
result,
fieldConf.isCountIfUndefined(),
- doc1.getFieldMap().get(fieldConf.getField()),
- doc2.getFieldMap().get(fieldConf.getField())));
+ value1,
+ value2));
}
return stats;
}
+ public Object getJavaValue(Row row, String name) {
+ int pos = row.fieldIndex(name);
+ if (pos >= 0) {
+ DataType dt = row.schema().fields()[pos].dataType();
+ if (dt instanceof StringType) {
+ return row.getString(pos);
+ } else if (dt instanceof ArrayType) {
+ return row.getList(pos);
+ }
+ }
+
+ return null;
+ }
+
private Comparator comparator(final FieldConf field) {
return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams());
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java
index 04e16be34..263504dbb 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java
@@ -3,9 +3,9 @@ package eu.dnetlib.pace.tree.support;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.spark.sql.Row;
import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.PaceException;
/**
@@ -21,72 +21,72 @@ public class TreeProcessor {
this.config = config;
}
- public boolean compare(final MapDocument a, final MapDocument b) {
+ // row based copies
+
+ public boolean compare(final Row a, final Row b) {
// evaluate the decision tree
return evaluateTree(a, b).getResult() == MatchType.MATCH;
}
- public TreeStats evaluateTree(final MapDocument doc1, final MapDocument doc2) {
+ public TreeStats evaluateTree(final Row doc1, final Row doc2) {
TreeStats treeStats = new TreeStats();
- String current = "start";
+ String nextNodeName = "start";
- while (MatchType.parse(current) == MatchType.UNDEFINED) {
+ do {
- TreeNodeDef currentNode = config.decisionTree().get(current);
+ TreeNodeDef currentNode = config.decisionTree().get(nextNodeName);
// throw an exception if the node doesn't exist
if (currentNode == null)
- throw new PaceException("Missing tree node: " + current);
+ throw new PaceException("Missing tree node: " + nextNodeName);
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
- treeStats.addNodeStats(current, stats);
+ treeStats.addNodeStats(nextNodeName, stats);
// if ignoreUndefined=false the miss is considered as undefined
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
- current = currentNode.getUndefined();
+ nextNodeName = currentNode.getUndefined();
}
// if ignoreUndefined=true the miss is ignored and the score computed anyway
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
- current = currentNode.getPositive();
+ nextNodeName = currentNode.getPositive();
} else {
- current = currentNode.getNegative();
+ nextNodeName = currentNode.getNegative();
}
- }
+ } while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED);
- treeStats.setResult(MatchType.parse(current));
+ treeStats.setResult(MatchType.parse(nextNodeName));
return treeStats;
}
- public double computeScore(final MapDocument doc1, final MapDocument doc2) {
- String current = "start";
+ public double computeScore(final Row doc1, final Row doc2) {
+ String nextNodeName = "start";
double score = 0.0;
- while (MatchType.parse(current) == MatchType.UNDEFINED) {
+ do {
- TreeNodeDef currentNode = config.decisionTree().get(current);
+ TreeNodeDef currentNode = config.decisionTree().get(nextNodeName);
// throw an exception if the node doesn't exist
if (currentNode == null)
- throw new PaceException("The Tree Node doesn't exist: " + current);
+ throw new PaceException("The Tree Node doesn't exist: " + nextNodeName);
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
score = stats.getFinalScore(currentNode.getAggregation());
// if ignoreUndefined=false the miss is considered as undefined
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
- current = currentNode.getUndefined();
+ nextNodeName = currentNode.getUndefined();
}
// if ignoreUndefined=true the miss is ignored and the score computed anyway
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
- current = currentNode.getPositive();
+ nextNodeName = currentNode.getPositive();
} else {
- current = currentNode.getNegative();
+ nextNodeName = currentNode.getNegative();
}
-
- }
+ } while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED);
return score;
}
-
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java
index 4053a123c..c2b0ddda7 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java
@@ -1,20 +1,19 @@
package eu.dnetlib.pace.util;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.types.ArrayType;
+import org.apache.spark.sql.types.DataType;
+import org.apache.spark.sql.types.StringType;
-import com.google.common.collect.Lists;
-
-import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.WfConfig;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.MapDocument;
-import eu.dnetlib.pace.model.MapDocumentComparator;
import eu.dnetlib.pace.tree.support.TreeProcessor;
public class BlockProcessor {
@@ -25,6 +24,9 @@ public class BlockProcessor {
private DedupConfig dedupConf;
+ private final int identifierFieldPos;
+ private final int orderFieldPos;
+
public static void constructAccumulator(final DedupConfig dedupConf) {
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"));
accumulators
@@ -47,152 +49,80 @@ public class BlockProcessor {
.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));
}
- public BlockProcessor(DedupConfig dedupConf) {
+ public BlockProcessor(DedupConfig dedupConf, int identifierFieldPos, int orderFieldPos) {
this.dedupConf = dedupConf;
+ this.identifierFieldPos = identifierFieldPos;
+ this.orderFieldPos = orderFieldPos;
}
- public void processSortedBlock(final String key, final List documents, final Reporter context) {
+ public void processSortedRows(final List documents, final Reporter context) {
if (documents.size() > 1) {
// log.info("reducing key: '" + key + "' records: " + q.size());
- process(prepare(documents), context);
+ processRows(documents, context);
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
}
}
- public void process(final String key, final Iterable documents, final Reporter context) {
+ private void processRows(final List queue, final Reporter context) {
- final Queue q = prepare(documents);
+ for (int pivotPos = 0; pivotPos < queue.size(); pivotPos++) {
+ final Row pivot = queue.get(pivotPos);
- if (q.size() > 1) {
-// log.info("reducing key: '" + key + "' records: " + q.size());
- process(simplifyQueue(q, key, context), context);
-
- } else {
- context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
- }
- }
-
- private Queue prepare(final Iterable documents) {
- final Queue queue = new PriorityQueue<>(100,
- new MapDocumentComparator(dedupConf.getWf().getOrderField()));
-
- final Set seen = new HashSet();
- final int queueMaxSize = dedupConf.getWf().getQueueMaxSize();
-
- documents.forEach(doc -> {
- if (queue.size() <= queueMaxSize) {
- final String id = doc.getIdentifier();
-
- if (!seen.contains(id)) {
- seen.add(id);
- queue.add(doc);
- }
- }
- });
-
- return queue;
- }
-
- private Queue simplifyQueue(final Queue queue, final String ngram,
- final Reporter context) {
- final Queue q = new LinkedList<>();
-
- String fieldRef = "";
- final List tempResults = Lists.newArrayList();
-
- while (!queue.isEmpty()) {
- final MapDocument result = queue.remove();
-
- final String orderFieldName = dedupConf.getWf().getOrderField();
- final Field orderFieldValue = result.values(orderFieldName);
- if (!orderFieldValue.isEmpty()) {
- final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue());
- if (field.equals(fieldRef)) {
- tempResults.add(result);
- } else {
- populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);
- tempResults.clear();
- tempResults.add(result);
- fieldRef = field;
- }
- } else {
- context
- .incrementCounter(
- dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1);
- }
- }
- populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);
-
- return q;
- }
-
- private void populateSimplifiedQueue(final Queue q,
- final List tempResults,
- final Reporter context,
- final String fieldRef,
- final String ngram) {
- WfConfig wf = dedupConf.getWf();
- if (tempResults.size() < wf.getGroupMaxSize()) {
- q.addAll(tempResults);
- } else {
- context
- .incrementCounter(
- wf.getEntityType(),
- String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()),
- tempResults.size());
-// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
- }
- }
-
- private void process(final Queue queue, final Reporter context) {
-
- while (!queue.isEmpty()) {
-
- final MapDocument pivot = queue.remove();
- final String idPivot = pivot.getIdentifier();
-
- WfConfig wf = dedupConf.getWf();
- final Field fieldsPivot = pivot.values(wf.getOrderField());
- final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? "" : fieldsPivot.stringValue();
+ final String idPivot = pivot.getString(identifierFieldPos); // identifier
+ final Object fieldsPivot = getJavaValue(pivot, orderFieldPos);
+ final String fieldPivot = (fieldsPivot == null) ? "" : fieldsPivot.toString();
+ final WfConfig wf = dedupConf.getWf();
if (fieldPivot != null) {
int i = 0;
- for (final MapDocument curr : queue) {
- final String idCurr = curr.getIdentifier();
+ for (int windowPos = pivotPos + 1; windowPos < queue.size(); windowPos++) {
+ final Row curr = queue.get(windowPos);
+ final String idCurr = curr.getString(identifierFieldPos); // identifier
if (mustSkip(idCurr)) {
-
context.incrementCounter(wf.getEntityType(), "skip list", 1);
-
break;
}
- if (i > wf.getSlidingWindowSize()) {
+ if (++i > wf.getSlidingWindowSize()) {
break;
}
- final Field fieldsCurr = curr.values(wf.getOrderField());
- final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null
- : fieldsCurr.stringValue();
+ final Object fieldsCurr = getJavaValue(curr, orderFieldPos);
+ final String fieldCurr = (fieldsCurr == null) ? null : fieldsCurr.toString();
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
final TreeProcessor treeProcessor = new TreeProcessor(dedupConf);
emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context);
-
}
}
}
}
}
+ public Object getJavaValue(Row row, int pos) {
+ DataType dt = row.schema().fields()[pos].dataType();
+ if (dt instanceof StringType) {
+ return row.getString(pos);
+ } else if (dt instanceof ArrayType) {
+ return row.getList(pos);
+ }
+
+ return null;
+ }
+
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
if (result) {
- writeSimilarity(context, idPivot, idCurr);
+ if (idPivot.compareTo(idCurr) <= 0) {
+ writeSimilarity(context, idPivot, idCurr);
+ } else {
+ writeSimilarity(context, idCurr, idPivot);
+ }
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
@@ -211,7 +141,6 @@ public class BlockProcessor {
final String type = dedupConf.getWf().getEntityType();
context.emit(type, from, to);
- context.emit(type, to, from);
}
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java
deleted file mode 100644
index 40f502e11..000000000
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java
+++ /dev/null
@@ -1,276 +0,0 @@
-
-package eu.dnetlib.pace.util;
-
-import java.util.*;
-
-import org.apache.commons.lang3.StringUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-import com.google.common.collect.Lists;
-
-import eu.dnetlib.pace.clustering.NGramUtils;
-import eu.dnetlib.pace.config.DedupConfig;
-import eu.dnetlib.pace.config.WfConfig;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.MapDocument;
-import eu.dnetlib.pace.model.MapDocumentComparator;
-import eu.dnetlib.pace.tree.*;
-import eu.dnetlib.pace.tree.support.TreeProcessor;
-
-public class BlockProcessorForTesting {
-
- public static final List accumulators = new ArrayList<>();
-
- private static final Log log = LogFactory.getLog(eu.dnetlib.pace.util.BlockProcessorForTesting.class);
-
- private DedupConfig dedupConf;
-
- public static void constructAccumulator(final DedupConfig dedupConf) {
- accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"));
- accumulators
- .add(
- String
- .format(
- "%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
- accumulators
- .add(
- String
- .format(
- "%s::%s", dedupConf.getWf().getEntityType(),
- String
- .format(
- "Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(),
- dedupConf.getWf().getGroupMaxSize())));
- accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"));
- accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"));
- accumulators
- .add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));
- }
-
- public BlockProcessorForTesting(DedupConfig dedupConf) {
- this.dedupConf = dedupConf;
- }
-
- public void processSortedBlock(final String key, final List documents, final Reporter context,
- boolean useTree, boolean noMatch) {
- if (documents.size() > 1) {
-// log.info("reducing key: '" + key + "' records: " + q.size());
- process(prepare(documents), context, useTree, noMatch);
-
- } else {
- context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
- }
- }
-
- public void process(final String key, final Iterable documents, final Reporter context,
- boolean useTree, boolean noMatch) {
-
- final Queue q = prepare(documents);
-
- if (q.size() > 1) {
-// log.info("reducing key: '" + key + "' records: " + q.size());
- process(simplifyQueue(q, key, context), context, useTree, noMatch);
-
- } else {
- context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
- }
- }
-
- private Queue prepare(final Iterable documents) {
- final Queue queue = new PriorityQueue<>(100,
- new MapDocumentComparator(dedupConf.getWf().getOrderField()));
-
- final Set seen = new HashSet();
- final int queueMaxSize = dedupConf.getWf().getQueueMaxSize();
-
- documents.forEach(doc -> {
- if (queue.size() <= queueMaxSize) {
- final String id = doc.getIdentifier();
-
- if (!seen.contains(id)) {
- seen.add(id);
- queue.add(doc);
- }
- }
- });
-
- return queue;
- }
-
- private Queue simplifyQueue(final Queue queue, final String ngram,
- final Reporter context) {
- final Queue q = new LinkedList<>();
-
- String fieldRef = "";
- final List tempResults = Lists.newArrayList();
-
- while (!queue.isEmpty()) {
- final MapDocument result = queue.remove();
-
- final String orderFieldName = dedupConf.getWf().getOrderField();
- final Field orderFieldValue = result.values(orderFieldName);
- if (!orderFieldValue.isEmpty()) {
- final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue());
- if (field.equals(fieldRef)) {
- tempResults.add(result);
- } else {
- populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);
- tempResults.clear();
- tempResults.add(result);
- fieldRef = field;
- }
- } else {
- context
- .incrementCounter(
- dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1);
- }
- }
- populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);
-
- return q;
- }
-
- private void populateSimplifiedQueue(final Queue q,
- final List tempResults,
- final Reporter context,
- final String fieldRef,
- final String ngram) {
- WfConfig wf = dedupConf.getWf();
- if (tempResults.size() < wf.getGroupMaxSize()) {
- q.addAll(tempResults);
- } else {
- context
- .incrementCounter(
- wf.getEntityType(),
- String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()),
- tempResults.size());
-// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
- }
- }
-
- private void process(final Queue queue, final Reporter context, boolean useTree, boolean noMatch) {
-
- while (!queue.isEmpty()) {
-
- final MapDocument pivot = queue.remove();
- final String idPivot = pivot.getIdentifier();
-
- WfConfig wf = dedupConf.getWf();
- final Field fieldsPivot = pivot.values(wf.getOrderField());
- final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? "" : fieldsPivot.stringValue();
-
- if (fieldPivot != null) {
- int i = 0;
- for (final MapDocument curr : queue) {
- final String idCurr = curr.getIdentifier();
-
- if (mustSkip(idCurr)) {
-
- context.incrementCounter(wf.getEntityType(), "skip list", 1);
-
- break;
- }
-
- if (i > wf.getSlidingWindowSize()) {
- break;
- }
-
- final Field fieldsCurr = curr.values(wf.getOrderField());
- final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null
- : fieldsCurr.stringValue();
-
- if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
-
- // draws no match relations (test purpose)
- if (noMatch) {
- emitOutput(!new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
- } else {
- // use the decision tree implementation or the "normal" implementation of the similarity
- // score (valid only for publications)
- if (useTree)
- emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
- else
- emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
- }
-// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) {
-// emitOutput(true, idPivot, idCurr, context);
-// }
-
- }
- }
- }
- }
- }
-
- protected static boolean compareInstanceType(MapDocument a, MapDocument b, DedupConfig conf) {
- Map params = new HashMap<>();
- InstanceTypeMatch instanceTypeMatch = new InstanceTypeMatch(params);
- double compare = instanceTypeMatch
- .compare(a.getFieldMap().get("instance"), b.getFieldMap().get("instance"), conf);
- return compare >= 1.0;
- }
-
- private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
- // if the score gives 1, the publications are equivalent
- Map params = new HashMap<>();
- params.put("jpath_value", "$.value");
- params.put("jpath_classid", "$.qualifier.classid");
- params.put("mode", "count");
-
- double score = 0.0;
-
- // levenstein title
- LevensteinTitle levensteinTitle = new LevensteinTitle(params);
- if (levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) {
- score += 0.2;
- }
-
- // pid
- JsonListMatch jsonListMatch = new JsonListMatch(params);
- if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) {
- score += 0.5;
- }
-
- // title version
- TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
- double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
- if (result1 < 0 || result1 >= 1.0) {
- score += 0.1;
- }
-
- // authors match
- params.remove("mode");
- AuthorsMatch authorsMatch = new AuthorsMatch(params);
- double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
- if (result2 < 0 || result2 >= 0.6) {
- score += 0.2;
- }
-
- return score >= 0.5;
- }
-
- private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
-
- if (result) {
- writeSimilarity(context, idPivot, idCurr);
- context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
- } else {
- context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
- }
- }
-
- private boolean mustSkip(final String idPivot) {
- return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
- }
-
- private String getNsPrefix(final String id) {
- return StringUtils.substringBetween(id, "|", "::");
- }
-
- private void writeSimilarity(final Reporter context, final String from, final String to) {
- final String type = dedupConf.getWf().getEntityType();
-
- context.emit(type, from, to);
- }
-}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java
index 84d49bd5c..c885f2aeb 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java
@@ -18,6 +18,7 @@ package eu.dnetlib.pace.util;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java
index 2c1a1700b..a59b6248b 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java
@@ -2,19 +2,20 @@
package eu.dnetlib.pace.util;
import java.math.BigDecimal;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Predicate;
-import java.util.stream.Collectors;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
-import com.jayway.jsonpath.Configuration;
+import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
-import com.jayway.jsonpath.Option;
+import com.jayway.jsonpath.spi.cache.Cache;
+import com.jayway.jsonpath.spi.cache.CacheProvider;
-import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
-import eu.dnetlib.pace.model.*;
import net.minidev.json.JSONArray;
public class MapDocumentUtil {
@@ -22,103 +23,20 @@ public class MapDocumentUtil {
public static final String URL_REGEX = "^(http|https|ftp)\\://.*";
public static Predicate urlFilter = s -> s.trim().matches(URL_REGEX);
- public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) {
- MapDocument m = new MapDocument();
- m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json));
- Map stringField = new HashMap<>();
- conf.getPace().getModel().forEach(fdef -> {
- switch (fdef.getType()) {
- case String:
- case Int:
- stringField
- .put(
- fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(),
- truncateValue(getJPathString(fdef.getPath(), json), fdef.getLength())));
- break;
- case URL:
- String uv = getJPathString(fdef.getPath(), json);
- if (!urlFilter.test(uv))
- uv = "";
- stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), uv));
- break;
- case List:
- case JSON:
- FieldListImpl fi = new FieldListImpl(fdef.getName(), fdef.getType());
- truncateList(getJPathList(fdef.getPath(), json, fdef.getType()), fdef.getSize())
- .stream()
- .map(item -> new FieldValueImpl(Type.String, fdef.getName(), item))
- .forEach(fi::add);
- stringField.put(fdef.getName(), fi);
- break;
- case DoubleArray:
- stringField
- .put(
- fdef.getName(),
- new FieldValueImpl(Type.DoubleArray,
- fdef.getName(),
- getJPathArray(fdef.getPath(), json)));
- break;
- case StringConcat:
- String[] jpaths = fdef.getPath().split("\\|\\|\\|");
- stringField
- .put(
- fdef.getName(),
- new FieldValueImpl(Type.String,
- fdef.getName(),
- truncateValue(
- Arrays
- .stream(jpaths)
- .map(jpath -> getJPathString(jpath, json))
- .collect(Collectors.joining(" ")),
- fdef.getLength())));
- break;
+ static {
+ CacheProvider.setCache(new Cache() {
+ private final ConcurrentHashMap jsonPathCache = new ConcurrentHashMap();
+
+ @Override
+ public JsonPath get(String key) {
+ return jsonPathCache.get(key);
+ }
+
+ @Override
+ public void put(String key, JsonPath value) {
+ jsonPathCache.put(key, value);
}
});
- m.setFieldMap(stringField);
- return m;
- }
-
- public static List getJPathList(String path, String json, Type type) {
- if (type == Type.List)
- return JsonPath
- .using(
- Configuration
- .defaultConfiguration()
- .addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS))
- .parse(json)
- .read(path);
- Object jresult;
- List result = new ArrayList<>();
- try {
- jresult = JsonPath.read(json, path);
- } catch (Throwable e) {
- return result;
- }
- if (jresult instanceof JSONArray) {
-
- ((JSONArray) jresult).forEach(it -> {
-
- try {
- result.add(new ObjectMapper().writeValueAsString(it));
- } catch (JsonProcessingException e) {
-
- }
- });
- return result;
- }
-
- if (jresult instanceof LinkedHashMap) {
- try {
- result.add(new ObjectMapper().writeValueAsString(jresult));
- } catch (JsonProcessingException e) {
-
- }
- return result;
- }
- if (jresult instanceof String) {
- result.add((String) jresult);
- }
- return result;
}
public static String getJPathString(final String jsonPath, final String json) {
@@ -174,4 +92,54 @@ public class MapDocumentUtil {
return list.subList(0, size);
}
+ public static String getJPathString(final String jsonPath, final DocumentContext json) {
+ try {
+ Object o = json.read(jsonPath);
+ if (o instanceof String)
+ return (String) o;
+ if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
+ return (String) ((JSONArray) o).get(0);
+ return "";
+ } catch (Exception e) {
+ return "";
+ }
+ }
+
+ public static List getJPathList(String path, DocumentContext json, Type type) {
+ // if (type == Type.List)
+ // return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST,
+ // Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path);
+ Object jresult;
+ List result = new ArrayList<>();
+ try {
+ jresult = json.read(path);
+ } catch (Throwable e) {
+ return result;
+ }
+
+ if (jresult instanceof JSONArray) {
+ ((JSONArray) jresult).forEach(it -> {
+ try {
+ result.add(new ObjectMapper().writeValueAsString(it));
+ } catch (JsonProcessingException e) {
+
+ }
+ });
+ return result;
+ }
+
+ if (jresult instanceof LinkedHashMap) {
+ try {
+ result.add(new ObjectMapper().writeValueAsString(jresult));
+ } catch (JsonProcessingException e) {
+
+ }
+ return result;
+ }
+ if (jresult instanceof String) {
+ result.add((String) jresult);
+ }
+ return result;
+ }
+
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/SparkReporter.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/SparkReporter.java
new file mode 100644
index 000000000..437fe783b
--- /dev/null
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/SparkReporter.java
@@ -0,0 +1,85 @@
+
+package eu.dnetlib.pace.util;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.spark.SparkContext;
+import org.apache.spark.util.LongAccumulator;
+
+import eu.dnetlib.pace.config.DedupConfig;
+import scala.Serializable;
+import scala.Tuple2;
+
+public class SparkReporter implements Serializable, Reporter {
+
+ private final List> relations = new ArrayList<>();
+
+ private final Map accumulators;
+
+ public SparkReporter(Map accumulators) {
+ this.accumulators = accumulators;
+ }
+
+ public void incrementCounter(
+ String counterGroup,
+ String counterName,
+ long delta,
+ Map accumulators) {
+
+ final String accumulatorName = String.format("%s::%s", counterGroup, counterName);
+ if (accumulators.containsKey(accumulatorName)) {
+ accumulators.get(accumulatorName).add(delta);
+ }
+ }
+
+ @Override
+ public void incrementCounter(String counterGroup, String counterName, long delta) {
+
+ incrementCounter(counterGroup, counterName, delta, accumulators);
+ }
+
+ @Override
+ public void emit(String type, String from, String to) {
+ relations.add(new Tuple2<>(from, to));
+ }
+
+ public List> getRelations() {
+ return relations;
+ }
+
+ public static Map constructAccumulator(
+ final DedupConfig dedupConf, final SparkContext context) {
+
+ Map accumulators = new HashMap<>();
+
+ String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1");
+ accumulators.put(acc1, context.longAccumulator(acc1));
+ String acc2 = String
+ .format(
+ "%s::%s",
+ dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField());
+ accumulators.put(acc2, context.longAccumulator(acc2));
+ String acc3 = String
+ .format(
+ "%s::%s",
+ dedupConf.getWf().getEntityType(),
+ String
+ .format(
+ "Skipped records for count(%s) >= %s",
+ dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()));
+ accumulators.put(acc3, context.longAccumulator(acc3));
+ String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list");
+ accumulators.put(acc4, context.longAccumulator(acc4));
+ String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)");
+ accumulators.put(acc5, context.longAccumulator(acc5));
+ String acc6 = String
+ .format(
+ "%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold());
+ accumulators.put(acc6, context.longAccumulator(acc6));
+
+ return accumulators;
+ }
+}
diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java
index 2a37701aa..d3f502f35 100644
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java
@@ -3,57 +3,42 @@ package eu.dnetlib.pace;
import java.io.IOException;
import java.io.StringWriter;
-import java.nio.charset.StandardCharsets;
import java.util.List;
-import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
-import eu.dnetlib.pace.config.Type;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldListImpl;
-import eu.dnetlib.pace.model.FieldValueImpl;
public abstract class AbstractPaceTest extends AbstractPaceFunctions {
protected String readFromClasspath(final String filename) {
final StringWriter sw = new StringWriter();
try {
- IOUtils.copy(getClass().getResourceAsStream(filename), sw, StandardCharsets.UTF_8);
+ IOUtils.copy(getClass().getResourceAsStream(filename), sw);
return sw.toString();
} catch (final IOException e) {
throw new RuntimeException("cannot load resource from classpath: " + filename);
}
}
- protected Field title(final String s) {
- return new FieldValueImpl(Type.String, "title", s);
+ protected String title(final String s) {
+ return s;
}
- protected Field person(final String s) {
- return new FieldValueImpl(Type.JSON, "person", s);
+ protected String person(final String s) {
+ return s;
}
- protected Field url(final String s) {
- return new FieldValueImpl(Type.URL, "url", s);
+ protected String url(final String s) {
+ return s;
}
- protected Field array(final double[] a) {
- return new FieldValueImpl(Type.DoubleArray, "array", a);
- }
-
- protected Field createFieldList(List strings, String fieldName) {
-
- List fieldValueStream = strings
- .stream()
- .map(s -> new FieldValueImpl(Type.String, fieldName, s))
- .collect(Collectors.toList());
-
- FieldListImpl a = new FieldListImpl();
- a.addAll(fieldValueStream);
-
+ protected double[] array(final double[] a) {
return a;
+ }
+
+ protected List createFieldList(List strings, String fieldName) {
+ return strings;
}
}
diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
index 9873278b9..f9a1ea9e2 100644
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
@@ -2,14 +2,12 @@
package eu.dnetlib.pace.clustering;
import java.util.Map;
-import java.util.Set;
-import java.util.stream.Collectors;
-import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
@@ -37,7 +35,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "http://www.test.it/path/to/resource";
System.out.println(s);
- System.out.println(urlClustering.apply(conf, Lists.newArrayList(url(s))));
+ System.out.println(urlClustering.apply(conf, Lists.newArrayList(s)));
}
@Test
@@ -51,7 +49,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
- System.out.println(ngram.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(ngram.apply(conf, Lists.newArrayList(s)));
}
@Test
@@ -63,7 +61,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
- System.out.println(np.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(np.apply(conf, Lists.newArrayList(s)));
}
@Test
@@ -75,15 +73,15 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s1 = "University of Pisa";
System.out.println(s1);
- System.out.println(np.apply(conf, Lists.newArrayList(title(s1))));
+ System.out.println(np.apply(conf, Lists.newArrayList(s1)));
final String s2 = "Pisa University";
System.out.println(s2);
- System.out.println(np.apply(conf, Lists.newArrayList(title(s2))));
+ System.out.println(np.apply(conf, Lists.newArrayList(s2)));
final String s3 = "Parco Tecnologico Agroalimentare Umbria";
System.out.println(s3);
- System.out.println(np.apply(conf, Lists.newArrayList(title(s3))));
+ System.out.println(np.apply(conf, Lists.newArrayList(s3)));
}
@@ -97,7 +95,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
- System.out.println(acro.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(acro.apply(conf, Lists.newArrayList(s)));
}
@Test
@@ -109,12 +107,12 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
- System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(sp.apply(conf, Lists.newArrayList(s)));
params.put("len", 3);
params.put("max", 1);
- System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication"))));
+ System.out.println(sp.apply(conf, Lists.newArrayList("Framework for general-purpose deduplication")));
}
@Test
@@ -127,7 +125,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
- System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(sp.apply(conf, Lists.newArrayList(s)));
}
@Test
@@ -138,31 +136,31 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
- System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(sp.apply(conf, Lists.newArrayList(s)));
s = "A Physical Education Teacher Is Like...: Examining Turkish Students Perceptions of Physical Education Teachers Through Metaphor Analysis";
System.out.println(s);
- System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(sp.apply(conf, Lists.newArrayList(s)));
s = "Structure of a Eukaryotic Nonribosomal Peptide Synthetase Adenylation Domain That Activates a Large Hydroxamate Amino Acid in Siderophore Biosynthesis";
System.out.println(s);
- System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(sp.apply(conf, Lists.newArrayList(s)));
s = "Performance Evaluation";
System.out.println(s);
- System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(sp.apply(conf, Lists.newArrayList(s)));
s = "JRC Open Power Plants Database (JRC-PPDB-OPEN)";
System.out.println(s);
- System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(sp.apply(conf, Lists.newArrayList(s)));
s = "JRC Open Power Plants Database";
System.out.println(s);
- System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(sp.apply(conf, Lists.newArrayList(s)));
s = "niivue/niivue: 0.21.1";
System.out.println(s);
- System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(sp.apply(conf, Lists.newArrayList(s)));
}
@@ -175,7 +173,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
- System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(sp.apply(conf, Lists.newArrayList(s)));
}
@Test
@@ -184,35 +182,35 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final ClusteringFunction cf = new KeywordsClustering(params);
final String s = "Polytechnic University of Turin";
System.out.println(s);
- System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(s)));
final String s1 = "POLITECNICO DI TORINO";
System.out.println(s1);
- System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(s1)));
final String s2 = "Universita farmaceutica culturale di milano bergamo";
System.out.println("s2 = " + s2);
- System.out.println(cf.apply(conf, Lists.newArrayList(title(s2))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(s2)));
final String s3 = "universita universita milano milano";
System.out.println("s3 = " + s3);
- System.out.println(cf.apply(conf, Lists.newArrayList(title(s3))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(s3)));
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
System.out.println("s4 = " + s4);
- System.out.println(cf.apply(conf, Lists.newArrayList(title(s4))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(s4)));
final String s5 = "İstanbul Ticarət Universiteti";
System.out.println("s5 = " + s5);
- System.out.println(cf.apply(conf, Lists.newArrayList(title(s5))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(s5)));
final String s6 = "National and Kapodistrian University of Athens";
System.out.println("s6 = " + s6);
- System.out.println(cf.apply(conf, Lists.newArrayList(title(s6))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(s6)));
final String s7 = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών";
System.out.println("s7 = " + s7);
- System.out.println(cf.apply(conf, Lists.newArrayList(title(s7))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(s7)));
}
@@ -222,11 +220,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final ClusteringFunction cf = new PersonClustering(params);
final String s = "Abd-Alla, Abo-el-nour N.";
System.out.println("s = " + s);
- System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(s)));
final String s1 = "Manghi, Paolo";
System.out.println("s1 = " + s1);
- System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(s1)));
}
@@ -236,11 +234,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final ClusteringFunction cf = new PersonHash(params);
final String s = "Manghi, Paolo";
System.out.println("s = " + s);
- System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(s)));
final String s1 = "Manghi, P.";
System.out.println("s = " + s1);
- System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(s1)));
}
@@ -250,7 +248,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final ClusteringFunction cf = new LastNameFirstInitial(params);
final String s = "LI Yonghong";
System.out.println("s = " + s);
- System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
+ System.out.println(cf.apply(conf, Lists.newArrayList(s)));
}
}
diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
index 5c846c058..b37e16cf5 100644
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
@@ -3,19 +3,16 @@ package eu.dnetlib.pace.comparators;
import static org.junit.jupiter.api.Assertions.assertEquals;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
+import java.util.*;
-import org.junit.jupiter.api.*;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.TestInstance;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.DedupConfig;
-import eu.dnetlib.pace.config.Type;
-import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.tree.*;
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
@@ -99,8 +96,8 @@ public class ComparatorTest extends AbstractPaceTest {
@Test
public void listContainsMatchTest() {
- Field a = createFieldList(Arrays.asList("Article", "Publication", "ORP"), "instanceType");
- Field b = createFieldList(Arrays.asList("Publication", "Article", "ORP"), "instanceType");
+ List a = createFieldList(Arrays.asList("Article", "Publication", "ORP"), "instanceType");
+ List b = createFieldList(Arrays.asList("Publication", "Article", "ORP"), "instanceType");
params.put("string", "Article");
params.put("bool", "XOR");
@@ -214,31 +211,32 @@ public class ComparatorTest extends AbstractPaceTest {
final InstanceTypeMatch instanceTypeMatch = new InstanceTypeMatch(params);
- Field a = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType");
- Field b = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType");
+ List a = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType");
+ List b = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType");
double result = instanceTypeMatch.compare(a, b, conf);
assertEquals(1.0, result);
- Field c = createFieldList(
+ List c = createFieldList(
Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType");
result = instanceTypeMatch.compare(c, b, conf);
assertEquals(1.0, result);
- Field d = createFieldList(Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType");
- Field e = createFieldList(
+ List d = createFieldList(
+ Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType");
+ List e = createFieldList(
Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType");
result = instanceTypeMatch.compare(d, e, conf);
assertEquals(1.0, result);
- Field g = createFieldList(Arrays.asList("Software Paper", "Software Paper"), "instanceType");
+ List g = createFieldList(Arrays.asList("Software Paper", "Software Paper"), "instanceType");
result = instanceTypeMatch.compare(e, g, conf);
assertEquals(0.0, result);
- Field h = createFieldList(Arrays.asList("Other literature type", "Article"), "instanceType");
+ List h = createFieldList(Arrays.asList("Other literature type", "Article"), "instanceType");
result = instanceTypeMatch.compare(a, h, conf);
assertEquals(1.0, result);
@@ -249,15 +247,15 @@ public class ComparatorTest extends AbstractPaceTest {
AuthorsMatch authorsMatch = new AuthorsMatch(params);
- Field a = createFieldList(
+ List a = createFieldList(
Arrays.asList("La Bruzzo, Sandro", "Atzori, Claudio", "De Bonis, Michele"), "authors");
- Field b = createFieldList(Arrays.asList("Atzori, C.", "La Bruzzo, S.", "De Bonis, M."), "authors");
+ List b = createFieldList(Arrays.asList("Atzori, C.", "La Bruzzo, S.", "De Bonis, M."), "authors");
double result = authorsMatch.compare(a, b, conf);
assertEquals(1.0, result);
- Field c = createFieldList(Arrays.asList("Manghi, Paolo"), "authors");
- Field d = createFieldList(Arrays.asList("Manghi, Pasquale"), "authors");
+ List