diff --git a/dhp-pace-core/pom.xml b/dhp-pace-core/pom.xml
index b66976ea6..cd6305051 100644
--- a/dhp-pace-core/pom.xml
+++ b/dhp-pace-core/pom.xml
@@ -81,9 +81,12 @@
org.apache.spark
- spark-catalyst_2.11
- 2.4.0.cloudera2
- compile
+ spark-core_2.11
+
+
+
+ org.apache.spark
+ spark-sql_2.11
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
index b7a70d607..3da8eb490 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
@@ -1,8 +1,5 @@
-package eu.dnetlib.pace.clustering;
-import eu.dnetlib.pace.common.AbstractPaceFunctions;
-import eu.dnetlib.pace.config.Config;
-import org.apache.commons.lang3.StringUtils;
+package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.HashSet;
@@ -10,32 +7,39 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
+import org.apache.commons.lang3.StringUtils;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Config;
+
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
protected Map params;
-
+
public AbstractClusteringFunction(final Map params) {
this.params = params;
}
protected abstract Collection doApply(Config conf, String s);
-
+
@Override
public Collection apply(Config conf, List fields) {
- return fields.stream().filter(f -> !f.isEmpty())
- .map(this::normalize)
- .map(s -> filterAllStopWords(s))
- .map(s -> doApply(conf, s))
- .map(c -> filterBlacklisted(c, ngramBlacklist))
- .flatMap(c -> c.stream())
- .filter(StringUtils::isNotBlank)
- .collect(Collectors.toCollection(HashSet::new));
+ return fields
+ .stream()
+ .filter(f -> !f.isEmpty())
+ .map(this::normalize)
+ .map(s -> filterAllStopWords(s))
+ .map(s -> doApply(conf, s))
+ .map(c -> filterBlacklisted(c, ngramBlacklist))
+ .flatMap(c -> c.stream())
+ .filter(StringUtils::isNotBlank)
+ .collect(Collectors.toCollection(HashSet::new));
}
public Map getParams() {
return params;
}
-
+
protected Integer param(String name) {
return params.get(name);
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
index d3008332d..9072fbb4b 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.pace.clustering;
import java.util.Collection;
@@ -6,6 +7,7 @@ import java.util.Set;
import java.util.StringTokenizer;
import com.google.common.collect.Sets;
+
import eu.dnetlib.pace.config.Config;
@ClusteringClass("acronyms")
@@ -19,16 +21,16 @@ public class Acronyms extends AbstractClusteringFunction {
protected Collection doApply(Config conf, String s) {
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
}
-
+
private Set extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) {
-
+
final Set acronyms = Sets.newLinkedHashSet();
-
+
for (int i = 0; i < maxAcronyms; i++) {
-
+
final StringTokenizer st = new StringTokenizer(s);
final StringBuilder sb = new StringBuilder();
-
+
while (st.hasMoreTokens()) {
final String token = st.nextToken();
if (sb.length() > maxLen) {
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java
index e67767171..3bb845b15 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.pace.clustering;
import java.lang.annotation.ElementType;
@@ -9,5 +10,5 @@ import java.lang.annotation.Target;
@Target(ElementType.TYPE)
public @interface ClusteringClass {
- public String value();
-}
\ No newline at end of file
+ public String value();
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
index 4660d2b6c..8b7852418 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
@@ -1,15 +1,16 @@
-package eu.dnetlib.pace.clustering;
-import eu.dnetlib.pace.config.Config;
+package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.List;
import java.util.Map;
+import eu.dnetlib.pace.config.Config;
+
public interface ClusteringFunction {
-
+
public Collection apply(Config config, List fields);
-
+
public Map getParams();
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
index 7f342f69c..bc8844aee 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.pace.clustering;
import java.util.Collection;
@@ -5,6 +6,7 @@ import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
+
import eu.dnetlib.pace.config.Config;
@ClusteringClass("immutablefieldvalue")
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java
index 73ba221c3..38299adb4 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java
@@ -1,50 +1,54 @@
-package eu.dnetlib.pace.clustering;
-import eu.dnetlib.pace.config.Config;
-import org.apache.commons.lang3.StringUtils;
+package eu.dnetlib.pace.clustering;
import java.util.*;
import java.util.stream.Collectors;
+import org.apache.commons.lang3.StringUtils;
+
+import eu.dnetlib.pace.config.Config;
+
@ClusteringClass("keywordsclustering")
public class KeywordsClustering extends AbstractClusteringFunction {
- public KeywordsClustering(Map params) {
- super(params);
- }
+ public KeywordsClustering(Map params) {
+ super(params);
+ }
- @Override
- protected Collection doApply(final Config conf, String s) {
+ @Override
+ protected Collection doApply(final Config conf, String s) {
- //takes city codes and keywords codes without duplicates
- Set keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
- Set cities = getCities(s, params.getOrDefault("windowSize", 4));
+ // takes city codes and keywords codes without duplicates
+ Set keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
+ Set cities = getCities(s, params.getOrDefault("windowSize", 4));
- //list of combination to return as result
- final Collection combinations = new LinkedHashSet();
+ // list of combination to return as result
+ final Collection combinations = new LinkedHashSet();
- for (String keyword: keywordsToCodes(keywords, conf.translationMap())){
- for (String city: citiesToCodes(cities)) {
- combinations.add(keyword+"-"+city);
- if (combinations.size()>=params.getOrDefault("max", 2)) {
- return combinations;
- }
- }
- }
+ for (String keyword : keywordsToCodes(keywords, conf.translationMap())) {
+ for (String city : citiesToCodes(cities)) {
+ combinations.add(keyword + "-" + city);
+ if (combinations.size() >= params.getOrDefault("max", 2)) {
+ return combinations;
+ }
+ }
+ }
- return combinations;
- }
+ return combinations;
+ }
- @Override
- public Collection apply(final Config conf, List fields) {
- return fields.stream().filter(f -> !f.isEmpty())
- .map(this::cleanup)
- .map(this::normalize)
- .map(s -> filterAllStopWords(s))
- .map(s -> doApply(conf, s))
- .map(c -> filterBlacklisted(c, ngramBlacklist))
- .flatMap(c -> c.stream())
- .filter(StringUtils::isNotBlank)
- .collect(Collectors.toCollection(HashSet::new));
- }
-}
\ No newline at end of file
+ @Override
+ public Collection apply(final Config conf, List fields) {
+ return fields
+ .stream()
+ .filter(f -> !f.isEmpty())
+ .map(this::cleanup)
+ .map(this::normalize)
+ .map(s -> filterAllStopWords(s))
+ .map(s -> doApply(conf, s))
+ .map(c -> filterBlacklisted(c, ngramBlacklist))
+ .flatMap(c -> c.stream())
+ .filter(StringUtils::isNotBlank)
+ .collect(Collectors.toCollection(HashSet::new));
+ }
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
index fa45ac909..5a385961a 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java
@@ -1,75 +1,79 @@
-package eu.dnetlib.pace.clustering;
-import com.google.common.collect.Lists;
-import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Person;
-import org.apache.commons.lang3.StringUtils;
+package eu.dnetlib.pace.clustering;
import java.util.*;
import java.util.stream.Collectors;
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Person;
+
@ClusteringClass("lnfi")
-public class LastNameFirstInitial extends AbstractClusteringFunction{
+public class LastNameFirstInitial extends AbstractClusteringFunction {
- private boolean DEFAULT_AGGRESSIVE = true;
+ private boolean DEFAULT_AGGRESSIVE = true;
- public LastNameFirstInitial(final Map params) {
- super(params);
- }
+ public LastNameFirstInitial(final Map params) {
+ super(params);
+ }
- @Override
- public Collection apply(Config conf, List fields) {
- return fields.stream().filter(f -> !f.isEmpty())
- .map(this::normalize)
- .map(s -> doApply(conf, s))
- .map(c -> filterBlacklisted(c, ngramBlacklist))
- .flatMap(c -> c.stream())
- .filter(StringUtils::isNotBlank)
- .collect(Collectors.toCollection(HashSet::new));
- }
+ @Override
+ public Collection apply(Config conf, List fields) {
+ return fields
+ .stream()
+ .filter(f -> !f.isEmpty())
+ .map(this::normalize)
+ .map(s -> doApply(conf, s))
+ .map(c -> filterBlacklisted(c, ngramBlacklist))
+ .flatMap(c -> c.stream())
+ .filter(StringUtils::isNotBlank)
+ .collect(Collectors.toCollection(HashSet::new));
+ }
- @Override
- protected String normalize(final String s) {
- return fixAliases(transliterate(nfd(unicodeNormalization(s))))
- // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
- .replaceAll("[^ \\w]+", "")
- .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
- .replaceAll("(\\p{Punct})+", " ")
- .replaceAll("(\\d)+", " ")
- .replaceAll("(\\n)+", " ")
- .trim();
- }
+ @Override
+ protected String normalize(final String s) {
+ return fixAliases(transliterate(nfd(unicodeNormalization(s))))
+ // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
+ // strings
+ .replaceAll("[^ \\w]+", "")
+ .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
+ .replaceAll("(\\p{Punct})+", " ")
+ .replaceAll("(\\d)+", " ")
+ .replaceAll("(\\n)+", " ")
+ .trim();
+ }
- @Override
- protected Collection doApply(final Config conf, final String s) {
+ @Override
+ protected Collection doApply(final Config conf, final String s) {
- final List res = Lists.newArrayList();
+ final List res = Lists.newArrayList();
- final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
+ final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive")
+ : DEFAULT_AGGRESSIVE);
- Person p = new Person(s, aggressive);
+ Person p = new Person(s, aggressive);
- if (p.isAccurate()) {
- String lastName = p.getNormalisedSurname().toLowerCase();
- String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1);
+ if (p.isAccurate()) {
+ String lastName = p.getNormalisedSurname().toLowerCase();
+ String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0, 1);
- res.add(firstInitial.concat(lastName));
- }
- else { // is not accurate, meaning it has no defined name and surname
- List fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
- if (fullname.size() == 1) {
- res.add(p.getNormalisedFullname().toLowerCase());
- }
- else if (fullname.size() == 2) {
- res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase());
- res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase());
- }
- else {
- res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase());
- res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase());
- }
- }
+ res.add(firstInitial.concat(lastName));
+ } else { // is not accurate, meaning it has no defined name and surname
+ List fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
+ if (fullname.size() == 1) {
+ res.add(p.getNormalisedFullname().toLowerCase());
+ } else if (fullname.size() == 2) {
+ res.add(fullname.get(0).substring(0, 1).concat(fullname.get(1)).toLowerCase());
+ res.add(fullname.get(1).substring(0, 1).concat(fullname.get(0)).toLowerCase());
+ } else {
+ res.add(fullname.get(0).substring(0, 1).concat(fullname.get(fullname.size() - 1)).toLowerCase());
+ res.add(fullname.get(fullname.size() - 1).substring(0, 1).concat(fullname.get(0)).toLowerCase());
+ }
+ }
- return res;
- }
-}
\ No newline at end of file
+ return res;
+ }
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
index d50a95008..a3a6c4881 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
@@ -1,14 +1,17 @@
-package eu.dnetlib.pace.clustering;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-import eu.dnetlib.pace.config.Config;
-import org.apache.commons.lang3.StringUtils;
+package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.List;
import java.util.Map;
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.pace.config.Config;
+
@ClusteringClass("lowercase")
public class LowercaseClustering extends AbstractClusteringFunction {
@@ -19,7 +22,7 @@ public class LowercaseClustering extends AbstractClusteringFunction {
@Override
public Collection apply(Config conf, List fields) {
Collection c = Sets.newLinkedHashSet();
- for(String f : fields) {
+ for (String f : fields) {
c.addAll(doApply(conf, f));
}
return c;
@@ -27,7 +30,7 @@ public class LowercaseClustering extends AbstractClusteringFunction {
@Override
protected Collection doApply(final Config conf, final String s) {
- if(StringUtils.isBlank(s)) {
+ if (StringUtils.isBlank(s)) {
return Lists.newArrayList();
}
return Lists.newArrayList(s.toLowerCase().trim());
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java
index 30d33629c..4c81e9a48 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.pace.clustering;
import java.util.Set;
@@ -11,7 +12,8 @@ public class NGramUtils extends AbstractPaceFunctions {
private static final int SIZE = 100;
- private static final Set stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
+ private static final Set stopwords = AbstractPaceFunctions
+ .loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
public static String cleanupForOrdering(String s) {
String result = NGRAMUTILS.filterStopWords(NGRAMUTILS.normalize(s), stopwords);
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
index fd7c17ec3..e42cabd8d 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.pace.clustering;
import java.util.Collection;
@@ -6,6 +7,7 @@ import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
+
import eu.dnetlib.pace.config.Config;
@ClusteringClass("ngrampairs")
@@ -32,7 +34,7 @@ public class NgramPairs extends Ngrams {
break;
}
res.add(ngrams.get(i) + ngrams.get(j));
- //System.out.println("-- " + concatNgrams);
+ // System.out.println("-- " + concatNgrams);
}
return res;
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
index 3af7e98e8..96c305a16 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
@@ -1,9 +1,10 @@
+
package eu.dnetlib.pace.clustering;
-import eu.dnetlib.pace.config.Config;
-
import java.util.*;
+import eu.dnetlib.pace.config.Config;
+
@ClusteringClass("ngrams")
public class Ngrams extends AbstractClusteringFunction {
@@ -44,7 +45,7 @@ public class Ngrams extends AbstractClusteringFunction {
}
}
}
- //System.out.println(ngrams + " n: " + ngrams.size());
+ // System.out.println(ngrams + " n: " + ngrams.size());
return ngrams;
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
index a5bad2075..b4a04ce65 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
@@ -1,16 +1,19 @@
-package eu.dnetlib.pace.clustering;
-import com.google.common.collect.Sets;
-import eu.dnetlib.pace.common.AbstractPaceFunctions;
-import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Person;
-import org.apache.commons.lang3.StringUtils;
+package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Person;
+
@ClusteringClass("personClustering")
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
@@ -30,7 +33,8 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
final Person person = new Person(f, false);
- if (StringUtils.isNotBlank(person.getNormalisedFirstName()) && StringUtils.isNotBlank(person.getNormalisedSurname())) {
+ if (StringUtils.isNotBlank(person.getNormalisedFirstName())
+ && StringUtils.isNotBlank(person.getNormalisedSurname())) {
hashes.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase());
} else {
for (final String token1 : tokens(f, MAX_TOKENS)) {
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
index f6c4fe07f..a3d58a9be 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.pace.clustering;
import java.util.Collection;
@@ -22,7 +23,8 @@ public class PersonHash extends AbstractClusteringFunction {
protected Collection doApply(final Config conf, final String s) {
final List res = Lists.newArrayList();
- final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
+ final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive")
+ : DEFAULT_AGGRESSIVE);
res.add(new Person(s, aggressive).hash());
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
index 86a2e4e4f..2aab926da 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
@@ -1,10 +1,11 @@
-package eu.dnetlib.pace.clustering;
-import eu.dnetlib.pace.config.Config;
+package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.Map;
+import eu.dnetlib.pace.config.Config;
+
public class RandomClusteringFunction extends AbstractClusteringFunction {
public RandomClusteringFunction(Map params) {
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
index 77c2c0155..5809d8216 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.pace.clustering;
import java.util.*;
@@ -5,6 +6,7 @@ import java.util.*;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
+
import eu.dnetlib.pace.config.Config;
@ClusteringClass("sortedngrampairs")
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
index 50cea4db3..392aecc79 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
@@ -1,15 +1,17 @@
+
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.List;
import java.util.Map;
-import eu.dnetlib.pace.config.Config;
import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Lists;
+import eu.dnetlib.pace.config.Config;
+
@ClusteringClass("spacetrimmingfieldvalue")
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
@@ -21,7 +23,10 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
protected Collection doApply(final Config conf, final String s) {
final List res = Lists.newArrayList();
- res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));
+ res
+ .add(
+ StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength"))
+ : s.toLowerCase().replaceAll("\\s+", ""));
return res;
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
index fa1f64362..2a1c023a9 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.pace.clustering;
import java.util.Collection;
@@ -5,6 +6,7 @@ import java.util.Map;
import java.util.Set;
import com.google.common.collect.Sets;
+
import eu.dnetlib.pace.config.Config;
@ClusteringClass("suffixprefix")
@@ -18,7 +20,7 @@ public class SuffixPrefix extends AbstractClusteringFunction {
protected Collection doApply(Config conf, String s) {
return suffixPrefix(s, param("len"), param("max"));
}
-
+
private Collection suffixPrefix(String s, int len, int max) {
final Set bigrams = Sets.newLinkedHashSet();
int i = 0;
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
index 235cec101..5b267ad10 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
@@ -1,7 +1,5 @@
-package eu.dnetlib.pace.clustering;
-import eu.dnetlib.pace.common.AbstractPaceFunctions;
-import eu.dnetlib.pace.config.Config;
+package eu.dnetlib.pace.clustering;
import java.net.MalformedURLException;
import java.net.URL;
@@ -11,42 +9,44 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
+import eu.dnetlib.pace.common.AbstractPaceFunctions;
+import eu.dnetlib.pace.config.Config;
+
@ClusteringClass("urlclustering")
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
- protected Map params;
+ protected Map params;
- public UrlClustering(final Map params) {
- this.params = params;
- }
+ public UrlClustering(final Map params) {
+ this.params = params;
+ }
- @Override
- public Collection apply(final Config conf, List fields) {
- try {
- return fields.stream()
- .filter(f -> !f.isEmpty())
- .map(this::asUrl)
- .map(URL::getHost)
- .collect(Collectors.toCollection(HashSet::new));
- }
- catch (IllegalStateException e){
- return new HashSet<>();
- }
- }
+ @Override
+ public Collection apply(final Config conf, List fields) {
+ try {
+ return fields
+ .stream()
+ .filter(f -> !f.isEmpty())
+ .map(this::asUrl)
+ .map(URL::getHost)
+ .collect(Collectors.toCollection(HashSet::new));
+ } catch (IllegalStateException e) {
+ return new HashSet<>();
+ }
+ }
- @Override
- public Map getParams() {
- return null;
- }
-
- private URL asUrl(String value) {
- try {
- return new URL(value);
- } catch (MalformedURLException e) {
- // should not happen as checked by pace typing
- throw new IllegalStateException("invalid URL: " + value);
- }
- }
+ @Override
+ public Map getParams() {
+ return null;
+ }
+ private URL asUrl(String value) {
+ try {
+ return new URL(value);
+ } catch (MalformedURLException e) {
+ // should not happen as checked by pace typing
+ throw new IllegalStateException("invalid URL: " + value);
+ }
+ }
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java
index 6fa2668fa..c8e02f8f0 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java
@@ -1,90 +1,91 @@
-package eu.dnetlib.pace.clustering;
-import com.google.common.collect.Sets;
-import eu.dnetlib.pace.config.Config;
+package eu.dnetlib.pace.clustering;
import java.util.*;
import java.util.stream.Collectors;
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.pace.config.Config;
+
@ClusteringClass("wordsStatsSuffixPrefixChain")
public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
- public WordsStatsSuffixPrefixChain(Map params) {
- super(params);
- }
+ public WordsStatsSuffixPrefixChain(Map params) {
+ super(params);
+ }
- @Override
- protected Collection doApply(Config conf, String s) {
- return suffixPrefixChain(s, param("mod"));
- }
+ @Override
+ protected Collection doApply(Config conf, String s) {
+ return suffixPrefixChain(s, param("mod"));
+ }
- private Collection suffixPrefixChain(String s, int mod) {
+ private Collection suffixPrefixChain(String s, int mod) {
- //create the list of words from the string (remove short words)
- List wordsList =
- Arrays.stream(s.split(" "))
- .filter(si -> si.length() > 3)
- .collect(Collectors.toList());
+ // create the list of words from the string (remove short words)
+ List wordsList = Arrays
+ .stream(s.split(" "))
+ .filter(si -> si.length() > 3)
+ .collect(Collectors.toList());
- final int words = wordsList.size();
- final int letters = s.length();
+ final int words = wordsList.size();
+ final int letters = s.length();
- //create the prefix: number of words + number of letters/mod
- String prefix = words + "-" + letters/mod + "-";
+ // create the prefix: number of words + number of letters/mod
+ String prefix = words + "-" + letters / mod + "-";
- return doSuffixPrefixChain(wordsList, prefix);
+ return doSuffixPrefixChain(wordsList, prefix);
- }
+ }
- private Collection doSuffixPrefixChain(List wordsList, String prefix) {
+ private Collection doSuffixPrefixChain(List wordsList, String prefix) {
- Set set = Sets.newLinkedHashSet();
- switch(wordsList.size()){
- case 0:
- case 1:
- break;
- case 2:
- set.add(
- prefix +
- suffix(wordsList.get(0), 3) +
- prefix(wordsList.get(1), 3)
- );
+ Set set = Sets.newLinkedHashSet();
+ switch (wordsList.size()) {
+ case 0:
+ case 1:
+ break;
+ case 2:
+ set
+ .add(
+ prefix +
+ suffix(wordsList.get(0), 3) +
+ prefix(wordsList.get(1), 3));
- set.add(
- prefix +
- prefix(wordsList.get(0), 3) +
- suffix(wordsList.get(1), 3)
- );
+ set
+ .add(
+ prefix +
+ prefix(wordsList.get(0), 3) +
+ suffix(wordsList.get(1), 3));
- break;
- default:
- set.add(
- prefix +
- suffix(wordsList.get(0), 3) +
- prefix(wordsList.get(1), 3) +
- suffix(wordsList.get(2), 3)
- );
+ break;
+ default:
+ set
+ .add(
+ prefix +
+ suffix(wordsList.get(0), 3) +
+ prefix(wordsList.get(1), 3) +
+ suffix(wordsList.get(2), 3));
- set.add(
- prefix +
- prefix(wordsList.get(0), 3) +
- suffix(wordsList.get(1), 3) +
- prefix(wordsList.get(2), 3)
- );
- break;
- }
+ set
+ .add(
+ prefix +
+ prefix(wordsList.get(0), 3) +
+ suffix(wordsList.get(1), 3) +
+ prefix(wordsList.get(2), 3));
+ break;
+ }
- return set;
+ return set;
- }
+ }
+ private String suffix(String s, int len) {
+ return s.substring(s.length() - len);
+ }
- private String suffix(String s, int len) {
- return s.substring(s.length()-len);
- }
-
- private String prefix(String s, int len) {
- return s.substring(0, len);
- }
+ private String prefix(String s, int len) {
+ return s.substring(0, len);
+ }
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java
index 1e94b34d2..e606590a5 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.pace.clustering;
import java.util.Collection;
@@ -5,53 +6,54 @@ import java.util.Map;
import java.util.Set;
import com.google.common.collect.Sets;
+
import eu.dnetlib.pace.config.Config;
@ClusteringClass("wordssuffixprefix")
public class WordsSuffixPrefix extends AbstractClusteringFunction {
- public WordsSuffixPrefix(Map params) {
- super(params);
- }
+ public WordsSuffixPrefix(Map params) {
+ super(params);
+ }
- @Override
- protected Collection doApply(Config conf, String s) {
- return suffixPrefix(s, param("len"), param("max"));
- }
+ @Override
+ protected Collection doApply(Config conf, String s) {
+ return suffixPrefix(s, param("len"), param("max"));
+ }
- private Collection suffixPrefix(String s, int len, int max) {
+ private Collection suffixPrefix(String s, int len, int max) {
- final int words = s.split(" ").length;
+ final int words = s.split(" ").length;
- // adjust the token length according to the number of words
- switch (words) {
- case 1:
- return Sets.newLinkedHashSet();
- case 2:
- return doSuffixPrefix(s, len+2, max, words);
- case 3:
- return doSuffixPrefix(s, len+1, max, words);
- default:
- return doSuffixPrefix(s, len, max, words);
- }
- }
+ // adjust the token length according to the number of words
+ switch (words) {
+ case 1:
+ return Sets.newLinkedHashSet();
+ case 2:
+ return doSuffixPrefix(s, len + 2, max, words);
+ case 3:
+ return doSuffixPrefix(s, len + 1, max, words);
+ default:
+ return doSuffixPrefix(s, len, max, words);
+ }
+ }
- private Collection doSuffixPrefix(String s, int len, int max, int words) {
- final Set bigrams = Sets.newLinkedHashSet();
- int i = 0;
- while (++i < s.length() && bigrams.size() < max) {
- int j = s.indexOf(" ", i);
+ private Collection doSuffixPrefix(String s, int len, int max, int words) {
+ final Set bigrams = Sets.newLinkedHashSet();
+ int i = 0;
+ while (++i < s.length() && bigrams.size() < max) {
+ int j = s.indexOf(" ", i);
- int offset = j + len + 1 < s.length() ? j + len + 1 : s.length();
+ int offset = j + len + 1 < s.length() ? j + len + 1 : s.length();
- if (j - len > 0) {
- String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
- if (bigram.length() >= 4) {
- bigrams.add(words+bigram);
- }
- }
- }
- return bigrams;
- }
+ if (j - len > 0) {
+ String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
+ if (bigram.length() >= 4) {
+ bigrams.add(words + bigram);
+ }
+ }
+ }
+ return bigrams;
+ }
-}
\ No newline at end of file
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
index 3b80bfcd1..06a955ba5 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@@ -1,14 +1,5 @@
-package eu.dnetlib.pace.common;
-import com.google.common.base.Joiner;
-import com.google.common.base.Splitter;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-import com.ibm.icu.text.Transliterator;
-import eu.dnetlib.pace.clustering.NGramUtils;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
+package eu.dnetlib.pace.common;
import java.io.IOException;
import java.io.StringWriter;
@@ -19,6 +10,18 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+
+import com.google.common.base.Joiner;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import com.ibm.icu.text.Transliterator;
+
+import eu.dnetlib.pace.clustering.NGramUtils;
+
/**
* Set of common functions for the framework
*
@@ -26,321 +29,325 @@ import java.util.stream.Collectors;
*/
public abstract class AbstractPaceFunctions {
- //city map to be used when translating the city names into codes
- private static Map cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
+ // city map to be used when translating the city names into codes
+ private static Map cityMap = AbstractPaceFunctions
+ .loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
- //list of stopwords in different languages
- protected static Set stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
- protected static Set stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
- protected static Set stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
- protected static Set stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
- protected static Set stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
- protected static Set stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
- protected static Set stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
+ // list of stopwords in different languages
+ protected static Set stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
+ protected static Set stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
+ protected static Set stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
+ protected static Set stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
+ protected static Set stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
+ protected static Set stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
+ protected static Set stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
- //transliterator
- protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
+ // transliterator
+ protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
- //blacklist of ngrams: to avoid generic keys
- protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
+ // blacklist of ngrams: to avoid generic keys
+ protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
- //html regex for normalization
- public final String HTML_REGEX = "<[^>]*>";
+ // html regex for normalization
+ public final String HTML_REGEX = "<[^>]*>";
- private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
- private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
- private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
+ private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
+ private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
+ private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
- //doi prefix for normalization
- public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
+ // doi prefix for normalization
+ public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
- private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
+ private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
- private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
+ private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
- protected String concat(final List l) {
- return Joiner.on(" ").skipNulls().join(l);
- }
+ protected String concat(final List l) {
+ return Joiner.on(" ").skipNulls().join(l);
+ }
- protected String cleanup(final String s) {
+ protected String cleanup(final String s) {
- final String s1 = s.replaceAll(HTML_REGEX, "");
- final String s2 = unicodeNormalization(s1.toLowerCase());
- final String s3 = nfd(s2);
- final String s4 = fixXML(s3);
- final String s5 = s4.replaceAll("([0-9]+)", " $1 ");
- final String s6 = transliterate(s5);
- final String s7 = fixAliases(s6);
- final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
- final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
- final String s10 = s9.replaceAll("\\n", " ");
- final String s11 = s10.replaceAll("(?m)\\s+", " ");
- final String s12 = s11.trim();
- return s12;
- }
+ final String s1 = s.replaceAll(HTML_REGEX, "");
+ final String s2 = unicodeNormalization(s1.toLowerCase());
+ final String s3 = nfd(s2);
+ final String s4 = fixXML(s3);
+ final String s5 = s4.replaceAll("([0-9]+)", " $1 ");
+ final String s6 = transliterate(s5);
+ final String s7 = fixAliases(s6);
+ final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
+ final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
+ final String s10 = s9.replaceAll("\\n", " ");
+ final String s11 = s10.replaceAll("(?m)\\s+", " ");
+ final String s12 = s11.trim();
+ return s12;
+ }
- protected String fixXML(final String a){
+ protected String fixXML(final String a) {
- return a.replaceAll("–", " ")
- .replaceAll("&", " ")
- .replaceAll(""", " ")
- .replaceAll("−", " ");
- }
+ return a
+ .replaceAll("–", " ")
+ .replaceAll("&", " ")
+ .replaceAll(""", " ")
+ .replaceAll("−", " ");
+ }
- protected boolean checkNumbers(final String a, final String b) {
- final String numbersA = getNumbers(a);
- final String numbersB = getNumbers(b);
- final String romansA = getRomans(a);
- final String romansB = getRomans(b);
- return !numbersA.equals(numbersB) || !romansA.equals(romansB);
- }
+ protected boolean checkNumbers(final String a, final String b) {
+ final String numbersA = getNumbers(a);
+ final String numbersB = getNumbers(b);
+ final String romansA = getRomans(a);
+ final String romansB = getRomans(b);
+ return !numbersA.equals(numbersB) || !romansA.equals(romansB);
+ }
- protected String getRomans(final String s) {
- final StringBuilder sb = new StringBuilder();
- for (final String t : s.split(" ")) {
- sb.append(isRoman(t) ? t : "");
- }
- return sb.toString();
- }
+ protected String getRomans(final String s) {
+ final StringBuilder sb = new StringBuilder();
+ for (final String t : s.split(" ")) {
+ sb.append(isRoman(t) ? t : "");
+ }
+ return sb.toString();
+ }
- protected boolean isRoman(final String s) {
- return s.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop").equals("qwertyuiop");
- }
+ protected boolean isRoman(final String s) {
+ return s
+ .replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop")
+ .equals("qwertyuiop");
+ }
- protected String getNumbers(final String s) {
- final StringBuilder sb = new StringBuilder();
- for (final String t : s.split(" ")) {
- sb.append(isNumber(t) ? t : "");
- }
- return sb.toString();
- }
+ protected String getNumbers(final String s) {
+ final StringBuilder sb = new StringBuilder();
+ for (final String t : s.split(" ")) {
+ sb.append(isNumber(t) ? t : "");
+ }
+ return sb.toString();
+ }
- public boolean isNumber(String strNum) {
- if (strNum == null) {
- return false;
- }
- return numberPattern.matcher(strNum).matches();
- }
+ public boolean isNumber(String strNum) {
+ if (strNum == null) {
+ return false;
+ }
+ return numberPattern.matcher(strNum).matches();
+ }
- protected static String fixAliases(final String s) {
- final StringBuilder sb = new StringBuilder();
- for (final char ch : Lists.charactersOf(s)) {
- final int i = StringUtils.indexOf(aliases_from, ch);
- sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
- }
- return sb.toString();
- }
+ protected static String fixAliases(final String s) {
+ final StringBuilder sb = new StringBuilder();
+ for (final char ch : Lists.charactersOf(s)) {
+ final int i = StringUtils.indexOf(aliases_from, ch);
+ sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
+ }
+ return sb.toString();
+ }
- protected static String transliterate(final String s) {
- try {
- return transliterator.transliterate(s);
- }
- catch(Exception e) {
- return s;
- }
- }
+ protected static String transliterate(final String s) {
+ try {
+ return transliterator.transliterate(s);
+ } catch (Exception e) {
+ return s;
+ }
+ }
- protected String removeSymbols(final String s) {
- final StringBuilder sb = new StringBuilder();
+ protected String removeSymbols(final String s) {
+ final StringBuilder sb = new StringBuilder();
- for (final char ch : Lists.charactersOf(s)) {
- sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
- }
- return sb.toString().replaceAll("\\s+", " ");
- }
-
- protected boolean notNull(final String s) {
- return s != null;
- }
+ for (final char ch : Lists.charactersOf(s)) {
+ sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
+ }
+ return sb.toString().replaceAll("\\s+", " ");
+ }
- protected String normalize(final String s) {
- return fixAliases(transliterate(nfd(unicodeNormalization(s))))
- .toLowerCase()
- // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
- .replaceAll("[^ \\w]+", "")
- .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
- .replaceAll("(\\p{Punct})+", " ")
- .replaceAll("(\\d)+", " ")
- .replaceAll("(\\n)+", " ")
- .trim();
- }
+ protected boolean notNull(final String s) {
+ return s != null;
+ }
- public String nfd(final String s) {
- return Normalizer.normalize(s, Normalizer.Form.NFD);
- }
+ protected String normalize(final String s) {
+ return fixAliases(transliterate(nfd(unicodeNormalization(s))))
+ .toLowerCase()
+ // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
+ // strings
+ .replaceAll("[^ \\w]+", "")
+ .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
+ .replaceAll("(\\p{Punct})+", " ")
+ .replaceAll("(\\d)+", " ")
+ .replaceAll("(\\n)+", " ")
+ .trim();
+ }
- public String utf8(final String s) {
- byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
- return new String(bytes, StandardCharsets.UTF_8);
- }
+ public String nfd(final String s) {
+ return Normalizer.normalize(s, Normalizer.Form.NFD);
+ }
- public String unicodeNormalization(final String s) {
+ public String utf8(final String s) {
+ byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
+ return new String(bytes, StandardCharsets.UTF_8);
+ }
- Matcher m = hexUnicodePattern.matcher(s);
- StringBuffer buf = new StringBuffer(s.length());
- while (m.find()) {
- String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
- m.appendReplacement(buf, Matcher.quoteReplacement(ch));
- }
- m.appendTail(buf);
- return buf.toString();
- }
+ public String unicodeNormalization(final String s) {
- protected String filterStopWords(final String s, final Set stopwords) {
- final StringTokenizer st = new StringTokenizer(s);
- final StringBuilder sb = new StringBuilder();
- while (st.hasMoreTokens()) {
- final String token = st.nextToken();
- if (!stopwords.contains(token)) {
- sb.append(token);
- sb.append(" ");
- }
- }
- return sb.toString().trim();
- }
+ Matcher m = hexUnicodePattern.matcher(s);
+ StringBuffer buf = new StringBuffer(s.length());
+ while (m.find()) {
+ String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
+ m.appendReplacement(buf, Matcher.quoteReplacement(ch));
+ }
+ m.appendTail(buf);
+ return buf.toString();
+ }
- public String filterAllStopWords(String s) {
+ protected String filterStopWords(final String s, final Set stopwords) {
+ final StringTokenizer st = new StringTokenizer(s);
+ final StringBuilder sb = new StringBuilder();
+ while (st.hasMoreTokens()) {
+ final String token = st.nextToken();
+ if (!stopwords.contains(token)) {
+ sb.append(token);
+ sb.append(" ");
+ }
+ }
+ return sb.toString().trim();
+ }
- s = filterStopWords(s, stopwords_en);
- s = filterStopWords(s, stopwords_de);
- s = filterStopWords(s, stopwords_it);
- s = filterStopWords(s, stopwords_fr);
- s = filterStopWords(s, stopwords_pt);
- s = filterStopWords(s, stopwords_es);
- s = filterStopWords(s, stopwords_gr);
+ public String filterAllStopWords(String s) {
- return s;
- }
+ s = filterStopWords(s, stopwords_en);
+ s = filterStopWords(s, stopwords_de);
+ s = filterStopWords(s, stopwords_it);
+ s = filterStopWords(s, stopwords_fr);
+ s = filterStopWords(s, stopwords_pt);
+ s = filterStopWords(s, stopwords_es);
+ s = filterStopWords(s, stopwords_gr);
- protected Collection filterBlacklisted(final Collection set, final Set ngramBlacklist) {
- final Set newset = Sets.newLinkedHashSet();
- for (final String s : set) {
- if (!ngramBlacklist.contains(s)) {
- newset.add(s);
- }
- }
- return newset;
- }
+ return s;
+ }
- public static Set loadFromClasspath(final String classpath) {
+ protected Collection filterBlacklisted(final Collection set, final Set ngramBlacklist) {
+ final Set newset = Sets.newLinkedHashSet();
+ for (final String s : set) {
+ if (!ngramBlacklist.contains(s)) {
+ newset.add(s);
+ }
+ }
+ return newset;
+ }
- Transliterator transliterator = Transliterator.getInstance("Any-Eng");
+ public static Set loadFromClasspath(final String classpath) {
- final Set h = Sets.newHashSet();
- try {
- for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
- h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords
- }
- } catch (final Throwable e) {
- return Sets.newHashSet();
- }
- return h;
- }
+ Transliterator transliterator = Transliterator.getInstance("Any-Eng");
- public static Map loadMapFromClasspath(final String classpath) {
+ final Set h = Sets.newHashSet();
+ try {
+ for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
+ h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
+ }
+ } catch (final Throwable e) {
+ return Sets.newHashSet();
+ }
+ return h;
+ }
- Transliterator transliterator = Transliterator.getInstance("Any-Eng");
+ public static Map loadMapFromClasspath(final String classpath) {
- final Map m = new HashMap<>();
- try {
- for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
- //string is like this: code;word1;word2;word3
- String[] line = s.split(";");
- String value = line[0];
- for (int i = 1; i < line.length; i++) {
- m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value);
- }
- }
- } catch (final Throwable e) {
- return new HashMap<>();
- }
- return m;
- }
+ Transliterator transliterator = Transliterator.getInstance("Any-Eng");
- public String removeKeywords(String s, Set keywords) {
+ final Map m = new HashMap<>();
+ try {
+ for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
+ // string is like this: code;word1;word2;word3
+ String[] line = s.split(";");
+ String value = line[0];
+ for (int i = 1; i < line.length; i++) {
+ m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value);
+ }
+ }
+ } catch (final Throwable e) {
+ return new HashMap<>();
+ }
+ return m;
+ }
- s = " " + s + " ";
- for (String k : keywords) {
- s = s.replaceAll(k.toLowerCase(), "");
- }
+ public String removeKeywords(String s, Set keywords) {
- return s.trim();
- }
+ s = " " + s + " ";
+ for (String k : keywords) {
+ s = s.replaceAll(k.toLowerCase(), "");
+ }
- public double commonElementsPercentage(Set s1, Set s2) {
+ return s.trim();
+ }
- double longer = Math.max(s1.size(), s2.size());
- return (double) s1.stream().filter(s2::contains).count() / longer;
- }
+ public double commonElementsPercentage(Set s1, Set s2) {
- //convert the set of keywords to codes
- public Set toCodes(Set keywords, Map translationMap) {
- return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
- }
+ double longer = Math.max(s1.size(), s2.size());
+ return (double) s1.stream().filter(s2::contains).count() / longer;
+ }
- public Set keywordsToCodes(Set keywords, Map translationMap) {
- return toCodes(keywords, translationMap);
- }
+ // convert the set of keywords to codes
+ public Set toCodes(Set keywords, Map translationMap) {
+ return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
+ }
- public Set citiesToCodes(Set keywords) {
- return toCodes(keywords, cityMap);
- }
+ public Set keywordsToCodes(Set keywords, Map translationMap) {
+ return toCodes(keywords, translationMap);
+ }
- protected String firstLC(final String s) {
- return StringUtils.substring(s, 0, 1).toLowerCase();
- }
+ public Set citiesToCodes(Set keywords) {
+ return toCodes(keywords, cityMap);
+ }
- protected Iterable tokens(final String s, final int maxTokens) {
- return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
- }
+ protected String firstLC(final String s) {
+ return StringUtils.substring(s, 0, 1).toLowerCase();
+ }
- public String normalizePid(String pid) {
- return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
- }
+ protected Iterable tokens(final String s, final int maxTokens) {
+ return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
+ }
- //get the list of keywords into the input string
- public Set getKeywords(String s1, Map translationMap, int windowSize) {
+ public String normalizePid(String pid) {
+ return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
+ }
- String s = s1;
+ // get the list of keywords into the input string
+ public Set getKeywords(String s1, Map translationMap, int windowSize) {
- List tokens = Arrays.asList(s.toLowerCase().split(" "));
+ String s = s1;
- Set codes = new HashSet<>();
+ List tokens = Arrays.asList(s.toLowerCase().split(" "));
- if (tokens.size() < windowSize)
- windowSize = tokens.size();
+ Set codes = new HashSet<>();
- int length = windowSize;
+ if (tokens.size() < windowSize)
+ windowSize = tokens.size();
- while (length != 0) {
+ int length = windowSize;
- for (int i = 0; i <= tokens.size() - length; i++) {
- String candidate = concat(tokens.subList(i, i + length));
- if (translationMap.containsKey(candidate)) {
- codes.add(candidate);
- s = s.replace(candidate, "").trim();
- }
- }
+ while (length != 0) {
- tokens = Arrays.asList(s.split(" "));
- length -= 1;
- }
+ for (int i = 0; i <= tokens.size() - length; i++) {
+ String candidate = concat(tokens.subList(i, i + length));
+ if (translationMap.containsKey(candidate)) {
+ codes.add(candidate);
+ s = s.replace(candidate, "").trim();
+ }
+ }
- return codes;
- }
+ tokens = Arrays.asList(s.split(" "));
+ length -= 1;
+ }
- public Set getCities(String s1, int windowSize) {
- return getKeywords(s1, cityMap, windowSize);
- }
+ return codes;
+ }
- public static String readFromClasspath(final String filename, final Class clazz) {
- final StringWriter sw = new StringWriter();
- try {
- IOUtils.copy(clazz.getResourceAsStream(filename), sw);
- return sw.toString();
- } catch (final IOException e) {
- throw new RuntimeException("cannot load resource from classpath: " + filename);
- }
- }
+ public Set getCities(String s1, int windowSize) {
+ return getKeywords(s1, cityMap, windowSize);
+ }
+
+ public static String readFromClasspath(final String filename, final Class clazz) {
+ final StringWriter sw = new StringWriter();
+ try {
+ IOUtils.copy(clazz.getResourceAsStream(filename), sw);
+ return sw.toString();
+ } catch (final IOException e) {
+ throw new RuntimeException("cannot load resource from classpath: " + filename);
+ }
+ }
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
index 0db0270fb..00faff0bd 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.pace.config;
import java.util.List;
@@ -44,7 +45,6 @@ public interface Config {
*/
public Map> blacklists();
-
/**
* Translation map.
*
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
index 63fc96aef..eeec68ae6 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java
@@ -1,16 +1,5 @@
-package eu.dnetlib.pace.config;
-import com.fasterxml.jackson.annotation.JsonIgnore;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.google.common.collect.Maps;
-import eu.dnetlib.pace.model.ClusteringDef;
-import eu.dnetlib.pace.model.FieldDef;
-import eu.dnetlib.pace.util.PaceException;
-import org.antlr.stringtemplate.StringTemplate;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.lang3.StringUtils;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
+package eu.dnetlib.pace.config;
import java.io.IOException;
import java.io.Serializable;
@@ -25,139 +14,167 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.stream.Collectors;
+import org.antlr.stringtemplate.StringTemplate;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Maps;
+
+import eu.dnetlib.pace.model.ClusteringDef;
+import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
-
+import eu.dnetlib.pace.util.PaceException;
public class DedupConfig implements Config, Serializable {
- private static String CONFIG_TEMPLATE = "dedupConfig.st";
+ private static String CONFIG_TEMPLATE = "dedupConfig.st";
- private PaceConfig pace;
+ private PaceConfig pace;
- private WfConfig wf;
+ private WfConfig wf;
- @JsonIgnore
- private Map> blacklists;
+ @JsonIgnore
+ private Map> blacklists;
- private static Map defaults = Maps.newHashMap();
+ private static Map defaults = Maps.newHashMap();
- static {
- defaults.put("dedupRun", "001");
- defaults.put("entityType", "result");
- defaults.put("subEntityType", "resulttype");
- defaults.put("subEntityValue", "publication");
- defaults.put("orderField", "title");
- defaults.put("queueMaxSize", "2000");
- defaults.put("groupMaxSize", "10");
- defaults.put("slidingWindowSize", "200");
- defaults.put("rootBuilder", "result");
- defaults.put("includeChildren", "true");
- defaults.put("maxIterations", "20");
- defaults.put("idPath", "$.id");
- }
+ static {
+ defaults.put("dedupRun", "001");
+ defaults.put("entityType", "result");
+ defaults.put("subEntityType", "resulttype");
+ defaults.put("subEntityValue", "publication");
+ defaults.put("orderField", "title");
+ defaults.put("queueMaxSize", "2000");
+ defaults.put("groupMaxSize", "10");
+ defaults.put("slidingWindowSize", "200");
+ defaults.put("rootBuilder", "result");
+ defaults.put("includeChildren", "true");
+ defaults.put("maxIterations", "20");
+ defaults.put("idPath", "$.id");
+ }
- public DedupConfig() {
- }
+ public DedupConfig() {
+ }
- public static DedupConfig load(final String json) {
+ public static DedupConfig load(final String json) {
- final DedupConfig config;
- try {
- config = new ObjectMapper().readValue(json, DedupConfig.class);
- config.getPace().initModel();
- config.getPace().initTranslationMap();
+ final DedupConfig config;
+ try {
+ config = new ObjectMapper().readValue(json, DedupConfig.class);
+ config.getPace().initModel();
+ config.getPace().initTranslationMap();
- config.blacklists = config.getPace().getBlacklists().entrySet()
- .stream()
- .map(e -> new AbstractMap.SimpleEntry>(e.getKey(), e.getValue().stream().filter(s -> !StringUtils.isBlank(s)).map(Pattern::compile).collect(Collectors.toList())))
- .collect(Collectors.toMap(e -> e.getKey(),
- e -> (Predicate & Serializable) s -> e.getValue().stream().filter(p -> p.matcher(s).matches()).findFirst().isPresent()))
+ config.blacklists = config
+ .getPace()
+ .getBlacklists()
+ .entrySet()
+ .stream()
+ .map(
+ e -> new AbstractMap.SimpleEntry>(e.getKey(),
+ e
+ .getValue()
+ .stream()
+ .filter(s -> !StringUtils.isBlank(s))
+ .map(Pattern::compile)
+ .collect(Collectors.toList())))
+ .collect(
+ Collectors
+ .toMap(
+ e -> e.getKey(),
+ e -> (Predicate & Serializable) s -> e
+ .getValue()
+ .stream()
+ .filter(p -> p.matcher(s).matches())
+ .findFirst()
+ .isPresent()))
- ;
+ ;
- return config;
- } catch (IOException |
- PatternSyntaxException e) {
- throw new PaceException("Error in parsing configuration json", e);
- }
+ return config;
+ } catch (IOException | PatternSyntaxException e) {
+ throw new PaceException("Error in parsing configuration json", e);
+ }
- }
+ }
- public static DedupConfig loadDefault() throws IOException {
- return loadDefault(new HashMap());
- }
+ public static DedupConfig loadDefault() throws IOException {
+ return loadDefault(new HashMap());
+ }
- public static DedupConfig loadDefault(final Map params) throws IOException {
+ public static DedupConfig loadDefault(final Map params) throws IOException {
- final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE));
+ final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE));
- for (final Entry e : defaults.entrySet()) {
- template.setAttribute(e.getKey(), e.getValue());
- }
- for (final Entry e : params.entrySet()) {
- if (template.getAttribute(e.getKey()) != null) {
- template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue());
- } else {
- template.setAttribute(e.getKey(), e.getValue());
- }
- }
+ for (final Entry e : defaults.entrySet()) {
+ template.setAttribute(e.getKey(), e.getValue());
+ }
+ for (final Entry e : params.entrySet()) {
+ if (template.getAttribute(e.getKey()) != null) {
+ template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue());
+ } else {
+ template.setAttribute(e.getKey(), e.getValue());
+ }
+ }
- final String json = template.toString();
- return load(json);
- }
+ final String json = template.toString();
+ return load(json);
+ }
- private String readFromClasspath(final String resource) throws IOException {
- return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8);
- }
+ private String readFromClasspath(final String resource) throws IOException {
+ return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8);
+ }
- public PaceConfig getPace() {
- return pace;
- }
+ public PaceConfig getPace() {
+ return pace;
+ }
- public void setPace(final PaceConfig pace) {
- this.pace = pace;
- }
+ public void setPace(final PaceConfig pace) {
+ this.pace = pace;
+ }
- public WfConfig getWf() {
- return wf;
- }
+ public WfConfig getWf() {
+ return wf;
+ }
- public void setWf(final WfConfig wf) {
- this.wf = wf;
- }
+ public void setWf(final WfConfig wf) {
+ this.wf = wf;
+ }
- @Override
- public String toString() {
- try {
- return new ObjectMapper().writeValueAsString(this);
- } catch (IOException e) {
- throw new PaceException("unable to serialise configuration", e);
- }
- }
+ @Override
+ public String toString() {
+ try {
+ return new ObjectMapper().writeValueAsString(this);
+ } catch (IOException e) {
+ throw new PaceException("unable to serialise configuration", e);
+ }
+ }
- @Override
- public Map decisionTree() {
- return getPace().getDecisionTree();
- }
+ @Override
+ public Map decisionTree() {
+ return getPace().getDecisionTree();
+ }
- @Override
- public List model() {
- return getPace().getModel();
- }
+ @Override
+ public List model() {
+ return getPace().getModel();
+ }
- @Override
- public List clusterings() {
- return getPace().getClustering();
- }
+ @Override
+ public List clusterings() {
+ return getPace().getClustering();
+ }
- @Override
- public Map> blacklists() {
- return blacklists;
- }
+ @Override
+ public Map> blacklists() {
+ return blacklists;
+ }
- @Override
- public Map translationMap() {
- return getPace().translationMap();
- }
+ @Override
+ public Map translationMap() {
+ return getPace().translationMap();
+ }
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
index dc87a1b06..f1bc49f4a 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java
@@ -1,19 +1,20 @@
+
package eu.dnetlib.pace.config;
+import java.io.Serializable;
+import java.util.List;
+import java.util.Map;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.google.common.collect.Maps;
import com.ibm.icu.text.Transliterator;
+
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceResolver;
-import java.io.Serializable;
-import java.util.List;
-import java.util.Map;
-
public class PaceConfig extends AbstractPaceFunctions implements Serializable {
private List model;
@@ -37,7 +38,8 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable {
@JsonIgnore
public static PaceResolver resolver = new PaceResolver();
- public PaceConfig() {}
+ public PaceConfig() {
+ }
public void initModel() {
modelMap = Maps.newHashMap();
@@ -46,20 +48,21 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable {
}
}
- public void initTranslationMap(){
+ public void initTranslationMap() {
translationMap = Maps.newHashMap();
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
for (String key : synonyms.keySet()) {
- for (String term : synonyms.get(key)){
- translationMap.put(
+ for (String term : synonyms.get(key)) {
+ translationMap
+ .put(
fixAliases(transliterator.transliterate(term.toLowerCase())),
- key);
+ key);
}
}
}
- public Map translationMap(){
+ public Map translationMap() {
return translationMap;
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java
index 20981c427..9f3323edc 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.pace.config;
public enum Type {
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java
index 78fc18a13..8dea04232 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java
@@ -1,10 +1,5 @@
-package eu.dnetlib.pace.config;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Sets;
-import eu.dnetlib.pace.util.PaceException;
-import org.apache.commons.lang3.StringUtils;
+package eu.dnetlib.pace.config;
import java.io.IOException;
import java.io.Serializable;
@@ -12,6 +7,13 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import org.apache.commons.lang3.StringUtils;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.pace.util.PaceException;
public class WfConfig implements Serializable {
@@ -76,7 +78,6 @@ public class WfConfig implements Serializable {
/** Maximum number of allowed children. */
private int maxChildren = MAX_CHILDREN;
-
/** Default maximum number of iterations. */
private final static int MAX_ITERATIONS = 20;
@@ -84,9 +85,10 @@ public class WfConfig implements Serializable {
private int maxIterations = MAX_ITERATIONS;
/** The Jquery path to retrieve the identifier */
- private String idPath = "$.id";
+ private String idPath = "$.id";
- public WfConfig() {}
+ public WfConfig() {
+ }
/**
* Instantiates a new dedup config.
@@ -114,8 +116,10 @@ public class WfConfig implements Serializable {
* @param idPath
* the path for the id of the entity
*/
- public WfConfig(final String entityType, final String orderField, final List rootBuilder, final String dedupRun,
- final Set skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations, final String idPath) {
+ public WfConfig(final String entityType, final String orderField, final List rootBuilder,
+ final String dedupRun,
+ final Set skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize,
+ final boolean includeChildren, final int maxIterations, final String idPath) {
super();
this.entityType = entityType;
this.orderField = orderField;
@@ -257,7 +261,6 @@ public class WfConfig implements Serializable {
this.maxChildren = maxChildren;
}
-
public int getMaxIterations() {
return maxIterations;
}
@@ -277,7 +280,6 @@ public class WfConfig implements Serializable {
/*
* (non-Javadoc)
- *
* @see java.lang.Object#toString()
*/
@Override
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java
index c15885ecf..d9ad81d42 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java
@@ -1,15 +1,16 @@
-package eu.dnetlib.pace.model;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.pace.clustering.ClusteringFunction;
-import eu.dnetlib.pace.config.PaceConfig;
-import eu.dnetlib.pace.util.PaceException;
+package eu.dnetlib.pace.model;
import java.io.IOException;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.pace.clustering.ClusteringFunction;
+import eu.dnetlib.pace.config.PaceConfig;
+import eu.dnetlib.pace.util.PaceException;
public class ClusteringDef implements Serializable {
@@ -19,7 +20,8 @@ public class ClusteringDef implements Serializable {
private Map params;
- public ClusteringDef() {}
+ public ClusteringDef() {
+ }
public String getName() {
return name;
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
index 196ac7248..f34545e6d 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
@@ -1,13 +1,15 @@
+
package eu.dnetlib.pace.model;
+import java.io.Serializable;
+import java.util.List;
+
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
-import eu.dnetlib.pace.config.Type;
-import java.io.Serializable;
-import java.util.List;
+import eu.dnetlib.pace.config.Type;
/**
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm.
@@ -34,7 +36,8 @@ public class FieldDef implements Serializable {
*/
private int length = -1;
- public FieldDef() {}
+ public FieldDef() {
+ }
public String getName() {
return name;
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
index 543b1bdfe..96120cf4d 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.pace.model;
import java.nio.charset.Charset;
@@ -43,7 +44,7 @@ public class Person {
// s = s.replaceAll("[\\W&&[^,-]]", "");
}
- if (s.contains(",")) { //if the name contains a comma it is easy derivable the name and the surname
+ if (s.contains(",")) { // if the name contains a comma it is easy derivable the name and the surname
final String[] arr = s.split(",");
if (arr.length == 1) {
fullname = splitTerms(arr[0]);
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java
index a900a6082..1f8aab4bf 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java
@@ -1,3 +1,4 @@
+
package eu.dnetlib.pace.model;
import java.util.ArrayList;
@@ -57,7 +58,7 @@ public class PersonComparatorUtils {
private static boolean verifyNames(List list1, List list2) {
return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2))
- && verifySimilarity(extractInitials(list1), extractInitials(list2));
+ && verifySimilarity(extractInitials(list1), extractInitials(list2));
}
private static boolean verifySurnames(List list1, List list2) {
@@ -76,7 +77,7 @@ public class PersonComparatorUtils {
Collections.sort(list1);
Collections.sort(list2);
return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2))
- && verifySimilarity(extractInitials(list1), extractInitials(list2));
+ && verifySimilarity(extractInitials(list1), extractInitials(list2));
}
private static List extractExtendedNames(List list) {
@@ -107,7 +108,7 @@ public class PersonComparatorUtils {
for (String s : list1) {
int curr = list2.indexOf(s);
if (curr > pos) {
- list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm"
+ list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm"
pos = curr;
} else {
return false;
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java
index 3926b2897..42c226f87 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java
@@ -1,9 +1,11 @@
+
package eu.dnetlib.pace.model;
-import eu.dnetlib.pace.clustering.NGramUtils;
+import java.util.Comparator;
+
import org.apache.spark.sql.Row;
-import java.util.Comparator;
+import eu.dnetlib.pace.clustering.NGramUtils;
/**
* The Class MapDocumentComparator.
@@ -25,13 +27,12 @@ public class RowDataOrderingComparator implements Comparator {
/*
* (non-Javadoc)
- *
* @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
*/
@Override
public int compare(final Row d1, final Row d2) {
if (d1 == null)
- return d2==null ? 0: -1;
+ return d2 == null ? 0 : -1;
else if (d2 == null) {
return 1;
}
@@ -40,7 +41,7 @@ public class RowDataOrderingComparator implements Comparator {
final String o2 = d2.getString(comparatorField);
if (o1 == null)
- return o2==null ? 0: -1;
+ return o2 == null ? 0 : -1;
else if (o2 == null) {
return 1;
}
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/SparkDedupConfig.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDedupConfig.scala
similarity index 82%
rename from dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/SparkDedupConfig.scala
rename to dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDedupConfig.scala
index 4300e80c6..def5ebb84 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/SparkDedupConfig.scala
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDedupConfig.scala
@@ -1,32 +1,30 @@
-package eu.dnetlib.dhp.oa.dedup.model
+package eu.dnetlib.pace.model
import com.jayway.jsonpath.{Configuration, JsonPath, Option}
-import eu.dnetlib.dhp.oa.dedup.{DedupUtility, SparkReporter}
import eu.dnetlib.pace.config.{DedupConfig, Type}
-import eu.dnetlib.pace.model.{ClusteringDef, FieldDef}
import eu.dnetlib.pace.tree.support.TreeProcessor
import eu.dnetlib.pace.util.MapDocumentUtil.truncateValue
-import eu.dnetlib.pace.util.{BlockProcessor, MapDocumentUtil}
+import eu.dnetlib.pace.util.{BlockProcessor, MapDocumentUtil, SparkReporter}
import org.apache.spark.SparkContext
+import org.apache.spark.sql.{Column, Dataset, Row, functions}
import org.apache.spark.sql.catalyst.expressions.{GenericRowWithSchema, Literal}
import org.apache.spark.sql.expressions.{UserDefinedFunction, Window}
-import org.apache.spark.sql.functions.{col, lit, udf}
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
-import org.apache.spark.sql.{Column, Dataset, Row, functions}
import java.util
import java.util.function.Predicate
import java.util.regex.Pattern
import scala.collection.JavaConverters._
import scala.collection.mutable
+import org.apache.spark.sql.functions.{col, lit, udf}
-class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializable {
+case class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializable {
private val URL_REGEX: Pattern = Pattern.compile("^\\s*(http|https|ftp)\\://.*")
private val CONCAT_REGEX: Pattern = Pattern.compile("\\|\\|\\|")
- private var urlFilter = (s: String) => URL_REGEX.matcher(s).matches
+ private val urlFilter = (s: String) => URL_REGEX.matcher(s).matches
val modelExtractor: (Dataset[String] => Dataset[Row]) = df => {
df.withColumn("mapDocument", rowFromJsonUDF.apply(df.col(df.columns(0))))
@@ -226,60 +224,59 @@ class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializab
val orderingFieldPosition: Int = rowDataType.fieldIndex(conf.getWf.getOrderField)
- val rowFromJsonUDF = udf(
- (json: String) => {
- val documentContext =
- JsonPath.using(Configuration.defaultConfiguration.addOptions(Option.SUPPRESS_EXCEPTIONS)).parse(json)
- val values = new Array[Any](rowDataType.size)
+ val rowFromJson = (json: String) => {
+ val documentContext =
+ JsonPath.using(Configuration.defaultConfiguration.addOptions(Option.SUPPRESS_EXCEPTIONS)).parse(json)
+ val values = new Array[Any](rowDataType.size)
- values(identityFieldPosition) = DFMapDocumentUtils.getJPathString(conf.getWf.getIdPath, documentContext)
+ values(identityFieldPosition) = MapDocumentUtil.getJPathString(conf.getWf.getIdPath, documentContext)
- rowDataType.fieldNames.zipWithIndex.foldLeft(values) {
- case ((res, (fname, index))) => {
- val fdef = conf.getPace.getModelMap.get(fname)
+ rowDataType.fieldNames.zipWithIndex.foldLeft(values) {
+ case ((res, (fname, index))) => {
+ val fdef = conf.getPace.getModelMap.get(fname)
- if (fdef != null) {
- res(index) = fdef.getType match {
- case Type.String | Type.Int =>
- MapDocumentUtil.truncateValue(
- DFMapDocumentUtils.getJPathString(fdef.getPath, documentContext),
- fdef.getLength
- )
+ if (fdef != null) {
+ res(index) = fdef.getType match {
+ case Type.String | Type.Int =>
+ MapDocumentUtil.truncateValue(
+ MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
+ fdef.getLength
+ )
- case Type.URL =>
- var uv = DFMapDocumentUtils.getJPathString(fdef.getPath, documentContext)
- if (!urlFilter(uv)) uv = ""
- uv
+ case Type.URL =>
+ var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
+ if (!urlFilter(uv)) uv = ""
+ uv
- case Type.List | Type.JSON =>
- MapDocumentUtil.truncateList(
- DFMapDocumentUtils.getJPathList(fdef.getPath, documentContext, fdef.getType),
- fdef.getSize
- )
+ case Type.List | Type.JSON =>
+ MapDocumentUtil.truncateList(
+ MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
+ fdef.getSize
+ )
- case Type.StringConcat =>
- val jpaths = CONCAT_REGEX.split(fdef.getPath)
+ case Type.StringConcat =>
+ val jpaths = CONCAT_REGEX.split(fdef.getPath)
- truncateValue(
- jpaths
- .map(jpath => DFMapDocumentUtils.getJPathString(jpath, documentContext))
- .mkString(" "),
- fdef.getLength
- )
+ truncateValue(
+ jpaths
+ .map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
+ .mkString(" "),
+ fdef.getLength
+ )
- case Type.DoubleArray =>
- MapDocumentUtil.getJPathArray(fdef.getPath, json)
- }
+ case Type.DoubleArray =>
+ MapDocumentUtil.getJPathArray(fdef.getPath, json)
}
-
- res
}
- }
- new GenericRowWithSchema(values, rowDataType)
- },
- rowDataType
- )
+ res
+ }
+ }
+
+ new GenericRowWithSchema(values, rowDataType)
+ }
+
+ val rowFromJsonUDF = udf(rowFromJson, rowDataType)
def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = {
@@ -310,7 +307,7 @@ class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializab
}
def processBlock(implicit sc: SparkContext) = {
- val accumulators = DedupUtility.constructAccumulator(conf, sc)
+ val accumulators = SparkReporter.constructAccumulator(conf, sc)
udf[Array[Tuple2[String, String]], mutable.WrappedArray[Row]](block => {
val reporter = new SparkReporter(accumulators)
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java
index aaac36ad7..4d31df5b3 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java
@@ -1,41 +1,42 @@
+
package eu.dnetlib.pace.tree;
+import java.util.Map;
+
import com.wcohen.ss.AbstractStringDistance;
+
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
-import java.util.Map;
-
@ComparatorClass("alwaysMatch")
public class AlwaysMatch extends AbstractComparator {
- public AlwaysMatch(final Map params){
- super(params, new com.wcohen.ss.JaroWinkler());
- }
+ public AlwaysMatch(final Map params) {
+ super(params, new com.wcohen.ss.JaroWinkler());
+ }
- public AlwaysMatch(final double weight) {
- super(weight, new com.wcohen.ss.JaroWinkler());
- }
+ public AlwaysMatch(final double weight) {
+ super(weight, new com.wcohen.ss.JaroWinkler());
+ }
- protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
- super(weight, ssalgo);
- }
+ protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
- @Override
- public double compare(final Object a, final Object b, final Config conf) {
- return 1.0;
- }
+ @Override
+ public double compare(final Object a, final Object b, final Config conf) {
+ return 1.0;
+ }
- @Override
- public double getWeight() {
- return super.weight;
- }
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
- @Override
- protected double normalize(final double d) {
- return d;
- }
+ @Override
+ protected double normalize(final double d) {
+ return d;
+ }
}
-
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
index eedc7f562..5c6939e60 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
@@ -1,148 +1,157 @@
-package eu.dnetlib.pace.tree;
-import com.wcohen.ss.AbstractStringDistance;
-import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.model.Person;
-import eu.dnetlib.pace.tree.support.AbstractListComparator;
-import eu.dnetlib.pace.tree.support.ComparatorClass;
+package eu.dnetlib.pace.tree;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
+import com.wcohen.ss.AbstractStringDistance;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Person;
+import eu.dnetlib.pace.tree.support.AbstractListComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
@ComparatorClass("authorsMatch")
public class AuthorsMatch extends AbstractListComparator {
- Map params;
+ Map params;
- private double SURNAME_THRESHOLD;
- private double NAME_THRESHOLD;
- private double FULLNAME_THRESHOLD;
- private String MODE; //full or surname
- private int SIZE_THRESHOLD;
- private String TYPE; //count or percentage
- private int common;
+ private double SURNAME_THRESHOLD;
+ private double NAME_THRESHOLD;
+ private double FULLNAME_THRESHOLD;
+ private String MODE; // full or surname
+ private int SIZE_THRESHOLD;
+ private String TYPE; // count or percentage
+ private int common;
- public AuthorsMatch(Map params){
- super(params, new com.wcohen.ss.JaroWinkler());
- this.params = params;
+ public AuthorsMatch(Map params) {
+ super(params, new com.wcohen.ss.JaroWinkler());
+ this.params = params;
- MODE = params.getOrDefault("mode", "full");
- SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
- NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
- FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
- SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
- TYPE = params.getOrDefault("type", "percentage");
- common = 0;
- }
+ MODE = params.getOrDefault("mode", "full");
+ SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
+ NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
+ FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
+ SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
+ TYPE = params.getOrDefault("type", "percentage");
+ common = 0;
+ }
- protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
- super(w, ssalgo);
- }
+ protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
+ super(w, ssalgo);
+ }
- @Override
- public double compare(final List a, final List b, final Config conf) {
+ @Override
+ public double compare(final List a, final List b, final Config conf) {
- if (a.isEmpty() || b.isEmpty())
- return -1;
+ if (a.isEmpty() || b.isEmpty())
+ return -1;
- if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
- return 1.0;
+ if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
+ return 1.0;
- List aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
- List bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
+ List aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
+ List bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
- common = 0;
- //compare each element of List1 with each element of List2
- for (Person p1 : aList)
+ common = 0;
+ // compare each element of List1 with each element of List2
+ for (Person p1 : aList)
- for (Person p2 : bList) {
+ for (Person p2 : bList) {
- //both persons are inaccurate
- if (!p1.isAccurate() && !p2.isAccurate()) {
- //compare just normalized fullnames
- String fullname1 = normalization(p1.getNormalisedFullname().isEmpty()? p1.getOriginal() : p1.getNormalisedFullname());
- String fullname2 = normalization(p2.getNormalisedFullname().isEmpty()? p2.getOriginal() : p2.getNormalisedFullname());
+ // both persons are inaccurate
+ if (!p1.isAccurate() && !p2.isAccurate()) {
+ // compare just normalized fullnames
+ String fullname1 = normalization(
+ p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname());
+ String fullname2 = normalization(
+ p2.getNormalisedFullname().isEmpty() ? p2.getOriginal() : p2.getNormalisedFullname());
- if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) {
- common += 1;
- break;
- }
- }
+ if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) {
+ common += 1;
+ break;
+ }
+ }
- //one person is inaccurate
- if (p1.isAccurate() ^ p2.isAccurate()) {
- //prepare data
- //data for the accurate person
- String name = normalization(p1.isAccurate()? p1.getNormalisedFirstName() : p2.getNormalisedFirstName());
- String surname = normalization(p1.isAccurate()? p1.getNormalisedSurname() : p2.getNormalisedSurname());
+ // one person is inaccurate
+ if (p1.isAccurate() ^ p2.isAccurate()) {
+ // prepare data
+ // data for the accurate person
+ String name = normalization(
+ p1.isAccurate() ? p1.getNormalisedFirstName() : p2.getNormalisedFirstName());
+ String surname = normalization(
+ p1.isAccurate() ? p1.getNormalisedSurname() : p2.getNormalisedSurname());
- //data for the inaccurate person
- String fullname = normalization(
- p1.isAccurate() ? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname()) : (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname())
- );
+ // data for the inaccurate person
+ String fullname = normalization(
+ p1.isAccurate()
+ ? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname())
+ : (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname()));
- if (fullname.contains(surname)) {
- if (MODE.equals("full")) {
- if (fullname.contains(name)) {
- common += 1;
- break;
- }
- }
- else { //MODE equals "surname"
- common += 1;
- break;
- }
- }
- }
+ if (fullname.contains(surname)) {
+ if (MODE.equals("full")) {
+ if (fullname.contains(name)) {
+ common += 1;
+ break;
+ }
+ } else { // MODE equals "surname"
+ common += 1;
+ break;
+ }
+ }
+ }
- //both persons are accurate
- if (p1.isAccurate() && p2.isAccurate()) {
+ // both persons are accurate
+ if (p1.isAccurate() && p2.isAccurate()) {
- if (compareSurname(p1, p2)) {
- if (MODE.equals("full")) {
- if(compareFirstname(p1, p2)) {
- common += 1;
- break;
- }
- }
- else { //MODE equals "surname"
- common += 1;
- break;
- }
- }
+ if (compareSurname(p1, p2)) {
+ if (MODE.equals("full")) {
+ if (compareFirstname(p1, p2)) {
+ common += 1;
+ break;
+ }
+ } else { // MODE equals "surname"
+ common += 1;
+ break;
+ }
+ }
- }
+ }
- }
+ }
- //normalization factor to compute the score
- int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
+ // normalization factor to compute the score
+ int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
- if(TYPE.equals("percentage")) {
- return (double) common / normFactor;
- }
- else {
- return (double) common;
- }
- }
+ if (TYPE.equals("percentage")) {
+ return (double) common / normFactor;
+ } else {
+ return (double) common;
+ }
+ }
- public boolean compareSurname(Person p1, Person p2) {
- return ssalgo.score(normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD;
- }
+ public boolean compareSurname(Person p1, Person p2) {
+ return ssalgo
+ .score(
+ normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD;
+ }
- public boolean compareFirstname(Person p1, Person p2) {
+ public boolean compareFirstname(Person p1, Person p2) {
- if(p1.getNormalisedFirstName().length()<=2 || p2.getNormalisedFirstName().length()<=2) {
- if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName())))
- return true;
- }
+ if (p1.getNormalisedFirstName().length() <= 2 || p2.getNormalisedFirstName().length() <= 2) {
+ if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName())))
+ return true;
+ }
- return ssalgo.score(normalization(p1.getNormalisedFirstName()), normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD;
- }
+ return ssalgo
+ .score(
+ normalization(p1.getNormalisedFirstName()),
+ normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD;
+ }
- public String normalization(String s) {
- return normalize(utf8(cleanup(s)));
- }
+ public String normalization(String s) {
+ return normalize(utf8(cleanup(s)));
+ }
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java
index a51d07eb7..1d898ad83 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java
@@ -1,47 +1,48 @@
+
package eu.dnetlib.pace.tree;
+import java.util.Map;
+import java.util.Set;
+
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
-import java.util.Map;
-import java.util.Set;
-
@ComparatorClass("cityMatch")
public class CityMatch extends AbstractStringComparator {
- private Map params;
+ private Map params;
- public CityMatch(Map params) {
- super(params);
- this.params = params;
- }
+ public CityMatch(Map params) {
+ super(params);
+ this.params = params;
+ }
- @Override
- public double distance(final String a, final String b, final Config conf) {
+ @Override
+ public double distance(final String a, final String b, final Config conf) {
- String ca = cleanup(a);
- String cb = cleanup(b);
+ String ca = cleanup(a);
+ String cb = cleanup(b);
- ca = normalize(ca);
- cb = normalize(cb);
+ ca = normalize(ca);
+ cb = normalize(cb);
- ca = filterAllStopWords(ca);
- cb = filterAllStopWords(cb);
+ ca = filterAllStopWords(ca);
+ cb = filterAllStopWords(cb);
- Set cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
- Set cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
+ Set cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
+ Set cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
- Set codes1 = citiesToCodes(cities1);
- Set codes2 = citiesToCodes(cities2);
+ Set codes1 = citiesToCodes(cities1);
+ Set codes2 = citiesToCodes(cities2);
- //if no cities are detected, the comparator gives 1.0
- if (codes1.isEmpty() && codes2.isEmpty())
- return 1.0;
- else {
- if (codes1.isEmpty() ^ codes2.isEmpty())
- return -1; //undefined if one of the two has no cities
- return commonElementsPercentage(codes1, codes2);
- }
- }
+ // if no cities are detected, the comparator gives 1.0
+ if (codes1.isEmpty() && codes2.isEmpty())
+ return 1.0;
+ else {
+ if (codes1.isEmpty() ^ codes2.isEmpty())
+ return -1; // undefined if one of the two has no cities
+ return commonElementsPercentage(codes1, codes2);
+ }
+ }
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java
index 59e5dd346..d255612ba 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java
@@ -1,47 +1,47 @@
+
package eu.dnetlib.pace.tree;
+import java.util.Map;
+
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
-import java.util.Map;
-
@ComparatorClass("cosineSimilarity")
public class CosineSimilarity extends AbstractComparator {
- Map params;
+ Map params;
- public CosineSimilarity(Map params) {
- super(params);
- }
+ public CosineSimilarity(Map params) {
+ super(params);
+ }
- @Override
- public double compare(Object a, Object b, Config config) {
- return compare((double[])a, (double[])b, config);
- }
+ @Override
+ public double compare(Object a, Object b, Config config) {
+ return compare((double[]) a, (double[]) b, config);
+ }
- public double compare(final double[] a, final double[] b, final Config conf) {
+ public double compare(final double[] a, final double[] b, final Config conf) {
- if (a.length == 0 || b.length == 0)
- return -1;
+ if (a.length == 0 || b.length == 0)
+ return -1;
- return cosineSimilarity(a, b);
- }
+ return cosineSimilarity(a, b);
+ }
- double cosineSimilarity(double[] a, double[] b) {
- double dotProduct = 0;
- double normASum = 0;
- double normBSum = 0;
+ double cosineSimilarity(double[] a, double[] b) {
+ double dotProduct = 0;
+ double normASum = 0;
+ double normBSum = 0;
- for(int i = 0; i < a.length; i ++) {
- dotProduct += a[i] * b[i];
- normASum += a[i] * a[i];
- normBSum += b[i] * b[i];
- }
-
- double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum);
- return dotProduct / eucledianDist;
- }
+ for (int i = 0; i < a.length; i++) {
+ dotProduct += a[i] * b[i];
+ normASum += a[i] * a[i];
+ normBSum += b[i] * b[i];
+ }
+ double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum);
+ return dotProduct / eucledianDist;
+ }
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java
index 429882450..d3c5bc10d 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java
@@ -1,9 +1,10 @@
+
package eu.dnetlib.pace.tree;
-import eu.dnetlib.pace.tree.support.ComparatorClass;
-
import java.util.Map;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
/**
* The Class ExactMatch.
*
@@ -12,15 +13,15 @@ import java.util.Map;
@ComparatorClass("doiExactMatch")
public class DoiExactMatch extends ExactMatchIgnoreCase {
- public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
+ public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
- public DoiExactMatch(final Map params) {
- super(params);
- }
+ public DoiExactMatch(final Map params) {
+ super(params);
+ }
- @Override
- protected String toString(final Object f) {
- return super.toString(f).replaceAll(PREFIX, "");
- }
+ @Override
+ protected String toString(final Object f) {
+ return super.toString(f).replaceAll(PREFIX, "");
+ }
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java
index 2e99595e0..c28274652 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java
@@ -1,29 +1,30 @@
-package eu.dnetlib.pace.tree;
-import eu.dnetlib.pace.tree.support.ComparatorClass;
+package eu.dnetlib.pace.tree;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
@ComparatorClass("domainExactMatch")
public class DomainExactMatch extends ExactMatchIgnoreCase {
- public DomainExactMatch(final Map params) {
- super(params);
- }
+ public DomainExactMatch(final Map params) {
+ super(params);
+ }
- @Override
- protected String toString(final Object f) {
+ @Override
+ protected String toString(final Object f) {
- try {
- return asUrl(super.toString(f)).getHost();
- } catch (MalformedURLException e) {
- return "";
- }
- }
+ try {
+ return asUrl(super.toString(f)).getHost();
+ } catch (MalformedURLException e) {
+ return "";
+ }
+ }
- private URL asUrl(final String value) throws MalformedURLException {
- return new URL(value);
- }
+ private URL asUrl(final String value) throws MalformedURLException {
+ return new URL(value);
+ }
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java
index 08fca05c9..35357c553 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java
@@ -1,42 +1,44 @@
+
package eu.dnetlib.pace.tree;
+import java.util.Map;
+
import com.wcohen.ss.AbstractStringDistance;
+
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
-import java.util.Map;
-
@ComparatorClass("exactMatch")
public class ExactMatch extends AbstractStringComparator {
- public ExactMatch(Map params){
- super(params, new com.wcohen.ss.JaroWinkler());
- }
+ public ExactMatch(Map params) {
+ super(params, new com.wcohen.ss.JaroWinkler());
+ }
- public ExactMatch(final double weight) {
- super(weight, new com.wcohen.ss.JaroWinkler());
- }
+ public ExactMatch(final double weight) {
+ super(weight, new com.wcohen.ss.JaroWinkler());
+ }
- protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
- super(weight, ssalgo);
- }
+ protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
+ super(weight, ssalgo);
+ }
- @Override
- public double distance(final String a, final String b, final Config conf) {
- if (a.isEmpty() || b.isEmpty()) {
- return -1.0; //return -1 if a field is missing
- }
- return a.equals(b) ? 1.0 : 0;
- }
+ @Override
+ public double distance(final String a, final String b, final Config conf) {
+ if (a.isEmpty() || b.isEmpty()) {
+ return -1.0; // return -1 if a field is missing
+ }
+ return a.equals(b) ? 1.0 : 0;
+ }
- @Override
- public double getWeight() {
- return super.weight;
- }
+ @Override
+ public double getWeight() {
+ return super.weight;
+ }
- @Override
- protected double normalize(final double d) {
- return d;
- }
+ @Override
+ protected double normalize(final double d) {
+ return d;
+ }
}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java
index b6b4d1af4..220bfb7dd 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java
@@ -1,30 +1,32 @@
-package eu.dnetlib.pace.tree;
-import com.google.common.base.Joiner;
-import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractStringComparator;
-import eu.dnetlib.pace.tree.support.ComparatorClass;
+package eu.dnetlib.pace.tree;
import java.util.List;
import java.util.Map;
+import com.google.common.base.Joiner;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
@ComparatorClass("exactMatchIgnoreCase")
public class ExactMatchIgnoreCase extends AbstractStringComparator {
- public ExactMatchIgnoreCase(Map params) {
- super(params);
- }
+ public ExactMatchIgnoreCase(Map params) {
+ super(params);
+ }
- @Override
- public double compare(String a, String b, final Config conf) {
+ @Override
+ public double compare(String a, String b, final Config conf) {
- if (a.isEmpty() || b.isEmpty())
- return -1;
+ if (a.isEmpty() || b.isEmpty())
+ return -1;
- return a.equalsIgnoreCase(b) ? 1 : 0;
- }
+ return a.equalsIgnoreCase(b) ? 1 : 0;
+ }
- protected String toString(final Object object) {
- return toFirstString(object);
- }
-}
\ No newline at end of file
+ protected String toString(final Object object) {
+ return toFirstString(object);
+ }
+}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java
index 074b82a19..238cb16ce 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java
@@ -1,9 +1,5 @@
-package eu.dnetlib.pace.tree;
-import com.google.common.collect.Sets;
-import eu.dnetlib.pace.config.Config;
-import eu.dnetlib.pace.tree.support.AbstractListComparator;
-import eu.dnetlib.pace.tree.support.ComparatorClass;
+package eu.dnetlib.pace.tree;
import java.util.HashMap;
import java.util.List;
@@ -11,70 +7,74 @@ import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
+import com.google.common.collect.Sets;
+
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractListComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
@ComparatorClass("instanceTypeMatch")
public class InstanceTypeMatch extends AbstractListComparator {
- final Map translationMap = new HashMap<>();
+ final Map translationMap = new HashMap<>();
- public InstanceTypeMatch(Map params){
- super(params);
+ public InstanceTypeMatch(Map params) {
+ super(params);
- //jolly types
- translationMap.put("Conference object", "*");
- translationMap.put("Other literature type", "*");
- translationMap.put("Unknown", "*");
+		// wildcard ("jolly") types: mapped to "*" so they match any other instance type
+ translationMap.put("Conference object", "*");
+ translationMap.put("Other literature type", "*");
+ translationMap.put("Unknown", "*");
- //article types
- translationMap.put("Article", "Article");
- translationMap.put("Data Paper", "Article");
- translationMap.put("Software Paper", "Article");
- translationMap.put("Preprint", "Article");
+ // article types
+ translationMap.put("Article", "Article");
+ translationMap.put("Data Paper", "Article");
+ translationMap.put("Software Paper", "Article");
+ translationMap.put("Preprint", "Article");
- //thesis types
- translationMap.put("Thesis", "Thesis");
- translationMap.put("Master thesis", "Thesis");
- translationMap.put("Bachelor thesis", "Thesis");
- translationMap.put("Doctoral thesis", "Thesis");
- }
+ // thesis types
+ translationMap.put("Thesis", "Thesis");
+ translationMap.put("Master thesis", "Thesis");
+ translationMap.put("Bachelor thesis", "Thesis");
+ translationMap.put("Doctoral thesis", "Thesis");
+ }
+ @Override
+ public double compare(final List a, final List b, final Config conf) {
- @Override
- public double compare(final List