WIP: various refactors

This commit is contained in:
Claudio Atzori 2023-06-26 13:58:11 +02:00 committed by Sandro La Bruzzo
parent 4c2dfcbdf7
commit 649679de8d
108 changed files with 5650 additions and 5414 deletions

View File

@ -81,9 +81,12 @@
</dependency> </dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-catalyst_2.11</artifactId> <artifactId>spark-core_2.11</artifactId>
<version>2.4.0.cloudera2</version> </dependency>
<scope>compile</scope>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency> </dependency>
</dependencies> </dependencies>

View File

@ -1,8 +1,5 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.common.AbstractPaceFunctions; package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.lang3.StringUtils;
import java.util.Collection; import java.util.Collection;
import java.util.HashSet; import java.util.HashSet;
@ -10,32 +7,39 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction { public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
protected Map<String, Integer> params; protected Map<String, Integer> params;
public AbstractClusteringFunction(final Map<String, Integer> params) { public AbstractClusteringFunction(final Map<String, Integer> params) {
this.params = params; this.params = params;
} }
protected abstract Collection<String> doApply(Config conf, String s); protected abstract Collection<String> doApply(Config conf, String s);
@Override @Override
public Collection<String> apply(Config conf, List<String> fields) { public Collection<String> apply(Config conf, List<String> fields) {
return fields.stream().filter(f -> !f.isEmpty()) return fields
.map(this::normalize) .stream()
.map(s -> filterAllStopWords(s)) .filter(f -> !f.isEmpty())
.map(s -> doApply(conf, s)) .map(this::normalize)
.map(c -> filterBlacklisted(c, ngramBlacklist)) .map(s -> filterAllStopWords(s))
.flatMap(c -> c.stream()) .map(s -> doApply(conf, s))
.filter(StringUtils::isNotBlank) .map(c -> filterBlacklisted(c, ngramBlacklist))
.collect(Collectors.toCollection(HashSet::new)); .flatMap(c -> c.stream())
.filter(StringUtils::isNotBlank)
.collect(Collectors.toCollection(HashSet::new));
} }
public Map<String, Integer> getParams() { public Map<String, Integer> getParams() {
return params; return params;
} }
protected Integer param(String name) { protected Integer param(String name) {
return params.get(name); return params.get(name);
} }

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import java.util.Collection; import java.util.Collection;
@ -6,6 +7,7 @@ import java.util.Set;
import java.util.StringTokenizer; import java.util.StringTokenizer;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
@ClusteringClass("acronyms") @ClusteringClass("acronyms")
@ -19,16 +21,16 @@ public class Acronyms extends AbstractClusteringFunction {
protected Collection<String> doApply(Config conf, String s) { protected Collection<String> doApply(Config conf, String s) {
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen")); return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
} }
private Set<String> extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) { private Set<String> extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) {
final Set<String> acronyms = Sets.newLinkedHashSet(); final Set<String> acronyms = Sets.newLinkedHashSet();
for (int i = 0; i < maxAcronyms; i++) { for (int i = 0; i < maxAcronyms; i++) {
final StringTokenizer st = new StringTokenizer(s); final StringTokenizer st = new StringTokenizer(s);
final StringBuilder sb = new StringBuilder(); final StringBuilder sb = new StringBuilder();
while (st.hasMoreTokens()) { while (st.hasMoreTokens()) {
final String token = st.nextToken(); final String token = st.nextToken();
if (sb.length() > maxLen) { if (sb.length() > maxLen) {

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import java.lang.annotation.ElementType; import java.lang.annotation.ElementType;
@ -9,5 +10,5 @@ import java.lang.annotation.Target;
@Target(ElementType.TYPE) @Target(ElementType.TYPE)
public @interface ClusteringClass { public @interface ClusteringClass {
public String value(); public String value();
} }

View File

@ -1,15 +1,16 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config; package eu.dnetlib.pace.clustering;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import eu.dnetlib.pace.config.Config;
public interface ClusteringFunction { public interface ClusteringFunction {
public Collection<String> apply(Config config, List<String> fields); public Collection<String> apply(Config config, List<String> fields);
public Map<String, Integer> getParams(); public Map<String, Integer> getParams();
} }

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import java.util.Collection; import java.util.Collection;
@ -5,6 +6,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
@ClusteringClass("immutablefieldvalue") @ClusteringClass("immutablefieldvalue")

View File

@ -1,50 +1,54 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config; package eu.dnetlib.pace.clustering;
import org.apache.commons.lang3.StringUtils;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("keywordsclustering") @ClusteringClass("keywordsclustering")
public class KeywordsClustering extends AbstractClusteringFunction { public class KeywordsClustering extends AbstractClusteringFunction {
public KeywordsClustering(Map<String, Integer> params) { public KeywordsClustering(Map<String, Integer> params) {
super(params); super(params);
} }
@Override @Override
protected Collection<String> doApply(final Config conf, String s) { protected Collection<String> doApply(final Config conf, String s) {
//takes city codes and keywords codes without duplicates // takes city codes and keywords codes without duplicates
Set<String> keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4)); Set<String> keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4)); Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4));
//list of combination to return as result // list of combination to return as result
final Collection<String> combinations = new LinkedHashSet<String>(); final Collection<String> combinations = new LinkedHashSet<String>();
for (String keyword: keywordsToCodes(keywords, conf.translationMap())){ for (String keyword : keywordsToCodes(keywords, conf.translationMap())) {
for (String city: citiesToCodes(cities)) { for (String city : citiesToCodes(cities)) {
combinations.add(keyword+"-"+city); combinations.add(keyword + "-" + city);
if (combinations.size()>=params.getOrDefault("max", 2)) { if (combinations.size() >= params.getOrDefault("max", 2)) {
return combinations; return combinations;
} }
} }
} }
return combinations; return combinations;
} }
@Override @Override
public Collection<String> apply(final Config conf, List<String> fields) { public Collection<String> apply(final Config conf, List<String> fields) {
return fields.stream().filter(f -> !f.isEmpty()) return fields
.map(this::cleanup) .stream()
.map(this::normalize) .filter(f -> !f.isEmpty())
.map(s -> filterAllStopWords(s)) .map(this::cleanup)
.map(s -> doApply(conf, s)) .map(this::normalize)
.map(c -> filterBlacklisted(c, ngramBlacklist)) .map(s -> filterAllStopWords(s))
.flatMap(c -> c.stream()) .map(s -> doApply(conf, s))
.filter(StringUtils::isNotBlank) .map(c -> filterBlacklisted(c, ngramBlacklist))
.collect(Collectors.toCollection(HashSet::new)); .flatMap(c -> c.stream())
} .filter(StringUtils::isNotBlank)
} .collect(Collectors.toCollection(HashSet::new));
}
}

View File

@ -1,75 +1,79 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Lists; package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.lang3.StringUtils;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
@ClusteringClass("lnfi") @ClusteringClass("lnfi")
public class LastNameFirstInitial extends AbstractClusteringFunction{ public class LastNameFirstInitial extends AbstractClusteringFunction {
private boolean DEFAULT_AGGRESSIVE = true; private boolean DEFAULT_AGGRESSIVE = true;
public LastNameFirstInitial(final Map<String, Integer> params) { public LastNameFirstInitial(final Map<String, Integer> params) {
super(params); super(params);
} }
@Override @Override
public Collection<String> apply(Config conf, List<String> fields) { public Collection<String> apply(Config conf, List<String> fields) {
return fields.stream().filter(f -> !f.isEmpty()) return fields
.map(this::normalize) .stream()
.map(s -> doApply(conf, s)) .filter(f -> !f.isEmpty())
.map(c -> filterBlacklisted(c, ngramBlacklist)) .map(this::normalize)
.flatMap(c -> c.stream()) .map(s -> doApply(conf, s))
.filter(StringUtils::isNotBlank) .map(c -> filterBlacklisted(c, ngramBlacklist))
.collect(Collectors.toCollection(HashSet::new)); .flatMap(c -> c.stream())
} .filter(StringUtils::isNotBlank)
.collect(Collectors.toCollection(HashSet::new));
}
@Override @Override
protected String normalize(final String s) { protected String normalize(final String s) {
return fixAliases(transliterate(nfd(unicodeNormalization(s)))) return fixAliases(transliterate(nfd(unicodeNormalization(s))))
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
.replaceAll("[^ \\w]+", "") // strings
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "") .replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{Punct})+", " ") .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\d)+", " ") .replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\n)+", " ") .replaceAll("(\\d)+", " ")
.trim(); .replaceAll("(\\n)+", " ")
} .trim();
}
@Override @Override
protected Collection<String> doApply(final Config conf, final String s) { protected Collection<String> doApply(final Config conf, final String s) {
final List<String> res = Lists.newArrayList(); final List<String> res = Lists.newArrayList();
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE); final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive")
: DEFAULT_AGGRESSIVE);
Person p = new Person(s, aggressive); Person p = new Person(s, aggressive);
if (p.isAccurate()) { if (p.isAccurate()) {
String lastName = p.getNormalisedSurname().toLowerCase(); String lastName = p.getNormalisedSurname().toLowerCase();
String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1); String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0, 1);
res.add(firstInitial.concat(lastName)); res.add(firstInitial.concat(lastName));
} } else { // is not accurate, meaning it has no defined name and surname
else { // is not accurate, meaning it has no defined name and surname List<String> fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
List<String> fullname = Arrays.asList(p.getNormalisedFullname().split(" ")); if (fullname.size() == 1) {
if (fullname.size() == 1) { res.add(p.getNormalisedFullname().toLowerCase());
res.add(p.getNormalisedFullname().toLowerCase()); } else if (fullname.size() == 2) {
} res.add(fullname.get(0).substring(0, 1).concat(fullname.get(1)).toLowerCase());
else if (fullname.size() == 2) { res.add(fullname.get(1).substring(0, 1).concat(fullname.get(0)).toLowerCase());
res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase()); } else {
res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase()); res.add(fullname.get(0).substring(0, 1).concat(fullname.get(fullname.size() - 1)).toLowerCase());
} res.add(fullname.get(fullname.size() - 1).substring(0, 1).concat(fullname.get(0)).toLowerCase());
else { }
res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase()); }
res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase());
}
}
return res; return res;
} }
} }

View File

@ -1,14 +1,17 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Lists; package eu.dnetlib.pace.clustering;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.lang3.StringUtils;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("lowercase") @ClusteringClass("lowercase")
public class LowercaseClustering extends AbstractClusteringFunction { public class LowercaseClustering extends AbstractClusteringFunction {
@ -19,7 +22,7 @@ public class LowercaseClustering extends AbstractClusteringFunction {
@Override @Override
public Collection<String> apply(Config conf, List<String> fields) { public Collection<String> apply(Config conf, List<String> fields) {
Collection<String> c = Sets.newLinkedHashSet(); Collection<String> c = Sets.newLinkedHashSet();
for(String f : fields) { for (String f : fields) {
c.addAll(doApply(conf, f)); c.addAll(doApply(conf, f));
} }
return c; return c;
@ -27,7 +30,7 @@ public class LowercaseClustering extends AbstractClusteringFunction {
@Override @Override
protected Collection<String> doApply(final Config conf, final String s) { protected Collection<String> doApply(final Config conf, final String s) {
if(StringUtils.isBlank(s)) { if (StringUtils.isBlank(s)) {
return Lists.newArrayList(); return Lists.newArrayList();
} }
return Lists.newArrayList(s.toLowerCase().trim()); return Lists.newArrayList(s.toLowerCase().trim());

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import java.util.Set; import java.util.Set;
@ -11,7 +12,8 @@ public class NGramUtils extends AbstractPaceFunctions {
private static final int SIZE = 100; private static final int SIZE = 100;
private static final Set<String> stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); private static final Set<String> stopwords = AbstractPaceFunctions
.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
public static String cleanupForOrdering(String s) { public static String cleanupForOrdering(String s) {
String result = NGRAMUTILS.filterStopWords(NGRAMUTILS.normalize(s), stopwords); String result = NGRAMUTILS.filterStopWords(NGRAMUTILS.normalize(s), stopwords);

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import java.util.Collection; import java.util.Collection;
@ -6,6 +7,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
@ClusteringClass("ngrampairs") @ClusteringClass("ngrampairs")
@ -32,7 +34,7 @@ public class NgramPairs extends Ngrams {
break; break;
} }
res.add(ngrams.get(i) + ngrams.get(j)); res.add(ngrams.get(i) + ngrams.get(j));
//System.out.println("-- " + concatNgrams); // System.out.println("-- " + concatNgrams);
} }
return res; return res;
} }

View File

@ -1,9 +1,10 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config;
import java.util.*; import java.util.*;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("ngrams") @ClusteringClass("ngrams")
public class Ngrams extends AbstractClusteringFunction { public class Ngrams extends AbstractClusteringFunction {
@ -44,7 +45,7 @@ public class Ngrams extends AbstractClusteringFunction {
} }
} }
} }
//System.out.println(ngrams + " n: " + ngrams.size()); // System.out.println(ngrams + " n: " + ngrams.size());
return ngrams; return ngrams;
} }

View File

@ -1,16 +1,19 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Sets; package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.lang3.StringUtils;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
@ClusteringClass("personClustering") @ClusteringClass("personClustering")
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction { public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
@ -30,7 +33,8 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
final Person person = new Person(f, false); final Person person = new Person(f, false);
if (StringUtils.isNotBlank(person.getNormalisedFirstName()) && StringUtils.isNotBlank(person.getNormalisedSurname())) { if (StringUtils.isNotBlank(person.getNormalisedFirstName())
&& StringUtils.isNotBlank(person.getNormalisedSurname())) {
hashes.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase()); hashes.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase());
} else { } else {
for (final String token1 : tokens(f, MAX_TOKENS)) { for (final String token1 : tokens(f, MAX_TOKENS)) {

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import java.util.Collection; import java.util.Collection;
@ -22,7 +23,8 @@ public class PersonHash extends AbstractClusteringFunction {
protected Collection<String> doApply(final Config conf, final String s) { protected Collection<String> doApply(final Config conf, final String s) {
final List<String> res = Lists.newArrayList(); final List<String> res = Lists.newArrayList();
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE); final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive")
: DEFAULT_AGGRESSIVE);
res.add(new Person(s, aggressive).hash()); res.add(new Person(s, aggressive).hash());

View File

@ -1,10 +1,11 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config; package eu.dnetlib.pace.clustering;
import java.util.Collection; import java.util.Collection;
import java.util.Map; import java.util.Map;
import eu.dnetlib.pace.config.Config;
public class RandomClusteringFunction extends AbstractClusteringFunction { public class RandomClusteringFunction extends AbstractClusteringFunction {
public RandomClusteringFunction(Map<String, Integer> params) { public RandomClusteringFunction(Map<String, Integer> params) {

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import java.util.*; import java.util.*;
@ -5,6 +6,7 @@ import java.util.*;
import com.google.common.base.Joiner; import com.google.common.base.Joiner;
import com.google.common.base.Splitter; import com.google.common.base.Splitter;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
@ClusteringClass("sortedngrampairs") @ClusteringClass("sortedngrampairs")

View File

@ -1,15 +1,17 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.lang3.RandomStringUtils; import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("spacetrimmingfieldvalue") @ClusteringClass("spacetrimmingfieldvalue")
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
@ -21,7 +23,10 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
protected Collection<String> doApply(final Config conf, final String s) { protected Collection<String> doApply(final Config conf, final String s) {
final List<String> res = Lists.newArrayList(); final List<String> res = Lists.newArrayList();
res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", "")); res
.add(
StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength"))
: s.toLowerCase().replaceAll("\\s+", ""));
return res; return res;
} }

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import java.util.Collection; import java.util.Collection;
@ -5,6 +6,7 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
@ClusteringClass("suffixprefix") @ClusteringClass("suffixprefix")
@ -18,7 +20,7 @@ public class SuffixPrefix extends AbstractClusteringFunction {
protected Collection<String> doApply(Config conf, String s) { protected Collection<String> doApply(Config conf, String s) {
return suffixPrefix(s, param("len"), param("max")); return suffixPrefix(s, param("len"), param("max"));
} }
private Collection<String> suffixPrefix(String s, int len, int max) { private Collection<String> suffixPrefix(String s, int len, int max) {
final Set<String> bigrams = Sets.newLinkedHashSet(); final Set<String> bigrams = Sets.newLinkedHashSet();
int i = 0; int i = 0;

View File

@ -1,7 +1,5 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.common.AbstractPaceFunctions; package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
@ -11,42 +9,44 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("urlclustering") @ClusteringClass("urlclustering")
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction { public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
protected Map<String, Integer> params; protected Map<String, Integer> params;
public UrlClustering(final Map<String, Integer> params) { public UrlClustering(final Map<String, Integer> params) {
this.params = params; this.params = params;
} }
@Override @Override
public Collection<String> apply(final Config conf, List<String> fields) { public Collection<String> apply(final Config conf, List<String> fields) {
try { try {
return fields.stream() return fields
.filter(f -> !f.isEmpty()) .stream()
.map(this::asUrl) .filter(f -> !f.isEmpty())
.map(URL::getHost) .map(this::asUrl)
.collect(Collectors.toCollection(HashSet::new)); .map(URL::getHost)
} .collect(Collectors.toCollection(HashSet::new));
catch (IllegalStateException e){ } catch (IllegalStateException e) {
return new HashSet<>(); return new HashSet<>();
} }
} }
@Override @Override
public Map<String, Integer> getParams() { public Map<String, Integer> getParams() {
return null; return null;
} }
private URL asUrl(String value) {
try {
return new URL(value);
} catch (MalformedURLException e) {
// should not happen as checked by pace typing
throw new IllegalStateException("invalid URL: " + value);
}
}
private URL asUrl(String value) {
try {
return new URL(value);
} catch (MalformedURLException e) {
// should not happen as checked by pace typing
throw new IllegalStateException("invalid URL: " + value);
}
}
} }

View File

@ -1,90 +1,91 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Sets; package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("wordsStatsSuffixPrefixChain") @ClusteringClass("wordsStatsSuffixPrefixChain")
public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction { public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
public WordsStatsSuffixPrefixChain(Map<String, Integer> params) { public WordsStatsSuffixPrefixChain(Map<String, Integer> params) {
super(params); super(params);
} }
@Override @Override
protected Collection<String> doApply(Config conf, String s) { protected Collection<String> doApply(Config conf, String s) {
return suffixPrefixChain(s, param("mod")); return suffixPrefixChain(s, param("mod"));
} }
private Collection<String> suffixPrefixChain(String s, int mod) { private Collection<String> suffixPrefixChain(String s, int mod) {
//create the list of words from the string (remove short words) // create the list of words from the string (remove short words)
List<String> wordsList = List<String> wordsList = Arrays
Arrays.stream(s.split(" ")) .stream(s.split(" "))
.filter(si -> si.length() > 3) .filter(si -> si.length() > 3)
.collect(Collectors.toList()); .collect(Collectors.toList());
final int words = wordsList.size(); final int words = wordsList.size();
final int letters = s.length(); final int letters = s.length();
//create the prefix: number of words + number of letters/mod // create the prefix: number of words + number of letters/mod
String prefix = words + "-" + letters/mod + "-"; String prefix = words + "-" + letters / mod + "-";
return doSuffixPrefixChain(wordsList, prefix); return doSuffixPrefixChain(wordsList, prefix);
} }
private Collection<String> doSuffixPrefixChain(List<String> wordsList, String prefix) { private Collection<String> doSuffixPrefixChain(List<String> wordsList, String prefix) {
Set<String> set = Sets.newLinkedHashSet(); Set<String> set = Sets.newLinkedHashSet();
switch(wordsList.size()){ switch (wordsList.size()) {
case 0: case 0:
case 1: case 1:
break; break;
case 2: case 2:
set.add( set
prefix + .add(
suffix(wordsList.get(0), 3) + prefix +
prefix(wordsList.get(1), 3) suffix(wordsList.get(0), 3) +
); prefix(wordsList.get(1), 3));
set.add( set
prefix + .add(
prefix(wordsList.get(0), 3) + prefix +
suffix(wordsList.get(1), 3) prefix(wordsList.get(0), 3) +
); suffix(wordsList.get(1), 3));
break; break;
default: default:
set.add( set
prefix + .add(
suffix(wordsList.get(0), 3) + prefix +
prefix(wordsList.get(1), 3) + suffix(wordsList.get(0), 3) +
suffix(wordsList.get(2), 3) prefix(wordsList.get(1), 3) +
); suffix(wordsList.get(2), 3));
set.add( set
prefix + .add(
prefix(wordsList.get(0), 3) + prefix +
suffix(wordsList.get(1), 3) + prefix(wordsList.get(0), 3) +
prefix(wordsList.get(2), 3) suffix(wordsList.get(1), 3) +
); prefix(wordsList.get(2), 3));
break; break;
} }
return set; return set;
} }
private String suffix(String s, int len) {
return s.substring(s.length() - len);
}
private String suffix(String s, int len) { private String prefix(String s, int len) {
return s.substring(s.length()-len); return s.substring(0, len);
} }
private String prefix(String s, int len) {
return s.substring(0, len);
}
} }

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import java.util.Collection; import java.util.Collection;
@ -5,53 +6,54 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
@ClusteringClass("wordssuffixprefix") @ClusteringClass("wordssuffixprefix")
public class WordsSuffixPrefix extends AbstractClusteringFunction { public class WordsSuffixPrefix extends AbstractClusteringFunction {
public WordsSuffixPrefix(Map<String, Integer> params) { public WordsSuffixPrefix(Map<String, Integer> params) {
super(params); super(params);
} }
@Override @Override
protected Collection<String> doApply(Config conf, String s) { protected Collection<String> doApply(Config conf, String s) {
return suffixPrefix(s, param("len"), param("max")); return suffixPrefix(s, param("len"), param("max"));
} }
private Collection<String> suffixPrefix(String s, int len, int max) { private Collection<String> suffixPrefix(String s, int len, int max) {
final int words = s.split(" ").length; final int words = s.split(" ").length;
// adjust the token length according to the number of words // adjust the token length according to the number of words
switch (words) { switch (words) {
case 1: case 1:
return Sets.newLinkedHashSet(); return Sets.newLinkedHashSet();
case 2: case 2:
return doSuffixPrefix(s, len+2, max, words); return doSuffixPrefix(s, len + 2, max, words);
case 3: case 3:
return doSuffixPrefix(s, len+1, max, words); return doSuffixPrefix(s, len + 1, max, words);
default: default:
return doSuffixPrefix(s, len, max, words); return doSuffixPrefix(s, len, max, words);
} }
} }
private Collection<String> doSuffixPrefix(String s, int len, int max, int words) { private Collection<String> doSuffixPrefix(String s, int len, int max, int words) {
final Set<String> bigrams = Sets.newLinkedHashSet(); final Set<String> bigrams = Sets.newLinkedHashSet();
int i = 0; int i = 0;
while (++i < s.length() && bigrams.size() < max) { while (++i < s.length() && bigrams.size() < max) {
int j = s.indexOf(" ", i); int j = s.indexOf(" ", i);
int offset = j + len + 1 < s.length() ? j + len + 1 : s.length(); int offset = j + len + 1 < s.length() ? j + len + 1 : s.length();
if (j - len > 0) { if (j - len > 0) {
String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim(); String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
if (bigram.length() >= 4) { if (bigram.length() >= 4) {
bigrams.add(words+bigram); bigrams.add(words + bigram);
} }
} }
} }
return bigrams; return bigrams;
} }
} }

View File

@ -1,14 +1,5 @@
package eu.dnetlib.pace.common;
import com.google.common.base.Joiner; package eu.dnetlib.pace.common;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.clustering.NGramUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException; import java.io.IOException;
import java.io.StringWriter; import java.io.StringWriter;
@ -19,6 +10,18 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.clustering.NGramUtils;
/** /**
* Set of common functions for the framework * Set of common functions for the framework
* *
@ -26,321 +29,325 @@ import java.util.stream.Collectors;
*/ */
public abstract class AbstractPaceFunctions { public abstract class AbstractPaceFunctions {
//city map to be used when translating the city names into codes // city map to be used when translating the city names into codes
private static Map<String, String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); private static Map<String, String> cityMap = AbstractPaceFunctions
.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
//list of stopwords in different languages // list of stopwords in different languages
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt"); protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt"); protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt"); protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
protected static Set<String> stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt"); protected static Set<String> stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt"); protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt"); protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
//transliterator // transliterator
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng"); protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
//blacklist of ngrams: to avoid generic keys // blacklist of ngrams: to avoid generic keys
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt"); protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
//html regex for normalization // html regex for normalization
public final String HTML_REGEX = "<[^>]*>"; public final String HTML_REGEX = "<[^>]*>";
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "; private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń"; private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn"; private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
//doi prefix for normalization // doi prefix for normalization
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?"); private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})"); private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
protected String concat(final List<String> l) { protected String concat(final List<String> l) {
return Joiner.on(" ").skipNulls().join(l); return Joiner.on(" ").skipNulls().join(l);
} }
protected String cleanup(final String s) { protected String cleanup(final String s) {
final String s1 = s.replaceAll(HTML_REGEX, ""); final String s1 = s.replaceAll(HTML_REGEX, "");
final String s2 = unicodeNormalization(s1.toLowerCase()); final String s2 = unicodeNormalization(s1.toLowerCase());
final String s3 = nfd(s2); final String s3 = nfd(s2);
final String s4 = fixXML(s3); final String s4 = fixXML(s3);
final String s5 = s4.replaceAll("([0-9]+)", " $1 "); final String s5 = s4.replaceAll("([0-9]+)", " $1 ");
final String s6 = transliterate(s5); final String s6 = transliterate(s5);
final String s7 = fixAliases(s6); final String s7 = fixAliases(s6);
final String s8 = s7.replaceAll("[^\\p{ASCII}]", ""); final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
final String s9 = s8.replaceAll("[\\p{Punct}]", " "); final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
final String s10 = s9.replaceAll("\\n", " "); final String s10 = s9.replaceAll("\\n", " ");
final String s11 = s10.replaceAll("(?m)\\s+", " "); final String s11 = s10.replaceAll("(?m)\\s+", " ");
final String s12 = s11.trim(); final String s12 = s11.trim();
return s12; return s12;
} }
protected String fixXML(final String a){ protected String fixXML(final String a) {
return a.replaceAll("&ndash;", " ") return a
.replaceAll("&amp;", " ") .replaceAll("&ndash;", " ")
.replaceAll("&quot;", " ") .replaceAll("&amp;", " ")
.replaceAll("&minus;", " "); .replaceAll("&quot;", " ")
} .replaceAll("&minus;", " ");
}
protected boolean checkNumbers(final String a, final String b) { protected boolean checkNumbers(final String a, final String b) {
final String numbersA = getNumbers(a); final String numbersA = getNumbers(a);
final String numbersB = getNumbers(b); final String numbersB = getNumbers(b);
final String romansA = getRomans(a); final String romansA = getRomans(a);
final String romansB = getRomans(b); final String romansB = getRomans(b);
return !numbersA.equals(numbersB) || !romansA.equals(romansB); return !numbersA.equals(numbersB) || !romansA.equals(romansB);
} }
protected String getRomans(final String s) { protected String getRomans(final String s) {
final StringBuilder sb = new StringBuilder(); final StringBuilder sb = new StringBuilder();
for (final String t : s.split(" ")) { for (final String t : s.split(" ")) {
sb.append(isRoman(t) ? t : ""); sb.append(isRoman(t) ? t : "");
} }
return sb.toString(); return sb.toString();
} }
protected boolean isRoman(final String s) { protected boolean isRoman(final String s) {
return s.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop").equals("qwertyuiop"); return s
} .replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop")
.equals("qwertyuiop");
}
protected String getNumbers(final String s) { protected String getNumbers(final String s) {
final StringBuilder sb = new StringBuilder(); final StringBuilder sb = new StringBuilder();
for (final String t : s.split(" ")) { for (final String t : s.split(" ")) {
sb.append(isNumber(t) ? t : ""); sb.append(isNumber(t) ? t : "");
} }
return sb.toString(); return sb.toString();
} }
public boolean isNumber(String strNum) { public boolean isNumber(String strNum) {
if (strNum == null) { if (strNum == null) {
return false; return false;
} }
return numberPattern.matcher(strNum).matches(); return numberPattern.matcher(strNum).matches();
} }
protected static String fixAliases(final String s) { protected static String fixAliases(final String s) {
final StringBuilder sb = new StringBuilder(); final StringBuilder sb = new StringBuilder();
for (final char ch : Lists.charactersOf(s)) { for (final char ch : Lists.charactersOf(s)) {
final int i = StringUtils.indexOf(aliases_from, ch); final int i = StringUtils.indexOf(aliases_from, ch);
sb.append(i >= 0 ? aliases_to.charAt(i) : ch); sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
} }
return sb.toString(); return sb.toString();
} }
protected static String transliterate(final String s) { protected static String transliterate(final String s) {
try { try {
return transliterator.transliterate(s); return transliterator.transliterate(s);
} } catch (Exception e) {
catch(Exception e) { return s;
return s; }
} }
}
protected String removeSymbols(final String s) { protected String removeSymbols(final String s) {
final StringBuilder sb = new StringBuilder(); final StringBuilder sb = new StringBuilder();
for (final char ch : Lists.charactersOf(s)) { for (final char ch : Lists.charactersOf(s)) {
sb.append(StringUtils.contains(alpha, ch) ? ch : " "); sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
} }
return sb.toString().replaceAll("\\s+", " "); return sb.toString().replaceAll("\\s+", " ");
} }
protected boolean notNull(final String s) {
return s != null;
}
protected String normalize(final String s) { protected boolean notNull(final String s) {
return fixAliases(transliterate(nfd(unicodeNormalization(s)))) return s != null;
.toLowerCase() }
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
.replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
public String nfd(final String s) { protected String normalize(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD); return fixAliases(transliterate(nfd(unicodeNormalization(s))))
} .toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
// strings
.replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
public String utf8(final String s) { public String nfd(final String s) {
byte[] bytes = s.getBytes(StandardCharsets.UTF_8); return Normalizer.normalize(s, Normalizer.Form.NFD);
return new String(bytes, StandardCharsets.UTF_8); }
}
public String unicodeNormalization(final String s) { public String utf8(final String s) {
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
return new String(bytes, StandardCharsets.UTF_8);
}
Matcher m = hexUnicodePattern.matcher(s); public String unicodeNormalization(final String s) {
StringBuffer buf = new StringBuffer(s.length());
while (m.find()) {
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
}
m.appendTail(buf);
return buf.toString();
}
protected String filterStopWords(final String s, final Set<String> stopwords) { Matcher m = hexUnicodePattern.matcher(s);
final StringTokenizer st = new StringTokenizer(s); StringBuffer buf = new StringBuffer(s.length());
final StringBuilder sb = new StringBuilder(); while (m.find()) {
while (st.hasMoreTokens()) { String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
final String token = st.nextToken(); m.appendReplacement(buf, Matcher.quoteReplacement(ch));
if (!stopwords.contains(token)) { }
sb.append(token); m.appendTail(buf);
sb.append(" "); return buf.toString();
} }
}
return sb.toString().trim();
}
public String filterAllStopWords(String s) { protected String filterStopWords(final String s, final Set<String> stopwords) {
final StringTokenizer st = new StringTokenizer(s);
final StringBuilder sb = new StringBuilder();
while (st.hasMoreTokens()) {
final String token = st.nextToken();
if (!stopwords.contains(token)) {
sb.append(token);
sb.append(" ");
}
}
return sb.toString().trim();
}
s = filterStopWords(s, stopwords_en); public String filterAllStopWords(String s) {
s = filterStopWords(s, stopwords_de);
s = filterStopWords(s, stopwords_it);
s = filterStopWords(s, stopwords_fr);
s = filterStopWords(s, stopwords_pt);
s = filterStopWords(s, stopwords_es);
s = filterStopWords(s, stopwords_gr);
return s; s = filterStopWords(s, stopwords_en);
} s = filterStopWords(s, stopwords_de);
s = filterStopWords(s, stopwords_it);
s = filterStopWords(s, stopwords_fr);
s = filterStopWords(s, stopwords_pt);
s = filterStopWords(s, stopwords_es);
s = filterStopWords(s, stopwords_gr);
protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) { return s;
final Set<String> newset = Sets.newLinkedHashSet(); }
for (final String s : set) {
if (!ngramBlacklist.contains(s)) {
newset.add(s);
}
}
return newset;
}
public static Set<String> loadFromClasspath(final String classpath) { protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
final Set<String> newset = Sets.newLinkedHashSet();
for (final String s : set) {
if (!ngramBlacklist.contains(s)) {
newset.add(s);
}
}
return newset;
}
Transliterator transliterator = Transliterator.getInstance("Any-Eng"); public static Set<String> loadFromClasspath(final String classpath) {
final Set<String> h = Sets.newHashSet(); Transliterator transliterator = Transliterator.getInstance("Any-Eng");
try {
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords
}
} catch (final Throwable e) {
return Sets.newHashSet();
}
return h;
}
public static Map<String, String> loadMapFromClasspath(final String classpath) { final Set<String> h = Sets.newHashSet();
try {
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
}
} catch (final Throwable e) {
return Sets.newHashSet();
}
return h;
}
Transliterator transliterator = Transliterator.getInstance("Any-Eng"); public static Map<String, String> loadMapFromClasspath(final String classpath) {
final Map<String, String> m = new HashMap<>(); Transliterator transliterator = Transliterator.getInstance("Any-Eng");
try {
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
//string is like this: code;word1;word2;word3
String[] line = s.split(";");
String value = line[0];
for (int i = 1; i < line.length; i++) {
m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value);
}
}
} catch (final Throwable e) {
return new HashMap<>();
}
return m;
}
public String removeKeywords(String s, Set<String> keywords) { final Map<String, String> m = new HashMap<>();
try {
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
// string is like this: code;word1;word2;word3
String[] line = s.split(";");
String value = line[0];
for (int i = 1; i < line.length; i++) {
m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value);
}
}
} catch (final Throwable e) {
return new HashMap<>();
}
return m;
}
s = " " + s + " "; public String removeKeywords(String s, Set<String> keywords) {
for (String k : keywords) {
s = s.replaceAll(k.toLowerCase(), "");
}
return s.trim(); s = " " + s + " ";
} for (String k : keywords) {
s = s.replaceAll(k.toLowerCase(), "");
}
public double commonElementsPercentage(Set<String> s1, Set<String> s2) { return s.trim();
}
double longer = Math.max(s1.size(), s2.size()); public double commonElementsPercentage(Set<String> s1, Set<String> s2) {
return (double) s1.stream().filter(s2::contains).count() / longer;
}
//convert the set of keywords to codes double longer = Math.max(s1.size(), s2.size());
public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) { return (double) s1.stream().filter(s2::contains).count() / longer;
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet()); }
}
public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) { // convert the set of keywords to codes
return toCodes(keywords, translationMap); public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
} return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
}
public Set<String> citiesToCodes(Set<String> keywords) { public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
return toCodes(keywords, cityMap); return toCodes(keywords, translationMap);
} }
protected String firstLC(final String s) { public Set<String> citiesToCodes(Set<String> keywords) {
return StringUtils.substring(s, 0, 1).toLowerCase(); return toCodes(keywords, cityMap);
} }
protected Iterable<String> tokens(final String s, final int maxTokens) { protected String firstLC(final String s) {
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens); return StringUtils.substring(s, 0, 1).toLowerCase();
} }
public String normalizePid(String pid) { protected Iterable<String> tokens(final String s, final int maxTokens) {
return pid.toLowerCase().replaceAll(DOI_PREFIX, ""); return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
} }
//get the list of keywords into the input string public String normalizePid(String pid) {
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize) { return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
}
String s = s1; // get the list of keywords into the input string
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize) {
List<String> tokens = Arrays.asList(s.toLowerCase().split(" ")); String s = s1;
Set<String> codes = new HashSet<>(); List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));
if (tokens.size() < windowSize) Set<String> codes = new HashSet<>();
windowSize = tokens.size();
int length = windowSize; if (tokens.size() < windowSize)
windowSize = tokens.size();
while (length != 0) { int length = windowSize;
for (int i = 0; i <= tokens.size() - length; i++) { while (length != 0) {
String candidate = concat(tokens.subList(i, i + length));
if (translationMap.containsKey(candidate)) {
codes.add(candidate);
s = s.replace(candidate, "").trim();
}
}
tokens = Arrays.asList(s.split(" ")); for (int i = 0; i <= tokens.size() - length; i++) {
length -= 1; String candidate = concat(tokens.subList(i, i + length));
} if (translationMap.containsKey(candidate)) {
codes.add(candidate);
s = s.replace(candidate, "").trim();
}
}
return codes; tokens = Arrays.asList(s.split(" "));
} length -= 1;
}
public Set<String> getCities(String s1, int windowSize) { return codes;
return getKeywords(s1, cityMap, windowSize); }
}
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) { public Set<String> getCities(String s1, int windowSize) {
final StringWriter sw = new StringWriter(); return getKeywords(s1, cityMap, windowSize);
try { }
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
return sw.toString(); public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
} catch (final IOException e) { final StringWriter sw = new StringWriter();
throw new RuntimeException("cannot load resource from classpath: " + filename); try {
} IOUtils.copy(clazz.getResourceAsStream(filename), sw);
} return sw.toString();
} catch (final IOException e) {
throw new RuntimeException("cannot load resource from classpath: " + filename);
}
}
} }

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.config; package eu.dnetlib.pace.config;
import java.util.List; import java.util.List;
@ -44,7 +45,6 @@ public interface Config {
*/ */
public Map<String, Predicate<String>> blacklists(); public Map<String, Predicate<String>> blacklists();
/** /**
* Translation map. * Translation map.
* *

View File

@ -1,16 +1,5 @@
package eu.dnetlib.pace.config;
import com.fasterxml.jackson.annotation.JsonIgnore; package eu.dnetlib.pace.config;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.util.PaceException;
import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
@ -25,139 +14,167 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException; import java.util.regex.PatternSyntaxException;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef; import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceException;
public class DedupConfig implements Config, Serializable { public class DedupConfig implements Config, Serializable {
private static String CONFIG_TEMPLATE = "dedupConfig.st"; private static String CONFIG_TEMPLATE = "dedupConfig.st";
private PaceConfig pace; private PaceConfig pace;
private WfConfig wf; private WfConfig wf;
@JsonIgnore @JsonIgnore
private Map<String, Predicate<String>> blacklists; private Map<String, Predicate<String>> blacklists;
private static Map<String, String> defaults = Maps.newHashMap(); private static Map<String, String> defaults = Maps.newHashMap();
static { static {
defaults.put("dedupRun", "001"); defaults.put("dedupRun", "001");
defaults.put("entityType", "result"); defaults.put("entityType", "result");
defaults.put("subEntityType", "resulttype"); defaults.put("subEntityType", "resulttype");
defaults.put("subEntityValue", "publication"); defaults.put("subEntityValue", "publication");
defaults.put("orderField", "title"); defaults.put("orderField", "title");
defaults.put("queueMaxSize", "2000"); defaults.put("queueMaxSize", "2000");
defaults.put("groupMaxSize", "10"); defaults.put("groupMaxSize", "10");
defaults.put("slidingWindowSize", "200"); defaults.put("slidingWindowSize", "200");
defaults.put("rootBuilder", "result"); defaults.put("rootBuilder", "result");
defaults.put("includeChildren", "true"); defaults.put("includeChildren", "true");
defaults.put("maxIterations", "20"); defaults.put("maxIterations", "20");
defaults.put("idPath", "$.id"); defaults.put("idPath", "$.id");
} }
public DedupConfig() { public DedupConfig() {
} }
public static DedupConfig load(final String json) { public static DedupConfig load(final String json) {
final DedupConfig config; final DedupConfig config;
try { try {
config = new ObjectMapper().readValue(json, DedupConfig.class); config = new ObjectMapper().readValue(json, DedupConfig.class);
config.getPace().initModel(); config.getPace().initModel();
config.getPace().initTranslationMap(); config.getPace().initTranslationMap();
config.blacklists = config
    .getPace()
    .getBlacklists()
    .entrySet()
    .stream()
    .map(
        e -> new AbstractMap.SimpleEntry<String, List<Pattern>>(e.getKey(),
            e
                .getValue()
                .stream()
                .filter(s -> !StringUtils.isBlank(s))
                .map(Pattern::compile)
                .collect(Collectors.toList())))
    .collect(
        Collectors
            .toMap(
                e -> e.getKey(),
                e -> (Predicate<String> & Serializable) s -> e
                    .getValue()
                    .stream()
                    .filter(p -> p.matcher(s).matches())
                    .findFirst()
                    .isPresent()))
    ;
return config;
} catch (IOException | PatternSyntaxException e) {
    throw new PaceException("Error in parsing configuration json", e);
}
}
public static DedupConfig loadDefault() throws IOException { public static DedupConfig loadDefault() throws IOException {
return loadDefault(new HashMap<String, String>()); return loadDefault(new HashMap<String, String>());
} }
public static DedupConfig loadDefault(final Map<String, String> params) throws IOException { public static DedupConfig loadDefault(final Map<String, String> params) throws IOException {
final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE)); final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE));
for (final Entry<String, String> e : defaults.entrySet()) { for (final Entry<String, String> e : defaults.entrySet()) {
template.setAttribute(e.getKey(), e.getValue()); template.setAttribute(e.getKey(), e.getValue());
} }
for (final Entry<String, String> e : params.entrySet()) { for (final Entry<String, String> e : params.entrySet()) {
if (template.getAttribute(e.getKey()) != null) { if (template.getAttribute(e.getKey()) != null) {
template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue()); template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue());
} else { } else {
template.setAttribute(e.getKey(), e.getValue()); template.setAttribute(e.getKey(), e.getValue());
} }
} }
final String json = template.toString(); final String json = template.toString();
return load(json); return load(json);
} }
private String readFromClasspath(final String resource) throws IOException { private String readFromClasspath(final String resource) throws IOException {
return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8); return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8);
} }
public PaceConfig getPace() { public PaceConfig getPace() {
return pace; return pace;
} }
public void setPace(final PaceConfig pace) { public void setPace(final PaceConfig pace) {
this.pace = pace; this.pace = pace;
} }
public WfConfig getWf() { public WfConfig getWf() {
return wf; return wf;
} }
public void setWf(final WfConfig wf) { public void setWf(final WfConfig wf) {
this.wf = wf; this.wf = wf;
} }
@Override @Override
public String toString() { public String toString() {
try { try {
return new ObjectMapper().writeValueAsString(this); return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) { } catch (IOException e) {
throw new PaceException("unable to serialise configuration", e); throw new PaceException("unable to serialise configuration", e);
} }
} }
@Override @Override
public Map<String, TreeNodeDef> decisionTree() { public Map<String, TreeNodeDef> decisionTree() {
return getPace().getDecisionTree(); return getPace().getDecisionTree();
} }
@Override @Override
public List<FieldDef> model() { public List<FieldDef> model() {
return getPace().getModel(); return getPace().getModel();
} }
@Override @Override
public List<ClusteringDef> clusterings() { public List<ClusteringDef> clusterings() {
return getPace().getClustering(); return getPace().getClustering();
} }
@Override @Override
public Map<String, Predicate<String>> blacklists() { public Map<String, Predicate<String>> blacklists() {
return blacklists; return blacklists;
} }
@Override @Override
public Map<String, String> translationMap() { public Map<String, String> translationMap() {
return getPace().translationMap(); return getPace().translationMap();
} }
} }
View File
@ -1,19 +1,20 @@
package eu.dnetlib.pace.config; package eu.dnetlib.pace.config;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonIgnore;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import com.ibm.icu.text.Transliterator; import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef; import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceResolver; import eu.dnetlib.pace.util.PaceResolver;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
public class PaceConfig extends AbstractPaceFunctions implements Serializable { public class PaceConfig extends AbstractPaceFunctions implements Serializable {
private List<FieldDef> model; private List<FieldDef> model;
@ -37,7 +38,8 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable {
@JsonIgnore @JsonIgnore
public static PaceResolver resolver = new PaceResolver(); public static PaceResolver resolver = new PaceResolver();
public PaceConfig() {
}
public void initModel() { public void initModel() {
modelMap = Maps.newHashMap(); modelMap = Maps.newHashMap();
@ -46,20 +48,21 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable {
} }
} }
public void initTranslationMap(){ public void initTranslationMap() {
translationMap = Maps.newHashMap(); translationMap = Maps.newHashMap();
Transliterator transliterator = Transliterator.getInstance("Any-Eng"); Transliterator transliterator = Transliterator.getInstance("Any-Eng");
for (String key : synonyms.keySet()) { for (String key : synonyms.keySet()) {
for (String term : synonyms.get(key)){ for (String term : synonyms.get(key)) {
translationMap
    .put(
        fixAliases(transliterator.transliterate(term.toLowerCase())),
        key);
} }
} }
} }
public Map<String, String> translationMap(){ public Map<String, String> translationMap() {
return translationMap; return translationMap;
} }
View File
@ -1,3 +1,4 @@
package eu.dnetlib.pace.config; package eu.dnetlib.pace.config;
public enum Type { public enum Type {
View File
@ -1,10 +1,5 @@
package eu.dnetlib.pace.config;
import com.fasterxml.jackson.databind.ObjectMapper; package eu.dnetlib.pace.config;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
@ -12,6 +7,13 @@ import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.util.PaceException;
public class WfConfig implements Serializable { public class WfConfig implements Serializable {
@ -76,7 +78,6 @@ public class WfConfig implements Serializable {
/** Maximum number of allowed children. */ /** Maximum number of allowed children. */
private int maxChildren = MAX_CHILDREN; private int maxChildren = MAX_CHILDREN;
/** Default maximum number of iterations. */ /** Default maximum number of iterations. */
private final static int MAX_ITERATIONS = 20; private final static int MAX_ITERATIONS = 20;
@ -84,9 +85,10 @@ public class WfConfig implements Serializable {
private int maxIterations = MAX_ITERATIONS; private int maxIterations = MAX_ITERATIONS;
/** The Jquery path to retrieve the identifier */ /** The Jquery path to retrieve the identifier */
private String idPath = "$.id"; private String idPath = "$.id";
public WfConfig() {
}
/** /**
* Instantiates a new dedup config. * Instantiates a new dedup config.
@ -114,8 +116,10 @@ public class WfConfig implements Serializable {
* @param idPath * @param idPath
* the path for the id of the entity * the path for the id of the entity
*/ */
public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder,
    final String dedupRun,
    final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize,
    final boolean includeChildren, final int maxIterations, final String idPath) {
super(); super();
this.entityType = entityType; this.entityType = entityType;
this.orderField = orderField; this.orderField = orderField;
@ -257,7 +261,6 @@ public class WfConfig implements Serializable {
this.maxChildren = maxChildren; this.maxChildren = maxChildren;
} }
public int getMaxIterations() { public int getMaxIterations() {
return maxIterations; return maxIterations;
} }
@ -277,7 +280,6 @@ public class WfConfig implements Serializable {
/* /*
* (non-Javadoc) * (non-Javadoc)
*
* @see java.lang.Object#toString() * @see java.lang.Object#toString()
*/ */
@Override @Override
View File
@ -1,15 +1,16 @@
package eu.dnetlib.pace.model;
import com.fasterxml.jackson.databind.ObjectMapper; package eu.dnetlib.pace.model;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.util.PaceException;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.util.PaceException;
public class ClusteringDef implements Serializable { public class ClusteringDef implements Serializable {
@ -19,7 +20,8 @@ public class ClusteringDef implements Serializable {
private Map<String, Integer> params; private Map<String, Integer> params;
public ClusteringDef() {
}
public String getName() { public String getName() {
return name; return name;
View File
@ -1,13 +1,15 @@
package eu.dnetlib.pace.model; package eu.dnetlib.pace.model;
import java.io.Serializable;
import java.util.List;
import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Splitter; import com.google.common.base.Splitter;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Type;
import java.io.Serializable; import eu.dnetlib.pace.config.Type;
import java.util.List;
/** /**
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm. * The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm.
@ -34,7 +36,8 @@ public class FieldDef implements Serializable {
*/ */
private int length = -1; private int length = -1;
public FieldDef() {
}
public String getName() { public String getName() {
return name; return name;
View File
@ -1,3 +1,4 @@
package eu.dnetlib.pace.model; package eu.dnetlib.pace.model;
import java.nio.charset.Charset; import java.nio.charset.Charset;
@ -43,7 +44,7 @@ public class Person {
// s = s.replaceAll("[\\W&&[^,-]]", ""); // s = s.replaceAll("[\\W&&[^,-]]", "");
} }
if (s.contains(",")) { //if the name contains a comma it is easy derivable the name and the surname if (s.contains(",")) { // if the name contains a comma it is easy derivable the name and the surname
final String[] arr = s.split(","); final String[] arr = s.split(",");
if (arr.length == 1) { if (arr.length == 1) {
fullname = splitTerms(arr[0]); fullname = splitTerms(arr[0]);
View File
@ -1,3 +1,4 @@
package eu.dnetlib.pace.model; package eu.dnetlib.pace.model;
import java.util.ArrayList; import java.util.ArrayList;
@ -57,7 +58,7 @@ public class PersonComparatorUtils {
private static boolean verifyNames(List<String> list1, List<String> list2) { private static boolean verifyNames(List<String> list1, List<String> list2) {
return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2)) return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2))
&& verifySimilarity(extractInitials(list1), extractInitials(list2)); && verifySimilarity(extractInitials(list1), extractInitials(list2));
} }
private static boolean verifySurnames(List<String> list1, List<String> list2) { private static boolean verifySurnames(List<String> list1, List<String> list2) {
@ -76,7 +77,7 @@ public class PersonComparatorUtils {
Collections.sort(list1); Collections.sort(list1);
Collections.sort(list2); Collections.sort(list2);
return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2)) return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2))
&& verifySimilarity(extractInitials(list1), extractInitials(list2)); && verifySimilarity(extractInitials(list1), extractInitials(list2));
} }
private static List<String> extractExtendedNames(List<String> list) { private static List<String> extractExtendedNames(List<String> list) {
@ -107,7 +108,7 @@ public class PersonComparatorUtils {
for (String s : list1) { for (String s : list1) {
int curr = list2.indexOf(s); int curr = list2.indexOf(s);
if (curr > pos) { if (curr > pos) {
list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm" list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm"
pos = curr; pos = curr;
} else { } else {
return false; return false;
View File
@ -1,9 +1,11 @@
package eu.dnetlib.pace.model; package eu.dnetlib.pace.model;
import eu.dnetlib.pace.clustering.NGramUtils; import java.util.Comparator;
import org.apache.spark.sql.Row; import org.apache.spark.sql.Row;
import java.util.Comparator; import eu.dnetlib.pace.clustering.NGramUtils;
/** /**
* The Class MapDocumentComparator. * The Class MapDocumentComparator.
@ -25,13 +27,12 @@ public class RowDataOrderingComparator implements Comparator<Row> {
/* /*
* (non-Javadoc) * (non-Javadoc)
*
* @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
*/ */
@Override @Override
public int compare(final Row d1, final Row d2) { public int compare(final Row d1, final Row d2) {
if (d1 == null) if (d1 == null)
return d2==null ? 0: -1; return d2 == null ? 0 : -1;
else if (d2 == null) { else if (d2 == null) {
return 1; return 1;
} }
@ -40,7 +41,7 @@ public class RowDataOrderingComparator implements Comparator<Row> {
final String o2 = d2.getString(comparatorField); final String o2 = d2.getString(comparatorField);
if (o1 == null) if (o1 == null)
return o2==null ? 0: -1; return o2 == null ? 0 : -1;
else if (o2 == null) { else if (o2 == null) {
return 1; return 1;
} }
View File
@ -1,32 +1,30 @@
package eu.dnetlib.dhp.oa.dedup.model package eu.dnetlib.pace.model
import com.jayway.jsonpath.{Configuration, JsonPath, Option} import com.jayway.jsonpath.{Configuration, JsonPath, Option}
import eu.dnetlib.dhp.oa.dedup.{DedupUtility, SparkReporter}
import eu.dnetlib.pace.config.{DedupConfig, Type} import eu.dnetlib.pace.config.{DedupConfig, Type}
import eu.dnetlib.pace.model.{ClusteringDef, FieldDef}
import eu.dnetlib.pace.tree.support.TreeProcessor import eu.dnetlib.pace.tree.support.TreeProcessor
import eu.dnetlib.pace.util.MapDocumentUtil.truncateValue import eu.dnetlib.pace.util.MapDocumentUtil.truncateValue
import eu.dnetlib.pace.util.{BlockProcessor, MapDocumentUtil} import eu.dnetlib.pace.util.{BlockProcessor, MapDocumentUtil, SparkReporter}
import org.apache.spark.SparkContext import org.apache.spark.SparkContext
import org.apache.spark.sql.{Column, Dataset, Row, functions}
import org.apache.spark.sql.catalyst.expressions.{GenericRowWithSchema, Literal} import org.apache.spark.sql.catalyst.expressions.{GenericRowWithSchema, Literal}
import org.apache.spark.sql.expressions.{UserDefinedFunction, Window} import org.apache.spark.sql.expressions.{UserDefinedFunction, Window}
import org.apache.spark.sql.functions.{col, lit, udf}
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
import org.apache.spark.sql.{Column, Dataset, Row, functions}
import java.util import java.util
import java.util.function.Predicate import java.util.function.Predicate
import java.util.regex.Pattern import java.util.regex.Pattern
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
import scala.collection.mutable import scala.collection.mutable
import org.apache.spark.sql.functions.{col, lit, udf}
class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializable { case class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializable {
private val URL_REGEX: Pattern = Pattern.compile("^\\s*(http|https|ftp)\\://.*") private val URL_REGEX: Pattern = Pattern.compile("^\\s*(http|https|ftp)\\://.*")
private val CONCAT_REGEX: Pattern = Pattern.compile("\\|\\|\\|") private val CONCAT_REGEX: Pattern = Pattern.compile("\\|\\|\\|")
private var urlFilter = (s: String) => URL_REGEX.matcher(s).matches private val urlFilter = (s: String) => URL_REGEX.matcher(s).matches
val modelExtractor: (Dataset[String] => Dataset[Row]) = df => { val modelExtractor: (Dataset[String] => Dataset[Row]) = df => {
df.withColumn("mapDocument", rowFromJsonUDF.apply(df.col(df.columns(0)))) df.withColumn("mapDocument", rowFromJsonUDF.apply(df.col(df.columns(0))))
@ -226,60 +224,59 @@ class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializab
val orderingFieldPosition: Int = rowDataType.fieldIndex(conf.getWf.getOrderField) val orderingFieldPosition: Int = rowDataType.fieldIndex(conf.getWf.getOrderField)
val rowFromJson = (json: String) => {
    val documentContext =
        JsonPath.using(Configuration.defaultConfiguration.addOptions(Option.SUPPRESS_EXCEPTIONS)).parse(json)
    val values = new Array[Any](rowDataType.size)

    values(identityFieldPosition) = MapDocumentUtil.getJPathString(conf.getWf.getIdPath, documentContext)

    rowDataType.fieldNames.zipWithIndex.foldLeft(values) {
        case ((res, (fname, index))) => {
            val fdef = conf.getPace.getModelMap.get(fname)

            if (fdef != null) {
                res(index) = fdef.getType match {
                    case Type.String | Type.Int =>
                        MapDocumentUtil.truncateValue(
                            MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
                            fdef.getLength
                        )

                    case Type.URL =>
                        var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
                        if (!urlFilter(uv)) uv = ""
                        uv

                    case Type.List | Type.JSON =>
                        MapDocumentUtil.truncateList(
                            MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
                            fdef.getSize
                        )

                    case Type.StringConcat =>
                        val jpaths = CONCAT_REGEX.split(fdef.getPath)
                        truncateValue(
                            jpaths
                                .map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
                                .mkString(" "),
                            fdef.getLength
                        )

                    case Type.DoubleArray =>
                        MapDocumentUtil.getJPathArray(fdef.getPath, json)
                }
            }

            res
        }
    }

    new GenericRowWithSchema(values, rowDataType)
}

val rowFromJsonUDF = udf(rowFromJson, rowDataType)
def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = { def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = {
@ -310,7 +307,7 @@ class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializab
} }
def processBlock(implicit sc: SparkContext) = { def processBlock(implicit sc: SparkContext) = {
val accumulators = DedupUtility.constructAccumulator(conf, sc) val accumulators = SparkReporter.constructAccumulator(conf, sc)
udf[Array[Tuple2[String, String]], mutable.WrappedArray[Row]](block => { udf[Array[Tuple2[String, String]], mutable.WrappedArray[Row]](block => {
val reporter = new SparkReporter(accumulators) val reporter = new SparkReporter(accumulators)
View File
@ -1,41 +1,42 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("alwaysMatch") @ComparatorClass("alwaysMatch")
public class AlwaysMatch<T> extends AbstractComparator<T> { public class AlwaysMatch<T> extends AbstractComparator<T> {
public AlwaysMatch(final Map<String, String> params){ public AlwaysMatch(final Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler()); super(params, new com.wcohen.ss.JaroWinkler());
} }
public AlwaysMatch(final double weight) { public AlwaysMatch(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler()); super(weight, new com.wcohen.ss.JaroWinkler());
} }
protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) { protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo); super(weight, ssalgo);
} }
@Override @Override
public double compare(final Object a, final Object b, final Config conf) { public double compare(final Object a, final Object b, final Config conf) {
return 1.0; return 1.0;
} }
@Override @Override
public double getWeight() { public double getWeight() {
return super.weight; return super.weight;
} }
@Override @Override
protected double normalize(final double d) { protected double normalize(final double d) {
return d; return d;
} }
} }
View File
@ -1,148 +1,157 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance; package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("authorsMatch") @ComparatorClass("authorsMatch")
public class AuthorsMatch extends AbstractListComparator { public class AuthorsMatch extends AbstractListComparator {
Map<String, String> params; Map<String, String> params;
private double SURNAME_THRESHOLD; private double SURNAME_THRESHOLD;
private double NAME_THRESHOLD; private double NAME_THRESHOLD;
private double FULLNAME_THRESHOLD; private double FULLNAME_THRESHOLD;
private String MODE; //full or surname private String MODE; // full or surname
private int SIZE_THRESHOLD; private int SIZE_THRESHOLD;
private String TYPE; //count or percentage private String TYPE; // count or percentage
private int common; private int common;
public AuthorsMatch(Map<String, String> params){ public AuthorsMatch(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler()); super(params, new com.wcohen.ss.JaroWinkler());
this.params = params; this.params = params;
MODE = params.getOrDefault("mode", "full"); MODE = params.getOrDefault("mode", "full");
SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95")); SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95")); NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9")); FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20")); SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
TYPE = params.getOrDefault("type", "percentage"); TYPE = params.getOrDefault("type", "percentage");
common = 0; common = 0;
} }
protected AuthorsMatch(double w, AbstractStringDistance ssalgo) { protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
super(w, ssalgo); super(w, ssalgo);
} }
@Override @Override
public double compare(final List<String> a, final List<String> b, final Config conf) { public double compare(final List<String> a, final List<String> b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) if (a.isEmpty() || b.isEmpty())
return -1; return -1;
if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD) if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
return 1.0; return 1.0;
List<Person> aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList()); List<Person> aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList()); List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
common = 0; common = 0;
//compare each element of List1 with each element of List2 // compare each element of List1 with each element of List2
for (Person p1 : aList) for (Person p1 : aList)
for (Person p2 : bList) { for (Person p2 : bList) {
//both persons are inaccurate // both persons are inaccurate
if (!p1.isAccurate() && !p2.isAccurate()) { if (!p1.isAccurate() && !p2.isAccurate()) {
//compare just normalized fullnames // compare just normalized fullnames
String fullname1 = normalization(
    p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname());
String fullname2 = normalization(
    p2.getNormalisedFullname().isEmpty() ? p2.getOriginal() : p2.getNormalisedFullname());
if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) { if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) {
common += 1; common += 1;
break; break;
} }
} }
//one person is inaccurate // one person is inaccurate
if (p1.isAccurate() ^ p2.isAccurate()) { if (p1.isAccurate() ^ p2.isAccurate()) {
//prepare data // prepare data
//data for the accurate person // data for the accurate person
String name = normalization(
    p1.isAccurate() ? p1.getNormalisedFirstName() : p2.getNormalisedFirstName());
String surname = normalization(
    p1.isAccurate() ? p1.getNormalisedSurname() : p2.getNormalisedSurname());

// data for the inaccurate person
String fullname = normalization(
    p1.isAccurate()
        ? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname())
        : (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname()));
if (fullname.contains(surname)) { if (fullname.contains(surname)) {
if (MODE.equals("full")) { if (MODE.equals("full")) {
if (fullname.contains(name)) { if (fullname.contains(name)) {
common += 1; common += 1;
break; break;
} }
} else { // MODE equals "surname"
    common += 1;
    break;
}
}
}
//both persons are accurate // both persons are accurate
if (p1.isAccurate() && p2.isAccurate()) { if (p1.isAccurate() && p2.isAccurate()) {
if (compareSurname(p1, p2)) { if (compareSurname(p1, p2)) {
if (MODE.equals("full")) { if (MODE.equals("full")) {
if(compareFirstname(p1, p2)) { if (compareFirstname(p1, p2)) {
common += 1; common += 1;
break; break;
} }
} else { // MODE equals "surname"
    common += 1;
    break;
}
}
}
} }
//normalization factor to compute the score // normalization factor to compute the score
int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common); int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
if(TYPE.equals("percentage")) { if (TYPE.equals("percentage")) {
return (double) common / normFactor; return (double) common / normFactor;
} else {
    return (double) common;
}
}
public boolean compareSurname(Person p1, Person p2) { public boolean compareSurname(Person p1, Person p2) {
return ssalgo
    .score(
        normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD;
}
public boolean compareFirstname(Person p1, Person p2) { public boolean compareFirstname(Person p1, Person p2) {
if(p1.getNormalisedFirstName().length()<=2 || p2.getNormalisedFirstName().length()<=2) { if (p1.getNormalisedFirstName().length() <= 2 || p2.getNormalisedFirstName().length() <= 2) {
if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName()))) if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName())))
return true; return true;
} }
return ssalgo
    .score(
        normalization(p1.getNormalisedFirstName()),
        normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD;
}
public String normalization(String s) { public String normalization(String s) {
return normalize(utf8(cleanup(s))); return normalize(utf8(cleanup(s)));
} }
} }
View File
@ -1,47 +1,48 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
import java.util.Set;
@ComparatorClass("cityMatch") @ComparatorClass("cityMatch")
public class CityMatch extends AbstractStringComparator { public class CityMatch extends AbstractStringComparator {
private Map<String, String> params; private Map<String, String> params;
public CityMatch(Map<String, String> params) { public CityMatch(Map<String, String> params) {
super(params); super(params);
this.params = params; this.params = params;
} }
@Override @Override
public double distance(final String a, final String b, final Config conf) { public double distance(final String a, final String b, final Config conf) {
String ca = cleanup(a); String ca = cleanup(a);
String cb = cleanup(b); String cb = cleanup(b);
ca = normalize(ca); ca = normalize(ca);
cb = normalize(cb); cb = normalize(cb);
ca = filterAllStopWords(ca); ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb); cb = filterAllStopWords(cb);
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4"))); Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4"))); Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> codes1 = citiesToCodes(cities1); Set<String> codes1 = citiesToCodes(cities1);
Set<String> codes2 = citiesToCodes(cities2); Set<String> codes2 = citiesToCodes(cities2);
//if no cities are detected, the comparator gives 1.0 // if no cities are detected, the comparator gives 1.0
if (codes1.isEmpty() && codes2.isEmpty()) if (codes1.isEmpty() && codes2.isEmpty())
return 1.0; return 1.0;
else { else {
if (codes1.isEmpty() ^ codes2.isEmpty()) if (codes1.isEmpty() ^ codes2.isEmpty())
return -1; //undefined if one of the two has no cities return -1; // undefined if one of the two has no cities
return commonElementsPercentage(codes1, codes2); return commonElementsPercentage(codes1, codes2);
} }
} }
} }
View File
@ -1,47 +1,47 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("cosineSimilarity") @ComparatorClass("cosineSimilarity")
public class CosineSimilarity extends AbstractComparator<double[]> { public class CosineSimilarity extends AbstractComparator<double[]> {
Map<String, String> params; Map<String, String> params;
public CosineSimilarity(Map<String,String> params) { public CosineSimilarity(Map<String, String> params) {
super(params); super(params);
} }
@Override @Override
public double compare(Object a, Object b, Config config) { public double compare(Object a, Object b, Config config) {
return compare((double[])a, (double[])b, config); return compare((double[]) a, (double[]) b, config);
} }
public double compare(final double[] a, final double[] b, final Config conf) { public double compare(final double[] a, final double[] b, final Config conf) {
if (a.length == 0 || b.length == 0) if (a.length == 0 || b.length == 0)
return -1; return -1;
return cosineSimilarity(a, b); return cosineSimilarity(a, b);
} }
double cosineSimilarity(double[] a, double[] b) { double cosineSimilarity(double[] a, double[] b) {
double dotProduct = 0; double dotProduct = 0;
double normASum = 0; double normASum = 0;
double normBSum = 0; double normBSum = 0;
for(int i = 0; i < a.length; i ++) { for (int i = 0; i < a.length; i++) {
dotProduct += a[i] * b[i]; dotProduct += a[i] * b[i];
normASum += a[i] * a[i]; normASum += a[i] * a[i];
normBSum += b[i] * b[i]; normBSum += b[i] * b[i];
} }
double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum);
return dotProduct / eucledianDist;
}
double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum);
return dotProduct / eucledianDist;
}
} }
View File
@ -1,9 +1,10 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map; import java.util.Map;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/** /**
* The Class ExactMatch. * The Class ExactMatch.
* *
@ -12,15 +13,15 @@ import java.util.Map;
@ComparatorClass("doiExactMatch") @ComparatorClass("doiExactMatch")
public class DoiExactMatch extends ExactMatchIgnoreCase { public class DoiExactMatch extends ExactMatchIgnoreCase {
public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
public DoiExactMatch(final Map<String, String> params) { public DoiExactMatch(final Map<String, String> params) {
super(params); super(params);
} }
@Override @Override
protected String toString(final Object f) { protected String toString(final Object f) {
return super.toString(f).replaceAll(PREFIX, ""); return super.toString(f).replaceAll(PREFIX, "");
} }
} }
View File
@ -1,29 +1,30 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.tree.support.ComparatorClass; package eu.dnetlib.pace.tree;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.util.Map; import java.util.Map;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("domainExactMatch") @ComparatorClass("domainExactMatch")
public class DomainExactMatch extends ExactMatchIgnoreCase { public class DomainExactMatch extends ExactMatchIgnoreCase {
public DomainExactMatch(final Map<String, String> params) { public DomainExactMatch(final Map<String, String> params) {
super(params); super(params);
} }
@Override @Override
protected String toString(final Object f) { protected String toString(final Object f) {
try { try {
return asUrl(super.toString(f)).getHost(); return asUrl(super.toString(f)).getHost();
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
return ""; return "";
} }
} }
private URL asUrl(final String value) throws MalformedURLException { private URL asUrl(final String value) throws MalformedURLException {
return new URL(value); return new URL(value);
} }
View File

@ -1,42 +1,44 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("exactMatch") @ComparatorClass("exactMatch")
public class ExactMatch extends AbstractStringComparator { public class ExactMatch extends AbstractStringComparator {
public ExactMatch(Map<String, String> params){ public ExactMatch(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler()); super(params, new com.wcohen.ss.JaroWinkler());
} }
public ExactMatch(final double weight) { public ExactMatch(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler()); super(weight, new com.wcohen.ss.JaroWinkler());
} }
protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) { protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo); super(weight, ssalgo);
} }
@Override @Override
public double distance(final String a, final String b, final Config conf) { public double distance(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) { if (a.isEmpty() || b.isEmpty()) {
return -1.0; //return -1 if a field is missing return -1.0; // return -1 if a field is missing
} }
return a.equals(b) ? 1.0 : 0; return a.equals(b) ? 1.0 : 0;
} }
@Override @Override
public double getWeight() { public double getWeight() {
return super.weight; return super.weight;
} }
@Override @Override
protected double normalize(final double d) { protected double normalize(final double d) {
return d; return d;
} }
} }
View File
@ -1,30 +1,32 @@
package eu.dnetlib.pace.tree;
import com.google.common.base.Joiner; package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.google.common.base.Joiner;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("exactMatchIgnoreCase") @ComparatorClass("exactMatchIgnoreCase")
public class ExactMatchIgnoreCase extends AbstractStringComparator { public class ExactMatchIgnoreCase extends AbstractStringComparator {
public ExactMatchIgnoreCase(Map<String, String> params) { public ExactMatchIgnoreCase(Map<String, String> params) {
super(params); super(params);
} }
@Override @Override
public double compare(String a, String b, final Config conf) { public double compare(String a, String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) if (a.isEmpty() || b.isEmpty())
return -1; return -1;
return a.equalsIgnoreCase(b) ? 1 : 0; return a.equalsIgnoreCase(b) ? 1 : 0;
} }
protected String toString(final Object object) { protected String toString(final Object object) {
return toFirstString(object); return toFirstString(object);
} }
} }
View File
@ -1,9 +1,5 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets; package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
@ -11,70 +7,74 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("instanceTypeMatch") @ComparatorClass("instanceTypeMatch")
public class InstanceTypeMatch extends AbstractListComparator { public class InstanceTypeMatch extends AbstractListComparator {
final Map<String, String> translationMap = new HashMap<>(); final Map<String, String> translationMap = new HashMap<>();
public InstanceTypeMatch(Map<String, String> params){ public InstanceTypeMatch(Map<String, String> params) {
super(params); super(params);
//jolly types // jolly types
translationMap.put("Conference object", "*"); translationMap.put("Conference object", "*");
translationMap.put("Other literature type", "*"); translationMap.put("Other literature type", "*");
translationMap.put("Unknown", "*"); translationMap.put("Unknown", "*");
//article types // article types
translationMap.put("Article", "Article"); translationMap.put("Article", "Article");
translationMap.put("Data Paper", "Article"); translationMap.put("Data Paper", "Article");
translationMap.put("Software Paper", "Article"); translationMap.put("Software Paper", "Article");
translationMap.put("Preprint", "Article"); translationMap.put("Preprint", "Article");
//thesis types // thesis types
translationMap.put("Thesis", "Thesis"); translationMap.put("Thesis", "Thesis");
translationMap.put("Master thesis", "Thesis"); translationMap.put("Master thesis", "Thesis");
translationMap.put("Bachelor thesis", "Thesis"); translationMap.put("Bachelor thesis", "Thesis");
translationMap.put("Doctoral thesis", "Thesis"); translationMap.put("Doctoral thesis", "Thesis");
} }
@Override
public double compare(final List<String> a, final List<String> b, final Config conf) {

    if (a == null || b == null) {
        return -1;
    }

    if (a.isEmpty() || b.isEmpty()) {
        return -1;
    }

    final Set<String> ca = a.stream().map(this::translate).collect(Collectors.toSet());
    final Set<String> cb = b.stream().map(this::translate).collect(Collectors.toSet());

    // if at least one is a jolly type, it must produce a match
    if (ca.contains("*") || cb.contains("*"))
        return 1.0;

    int incommon = Sets.intersection(ca, cb).size();

    // if at least one is in common, it must produce a match
    return incommon >= 1 ? 1 : 0;
}

public String translate(String term) {
    return translationMap.getOrDefault(term, term);
}

@Override
public double getWeight() {
    return super.weight;
}

@Override
protected double normalize(final double d) {
    return d;
}
} }
View File
@ -1,44 +1,46 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
@ComparatorClass("jaroWinkler") @ComparatorClass("jaroWinkler")
public class JaroWinkler extends AbstractStringComparator { public class JaroWinkler extends AbstractStringComparator {
public JaroWinkler(Map<String, String> params){ public JaroWinkler(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler()); super(params, new com.wcohen.ss.JaroWinkler());
} }
public JaroWinkler(double weight) { public JaroWinkler(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler()); super(weight, new com.wcohen.ss.JaroWinkler());
} }
protected JaroWinkler(double weight, AbstractStringDistance ssalgo) { protected JaroWinkler(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo); super(weight, ssalgo);
} }
@Override @Override
public double distance(String a, String b, final Config conf) { public double distance(String a, String b, final Config conf) {
String ca = cleanup(a); String ca = cleanup(a);
String cb = cleanup(b); String cb = cleanup(b);
return normalize(ssalgo.score(ca, cb)); return normalize(ssalgo.score(ca, cb));
} }
@Override @Override
public double getWeight() { public double getWeight() {
return super.weight; return super.weight;
} }
@Override @Override
protected double normalize(double d) { protected double normalize(double d) {
return d; return d;
} }
} }
View File
@ -1,70 +1,74 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance; package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("jaroWinklerNormalizedName") @ComparatorClass("jaroWinklerNormalizedName")
public class JaroWinklerNormalizedName extends AbstractStringComparator { public class JaroWinklerNormalizedName extends AbstractStringComparator {
private Map<String, String> params; private Map<String, String> params;
public JaroWinklerNormalizedName(Map<String, String> params){ public JaroWinklerNormalizedName(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler()); super(params, new com.wcohen.ss.JaroWinkler());
this.params = params; this.params = params;
} }
public JaroWinklerNormalizedName(double weight) { public JaroWinklerNormalizedName(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler()); super(weight, new com.wcohen.ss.JaroWinkler());
} }
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) { protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo); super(weight, ssalgo);
} }
@Override @Override
public double distance(String a, String b, final Config conf) { public double distance(String a, String b, final Config conf) {
String ca = cleanup(a); String ca = cleanup(a);
String cb = cleanup(b); String cb = cleanup(b);
ca = normalize(ca); ca = normalize(ca);
cb = normalize(cb); cb = normalize(cb);
ca = filterAllStopWords(ca); ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb); cb = filterAllStopWords(cb);
Set<String> keywords1 = getKeywords(
    ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> keywords2 = getKeywords(
    cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4"))); Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4"))); Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
ca = removeKeywords(ca, keywords1); ca = removeKeywords(ca, keywords1);
ca = removeKeywords(ca, cities1); ca = removeKeywords(ca, cities1);
cb = removeKeywords(cb, keywords2); cb = removeKeywords(cb, keywords2);
cb = removeKeywords(cb, cities2); cb = removeKeywords(cb, cities2);
ca = ca.replaceAll("[ ]{2,}", " "); ca = ca.replaceAll("[ ]{2,}", " ");
cb = cb.replaceAll("[ ]{2,}", " "); cb = cb.replaceAll("[ ]{2,}", " ");
if (ca.isEmpty() && cb.isEmpty()) if (ca.isEmpty() && cb.isEmpty())
return 1.0; return 1.0;
else else
return normalize(ssalgo.score(ca,cb)); return normalize(ssalgo.score(ca, cb));
} }
@Override @Override
public double getWeight() { public double getWeight() {
return super.weight; return super.weight;
} }
@Override @Override
protected double normalize(double d) { protected double normalize(double d) {
return d; return d;
} }
} }
View File
@ -1,17 +1,19 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
@ComparatorClass("jaroWinklerTitle") @ComparatorClass("jaroWinklerTitle")
public class JaroWinklerTitle extends AbstractStringComparator { public class JaroWinklerTitle extends AbstractStringComparator {
public JaroWinklerTitle(Map<String, String> params){ public JaroWinklerTitle(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler()); super(params, new com.wcohen.ss.JaroWinkler());
} }
@ -22,7 +24,7 @@ public class JaroWinklerTitle extends AbstractStringComparator {
protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) { protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo); super(weight, ssalgo);
} }
@Override @Override
public double distance(String a, String b, final Config conf) { public double distance(String a, String b, final Config conf) {
String ca = cleanup(a); String ca = cleanup(a);
@ -30,7 +32,7 @@ public class JaroWinklerTitle extends AbstractStringComparator {
boolean check = checkNumbers(ca, cb); boolean check = checkNumbers(ca, cb);
return check ? 0.5 : normalize(ssalgo.score(ca, cb)); return check ? 0.5 : normalize(ssalgo.score(ca, cb));
} }
@Override @Override
public double getWeight() { public double getWeight() {

View File
package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets; package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.util.MapDocumentUtil;
@ComparatorClass("jsonListMatch") @ComparatorClass("jsonListMatch")
public class JsonListMatch extends AbstractListComparator { public class JsonListMatch extends AbstractListComparator {
private static final Log log = LogFactory.getLog(JsonListMatch.class); private static final Log log = LogFactory.getLog(JsonListMatch.class);
private Map<String, String> params; private Map<String, String> params;
private String MODE; //"percentage" or "count" private String MODE; // "percentage" or "count"
public JsonListMatch(final Map<String, String> params) { public JsonListMatch(final Map<String, String> params) {
super(params); super(params);
this.params = params; this.params = params;
MODE = params.getOrDefault("mode", "percentage"); MODE = params.getOrDefault("mode", "percentage");
} }
@Override @Override
public double compare(final List<String> sa, final List<String> sb, final Config conf) { public double compare(final List<String> sa, final List<String> sb, final Config conf) {
if (sa.isEmpty() || sb.isEmpty()) { if (sa.isEmpty() || sb.isEmpty()) {
return -1; return -1;
} }
final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet()); final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet()); final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
int incommon = Sets.intersection(ca, cb).size(); int incommon = Sets.intersection(ca, cb).size();
int simDiff = Sets.symmetricDifference(ca, cb).size(); int simDiff = Sets.symmetricDifference(ca, cb).size();
if (incommon + simDiff == 0) { if (incommon + simDiff == 0) {
return 0.0; return 0.0;
} }
if (MODE.equals("percentage")) if (MODE.equals("percentage"))
return (double)incommon / (incommon + simDiff); return (double) incommon / (incommon + simDiff);
else else
return incommon; return incommon;
} }
	//converts every json into a comparable string based on parameters // converts every json into a comparable string based on parameters
private String toComparableString(String json){ private String toComparableString(String json) {
StringBuilder st = new StringBuilder(); //to build the string used for comparisons basing on the jpath into parameters StringBuilder st = new StringBuilder(); // to build the string used for comparisons basing on the jpath into
// parameters
//for each path in the param list // for each path in the param list
for (String key: params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) { for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
String path = params.get(key); String path = params.get(key);
String value = MapDocumentUtil.getJPathString(path, json); String value = MapDocumentUtil.getJPathString(path, json);
if (value == null || value.isEmpty()) if (value == null || value.isEmpty())
value = ""; value = "";
st.append(value); st.append(value);
st.append("::"); st.append("::");
} }
st.setLength(st.length()-2); st.setLength(st.length() - 2);
return st.toString(); return st.toString();
} }
} }
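
For reference, a minimal standalone sketch of the "percentage" mode computed above, using plain Guava outside the dedup workflow; the identifier strings are hypothetical stand-ins for the output of toComparableString():

import java.util.Set;

import com.google.common.collect.Sets;

public class JsonListMatchSketch {
	public static void main(String[] args) {
		// hypothetical comparable strings built from the configured jpaths
		Set<String> ca = Sets.newHashSet("doi::10.1000/1", "pmid::123");
		Set<String> cb = Sets.newHashSet("doi::10.1000/1", "handle::11/9");

		int incommon = Sets.intersection(ca, cb).size(); // 1
		int simDiff = Sets.symmetricDifference(ca, cb).size(); // 2

		// "percentage" mode: shared identifiers over all distinct identifiers
		System.out.println((double) incommon / (incommon + simDiff)); // 0.333...
	}
}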


@ -1,47 +1,50 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
import java.util.Set;
@ComparatorClass("keywordMatch") @ComparatorClass("keywordMatch")
public class KeywordMatch extends AbstractStringComparator { public class KeywordMatch extends AbstractStringComparator {
Map<String, String> params; Map<String, String> params;
public KeywordMatch(Map<String, String> params) { public KeywordMatch(Map<String, String> params) {
super(params); super(params);
this.params = params; this.params = params;
} }
@Override @Override
public double distance(final String a, final String b, final Config conf) { public double distance(final String a, final String b, final Config conf) {
String ca = cleanup(a); String ca = cleanup(a);
String cb = cleanup(b); String cb = cleanup(b);
ca = normalize(ca); ca = normalize(ca);
cb = normalize(cb); cb = normalize(cb);
ca = filterAllStopWords(ca); ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb); cb = filterAllStopWords(cb);
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); Set<String> keywords1 = getKeywords(
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> keywords2 = getKeywords(
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> codes1 = toCodes(keywords1, conf.translationMap()); Set<String> codes1 = toCodes(keywords1, conf.translationMap());
Set<String> codes2 = toCodes(keywords2, conf.translationMap()); Set<String> codes2 = toCodes(keywords2, conf.translationMap());
	//if no keywords are detected, the comparator gives 1.0 // if no keywords are detected, the comparator gives 1.0
if (codes1.isEmpty() && codes2.isEmpty()) if (codes1.isEmpty() && codes2.isEmpty())
return 1.0; return 1.0;
else { else {
if (codes1.isEmpty() ^ codes2.isEmpty()) if (codes1.isEmpty() ^ codes2.isEmpty())
return -1.0; //undefined if one of the two has no keywords return -1.0; // undefined if one of the two has no keywords
return commonElementsPercentage(codes1, codes2); return commonElementsPercentage(codes1, codes2);
} }
} }
} }


@ -1,16 +1,18 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("level2JaroWinkler") @ComparatorClass("level2JaroWinkler")
public class Level2JaroWinkler extends AbstractStringComparator { public class Level2JaroWinkler extends AbstractStringComparator {
public Level2JaroWinkler(Map<String, String> params){ public Level2JaroWinkler(Map<String, String> params) {
super(params, new com.wcohen.ss.Level2JaroWinkler()); super(params, new com.wcohen.ss.Level2JaroWinkler());
} }


@ -1,16 +1,18 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("level2JaroWinklerTitle") @ComparatorClass("level2JaroWinklerTitle")
public class Level2JaroWinklerTitle extends AbstractStringComparator { public class Level2JaroWinklerTitle extends AbstractStringComparator {
public Level2JaroWinklerTitle(Map<String,String> params){ public Level2JaroWinklerTitle(Map<String, String> params) {
super(params, new com.wcohen.ss.Level2JaroWinkler()); super(params, new com.wcohen.ss.Level2JaroWinkler());
} }
@ -29,7 +31,8 @@ public class Level2JaroWinklerTitle extends AbstractStringComparator {
final boolean check = checkNumbers(ca, cb); final boolean check = checkNumbers(ca, cb);
if (check) return 0.5; if (check)
return 0.5;
return ssalgo.score(ca, cb); return ssalgo.score(ca, cb);
} }


@ -1,15 +1,17 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("level2Levenstein") @ComparatorClass("level2Levenstein")
public class Level2Levenstein extends AbstractStringComparator { public class Level2Levenstein extends AbstractStringComparator {
public Level2Levenstein(Map<String,String> params){ public Level2Levenstein(Map<String, String> params) {
super(params, new com.wcohen.ss.Level2Levenstein()); super(params, new com.wcohen.ss.Level2Levenstein());
} }


@ -1,15 +1,17 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("levenstein") @ComparatorClass("levenstein")
public class Levenstein extends AbstractStringComparator { public class Levenstein extends AbstractStringComparator {
public Levenstein(Map<String,String> params){ public Levenstein(Map<String, String> params) {
super(params, new com.wcohen.ss.Levenstein()); super(params, new com.wcohen.ss.Levenstein());
} }


@ -1,20 +1,23 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance; import java.util.Map;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import java.util.Map; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("levensteinTitle") @ComparatorClass("levensteinTitle")
public class LevensteinTitle extends AbstractStringComparator { public class LevensteinTitle extends AbstractStringComparator {
private static final Log log = LogFactory.getLog(LevensteinTitle.class); private static final Log log = LogFactory.getLog(LevensteinTitle.class);
public LevensteinTitle(Map<String,String> params){ public LevensteinTitle(Map<String, String> params) {
super(params, new com.wcohen.ss.Levenstein()); super(params, new com.wcohen.ss.Levenstein());
} }
@ -33,7 +36,8 @@ public class LevensteinTitle extends AbstractStringComparator {
final boolean check = checkNumbers(ca, cb); final boolean check = checkNumbers(ca, cb);
if (check) return 0.5; if (check)
return 0.5;
return normalize(ssalgo.score(ca, cb), ca.length(), cb.length()); return normalize(ssalgo.score(ca, cb), ca.length(), cb.length());
} }


@ -1,19 +1,21 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/** /**
 * Compares two titles, ignoring version numbers. Suitable for Software entities. * Compares two titles, ignoring version numbers. Suitable for Software entities.
*/ */
@ComparatorClass("levensteinTitleIgnoreVersion") @ComparatorClass("levensteinTitleIgnoreVersion")
public class LevensteinTitleIgnoreVersion extends AbstractStringComparator { public class LevensteinTitleIgnoreVersion extends AbstractStringComparator {
public LevensteinTitleIgnoreVersion(Map<String,String> params){ public LevensteinTitleIgnoreVersion(Map<String, String> params) {
super(params, new com.wcohen.ss.Levenstein()); super(params, new com.wcohen.ss.Levenstein());
} }


@ -1,13 +1,14 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config; package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/** /**
* The Class Contains match * The Class Contains match
* *
@ -16,51 +17,50 @@ import java.util.stream.Collectors;
@ComparatorClass("listContainsMatch") @ComparatorClass("listContainsMatch")
public class ListContainsMatch extends AbstractListComparator { public class ListContainsMatch extends AbstractListComparator {
private Map<String, String> params; private Map<String, String> params;
private boolean CASE_SENSITIVE; private boolean CASE_SENSITIVE;
private String STRING; private String STRING;
private String AGGREGATOR; private String AGGREGATOR;
public ListContainsMatch(Map<String, String> params) { public ListContainsMatch(Map<String, String> params) {
super(params); super(params);
this.params = params; this.params = params;
//read parameters // read parameters
CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false")); CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
STRING = params.get("string"); STRING = params.get("string");
AGGREGATOR = params.get("bool"); AGGREGATOR = params.get("bool");
} }
@Override @Override
public double compare(List<String> sa, List<String> sb, Config conf) { public double compare(List<String> sa, List<String> sb, Config conf) {
if (sa.isEmpty() || sb.isEmpty()) { if (sa.isEmpty() || sb.isEmpty()) {
return -1; return -1;
} }
if (!CASE_SENSITIVE) { if (!CASE_SENSITIVE) {
sa = sa.stream().map(String::toLowerCase).collect(Collectors.toList()); sa = sa.stream().map(String::toLowerCase).collect(Collectors.toList());
sb = sb.stream().map(String::toLowerCase).collect(Collectors.toList()); sb = sb.stream().map(String::toLowerCase).collect(Collectors.toList());
STRING = STRING.toLowerCase(); STRING = STRING.toLowerCase();
} }
switch(AGGREGATOR) { switch (AGGREGATOR) {
case "AND": case "AND":
if(sa.contains(STRING) && sb.contains(STRING)) if (sa.contains(STRING) && sb.contains(STRING))
return 1.0; return 1.0;
break; break;
case "OR": case "OR":
if(sa.contains(STRING) || sb.contains(STRING)) if (sa.contains(STRING) || sb.contains(STRING))
return 1.0; return 1.0;
break; break;
case "XOR": case "XOR":
if(sa.contains(STRING) ^ sb.contains(STRING)) if (sa.contains(STRING) ^ sb.contains(STRING))
return 1.0; return 1.0;
break; break;
default: default:
return 0.0; return 0.0;
} }
return 0.0; return 0.0;
} }
} }
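
As a usage illustration of the AND/OR/XOR aggregation above, a hedged sketch with made-up field values; the parameter keys ("string", "bool", "caseSensitive") are the ones read in the constructor, and the Config argument is not used by this comparator, so null is passed here:

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import eu.dnetlib.pace.tree.ListContainsMatch;

public class ListContainsMatchSketch {
	public static void main(String[] args) {
		Map<String, String> params = new HashMap<>();
		params.put("string", "dataset"); // the token to look for
		params.put("bool", "OR"); // aggregator: AND, OR or XOR
		params.put("caseSensitive", "false");

		ListContainsMatch cmp = new ListContainsMatch(params);

		// 1.0: with OR it is enough that one of the two lists contains "dataset"
		double score = cmp.compare(
			Arrays.asList("Dataset", "Software"),
			Arrays.asList("Publication"),
			null);
		System.out.println(score);
	}
}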


@ -1,16 +1,18 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("mustBeDifferent") @ComparatorClass("mustBeDifferent")
public class MustBeDifferent extends AbstractStringComparator { public class MustBeDifferent extends AbstractStringComparator {
public MustBeDifferent(Map<String,String> params){ public MustBeDifferent(Map<String, String> params) {
super(params, new com.wcohen.ss.Levenstein()); super(params, new com.wcohen.ss.Levenstein());
} }


@ -1,11 +1,12 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.Comparator; import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/** /**
 * Not all fields of a document need to participate in the compare measure. We model those fields as having a * Not all fields of a document need to participate in the compare measure. We model those fields as having a
* NullDistanceAlgo. * NullDistanceAlgo.
@ -13,7 +14,7 @@ import java.util.Map;
@ComparatorClass("null") @ComparatorClass("null")
public class NullDistanceAlgo<T> implements Comparator<T> { public class NullDistanceAlgo<T> implements Comparator<T> {
public NullDistanceAlgo(Map<String, String> params){ public NullDistanceAlgo(Map<String, String> params) {
} }
@Override @Override


@ -1,34 +1,35 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("numbersComparator") @ComparatorClass("numbersComparator")
public class NumbersComparator extends AbstractStringComparator { public class NumbersComparator extends AbstractStringComparator {
Map<String, String> params; Map<String, String> params;
public NumbersComparator(Map<String, String> params) { public NumbersComparator(Map<String, String> params) {
super(params); super(params);
this.params = params; this.params = params;
} }
@Override @Override
public double distance(String a, String b, Config conf) { public double distance(String a, String b, Config conf) {
//extracts numbers from the field // extracts numbers from the field
String numbers1 = getNumbers(nfd(a)); String numbers1 = getNumbers(nfd(a));
String numbers2 = getNumbers(nfd(b)); String numbers2 = getNumbers(nfd(b));
if (numbers1.isEmpty() || numbers2.isEmpty()) if (numbers1.isEmpty() || numbers2.isEmpty())
return -1.0; return -1.0;
int n1 = Integer.parseInt(numbers1); int n1 = Integer.parseInt(numbers1);
int n2 = Integer.parseInt(numbers2); int n2 = Integer.parseInt(numbers2);
return Math.abs(n1 - n2); return Math.abs(n1 - n2);
} }
} }
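
A worked example of the distance above; getNumbers()/nfd() are approximated here with a plain digit-extraction regex, so this is only a sketch of the intended behaviour:

public class NumbersComparatorSketch {
	// rough stand-in for getNumbers(nfd(s)): keep digits only
	static String digits(String s) {
		return s.replaceAll("\\D", "");
	}

	public static void main(String[] args) {
		String a = "Corpus of Italian newspapers, part 2";
		String b = "Corpus of Italian newspapers, part 3";

		int n1 = Integer.parseInt(digits(a)); // 2
		int n2 = Integer.parseInt(digits(b)); // 3

		System.out.println(Math.abs(n1 - n2)); // 1 -> close but not identical version numbers
	}
}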


@ -1,36 +1,36 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("numbersMatch") @ComparatorClass("numbersMatch")
public class NumbersMatch extends AbstractStringComparator { public class NumbersMatch extends AbstractStringComparator {
public NumbersMatch(Map<String, String> params) {
super(params);
}
public NumbersMatch(Map<String, String> params) { @Override
super(params); public double distance(String a, String b, Config conf) {
}
@Override // extracts numbers from the field
public double distance(String a, String b, Config conf) { String numbers1 = getNumbers(nfd(a));
String numbers2 = getNumbers(nfd(b));
//extracts numbers from the field if (numbers1.isEmpty() && numbers2.isEmpty())
String numbers1 = getNumbers(nfd(a)); return 1.0;
String numbers2 = getNumbers(nfd(b));
if (numbers1.isEmpty() && numbers2.isEmpty()) if (numbers1.isEmpty() || numbers2.isEmpty())
return 1.0; return -1.0;
if (numbers1.isEmpty() || numbers2.isEmpty()) if (numbers1.equals(numbers2))
return -1.0; return 1.0;
if (numbers1.equals(numbers2)) return 0.0;
return 1.0; }
}
return 0.0;
}
}


@ -1,36 +1,36 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("romansMatch") @ComparatorClass("romansMatch")
public class RomansMatch extends AbstractStringComparator { public class RomansMatch extends AbstractStringComparator {
public RomansMatch(Map<String, String> params) {
super(params);
}
public RomansMatch(Map<String, String> params) { @Override
super(params); public double distance(String a, String b, Config conf) {
}
@Override // extracts romans from the field
public double distance(String a, String b, Config conf) { String romans1 = getRomans(nfd(a));
String romans2 = getRomans(nfd(b));
//extracts romans from the field if (romans1.isEmpty() && romans2.isEmpty())
String romans1 = getRomans(nfd(a)); return 1.0;
String romans2 = getRomans(nfd(b));
if (romans1.isEmpty() && romans2.isEmpty()) if (romans1.isEmpty() || romans2.isEmpty())
return 1.0; return -1.0;
if (romans1.isEmpty() || romans2.isEmpty()) if (romans1.equals(romans2))
return -1.0; return 1.0;
if (romans1.equals(romans2)) return 0.0;
return 1.0; }
return 0.0;
}
} }
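
The tri-state outcome above (1.0 agree, 0.0 disagree, -1.0 undefined) mirrors NumbersMatch; a small sketch of the decision logic on already-extracted roman numerals (extraction itself is left to getRomans()):

public class RomansMatchSketch {
	// decision logic mirrored from RomansMatch.distance(), on pre-extracted values
	static double decide(String romans1, String romans2) {
		if (romans1.isEmpty() && romans2.isEmpty())
			return 1.0; // no roman numerals on either side: nothing to contradict
		if (romans1.isEmpty() || romans2.isEmpty())
			return -1.0; // undefined: only one side carries a roman numeral
		return romans1.equals(romans2) ? 1.0 : 0.0;
	}

	public static void main(String[] args) {
		System.out.println(decide("", "")); // 1.0
		System.out.println(decide("II", "")); // -1.0
		System.out.println(decide("II", "II")); // 1.0
		System.out.println(decide("II", "III")); // 0.0
	}
}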


@ -1,13 +1,15 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Lists; package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/** /**
* Returns true if the number of values in the fields is the same. * Returns true if the number of values in the fields is the same.
* *
@ -16,23 +18,23 @@ import java.util.Map;
@ComparatorClass("sizeMatch") @ComparatorClass("sizeMatch")
public class SizeMatch extends AbstractListComparator { public class SizeMatch extends AbstractListComparator {
/** /**
* Instantiates a new size match. * Instantiates a new size match.
* *
* @param params * @param params
* the parameters * the parameters
*/ */
public SizeMatch(final Map<String, String> params) { public SizeMatch(final Map<String, String> params) {
super(params); super(params);
} }
@Override @Override
public double compare(final List<String> a, final List<String> b, final Config conf) { public double compare(final List<String> a, final List<String> b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) if (a.isEmpty() || b.isEmpty())
return -1.0; return -1.0;
return a.size() == b.size() ? 1.0 : 0.0; return a.size() == b.size() ? 1.0 : 0.0;
} }
} }


@ -1,18 +1,20 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractSortedComparator; import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/** /**
* The Class SortedJaroWinkler. * The Class SortedJaroWinkler.
*/ */
@ComparatorClass("sortedJaroWinkler") @ComparatorClass("sortedJaroWinkler")
public class SortedJaroWinkler extends AbstractSortedComparator { public class SortedJaroWinkler extends AbstractSortedComparator {
public SortedJaroWinkler(Map<String,String> params){ public SortedJaroWinkler(Map<String, String> params) {
super(params, new com.wcohen.ss.Levenstein()); super(params, new com.wcohen.ss.Levenstein());
} }
@ -40,7 +42,6 @@ public class SortedJaroWinkler extends AbstractSortedComparator {
/* /*
* (non-Javadoc) * (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
*/ */
@Override @Override
@ -50,7 +51,6 @@ public class SortedJaroWinkler extends AbstractSortedComparator {
/* /*
* (non-Javadoc) * (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
*/ */
@Override @Override


@ -1,11 +1,13 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractSortedComparator; import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/** /**
 * The Class SortedLevel2JaroWinkler. * The Class SortedLevel2JaroWinkler.
*/ */
@ -22,7 +24,7 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator {
super(weight, new com.wcohen.ss.Level2JaroWinkler()); super(weight, new com.wcohen.ss.Level2JaroWinkler());
} }
public SortedLevel2JaroWinkler(final Map<String, String> params){ public SortedLevel2JaroWinkler(final Map<String, String> params) {
super(params, new com.wcohen.ss.Level2JaroWinkler()); super(params, new com.wcohen.ss.Level2JaroWinkler());
} }
@ -40,7 +42,6 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator {
/* /*
* (non-Javadoc) * (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
*/ */
@Override @Override
@ -50,7 +51,6 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator {
/* /*
* (non-Javadoc) * (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
*/ */
@Override @Override


@ -1,12 +1,13 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/** /**
* The Class Contains match * The Class Contains match
* *
@ -15,50 +16,50 @@ import java.util.Map;
@ComparatorClass("stringContainsMatch") @ComparatorClass("stringContainsMatch")
public class StringContainsMatch extends AbstractStringComparator { public class StringContainsMatch extends AbstractStringComparator {
private Map<String, String> params; private Map<String, String> params;
private boolean CASE_SENSITIVE; private boolean CASE_SENSITIVE;
private String STRING; private String STRING;
private String AGGREGATOR; private String AGGREGATOR;
public StringContainsMatch(Map<String, String> params) { public StringContainsMatch(Map<String, String> params) {
super(params); super(params);
this.params = params; this.params = params;
//read parameters // read parameters
CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false")); CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
STRING = params.get("string"); STRING = params.get("string");
AGGREGATOR = params.get("aggregator"); AGGREGATOR = params.get("aggregator");
} }
@Override @Override
public double distance(final String a, final String b, final Config conf) { public double distance(final String a, final String b, final Config conf) {
String ca = a; String ca = a;
String cb = b; String cb = b;
if (!CASE_SENSITIVE) { if (!CASE_SENSITIVE) {
ca = a.toLowerCase(); ca = a.toLowerCase();
cb = b.toLowerCase(); cb = b.toLowerCase();
STRING = STRING.toLowerCase(); STRING = STRING.toLowerCase();
} }
switch(AGGREGATOR) { switch (AGGREGATOR) {
case "AND": case "AND":
if(ca.contains(STRING) && cb.contains(STRING)) if (ca.contains(STRING) && cb.contains(STRING))
return 1.0; return 1.0;
break; break;
case "OR": case "OR":
if(ca.contains(STRING) || cb.contains(STRING)) if (ca.contains(STRING) || cb.contains(STRING))
return 1.0; return 1.0;
break; break;
case "XOR": case "XOR":
if(ca.contains(STRING) ^ cb.contains(STRING)) if (ca.contains(STRING) ^ cb.contains(STRING))
return 1.0; return 1.0;
break; break;
default: default:
return 0.0; return 0.0;
} }
return 0.0; return 0.0;
} }
} }


@ -1,53 +1,56 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets; package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("stringListMatch") @ComparatorClass("stringListMatch")
public class StringListMatch extends AbstractListComparator { public class StringListMatch extends AbstractListComparator {
private static final Log log = LogFactory.getLog(StringListMatch.class); private static final Log log = LogFactory.getLog(StringListMatch.class);
private Map<String, String> params; private Map<String, String> params;
final private String TYPE; //percentage or count final private String TYPE; // percentage or count
public StringListMatch(final Map<String, String> params) { public StringListMatch(final Map<String, String> params) {
super(params); super(params);
this.params = params; this.params = params;
TYPE = params.getOrDefault("type", "percentage"); TYPE = params.getOrDefault("type", "percentage");
} }
@Override @Override
public double compare(final List<String> a, final List<String> b, final Config conf) { public double compare(final List<String> a, final List<String> b, final Config conf) {
final Set<String> pa = new HashSet<>(a); final Set<String> pa = new HashSet<>(a);
final Set<String> pb = new HashSet<>(b); final Set<String> pb = new HashSet<>(b);
if (pa.isEmpty() || pb.isEmpty()) { if (pa.isEmpty() || pb.isEmpty()) {
return -1; //return undefined if one of the two lists is empty return -1; // return undefined if one of the two lists is empty
} }
int incommon = Sets.intersection(pa, pb).size(); int incommon = Sets.intersection(pa, pb).size();
int simDiff = Sets.symmetricDifference(pa, pb).size(); int simDiff = Sets.symmetricDifference(pa, pb).size();
if (incommon + simDiff == 0) { if (incommon + simDiff == 0) {
return 0.0; return 0.0;
} }
if(TYPE.equals("percentage")) if (TYPE.equals("percentage"))
return (double)incommon / (incommon + simDiff); return (double) incommon / (incommon + simDiff);
else else
return incommon; return incommon;
} }
} }


@ -1,12 +1,15 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang3.StringUtils;
import java.util.Map;
/** /**
* The Class SubStringLevenstein. * The Class SubStringLevenstein.
@ -14,76 +17,74 @@ import java.util.Map;
@ComparatorClass("subStringLevenstein") @ComparatorClass("subStringLevenstein")
public class SubStringLevenstein extends AbstractStringComparator { public class SubStringLevenstein extends AbstractStringComparator {
/** /**
* The limit. * The limit.
*/ */
protected int limit; protected int limit;
/** /**
* Instantiates a new sub string levenstein. * Instantiates a new sub string levenstein.
* *
* @param w the w * @param w the w
*/ */
public SubStringLevenstein(final double w) { public SubStringLevenstein(final double w) {
super(w, new com.wcohen.ss.Levenstein()); super(w, new com.wcohen.ss.Levenstein());
} }
public SubStringLevenstein(Map<String, String> params) { public SubStringLevenstein(Map<String, String> params) {
super(params, new com.wcohen.ss.Levenstein()); super(params, new com.wcohen.ss.Levenstein());
this.limit = Integer.parseInt(params.getOrDefault("limit", "1")); this.limit = Integer.parseInt(params.getOrDefault("limit", "1"));
} }
/** /**
* Instantiates a new sub string levenstein. * Instantiates a new sub string levenstein.
* *
* @param w the w * @param w the w
* @param limit the limit * @param limit the limit
*/ */
public SubStringLevenstein(final double w, final int limit) { public SubStringLevenstein(final double w, final int limit) {
super(w, new com.wcohen.ss.Levenstein()); super(w, new com.wcohen.ss.Levenstein());
this.limit = limit; this.limit = limit;
} }
/** /**
* Instantiates a new sub string levenstein. * Instantiates a new sub string levenstein.
* *
* @param w the w * @param w the w
* @param limit the limit * @param limit the limit
* @param ssalgo the ssalgo * @param ssalgo the ssalgo
*/ */
protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) { protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
super(w, ssalgo); super(w, ssalgo);
this.limit = limit; this.limit = limit;
} }
/* /*
* (non-Javadoc) * (non-Javadoc)
* * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field,
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) * eu.dnetlib.pace.model.Field)
*/ */
@Override @Override
public double distance(final String a, final String b, final Config conf) { public double distance(final String a, final String b, final Config conf) {
return distance(StringUtils.left(a, limit), StringUtils.left(b, limit), conf); return distance(StringUtils.left(a, limit), StringUtils.left(b, limit), conf);
} }
/* /*
* (non-Javadoc) * (non-Javadoc)
* * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() */
*/ @Override
@Override public double getWeight() {
public double getWeight() { return super.weight;
return super.weight; }
}
/* /*
* (non-Javadoc) * (non-Javadoc)
* * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) */
*/ @Override
@Override protected double normalize(final double d) {
protected double normalize(final double d) { return 1 / Math.pow(Math.abs(d) + 1, 0.1);
return 1 / Math.pow(Math.abs(d) + 1, 0.1); }
}
} }
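
To make the normalization above concrete, a small arithmetic check of 1 / (|d| + 1)^0.1 for a few raw scores (no secondstring dependency, values rounded):

public class NormalizeSketch {
	// same formula as SubStringLevenstein.normalize(double)
	static double normalize(double d) {
		return 1 / Math.pow(Math.abs(d) + 1, 0.1);
	}

	public static void main(String[] args) {
		System.out.println(normalize(0)); // 1.0 -> identical prefixes
		System.out.println(normalize(4)); // ~0.85
		System.out.println(normalize(9)); // ~0.79 -> larger raw distance, lower score
	}
}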


@ -1,11 +1,12 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/** /**
 * Returns true if the titles in the given documents contain the same numbers, false otherwise. * Returns true if the titles in the given documents contain the same numbers, false otherwise.
* *
@ -15,24 +16,24 @@ import java.util.Map;
@ComparatorClass("titleVersionMatch") @ComparatorClass("titleVersionMatch")
public class TitleVersionMatch extends AbstractStringComparator { public class TitleVersionMatch extends AbstractStringComparator {
public TitleVersionMatch(final Map<String, String> params) { public TitleVersionMatch(final Map<String, String> params) {
super(params); super(params);
} }
@Override @Override
public double compare(final String valueA, final String valueB, final Config conf) { public double compare(final String valueA, final String valueB, final Config conf) {
if (valueA.isEmpty() || valueB.isEmpty()) if (valueA.isEmpty() || valueB.isEmpty())
return -1; return -1;
return notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 1 : 0; return notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 1 : 0;
} }
@Override @Override
public String toString() { public String toString() {
return getClass().getSimpleName() + ":" + super.toString(); return getClass().getSimpleName() + ":" + super.toString();
} }
protected String toString(final Object object) { protected String toString(final Object object) {
return toFirstString(object); return toFirstString(object);
} }
} }


@ -1,61 +1,63 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config; package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang3.StringUtils;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.util.Map; import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("urlMatcher") @ComparatorClass("urlMatcher")
public class UrlMatcher extends Levenstein { public class UrlMatcher extends Levenstein {
private Map<String, String> params; private Map<String, String> params;
public UrlMatcher(Map<String, String> params){ public UrlMatcher(Map<String, String> params) {
super(params); super(params);
this.params = params; this.params = params;
} }
public UrlMatcher(double weight, Map<String, String> params) { public UrlMatcher(double weight, Map<String, String> params) {
super(weight); super(weight);
this.params = params; this.params = params;
} }
public void setParams(Map<String, String> params) { public void setParams(Map<String, String> params) {
this.params = params; this.params = params;
} }
@Override @Override
public double distance(String a, String b, final Config conf) { public double distance(String a, String b, final Config conf) {
final URL urlA = asUrl(a); final URL urlA = asUrl(a);
final URL urlB = asUrl(b); final URL urlB = asUrl(b);
if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) { if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
return 0.0; return 0.0;
} }
Double hostW = Double.parseDouble(params.getOrDefault("host", "0.5")); Double hostW = Double.parseDouble(params.getOrDefault("host", "0.5"));
Double pathW = Double.parseDouble(params.getOrDefault("path", "0.5")); Double pathW = Double.parseDouble(params.getOrDefault("path", "0.5"));
if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) { if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) {
return hostW * 0.5; return hostW * 0.5;
} }
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf); return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf);
} }
private URL asUrl(final String value) { private URL asUrl(final String value) {
try { try {
return new URL(value); return new URL(value);
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
// should not happen as checked by pace typing // should not happen as checked by pace typing
throw new IllegalStateException("invalid URL: " + value); throw new IllegalStateException("invalid URL: " + value);
} }
} }
protected String toString(final Object object) { protected String toString(final Object object) {
return toFirstString(object); return toFirstString(object);
} }
} }
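
A hedged usage sketch of the host/path weighting above; the "host" and "path" keys are the ones read in distance(), the URLs are made up, and Config is passed as null because it is not consulted in the branches exercised here:

import java.util.HashMap;
import java.util.Map;

import eu.dnetlib.pace.tree.UrlMatcher;

public class UrlMatcherSketch {
	public static void main(String[] args) {
		Map<String, String> params = new HashMap<>();
		params.put("host", "0.5"); // weight of the host component
		params.put("path", "0.5"); // weight of the path component

		UrlMatcher matcher = new UrlMatcher(params);

		// different hosts: 0.0 regardless of the path
		System.out.println(matcher.distance("http://example.org/record/1", "http://other.org/record/1", null));

		// same host, blank path on one side: host weight * 0.5
		System.out.println(matcher.distance("http://example.org", "http://example.org/record/1", null));
	}
}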


@ -1,11 +1,13 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang3.StringUtils;
import java.util.Map;
/** /**
 * Returns true if the years of the date fields in the given documents are the same, false when either is invalid or missing. * Returns true if the years of the date fields in the given documents are the same, false when either is invalid or missing.
@ -15,36 +17,36 @@ import java.util.Map;
@ComparatorClass("yearMatch") @ComparatorClass("yearMatch")
public class YearMatch extends AbstractStringComparator { public class YearMatch extends AbstractStringComparator {
private int limit = 4; private int limit = 4;
public YearMatch(final Map<String, String> params) { public YearMatch(final Map<String, String> params) {
super(params); super(params);
} }
@Override @Override
public double compare(final String a, final String b, final Config conf) { public double compare(final String a, final String b, final Config conf) {
final String valueA = getNumbers(getFirstValue(a)); final String valueA = getNumbers(getFirstValue(a));
final String valueB = getNumbers(getFirstValue(b)); final String valueB = getNumbers(getFirstValue(b));
if (valueA.isEmpty() || valueB.isEmpty()) if (valueA.isEmpty() || valueB.isEmpty())
return -1; return -1;
final boolean lengthMatch = checkLength(valueA) && checkLength(valueB); final boolean lengthMatch = checkLength(valueA) && checkLength(valueB);
final boolean onemissing = valueA.isEmpty() || valueB.isEmpty(); final boolean onemissing = valueA.isEmpty() || valueB.isEmpty();
return lengthMatch && valueA.equals(valueB) || onemissing ? 1 : 0; return lengthMatch && valueA.equals(valueB) || onemissing ? 1 : 0;
} }
protected boolean checkLength(final String s) { protected boolean checkLength(final String s) {
return s.length() == limit; return s.length() == limit;
} }
protected String getFirstValue(final String value) { protected String getFirstValue(final String value) {
return (value != null) && !value.isEmpty() ? StringUtils.left(value, limit) : ""; return (value != null) && !value.isEmpty() ? StringUtils.left(value, limit) : "";
} }
@Override @Override
public String toString() { public String toString() {
return getClass().getSimpleName() + ":" + super.toString(); return getClass().getSimpleName() + ":" + super.toString();
} }
} }
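
A simplified mirror of the comparison above on plain date strings; digit extraction (getNumbers) is approximated with a regex and the redundant onemissing branch is dropped, so treat this as a sketch of the intended semantics only:

import org.apache.commons.lang3.StringUtils;

public class YearMatchSketch {
	static double compareYears(String a, String b) {
		String valueA = StringUtils.left(a, 4).replaceAll("\\D", "");
		String valueB = StringUtils.left(b, 4).replaceAll("\\D", "");
		if (valueA.isEmpty() || valueB.isEmpty())
			return -1; // missing year: undefined
		boolean lengthMatch = valueA.length() == 4 && valueB.length() == 4;
		return lengthMatch && valueA.equals(valueB) ? 1 : 0;
	}

	public static void main(String[] args) {
		System.out.println(compareYears("2019-05-10", "2019-12-01")); // 1.0
		System.out.println(compareYears("2019-05-10", "2020-05-10")); // 0.0
		System.out.println(compareYears("", "2019-05-10")); // -1.0
	}
}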


@ -1,130 +1,131 @@
package eu.dnetlib.pace.tree.support;
import com.google.common.base.Joiner; package eu.dnetlib.pace.tree.support;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
public abstract class AbstractComparator<T> extends AbstractPaceFunctions implements Comparator<T> { public abstract class AbstractComparator<T> extends AbstractPaceFunctions implements Comparator<T> {
/** The ssalgo. */ /** The ssalgo. */
protected AbstractStringDistance ssalgo; protected AbstractStringDistance ssalgo;
/** The weight. */ /** The weight. */
protected double weight = 0.0; protected double weight = 0.0;
private Map<String, String> params; private Map<String, String> params;
protected AbstractComparator(Map<String, String> params) { protected AbstractComparator(Map<String, String> params) {
this.params = params; this.params = params;
} }
protected AbstractComparator(Map<String, String> params, final AbstractStringDistance ssalgo){ protected AbstractComparator(Map<String, String> params, final AbstractStringDistance ssalgo) {
this.params = params; this.params = params;
this.weight = 1.0; this.weight = 1.0;
this.ssalgo = ssalgo; this.ssalgo = ssalgo;
} }
/** /**
* Instantiates a new second string compare algo. * Instantiates a new second string compare algo.
* *
* @param weight * @param weight
* the weight * the weight
* @param ssalgo * @param ssalgo
* the ssalgo * the ssalgo
*/ */
protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) { protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) {
this.ssalgo = ssalgo; this.ssalgo = ssalgo;
this.weight = weight; this.weight = weight;
} }
protected AbstractComparator(final AbstractStringDistance ssalgo){ protected AbstractComparator(final AbstractStringDistance ssalgo) {
this.ssalgo = ssalgo; this.ssalgo = ssalgo;
} }
/** /**
* Normalize. * Normalize.
* *
* @param d * @param d
* the d * the d
* @return the double * @return the double
*/ */
protected double normalize(double d) { protected double normalize(double d) {
return d; return d;
} }
/** /**
* Distance. * Distance.
* *
* @param a * @param a
* the a * the a
* @param b * @param b
* the b * the b
* @return the double * @return the double
*/ */
protected double distance(final String a, final String b, final Config conf) { protected double distance(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) { if (a.isEmpty() || b.isEmpty()) {
return -1; //return -1 if a field is missing return -1; // return -1 if a field is missing
} }
double score = ssalgo.score(a, b); double score = ssalgo.score(a, b);
return normalize(score); return normalize(score);
} }
protected double compare(final String a, final String b, final Config conf) { protected double compare(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) if (a.isEmpty() || b.isEmpty())
return -1; return -1;
return distance(a, b, conf); return distance(a, b, conf);
} }
/** /**
* Convert the given argument to a List of Strings * Convert the given argument to a List of Strings
* *
* @param object * @param object
* function argument * function argument
* @return the list * @return the list
*/ */
protected List<String> toList(final Object object) { protected List<String> toList(final Object object) {
if (object instanceof List) { if (object instanceof List) {
return (List<String>)object; return (List<String>) object;
} }
return Lists.newArrayList(object.toString()); return Lists.newArrayList(object.toString());
} }
/** /**
* Convert the given argument to a String * Convert the given argument to a String
* *
* @param object * @param object
* function argument * function argument
* @return the list * @return the list
*/ */
protected String toString(final Object object) { protected String toString(final Object object) {
if (object instanceof List) { if (object instanceof List) {
List<String> l = (List<String>) object; List<String> l = (List<String>) object;
return Joiner.on(" ").join(l); return Joiner.on(" ").join(l);
} }
return object.toString(); return object.toString();
} }
protected String toFirstString(final Object object) { protected String toFirstString(final Object object) {
if (object instanceof List) { if (object instanceof List) {
List<String> l = (List<String>) object; List<String> l = (List<String>) object;
return l.isEmpty() ? "" : l.get(0); return l.isEmpty() ? "" : l.get(0);
} }
return object.toString(); return object.toString();
} }
public double getWeight() {
public double getWeight(){ return this.weight;
return this.weight; }
}
} }


@ -1,39 +1,41 @@
package eu.dnetlib.pace.tree.support;
import com.google.common.collect.Lists; package eu.dnetlib.pace.tree.support;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.Type;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
abstract public class AbstractListComparator extends AbstractComparator<List<String>>{ import com.google.common.collect.Lists;
protected AbstractListComparator(Map<String, String> params) { import com.wcohen.ss.AbstractStringDistance;
super(params);
}
protected AbstractListComparator(Map<String, String> params, AbstractStringDistance ssalgo) { import eu.dnetlib.pace.config.Config;
super(params, ssalgo); import eu.dnetlib.pace.config.Type;
}
protected AbstractListComparator(double weight, AbstractStringDistance ssalgo) { abstract public class AbstractListComparator extends AbstractComparator<List<String>> {
super(weight, ssalgo); protected AbstractListComparator(Map<String, String> params) {
} super(params);
}
protected AbstractListComparator(AbstractStringDistance ssalgo) { protected AbstractListComparator(Map<String, String> params, AbstractStringDistance ssalgo) {
super(ssalgo); super(params, ssalgo);
} }
@Override protected AbstractListComparator(double weight, AbstractStringDistance ssalgo) {
public double compare(Object a, Object b, Config conf) { super(weight, ssalgo);
return compare(toList(a), toList(b), conf); }
}
public double compare(final List<String> a, final List<String> b, final Config conf) { protected AbstractListComparator(AbstractStringDistance ssalgo) {
if (a.isEmpty() || b.isEmpty()) super(ssalgo);
return -1; }
return distance(concat(a), concat(b), conf); @Override
} public double compare(Object a, Object b, Config conf) {
return compare(toList(a), toList(b), conf);
}
public double compare(final List<String> a, final List<String> b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1;
return distance(concat(a), concat(b), conf);
}
} }


@ -1,40 +1,41 @@
package eu.dnetlib.pace.tree.support;
import com.google.common.collect.Lists; package eu.dnetlib.pace.tree.support;
import com.wcohen.ss.AbstractStringDistance;
import java.util.AbstractList; import java.util.AbstractList;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
public abstract class AbstractSortedComparator extends AbstractListComparator { public abstract class AbstractSortedComparator extends AbstractListComparator {
/** /**
* Instantiates a new sorted second string compare algo. * Instantiates a new sorted second string compare algo.
* *
* @param weight * @param weight
* the weight * the weight
* @param ssalgo * @param ssalgo
* the ssalgo * the ssalgo
*/ */
protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) { protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo); super(weight, ssalgo);
} }
protected AbstractSortedComparator(final Map<String, String> params, final AbstractStringDistance ssalgo){ protected AbstractSortedComparator(final Map<String, String> params, final AbstractStringDistance ssalgo) {
super(Double.parseDouble(params.get("weight")), ssalgo); super(Double.parseDouble(params.get("weight")), ssalgo);
} }
@Override @Override
protected List<String> toList(final Object object) { protected List<String> toList(final Object object) {
if (object instanceof List) { if (object instanceof List) {
List<String> fl = (List<String>) object; List<String> fl = (List<String>) object;
List<String> values = Lists.newArrayList(fl); List<String> values = Lists.newArrayList(fl);
Collections.sort(values); Collections.sort(values);
return values; return values;
} }
return Lists.newArrayList(object.toString()); return Lists.newArrayList(object.toString());
} }
} }


@ -1,44 +1,46 @@
package eu.dnetlib.pace.tree.support;
import com.wcohen.ss.AbstractStringDistance; package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.Config;
import java.util.Map; import java.util.Map;
public abstract class AbstractStringComparator extends AbstractComparator<String>{ import com.wcohen.ss.AbstractStringDistance;
protected AbstractStringComparator(Map<String, String> params) {
super(params);
}
protected AbstractStringComparator(Map<String, String> params, AbstractStringDistance ssalgo) { import eu.dnetlib.pace.config.Config;
super(params, ssalgo);
}
protected AbstractStringComparator(double weight, AbstractStringDistance ssalgo) { public abstract class AbstractStringComparator extends AbstractComparator<String> {
super(weight, ssalgo); protected AbstractStringComparator(Map<String, String> params) {
} super(params);
}
protected AbstractStringComparator(AbstractStringDistance ssalgo) { protected AbstractStringComparator(Map<String, String> params, AbstractStringDistance ssalgo) {
super(ssalgo); super(params, ssalgo);
} }
public double distance(final String a, final String b, final Config conf) { protected AbstractStringComparator(double weight, AbstractStringDistance ssalgo) {
if (a.isEmpty() || b.isEmpty()) { super(weight, ssalgo);
return -1; //return -1 if a field is missing }
}
double score = ssalgo.score(a, b);
return normalize(score);
}
@Override protected AbstractStringComparator(AbstractStringDistance ssalgo) {
public double compare(Object a, Object b, Config conf) { super(ssalgo);
return compare(toString(a), toString(b), conf); }
}
public double compare(final String a, final String b, final Config conf) { public double distance(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) if (a.isEmpty() || b.isEmpty()) {
return -1; return -1; // return -1 if a field is missing
return distance(a, b, conf); }
} double score = ssalgo.score(a, b);
return normalize(score);
}
@Override
public double compare(Object a, Object b, Config conf) {
return compare(toString(a), toString(b), conf);
}
public double compare(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1;
return distance(a, b, conf);
}
} }


@ -1,24 +1,21 @@
package eu.dnetlib.pace.tree.support; package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException; import eu.dnetlib.pace.util.PaceException;
public enum AggType { public enum AggType {
W_MEAN, //weighted mean W_MEAN, // weighted mean
AVG, //average AVG, // average
SUM, SUM, MAX, MIN, AND, // used for necessary conditions
MAX, OR; // used for sufficient conditions
MIN,
AND, //used for necessary conditions
OR; //used for sufficient conditions
public static AggType getEnum(String value) { public static AggType getEnum(String value) {
try { try {
return AggType.valueOf(value); return AggType.valueOf(value);
} } catch (IllegalArgumentException e) {
catch (IllegalArgumentException e) { throw new PaceException("Undefined aggregation type", e);
throw new PaceException("Undefined aggregation type", e); }
} }
}
} }


@ -1,12 +1,12 @@
package eu.dnetlib.pace.tree.support; package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
public interface Comparator<T> { public interface Comparator<T> {
/* /*
* return : -1 -> can't decide (i.e. missing field) * return : -1 -> can't decide (i.e. missing field) >0 -> similarity degree (depends on the algorithm)
* >0 -> similarity degree (depends on the algorithm) */
* */ public double compare(Object a, Object b, Config conf);
public double compare(Object a, Object b, Config conf);
} }


@ -1,3 +1,4 @@
package eu.dnetlib.pace.tree.support; package eu.dnetlib.pace.tree.support;
import java.lang.annotation.ElementType; import java.lang.annotation.ElementType;
@ -9,5 +10,5 @@ import java.lang.annotation.Target;
@Target(ElementType.TYPE) @Target(ElementType.TYPE)
public @interface ComparatorClass { public @interface ComparatorClass {
public String value(); public String value();
} }


@ -1,82 +1,84 @@
package eu.dnetlib.pace.tree.support; package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.util.PaceException;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.util.Map; import java.util.Map;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.util.PaceException;
/** /**
* The class that defines the configuration of each field in the decision tree. * The class that defines the configuration of each field in the decision tree.
* */ * */
public class FieldConf implements Serializable { public class FieldConf implements Serializable {
private String field; //name of the field on which apply the comparator private String field; // name of the field on which apply the comparator
private String comparator; //comparator name private String comparator; // comparator name
private double weight = 1.0; //weight for the field (to be used in the aggregation) private double weight = 1.0; // weight for the field (to be used in the aggregation)
private Map<String,String> params; //parameters private Map<String, String> params; // parameters
private boolean countIfUndefined; private boolean countIfUndefined;
public boolean isCountIfUndefined() { public boolean isCountIfUndefined() {
return countIfUndefined; return countIfUndefined;
} }
public void setCountIfUndefined(boolean countIfUndefined) { public void setCountIfUndefined(boolean countIfUndefined) {
this.countIfUndefined = countIfUndefined; this.countIfUndefined = countIfUndefined;
} }
public FieldConf() { public FieldConf() {
} }
public FieldConf(String field, String comparator, double weight, Map<String, String> params, boolean countIfUndefined) { public FieldConf(String field, String comparator, double weight, Map<String, String> params,
this.field = field; boolean countIfUndefined) {
this.comparator = comparator; this.field = field;
this.weight = weight; this.comparator = comparator;
this.params = params; this.weight = weight;
this.countIfUndefined = countIfUndefined; this.params = params;
} this.countIfUndefined = countIfUndefined;
}
public String getField() { public String getField() {
return field; return field;
} }
public void setField(String field) { public void setField(String field) {
this.field = field; this.field = field;
} }
public String getComparator() { public String getComparator() {
return comparator; return comparator;
} }
public void setComparator(String comparator) { public void setComparator(String comparator) {
this.comparator = comparator; this.comparator = comparator;
} }
public double getWeight() { public double getWeight() {
return weight; return weight;
} }
public void setWeight(double weight) { public void setWeight(double weight) {
this.weight = weight; this.weight = weight;
} }
public Map<String, String> getParams() { public Map<String, String> getParams() {
return params; return params;
} }
public void setParams(Map<String, String> params) { public void setParams(Map<String, String> params) {
this.params = params; this.params = params;
} }
@Override @Override
public String toString() { public String toString() {
try { try {
return new ObjectMapper().writeValueAsString(this); return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) { } catch (IOException e) {
throw new PaceException("Impossible to convert to JSON: ", e); throw new PaceException("Impossible to convert to JSON: ", e);
} }
} }
} }
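
A hypothetical sketch of building a field configuration programmatically; the field and comparator names are made up, and "threshold" is the parameter read back by TreeNodeDef.evaluate via getOrDefault("threshold", "1.0").

import java.util.HashMap;
import java.util.Map;

import eu.dnetlib.pace.tree.support.FieldConf;

public class FieldConfSketch {
	public static void main(String[] args) {
		Map<String, String> params = new HashMap<>();
		params.put("threshold", "0.9"); // per-field threshold, used by the AND/OR aggregations

		FieldConf fc = new FieldConf("legalname", "cityMatch", 1.0, params, true);
		System.out.println(fc); // toString() serialises the bean to JSON with Jackson
	}
}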

View File

@ -1,87 +1,89 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.databind.ObjectMapper; package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.util.PaceException;
/** /**
* The class that contains the result of each comparison in the decision tree * The class that contains the result of each comparison in the decision tree
* */ * */
public class FieldStats implements Serializable { public class FieldStats implements Serializable {
private double weight; //weight for the field (to be used in the aggregation) private double weight; // weight for the field (to be used in the aggregation)
private double threshold; //threshold for the field (to be used in some kind of aggregations) private double threshold; // threshold for the field (to be used in some kind of aggregations)
private double result; //the result of the comparison private double result; // the result of the comparison
private Object a; private Object a;
private Object b; private Object b;
private boolean countIfUndefined; private boolean countIfUndefined;
public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Object a, Object b) { public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Object a, Object b) {
this.weight = weight; this.weight = weight;
this.threshold = threshold; this.threshold = threshold;
this.result = result; this.result = result;
this.countIfUndefined = countIfUndefined; this.countIfUndefined = countIfUndefined;
this.a = a; this.a = a;
this.b = b; this.b = b;
} }
public double getThreshold() { public double getThreshold() {
return threshold; return threshold;
} }
public void setThreshold(double threshold) { public void setThreshold(double threshold) {
this.threshold = threshold; this.threshold = threshold;
} }
public double getWeight() { public double getWeight() {
return weight; return weight;
} }
public void setWeight(double weight) { public void setWeight(double weight) {
this.weight = weight; this.weight = weight;
} }
public double getResult() { public double getResult() {
return result; return result;
} }
public void setResult(double result) { public void setResult(double result) {
this.result = result; this.result = result;
} }
public boolean isCountIfUndefined() { public boolean isCountIfUndefined() {
return countIfUndefined; return countIfUndefined;
} }
public void setCountIfUndefined(boolean countIfUndefined) { public void setCountIfUndefined(boolean countIfUndefined) {
this.countIfUndefined = countIfUndefined; this.countIfUndefined = countIfUndefined;
} }
public Object getA() { public Object getA() {
return a; return a;
} }
public void setA(Object a) { public void setA(Object a) {
this.a = a; this.a = a;
} }
public Object getB() { public Object getB() {
return b; return b;
} }
public void setB(Object b) { public void setB(Object b) {
this.b = b; this.b = b;
} }
@Override @Override
public String toString(){ public String toString() {
try { try {
return new ObjectMapper().writeValueAsString(this); return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) { } catch (IOException e) {
throw new PaceException("Impossible to convert to JSON: ", e); throw new PaceException("Impossible to convert to JSON: ", e);
} }
} }
} }

View File

@ -1,20 +1,19 @@
package eu.dnetlib.pace.tree.support; package eu.dnetlib.pace.tree.support;
public enum MatchType { public enum MatchType {
MATCH, MATCH, NO_MATCH, UNDEFINED;
NO_MATCH,
UNDEFINED;
public static MatchType parse(String value) { public static MatchType parse(String value) {
if (MATCH.name().equals(value)) { if (MATCH.name().equals(value)) {
return MATCH; return MATCH;
} else if (NO_MATCH.name().equals(value)) { } else if (NO_MATCH.name().equals(value)) {
return NO_MATCH; return NO_MATCH;
} else { } else {
return UNDEFINED; return UNDEFINED;
} }
// try { // try {
// return MatchType.valueOf(value); // return MatchType.valueOf(value);
@ -22,5 +21,5 @@ public enum MatchType {
// catch (IllegalArgumentException e) { // catch (IllegalArgumentException e) {
// return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable // return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable
// } // }
} }
} }

View File

@ -1,166 +1,170 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.databind.ObjectMapper; package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.util.PaceException;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StringType;
import java.io.IOException; import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.List;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StringType;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.util.PaceException;
public class TreeNodeDef implements Serializable { public class TreeNodeDef implements Serializable {
final static String CROSS_COMPARE = "crossCompare"; final static String CROSS_COMPARE = "crossCompare";
private List<FieldConf> fields; private List<FieldConf> fields;
private AggType aggregation; private AggType aggregation;
private double threshold; private double threshold;
private String positive; private String positive;
private String negative; private String negative;
private String undefined; private String undefined;
boolean ignoreUndefined; boolean ignoreUndefined;
public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreUndefined) { public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative,
this.fields = fields; String undefined, boolean ignoreUndefined) {
this.aggregation = aggregation; this.fields = fields;
this.threshold = threshold; this.aggregation = aggregation;
this.positive = positive; this.threshold = threshold;
this.negative = negative; this.positive = positive;
this.undefined = undefined; this.negative = negative;
this.ignoreUndefined = ignoreUndefined; this.undefined = undefined;
} this.ignoreUndefined = ignoreUndefined;
}
public TreeNodeDef() {} public TreeNodeDef() {
}
//function for the evaluation of the node // function for the evaluation of the node
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) { public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
TreeNodeStats stats = new TreeNodeStats(); TreeNodeStats stats = new TreeNodeStats();
		//for each field in the node, it computes the comparison result // for each field in the node, it computes the comparison result
for (FieldConf fieldConf : fields) { for (FieldConf fieldConf : fields) {
double weight = fieldConf.getWeight(); double weight = fieldConf.getWeight();
double result; double result;
Object value1 = getJavaValue(doc1,fieldConf.getField()); Object value1 = getJavaValue(doc1, fieldConf.getField());
Object value2 = getJavaValue(doc2,fieldConf.getField()); Object value2 = getJavaValue(doc2, fieldConf.getField());
// if the param specifies a cross comparison (i.e. compare elements from different fields), compute the
// result for both sides and return the maximum
String crossField = fieldConf.getParams().get(CROSS_COMPARE);
if (crossField != null) {
double result1 = comparator(fieldConf).compare(value1, getJavaValue(doc2, crossField), conf);
double result2 = comparator(fieldConf).compare(getJavaValue(doc1, crossField), value2, conf);
result = Math.max(result1, result2);
} else {
result = comparator(fieldConf).compare(value1, value2, conf);
}
//if the param specifies a cross comparison (i.e. compare elements from different fields), compute the result for both sides and return the maximum stats
String crossField = fieldConf.getParams().get(CROSS_COMPARE); .addFieldStats(
if (crossField != null) { fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
double result1 = comparator(fieldConf).compare(value1, getJavaValue(doc2,crossField), conf); new FieldStats(
double result2 = comparator(fieldConf).compare(getJavaValue(doc1,crossField), value2, conf); weight,
result = Math.max(result1,result2); Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")),
} result,
else { fieldConf.isCountIfUndefined(),
result = comparator(fieldConf).compare(value1, value2, conf); value1,
} value2));
}
stats.addFieldStats( return stats;
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), }
new FieldStats(
weight,
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")),
result,
fieldConf.isCountIfUndefined(),
value1,
value2
));
}
return stats; public Object getJavaValue(Row row, String name) {
} int pos = row.fieldIndex(name);
if (pos >= 0) {
DataType dt = row.schema().fields()[pos].dataType();
if (dt instanceof StringType) {
return row.getString(pos);
} else if (dt instanceof ArrayType) {
return row.getList(pos);
}
}
public Object getJavaValue(Row row, String name) { return null;
int pos = row.fieldIndex(name); }
if (pos >= 0) {
DataType dt = row.schema().fields()[pos].dataType();
if (dt instanceof StringType) {
return row.getString(pos);
} else if (dt instanceof ArrayType) {
return row.getList(pos);
}
}
return null; private Comparator comparator(final FieldConf field) {
}
private Comparator comparator(final FieldConf field){ return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams());
}
return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams()); public List<FieldConf> getFields() {
} return fields;
}
public List<FieldConf> getFields() { public void setFields(List<FieldConf> fields) {
return fields; this.fields = fields;
} }
public void setFields(List<FieldConf> fields) { public AggType getAggregation() {
this.fields = fields; return aggregation;
} }
public AggType getAggregation() { public void setAggregation(AggType aggregation) {
return aggregation; this.aggregation = aggregation;
} }
public void setAggregation(AggType aggregation) { public double getThreshold() {
this.aggregation = aggregation; return threshold;
} }
public double getThreshold() { public void setThreshold(double threshold) {
return threshold; this.threshold = threshold;
} }
public void setThreshold(double threshold) { public String getPositive() {
this.threshold = threshold; return positive;
} }
public String getPositive() { public void setPositive(String positive) {
return positive; this.positive = positive;
} }
public void setPositive(String positive) { public String getNegative() {
this.positive = positive; return negative;
} }
public String getNegative() { public void setNegative(String negative) {
return negative; this.negative = negative;
} }
public void setNegative(String negative) { public String getUndefined() {
this.negative = negative; return undefined;
} }
public String getUndefined() { public void setUndefined(String undefined) {
return undefined; this.undefined = undefined;
} }
public void setUndefined(String undefined) { public boolean isIgnoreUndefined() {
this.undefined = undefined; return ignoreUndefined;
} }
public boolean isIgnoreUndefined() { public void setIgnoreUndefined(boolean ignoreUndefined) {
return ignoreUndefined; this.ignoreUndefined = ignoreUndefined;
} }
public void setIgnoreUndefined(boolean ignoreUndefined) { @Override
this.ignoreUndefined = ignoreUndefined; public String toString() {
} try {
return new ObjectMapper().writeValueAsString(this);
@Override } catch (IOException e) {
public String toString() { throw new PaceException("Impossible to convert to JSON: ", e);
try { }
return new ObjectMapper().writeValueAsString(this); }
} catch (IOException e) {
throw new PaceException("Impossible to convert to JSON: ", e);
}
}
} }
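
A hypothetical sketch of a single decision-tree node wired up in code rather than JSON: two weighted fields aggregated with W_MEAN and a 0.7 threshold routing to MATCH / NO_MATCH / UNDEFINED. The field and comparator names are assumptions.

import java.util.Arrays;
import java.util.HashMap;

import eu.dnetlib.pace.tree.support.AggType;
import eu.dnetlib.pace.tree.support.FieldConf;
import eu.dnetlib.pace.tree.support.TreeNodeDef;

public class TreeNodeDefSketch {
	public static void main(String[] args) {
		FieldConf title = new FieldConf("title", "levensteinTitle", 2.0, new HashMap<>(), true);
		FieldConf authors = new FieldConf("authors", "sizeMatch", 1.0, new HashMap<>(), false);

		TreeNodeDef node = new TreeNodeDef(
			Arrays.asList(title, authors), AggType.W_MEAN, 0.7,
			"MATCH", "NO_MATCH", "UNDEFINED", false);

		System.out.println(node); // JSON representation via toString()
	}
}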

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.tree.support; package eu.dnetlib.pace.tree.support;
import java.io.Serializable; import java.io.Serializable;
@ -6,129 +7,128 @@ import java.util.Map;
public class TreeNodeStats implements Serializable { public class TreeNodeStats implements Serializable {
private Map<String, FieldStats> results; //this is an accumulator for the results of the node private Map<String, FieldStats> results; // this is an accumulator for the results of the node
public TreeNodeStats(){ public TreeNodeStats() {
this.results = new HashMap<>(); this.results = new HashMap<>();
} }
public Map<String, FieldStats> getResults() { public Map<String, FieldStats> getResults() {
return results; return results;
} }
public void addFieldStats(String id, FieldStats fieldStats){ public void addFieldStats(String id, FieldStats fieldStats) {
this.results.put(id, fieldStats); this.results.put(id, fieldStats);
} }
public int fieldsCount(){ public int fieldsCount() {
return this.results.size(); return this.results.size();
} }
public int undefinedCount(){ public int undefinedCount() {
int undefinedCount = 0; int undefinedCount = 0;
for(FieldStats fs: this.results.values()){ for (FieldStats fs : this.results.values()) {
if(fs.getResult() == -1) if (fs.getResult() == -1)
undefinedCount ++; undefinedCount++;
} }
return undefinedCount; return undefinedCount;
} }
public double scoreSum(){ public double scoreSum() {
double scoreSum = 0.0; double scoreSum = 0.0;
for(FieldStats fs: this.results.values()){ for (FieldStats fs : this.results.values()) {
if(fs.getResult()>=0.0) { if (fs.getResult() >= 0.0) {
scoreSum += fs.getResult(); scoreSum += fs.getResult();
} }
} }
return scoreSum; return scoreSum;
} }
	//return the sum of the weights without considering the fields with countIfUndefined=false && result=-1 // return the sum of the weights without considering the fields with countIfUndefined=false && result=-1
public double weightSum(){ public double weightSum() {
double weightSum = 0.0; double weightSum = 0.0;
for(FieldStats fs: this.results.values()){ for (FieldStats fs : this.results.values()) {
if(fs.getResult()>=0.0 || (fs.getResult()<0.0 && fs.isCountIfUndefined())) { if (fs.getResult() >= 0.0 || (fs.getResult() < 0.0 && fs.isCountIfUndefined())) {
weightSum += fs.getWeight(); weightSum += fs.getWeight();
} }
} }
return weightSum; return weightSum;
} }
public double weightedScoreSum(){ public double weightedScoreSum() {
double weightedScoreSum = 0.0; double weightedScoreSum = 0.0;
for(FieldStats fs: this.results.values()){ for (FieldStats fs : this.results.values()) {
if(fs.getResult()>=0.0) { if (fs.getResult() >= 0.0) {
weightedScoreSum += fs.getResult()*fs.getWeight(); weightedScoreSum += fs.getResult() * fs.getWeight();
} }
} }
return weightedScoreSum; return weightedScoreSum;
} }
public double max(){ public double max() {
double max = -1.0; double max = -1.0;
for(FieldStats fs: this.results.values()){ for (FieldStats fs : this.results.values()) {
if(fs.getResult()>max) if (fs.getResult() > max)
max = fs.getResult(); max = fs.getResult();
} }
return max; return max;
} }
public double min(){ public double min() {
double min = 100.0; //random high value double min = 100.0; // random high value
for(FieldStats fs: this.results.values()){ for (FieldStats fs : this.results.values()) {
if(fs.getResult()<min) { if (fs.getResult() < min) {
if (fs.getResult()>=0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined())) if (fs.getResult() >= 0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
min = fs.getResult(); min = fs.getResult();
} }
} }
return min; return min;
} }
//if at least one is true, return 1.0 // if at least one is true, return 1.0
public double or(){ public double or() {
for (FieldStats fieldStats : this.results.values()) { for (FieldStats fieldStats : this.results.values()) {
if (fieldStats.getResult() >= fieldStats.getThreshold()) if (fieldStats.getResult() >= fieldStats.getThreshold())
return 1.0; return 1.0;
} }
return 0.0; return 0.0;
} }
//if at least one is false, return 0.0 // if at least one is false, return 0.0
public double and() { public double and() {
for (FieldStats fieldStats : this.results.values()) { for (FieldStats fieldStats : this.results.values()) {
if (fieldStats.getResult() == -1) { if (fieldStats.getResult() == -1) {
if (fieldStats.isCountIfUndefined()) if (fieldStats.isCountIfUndefined())
return 0.0; return 0.0;
} } else {
else { if (fieldStats.getResult() < fieldStats.getThreshold())
if (fieldStats.getResult() < fieldStats.getThreshold()) return 0.0;
return 0.0; }
}
} }
return 1.0; return 1.0;
} }
public double getFinalScore(AggType aggregation){ public double getFinalScore(AggType aggregation) {
switch (aggregation){ switch (aggregation) {
case AVG: case AVG:
return scoreSum()/fieldsCount(); return scoreSum() / fieldsCount();
case SUM: case SUM:
return scoreSum(); return scoreSum();
case MAX: case MAX:
return max(); return max();
case MIN: case MIN:
return min(); return min();
case W_MEAN: case W_MEAN:
return weightedScoreSum()/weightSum(); return weightedScoreSum() / weightSum();
case OR: case OR:
return or(); return or();
case AND: case AND:
return and(); return and();
default: default:
return 0.0; return 0.0;
} }
} }
} }
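
A hypothetical worked example of the aggregations above, with two field results of weight 2.0 and 1.0: W_MEAN = (0.8*2.0 + 0.5*1.0) / (2.0 + 1.0) = 0.7, MAX = 0.8, and AND = 0.0 because 0.5 falls below its 0.6 threshold.

import eu.dnetlib.pace.tree.support.AggType;
import eu.dnetlib.pace.tree.support.FieldStats;
import eu.dnetlib.pace.tree.support.TreeNodeStats;

public class TreeNodeStatsSketch {
	public static void main(String[] args) {
		TreeNodeStats stats = new TreeNodeStats();
		stats.addFieldStats("titleMatch", new FieldStats(2.0, 0.6, 0.8, true, "t1", "t2"));
		stats.addFieldStats("authorsMatch", new FieldStats(1.0, 0.6, 0.5, true, "a1", "a2"));

		System.out.println(stats.getFinalScore(AggType.W_MEAN)); // ~0.7 (weighted mean)
		System.out.println(stats.getFinalScore(AggType.MAX)); // 0.8
		System.out.println(stats.getFinalScore(AggType.AND)); // 0.0
	}
}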

View File

@ -1,11 +1,12 @@
package eu.dnetlib.pace.tree.support; package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.Row; import org.apache.spark.sql.Row;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.util.PaceException;
/** /**
 * The comparison between two documents is given by the weighted mean of the field distances * The comparison between two documents is given by the weighted mean of the field distances
@ -23,11 +24,11 @@ public class TreeProcessor {
// row based copies // row based copies
public boolean compare(final Row a, final Row b) { public boolean compare(final Row a, final Row b) {
//evaluate the decision tree // evaluate the decision tree
return evaluateTree(a, b).getResult() == MatchType.MATCH; return evaluateTree(a, b).getResult() == MatchType.MATCH;
} }
public TreeStats evaluateTree(final Row doc1, final Row doc2){ public TreeStats evaluateTree(final Row doc1, final Row doc2) {
TreeStats treeStats = new TreeStats(); TreeStats treeStats = new TreeStats();
@ -36,26 +37,25 @@ public class TreeProcessor {
do { do {
TreeNodeDef currentNode = config.decisionTree().get(nextNodeName); TreeNodeDef currentNode = config.decisionTree().get(nextNodeName);
//throw an exception if the node doesn't exist // throw an exception if the node doesn't exist
if (currentNode == null) if (currentNode == null)
throw new PaceException("Missing tree node: " + nextNodeName); throw new PaceException("Missing tree node: " + nextNodeName);
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config); TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
treeStats.addNodeStats(nextNodeName, stats); treeStats.addNodeStats(nextNodeName, stats);
//if ignoreUndefined=false the miss is considered as undefined // if ignoreUndefined=false the miss is considered as undefined
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) { if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
nextNodeName = currentNode.getUndefined(); nextNodeName = currentNode.getUndefined();
} }
//if ignoreUndefined=true the miss is ignored and the score computed anyway // if ignoreUndefined=true the miss is ignored and the score computed anyway
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) { else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
nextNodeName = currentNode.getPositive(); nextNodeName = currentNode.getPositive();
} } else {
else {
nextNodeName = currentNode.getNegative(); nextNodeName = currentNode.getNegative();
} }
} while (MatchType.parse(nextNodeName)==MatchType.UNDEFINED); } while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED);
treeStats.setResult(MatchType.parse(nextNodeName)); treeStats.setResult(MatchType.parse(nextNodeName));
return treeStats; return treeStats;
@ -68,25 +68,24 @@ public class TreeProcessor {
do { do {
TreeNodeDef currentNode = config.decisionTree().get(nextNodeName); TreeNodeDef currentNode = config.decisionTree().get(nextNodeName);
//throw an exception if the node doesn't exist // throw an exception if the node doesn't exist
if (currentNode == null) if (currentNode == null)
throw new PaceException("The Tree Node doesn't exist: " + nextNodeName); throw new PaceException("The Tree Node doesn't exist: " + nextNodeName);
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config); TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
score = stats.getFinalScore(currentNode.getAggregation()); score = stats.getFinalScore(currentNode.getAggregation());
//if ignoreUndefined=false the miss is considered as undefined // if ignoreUndefined=false the miss is considered as undefined
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) { if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
nextNodeName = currentNode.getUndefined(); nextNodeName = currentNode.getUndefined();
} }
//if ignoreUndefined=true the miss is ignored and the score computed anyway // if ignoreUndefined=true the miss is ignored and the score computed anyway
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) { else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
nextNodeName = currentNode.getPositive(); nextNodeName = currentNode.getPositive();
} } else {
else {
nextNodeName = currentNode.getNegative(); nextNodeName = currentNode.getNegative();
} }
} while (MatchType.parse(nextNodeName)==MatchType.UNDEFINED); } while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED);
return score; return score;
} }
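
A minimal sketch of how the tree is driven from client code, assuming a DedupConfig is already loaded and the two rows share the schema its tree nodes refer to.

import org.apache.spark.sql.Row;

import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import eu.dnetlib.pace.tree.support.TreeStats;

public class TreeProcessorSketch {

	// rows a and b are assumed to be built elsewhere with the fields referenced by the tree nodes
	public static boolean sameEntity(DedupConfig conf, Row a, Row b) {
		TreeProcessor processor = new TreeProcessor(conf);

		TreeStats trace = processor.evaluateTree(a, b); // full per-node trace
		System.out.println(trace); // pretty-printed JSON of every node's FieldStats

		return processor.compare(a, b); // true only when the walk ends on MATCH
	}
}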

View File

@ -1,51 +1,52 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException; package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException; import java.io.IOException;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.util.PaceException;
public class TreeStats { public class TreeStats {
//<layer_id, <field:comparator, result>> // <layer_id, <field:comparator, result>>
Map<String, TreeNodeStats> stats; Map<String, TreeNodeStats> stats;
MatchType result; MatchType result;
public TreeStats(){ public TreeStats() {
this.stats = new HashMap<>(); this.stats = new HashMap<>();
this.result = MatchType.NO_MATCH; this.result = MatchType.NO_MATCH;
} }
public MatchType getResult(){ public MatchType getResult() {
return this.result; return this.result;
} }
public void setResult(MatchType result){ public void setResult(MatchType result) {
this.result = result; this.result = result;
} }
public Map<String, TreeNodeStats> getStats() { public Map<String, TreeNodeStats> getStats() {
return stats; return stats;
} }
public void setStats(Map<String, TreeNodeStats> stats) { public void setStats(Map<String, TreeNodeStats> stats) {
this.stats = stats; this.stats = stats;
} }
public void addNodeStats(String layerID, TreeNodeStats treeNodeStats){ public void addNodeStats(String layerID, TreeNodeStats treeNodeStats) {
this.stats.put(layerID, treeNodeStats); this.stats.put(layerID, treeNodeStats);
} }
@Override
public String toString(){
try {
return new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
} catch (IOException e) {
throw new PaceException("Impossible to convert to JSON: ", e);
}
}
@Override
public String toString() {
try {
return new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
} catch (IOException e) {
throw new PaceException("Impossible to convert to JSON: ", e);
}
}
} }

View File

@ -1,8 +1,11 @@
package eu.dnetlib.pace.util; package eu.dnetlib.pace.util;
import eu.dnetlib.pace.config.DedupConfig; import java.util.ArrayList;
import eu.dnetlib.pace.config.WfConfig; import java.util.Collection;
import eu.dnetlib.pace.tree.support.TreeProcessor; import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
@ -12,127 +15,137 @@ import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StringType; import org.apache.spark.sql.types.StringType;
import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.types.StructType;
import java.util.ArrayList; import eu.dnetlib.pace.config.DedupConfig;
import java.util.Collection; import eu.dnetlib.pace.config.WfConfig;
import java.util.Iterator; import eu.dnetlib.pace.tree.support.TreeProcessor;
import java.util.List;
public class BlockProcessor { public class BlockProcessor {
public static final List<String> accumulators= new ArrayList<>(); public static final List<String> accumulators = new ArrayList<>();
private static final Log log = LogFactory.getLog(BlockProcessor.class); private static final Log log = LogFactory.getLog(BlockProcessor.class);
private DedupConfig dedupConf; private DedupConfig dedupConf;
private final int identifierFieldPos; private final int identifierFieldPos;
private final int orderFieldPos; private final int orderFieldPos;
public static void constructAccumulator( final DedupConfig dedupConf) { public static void constructAccumulator(final DedupConfig dedupConf) {
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1")); accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"));
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField())); accumulators
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()))); .add(
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "skip list")); String
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)")); .format(
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold())); "%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
} accumulators
.add(
String
.format(
"%s::%s", dedupConf.getWf().getEntityType(),
String
.format(
"Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(),
dedupConf.getWf().getGroupMaxSize())));
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"));
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"));
accumulators
.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));
}
public BlockProcessor(DedupConfig dedupConf, int identifierFieldPos, int orderFieldPos) { public BlockProcessor(DedupConfig dedupConf, int identifierFieldPos, int orderFieldPos) {
this.dedupConf = dedupConf; this.dedupConf = dedupConf;
this.identifierFieldPos = identifierFieldPos; this.identifierFieldPos = identifierFieldPos;
this.orderFieldPos = orderFieldPos; this.orderFieldPos = orderFieldPos;
} }
public void processSortedRows(final Collection<Row> documents, final Reporter context) { public void processSortedRows(final Collection<Row> documents, final Reporter context) {
if (documents.size() > 1) { if (documents.size() > 1) {
// log.info("reducing key: '" + key + "' records: " + q.size()); // log.info("reducing key: '" + key + "' records: " + q.size());
processRows(documents, context); processRows(documents, context);
} else { } else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
} }
} }
private void processRows(final Collection<Row> queue, final Reporter context) {
private void processRows(final Collection<Row> queue, final Reporter context) { Iterator<Row> it = queue.iterator();
while (it.hasNext()) {
Iterator<Row> it = queue.iterator(); final Row pivot = it.next();
while (it.hasNext()) { it.remove();
final Row pivot = it.next(); final String idPivot = pivot.getString(identifierFieldPos); // identifier
it.remove(); final Object fieldsPivot = getJavaValue(pivot, orderFieldPos);
final String fieldPivot = (fieldsPivot == null) ? "" : fieldsPivot.toString();
final WfConfig wf = dedupConf.getWf();
if (fieldPivot != null) {
int i = 0;
for (final Row curr : queue) {
final String idCurr = curr.getString(identifierFieldPos); // identifier
final String idPivot = pivot.getString(identifierFieldPos); //identifier if (mustSkip(idCurr)) {
final Object fieldsPivot = getJavaValue(pivot, orderFieldPos);
final String fieldPivot = (fieldsPivot == null) ? "" : fieldsPivot.toString();
final WfConfig wf = dedupConf.getWf();
if (fieldPivot != null) { context.incrementCounter(wf.getEntityType(), "skip list", 1);
int i = 0;
for (final Row curr : queue) {
final String idCurr = curr.getString(identifierFieldPos); //identifier
if (mustSkip(idCurr)) { break;
}
context.incrementCounter(wf.getEntityType(), "skip list", 1); if (i > wf.getSlidingWindowSize()) {
break;
}
break; final Object fieldsCurr = getJavaValue(curr, orderFieldPos);
} final String fieldCurr = (fieldsCurr == null) ? null : fieldsCurr.toString();
if (i > wf.getSlidingWindowSize()) { if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
break;
}
final Object fieldsCurr = getJavaValue(curr, orderFieldPos); final TreeProcessor treeProcessor = new TreeProcessor(dedupConf);
final String fieldCurr = (fieldsCurr == null) ? null : fieldsCurr.toString();
if (!idCurr.equals(idPivot) && (fieldCurr != null)) { emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context);
final TreeProcessor treeProcessor = new TreeProcessor(dedupConf); }
}
}
}
}
emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context); public Object getJavaValue(Row row, int pos) {
DataType dt = row.schema().fields()[pos].dataType();
if (dt instanceof StringType) {
return row.getString(pos);
} else if (dt instanceof ArrayType) {
return row.getList(pos);
}
} return null;
} }
}
}
}
public Object getJavaValue(Row row, int pos) { private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
DataType dt = row.schema().fields()[pos].dataType();
if (dt instanceof StringType) {
return row.getString(pos);
} else if (dt instanceof ArrayType) {
return row.getList(pos);
}
return null; if (result) {
} writeSimilarity(context, idPivot, idCurr);
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
}
}
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { private boolean mustSkip(final String idPivot) {
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
}
if (result) { private String getNsPrefix(final String id) {
writeSimilarity(context, idPivot, idCurr); return StringUtils.substringBetween(id, "|", "::");
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1); }
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
}
}
private boolean mustSkip(final String idPivot) { private void writeSimilarity(final Reporter context, final String from, final String to) {
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot)); final String type = dedupConf.getWf().getEntityType();
}
private String getNsPrefix(final String id) { context.emit(type, from, to);
return StringUtils.substringBetween(id, "|", "::"); context.emit(type, to, from);
} }
private void writeSimilarity(final Reporter context, final String from, final String to) {
final String type = dedupConf.getWf().getEntityType();
context.emit(type, from, to);
context.emit(type, to, from);
}
} }
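
A hypothetical sketch of wiring the block processor: rows already grouped and sorted under one clustering key are compared pairwise inside the sliding window, and the outcome is pushed through a Reporter. The identifier/order positions 0 and 1 are assumptions about the row schema.

import java.util.Collection;

import org.apache.spark.sql.Row;

import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.BlockProcessor;
import eu.dnetlib.pace.util.Reporter;

public class BlockProcessorSketch {

	public static void processBlock(DedupConfig conf, Collection<Row> sortedBlock) {
		BlockProcessor processor = new BlockProcessor(conf, 0, 1); // identifier and order field positions

		Reporter reporter = new Reporter() {

			@Override
			public void incrementCounter(String group, String name, long delta) {
				System.out.println(group + "::" + name + " += " + delta);
			}

			@Override
			public void emit(String type, String from, String to) {
				System.out.println(type + ": " + from + " ~ " + to); // candidate similarity relation
			}
		};

		processor.processSortedRows(sortedBlock, reporter);
	}
}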

View File

@ -1,15 +1,18 @@
package eu.dnetlib.pace.util; package eu.dnetlib.pace.util;
import org.apache.commons.lang3.text.WordUtils;
import com.google.common.base.Function; import com.google.common.base.Function;
import org.apache.commons.lang3.text.WordUtils;
public class Capitalise implements Function<String, String> { public class Capitalise implements Function<String, String> {
private final char[] DELIM = {' ', '-'}; private final char[] DELIM = {
' ', '-'
};
@Override @Override
public String apply(final String s) { public String apply(final String s) {
return WordUtils.capitalize(s.toLowerCase(), DELIM); return WordUtils.capitalize(s.toLowerCase(), DELIM);
} }
}; };

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.util; package eu.dnetlib.pace.util;
import com.google.common.base.Function; import com.google.common.base.Function;
@ -7,4 +8,4 @@ public class DotAbbreviations implements Function<String, String> {
public String apply(String s) { public String apply(String s) {
return s.length() == 1 ? s + "." : s; return s.length() == 1 ? s + "." : s;
} }
}; };
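
A hypothetical usage of the two Guava functions above (Capitalise and DotAbbreviations), for instance when normalising author name tokens:

import com.google.common.collect.Lists;

import eu.dnetlib.pace.util.Capitalise;
import eu.dnetlib.pace.util.DotAbbreviations;

public class NameFunctionsSketch {
	public static void main(String[] args) {
		System.out.println(Lists.transform(Lists.newArrayList("maria", "de-la", "o"), new Capitalise()));
		// [Maria, De-La, O]; capitalisation restarts after both ' ' and '-'

		System.out.println(Lists.transform(Lists.newArrayList("J", "Smith"), new DotAbbreviations()));
		// [J., Smith]; only single-letter tokens get a trailing dot
	}
}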

View File

@ -1,117 +1,172 @@
package eu.dnetlib.pace.util;
import com.fasterxml.jackson.core.JsonProcessingException; package eu.dnetlib.pace.util;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.Option;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.*;
import net.minidev.json.JSONArray;
import java.math.BigDecimal; import java.math.BigDecimal;
import java.util.*; import java.util.*;
import java.util.function.Predicate; import java.util.function.Predicate;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.Option;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.*;
import net.minidev.json.JSONArray;
public class MapDocumentUtil { public class MapDocumentUtil {
public static final String URL_REGEX = "^(http|https|ftp)\\://.*"; public static final String URL_REGEX = "^(http|https|ftp)\\://.*";
public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX); public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX);
public static List<String> getJPathList(String path, String json, Type type) { public static List<String> getJPathList(String path, String json, Type type) {
if (type == Type.List) if (type == Type.List)
return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path); return JsonPath
Object jresult; .using(
List<String> result = new ArrayList<>(); Configuration
try { .defaultConfiguration()
jresult = JsonPath.read(json, path); .addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS))
} catch (Throwable e) { .parse(json)
return result; .read(path);
} Object jresult;
if (jresult instanceof JSONArray) { List<String> result = new ArrayList<>();
try {
jresult = JsonPath.read(json, path);
} catch (Throwable e) {
return result;
}
if (jresult instanceof JSONArray) {
((JSONArray) jresult).forEach(it -> { ((JSONArray) jresult).forEach(it -> {
try { try {
result.add(new ObjectMapper().writeValueAsString(it)); result.add(new ObjectMapper().writeValueAsString(it));
} catch (JsonProcessingException e) { } catch (JsonProcessingException e) {
} }
} });
); return result;
return result; }
}
if (jresult instanceof LinkedHashMap) { if (jresult instanceof LinkedHashMap) {
try { try {
result.add(new ObjectMapper().writeValueAsString(jresult)); result.add(new ObjectMapper().writeValueAsString(jresult));
} catch (JsonProcessingException e) { } catch (JsonProcessingException e) {
} }
return result; return result;
} }
if (jresult instanceof String) { if (jresult instanceof String) {
result.add((String) jresult); result.add((String) jresult);
} }
return result; return result;
} }
public static String getJPathString(final String jsonPath, final String json) {
try {
Object o = JsonPath.read(json, jsonPath);
if (o instanceof String)
return (String) o;
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
return (String) ((JSONArray) o).get(0);
return "";
} catch (Exception e) {
return "";
}
}
public static String getJPathString(final String jsonPath, final String json) { public static double[] getJPathArray(final String jsonPath, final String json) {
try { try {
Object o = JsonPath.read(json, jsonPath); Object o = JsonPath.read(json, jsonPath);
if (o instanceof String) if (o instanceof double[])
return (String)o; return (double[]) o;
if (o instanceof JSONArray && ((JSONArray)o).size()>0) if (o instanceof JSONArray) {
return (String)((JSONArray)o).get(0); Object[] objects = ((JSONArray) o).toArray();
return ""; double[] array = new double[objects.length];
} catch (Exception e) { for (int i = 0; i < objects.length; i++) {
return ""; if (objects[i] instanceof BigDecimal)
} array[i] = ((BigDecimal) objects[i]).doubleValue();
} else
array[i] = (double) objects[i];
}
return array;
}
return new double[0];
} catch (Exception e) {
e.printStackTrace();
return new double[0];
}
}
public static double[] getJPathArray(final String jsonPath, final String json) { public static String truncateValue(String value, int length) {
try { if (value == null)
Object o = JsonPath.read(json, jsonPath); return "";
if (o instanceof double[])
return (double[]) o;
if (o instanceof JSONArray) {
Object[] objects = ((JSONArray) o).toArray();
double[] array = new double[objects.length];
for (int i = 0; i < objects.length; i++) {
if (objects[i] instanceof BigDecimal)
array[i] = ((BigDecimal)objects[i]).doubleValue();
else
array[i] = (double) objects[i];
}
return array;
}
return new double[0];
}
catch (Exception e) {
e.printStackTrace();
return new double[0];
}
}
if (length == -1 || length > value.length())
return value;
public static String truncateValue(String value, int length) { return value.substring(0, length);
if (value == null) }
return "";
if (length == -1 || length > value.length()) public static List<String> truncateList(List<String> list, int size) {
return value; if (size == -1 || size > list.size())
return list;
return value.substring(0, length); return list.subList(0, size);
} }
public static List<String> truncateList(List<String> list, int size) { public static String getJPathString(final String jsonPath, final DocumentContext json) {
if (size == -1 || size > list.size()) try {
return list; Object o = json.read(jsonPath);
if (o instanceof String)
return (String) o;
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
return (String) ((JSONArray) o).get(0);
return "";
} catch (Exception e) {
return "";
}
}
return list.subList(0, size); public static List<String> getJPathList(String path, DocumentContext json, Type type) {
} // if (type == Type.List)
// return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST,
// Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path);
Object jresult;
List<String> result = new ArrayList<>();
try {
jresult = json.read(path);
} catch (Throwable e) {
return result;
}
if (jresult instanceof JSONArray) {
((JSONArray) jresult).forEach(it -> {
try {
result.add(new ObjectMapper().writeValueAsString(it));
} catch (JsonProcessingException e) {
}
});
return result;
}
if (jresult instanceof LinkedHashMap) {
try {
result.add(new ObjectMapper().writeValueAsString(jresult));
} catch (JsonProcessingException e) {
}
return result;
}
if (jresult instanceof String) {
result.add((String) jresult);
}
return result;
}
} }
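
A hypothetical usage of the JSON-path helpers above on a tiny record (field names invented for the example):

import java.util.List;

import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.util.MapDocumentUtil;

public class MapDocumentUtilSketch {
	public static void main(String[] args) {
		String json = "{\"title\":[{\"value\":\"Test article\"}],\"pid\":[\"10.1234/abc\",\"10.5678/def\"]}";

		// first match as a plain string
		System.out.println(MapDocumentUtil.getJPathString("$.title[0].value", json)); // Test article

		// every match of the path, as a list
		List<String> pids = MapDocumentUtil.getJPathList("$.pid[*]", json, Type.List);
		System.out.println(pids); // [10.1234/abc, 10.5678/def]

		// values longer than the limit are cut
		System.out.println(MapDocumentUtil.truncateValue("Test article", 4)); // Test
	}
}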

View File

@ -1,13 +1,14 @@
package eu.dnetlib.pace.util; package eu.dnetlib.pace.util;
public class PaceException extends RuntimeException { public class PaceException extends RuntimeException {
public PaceException(String s, Throwable e){ public PaceException(String s, Throwable e) {
super(s, e); super(s, e);
} }
public PaceException(String s){ public PaceException(String s) {
super(s); super(s);
} }
} }

View File

@ -1,49 +1,61 @@
package eu.dnetlib.pace.util;
import eu.dnetlib.pace.clustering.ClusteringClass; package eu.dnetlib.pace.util;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.reflections.Reflections;
import java.io.Serializable; import java.io.Serializable;
import java.lang.reflect.InvocationTargetException; import java.lang.reflect.InvocationTargetException;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.reflections.Reflections;
import eu.dnetlib.pace.clustering.ClusteringClass;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
public class PaceResolver implements Serializable { public class PaceResolver implements Serializable {
public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering"); public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering");
public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree"); public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree");
private final Map<String, Class<ClusteringFunction>> clusteringFunctions; private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
private final Map<String, Class<Comparator>> comparators; private final Map<String, Class<Comparator>> comparators;
public PaceResolver() { public PaceResolver() {
this.clusteringFunctions = CLUSTERING_RESOLVER.getTypesAnnotatedWith(ClusteringClass.class).stream() this.clusteringFunctions = CLUSTERING_RESOLVER
.filter(ClusteringFunction.class::isAssignableFrom) .getTypesAnnotatedWith(ClusteringClass.class)
.collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>)cl)); .stream()
.filter(ClusteringFunction.class::isAssignableFrom)
.collect(
Collectors
.toMap(
cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>) cl));
this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream() this.comparators = COMPARATOR_RESOLVER
.filter(Comparator.class::isAssignableFrom) .getTypesAnnotatedWith(ComparatorClass.class)
.collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>)cl)); .stream()
} .filter(Comparator.class::isAssignableFrom)
.collect(
Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>) cl));
}
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException { public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
try { try {
return clusteringFunctions.get(name).getDeclaredConstructor(Map.class).newInstance(params); return clusteringFunctions.get(name).getDeclaredConstructor(Map.class).newInstance(params);
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) { } catch (InstantiationException | IllegalAccessException | InvocationTargetException
throw new PaceException(name + " not found ", e); | NoSuchMethodException e) {
} throw new PaceException(name + " not found ", e);
} }
}
public Comparator getComparator(String name, Map<String, String> params) throws PaceException { public Comparator getComparator(String name, Map<String, String> params) throws PaceException {
try { try {
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params); return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) { } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException
throw new PaceException(name + " not found ", e); | NullPointerException e) {
} throw new PaceException(name + " not found ", e);
} }
}
} }
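
A hypothetical sketch of resolving pace components by the name declared in their annotations, the same mechanism used via the static PaceConfig.resolver during tree evaluation; "ngrams", "exactMatch" and the "max" parameter are assumed names, and unknown names end up as a PaceException.

import java.util.HashMap;
import java.util.Map;

import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.util.PaceResolver;

public class PaceResolverSketch {
	public static void main(String[] args) {
		PaceResolver resolver = new PaceResolver();

		Map<String, Integer> clusteringParams = new HashMap<>();
		clusteringParams.put("max", 4); // assumed parameter name

		ClusteringFunction ngrams = resolver.getClusteringFunction("ngrams", clusteringParams);
		Comparator exactMatch = resolver.getComparator("exactMatch", new HashMap<>());

		System.out.println(ngrams.getClass() + " / " + exactMatch.getClass());
	}
}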

View File

@ -1,11 +1,11 @@
package eu.dnetlib.pace.util;
package eu.dnetlib.pace.util;
import java.io.Serializable; import java.io.Serializable;
public interface Reporter extends Serializable { public interface Reporter extends Serializable {
void incrementCounter(String counterGroup, String counterName, long delta); void incrementCounter(String counterGroup, String counterName, long delta);
void emit(String type, String from, String to); void emit(String type, String from, String to);
} }

View File

@ -0,0 +1,86 @@
package eu.dnetlib.pace.util;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.spark.SparkContext;
import org.apache.spark.util.LongAccumulator;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.Reporter;
import scala.Serializable;
import scala.Tuple2;
public class SparkReporter implements Serializable, Reporter {
private final List<Tuple2<String, String>> relations = new ArrayList<>();
private final Map<String, LongAccumulator> accumulators;
public SparkReporter(Map<String, LongAccumulator> accumulators) {
this.accumulators = accumulators;
}
public void incrementCounter(
String counterGroup,
String counterName,
long delta,
Map<String, LongAccumulator> accumulators) {
final String accumulatorName = String.format("%s::%s", counterGroup, counterName);
if (accumulators.containsKey(accumulatorName)) {
accumulators.get(accumulatorName).add(delta);
}
}
@Override
public void incrementCounter(String counterGroup, String counterName, long delta) {
incrementCounter(counterGroup, counterName, delta, accumulators);
}
@Override
public void emit(String type, String from, String to) {
relations.add(new Tuple2<>(from, to));
}
public List<Tuple2<String, String>> getRelations() {
return relations;
}
public static Map<String, LongAccumulator> constructAccumulator(
final DedupConfig dedupConf, final SparkContext context) {
Map<String, LongAccumulator> accumulators = new HashMap<>();
String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1");
accumulators.put(acc1, context.longAccumulator(acc1));
String acc2 = String
.format(
"%s::%s",
dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField());
accumulators.put(acc2, context.longAccumulator(acc2));
String acc3 = String
.format(
"%s::%s",
dedupConf.getWf().getEntityType(),
String
.format(
"Skipped records for count(%s) >= %s",
dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()));
accumulators.put(acc3, context.longAccumulator(acc3));
String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list");
accumulators.put(acc4, context.longAccumulator(acc4));
String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)");
accumulators.put(acc5, context.longAccumulator(acc5));
String acc6 = String
.format(
"%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold());
accumulators.put(acc6, context.longAccumulator(acc6));
return accumulators;
}
}
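
A minimal sketch of wiring the reporter into a Spark job, assuming an existing SparkSession: accumulators are created once from the dedup configuration and the reporter is then handed to the block processing code; matches collected during processing can be read back with getRelations().

import java.util.Map;

import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;

import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.SparkReporter;

public class SparkReporterSketch {

	public static SparkReporter newReporter(SparkSession spark, DedupConfig conf) {
		Map<String, LongAccumulator> accumulators = SparkReporter.constructAccumulator(conf, spark.sparkContext());
		return new SparkReporter(accumulators); // pass to BlockProcessor.processSortedRows(...)
	}
}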

View File

@ -1,12 +1,14 @@
package eu.dnetlib.pace;
import eu.dnetlib.pace.common.AbstractPaceFunctions; package eu.dnetlib.pace;
import org.apache.commons.io.IOUtils;
import java.io.IOException; import java.io.IOException;
import java.io.StringWriter; import java.io.StringWriter;
import java.util.List; import java.util.List;
import org.apache.commons.io.IOUtils;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
public abstract class AbstractPaceTest extends AbstractPaceFunctions { public abstract class AbstractPaceTest extends AbstractPaceFunctions {
protected String readFromClasspath(final String filename) { protected String readFromClasspath(final String filename) {
@ -35,7 +37,7 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions {
return a; return a;
} }
protected List<String> createFieldList(List<String> strings, String fieldName){ protected List<String> createFieldList(List<String> strings, String fieldName) {
return strings; return strings;
} }

View File

@ -1,17 +1,20 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Lists; package eu.dnetlib.pace.clustering;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.DedupConfig;
import org.junit.jupiter.api.*;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.junit.jupiter.api.*;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.DedupConfig;
public class ClusteringFunctionTest extends AbstractPaceTest { public class ClusteringFunctionTest extends AbstractPaceTest {
private static Map<String, Integer> params; private static Map<String, Integer> params;
@ -20,7 +23,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
@BeforeAll @BeforeAll
public static void setUp() throws Exception { public static void setUp() throws Exception {
params = Maps.newHashMap(); params = Maps.newHashMap();
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ClusteringFunctionTest.class)); conf = DedupConfig
.load(
AbstractPaceFunctions
.readFromClasspath(
"/eu/dnetlib/pace/config/organization.current.conf.json", ClusteringFunctionTest.class));
} }
@Test @Test
@ -210,7 +217,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
} }
@Test @Test
public void testPersonClustering(){ public void testPersonClustering() {
final ClusteringFunction cf = new PersonClustering(params); final ClusteringFunction cf = new PersonClustering(params);
final String s = "Abd-Alla, Abo-el-nour N."; final String s = "Abd-Alla, Abo-el-nour N.";
@ -224,7 +231,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
} }
@Test @Test
public void testPersonHash(){ public void testPersonHash() {
final ClusteringFunction cf = new PersonHash(params); final ClusteringFunction cf = new PersonHash(params);
final String s = "Manghi, Paolo"; final String s = "Manghi, Paolo";
@ -238,7 +245,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
} }
@Test @Test
public void testLastNameFirstInitial(){ public void testLastNameFirstInitial() {
final ClusteringFunction cf = new LastNameFirstInitial(params); final ClusteringFunction cf = new LastNameFirstInitial(params);
final String s = "LI Yonghong"; final String s = "LI Yonghong";
@ -246,4 +253,4 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println(cf.apply(conf, Lists.newArrayList(s))); System.out.println(cf.apply(conf, Lists.newArrayList(s)));
} }
} }

View File

@ -1,56 +1,57 @@
package eu.dnetlib.pace.common; package eu.dnetlib.pace.common;
import org.junit.jupiter.api.*;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.*;
public class PaceFunctionTest extends AbstractPaceFunctions { public class PaceFunctionTest extends AbstractPaceFunctions {
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾."; private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
@Test @Test
public void normalizePidTest(){ public void normalizePidTest() {
assertEquals("identifier", normalizePid("IdentifIer")); assertEquals("identifier", normalizePid("IdentifIer"));
assertEquals("10.1109/tns.2015.2493347", normalizePid("10.1109/TNS.2015.2493347")); assertEquals("10.1109/tns.2015.2493347", normalizePid("10.1109/TNS.2015.2493347"));
assertEquals("10.0001/testdoi", normalizePid("http://dx.doi.org/10.0001/testDOI")); assertEquals("10.0001/testdoi", normalizePid("http://dx.doi.org/10.0001/testDOI"));
assertEquals("10.0001/testdoi", normalizePid("https://dx.doi.org/10.0001/testDOI")); assertEquals("10.0001/testdoi", normalizePid("https://dx.doi.org/10.0001/testDOI"));
} }
@Test @Test
public void filterAllStopwordsTest(){ public void filterAllStopwordsTest() {
assertEquals("universita politecnica marche", filterAllStopWords("universita politecnica delle marche")); assertEquals("universita politecnica marche", filterAllStopWords("universita politecnica delle marche"));
} }
@Test @Test
public void normalizeTest() { public void normalizeTest() {
assertEquals("universitat", normalize("Universität")); assertEquals("universitat", normalize("Universität"));
System.out.println(normalize("İstanbul Ticarət Universiteti")); System.out.println(normalize("İstanbul Ticarət Universiteti"));
} }
@Test @Test
public void cleanupTest() { public void cleanupTest() {
assertEquals("istanbul ticaret universiteti", cleanup("İstanbul Ticarət Universiteti")); assertEquals("istanbul ticaret universiteti", cleanup("İstanbul Ticarət Universiteti"));
System.out.println("cleaned up : " + cleanup(TEST_STRING));
}
System.out.println("cleaned up : " + cleanup(TEST_STRING)); @Test
} public void testGetNumbers() {
System.out.println("Numbers : " + getNumbers(TEST_STRING));
}
@Test @Test
public void testGetNumbers() { public void testRemoveSymbols() {
System.out.println("Numbers : " + getNumbers(TEST_STRING)); System.out.println("Without symbols: " + removeSymbols(TEST_STRING));
} }
@Test @Test
public void testRemoveSymbols() { public void testFixAliases() {
System.out.println("Without symbols: " + removeSymbols(TEST_STRING)); System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
} }
@Test
public void testFixAliases() {
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
}
} }
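The assertions in normalizePidTest above pin down the intended behaviour: lower-case the identifier and strip a leading http(s)://dx.doi.org/ resolver prefix, so that a DOI harvested as a URL and as a bare string collapse to the same key. A minimal sketch of such a helper, written only to illustrate those two steps (the name and regex are assumptions, not the project's implementation):

	// Illustrative only: normalizes a PID the way the expectations above suggest.
	public static String normalizePidSketch(String pid) {
		if (pid == null)
			return null;
		return pid
			.trim()
			.toLowerCase()
			// drop a DOI resolver prefix such as http://dx.doi.org/ or https://dx.doi.org/
			.replaceFirst("^https?://(dx\\.)?doi\\.org/", "");
	}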

View File

@ -1,16 +1,18 @@
package eu.dnetlib.pace.comparators; package eu.dnetlib.pace.comparators;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.*;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInstance;
import eu.dnetlib.pace.AbstractPaceTest; import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.tree.*; import eu.dnetlib.pace.tree.*;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInstance;
import java.util.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
@TestInstance(TestInstance.Lifecycle.PER_CLASS) @TestInstance(TestInstance.Lifecycle.PER_CLASS)
public class ComparatorTest extends AbstractPaceTest { public class ComparatorTest extends AbstractPaceTest {
@ -26,7 +28,8 @@ public class ComparatorTest extends AbstractPaceTest {
params.put("name_th", "0.95"); params.put("name_th", "0.95");
params.put("jpath_value", "$.value"); params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid"); params.put("jpath_classid", "$.qualifier.classid");
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class)); conf = DedupConfig
.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
} }
@Test @Test
@ -39,32 +42,38 @@ public class ComparatorTest extends AbstractPaceTest {
public void cityMatchTest() { public void cityMatchTest() {
final CityMatch cityMatch = new CityMatch(params); final CityMatch cityMatch = new CityMatch(params);
//both names with no cities // both names with no cities
assertEquals(1.0, cityMatch.distance("Università", "Centro di ricerca", conf)); assertEquals(1.0, cityMatch.distance("Università", "Centro di ricerca", conf));
//one of the two names with no cities // one of the two names with no cities
assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf)); assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf));
//both names with cities (same) // both names with cities (same)
assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf)); assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf));
//both names with cities (different) // both names with cities (different)
assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf)); assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf));
assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf)); assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf));
//particular cases // particular cases
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf)); assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf)); assertEquals(
1.0,
cityMatch
.distance(
"Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology",
conf));
assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf)); assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
} }
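Read together, the expected values above imply a three-way contract for CityMatch: 1.0 when the two names resolve to the same city (or neither mentions one), 0.0 when they name different cities, and -1.0 when only one side carries a city and the comparison is undecidable. A rough sketch of that decision, with hypothetical inputs and the city extraction left out:

	// Sketch of the three-way outcome suggested by the assertions above; not the actual CityMatch code.
	static double cityAgreement(java.util.Set<String> cities1, java.util.Set<String> cities2) {
		if (cities1.isEmpty() && cities2.isEmpty())
			return 1.0; // neither name mentions a city: nothing to disagree on
		if (cities1.isEmpty() || cities2.isEmpty())
			return -1.0; // only one side mentions a city: undecidable
		return java.util.Collections.disjoint(cities1, cities2) ? 0.0 : 1.0;
	}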
@Test @Test
public void keywordMatchTest(){ public void keywordMatchTest() {
params.put("threshold", "0.5"); params.put("threshold", "0.5");
final KeywordMatch keywordMatch = new KeywordMatch(params); final KeywordMatch keywordMatch = new KeywordMatch(params);
assertEquals(0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf)); assertEquals(
0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf)); assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf)); assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf)); assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
@ -77,7 +86,7 @@ public class ComparatorTest extends AbstractPaceTest {
} }
@Test @Test
public void listContainsMatchTest(){ public void listContainsMatchTest() {
List<String> a = createFieldList(Arrays.asList("Article", "Publication", "ORP"), "instanceType"); List<String> a = createFieldList(Arrays.asList("Article", "Publication", "ORP"), "instanceType");
List<String> b = createFieldList(Arrays.asList("Publication", "Article", "ORP"), "instanceType"); List<String> b = createFieldList(Arrays.asList("Publication", "Article", "ORP"), "instanceType");
@ -100,7 +109,7 @@ public class ComparatorTest extends AbstractPaceTest {
} }
@Test @Test
public void stringContainsMatchTest(){ public void stringContainsMatchTest() {
params.put("string", "openorgs"); params.put("string", "openorgs");
params.put("bool", "XOR"); params.put("bool", "XOR");
@ -120,7 +129,7 @@ public class ComparatorTest extends AbstractPaceTest {
} }
@Test @Test
public void numbersMatchTest(){ public void numbersMatchTest() {
final NumbersMatch numbersMatch = new NumbersMatch(params); final NumbersMatch numbersMatch = new NumbersMatch(params);
assertEquals(0.0, numbersMatch.distance("University of Rennes 2", "Universita di Rennes 7", conf)); assertEquals(0.0, numbersMatch.distance("University of Rennes 2", "Universita di Rennes 7", conf));
@ -128,7 +137,7 @@ public class ComparatorTest extends AbstractPaceTest {
} }
@Test @Test
public void romansMatchTest(){ public void romansMatchTest() {
final RomansMatch romansMatch = new RomansMatch(params); final RomansMatch romansMatch = new RomansMatch(params);
@ -142,8 +151,9 @@ public class ComparatorTest extends AbstractPaceTest {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf); double result = jaroWinklerNormalizedName
System.out.println("result = " + result); .distance("AT&T (United States)", "United States Military Academy", conf);
System.out.println("result = " + result);
result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf); result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
@ -171,7 +181,11 @@ public class ComparatorTest extends AbstractPaceTest {
final LevensteinTitle levensteinTitle = new LevensteinTitle(params); final LevensteinTitle levensteinTitle = new LevensteinTitle(params);
double result = levensteinTitle.distance("Degradation of lignin βaryl ether units in Arabidopsis thaliana expressing LigD, LigF and LigG from Sphingomonas paucimobilis SYK6", "Degradation of lignin β-aryl ether units in <i>Arabidopsis thaliana</i> expressing <i>LigD</i>, <i>LigF</i> and <i>LigG</i> from <i>Sphingomonas paucimobilis</i> SYK-6", conf); double result = levensteinTitle
.distance(
"Degradation of lignin βaryl ether units in Arabidopsis thaliana expressing LigD, LigF and LigG from Sphingomonas paucimobilis SYK6",
"Degradation of lignin β-aryl ether units in <i>Arabidopsis thaliana</i> expressing <i>LigD</i>, <i>LigF</i> and <i>LigG</i> from <i>Sphingomonas paucimobilis</i> SYK-6",
conf);
System.out.println("result = " + result); System.out.println("result = " + result);
} }
@ -195,13 +209,16 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(1.0, result); assertEquals(1.0, result);
List<String> c = createFieldList(Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType"); List<String> c = createFieldList(
Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType");
result = instanceTypeMatch.compare(c, b, conf); result = instanceTypeMatch.compare(c, b, conf);
assertEquals(1.0, result); assertEquals(1.0, result);
List<String> d = createFieldList(Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType"); List<String> d = createFieldList(
List<String> e = createFieldList(Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType"); Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType");
List<String> e = createFieldList(
Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType");
result = instanceTypeMatch.compare(d, e, conf); result = instanceTypeMatch.compare(d, e, conf);
assertEquals(1.0, result); assertEquals(1.0, result);
@ -222,7 +239,8 @@ public class ComparatorTest extends AbstractPaceTest {
AuthorsMatch authorsMatch = new AuthorsMatch(params); AuthorsMatch authorsMatch = new AuthorsMatch(params);
List<String> a = createFieldList(Arrays.asList("La Bruzzo, Sandro", "Atzori, Claudio", "De Bonis, Michele"), "authors"); List<String> a = createFieldList(
Arrays.asList("La Bruzzo, Sandro", "Atzori, Claudio", "De Bonis, Michele"), "authors");
List<String> b = createFieldList(Arrays.asList("Atzori, C.", "La Bruzzo, S.", "De Bonis, M."), "authors"); List<String> b = createFieldList(Arrays.asList("Atzori, C.", "La Bruzzo, S.", "De Bonis, M."), "authors");
double result = authorsMatch.compare(a, b, conf); double result = authorsMatch.compare(a, b, conf);
@ -232,7 +250,7 @@ public class ComparatorTest extends AbstractPaceTest {
List<String> d = createFieldList(Arrays.asList("Manghi, Pasquale"), "authors"); List<String> d = createFieldList(Arrays.asList("Manghi, Pasquale"), "authors");
result = authorsMatch.compare(c, d, conf); result = authorsMatch.compare(c, d, conf);
assertEquals(0.0, result) ; assertEquals(0.0, result);
params.put("mode", "surname"); params.put("mode", "surname");
authorsMatch = new AuthorsMatch(params); authorsMatch = new AuthorsMatch(params);
@ -246,7 +264,7 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(0.25, result); assertEquals(0.25, result);
List<String> f = createFieldList(new ArrayList<>(), "authors"); List<String> f = createFieldList(new ArrayList<>(), "authors");
result = authorsMatch.compare(f,f, conf); result = authorsMatch.compare(f, f, conf);
System.out.println("result = " + result); System.out.println("result = " + result);
} }
@ -256,8 +274,19 @@ public class ComparatorTest extends AbstractPaceTest {
JsonListMatch jsonListMatch = new JsonListMatch(params); JsonListMatch jsonListMatch = new JsonListMatch(params);
List<String> a = createFieldList(Arrays.asList("{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"), "authors"); List<String> a = createFieldList(
List<String> b = createFieldList(Arrays.asList("{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"), "authors"); Arrays
.asList(
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"),
"authors");
List<String> b = createFieldList(
Arrays
.asList(
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}",
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}",
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}",
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"),
"authors");
double result = jsonListMatch.compare(a, b, conf); double result = jsonListMatch.compare(a, b, conf);
@ -287,13 +316,16 @@ public class ComparatorTest extends AbstractPaceTest {
CosineSimilarity cosineSimilarity = new CosineSimilarity(params); CosineSimilarity cosineSimilarity = new CosineSimilarity(params);
double[] a = new double[]{1,2,3}; double[] a = new double[] {
double[] b = new double[]{1,2,3}; 1, 2, 3
};
double[] b = new double[] {
1, 2, 3
};
double compare = cosineSimilarity.compare(a, b, conf); double compare = cosineSimilarity.compare(a, b, conf);
System.out.println("compare = " + compare); System.out.println("compare = " + compare);
} }
} }
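The last test feeds two identical vectors to CosineSimilarity and only prints the score; for equal non-zero vectors the value should be 1.0, since cosine similarity is the dot product divided by the product of the vector norms. A self-contained reference computation, independent of the comparator class and assuming equal-length arrays:

	// Reference cosine similarity; returns 0.0 when either vector has zero norm.
	static double cosine(double[] a, double[] b) {
		double dot = 0, normA = 0, normB = 0;
		for (int i = 0; i < a.length; i++) {
			dot += a[i] * b[i];
			normA += a[i] * a[i];
			normB += b[i] * b[i];
		}
		return (normA == 0 || normB == 0) ? 0.0 : dot / (Math.sqrt(normA) * Math.sqrt(normB));
	}
	// cosine(new double[] { 1, 2, 3 }, new double[] { 1, 2, 3 }) == 1.0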

View File

@ -1,17 +1,17 @@
package eu.dnetlib.pace.config; package eu.dnetlib.pace.config;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import java.util.HashMap;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.util.HashMap;
import java.util.Map;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.util.MapDocumentUtil;
public class ConfigTest extends AbstractPaceTest { public class ConfigTest extends AbstractPaceTest {
@ -56,7 +56,7 @@ public class ConfigTest extends AbstractPaceTest {
System.out.println("translationMap = " + translationMap.size()); System.out.println("translationMap = " + translationMap.size());
for (String key: translationMap.keySet()) { for (String key : translationMap.keySet()) {
if (translationMap.get(key).equals("key::1")) if (translationMap.get(key).equals("key::1"))
System.out.println("key = " + key); System.out.println("key = " + key);
} }
@ -70,13 +70,13 @@ public class ConfigTest extends AbstractPaceTest {
assertEquals(0, load.getPace().translationMap().keySet().size()); assertEquals(0, load.getPace().translationMap().keySet().size());
} }
@Test @Test
public void testJPath() { public void testJPath() {
final String json = readFromClasspath("organization.json"); final String json = readFromClasspath("organization.json");
final String jpath ="$.id"; final String jpath = "$.id";
System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json)); System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json));
} }
} }
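testJPath above resolves the JSONPath expression $.id against an organization record through MapDocumentUtil.getJPathString. A hedged usage sketch with a made-up input document, just to show the call shape:

	// The JSON below is invented for illustration; only the getJPathString call mirrors the test.
	String json = "{\"id\": \"20|example_org___::abc\", \"legalname\": \"Example University\"}";
	String id = MapDocumentUtil.getJPathString("$.id", json); // -> "20|example_org___::abc"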

View File

@ -1,40 +1,43 @@
package eu.dnetlib.pace.util; package eu.dnetlib.pace.util;
import eu.dnetlib.pace.model.Person; import static org.junit.jupiter.api.Assertions.assertEquals;
import jdk.nashorn.internal.ir.annotations.Ignore;
import org.junit.jupiter.api.*;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
import org.junit.jupiter.api.*;
import eu.dnetlib.pace.model.Person;
import jdk.nashorn.internal.ir.annotations.Ignore;
public class UtilTest { public class UtilTest {
static Map<String, String> params; static Map<String, String> params;
@BeforeAll @BeforeAll
public static void setUp(){ public static void setUp() {
params = new HashMap<>(); params = new HashMap<>();
} }
@Test @Test
@Ignore @Ignore
public void paceResolverTest() { public void paceResolverTest() {
PaceResolver paceResolver = new PaceResolver(); PaceResolver paceResolver = new PaceResolver();
paceResolver.getComparator("keywordMatch", params); paceResolver.getComparator("keywordMatch", params);
} }
@Test @Test
public void personTest() { public void personTest() {
Person p = new Person("j. f. kennedy", false); Person p = new Person("j. f. kennedy", false);
assertEquals("kennedy", p.getSurnameString()); assertEquals("kennedy", p.getSurnameString());
assertEquals("j f", p.getNameString()); assertEquals("j f", p.getNameString());
p = new Person("Guan-Hua Du", false); p = new Person("Guan-Hua Du", false);
System.out.println("surname = " + p.getSurnameString()); System.out.println("surname = " + p.getSurnameString());
System.out.println("name = " + p.getNameString()); System.out.println("name = " + p.getNameString());
} }
} }
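personTest documents the convention the first two assertions rely on: when the input has no comma, the last token is treated as the surname and the remaining tokens, lower-cased and with dots removed, as the given names. A toy re-implementation of just that branch, for illustration only:

	// Toy sketch of the no-comma branch implied by the assertions above; not the Person class itself.
	static String[] splitFullName(String fullName) {
		String[] tokens = fullName.toLowerCase().replace(".", "").trim().split("\\s+");
		String surname = tokens[tokens.length - 1];
		String name = String.join(" ", java.util.Arrays.copyOf(tokens, tokens.length - 1));
		return new String[] { name, surname };
	}
	// splitFullName("j. f. kennedy") -> { "j f", "kennedy" }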

View File

@ -1,16 +1,18 @@
package eu.dnetlib.dhp.broker.oa.util; package eu.dnetlib.dhp.broker.oa.util;
import com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.spark.sql.Row; import org.apache.spark.sql.Row;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.IOException; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.SparkDedupConfig;
import eu.dnetlib.pace.tree.support.TreeProcessor;
public class TrustUtils { public class TrustUtils {
@ -18,13 +20,18 @@ public class TrustUtils {
private static DedupConfig dedupConfig; private static DedupConfig dedupConfig;
private static SparkDedupConfig sparkDedupConfig;
private static final ObjectMapper mapper;
static { static {
final ObjectMapper mapper = new ObjectMapper(); mapper = new ObjectMapper();
try { try {
dedupConfig = mapper dedupConfig = mapper
.readValue( .readValue(
DedupConfig.class.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"), DedupConfig.class.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"),
DedupConfig.class); DedupConfig.class);
sparkDedupConfig = new SparkDedupConfig(dedupConfig, 1);
} catch (final IOException e) { } catch (final IOException e) {
log.error("Error loading dedupConfig", e); log.error("Error loading dedupConfig", e);
} }
@ -40,11 +47,8 @@ public class TrustUtils {
} }
try { try {
final ObjectMapper objectMapper = new ObjectMapper(); final Row doc1 = sparkDedupConfig.rowFromJson().apply(mapper.writeValueAsString(r1));
final Row doc1 = MapDocumentUtil final Row doc2 = sparkDedupConfig.rowFromJson().apply(mapper.writeValueAsString(r2));
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1));
final Row doc2 = MapDocumentUtil
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2));
final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2); final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
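With this refactor the comparison path no longer materializes MapDocument instances: each broker entity is serialized once with the shared ObjectMapper, turned into a Spark Row via SparkDedupConfig.rowFromJson(), and the two Rows are scored by TreeProcessor. A condensed sketch of that flow, using only the calls visible above (exception handling omitted):

	// Condensed view of the new scoring path; mirrors the calls shown in this diff, no additional API.
	double score(OaBrokerMainEntity r1, OaBrokerMainEntity r2) throws Exception {
		final Row doc1 = sparkDedupConfig.rowFromJson().apply(mapper.writeValueAsString(r1));
		final Row doc2 = sparkDedupConfig.rowFromJson().apply(mapper.writeValueAsString(r2));
		return new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
	}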

View File

@ -53,13 +53,17 @@
</dependencyManagement> </dependencyManagement>
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>io.opentelemetry</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>opentelemetry-api</artifactId> <artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>io.opentelemetry</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>opentelemetry-sdk</artifactId> <artifactId>dhp-pace-core</artifactId>
<version>${project.version}</version>
</dependency> </dependency>
<dependency> <dependency>
@ -83,31 +87,21 @@
<artifactId>spark-sql_2.11</artifactId> <artifactId>spark-sql_2.11</artifactId>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency> <dependency>
<groupId>com.arakelian</groupId> <groupId>com.arakelian</groupId>
<artifactId>java-jq</artifactId> <artifactId>java-jq</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>dom4j</groupId> <groupId>dom4j</groupId>
<artifactId>dom4j</artifactId> <artifactId>dom4j</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>jaxen</groupId> <groupId>jaxen</groupId>
<artifactId>jaxen</artifactId> <artifactId>jaxen</artifactId>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-pace-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency> <dependency>
<groupId>org.apache.spark</groupId> <groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId> <artifactId>spark-graphx_2.11</artifactId>
@ -141,12 +135,7 @@
<version>1.4.200</version> <version>1.4.200</version>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-catalyst_2.11</artifactId>
<version>2.4.0.cloudera2</version>
<scope>compile</scope>
</dependency>
</dependencies> </dependencies>

View File

@ -3,29 +3,20 @@ package eu.dnetlib.dhp.oa.dedup
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.dedup.dsl.{Clustering, Deduper} import eu.dnetlib.dhp.oa.dedup.dsl.{Clustering, Deduper}
import eu.dnetlib.dhp.oa.dedup.model.BlockStats import eu.dnetlib.dhp.oa.dedup.model.BlockStats
import eu.dnetlib.dhp.oa.dedup.model.SparkDedupConfig
import eu.dnetlib.dhp.schema.oaf.Relation
import eu.dnetlib.dhp.utils.ISLookupClientFactory import eu.dnetlib.dhp.utils.ISLookupClientFactory
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException import eu.dnetlib.enabling.is.lookup.rmi.{ISLookUpException, ISLookUpService}
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService import eu.dnetlib.pace.model.{RowDataOrderingComparator, SparkDedupConfig}
import eu.dnetlib.pace.config.DedupConfig
import eu.dnetlib.pace.model.RowDataOrderingComparator
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._ import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.DataTypes import org.apache.spark.sql.types.DataTypes
import org.dom4j.DocumentException import org.dom4j.DocumentException
import org.slf4j.Logger
import org.slf4j.LoggerFactory import org.slf4j.LoggerFactory
import org.xml.sax.SAXException import org.xml.sax.SAXException
import java.io.IOException import java.io.IOException
import java.util
import java.util.Optional
import java.util.stream.Collectors import java.util.stream.Collectors
import scala.collection.Seq
object DSLExample { object DSLExample {
private val log = LoggerFactory.getLogger(classOf[DSLExample]) private val log = LoggerFactory.getLogger(classOf[DSLExample])
@ -64,15 +55,15 @@ class DSLExample(parser: ArgumentApplicationParser, spark: SparkSession) extends
DSLExample.log.info("isLookUpUrl: '{}'", isLookUpUrl) DSLExample.log.info("isLookUpUrl: '{}'", isLookUpUrl)
DSLExample.log.info("actionSetId: '{}'", actionSetId) DSLExample.log.info("actionSetId: '{}'", actionSetId)
DSLExample.log.info("workingPath: '{}'", workingPath) DSLExample.log.info("workingPath: '{}'", workingPath)
// for each dedup configuration // for each dedup configuration
import scala.collection.JavaConversions._ import scala.collection.JavaConversions._
for (dedupConf <- getConfigurations(isLookUpService, actionSetId).subList(0, 1)) { for (dedupConf <- getConfigurations(isLookUpService, actionSetId).subList(0, 1)) {
val subEntity = dedupConf.getWf.getSubEntityValue val subEntity = dedupConf.getWf.getSubEntityValue
DSLExample.log.info("Creating blockstats for: '{}'", subEntity) DSLExample.log.info("Creating blockstats for: '{}'", subEntity)
val outputPath = DedupUtility.createBlockStatsPath(workingPath, actionSetId, subEntity) val outputPath = DedupUtility.createBlockStatsPath(workingPath, actionSetId, subEntity)
AbstractSparkAction.removeOutputDir(spark, outputPath) AbstractSparkAction.removeOutputDir(spark, outputPath)
val sc = JavaSparkContext.fromSparkContext(spark.sparkContext)
val sparkConfig = new SparkDedupConfig(dedupConf, numPartitions) val sparkConfig = SparkDedupConfig(dedupConf, numPartitions)
val inputDF = spark.read val inputDF = spark.read
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
@ -87,8 +78,7 @@ class DSLExample(parser: ArgumentApplicationParser, spark: SparkSession) extends
Clustering("suffixprefix", Seq("legalname"), Map("max" -> 1, "len" -> 3)), Clustering("suffixprefix", Seq("legalname"), Map("max" -> 1, "len" -> 3)),
Clustering("urlclustering", Seq("websiteurl")), Clustering("urlclustering", Seq("websiteurl")),
Clustering("keywordsclustering", Seq("fields"), Map("max" -> 2, "windowSize" -> 4)) Clustering("keywordsclustering", Seq("fields"), Map("max" -> 2, "windowSize" -> 4))
); )
simRels simRels
.map[BlockStats]( .map[BlockStats](

Some files were not shown because too many files have changed in this diff.