WIP: various refactors

This commit is contained in:
Claudio Atzori 2023-06-26 13:58:11 +02:00 committed by Sandro La Bruzzo
parent 4c2dfcbdf7
commit 649679de8d
108 changed files with 5650 additions and 5414 deletions

View File

@ -81,9 +81,12 @@
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-catalyst_2.11</artifactId>
<version>2.4.0.cloudera2</version>
<scope>compile</scope>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
</dependencies>

View File

@ -1,8 +1,5 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.lang3.StringUtils;
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.HashSet;
@ -10,32 +7,39 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
protected Map<String, Integer> params;
/**
 * Creates the clustering function with its configuration parameters.
 *
 * @param params named integer parameters; the map is stored by reference, no copy is made
 */
public AbstractClusteringFunction(final Map<String, Integer> params) {
this.params = params;
}
/**
 * Produces the clustering keys for a single field value.
 * In this class' own apply pipeline the value has already gone through normalize and
 * filterAllStopWords; subclasses that override apply may pass a differently prepared value.
 *
 * @param conf the configuration handed to apply
 * @param s the field value to derive keys from
 * @return the keys generated for the value
 */
protected abstract Collection<String> doApply(Config conf, String s);
@Override
public Collection<String> apply(Config conf, List<String> fields) {
return fields.stream().filter(f -> !f.isEmpty())
.map(this::normalize)
.map(s -> filterAllStopWords(s))
.map(s -> doApply(conf, s))
.map(c -> filterBlacklisted(c, ngramBlacklist))
.flatMap(c -> c.stream())
.filter(StringUtils::isNotBlank)
.collect(Collectors.toCollection(HashSet::new));
return fields
.stream()
.filter(f -> !f.isEmpty())
.map(this::normalize)
.map(s -> filterAllStopWords(s))
.map(s -> doApply(conf, s))
.map(c -> filterBlacklisted(c, ngramBlacklist))
.flatMap(c -> c.stream())
.filter(StringUtils::isNotBlank)
.collect(Collectors.toCollection(HashSet::new));
}
/**
 * @return the parameter map received by the constructor (the same instance, not a copy)
 */
public Map<String, Integer> getParams() {
return params;
}
/**
 * Looks up a single configuration parameter.
 *
 * @param name the parameter name
 * @return the configured value, or null when the map has no mapping for the name
 */
protected Integer param(String name) {
return params.get(name);
}

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
@ -6,6 +7,7 @@ import java.util.Set;
import java.util.StringTokenizer;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("acronyms")
@ -19,16 +21,16 @@ public class Acronyms extends AbstractClusteringFunction {
/**
 * Extracts acronyms from the value using the "max", "minLen" and "maxLen" parameters.
 * A missing parameter fails with a NullPointerException, because the Integer returned by
 * param is unboxed to int.
 */
protected Collection<String> doApply(Config conf, String s) {
	final int maxAcronyms = param("max");
	final int minLength = param("minLen");
	final int maxLength = param("maxLen");
	return extractAcronyms(s, maxAcronyms, minLength, maxLength);
}
private Set<String> extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) {
final Set<String> acronyms = Sets.newLinkedHashSet();
for (int i = 0; i < maxAcronyms; i++) {
final StringTokenizer st = new StringTokenizer(s);
final StringBuilder sb = new StringBuilder();
while (st.hasMoreTokens()) {
final String token = st.nextToken();
if (sb.length() > maxLen) {

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering;
import java.lang.annotation.ElementType;
@ -9,5 +10,5 @@ import java.lang.annotation.Target;
@Target(ElementType.TYPE)
public @interface ClusteringClass {
public String value();
}
public String value();
}

View File

@ -1,15 +1,16 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config;
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
/**
 * Contract of a clustering function: maps the values of a field to a collection of clustering keys.
 */
public interface ClusteringFunction {

	/**
	 * Computes the clustering keys for the given field values.
	 *
	 * @param config the configuration made available to the function
	 * @param fields the field values to process
	 * @return the generated keys
	 */
	Collection<String> apply(Config config, List<String> fields);

	/**
	 * @return the named integer parameters of this function
	 */
	Map<String, Integer> getParams();
}

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
@ -5,6 +6,7 @@ import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("immutablefieldvalue")

View File

@ -1,50 +1,54 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.lang3.StringUtils;
package eu.dnetlib.pace.clustering;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("keywordsclustering")
public class KeywordsClustering extends AbstractClusteringFunction {
/**
 * Passes the parameter map to the superclass constructor.
 *
 * @param params named integer parameters of the clustering function
 */
public KeywordsClustering(Map<String, Integer> params) {
super(params);
}
/**
 * Passes the parameter map to the superclass constructor.
 *
 * @param params named integer parameters of the clustering function
 */
public KeywordsClustering(Map<String, Integer> params) {
super(params);
}
@Override
protected Collection<String> doApply(final Config conf, String s) {
@Override
protected Collection<String> doApply(final Config conf, String s) {
//takes city codes and keywords codes without duplicates
Set<String> keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4));
// takes city codes and keywords codes without duplicates
Set<String> keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4));
//list of combination to return as result
final Collection<String> combinations = new LinkedHashSet<String>();
// list of combination to return as result
final Collection<String> combinations = new LinkedHashSet<String>();
for (String keyword: keywordsToCodes(keywords, conf.translationMap())){
for (String city: citiesToCodes(cities)) {
combinations.add(keyword+"-"+city);
if (combinations.size()>=params.getOrDefault("max", 2)) {
return combinations;
}
}
}
for (String keyword : keywordsToCodes(keywords, conf.translationMap())) {
for (String city : citiesToCodes(cities)) {
combinations.add(keyword + "-" + city);
if (combinations.size() >= params.getOrDefault("max", 2)) {
return combinations;
}
}
}
return combinations;
}
return combinations;
}
/**
 * Builds the set of clustering keys for the given field values.
 * Each non-empty value goes through cleanup, normalize and filterAllStopWords before doApply;
 * the produced keys are filtered against the ngram blacklist and blank keys are discarded.
 *
 * @param conf the configuration forwarded to doApply
 * @param fields the field values to process
 * @return a HashSet with the distinct, non-blank keys
 */
@Override
public Collection<String> apply(final Config conf, List<String> fields) {
	final Collection<String> keys = new HashSet<>();
	for (final String field : fields) {
		if (field.isEmpty()) {
			continue;
		}
		final String prepared = filterAllStopWords(normalize(cleanup(field)));
		for (final String key : filterBlacklisted(doApply(conf, prepared), ngramBlacklist)) {
			if (StringUtils.isNotBlank(key)) {
				keys.add(key);
			}
		}
	}
	return keys;
}
}
/**
 * Builds the set of clustering keys for the given field values.
 * Each non-empty value goes through cleanup, normalize and filterAllStopWords before doApply;
 * the produced keys are filtered against the ngram blacklist and blank keys are discarded.
 *
 * @param conf the configuration forwarded to doApply
 * @param fields the field values to process
 * @return a HashSet with the distinct, non-blank keys
 */
@Override
public Collection<String> apply(final Config conf, List<String> fields) {
	final Collection<String> keys = new HashSet<>();
	for (final String field : fields) {
		if (field.isEmpty()) {
			continue;
		}
		final String prepared = filterAllStopWords(normalize(cleanup(field)));
		for (final String key : filterBlacklisted(doApply(conf, prepared), ngramBlacklist)) {
			if (StringUtils.isNotBlank(key)) {
				keys.add(key);
			}
		}
	}
	return keys;
}
}

View File

@ -1,75 +1,79 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.lang3.StringUtils;
package eu.dnetlib.pace.clustering;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
@ClusteringClass("lnfi")
public class LastNameFirstInitial extends AbstractClusteringFunction{
public class LastNameFirstInitial extends AbstractClusteringFunction {
private boolean DEFAULT_AGGRESSIVE = true;
private boolean DEFAULT_AGGRESSIVE = true;
/**
 * Passes the parameter map to the superclass constructor.
 *
 * @param params named integer parameters of the clustering function
 */
public LastNameFirstInitial(final Map<String, Integer> params) {
super(params);
}
/**
 * Passes the parameter map to the superclass constructor.
 *
 * @param params named integer parameters of the clustering function
 */
public LastNameFirstInitial(final Map<String, Integer> params) {
super(params);
}
/**
 * Builds the set of clustering keys for the given field values.
 * Each non-empty value is normalized and handed to doApply (no stop-word filtering here);
 * the produced keys are filtered against the ngram blacklist and blank keys are discarded.
 *
 * @param conf the configuration forwarded to doApply
 * @param fields the field values to process
 * @return a HashSet with the distinct, non-blank keys
 */
@Override
public Collection<String> apply(Config conf, List<String> fields) {
	final Collection<String> keys = new HashSet<>();
	for (final String field : fields) {
		if (field.isEmpty()) {
			continue;
		}
		final String normalized = normalize(field);
		for (final String key : filterBlacklisted(doApply(conf, normalized), ngramBlacklist)) {
			if (StringUtils.isNotBlank(key)) {
				keys.add(key);
			}
		}
	}
	return keys;
}
/**
 * Builds the set of clustering keys for the given field values.
 * Each non-empty value is normalized and handed to doApply (no stop-word filtering here);
 * the produced keys are filtered against the ngram blacklist and blank keys are discarded.
 *
 * @param conf the configuration forwarded to doApply
 * @param fields the field values to process
 * @return a HashSet with the distinct, non-blank keys
 */
@Override
public Collection<String> apply(Config conf, List<String> fields) {
	final Collection<String> keys = new HashSet<>();
	for (final String field : fields) {
		if (field.isEmpty()) {
			continue;
		}
		final String normalized = normalize(field);
		for (final String key : filterBlacklisted(doApply(conf, normalized), ngramBlacklist)) {
			if (StringUtils.isNotBlank(key)) {
				keys.add(key);
			}
		}
	}
	return keys;
}
/**
 * Normalizes a name: applies unicodeNormalization, nfd, transliterate and fixAliases, then runs
 * a fixed sequence of regex replacements and trims the result.
 *
 * @param s the raw value
 * @return the normalized value
 */
@Override
protected String normalize(final String s) {
	final String base = fixAliases(transliterate(nfd(unicodeNormalization(s))));
	// do not compact the regexes in a single expression, would cause StackOverflowError in case of large
	// input strings
	final String wordCharsOnly = base.replaceAll("[^ \\w]+", "");
	final String noDiacritics = wordCharsOnly.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "");
	final String noPunctuation = noDiacritics.replaceAll("(\\p{Punct})+", " ");
	final String noDigits = noPunctuation.replaceAll("(\\d)+", " ");
	final String singleLine = noDigits.replaceAll("(\\n)+", " ");
	return singleLine.trim();
}
/**
 * Normalizes a name: applies unicodeNormalization, nfd, transliterate and fixAliases, then runs
 * a fixed sequence of regex replacements and trims the result.
 *
 * @param s the raw value
 * @return the normalized value
 */
@Override
protected String normalize(final String s) {
	final String base = fixAliases(transliterate(nfd(unicodeNormalization(s))));
	// do not compact the regexes in a single expression, would cause StackOverflowError in case of large
	// input strings
	final String wordCharsOnly = base.replaceAll("[^ \\w]+", "");
	final String noDiacritics = wordCharsOnly.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "");
	final String noPunctuation = noDiacritics.replaceAll("(\\p{Punct})+", " ");
	final String noDigits = noPunctuation.replaceAll("(\\d)+", " ");
	final String singleLine = noDigits.replaceAll("(\\n)+", " ");
	return singleLine.trim();
}
@Override
protected Collection<String> doApply(final Config conf, final String s) {
@Override
protected Collection<String> doApply(final Config conf, final String s) {
final List<String> res = Lists.newArrayList();
final List<String> res = Lists.newArrayList();
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive")
: DEFAULT_AGGRESSIVE);
Person p = new Person(s, aggressive);
Person p = new Person(s, aggressive);
if (p.isAccurate()) {
String lastName = p.getNormalisedSurname().toLowerCase();
String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1);
if (p.isAccurate()) {
String lastName = p.getNormalisedSurname().toLowerCase();
String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0, 1);
res.add(firstInitial.concat(lastName));
}
else { // is not accurate, meaning it has no defined name and surname
List<String> fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
if (fullname.size() == 1) {
res.add(p.getNormalisedFullname().toLowerCase());
}
else if (fullname.size() == 2) {
res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase());
res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase());
}
else {
res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase());
res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase());
}
}
res.add(firstInitial.concat(lastName));
} else { // is not accurate, meaning it has no defined name and surname
List<String> fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
if (fullname.size() == 1) {
res.add(p.getNormalisedFullname().toLowerCase());
} else if (fullname.size() == 2) {
res.add(fullname.get(0).substring(0, 1).concat(fullname.get(1)).toLowerCase());
res.add(fullname.get(1).substring(0, 1).concat(fullname.get(0)).toLowerCase());
} else {
res.add(fullname.get(0).substring(0, 1).concat(fullname.get(fullname.size() - 1)).toLowerCase());
res.add(fullname.get(fullname.size() - 1).substring(0, 1).concat(fullname.get(0)).toLowerCase());
}
}
return res;
}
}
return res;
}
}

View File

@ -1,14 +1,17 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.lang3.StringUtils;
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("lowercase")
public class LowercaseClustering extends AbstractClusteringFunction {
@ -19,7 +22,7 @@ public class LowercaseClustering extends AbstractClusteringFunction {
@Override
public Collection<String> apply(Config conf, List<String> fields) {
Collection<String> c = Sets.newLinkedHashSet();
for(String f : fields) {
for (String f : fields) {
c.addAll(doApply(conf, f));
}
return c;
@ -27,7 +30,7 @@ public class LowercaseClustering extends AbstractClusteringFunction {
@Override
protected Collection<String> doApply(final Config conf, final String s) {
if(StringUtils.isBlank(s)) {
if (StringUtils.isBlank(s)) {
return Lists.newArrayList();
}
return Lists.newArrayList(s.toLowerCase().trim());

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering;
import java.util.Set;
@ -11,7 +12,8 @@ public class NGramUtils extends AbstractPaceFunctions {
private static final int SIZE = 100;
private static final Set<String> stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
private static final Set<String> stopwords = AbstractPaceFunctions
.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
public static String cleanupForOrdering(String s) {
String result = NGRAMUTILS.filterStopWords(NGRAMUTILS.normalize(s), stopwords);

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
@ -6,6 +7,7 @@ import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("ngrampairs")
@ -32,7 +34,7 @@ public class NgramPairs extends Ngrams {
break;
}
res.add(ngrams.get(i) + ngrams.get(j));
//System.out.println("-- " + concatNgrams);
// System.out.println("-- " + concatNgrams);
}
return res;
}

View File

@ -1,9 +1,10 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config;
import java.util.*;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("ngrams")
public class Ngrams extends AbstractClusteringFunction {
@ -44,7 +45,7 @@ public class Ngrams extends AbstractClusteringFunction {
}
}
}
//System.out.println(ngrams + " n: " + ngrams.size());
// System.out.println(ngrams + " n: " + ngrams.size());
return ngrams;
}

View File

@ -1,16 +1,19 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.lang3.StringUtils;
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
@ClusteringClass("personClustering")
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
@ -30,7 +33,8 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
final Person person = new Person(f, false);
if (StringUtils.isNotBlank(person.getNormalisedFirstName()) && StringUtils.isNotBlank(person.getNormalisedSurname())) {
if (StringUtils.isNotBlank(person.getNormalisedFirstName())
&& StringUtils.isNotBlank(person.getNormalisedSurname())) {
hashes.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase());
} else {
for (final String token1 : tokens(f, MAX_TOKENS)) {

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
@ -22,7 +23,8 @@ public class PersonHash extends AbstractClusteringFunction {
protected Collection<String> doApply(final Config conf, final String s) {
final List<String> res = Lists.newArrayList();
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive")
: DEFAULT_AGGRESSIVE);
res.add(new Person(s, aggressive).hash());

View File

@ -1,10 +1,11 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config;
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
public class RandomClusteringFunction extends AbstractClusteringFunction {
public RandomClusteringFunction(Map<String, Integer> params) {

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering;
import java.util.*;
@ -5,6 +6,7 @@ import java.util.*;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("sortedngrampairs")

View File

@ -1,15 +1,17 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("spacetrimmingfieldvalue")
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
@ -21,7 +23,10 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
protected Collection<String> doApply(final Config conf, final String s) {
final List<String> res = Lists.newArrayList();
res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));
res
.add(
StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength"))
: s.toLowerCase().replaceAll("\\s+", ""));
return res;
}

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
@ -5,6 +6,7 @@ import java.util.Map;
import java.util.Set;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("suffixprefix")
@ -18,7 +20,7 @@ public class SuffixPrefix extends AbstractClusteringFunction {
/**
 * Generates suffix/prefix keys for the value using the "len" and "max" parameters.
 * A missing parameter fails with a NullPointerException, because the Integer returned by
 * param is unboxed to int.
 */
protected Collection<String> doApply(Config conf, String s) {
	final int length = param("len");
	final int maxKeys = param("max");
	return suffixPrefix(s, length, maxKeys);
}
private Collection<String> suffixPrefix(String s, int len, int max) {
final Set<String> bigrams = Sets.newLinkedHashSet();
int i = 0;

View File

@ -1,7 +1,5 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
package eu.dnetlib.pace.clustering;
import java.net.MalformedURLException;
import java.net.URL;
@ -11,42 +9,44 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("urlclustering")
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
protected Map<String, Integer> params;
protected Map<String, Integer> params;
/**
 * Creates the clustering function with its configuration parameters.
 *
 * @param params named integer parameters; the map is stored by reference, no copy is made
 */
public UrlClustering(final Map<String, Integer> params) {
this.params = params;
}
/**
 * Creates the clustering function with its configuration parameters.
 *
 * @param params named integer parameters; the map is stored by reference, no copy is made
 */
public UrlClustering(final Map<String, Integer> params) {
this.params = params;
}
/**
 * Uses the host part of each non-empty URL value as clustering key.
 * If any value cannot be parsed as a URL (asUrl raises IllegalStateException), the whole
 * result is discarded and an empty set is returned.
 *
 * @param conf not used by this implementation
 * @param fields the URL values to process
 * @return a HashSet with the distinct hosts, or an empty set on a malformed value
 */
@Override
public Collection<String> apply(final Config conf, List<String> fields) {
	try {
		final Collection<String> hosts = new HashSet<>();
		for (final String field : fields) {
			if (!field.isEmpty()) {
				hosts.add(asUrl(field).getHost());
			}
		}
		return hosts;
	} catch (IllegalStateException e) {
		return new HashSet<>();
	}
}
/**
 * Uses the host part of each non-empty URL value as clustering key.
 * If any value cannot be parsed as a URL (asUrl raises IllegalStateException), the whole
 * result is discarded and an empty set is returned.
 *
 * @param conf not used by this implementation
 * @param fields the URL values to process
 * @return a HashSet with the distinct hosts, or an empty set on a malformed value
 */
@Override
public Collection<String> apply(final Config conf, List<String> fields) {
	try {
		final Collection<String> hosts = new HashSet<>();
		for (final String field : fields) {
			if (!field.isEmpty()) {
				hosts.add(asUrl(field).getHost());
			}
		}
		return hosts;
	} catch (IllegalStateException e) {
		return new HashSet<>();
	}
}
/**
 * Returns the parameters this function was configured with.
 * Previously this accessor returned null even though the constructor stores the map.
 *
 * @return the parameter map received by the constructor (the same instance, not a copy)
 */
@Override
public Map<String, Integer> getParams() {
	return params;
}
/**
 * Parses the value as a URL.
 *
 * @param value the textual URL
 * @return the parsed URL
 * @throws IllegalStateException when the value is not a well-formed URL; the original
 *         MalformedURLException is attached as cause
 */
private URL asUrl(String value) {
	try {
		return new URL(value);
	} catch (MalformedURLException e) {
		// should not happen as checked by pace typing; keep the cause for diagnosis
		throw new IllegalStateException("invalid URL: " + value, e);
	}
}
/**
 * Returns the parameters this function was configured with.
 * Previously this accessor returned null even though the constructor stores the map.
 *
 * @return the parameter map received by the constructor (the same instance, not a copy)
 */
@Override
public Map<String, Integer> getParams() {
	return params;
}
/**
 * Parses the value as a URL.
 *
 * @param value the textual URL
 * @return the parsed URL
 * @throws IllegalStateException when the value is not a well-formed URL; the original
 *         MalformedURLException is attached as cause
 */
private URL asUrl(String value) {
	try {
		return new URL(value);
	} catch (MalformedURLException e) {
		// should not happen as checked by pace typing; keep the cause for diagnosis
		throw new IllegalStateException("invalid URL: " + value, e);
	}
}
}

View File

@ -1,90 +1,91 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
package eu.dnetlib.pace.clustering;
import java.util.*;
import java.util.stream.Collectors;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("wordsStatsSuffixPrefixChain")
public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
/**
 * Passes the parameter map to the superclass constructor.
 *
 * @param params named integer parameters of the clustering function
 */
public WordsStatsSuffixPrefixChain(Map<String, Integer> params) {
super(params);
}
/**
 * Passes the parameter map to the superclass constructor.
 *
 * @param params named integer parameters of the clustering function
 */
public WordsStatsSuffixPrefixChain(Map<String, Integer> params) {
super(params);
}
/**
 * Generates the suffix/prefix chain keys for the value using the "mod" parameter.
 * A missing "mod" fails with a NullPointerException, because the Integer returned by
 * param is unboxed to int.
 */
@Override
protected Collection<String> doApply(Config conf, String s) {
	final int modulus = param("mod");
	return suffixPrefixChain(s, modulus);
}
/**
 * Generates the suffix/prefix chain keys for the value using the "mod" parameter.
 * A missing "mod" fails with a NullPointerException, because the Integer returned by
 * param is unboxed to int.
 */
@Override
protected Collection<String> doApply(Config conf, String s) {
	final int modulus = param("mod");
	return suffixPrefixChain(s, modulus);
}
private Collection<String> suffixPrefixChain(String s, int mod) {
private Collection<String> suffixPrefixChain(String s, int mod) {
//create the list of words from the string (remove short words)
List<String> wordsList =
Arrays.stream(s.split(" "))
.filter(si -> si.length() > 3)
.collect(Collectors.toList());
// create the list of words from the string (remove short words)
List<String> wordsList = Arrays
.stream(s.split(" "))
.filter(si -> si.length() > 3)
.collect(Collectors.toList());
final int words = wordsList.size();
final int letters = s.length();
final int words = wordsList.size();
final int letters = s.length();
//create the prefix: number of words + number of letters/mod
String prefix = words + "-" + letters/mod + "-";
// create the prefix: number of words + number of letters/mod
String prefix = words + "-" + letters / mod + "-";
return doSuffixPrefixChain(wordsList, prefix);
return doSuffixPrefixChain(wordsList, prefix);
}
}
private Collection<String> doSuffixPrefixChain(List<String> wordsList, String prefix) {
private Collection<String> doSuffixPrefixChain(List<String> wordsList, String prefix) {
Set<String> set = Sets.newLinkedHashSet();
switch(wordsList.size()){
case 0:
case 1:
break;
case 2:
set.add(
prefix +
suffix(wordsList.get(0), 3) +
prefix(wordsList.get(1), 3)
);
Set<String> set = Sets.newLinkedHashSet();
switch (wordsList.size()) {
case 0:
case 1:
break;
case 2:
set
.add(
prefix +
suffix(wordsList.get(0), 3) +
prefix(wordsList.get(1), 3));
set.add(
prefix +
prefix(wordsList.get(0), 3) +
suffix(wordsList.get(1), 3)
);
set
.add(
prefix +
prefix(wordsList.get(0), 3) +
suffix(wordsList.get(1), 3));
break;
default:
set.add(
prefix +
suffix(wordsList.get(0), 3) +
prefix(wordsList.get(1), 3) +
suffix(wordsList.get(2), 3)
);
break;
default:
set
.add(
prefix +
suffix(wordsList.get(0), 3) +
prefix(wordsList.get(1), 3) +
suffix(wordsList.get(2), 3));
set.add(
prefix +
prefix(wordsList.get(0), 3) +
suffix(wordsList.get(1), 3) +
prefix(wordsList.get(2), 3)
);
break;
}
set
.add(
prefix +
prefix(wordsList.get(0), 3) +
suffix(wordsList.get(1), 3) +
prefix(wordsList.get(2), 3));
break;
}
return set;
return set;
}
}
/**
 * Returns the last len characters of s.
 * Fails with StringIndexOutOfBoundsException when len exceeds the length of s.
 */
private String suffix(String s, int len) {
	final int start = s.length() - len;
	return s.substring(start);
}
/**
 * Returns the last len characters of s.
 * Fails with StringIndexOutOfBoundsException when len exceeds the length of s.
 */
private String suffix(String s, int len) {
	final int start = s.length() - len;
	return s.substring(start);
}
/**
 * Returns the first len characters of s.
 * Fails with StringIndexOutOfBoundsException when len exceeds the length of s.
 */
private String prefix(String s, int len) {
	final String head = s.substring(0, len);
	return head;
}
/**
 * Returns the first len characters of s.
 * Fails with StringIndexOutOfBoundsException when len exceeds the length of s.
 */
private String prefix(String s, int len) {
	final String head = s.substring(0, len);
	return head;
}
}

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
@ -5,53 +6,54 @@ import java.util.Map;
import java.util.Set;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("wordssuffixprefix")
public class WordsSuffixPrefix extends AbstractClusteringFunction {
/**
 * Passes the parameter map to the superclass constructor.
 *
 * @param params named integer parameters of the clustering function
 */
public WordsSuffixPrefix(Map<String, Integer> params) {
super(params);
}
/**
 * Passes the parameter map to the superclass constructor.
 *
 * @param params named integer parameters of the clustering function
 */
public WordsSuffixPrefix(Map<String, Integer> params) {
super(params);
}
/**
 * Generates suffix/prefix keys for the value using the "len" and "max" parameters.
 * A missing parameter fails with a NullPointerException, because the Integer returned by
 * param is unboxed to int.
 */
@Override
protected Collection<String> doApply(Config conf, String s) {
	final int length = param("len");
	final int maxKeys = param("max");
	return suffixPrefix(s, length, maxKeys);
}
/**
 * Generates suffix/prefix keys for the value using the "len" and "max" parameters.
 * A missing parameter fails with a NullPointerException, because the Integer returned by
 * param is unboxed to int.
 */
@Override
protected Collection<String> doApply(Config conf, String s) {
	final int length = param("len");
	final int maxKeys = param("max");
	return suffixPrefix(s, length, maxKeys);
}
private Collection<String> suffixPrefix(String s, int len, int max) {
private Collection<String> suffixPrefix(String s, int len, int max) {
final int words = s.split(" ").length;
final int words = s.split(" ").length;
// adjust the token length according to the number of words
switch (words) {
case 1:
return Sets.newLinkedHashSet();
case 2:
return doSuffixPrefix(s, len+2, max, words);
case 3:
return doSuffixPrefix(s, len+1, max, words);
default:
return doSuffixPrefix(s, len, max, words);
}
}
// adjust the token length according to the number of words
switch (words) {
case 1:
return Sets.newLinkedHashSet();
case 2:
return doSuffixPrefix(s, len + 2, max, words);
case 3:
return doSuffixPrefix(s, len + 1, max, words);
default:
return doSuffixPrefix(s, len, max, words);
}
}
private Collection<String> doSuffixPrefix(String s, int len, int max, int words) {
final Set<String> bigrams = Sets.newLinkedHashSet();
int i = 0;
while (++i < s.length() && bigrams.size() < max) {
int j = s.indexOf(" ", i);
private Collection<String> doSuffixPrefix(String s, int len, int max, int words) {
final Set<String> bigrams = Sets.newLinkedHashSet();
int i = 0;
while (++i < s.length() && bigrams.size() < max) {
int j = s.indexOf(" ", i);
int offset = j + len + 1 < s.length() ? j + len + 1 : s.length();
int offset = j + len + 1 < s.length() ? j + len + 1 : s.length();
if (j - len > 0) {
String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
if (bigram.length() >= 4) {
bigrams.add(words+bigram);
}
}
}
return bigrams;
}
if (j - len > 0) {
String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
if (bigram.length() >= 4) {
bigrams.add(words + bigram);
}
}
}
return bigrams;
}
}
}

View File

@ -1,14 +1,5 @@
package eu.dnetlib.pace.common;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.clustering.NGramUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
package eu.dnetlib.pace.common;
import java.io.IOException;
import java.io.StringWriter;
@ -19,6 +10,18 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.clustering.NGramUtils;
/**
* Set of common functions for the framework
*
@ -26,321 +29,325 @@ import java.util.stream.Collectors;
*/
public abstract class AbstractPaceFunctions {
//city map to be used when translating the city names into codes
private static Map<String, String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
// city map to be used when translating the city names into codes
private static Map<String, String> cityMap = AbstractPaceFunctions
.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
//list of stopwords in different languages
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
protected static Set<String> stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
// list of stopwords in different languages
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
protected static Set<String> stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
//transliterator
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
// transliterator
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
//blacklist of ngrams: to avoid generic keys
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
// blacklist of ngrams: to avoid generic keys
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
//html regex for normalization
public final String HTML_REGEX = "<[^>]*>";
// html regex for normalization
public final String HTML_REGEX = "<[^>]*>";
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
//doi prefix for normalization
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
// doi prefix for normalization
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
/**
 * Joins the list elements with a single space, skipping null elements.
 *
 * @param l the strings to join
 * @return the space-separated concatenation
 */
protected String concat(final List<String> l) {
	final Joiner spaceJoiner = Joiner.on(" ").skipNulls();
	return spaceJoiner.join(l);
}
/**
 * Joins the list elements with a single space, skipping null elements.
 *
 * @param l the strings to join
 * @return the space-separated concatenation
 */
protected String concat(final List<String> l) {
	final Joiner spaceJoiner = Joiner.on(" ").skipNulls();
	return spaceJoiner.join(l);
}
protected String cleanup(final String s) {
protected String cleanup(final String s) {
final String s1 = s.replaceAll(HTML_REGEX, "");
final String s2 = unicodeNormalization(s1.toLowerCase());
final String s3 = nfd(s2);
final String s4 = fixXML(s3);
final String s5 = s4.replaceAll("([0-9]+)", " $1 ");
final String s6 = transliterate(s5);
final String s7 = fixAliases(s6);
final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
final String s10 = s9.replaceAll("\\n", " ");
final String s11 = s10.replaceAll("(?m)\\s+", " ");
final String s12 = s11.trim();
return s12;
}
final String s1 = s.replaceAll(HTML_REGEX, "");
final String s2 = unicodeNormalization(s1.toLowerCase());
final String s3 = nfd(s2);
final String s4 = fixXML(s3);
final String s5 = s4.replaceAll("([0-9]+)", " $1 ");
final String s6 = transliterate(s5);
final String s7 = fixAliases(s6);
final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
final String s10 = s9.replaceAll("\\n", " ");
final String s11 = s10.replaceAll("(?m)\\s+", " ");
final String s12 = s11.trim();
return s12;
}
protected String fixXML(final String a){
protected String fixXML(final String a) {
return a.replaceAll("&ndash;", " ")
.replaceAll("&amp;", " ")
.replaceAll("&quot;", " ")
.replaceAll("&minus;", " ");
}
return a
.replaceAll("&ndash;", " ")
.replaceAll("&amp;", " ")
.replaceAll("&quot;", " ")
.replaceAll("&minus;", " ");
}
protected boolean checkNumbers(final String a, final String b) {
final String numbersA = getNumbers(a);
final String numbersB = getNumbers(b);
final String romansA = getRomans(a);
final String romansB = getRomans(b);
return !numbersA.equals(numbersB) || !romansA.equals(romansB);
}
/**
 * Tells whether two strings differ in their numeric content.
 *
 * @param a first string
 * @param b second string
 * @return {@code true} when the arabic numbers or the roman numerals extracted from {@code a} differ from
 *         those extracted from {@code b}; {@code false} when both extractions are equal
 */
protected boolean checkNumbers(final String a, final String b) {
	final String arabicLeft = getNumbers(a);
	final String arabicRight = getNumbers(b);
	final String romanLeft = getRomans(a);
	final String romanRight = getRomans(b);
	final boolean sameNumbers = arabicLeft.equals(arabicRight) && romanLeft.equals(romanRight);
	return !sameNumbers;
}
protected String getRomans(final String s) {
final StringBuilder sb = new StringBuilder();
for (final String t : s.split(" ")) {
sb.append(isRoman(t) ? t : "");
}
return sb.toString();
}
/**
 * Concatenates, without any separator, the space-delimited tokens of {@code s} that {@link #isRoman(String)}
 * accepts.
 *
 * @param s the input string, split on single spaces
 * @return the accepted tokens joined in their original order; empty when none is accepted
 */
protected String getRomans(final String s) {
	final StringBuilder romans = new StringBuilder();
	for (final String token : s.split(" ")) {
		if (isRoman(token)) {
			romans.append(token);
		}
	}
	return romans.toString();
}
/**
 * Tells whether the given token is an upper-case roman numeral (thousands, hundreds, tens and units groups,
 * up to {@code MMMM}).
 *
 * <p>The whole token must match. The empty string is accepted because every group of the pattern is
 * optional; callers only append accepted tokens, so an empty token contributes nothing.</p>
 *
 * <p>The previous implementation replaced the match with a sentinel word and compared the result with that
 * sentinel, which wrongly accepted a token equal to the sentinel itself.</p>
 *
 * @param s the token to test, must not be {@code null}
 * @return {@code true} when {@code s} is entirely a roman numeral
 */
protected boolean isRoman(final String s) {
	return s.matches("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$");
}
/**
 * Tells whether the given token is an upper-case roman numeral (thousands, hundreds, tens and units groups,
 * up to {@code MMMM}).
 *
 * <p>The whole token must match. The empty string is accepted because every group of the pattern is
 * optional; callers only append accepted tokens, so an empty token contributes nothing.</p>
 *
 * <p>The previous implementation replaced the match with a sentinel word and compared the result with that
 * sentinel, which wrongly accepted a token equal to the sentinel itself.</p>
 *
 * @param s the token to test, must not be {@code null}
 * @return {@code true} when {@code s} is entirely a roman numeral
 */
protected boolean isRoman(final String s) {
	return s.matches("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$");
}
protected String getNumbers(final String s) {
final StringBuilder sb = new StringBuilder();
for (final String t : s.split(" ")) {
sb.append(isNumber(t) ? t : "");
}
return sb.toString();
}
/**
 * Concatenates, without any separator, the space-delimited tokens of {@code s} that
 * {@link #isNumber(String)} accepts.
 *
 * @param s the input string, split on single spaces
 * @return the accepted tokens joined in their original order; empty when none is accepted
 */
protected String getNumbers(final String s) {
	final StringBuilder digits = new StringBuilder();
	for (final String token : s.split(" ")) {
		if (isNumber(token)) {
			digits.append(token);
		}
	}
	return digits.toString();
}
public boolean isNumber(String strNum) {
if (strNum == null) {
return false;
}
return numberPattern.matcher(strNum).matches();
}
/**
 * Tells whether the given string is entirely matched by {@code numberPattern}.
 *
 * @param strNum the candidate string, may be {@code null}
 * @return {@code false} for {@code null}, otherwise the result of the full pattern match
 */
public boolean isNumber(String strNum) {
	return strNum != null && numberPattern.matcher(strNum).matches();
}
protected static String fixAliases(final String s) {
final StringBuilder sb = new StringBuilder();
for (final char ch : Lists.charactersOf(s)) {
final int i = StringUtils.indexOf(aliases_from, ch);
sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
}
return sb.toString();
}
/**
 * Replaces every character listed in {@code aliases_from} with the character at the same position in
 * {@code aliases_to}; all other characters are copied unchanged.
 *
 * @param s the input string, must not be {@code null}
 * @return the string with the aliased characters substituted
 */
protected static String fixAliases(final String s) {
	final StringBuilder out = new StringBuilder();
	for (final char c : s.toCharArray()) {
		final int pos = aliases_from.indexOf(c);
		out.append(pos < 0 ? c : aliases_to.charAt(pos));
	}
	return out.toString();
}
protected static String transliterate(final String s) {
try {
return transliterator.transliterate(s);
}
catch(Exception e) {
return s;
}
}
protected static String transliterate(final String s) {
try {
return transliterator.transliterate(s);
} catch (Exception e) {
return s;
}
}
protected String removeSymbols(final String s) {
final StringBuilder sb = new StringBuilder();
protected String removeSymbols(final String s) {
final StringBuilder sb = new StringBuilder();
for (final char ch : Lists.charactersOf(s)) {
sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
}
return sb.toString().replaceAll("\\s+", " ");
}
protected boolean notNull(final String s) {
return s != null;
}
for (final char ch : Lists.charactersOf(s)) {
sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
}
return sb.toString().replaceAll("\\s+", " ");
}
protected String normalize(final String s) {
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
.replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
protected boolean notNull(final String s) {
return s != null;
}
public String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
/**
 * Normalizes a string for comparison.
 *
 * <p>The input goes through {@code unicodeNormalization}, {@code nfd}, {@code transliterate} and
 * {@code fixAliases}, is lower-cased, and is then cleaned by a fixed sequence of regex replacements:
 * runs of characters that are neither a space nor a {@code \w} word character are removed, combining
 * diacritical marks are removed, runs of punctuation, digits and newlines are each replaced by a single
 * space, and the result is trimmed.</p>
 *
 * <p>NOTE(review): the first replacement already removes everything except spaces and {@code \w}
 * characters, so the later diacritical-mark and newline replacements appear to have nothing left to
 * match. Confirm before reordering or removing any step.</p>
 *
 * @param s the string to normalize
 * @return the normalized string
 */
protected String normalize(final String s) {
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
// strings
.replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
public String utf8(final String s) {
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
return new String(bytes, StandardCharsets.UTF_8);
}
public String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
public String unicodeNormalization(final String s) {
public String utf8(final String s) {
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
return new String(bytes, StandardCharsets.UTF_8);
}
Matcher m = hexUnicodePattern.matcher(s);
StringBuffer buf = new StringBuffer(s.length());
while (m.find()) {
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
}
m.appendTail(buf);
return buf.toString();
}
public String unicodeNormalization(final String s) {
protected String filterStopWords(final String s, final Set<String> stopwords) {
final StringTokenizer st = new StringTokenizer(s);
final StringBuilder sb = new StringBuilder();
while (st.hasMoreTokens()) {
final String token = st.nextToken();
if (!stopwords.contains(token)) {
sb.append(token);
sb.append(" ");
}
}
return sb.toString().trim();
}
Matcher m = hexUnicodePattern.matcher(s);
StringBuffer buf = new StringBuffer(s.length());
while (m.find()) {
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
}
m.appendTail(buf);
return buf.toString();
}
public String filterAllStopWords(String s) {
/**
 * Removes from {@code s} every whitespace-delimited token contained in {@code stopwords}.
 *
 * @param s the input string, tokenized with the default {@link StringTokenizer} delimiters
 * @param stopwords the tokens to drop
 * @return the surviving tokens joined by single spaces and trimmed
 */
protected String filterStopWords(final String s, final Set<String> stopwords) {
	final StringTokenizer tokenizer = new StringTokenizer(s);
	final StringBuilder kept = new StringBuilder();
	while (tokenizer.hasMoreTokens()) {
		final String word = tokenizer.nextToken();
		if (stopwords.contains(word)) {
			continue;
		}
		if (kept.length() > 0) {
			kept.append(" ");
		}
		kept.append(word);
	}
	return kept.toString().trim();
}
s = filterStopWords(s, stopwords_en);
s = filterStopWords(s, stopwords_de);
s = filterStopWords(s, stopwords_it);
s = filterStopWords(s, stopwords_fr);
s = filterStopWords(s, stopwords_pt);
s = filterStopWords(s, stopwords_es);
s = filterStopWords(s, stopwords_gr);
public String filterAllStopWords(String s) {
return s;
}
s = filterStopWords(s, stopwords_en);
s = filterStopWords(s, stopwords_de);
s = filterStopWords(s, stopwords_it);
s = filterStopWords(s, stopwords_fr);
s = filterStopWords(s, stopwords_pt);
s = filterStopWords(s, stopwords_es);
s = filterStopWords(s, stopwords_gr);
protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
final Set<String> newset = Sets.newLinkedHashSet();
for (final String s : set) {
if (!ngramBlacklist.contains(s)) {
newset.add(s);
}
}
return newset;
}
return s;
}
public static Set<String> loadFromClasspath(final String classpath) {
/**
 * Returns the elements of {@code set} that are not contained in {@code ngramBlacklist}.
 *
 * @param set the candidate values
 * @param ngramBlacklist the values to exclude
 * @return a new insertion-ordered set holding the values that are not blacklisted
 */
protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
	return set
		.stream()
		.filter(candidate -> !ngramBlacklist.contains(candidate))
		.collect(Collectors.toCollection(() -> Sets.<String> newLinkedHashSet()));
}
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
public static Set<String> loadFromClasspath(final String classpath) {
final Set<String> h = Sets.newHashSet();
try {
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords
}
} catch (final Throwable e) {
return Sets.newHashSet();
}
return h;
}
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
public static Map<String, String> loadMapFromClasspath(final String classpath) {
final Set<String> h = Sets.newHashSet();
try {
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
}
} catch (final Throwable e) {
return Sets.newHashSet();
}
return h;
}
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
public static Map<String, String> loadMapFromClasspath(final String classpath) {
final Map<String, String> m = new HashMap<>();
try {
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
//string is like this: code;word1;word2;word3
String[] line = s.split(";");
String value = line[0];
for (int i = 1; i < line.length; i++) {
m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value);
}
}
} catch (final Throwable e) {
return new HashMap<>();
}
return m;
}
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
public String removeKeywords(String s, Set<String> keywords) {
final Map<String, String> m = new HashMap<>();
try {
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
// string is like this: code;word1;word2;word3
String[] line = s.split(";");
String value = line[0];
for (int i = 1; i < line.length; i++) {
m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value);
}
}
} catch (final Throwable e) {
return new HashMap<>();
}
return m;
}
s = " " + s + " ";
for (String k : keywords) {
s = s.replaceAll(k.toLowerCase(), "");
}
public String removeKeywords(String s, Set<String> keywords) {
return s.trim();
}
s = " " + s + " ";
for (String k : keywords) {
s = s.replaceAll(k.toLowerCase(), "");
}
public double commonElementsPercentage(Set<String> s1, Set<String> s2) {
return s.trim();
}
double longer = Math.max(s1.size(), s2.size());
return (double) s1.stream().filter(s2::contains).count() / longer;
}
public double commonElementsPercentage(Set<String> s1, Set<String> s2) {
//convert the set of keywords to codes
public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
}
double longer = Math.max(s1.size(), s2.size());
return (double) s1.stream().filter(s2::contains).count() / longer;
}
public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
return toCodes(keywords, translationMap);
}
/**
 * Converts a set of keywords to the set of codes they map to.
 *
 * @param keywords the keywords to translate
 * @param translationMap maps each keyword to its code
 * @return the set of looked-up codes; a keyword missing from the map contributes {@code null}
 */
public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
	final Set<String> codes = new HashSet<>();
	for (final String keyword : keywords) {
		codes.add(translationMap.get(keyword));
	}
	return codes;
}
public Set<String> citiesToCodes(Set<String> keywords) {
return toCodes(keywords, cityMap);
}
public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
return toCodes(keywords, translationMap);
}
protected String firstLC(final String s) {
return StringUtils.substring(s, 0, 1).toLowerCase();
}
public Set<String> citiesToCodes(Set<String> keywords) {
return toCodes(keywords, cityMap);
}
protected Iterable<String> tokens(final String s, final int maxTokens) {
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
}
protected String firstLC(final String s) {
return StringUtils.substring(s, 0, 1).toLowerCase();
}
public String normalizePid(String pid) {
return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
}
protected Iterable<String> tokens(final String s, final int maxTokens) {
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
}
//get the list of keywords into the input string
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize) {
public String normalizePid(String pid) {
return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
}
String s = s1;
// collect the keywords (keys of the translation map) that occur in the input string
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize) {
List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));
String s = s1;
Set<String> codes = new HashSet<>();
List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));
if (tokens.size() < windowSize)
windowSize = tokens.size();
Set<String> codes = new HashSet<>();
int length = windowSize;
if (tokens.size() < windowSize)
windowSize = tokens.size();
while (length != 0) {
int length = windowSize;
for (int i = 0; i <= tokens.size() - length; i++) {
String candidate = concat(tokens.subList(i, i + length));
if (translationMap.containsKey(candidate)) {
codes.add(candidate);
s = s.replace(candidate, "").trim();
}
}
while (length != 0) {
tokens = Arrays.asList(s.split(" "));
length -= 1;
}
for (int i = 0; i <= tokens.size() - length; i++) {
String candidate = concat(tokens.subList(i, i + length));
if (translationMap.containsKey(candidate)) {
codes.add(candidate);
s = s.replace(candidate, "").trim();
}
}
return codes;
}
tokens = Arrays.asList(s.split(" "));
length -= 1;
}
public Set<String> getCities(String s1, int windowSize) {
return getKeywords(s1, cityMap, windowSize);
}
return codes;
}
/**
 * Reads a classpath resource fully into a string, decoding it as UTF-8.
 *
 * @param filename the resource name, resolved relative to {@code clazz}
 * @param clazz the class whose class loader is used to locate the resource
 * @param <T> the type of {@code clazz}
 * @return the content of the resource
 * @throws RuntimeException when the resource does not exist or cannot be read; a read failure keeps the
 *         underlying {@link IOException} as its cause
 */
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
	// try-with-resources closes the stream, which the previous version leaked
	try (java.io.InputStream in = clazz.getResourceAsStream(filename)) {
		if (in == null) {
			// getResourceAsStream signals a missing resource with null, not with an exception
			throw new RuntimeException("cannot load resource from classpath: " + filename);
		}
		final StringWriter sw = new StringWriter();
		// explicit charset: the two-argument copy decodes with the platform default
		IOUtils.copy(in, sw, StandardCharsets.UTF_8);
		return sw.toString();
	} catch (final IOException e) {
		throw new RuntimeException("cannot load resource from classpath: " + filename, e);
	}
}
public Set<String> getCities(String s1, int windowSize) {
return getKeywords(s1, cityMap, windowSize);
}
/**
 * Reads a classpath resource fully into a string, decoding it as UTF-8.
 *
 * @param filename the resource name, resolved relative to {@code clazz}
 * @param clazz the class whose class loader is used to locate the resource
 * @param <T> the type of {@code clazz}
 * @return the content of the resource
 * @throws RuntimeException when the resource does not exist or cannot be read; a read failure keeps the
 *         underlying {@link IOException} as its cause
 */
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
	// try-with-resources closes the stream, which the previous version leaked
	try (java.io.InputStream in = clazz.getResourceAsStream(filename)) {
		if (in == null) {
			// getResourceAsStream signals a missing resource with null, not with an exception
			throw new RuntimeException("cannot load resource from classpath: " + filename);
		}
		final StringWriter sw = new StringWriter();
		// explicit charset: the two-argument copy decodes with the platform default
		IOUtils.copy(in, sw, StandardCharsets.UTF_8);
		return sw.toString();
	} catch (final IOException e) {
		throw new RuntimeException("cannot load resource from classpath: " + filename, e);
	}
}
}

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.config;
import java.util.List;
@ -44,7 +45,6 @@ public interface Config {
*/
public Map<String, Predicate<String>> blacklists();
/**
* Translation map.
*

View File

@ -1,16 +1,5 @@
package eu.dnetlib.pace.config;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.util.PaceException;
import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
package eu.dnetlib.pace.config;
import java.io.IOException;
import java.io.Serializable;
@ -25,139 +14,167 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.stream.Collectors;
import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceException;
public class DedupConfig implements Config, Serializable {
private static String CONFIG_TEMPLATE = "dedupConfig.st";
private static String CONFIG_TEMPLATE = "dedupConfig.st";
private PaceConfig pace;
private PaceConfig pace;
private WfConfig wf;
private WfConfig wf;
@JsonIgnore
private Map<String, Predicate<String>> blacklists;
@JsonIgnore
private Map<String, Predicate<String>> blacklists;
private static Map<String, String> defaults = Maps.newHashMap();
private static Map<String, String> defaults = Maps.newHashMap();
static {
defaults.put("dedupRun", "001");
defaults.put("entityType", "result");
defaults.put("subEntityType", "resulttype");
defaults.put("subEntityValue", "publication");
defaults.put("orderField", "title");
defaults.put("queueMaxSize", "2000");
defaults.put("groupMaxSize", "10");
defaults.put("slidingWindowSize", "200");
defaults.put("rootBuilder", "result");
defaults.put("includeChildren", "true");
defaults.put("maxIterations", "20");
defaults.put("idPath", "$.id");
}
static {
defaults.put("dedupRun", "001");
defaults.put("entityType", "result");
defaults.put("subEntityType", "resulttype");
defaults.put("subEntityValue", "publication");
defaults.put("orderField", "title");
defaults.put("queueMaxSize", "2000");
defaults.put("groupMaxSize", "10");
defaults.put("slidingWindowSize", "200");
defaults.put("rootBuilder", "result");
defaults.put("includeChildren", "true");
defaults.put("maxIterations", "20");
defaults.put("idPath", "$.id");
}
public DedupConfig() {
}
public DedupConfig() {
}
public static DedupConfig load(final String json) {
public static DedupConfig load(final String json) {
final DedupConfig config;
try {
config = new ObjectMapper().readValue(json, DedupConfig.class);
config.getPace().initModel();
config.getPace().initTranslationMap();
final DedupConfig config;
try {
config = new ObjectMapper().readValue(json, DedupConfig.class);
config.getPace().initModel();
config.getPace().initTranslationMap();
config.blacklists = config.getPace().getBlacklists().entrySet()
.stream()
.map(e -> new AbstractMap.SimpleEntry<String, List<Pattern>>(e.getKey(), e.getValue().stream().filter(s -> !StringUtils.isBlank(s)).map(Pattern::compile).collect(Collectors.toList())))
.collect(Collectors.toMap(e -> e.getKey(),
e -> (Predicate<String> & Serializable) s -> e.getValue().stream().filter(p -> p.matcher(s).matches()).findFirst().isPresent()))
config.blacklists = config
.getPace()
.getBlacklists()
.entrySet()
.stream()
.map(
e -> new AbstractMap.SimpleEntry<String, List<Pattern>>(e.getKey(),
e
.getValue()
.stream()
.filter(s -> !StringUtils.isBlank(s))
.map(Pattern::compile)
.collect(Collectors.toList())))
.collect(
Collectors
.toMap(
e -> e.getKey(),
e -> (Predicate<String> & Serializable) s -> e
.getValue()
.stream()
.filter(p -> p.matcher(s).matches())
.findFirst()
.isPresent()))
;
;
return config;
} catch (IOException |
PatternSyntaxException e) {
throw new PaceException("Error in parsing configuration json", e);
}
return config;
} catch (IOException | PatternSyntaxException e) {
throw new PaceException("Error in parsing configuration json", e);
}
}
}
public static DedupConfig loadDefault() throws IOException {
return loadDefault(new HashMap<String, String>());
}
public static DedupConfig loadDefault() throws IOException {
return loadDefault(new HashMap<String, String>());
}
public static DedupConfig loadDefault(final Map<String, String> params) throws IOException {
public static DedupConfig loadDefault(final Map<String, String> params) throws IOException {
final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE));
final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE));
for (final Entry<String, String> e : defaults.entrySet()) {
template.setAttribute(e.getKey(), e.getValue());
}
for (final Entry<String, String> e : params.entrySet()) {
if (template.getAttribute(e.getKey()) != null) {
template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue());
} else {
template.setAttribute(e.getKey(), e.getValue());
}
}
for (final Entry<String, String> e : defaults.entrySet()) {
template.setAttribute(e.getKey(), e.getValue());
}
for (final Entry<String, String> e : params.entrySet()) {
if (template.getAttribute(e.getKey()) != null) {
template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue());
} else {
template.setAttribute(e.getKey(), e.getValue());
}
}
final String json = template.toString();
return load(json);
}
final String json = template.toString();
return load(json);
}
/**
 * Reads a classpath resource, resolved relative to this class, as a UTF-8 string.
 *
 * @param resource the resource name
 * @return the content of the resource
 * @throws IOException when the resource does not exist or cannot be read
 */
private String readFromClasspath(final String resource) throws IOException {
	final java.net.URL url = getClass().getResource(resource);
	if (url == null) {
		// getResource returns null for a missing resource; report it instead of failing with an NPE
		throw new IOException("resource not found on classpath: " + resource);
	}
	return IOUtils.toString(url, StandardCharsets.UTF_8);
}
/**
 * Reads a classpath resource, resolved relative to this class, as a UTF-8 string.
 *
 * @param resource the resource name
 * @return the content of the resource
 * @throws IOException when the resource does not exist or cannot be read
 */
private String readFromClasspath(final String resource) throws IOException {
	final java.net.URL url = getClass().getResource(resource);
	if (url == null) {
		// getResource returns null for a missing resource; report it instead of failing with an NPE
		throw new IOException("resource not found on classpath: " + resource);
	}
	return IOUtils.toString(url, StandardCharsets.UTF_8);
}
public PaceConfig getPace() {
return pace;
}
public PaceConfig getPace() {
return pace;
}
public void setPace(final PaceConfig pace) {
this.pace = pace;
}
public void setPace(final PaceConfig pace) {
this.pace = pace;
}
public WfConfig getWf() {
return wf;
}
public WfConfig getWf() {
return wf;
}
public void setWf(final WfConfig wf) {
this.wf = wf;
}
public void setWf(final WfConfig wf) {
this.wf = wf;
}
@Override
public String toString() {
try {
return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) {
throw new PaceException("unable to serialise configuration", e);
}
}
@Override
public String toString() {
try {
return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) {
throw new PaceException("unable to serialise configuration", e);
}
}
@Override
public Map<String, TreeNodeDef> decisionTree() {
return getPace().getDecisionTree();
}
@Override
public Map<String, TreeNodeDef> decisionTree() {
return getPace().getDecisionTree();
}
@Override
public List<FieldDef> model() {
return getPace().getModel();
}
@Override
public List<FieldDef> model() {
return getPace().getModel();
}
@Override
public List<ClusteringDef> clusterings() {
return getPace().getClustering();
}
@Override
public List<ClusteringDef> clusterings() {
return getPace().getClustering();
}
@Override
public Map<String, Predicate<String>> blacklists() {
return blacklists;
}
@Override
public Map<String, Predicate<String>> blacklists() {
return blacklists;
}
@Override
public Map<String, String> translationMap() {
return getPace().translationMap();
}
@Override
public Map<String, String> translationMap() {
return getPace().translationMap();
}
}

View File

@ -1,19 +1,20 @@
package eu.dnetlib.pace.config;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.google.common.collect.Maps;
import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceResolver;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
public class PaceConfig extends AbstractPaceFunctions implements Serializable {
private List<FieldDef> model;
@ -37,7 +38,8 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable {
@JsonIgnore
public static PaceResolver resolver = new PaceResolver();
public PaceConfig() {}
public PaceConfig() {
}
public void initModel() {
modelMap = Maps.newHashMap();
@ -46,20 +48,21 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable {
}
}
public void initTranslationMap(){
public void initTranslationMap() {
translationMap = Maps.newHashMap();
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
for (String key : synonyms.keySet()) {
for (String term : synonyms.get(key)){
translationMap.put(
for (String term : synonyms.get(key)) {
translationMap
.put(
fixAliases(transliterator.transliterate(term.toLowerCase())),
key);
key);
}
}
}
public Map<String, String> translationMap(){
public Map<String, String> translationMap() {
return translationMap;
}

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.config;
public enum Type {

View File

@ -1,10 +1,5 @@
package eu.dnetlib.pace.config;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.lang3.StringUtils;
package eu.dnetlib.pace.config;
import java.io.IOException;
import java.io.Serializable;
@ -12,6 +7,13 @@ import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.util.PaceException;
public class WfConfig implements Serializable {
@ -76,7 +78,6 @@ public class WfConfig implements Serializable {
/** Maximum number of allowed children. */
private int maxChildren = MAX_CHILDREN;
/** Default maximum number of iterations. */
private final static int MAX_ITERATIONS = 20;
@ -84,9 +85,10 @@ public class WfConfig implements Serializable {
private int maxIterations = MAX_ITERATIONS;
/** The JSONPath expression used to retrieve the identifier */
private String idPath = "$.id";
private String idPath = "$.id";
public WfConfig() {}
public WfConfig() {
}
/**
* Instantiates a new dedup config.
@ -114,8 +116,10 @@ public class WfConfig implements Serializable {
* @param idPath
* the path for the id of the entity
*/
public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder, final String dedupRun,
final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations, final String idPath) {
public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder,
final String dedupRun,
final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize,
final boolean includeChildren, final int maxIterations, final String idPath) {
super();
this.entityType = entityType;
this.orderField = orderField;
@ -257,7 +261,6 @@ public class WfConfig implements Serializable {
this.maxChildren = maxChildren;
}
public int getMaxIterations() {
return maxIterations;
}
@ -277,7 +280,6 @@ public class WfConfig implements Serializable {
/*
* (non-Javadoc)
*
* @see java.lang.Object#toString()
*/
@Override

View File

@ -1,15 +1,16 @@
package eu.dnetlib.pace.model;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.util.PaceException;
package eu.dnetlib.pace.model;
import java.io.IOException;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.util.PaceException;
public class ClusteringDef implements Serializable {
@ -19,7 +20,8 @@ public class ClusteringDef implements Serializable {
private Map<String, Integer> params;
public ClusteringDef() {}
public ClusteringDef() {
}
public String getName() {
return name;

View File

@ -1,13 +1,15 @@
package eu.dnetlib.pace.model;
import java.io.Serializable;
import java.util.List;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Type;
import java.io.Serializable;
import java.util.List;
import eu.dnetlib.pace.config.Type;
/**
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm.
@ -34,7 +36,8 @@ public class FieldDef implements Serializable {
*/
private int length = -1;
public FieldDef() {}
public FieldDef() {
}
public String getName() {
return name;

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.model;
import java.nio.charset.Charset;
@ -43,7 +44,7 @@ public class Person {
// s = s.replaceAll("[\\W&&[^,-]]", "");
}
if (s.contains(",")) { //if the name contains a comma it is easy derivable the name and the surname
if (s.contains(",")) { // if the name contains a comma it is easy derivable the name and the surname
final String[] arr = s.split(",");
if (arr.length == 1) {
fullname = splitTerms(arr[0]);

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.model;
import java.util.ArrayList;
@ -57,7 +58,7 @@ public class PersonComparatorUtils {
private static boolean verifyNames(List<String> list1, List<String> list2) {
return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2))
&& verifySimilarity(extractInitials(list1), extractInitials(list2));
&& verifySimilarity(extractInitials(list1), extractInitials(list2));
}
private static boolean verifySurnames(List<String> list1, List<String> list2) {
@ -76,7 +77,7 @@ public class PersonComparatorUtils {
Collections.sort(list1);
Collections.sort(list2);
return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2))
&& verifySimilarity(extractInitials(list1), extractInitials(list2));
&& verifySimilarity(extractInitials(list1), extractInitials(list2));
}
private static List<String> extractExtendedNames(List<String> list) {
@ -107,7 +108,7 @@ public class PersonComparatorUtils {
for (String s : list1) {
int curr = list2.indexOf(s);
if (curr > pos) {
list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm"
list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm"
pos = curr;
} else {
return false;

View File

@ -1,9 +1,11 @@
package eu.dnetlib.pace.model;
import eu.dnetlib.pace.clustering.NGramUtils;
import java.util.Comparator;
import org.apache.spark.sql.Row;
import java.util.Comparator;
import eu.dnetlib.pace.clustering.NGramUtils;
/**
* The Class RowDataOrderingComparator.
@ -25,13 +27,12 @@ public class RowDataOrderingComparator implements Comparator<Row> {
/*
* (non-Javadoc)
*
* @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
*/
@Override
public int compare(final Row d1, final Row d2) {
if (d1 == null)
return d2==null ? 0: -1;
return d2 == null ? 0 : -1;
else if (d2 == null) {
return 1;
}
@ -40,7 +41,7 @@ public class RowDataOrderingComparator implements Comparator<Row> {
final String o2 = d2.getString(comparatorField);
if (o1 == null)
return o2==null ? 0: -1;
return o2 == null ? 0 : -1;
else if (o2 == null) {
return 1;
}

View File

@ -1,32 +1,30 @@
package eu.dnetlib.dhp.oa.dedup.model
package eu.dnetlib.pace.model
import com.jayway.jsonpath.{Configuration, JsonPath, Option}
import eu.dnetlib.dhp.oa.dedup.{DedupUtility, SparkReporter}
import eu.dnetlib.pace.config.{DedupConfig, Type}
import eu.dnetlib.pace.model.{ClusteringDef, FieldDef}
import eu.dnetlib.pace.tree.support.TreeProcessor
import eu.dnetlib.pace.util.MapDocumentUtil.truncateValue
import eu.dnetlib.pace.util.{BlockProcessor, MapDocumentUtil}
import eu.dnetlib.pace.util.{BlockProcessor, MapDocumentUtil, SparkReporter}
import org.apache.spark.SparkContext
import org.apache.spark.sql.{Column, Dataset, Row, functions}
import org.apache.spark.sql.catalyst.expressions.{GenericRowWithSchema, Literal}
import org.apache.spark.sql.expressions.{UserDefinedFunction, Window}
import org.apache.spark.sql.functions.{col, lit, udf}
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
import org.apache.spark.sql.{Column, Dataset, Row, functions}
import java.util
import java.util.function.Predicate
import java.util.regex.Pattern
import scala.collection.JavaConverters._
import scala.collection.mutable
import org.apache.spark.sql.functions.{col, lit, udf}
class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializable {
case class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializable {
private val URL_REGEX: Pattern = Pattern.compile("^\\s*(http|https|ftp)\\://.*")
private val CONCAT_REGEX: Pattern = Pattern.compile("\\|\\|\\|")
private var urlFilter = (s: String) => URL_REGEX.matcher(s).matches
private val urlFilter = (s: String) => URL_REGEX.matcher(s).matches
val modelExtractor: (Dataset[String] => Dataset[Row]) = df => {
df.withColumn("mapDocument", rowFromJsonUDF.apply(df.col(df.columns(0))))
@ -226,60 +224,59 @@ class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializab
val orderingFieldPosition: Int = rowDataType.fieldIndex(conf.getWf.getOrderField)
val rowFromJsonUDF = udf(
(json: String) => {
val documentContext =
JsonPath.using(Configuration.defaultConfiguration.addOptions(Option.SUPPRESS_EXCEPTIONS)).parse(json)
val values = new Array[Any](rowDataType.size)
val rowFromJson = (json: String) => {
val documentContext =
JsonPath.using(Configuration.defaultConfiguration.addOptions(Option.SUPPRESS_EXCEPTIONS)).parse(json)
val values = new Array[Any](rowDataType.size)
values(identityFieldPosition) = DFMapDocumentUtils.getJPathString(conf.getWf.getIdPath, documentContext)
values(identityFieldPosition) = MapDocumentUtil.getJPathString(conf.getWf.getIdPath, documentContext)
rowDataType.fieldNames.zipWithIndex.foldLeft(values) {
case ((res, (fname, index))) => {
val fdef = conf.getPace.getModelMap.get(fname)
rowDataType.fieldNames.zipWithIndex.foldLeft(values) {
case ((res, (fname, index))) => {
val fdef = conf.getPace.getModelMap.get(fname)
if (fdef != null) {
res(index) = fdef.getType match {
case Type.String | Type.Int =>
MapDocumentUtil.truncateValue(
DFMapDocumentUtils.getJPathString(fdef.getPath, documentContext),
fdef.getLength
)
if (fdef != null) {
res(index) = fdef.getType match {
case Type.String | Type.Int =>
MapDocumentUtil.truncateValue(
MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
fdef.getLength
)
case Type.URL =>
var uv = DFMapDocumentUtils.getJPathString(fdef.getPath, documentContext)
if (!urlFilter(uv)) uv = ""
uv
case Type.URL =>
var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
if (!urlFilter(uv)) uv = ""
uv
case Type.List | Type.JSON =>
MapDocumentUtil.truncateList(
DFMapDocumentUtils.getJPathList(fdef.getPath, documentContext, fdef.getType),
fdef.getSize
)
case Type.List | Type.JSON =>
MapDocumentUtil.truncateList(
MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
fdef.getSize
)
case Type.StringConcat =>
val jpaths = CONCAT_REGEX.split(fdef.getPath)
case Type.StringConcat =>
val jpaths = CONCAT_REGEX.split(fdef.getPath)
truncateValue(
jpaths
.map(jpath => DFMapDocumentUtils.getJPathString(jpath, documentContext))
.mkString(" "),
fdef.getLength
)
truncateValue(
jpaths
.map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
.mkString(" "),
fdef.getLength
)
case Type.DoubleArray =>
MapDocumentUtil.getJPathArray(fdef.getPath, json)
}
case Type.DoubleArray =>
MapDocumentUtil.getJPathArray(fdef.getPath, json)
}
res
}
}
new GenericRowWithSchema(values, rowDataType)
},
rowDataType
)
res
}
}
new GenericRowWithSchema(values, rowDataType)
}
val rowFromJsonUDF = udf(rowFromJson, rowDataType)
def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = {
@ -310,7 +307,7 @@ class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializab
}
def processBlock(implicit sc: SparkContext) = {
val accumulators = DedupUtility.constructAccumulator(conf, sc)
val accumulators = SparkReporter.constructAccumulator(conf, sc)
udf[Array[Tuple2[String, String]], mutable.WrappedArray[Row]](block => {
val reporter = new SparkReporter(accumulators)

View File

@ -1,41 +1,42 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("alwaysMatch")
public class AlwaysMatch<T> extends AbstractComparator<T> {
public AlwaysMatch(final Map<String, String> params){
super(params, new com.wcohen.ss.JaroWinkler());
}
public AlwaysMatch(final Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler());
}
public AlwaysMatch(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
public AlwaysMatch(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double compare(final Object a, final Object b, final Config conf) {
return 1.0;
}
/**
 * Compares two values and unconditionally reports a match.
 *
 * @param a first value (not inspected)
 * @param b second value (not inspected)
 * @param conf dedup configuration (not used)
 * @return always {@code 1.0}
 */
@Override
public double compare(final Object a, final Object b, final Config conf) {
return 1.0;
}
@Override
public double getWeight() {
return super.weight;
}
/**
 * Returns the weight of this comparator.
 *
 * @return the value of the {@code weight} field inherited from the superclass
 */
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(final double d) {
return d;
}
/**
 * Identity normalization: the score is handed back unchanged.
 *
 * @param d the score to normalize
 * @return {@code d} itself
 */
@Override
protected double normalize(final double d) {
return d;
}
}

View File

@ -1,148 +1,157 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
package eu.dnetlib.pace.tree;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("authorsMatch")
public class AuthorsMatch extends AbstractListComparator {
Map<String, String> params;
Map<String, String> params;
private double SURNAME_THRESHOLD;
private double NAME_THRESHOLD;
private double FULLNAME_THRESHOLD;
private String MODE; //full or surname
private int SIZE_THRESHOLD;
private String TYPE; //count or percentage
private int common;
private double SURNAME_THRESHOLD;
private double NAME_THRESHOLD;
private double FULLNAME_THRESHOLD;
private String MODE; // full or surname
private int SIZE_THRESHOLD;
private String TYPE; // count or percentage
private int common;
public AuthorsMatch(Map<String, String> params){
super(params, new com.wcohen.ss.JaroWinkler());
this.params = params;
public AuthorsMatch(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler());
this.params = params;
MODE = params.getOrDefault("mode", "full");
SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
TYPE = params.getOrDefault("type", "percentage");
common = 0;
}
MODE = params.getOrDefault("mode", "full");
SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
TYPE = params.getOrDefault("type", "percentage");
common = 0;
}
protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
super(w, ssalgo);
}
protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
super(w, ssalgo);
}
@Override
public double compare(final List<String> a, final List<String> b, final Config conf) {
@Override
public double compare(final List<String> a, final List<String> b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1;
if (a.isEmpty() || b.isEmpty())
return -1;
if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
return 1.0;
if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
return 1.0;
List<Person> aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
List<Person> aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
common = 0;
//compare each element of List1 with each element of List2
for (Person p1 : aList)
common = 0;
// compare each element of List1 with each element of List2
for (Person p1 : aList)
for (Person p2 : bList) {
for (Person p2 : bList) {
//both persons are inaccurate
if (!p1.isAccurate() && !p2.isAccurate()) {
//compare just normalized fullnames
String fullname1 = normalization(p1.getNormalisedFullname().isEmpty()? p1.getOriginal() : p1.getNormalisedFullname());
String fullname2 = normalization(p2.getNormalisedFullname().isEmpty()? p2.getOriginal() : p2.getNormalisedFullname());
// both persons are inaccurate
if (!p1.isAccurate() && !p2.isAccurate()) {
// compare just normalized fullnames
String fullname1 = normalization(
p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname());
String fullname2 = normalization(
p2.getNormalisedFullname().isEmpty() ? p2.getOriginal() : p2.getNormalisedFullname());
if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) {
common += 1;
break;
}
}
if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) {
common += 1;
break;
}
}
//one person is inaccurate
if (p1.isAccurate() ^ p2.isAccurate()) {
//prepare data
//data for the accurate person
String name = normalization(p1.isAccurate()? p1.getNormalisedFirstName() : p2.getNormalisedFirstName());
String surname = normalization(p1.isAccurate()? p1.getNormalisedSurname() : p2.getNormalisedSurname());
// one person is inaccurate
if (p1.isAccurate() ^ p2.isAccurate()) {
// prepare data
// data for the accurate person
String name = normalization(
p1.isAccurate() ? p1.getNormalisedFirstName() : p2.getNormalisedFirstName());
String surname = normalization(
p1.isAccurate() ? p1.getNormalisedSurname() : p2.getNormalisedSurname());
//data for the inaccurate person
String fullname = normalization(
p1.isAccurate() ? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname()) : (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname())
);
// data for the inaccurate person
String fullname = normalization(
p1.isAccurate()
? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname())
: (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname()));
if (fullname.contains(surname)) {
if (MODE.equals("full")) {
if (fullname.contains(name)) {
common += 1;
break;
}
}
else { //MODE equals "surname"
common += 1;
break;
}
}
}
if (fullname.contains(surname)) {
if (MODE.equals("full")) {
if (fullname.contains(name)) {
common += 1;
break;
}
} else { // MODE equals "surname"
common += 1;
break;
}
}
}
//both persons are accurate
if (p1.isAccurate() && p2.isAccurate()) {
// both persons are accurate
if (p1.isAccurate() && p2.isAccurate()) {
if (compareSurname(p1, p2)) {
if (MODE.equals("full")) {
if(compareFirstname(p1, p2)) {
common += 1;
break;
}
}
else { //MODE equals "surname"
common += 1;
break;
}
}
if (compareSurname(p1, p2)) {
if (MODE.equals("full")) {
if (compareFirstname(p1, p2)) {
common += 1;
break;
}
} else { // MODE equals "surname"
common += 1;
break;
}
}
}
}
}
}
//normalization factor to compute the score
int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
// normalization factor to compute the score
int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
if(TYPE.equals("percentage")) {
return (double) common / normFactor;
}
else {
return (double) common;
}
}
if (TYPE.equals("percentage")) {
return (double) common / normFactor;
} else {
return (double) common;
}
}
public boolean compareSurname(Person p1, Person p2) {
return ssalgo.score(normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD;
}
/**
 * Tells whether the surnames of two persons are similar enough to be considered the same.
 *
 * @param p1 first person
 * @param p2 second person
 * @return {@code true} when the string-distance score of the two normalized surnames is
 *         strictly greater than {@code SURNAME_THRESHOLD}
 */
public boolean compareSurname(Person p1, Person p2) {
	final String firstSurname = normalization(p1.getNormalisedSurname());
	final String secondSurname = normalization(p2.getNormalisedSurname());
	final double similarity = ssalgo.score(firstSurname, secondSurname);
	return similarity > SURNAME_THRESHOLD;
}
public boolean compareFirstname(Person p1, Person p2) {
public boolean compareFirstname(Person p1, Person p2) {
if(p1.getNormalisedFirstName().length()<=2 || p2.getNormalisedFirstName().length()<=2) {
if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName())))
return true;
}
if (p1.getNormalisedFirstName().length() <= 2 || p2.getNormalisedFirstName().length() <= 2) {
if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName())))
return true;
}
return ssalgo.score(normalization(p1.getNormalisedFirstName()), normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD;
}
return ssalgo
.score(
normalization(p1.getNormalisedFirstName()),
normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD;
}
public String normalization(String s) {
return normalize(utf8(cleanup(s)));
}
/**
 * Prepares a name for comparison by running it through the inherited
 * {@code cleanup}, {@code utf8} and {@code normalize} helpers, in that order.
 *
 * @param s the raw string
 * @return the result of {@code normalize(utf8(cleanup(s)))}
 */
public String normalization(String s) {
	final String cleaned = cleanup(s);
	final String encoded = utf8(cleaned);
	return normalize(encoded);
}
}

View File

@ -1,47 +1,48 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
import java.util.Set;
@ComparatorClass("cityMatch")
public class CityMatch extends AbstractStringComparator {
private Map<String, String> params;
private Map<String, String> params;
public CityMatch(Map<String, String> params) {
super(params);
this.params = params;
}
public CityMatch(Map<String, String> params) {
super(params);
this.params = params;
}
@Override
public double distance(final String a, final String b, final Config conf) {
@Override
public double distance(final String a, final String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
String ca = cleanup(a);
String cb = cleanup(b);
ca = normalize(ca);
cb = normalize(cb);
ca = normalize(ca);
cb = normalize(cb);
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> codes1 = citiesToCodes(cities1);
Set<String> codes2 = citiesToCodes(cities2);
Set<String> codes1 = citiesToCodes(cities1);
Set<String> codes2 = citiesToCodes(cities2);
//if no cities are detected, the comparator gives 1.0
if (codes1.isEmpty() && codes2.isEmpty())
return 1.0;
else {
if (codes1.isEmpty() ^ codes2.isEmpty())
return -1; //undefined if one of the two has no cities
return commonElementsPercentage(codes1, codes2);
}
}
// if no cities are detected, the comparator gives 1.0
if (codes1.isEmpty() && codes2.isEmpty())
return 1.0;
else {
if (codes1.isEmpty() ^ codes2.isEmpty())
return -1; // undefined if one of the two has no cities
return commonElementsPercentage(codes1, codes2);
}
}
}

View File

@ -1,47 +1,47 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("cosineSimilarity")
public class CosineSimilarity extends AbstractComparator<double[]> {
Map<String, String> params;
Map<String, String> params;
public CosineSimilarity(Map<String,String> params) {
super(params);
}
/**
 * Creates the comparator from its configuration parameters.
 *
 * @param params comparator parameters; forwarded to the superclass and also kept in the
 *               {@code params} field, which would otherwise stay {@code null}
 */
public CosineSimilarity(Map<String, String> params) {
	super(params);
	// keep the declared field in sync with the constructor argument, as the other comparators do
	this.params = params;
}
@Override
public double compare(Object a, Object b, Config config) {
return compare((double[])a, (double[])b, config);
}
/**
 * Untyped entry point: casts both arguments to {@code double[]} and delegates to the
 * typed overload.
 *
 * @param a first vector, must be a {@code double[]}
 * @param b second vector, must be a {@code double[]}
 * @param config dedup configuration, passed through unchanged
 * @return the result of the typed {@code compare(double[], double[], Config)}
 * @throws ClassCastException if either argument is not a {@code double[]}
 */
@Override
public double compare(Object a, Object b, Config config) {
	final double[] left = (double[]) a;
	final double[] right = (double[]) b;
	return compare(left, right, config);
}
public double compare(final double[] a, final double[] b, final Config conf) {
public double compare(final double[] a, final double[] b, final Config conf) {
if (a.length == 0 || b.length == 0)
return -1;
if (a.length == 0 || b.length == 0)
return -1;
return cosineSimilarity(a, b);
}
return cosineSimilarity(a, b);
}
double cosineSimilarity(double[] a, double[] b) {
double dotProduct = 0;
double normASum = 0;
double normBSum = 0;
double cosineSimilarity(double[] a, double[] b) {
double dotProduct = 0;
double normASum = 0;
double normBSum = 0;
for(int i = 0; i < a.length; i ++) {
dotProduct += a[i] * b[i];
normASum += a[i] * a[i];
normBSum += b[i] * b[i];
}
double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum);
return dotProduct / eucledianDist;
}
for (int i = 0; i < a.length; i++) {
dotProduct += a[i] * b[i];
normASum += a[i] * a[i];
normBSum += b[i] * b[i];
}
double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum);
return dotProduct / eucledianDist;
}
}

View File

@ -1,9 +1,10 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
* The Class ExactMatch.
*
@ -12,15 +13,15 @@ import java.util.Map;
@ComparatorClass("doiExactMatch")
public class DoiExactMatch extends ExactMatchIgnoreCase {
public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
public DoiExactMatch(final Map<String, String> params) {
super(params);
}
public DoiExactMatch(final Map<String, String> params) {
super(params);
}
@Override
protected String toString(final Object f) {
return super.toString(f).replaceAll(PREFIX, "");
}
/**
 * Converts the field to a string and strips every occurrence of the {@code PREFIX}
 * regular expression from it.
 *
 * @param f the field value
 * @return the superclass string form of {@code f} with all {@code PREFIX} matches removed
 */
@Override
protected String toString(final Object f) {
	final String value = super.toString(f);
	return value.replaceAll(PREFIX, "");
}
}

View File

@ -1,29 +1,30 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.tree.support.ComparatorClass;
package eu.dnetlib.pace.tree;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("domainExactMatch")
public class DomainExactMatch extends ExactMatchIgnoreCase {
public DomainExactMatch(final Map<String, String> params) {
super(params);
}
public DomainExactMatch(final Map<String, String> params) {
super(params);
}
@Override
protected String toString(final Object f) {
@Override
protected String toString(final Object f) {
try {
return asUrl(super.toString(f)).getHost();
} catch (MalformedURLException e) {
return "";
}
}
try {
return asUrl(super.toString(f)).getHost();
} catch (MalformedURLException e) {
return "";
}
}
private URL asUrl(final String value) throws MalformedURLException {
return new URL(value);
}
/**
 * Parses a string into a {@link URL}.
 *
 * @param value the textual URL
 * @return the parsed URL
 * @throws MalformedURLException if {@code value} cannot be parsed as a URL
 */
private URL asUrl(final String value) throws MalformedURLException {
return new URL(value);
}
}

View File

@ -1,42 +1,44 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("exactMatch")
public class ExactMatch extends AbstractStringComparator {
public ExactMatch(Map<String, String> params){
super(params, new com.wcohen.ss.JaroWinkler());
}
public ExactMatch(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler());
}
public ExactMatch(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
public ExactMatch(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double distance(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) {
return -1.0; //return -1 if a field is missing
}
return a.equals(b) ? 1.0 : 0;
}
/**
 * Exact, case-sensitive string comparison.
 *
 * @param a first string
 * @param b second string
 * @param conf dedup configuration (not used)
 * @return {@code -1.0} when either string is empty, {@code 1.0} when the strings are
 *         equal, {@code 0} otherwise
 */
@Override
public double distance(final String a, final String b, final Config conf) {
	// a missing field yields -1
	final boolean missingField = a.isEmpty() || b.isEmpty();
	if (missingField) {
		return -1.0;
	}
	if (a.equals(b)) {
		return 1.0;
	}
	return 0;
}
@Override
public double getWeight() {
return super.weight;
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(final double d) {
return d;
}
@Override
protected double normalize(final double d) {
return d;
}
}

View File

@ -1,30 +1,32 @@
package eu.dnetlib.pace.tree;
import com.google.common.base.Joiner;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
package eu.dnetlib.pace.tree;
import java.util.List;
import java.util.Map;
import com.google.common.base.Joiner;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("exactMatchIgnoreCase")
public class ExactMatchIgnoreCase extends AbstractStringComparator {
public ExactMatchIgnoreCase(Map<String, String> params) {
super(params);
}
public ExactMatchIgnoreCase(Map<String, String> params) {
super(params);
}
@Override
public double compare(String a, String b, final Config conf) {
@Override
public double compare(String a, String b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1;
if (a.isEmpty() || b.isEmpty())
return -1;
return a.equalsIgnoreCase(b) ? 1 : 0;
}
return a.equalsIgnoreCase(b) ? 1 : 0;
}
protected String toString(final Object object) {
return toFirstString(object);
}
}
/**
 * Converts a field value to the string used for comparison by delegating to
 * {@code toFirstString}.
 *
 * @param object the field value
 * @return the result of {@code toFirstString(object)}
 */
protected String toString(final Object object) {
return toFirstString(object);
}
}

View File

@ -1,9 +1,5 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
package eu.dnetlib.pace.tree;
import java.util.HashMap;
import java.util.List;
@ -11,70 +7,74 @@ import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("instanceTypeMatch")
public class InstanceTypeMatch extends AbstractListComparator {
final Map<String, String> translationMap = new HashMap<>();
final Map<String, String> translationMap = new HashMap<>();
public InstanceTypeMatch(Map<String, String> params){
super(params);
public InstanceTypeMatch(Map<String, String> params) {
super(params);
//jolly types
translationMap.put("Conference object", "*");
translationMap.put("Other literature type", "*");
translationMap.put("Unknown", "*");
// jolly (wildcard) types: mapped to "*", which always produces a match
translationMap.put("Conference object", "*");
translationMap.put("Other literature type", "*");
translationMap.put("Unknown", "*");
//article types
translationMap.put("Article", "Article");
translationMap.put("Data Paper", "Article");
translationMap.put("Software Paper", "Article");
translationMap.put("Preprint", "Article");
// article types
translationMap.put("Article", "Article");
translationMap.put("Data Paper", "Article");
translationMap.put("Software Paper", "Article");
translationMap.put("Preprint", "Article");
//thesis types
translationMap.put("Thesis", "Thesis");
translationMap.put("Master thesis", "Thesis");
translationMap.put("Bachelor thesis", "Thesis");
translationMap.put("Doctoral thesis", "Thesis");
}
// thesis types
translationMap.put("Thesis", "Thesis");
translationMap.put("Master thesis", "Thesis");
translationMap.put("Bachelor thesis", "Thesis");
translationMap.put("Doctoral thesis", "Thesis");
}
@Override
public double compare(final List<String> a, final List<String> b, final Config conf) {
@Override
public double compare(final List<String> a, final List<String> b, final Config conf) {
if (a == null || b == null) {
return -1;
}
if (a == null || b == null) {
return -1;
}
if (a.isEmpty() || b.isEmpty()) {
return -1;
}
final Set<String> ca = a.stream().map(this::translate).collect(Collectors.toSet());
final Set<String> cb = b.stream().map(this::translate).collect(Collectors.toSet());
if (a.isEmpty() || b.isEmpty()) {
return -1;
}
// if at least one is a jolly type, it must produce a match
if (ca.contains("*") || cb.contains("*"))
return 1.0;
final Set<String> ca = a.stream().map(this::translate).collect(Collectors.toSet());
final Set<String> cb = b.stream().map(this::translate).collect(Collectors.toSet());
int incommon = Sets.intersection(ca, cb).size();
//if at least one is a jolly type, it must produce a match
if (ca.contains("*") || cb.contains("*"))
return 1.0;
// if at least one is in common, it must produce a match
return incommon >= 1 ? 1 : 0;
}
int incommon = Sets.intersection(ca, cb).size();
/**
 * Maps an instance type to its entry in {@code translationMap}.
 *
 * @param term the instance type to translate
 * @return the value mapped to {@code term}, or {@code term} itself when the map has no
 *         entry for it
 */
public String translate(String term) {
	if (translationMap.containsKey(term)) {
		return translationMap.get(term);
	}
	return term;
}
//if at least one is in common, it must produce a match
return incommon >= 1 ? 1 : 0;
}
@Override
public double getWeight() {
return super.weight;
}
public String translate(String term){
return translationMap.getOrDefault(term, term);
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(final double d) {
return d;
}
@Override
protected double normalize(final double d) {
return d;
}
}

View File

@ -1,44 +1,46 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
@ComparatorClass("jaroWinkler")
public class JaroWinkler extends AbstractStringComparator {
public JaroWinkler(Map<String, String> params){
super(params, new com.wcohen.ss.JaroWinkler());
}
public JaroWinkler(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler());
}
public JaroWinkler(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
public JaroWinkler(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected JaroWinkler(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
protected JaroWinkler(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double distance(String a, String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
@Override
public double distance(String a, String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
return normalize(ssalgo.score(ca, cb));
}
return normalize(ssalgo.score(ca, cb));
}
@Override
public double getWeight() {
return super.weight;
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(double d) {
return d;
}
@Override
protected double normalize(double d) {
return d;
}
}
}

View File

@ -1,70 +1,74 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("jaroWinklerNormalizedName")
public class JaroWinklerNormalizedName extends AbstractStringComparator {
private Map<String, String> params;
private Map<String, String> params;
public JaroWinklerNormalizedName(Map<String, String> params){
super(params, new com.wcohen.ss.JaroWinkler());
this.params = params;
}
public JaroWinklerNormalizedName(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler());
this.params = params;
}
public JaroWinklerNormalizedName(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
public JaroWinklerNormalizedName(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double distance(String a, String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
@Override
public double distance(String a, String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
ca = normalize(ca);
cb = normalize(cb);
ca = normalize(ca);
cb = normalize(cb);
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> keywords1 = getKeywords(
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> keywords2 = getKeywords(
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
ca = removeKeywords(ca, keywords1);
ca = removeKeywords(ca, cities1);
cb = removeKeywords(cb, keywords2);
cb = removeKeywords(cb, cities2);
ca = removeKeywords(ca, keywords1);
ca = removeKeywords(ca, cities1);
cb = removeKeywords(cb, keywords2);
cb = removeKeywords(cb, cities2);
ca = ca.replaceAll("[ ]{2,}", " ");
cb = cb.replaceAll("[ ]{2,}", " ");
ca = ca.replaceAll("[ ]{2,}", " ");
cb = cb.replaceAll("[ ]{2,}", " ");
if (ca.isEmpty() && cb.isEmpty())
return 1.0;
else
return normalize(ssalgo.score(ca,cb));
}
if (ca.isEmpty() && cb.isEmpty())
return 1.0;
else
return normalize(ssalgo.score(ca, cb));
}
@Override
public double getWeight() {
return super.weight;
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(double d) {
return d;
}
@Override
protected double normalize(double d) {
return d;
}
}

View File

@ -1,17 +1,19 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
@ComparatorClass("jaroWinklerTitle")
public class JaroWinklerTitle extends AbstractStringComparator {
public JaroWinklerTitle(Map<String, String> params){
public JaroWinklerTitle(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler());
}
@ -22,7 +24,7 @@ public class JaroWinklerTitle extends AbstractStringComparator {
protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double distance(String a, String b, final Config conf) {
String ca = cleanup(a);
@ -30,7 +32,7 @@ public class JaroWinklerTitle extends AbstractStringComparator {
boolean check = checkNumbers(ca, cb);
return check ? 0.5 : normalize(ssalgo.score(ca, cb));
}
}
@Override
public double getWeight() {

View File

@ -1,72 +1,76 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
package eu.dnetlib.pace.tree;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.util.MapDocumentUtil;
@ComparatorClass("jsonListMatch")
public class JsonListMatch extends AbstractListComparator {
private static final Log log = LogFactory.getLog(JsonListMatch.class);
private Map<String, String> params;
private static final Log log = LogFactory.getLog(JsonListMatch.class);
private Map<String, String> params;
private String MODE; //"percentage" or "count"
private String MODE; // "percentage" or "count"
public JsonListMatch(final Map<String, String> params) {
super(params);
this.params = params;
public JsonListMatch(final Map<String, String> params) {
super(params);
this.params = params;
MODE = params.getOrDefault("mode", "percentage");
}
MODE = params.getOrDefault("mode", "percentage");
}
@Override
public double compare(final List<String> sa, final List<String> sb, final Config conf) {
if (sa.isEmpty() || sb.isEmpty()) {
return -1;
}
@Override
public double compare(final List<String> sa, final List<String> sb, final Config conf) {
if (sa.isEmpty() || sb.isEmpty()) {
return -1;
}
final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
int incommon = Sets.intersection(ca, cb).size();
int simDiff = Sets.symmetricDifference(ca, cb).size();
int incommon = Sets.intersection(ca, cb).size();
int simDiff = Sets.symmetricDifference(ca, cb).size();
if (incommon + simDiff == 0) {
return 0.0;
}
if (incommon + simDiff == 0) {
return 0.0;
}
if (MODE.equals("percentage"))
return (double)incommon / (incommon + simDiff);
else
return incommon;
if (MODE.equals("percentage"))
return (double) incommon / (incommon + simDiff);
else
return incommon;
}
}
//converts every json into a comparable string basing on parameters
private String toComparableString(String json){
// converts every json into a comparable string basing on parameters
private String toComparableString(String json) {
StringBuilder st = new StringBuilder(); //to build the string used for comparisons basing on the jpath into parameters
StringBuilder st = new StringBuilder(); // to build the string used for comparisons basing on the jpath into
// parameters
//for each path in the param list
for (String key: params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
String path = params.get(key);
String value = MapDocumentUtil.getJPathString(path, json);
if (value == null || value.isEmpty())
value = "";
st.append(value);
st.append("::");
}
// for each path in the param list
for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
String path = params.get(key);
String value = MapDocumentUtil.getJPathString(path, json);
if (value == null || value.isEmpty())
value = "";
st.append(value);
st.append("::");
}
st.setLength(st.length()-2);
return st.toString();
}
st.setLength(st.length() - 2);
return st.toString();
}
}

View File

@ -1,47 +1,50 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
import java.util.Set;
@ComparatorClass("keywordMatch")
public class KeywordMatch extends AbstractStringComparator {
Map<String, String> params;
Map<String, String> params;
public KeywordMatch(Map<String, String> params) {
super(params);
this.params = params;
}
/**
 * Creates the comparator from its configuration parameters.
 * The map is passed to the superclass and also kept in {@code this.params}, because
 * {@code distance} later reads the optional {@code "windowSize"} entry (default {@code "4"}) from it.
 *
 * @param params the comparator parameters
 */
public KeywordMatch(Map<String, String> params) {
super(params);
this.params = params;
}
@Override
public double distance(final String a, final String b, final Config conf) {
@Override
public double distance(final String a, final String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
String ca = cleanup(a);
String cb = cleanup(b);
ca = normalize(ca);
cb = normalize(cb);
ca = normalize(ca);
cb = normalize(cb);
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> keywords1 = getKeywords(
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> keywords2 = getKeywords(
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> codes1 = toCodes(keywords1, conf.translationMap());
Set<String> codes2 = toCodes(keywords2, conf.translationMap());
Set<String> codes1 = toCodes(keywords1, conf.translationMap());
Set<String> codes2 = toCodes(keywords2, conf.translationMap());
//if no cities are detected, the comparator gives 1.0
if (codes1.isEmpty() && codes2.isEmpty())
return 1.0;
else {
if (codes1.isEmpty() ^ codes2.isEmpty())
return -1.0; //undefined if one of the two has no keywords
return commonElementsPercentage(codes1, codes2);
}
}
// if neither string yields any keyword code, the comparator gives 1.0
if (codes1.isEmpty() && codes2.isEmpty())
return 1.0;
else {
if (codes1.isEmpty() ^ codes2.isEmpty())
return -1.0; // undefined if one of the two has no keywords
return commonElementsPercentage(codes1, codes2);
}
}
}

View File

@ -1,16 +1,18 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("level2JaroWinkler")
public class Level2JaroWinkler extends AbstractStringComparator {
public Level2JaroWinkler(Map<String, String> params){
public Level2JaroWinkler(Map<String, String> params) {
// Hands the parameters to the superclass together with a com.wcohen.ss Level2JaroWinkler
// instance as the string-distance algorithm.
super(params, new com.wcohen.ss.Level2JaroWinkler());
}

View File

@ -1,16 +1,18 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("level2JaroWinklerTitle")
public class Level2JaroWinklerTitle extends AbstractStringComparator {
public Level2JaroWinklerTitle(Map<String,String> params){
public Level2JaroWinklerTitle(Map<String, String> params) {
// Hands the parameters to the superclass together with a com.wcohen.ss Level2JaroWinkler
// instance as the string-distance algorithm.
super(params, new com.wcohen.ss.Level2JaroWinkler());
}
@ -29,7 +31,8 @@ public class Level2JaroWinklerTitle extends AbstractStringComparator {
final boolean check = checkNumbers(ca, cb);
if (check) return 0.5;
if (check)
return 0.5;
return ssalgo.score(ca, cb);
}

View File

@ -1,15 +1,17 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("level2Levenstein")
public class Level2Levenstein extends AbstractStringComparator {
public Level2Levenstein(Map<String,String> params){
public Level2Levenstein(Map<String, String> params) {
// Hands the parameters to the superclass together with a com.wcohen.ss Level2Levenstein
// instance as the string-distance algorithm.
super(params, new com.wcohen.ss.Level2Levenstein());
}

View File

@ -1,15 +1,17 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("levenstein")
public class Levenstein extends AbstractStringComparator {
public Levenstein(Map<String,String> params){
public Levenstein(Map<String, String> params) {
// Hands the parameters to the superclass together with a com.wcohen.ss Levenstein instance
// as the string-distance algorithm.
super(params, new com.wcohen.ss.Levenstein());
}

View File

@ -1,20 +1,23 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("levensteinTitle")
public class LevensteinTitle extends AbstractStringComparator {
private static final Log log = LogFactory.getLog(LevensteinTitle.class);
public LevensteinTitle(Map<String,String> params){
public LevensteinTitle(Map<String, String> params) {
// Hands the parameters to the superclass together with a com.wcohen.ss Levenstein instance
// as the string-distance algorithm.
super(params, new com.wcohen.ss.Levenstein());
}
@ -33,7 +36,8 @@ public class LevensteinTitle extends AbstractStringComparator {
final boolean check = checkNumbers(ca, cb);
if (check) return 0.5;
if (check)
return 0.5;
return normalize(ssalgo.score(ca, cb), ca.length(), cb.length());
}

View File

@ -1,19 +1,21 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
* Compares two titles, ignoring version numbers. Suitable for Software entities.
*/
@ComparatorClass("levensteinTitleIgnoreVersion")
public class LevensteinTitleIgnoreVersion extends AbstractStringComparator {
public LevensteinTitleIgnoreVersion(Map<String,String> params){
public LevensteinTitleIgnoreVersion(Map<String, String> params) {
// Hands the parameters to the superclass together with a com.wcohen.ss Levenstein instance
// as the string-distance algorithm.
super(params, new com.wcohen.ss.Levenstein());
}

View File

@ -1,13 +1,14 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
package eu.dnetlib.pace.tree;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
* The Class Contains match
*
@ -16,51 +17,50 @@ import java.util.stream.Collectors;
@ComparatorClass("listContainsMatch")
public class ListContainsMatch extends AbstractListComparator {
private Map<String, String> params;
private boolean CASE_SENSITIVE;
private String STRING;
private String AGGREGATOR;
private Map<String, String> params;
private boolean CASE_SENSITIVE;
private String STRING;
private String AGGREGATOR;
public ListContainsMatch(Map<String, String> params) {
super(params);
this.params = params;
public ListContainsMatch(Map<String, String> params) {
super(params);
this.params = params;
//read parameters
CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
STRING = params.get("string");
AGGREGATOR = params.get("bool");
}
// read parameters
CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
STRING = params.get("string");
AGGREGATOR = params.get("bool");
}
@Override
public double compare(List<String> sa, List<String> sb, Config conf) {
if (sa.isEmpty() || sb.isEmpty()) {
return -1;
}
@Override
public double compare(List<String> sa, List<String> sb, Config conf) {
if (sa.isEmpty() || sb.isEmpty()) {
return -1;
}
if (!CASE_SENSITIVE) {
sa = sa.stream().map(String::toLowerCase).collect(Collectors.toList());
sb = sb.stream().map(String::toLowerCase).collect(Collectors.toList());
STRING = STRING.toLowerCase();
}
if (!CASE_SENSITIVE) {
sa = sa.stream().map(String::toLowerCase).collect(Collectors.toList());
sb = sb.stream().map(String::toLowerCase).collect(Collectors.toList());
STRING = STRING.toLowerCase();
}
switch(AGGREGATOR) {
case "AND":
if(sa.contains(STRING) && sb.contains(STRING))
return 1.0;
break;
case "OR":
if(sa.contains(STRING) || sb.contains(STRING))
return 1.0;
break;
case "XOR":
if(sa.contains(STRING) ^ sb.contains(STRING))
return 1.0;
break;
default:
return 0.0;
}
return 0.0;
switch (AGGREGATOR) {
case "AND":
if (sa.contains(STRING) && sb.contains(STRING))
return 1.0;
break;
case "OR":
if (sa.contains(STRING) || sb.contains(STRING))
return 1.0;
break;
case "XOR":
if (sa.contains(STRING) ^ sb.contains(STRING))
return 1.0;
break;
default:
return 0.0;
}
return 0.0;
}
}
}

View File

@ -1,16 +1,18 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("mustBeDifferent")
public class MustBeDifferent extends AbstractStringComparator {
public MustBeDifferent(Map<String,String> params){
public MustBeDifferent(Map<String, String> params) {
// Hands the parameters to the superclass together with a com.wcohen.ss Levenstein instance.
// NOTE(review): whether this comparator's own distance logic actually uses that algorithm is
// not visible from here - confirm.
super(params, new com.wcohen.ss.Levenstein());
}

View File

@ -1,11 +1,12 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
* Not all fields of a document need to participate in the compare measure. We model those fields as having a
* NullDistanceAlgo.
@ -13,7 +14,7 @@ import java.util.Map;
@ComparatorClass("null")
public class NullDistanceAlgo<T> implements Comparator<T> {
public NullDistanceAlgo(Map<String, String> params){
public NullDistanceAlgo(Map<String, String> params) {
// The params argument is ignored and nothing is initialised.
// NOTE(review): presumably the signature exists so this class can be constructed the same way
// as the other comparators - confirm against the code that instantiates comparators.
}
@Override

View File

@ -1,34 +1,35 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("numbersComparator")
public class NumbersComparator extends AbstractStringComparator {
Map<String, String> params;
Map<String, String> params;
public NumbersComparator(Map<String, String> params) {
super(params);
this.params = params;
}
/**
 * Creates the comparator from its configuration parameters.
 * The map is passed to the superclass and a reference to it is also kept in {@code this.params}.
 *
 * @param params the comparator parameters
 */
public NumbersComparator(Map<String, String> params) {
super(params);
this.params = params;
}
@Override
public double distance(String a, String b, Config conf) {
@Override
public double distance(String a, String b, Config conf) {
//extracts numbers from the field
String numbers1 = getNumbers(nfd(a));
String numbers2 = getNumbers(nfd(b));
// extracts numbers from the field
String numbers1 = getNumbers(nfd(a));
String numbers2 = getNumbers(nfd(b));
if (numbers1.isEmpty() || numbers2.isEmpty())
return -1.0;
if (numbers1.isEmpty() || numbers2.isEmpty())
return -1.0;
int n1 = Integer.parseInt(numbers1);
int n2 = Integer.parseInt(numbers2);
int n1 = Integer.parseInt(numbers1);
int n2 = Integer.parseInt(numbers2);
return Math.abs(n1 - n2);
}
return Math.abs(n1 - n2);
}
}

View File

@ -1,36 +1,36 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("numbersMatch")
public class NumbersMatch extends AbstractStringComparator {
public NumbersMatch(Map<String, String> params) {
super(params);
}
/**
 * Creates the comparator; the parameters are only forwarded to the superclass constructor.
 *
 * @param params the comparator parameters
 */
public NumbersMatch(Map<String, String> params) {
super(params);
}
@Override
public double distance(String a, String b, Config conf) {
@Override
public double distance(String a, String b, Config conf) {
// extracts numbers from the field
String numbers1 = getNumbers(nfd(a));
String numbers2 = getNumbers(nfd(b));
//extracts numbers from the field
String numbers1 = getNumbers(nfd(a));
String numbers2 = getNumbers(nfd(b));
if (numbers1.isEmpty() && numbers2.isEmpty())
return 1.0;
if (numbers1.isEmpty() && numbers2.isEmpty())
return 1.0;
if (numbers1.isEmpty() || numbers2.isEmpty())
return -1.0;
if (numbers1.isEmpty() || numbers2.isEmpty())
return -1.0;
if (numbers1.equals(numbers2))
return 1.0;
if (numbers1.equals(numbers2))
return 1.0;
return 0.0;
}
}
return 0.0;
}
}

View File

@ -1,36 +1,36 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("romansMatch")
public class RomansMatch extends AbstractStringComparator {
public RomansMatch(Map<String, String> params) {
super(params);
}
/**
 * Creates the comparator; the parameters are only forwarded to the superclass constructor.
 *
 * @param params the comparator parameters
 */
public RomansMatch(Map<String, String> params) {
super(params);
}
@Override
public double distance(String a, String b, Config conf) {
@Override
public double distance(String a, String b, Config conf) {
// extracts romans from the field
String romans1 = getRomans(nfd(a));
String romans2 = getRomans(nfd(b));
//extracts romans from the field
String romans1 = getRomans(nfd(a));
String romans2 = getRomans(nfd(b));
if (romans1.isEmpty() && romans2.isEmpty())
return 1.0;
if (romans1.isEmpty() && romans2.isEmpty())
return 1.0;
if (romans1.isEmpty() || romans2.isEmpty())
return -1.0;
if (romans1.isEmpty() || romans2.isEmpty())
return -1.0;
if (romans1.equals(romans2))
return 1.0;
if (romans1.equals(romans2))
return 1.0;
return 0.0;
}
return 0.0;
}
}

View File

@ -1,13 +1,15 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
package eu.dnetlib.pace.tree;
import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
* Returns true if the number of values in the fields is the same.
*
@ -16,23 +18,23 @@ import java.util.Map;
@ComparatorClass("sizeMatch")
public class SizeMatch extends AbstractListComparator {
/**
* Instantiates a new size match.
*
* @param params
* the parameters
*/
public SizeMatch(final Map<String, String> params) {
super(params);
}
/**
 * Creates a size-match comparator; the parameters are only forwarded to the superclass constructor.
 *
 * @param params the comparator parameters
 */
public SizeMatch(final Map<String, String> params) {
super(params);
}
@Override
public double compare(final List<String> a, final List<String> b, final Config conf) {
@Override
public double compare(final List<String> a, final List<String> b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1.0;
if (a.isEmpty() || b.isEmpty())
return -1.0;
return a.size() == b.size() ? 1.0 : 0.0;
}
return a.size() == b.size() ? 1.0 : 0.0;
}
}

View File

@ -1,18 +1,20 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
* The Class SortedJaroWinkler.
*/
@ComparatorClass("sortedJaroWinkler")
public class SortedJaroWinkler extends AbstractSortedComparator {
public SortedJaroWinkler(Map<String,String> params){
public SortedJaroWinkler(Map<String, String> params) {
    // Score with the Jaro-Winkler metric this comparator is named after. A Levenstein instance
    // was passed here, which made the parameter-driven constructor compute a different metric
    // from the one the class advertises.
    super(params, new com.wcohen.ss.JaroWinkler());
}
@ -40,7 +42,6 @@ public class SortedJaroWinkler extends AbstractSortedComparator {
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
*/
@Override
@ -50,7 +51,6 @@ public class SortedJaroWinkler extends AbstractSortedComparator {
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
*/
@Override

View File

@ -1,11 +1,13 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
* The Class SortedLevel2JaroWinkler.
*/
@ -22,7 +24,7 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator {
super(weight, new com.wcohen.ss.Level2JaroWinkler());
}
public SortedLevel2JaroWinkler(final Map<String, String> params){
public SortedLevel2JaroWinkler(final Map<String, String> params) {
// Hands the parameters to the superclass together with a com.wcohen.ss Level2JaroWinkler
// instance as the string-distance algorithm.
super(params, new com.wcohen.ss.Level2JaroWinkler());
}
@ -40,7 +42,6 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator {
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
*/
@Override
@ -50,7 +51,6 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator {
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
*/
@Override

View File

@ -1,12 +1,13 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
* The Class Contains match
*
@ -15,50 +16,50 @@ import java.util.Map;
@ComparatorClass("stringContainsMatch")
public class StringContainsMatch extends AbstractStringComparator {
private Map<String, String> params;
private Map<String, String> params;
private boolean CASE_SENSITIVE;
private String STRING;
private String AGGREGATOR;
private boolean CASE_SENSITIVE;
private String STRING;
private String AGGREGATOR;
public StringContainsMatch(Map<String, String> params) {
super(params);
this.params = params;
public StringContainsMatch(Map<String, String> params) {
super(params);
this.params = params;
//read parameters
CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
STRING = params.get("string");
AGGREGATOR = params.get("aggregator");
// read parameters
CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
STRING = params.get("string");
AGGREGATOR = params.get("aggregator");
}
}
@Override
public double distance(final String a, final String b, final Config conf) {
@Override
public double distance(final String a, final String b, final Config conf) {
String ca = a;
String cb = b;
if (!CASE_SENSITIVE) {
ca = a.toLowerCase();
cb = b.toLowerCase();
STRING = STRING.toLowerCase();
}
String ca = a;
String cb = b;
if (!CASE_SENSITIVE) {
ca = a.toLowerCase();
cb = b.toLowerCase();
STRING = STRING.toLowerCase();
}
switch(AGGREGATOR) {
case "AND":
if(ca.contains(STRING) && cb.contains(STRING))
return 1.0;
break;
case "OR":
if(ca.contains(STRING) || cb.contains(STRING))
return 1.0;
break;
case "XOR":
if(ca.contains(STRING) ^ cb.contains(STRING))
return 1.0;
break;
default:
return 0.0;
}
return 0.0;
}
switch (AGGREGATOR) {
case "AND":
if (ca.contains(STRING) && cb.contains(STRING))
return 1.0;
break;
case "OR":
if (ca.contains(STRING) || cb.contains(STRING))
return 1.0;
break;
case "XOR":
if (ca.contains(STRING) ^ cb.contains(STRING))
return 1.0;
break;
default:
return 0.0;
}
return 0.0;
}
}

View File

@ -1,53 +1,56 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
package eu.dnetlib.pace.tree;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("stringListMatch")
public class StringListMatch extends AbstractListComparator {
private static final Log log = LogFactory.getLog(StringListMatch.class);
private Map<String, String> params;
private static final Log log = LogFactory.getLog(StringListMatch.class);
private Map<String, String> params;
final private String TYPE; //percentage or count
final private String TYPE; // percentage or count
public StringListMatch(final Map<String, String> params) {
super(params);
this.params = params;
public StringListMatch(final Map<String, String> params) {
super(params);
this.params = params;
TYPE = params.getOrDefault("type", "percentage");
}
TYPE = params.getOrDefault("type", "percentage");
}
@Override
public double compare(final List<String> a, final List<String> b, final Config conf) {
@Override
public double compare(final List<String> a, final List<String> b, final Config conf) {
final Set<String> pa = new HashSet<>(a);
final Set<String> pb = new HashSet<>(b);
final Set<String> pa = new HashSet<>(a);
final Set<String> pb = new HashSet<>(b);
if (pa.isEmpty() || pb.isEmpty()) {
return -1; //return undefined if one of the two lists is empty
}
if (pa.isEmpty() || pb.isEmpty()) {
return -1; // return undefined if one of the two lists is empty
}
int incommon = Sets.intersection(pa, pb).size();
int simDiff = Sets.symmetricDifference(pa, pb).size();
int incommon = Sets.intersection(pa, pb).size();
int simDiff = Sets.symmetricDifference(pa, pb).size();
if (incommon + simDiff == 0) {
return 0.0;
}
if (incommon + simDiff == 0) {
return 0.0;
}
if(TYPE.equals("percentage"))
return (double)incommon / (incommon + simDiff);
else
return incommon;
if (TYPE.equals("percentage"))
return (double) incommon / (incommon + simDiff);
else
return incommon;
}
}
}
}

View File

@ -1,12 +1,15 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang3.StringUtils;
import java.util.Map;
/**
* The Class SubStringLevenstein.
@ -14,76 +17,74 @@ import java.util.Map;
@ComparatorClass("subStringLevenstein")
public class SubStringLevenstein extends AbstractStringComparator {
/**
* The limit.
*/
protected int limit;
/**
 * Maximum number of leading characters of each input string that take part in the comparison.
 */
protected int limit;
/**
* Instantiates a new sub string levenstein.
*
* @param w the w
*/
public SubStringLevenstein(final double w) {
super(w, new com.wcohen.ss.Levenstein());
}
/**
 * Creates a comparator with the given weight, scoring with a com.wcohen.ss Levenstein instance.
 * This constructor does not assign {@code limit}, so it keeps its default value of 0.
 * NOTE(review): with a limit of 0 the truncated inputs are presumably empty - confirm whether
 * callers should use the {@code (w, limit)} constructor instead.
 *
 * @param w the comparator weight
 */
public SubStringLevenstein(final double w) {
super(w, new com.wcohen.ss.Levenstein());
}
public SubStringLevenstein(Map<String, String> params) {
super(params, new com.wcohen.ss.Levenstein());
this.limit = Integer.parseInt(params.getOrDefault("limit", "1"));
}
/**
 * Creates a comparator from configuration parameters, scoring with a com.wcohen.ss Levenstein instance.
 * The prefix length is read from the {@code "limit"} entry and defaults to 1 when the entry is absent.
 *
 * @param params the comparator parameters
 * @throws NumberFormatException if the {@code "limit"} entry is not a parsable integer
 */
public SubStringLevenstein(Map<String, String> params) {
super(params, new com.wcohen.ss.Levenstein());
this.limit = Integer.parseInt(params.getOrDefault("limit", "1"));
}
/**
* Instantiates a new sub string levenstein.
*
* @param w the w
* @param limit the limit
*/
public SubStringLevenstein(final double w, final int limit) {
super(w, new com.wcohen.ss.Levenstein());
this.limit = limit;
}
/**
 * Creates a comparator with the given weight and prefix length, scoring with a
 * com.wcohen.ss Levenstein instance.
 *
 * @param w the comparator weight
 * @param limit the number of leading characters of each input to compare
 */
public SubStringLevenstein(final double w, final int limit) {
super(w, new com.wcohen.ss.Levenstein());
this.limit = limit;
}
/**
* Instantiates a new sub string levenstein.
*
* @param w the w
* @param limit the limit
* @param ssalgo the ssalgo
*/
protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
super(w, ssalgo);
this.limit = limit;
}
/**
 * Creates a comparator with the given weight and prefix length, scoring with a
 * caller-supplied string-distance algorithm.
 *
 * @param w the comparator weight
 * @param limit the number of leading characters of each input to compare
 * @param ssalgo the string-distance implementation, forwarded to the superclass
 */
protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
super(w, ssalgo);
this.limit = limit;
}
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
*/
@Override
public double distance(final String a, final String b, final Config conf) {
return distance(StringUtils.left(a, limit), StringUtils.left(b, limit), conf);
}
/**
 * Compares only the first {@code limit} characters of each input.
 * Both strings are truncated with {@code StringUtils.left} and the prefixes are scored by the
 * inherited {@code distance} implementation.
 *
 * @param a the first value
 * @param b the second value
 * @param conf the configuration, forwarded unchanged
 * @return the score the superclass computes for the two prefixes
 */
@Override
public double distance(final String a, final String b, final Config conf) {
    // Must be super.distance: an unqualified distance(String, String, Config) call resolves
    // to this very override and recurses until the stack overflows.
    return super.distance(StringUtils.left(a, limit), StringUtils.left(b, limit), conf);
}
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
*/
@Override
public double getWeight() {
return super.weight;
}
/**
 * Returns the weight of this comparator.
 *
 * @return the value of the {@code weight} field inherited from the superclass
 */
@Override
public double getWeight() {
return super.weight;
}
/*
* (non-Javadoc)
*
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
*/
@Override
protected double normalize(final double d) {
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
}
/**
 * Dampens a raw score: the result is 1 for a score of 0 and decreases as the magnitude of
 * the score grows.
 *
 * @param d the raw score; only its absolute value is used
 * @return {@code 1 / (|d| + 1)^0.1}
 */
@Override
protected double normalize(final double d) {
    final double shifted = Math.abs(d) + 1;
    final double damped = Math.pow(shifted, 0.1);
    return 1 / damped;
}
}

View File

@ -1,11 +1,12 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
* Returns true if the titles in the given documents contain the same numbers, false otherwise.
*
@ -15,24 +16,24 @@ import java.util.Map;
@ComparatorClass("titleVersionMatch")
public class TitleVersionMatch extends AbstractStringComparator {
public TitleVersionMatch(final Map<String, String> params) {
super(params);
}
/**
 * Creates the comparator; the parameters are only forwarded to the superclass constructor.
 *
 * @param params the comparator parameters
 */
public TitleVersionMatch(final Map<String, String> params) {
super(params);
}
@Override
public double compare(final String valueA, final String valueB, final Config conf) {
if (valueA.isEmpty() || valueB.isEmpty())
return -1;
@Override
public double compare(final String valueA, final String valueB, final Config conf) {
if (valueA.isEmpty() || valueB.isEmpty())
return -1;
return notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 1 : 0;
}
return notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 1 : 0;
}
@Override
public String toString() {
return getClass().getSimpleName() + ":" + super.toString();
}
/**
 * Builds a label made of this object's simple class name, a colon, and the superclass's
 * {@code toString()} output.
 *
 * @return the label, e.g. {@code TitleVersionMatch:<super.toString()>}
 */
@Override
public String toString() {
    final String simpleName = getClass().getSimpleName();
    final String inherited = super.toString();
    return simpleName + ":" + inherited;
}
protected String toString(final Object object) {
return toFirstString(object);
}
}
/**
 * Converts an arbitrary object to a string by delegating to {@code toFirstString}.
 * NOTE(review): {@code toFirstString} is not defined in the visible part of this class and is
 * presumably inherited - confirm what it returns for collections and for null.
 *
 * @param object the value to convert
 * @return whatever {@code toFirstString(object)} returns
 */
protected String toString(final Object object) {
return toFirstString(object);
}
}

View File

@ -1,61 +1,63 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang3.StringUtils;
package eu.dnetlib.pace.tree;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("urlMatcher")
public class UrlMatcher extends Levenstein {
private Map<String, String> params;
private Map<String, String> params;
public UrlMatcher(Map<String, String> params){
super(params);
this.params = params;
}
public UrlMatcher(Map<String, String> params) {
super(params);
this.params = params;
}
public UrlMatcher(double weight, Map<String, String> params) {
super(weight);
this.params = params;
}
public UrlMatcher(double weight, Map<String, String> params) {
super(weight);
this.params = params;
}
public void setParams(Map<String, String> params) {
this.params = params;
}
public void setParams(Map<String, String> params) {
this.params = params;
}
@Override
public double distance(String a, String b, final Config conf) {
final URL urlA = asUrl(a);
final URL urlB = asUrl(b);
@Override
public double distance(String a, String b, final Config conf) {
final URL urlA = asUrl(a);
final URL urlB = asUrl(b);
if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
return 0.0;
}
if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
return 0.0;
}
Double hostW = Double.parseDouble(params.getOrDefault("host", "0.5"));
Double pathW = Double.parseDouble(params.getOrDefault("path", "0.5"));
Double hostW = Double.parseDouble(params.getOrDefault("host", "0.5"));
Double pathW = Double.parseDouble(params.getOrDefault("path", "0.5"));
if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) {
return hostW * 0.5;
}
if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) {
return hostW * 0.5;
}
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf);
}
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf);
}
private URL asUrl(final String value) {
try {
return new URL(value);
} catch (MalformedURLException e) {
// should not happen as checked by pace typing
throw new IllegalStateException("invalid URL: " + value);
}
}
/**
 * Parses the given value into a {@link URL}.
 *
 * @param value the textual URL to parse
 * @return the parsed URL
 * @throws IllegalStateException if the value is not a well-formed URL; the original
 *         {@link MalformedURLException} is attached as the cause
 */
private URL asUrl(final String value) {
	try {
		return new URL(value);
	} catch (MalformedURLException e) {
		// should not happen as checked by pace typing; keep the cause so the parse failure stays diagnosable
		throw new IllegalStateException("invalid URL: " + value, e);
	}
}
protected String toString(final Object object) {
return toFirstString(object);
}
protected String toString(final Object object) {
return toFirstString(object);
}
}

View File

@ -1,11 +1,13 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang3.StringUtils;
import java.util.Map;
/**
* Returns true if the years of the date fields in the given documents are the same, false when either of the two is invalid or missing.
@ -15,36 +17,36 @@ import java.util.Map;
@ComparatorClass("yearMatch")
public class YearMatch extends AbstractStringComparator {
private int limit = 4;
private int limit = 4;
public YearMatch(final Map<String, String> params) {
super(params);
}
public YearMatch(final Map<String, String> params) {
super(params);
}
@Override
public double compare(final String a, final String b, final Config conf) {
final String valueA = getNumbers(getFirstValue(a));
final String valueB = getNumbers(getFirstValue(b));
@Override
public double compare(final String a, final String b, final Config conf) {
final String valueA = getNumbers(getFirstValue(a));
final String valueB = getNumbers(getFirstValue(b));
if (valueA.isEmpty() || valueB.isEmpty())
return -1;
if (valueA.isEmpty() || valueB.isEmpty())
return -1;
final boolean lengthMatch = checkLength(valueA) && checkLength(valueB);
final boolean onemissing = valueA.isEmpty() || valueB.isEmpty();
final boolean lengthMatch = checkLength(valueA) && checkLength(valueB);
final boolean onemissing = valueA.isEmpty() || valueB.isEmpty();
return lengthMatch && valueA.equals(valueB) || onemissing ? 1 : 0;
}
return lengthMatch && valueA.equals(valueB) || onemissing ? 1 : 0;
}
protected boolean checkLength(final String s) {
return s.length() == limit;
}
/**
 * Tells whether the given string is exactly {@code limit} characters long.
 *
 * @param s the string to measure
 * @return true when the length of {@code s} equals {@code limit}
 */
protected boolean checkLength(final String s) {
	final int actualLength = s.length();
	return limit == actualLength;
}
protected String getFirstValue(final String value) {
return (value != null) && !value.isEmpty() ? StringUtils.left(value, limit) : "";
}
/**
 * Extracts the leading {@code limit} characters of the given value.
 *
 * @param value the raw value, possibly null or empty
 * @return the leftmost {@code limit} characters, or the empty string when the value is null or empty
 */
protected String getFirstValue(final String value) {
	if (value == null || value.isEmpty()) {
		return "";
	}
	return StringUtils.left(value, limit);
}
@Override
public String toString() {
return getClass().getSimpleName() + ":" + super.toString();
}
}
/**
 * Renders this comparator as its simple class name, a colon, and the superclass description.
 */
@Override
public String toString() {
	final String simpleName = getClass().getSimpleName();
	return simpleName + ":" + super.toString();
}
}

View File

@ -1,130 +1,131 @@
package eu.dnetlib.pace.tree.support;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
package eu.dnetlib.pace.tree.support;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
public abstract class AbstractComparator<T> extends AbstractPaceFunctions implements Comparator<T> {
/** The ssalgo. */
protected AbstractStringDistance ssalgo;
/** The ssalgo. */
protected AbstractStringDistance ssalgo;
/** The weight. */
protected double weight = 0.0;
/** The weight. */
protected double weight = 0.0;
private Map<String, String> params;
private Map<String, String> params;
protected AbstractComparator(Map<String, String> params) {
this.params = params;
}
protected AbstractComparator(Map<String, String> params) {
this.params = params;
}
protected AbstractComparator(Map<String, String> params, final AbstractStringDistance ssalgo){
this.params = params;
this.weight = 1.0;
this.ssalgo = ssalgo;
}
/**
 * Creates a comparator from its parameters and the string distance algorithm to delegate to.
 *
 * @param params the comparator parameters, stored as given
 * @param ssalgo the string distance algorithm used to compute scores
 */
protected AbstractComparator(Map<String, String> params, final AbstractStringDistance ssalgo) {
this.params = params;
// this constructor fixes the weight to 1.0; it is not read from params
this.weight = 1.0;
this.ssalgo = ssalgo;
}
/**
* Instantiates a new second string compare algo.
*
* @param weight
* the weight
* @param ssalgo
* the ssalgo
*/
protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) {
this.ssalgo = ssalgo;
this.weight = weight;
}
/**
 * Creates a comparator backed by the given string distance algorithm, with an explicit weight.
 * The params map is not assigned by this constructor.
 *
 * @param weight
 *            the weight of this comparator
 * @param ssalgo
 *            the string distance algorithm used to compute scores
 */
protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) {
this.ssalgo = ssalgo;
this.weight = weight;
}
protected AbstractComparator(final AbstractStringDistance ssalgo){
this.ssalgo = ssalgo;
}
protected AbstractComparator(final AbstractStringDistance ssalgo) {
this.ssalgo = ssalgo;
}
/**
* Normalize.
*
* @param d
* the d
* @return the double
*/
protected double normalize(double d) {
return d;
}
/**
 * Post-processes a raw score before it is returned. This implementation returns the score unchanged.
 *
 * @param d
 *            the raw score
 * @return the same value that was passed in
 */
protected double normalize(double d) {
return d;
}
/**
* Distance.
*
* @param a
* the a
* @param b
* the b
* @return the double
*/
/**
* Distance.
*
* @param a
* the a
* @param b
* the b
* @return the double
*/
protected double distance(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) {
return -1; //return -1 if a field is missing
}
double score = ssalgo.score(a, b);
return normalize(score);
}
/**
 * Scores two strings with the configured string distance algorithm.
 *
 * @param a the first value
 * @param b the second value
 * @param conf the configuration (not used by this implementation)
 * @return -1 when either value is empty, otherwise the normalized score
 */
protected double distance(final String a, final String b, final Config conf) {
	final boolean bothPresent = !a.isEmpty() && !b.isEmpty();
	// -1 signals that a field is missing
	return bothPresent ? normalize(ssalgo.score(a, b)) : -1;
}
protected double compare(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1;
return distance(a, b, conf);
}
/**
 * Compares two strings, short-circuiting to -1 when either of them is empty.
 *
 * @param a the first value
 * @param b the second value
 * @param conf the configuration, forwarded to {@code distance}
 * @return -1 when either value is empty, otherwise the result of {@code distance}
 */
protected double compare(final String a, final String b, final Config conf) {
	return (a.isEmpty() || b.isEmpty()) ? -1 : distance(a, b, conf);
}
/**
* Convert the given argument to a List of Strings
*
* @param object
* function argument
* @return the list
*/
protected List<String> toList(final Object object) {
if (object instanceof List) {
return (List<String>)object;
}
/**
* Convert the given argument to a List of Strings
*
* @param object
* function argument
* @return the list
*/
protected List<String> toList(final Object object) {
if (object instanceof List) {
return (List<String>) object;
}
return Lists.newArrayList(object.toString());
}
return Lists.newArrayList(object.toString());
}
/**
* Convert the given argument to a String
*
* @param object
* function argument
* @return the list
*/
protected String toString(final Object object) {
if (object instanceof List) {
List<String> l = (List<String>) object;
return Joiner.on(" ").join(l);
}
/**
* Convert the given argument to a String
*
* @param object
* function argument
* @return the string form of the argument (list elements joined by a single space)
*/
protected String toString(final Object object) {
if (object instanceof List) {
List<String> l = (List<String>) object;
return Joiner.on(" ").join(l);
}
return object.toString();
}
return object.toString();
}
protected String toFirstString(final Object object) {
if (object instanceof List) {
List<String> l = (List<String>) object;
return l.isEmpty() ? "" : l.get(0);
}
protected String toFirstString(final Object object) {
if (object instanceof List) {
List<String> l = (List<String>) object;
return l.isEmpty() ? "" : l.get(0);
}
return object.toString();
}
return object.toString();
}
public double getWeight(){
return this.weight;
}
public double getWeight() {
return this.weight;
}
}

View File

@ -1,39 +1,41 @@
package eu.dnetlib.pace.tree.support;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.Type;
package eu.dnetlib.pace.tree.support;
import java.util.List;
import java.util.Map;
abstract public class AbstractListComparator extends AbstractComparator<List<String>>{
protected AbstractListComparator(Map<String, String> params) {
super(params);
}
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
protected AbstractListComparator(Map<String, String> params, AbstractStringDistance ssalgo) {
super(params, ssalgo);
}
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.Type;
protected AbstractListComparator(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
abstract public class AbstractListComparator extends AbstractComparator<List<String>> {
protected AbstractListComparator(Map<String, String> params) {
super(params);
}
protected AbstractListComparator(AbstractStringDistance ssalgo) {
super(ssalgo);
}
protected AbstractListComparator(Map<String, String> params, AbstractStringDistance ssalgo) {
super(params, ssalgo);
}
@Override
public double compare(Object a, Object b, Config conf) {
return compare(toList(a), toList(b), conf);
}
protected AbstractListComparator(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
public double compare(final List<String> a, final List<String> b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1;
protected AbstractListComparator(AbstractStringDistance ssalgo) {
super(ssalgo);
}
return distance(concat(a), concat(b), conf);
}
/**
 * Converts both arguments to string lists and delegates to the list-based comparison.
 */
@Override
public double compare(Object a, Object b, Config conf) {
	final List<String> left = toList(a);
	final List<String> right = toList(b);
	return compare(left, right, conf);
}
/**
 * Compares two lists of strings by concatenating each list and scoring the results.
 *
 * @param a the first list of values
 * @param b the second list of values
 * @param conf the configuration, forwarded to {@code distance}
 * @return -1 when either list is empty, otherwise the distance between the concatenated lists
 */
public double compare(final List<String> a, final List<String> b, final Config conf) {
	final boolean bothPresent = !a.isEmpty() && !b.isEmpty();
	if (bothPresent) {
		return distance(concat(a), concat(b), conf);
	}
	return -1;
}
}

View File

@ -1,40 +1,41 @@
package eu.dnetlib.pace.tree.support;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
package eu.dnetlib.pace.tree.support;
import java.util.AbstractList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
public abstract class AbstractSortedComparator extends AbstractListComparator {
/**
* Instantiates a new sorted second string compare algo.
*
* @param weight
* the weight
* @param ssalgo
* the ssalgo
*/
protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
/**
 * Creates a sorted comparator with an explicit weight; both arguments are forwarded to the superclass.
 *
 * @param weight
 *            the weight of this comparator
 * @param ssalgo
 *            the string distance algorithm used to compute scores
 */
protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
protected AbstractSortedComparator(final Map<String, String> params, final AbstractStringDistance ssalgo){
super(Double.parseDouble(params.get("weight")), ssalgo);
}
/**
 * Creates a sorted comparator whose weight is read from the {@code "weight"} entry of {@code params}.
 *
 * @param params comparator parameters; only the {@code "weight"} entry is read here
 * @param ssalgo the string distance algorithm, forwarded to the superclass
 * @throws NullPointerException if {@code params} is null or has no {@code "weight"} entry
 * @throws NumberFormatException if the {@code "weight"} entry is not a parsable double
 */
protected AbstractSortedComparator(final Map<String, String> params, final AbstractStringDistance ssalgo) {
// the params map itself is not forwarded: the (weight, ssalgo) super constructor is the one invoked
super(Double.parseDouble(params.get("weight")), ssalgo);
}
@Override
protected List<String> toList(final Object object) {
if (object instanceof List) {
List<String> fl = (List<String>) object;
List<String> values = Lists.newArrayList(fl);
Collections.sort(values);
return values;
}
@Override
protected List<String> toList(final Object object) {
if (object instanceof List) {
List<String> fl = (List<String>) object;
List<String> values = Lists.newArrayList(fl);
Collections.sort(values);
return values;
}
return Lists.newArrayList(object.toString());
}
return Lists.newArrayList(object.toString());
}
}

View File

@ -1,44 +1,46 @@
package eu.dnetlib.pace.tree.support;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
package eu.dnetlib.pace.tree.support;
import java.util.Map;
public abstract class AbstractStringComparator extends AbstractComparator<String>{
protected AbstractStringComparator(Map<String, String> params) {
super(params);
}
import com.wcohen.ss.AbstractStringDistance;
protected AbstractStringComparator(Map<String, String> params, AbstractStringDistance ssalgo) {
super(params, ssalgo);
}
import eu.dnetlib.pace.config.Config;
protected AbstractStringComparator(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
public abstract class AbstractStringComparator extends AbstractComparator<String> {
protected AbstractStringComparator(Map<String, String> params) {
super(params);
}
protected AbstractStringComparator(AbstractStringDistance ssalgo) {
super(ssalgo);
}
protected AbstractStringComparator(Map<String, String> params, AbstractStringDistance ssalgo) {
super(params, ssalgo);
}
public double distance(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) {
return -1; //return -1 if a field is missing
}
double score = ssalgo.score(a, b);
return normalize(score);
}
protected AbstractStringComparator(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double compare(Object a, Object b, Config conf) {
return compare(toString(a), toString(b), conf);
}
protected AbstractStringComparator(AbstractStringDistance ssalgo) {
super(ssalgo);
}
public double compare(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1;
return distance(a, b, conf);
}
/**
 * Scores two strings with the configured string distance algorithm.
 *
 * @param a the first value
 * @param b the second value
 * @param conf the configuration (not used by this implementation)
 * @return -1 when either value is empty, otherwise the normalized score
 */
public double distance(final String a, final String b, final Config conf) {
	final boolean fieldMissing = a.isEmpty() || b.isEmpty();
	if (!fieldMissing) {
		return normalize(ssalgo.score(a, b));
	}
	// -1 signals that a field is missing
	return -1;
}
/**
 * Converts both arguments to strings and delegates to the string-based comparison.
 */
@Override
public double compare(Object a, Object b, Config conf) {
	final String left = toString(a);
	final String right = toString(b);
	return compare(left, right, conf);
}
/**
 * Compares two strings, short-circuiting to -1 when either of them is empty.
 *
 * @param a the first value
 * @param b the second value
 * @param conf the configuration, forwarded to {@code distance}
 * @return -1 when either value is empty, otherwise the result of {@code distance}
 */
public double compare(final String a, final String b, final Config conf) {
	return (a.isEmpty() || b.isEmpty()) ? -1 : distance(a, b, conf);
}
}

View File

@ -1,24 +1,21 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException;
public enum AggType {
W_MEAN, //weighted mean
AVG, //average
SUM,
MAX,
MIN,
AND, //used for necessary conditions
OR; //used for sufficient conditions
W_MEAN, // weighted mean
AVG, // average
SUM, MAX, MIN, AND, // used for necessary conditions
OR; // used for sufficient conditions
public static AggType getEnum(String value) {
public static AggType getEnum(String value) {
try {
return AggType.valueOf(value);
}
catch (IllegalArgumentException e) {
throw new PaceException("Undefined aggregation type", e);
}
}
try {
return AggType.valueOf(value);
} catch (IllegalArgumentException e) {
throw new PaceException("Undefined aggregation type", e);
}
}
}

View File

@ -1,12 +1,12 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.Config;
public interface Comparator<T> {
/*
* return : -1 -> can't decide (i.e. missing field)
* >0 -> similarity degree (depends on the algorithm)
* */
public double compare(Object a, Object b, Config conf);
/*
* return : -1 -> can't decide (e.g. missing field); >= 0 -> similarity degree (depends on the algorithm)
*/
public double compare(Object a, Object b, Config conf);
}

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.tree.support;
import java.lang.annotation.ElementType;
@ -9,5 +10,5 @@ import java.lang.annotation.Target;
@Target(ElementType.TYPE)
public @interface ComparatorClass {
public String value();
public String value();
}

View File

@ -1,82 +1,84 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.util.PaceException;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.util.PaceException;
/**
* The class that defines the configuration of each field in the decision tree.
* */
public class FieldConf implements Serializable {
private String field; //name of the field on which apply the comparator
private String comparator; //comparator name
private double weight = 1.0; //weight for the field (to be used in the aggregation)
private Map<String,String> params; //parameters
private String field; // name of the field on which apply the comparator
private String comparator; // comparator name
private double weight = 1.0; // weight for the field (to be used in the aggregation)
private Map<String, String> params; // parameters
private boolean countIfUndefined;
private boolean countIfUndefined;
public boolean isCountIfUndefined() {
return countIfUndefined;
}
public boolean isCountIfUndefined() {
return countIfUndefined;
}
public void setCountIfUndefined(boolean countIfUndefined) {
this.countIfUndefined = countIfUndefined;
}
public void setCountIfUndefined(boolean countIfUndefined) {
this.countIfUndefined = countIfUndefined;
}
public FieldConf() {
}
public FieldConf() {
}
public FieldConf(String field, String comparator, double weight, Map<String, String> params, boolean countIfUndefined) {
this.field = field;
this.comparator = comparator;
this.weight = weight;
this.params = params;
this.countIfUndefined = countIfUndefined;
}
/**
 * Creates a fully populated field configuration.
 *
 * @param field name of the field on which the comparator is applied
 * @param comparator name of the comparator
 * @param weight weight of the field, to be used in the aggregation
 * @param params comparator parameters, stored as given (no copy is made)
 * @param countIfUndefined value of the countIfUndefined flag
 */
public FieldConf(String field, String comparator, double weight, Map<String, String> params,
boolean countIfUndefined) {
this.field = field;
this.comparator = comparator;
this.weight = weight;
this.params = params;
this.countIfUndefined = countIfUndefined;
}
public String getField() {
return field;
}
public String getField() {
return field;
}
public void setField(String field) {
this.field = field;
}
public void setField(String field) {
this.field = field;
}
public String getComparator() {
return comparator;
}
public String getComparator() {
return comparator;
}
public void setComparator(String comparator) {
this.comparator = comparator;
}
public void setComparator(String comparator) {
this.comparator = comparator;
}
public double getWeight() {
return weight;
}
public double getWeight() {
return weight;
}
public void setWeight(double weight) {
this.weight = weight;
}
public void setWeight(double weight) {
this.weight = weight;
}
public Map<String, String> getParams() {
return params;
}
public Map<String, String> getParams() {
return params;
}
public void setParams(Map<String, String> params) {
this.params = params;
}
public void setParams(Map<String, String> params) {
this.params = params;
}
@Override
public String toString() {
try {
return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) {
throw new PaceException("Impossible to convert to JSON: ", e);
}
}
}
/**
 * Serializes this configuration to JSON using a freshly created Jackson {@code ObjectMapper}.
 *
 * @throws PaceException if the serialization fails
 */
@Override
public String toString() {
	final ObjectMapper mapper = new ObjectMapper();
	try {
		final String json = mapper.writeValueAsString(this);
		return json;
	} catch (IOException e) {
		throw new PaceException("Impossible to convert to JSON: ", e);
	}
}
}

View File

@ -1,87 +1,89 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.util.PaceException;
package eu.dnetlib.pace.tree.support;
import java.io.IOException;
import java.io.Serializable;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.util.PaceException;
/**
* The class that contains the result of each comparison in the decision tree
* */
public class FieldStats implements Serializable {
private double weight; //weight for the field (to be used in the aggregation)
private double threshold; //threshold for the field (to be used in some kind of aggregations)
private double result; //the result of the comparison
private Object a;
private Object b;
private double weight; // weight for the field (to be used in the aggregation)
private double threshold; // threshold for the field (to be used in some kind of aggregations)
private double result; // the result of the comparison
private Object a;
private Object b;
private boolean countIfUndefined;
private boolean countIfUndefined;
public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Object a, Object b) {
this.weight = weight;
this.threshold = threshold;
this.result = result;
this.countIfUndefined = countIfUndefined;
this.a = a;
this.b = b;
}
/**
 * Creates the record of a single field comparison.
 *
 * @param weight weight of the field, to be used in the aggregation
 * @param threshold threshold of the field, to be used in some kinds of aggregation
 * @param result the result of the comparison
 * @param countIfUndefined value of the countIfUndefined flag
 * @param a the first compared value, stored as given
 * @param b the second compared value, stored as given
 */
public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Object a, Object b) {
this.weight = weight;
this.threshold = threshold;
this.result = result;
this.countIfUndefined = countIfUndefined;
this.a = a;
this.b = b;
}
public double getThreshold() {
return threshold;
}
public double getThreshold() {
return threshold;
}
public void setThreshold(double threshold) {
this.threshold = threshold;
}
public void setThreshold(double threshold) {
this.threshold = threshold;
}
public double getWeight() {
return weight;
}
public double getWeight() {
return weight;
}
public void setWeight(double weight) {
this.weight = weight;
}
public void setWeight(double weight) {
this.weight = weight;
}
public double getResult() {
return result;
}
public double getResult() {
return result;
}
public void setResult(double result) {
this.result = result;
}
public void setResult(double result) {
this.result = result;
}
public boolean isCountIfUndefined() {
return countIfUndefined;
}
public boolean isCountIfUndefined() {
return countIfUndefined;
}
public void setCountIfUndefined(boolean countIfUndefined) {
this.countIfUndefined = countIfUndefined;
}
public void setCountIfUndefined(boolean countIfUndefined) {
this.countIfUndefined = countIfUndefined;
}
public Object getA() {
return a;
}
public Object getA() {
return a;
}
public void setA(Object a) {
this.a = a;
}
public void setA(Object a) {
this.a = a;
}
public Object getB() {
return b;
}
public Object getB() {
return b;
}
public void setB(Object b) {
this.b = b;
}
public void setB(Object b) {
this.b = b;
}
@Override
public String toString(){
try {
return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) {
throw new PaceException("Impossible to convert to JSON: ", e);
}
}
/**
 * Serializes these stats to JSON using a freshly created Jackson {@code ObjectMapper}.
 *
 * @throws PaceException if the serialization fails
 */
@Override
public String toString() {
	final ObjectMapper mapper = new ObjectMapper();
	try {
		final String json = mapper.writeValueAsString(this);
		return json;
	} catch (IOException e) {
		throw new PaceException("Impossible to convert to JSON: ", e);
	}
}
}

View File

@ -1,20 +1,19 @@
package eu.dnetlib.pace.tree.support;
public enum MatchType {
MATCH,
NO_MATCH,
UNDEFINED;
MATCH, NO_MATCH, UNDEFINED;
public static MatchType parse(String value) {
public static MatchType parse(String value) {
if (MATCH.name().equals(value)) {
return MATCH;
} else if (NO_MATCH.name().equals(value)) {
return NO_MATCH;
} else {
return UNDEFINED;
}
if (MATCH.name().equals(value)) {
return MATCH;
} else if (NO_MATCH.name().equals(value)) {
return NO_MATCH;
} else {
return UNDEFINED;
}
// try {
// return MatchType.valueOf(value);
@ -22,5 +21,5 @@ public enum MatchType {
// catch (IllegalArgumentException e) {
// return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable
// }
}
}
}

View File

@ -1,166 +1,170 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.util.PaceException;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StringType;
package eu.dnetlib.pace.tree.support;
import java.io.IOException;
import java.io.Serializable;
import java.util.List;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StringType;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.util.PaceException;
public class TreeNodeDef implements Serializable {
final static String CROSS_COMPARE = "crossCompare";
final static String CROSS_COMPARE = "crossCompare";
private List<FieldConf> fields;
private AggType aggregation;
private List<FieldConf> fields;
private AggType aggregation;
private double threshold;
private double threshold;
private String positive;
private String negative;
private String undefined;
private String positive;
private String negative;
private String undefined;
boolean ignoreUndefined;
boolean ignoreUndefined;
public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreUndefined) {
this.fields = fields;
this.aggregation = aggregation;
this.threshold = threshold;
this.positive = positive;
this.negative = negative;
this.undefined = undefined;
this.ignoreUndefined = ignoreUndefined;
}
/**
 * Creates a fully populated node definition; every argument is stored as given.
 *
 * @param fields the field configurations evaluated by this node
 * @param aggregation the aggregation type of the node
 * @param threshold the threshold of the node
 * @param positive value of the positive property (presumably the next step on a positive outcome; confirm against the tree processor)
 * @param negative value of the negative property (presumably the next step on a negative outcome; confirm)
 * @param undefined value of the undefined property (presumably the next step on an undefined outcome; confirm)
 * @param ignoreUndefined value of the ignoreUndefined flag
 */
public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative,
String undefined, boolean ignoreUndefined) {
this.fields = fields;
this.aggregation = aggregation;
this.threshold = threshold;
this.positive = positive;
this.negative = negative;
this.undefined = undefined;
this.ignoreUndefined = ignoreUndefined;
}
public TreeNodeDef() {}
public TreeNodeDef() {
}
//function for the evaluation of the node
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
// function for the evaluation of the node
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
TreeNodeStats stats = new TreeNodeStats();
TreeNodeStats stats = new TreeNodeStats();
//for each field in the node, it computes the
for (FieldConf fieldConf : fields) {
double weight = fieldConf.getWeight();
double result;
// for each field in the node, compute the comparison result and record it in the node stats
for (FieldConf fieldConf : fields) {
double weight = fieldConf.getWeight();
double result;
Object value1 = getJavaValue(doc1,fieldConf.getField());
Object value2 = getJavaValue(doc2,fieldConf.getField());
Object value1 = getJavaValue(doc1, fieldConf.getField());
Object value2 = getJavaValue(doc2, fieldConf.getField());
// if the param specifies a cross comparison (i.e. compare elements from different fields), compute the
// result for both sides and return the maximum
String crossField = fieldConf.getParams().get(CROSS_COMPARE);
if (crossField != null) {
double result1 = comparator(fieldConf).compare(value1, getJavaValue(doc2, crossField), conf);
double result2 = comparator(fieldConf).compare(getJavaValue(doc1, crossField), value2, conf);
result = Math.max(result1, result2);
} else {
result = comparator(fieldConf).compare(value1, value2, conf);
}
//if the param specifies a cross comparison (i.e. compare elements from different fields), compute the result for both sides and return the maximum
String crossField = fieldConf.getParams().get(CROSS_COMPARE);
if (crossField != null) {
double result1 = comparator(fieldConf).compare(value1, getJavaValue(doc2,crossField), conf);
double result2 = comparator(fieldConf).compare(getJavaValue(doc1,crossField), value2, conf);
result = Math.max(result1,result2);
}
else {
result = comparator(fieldConf).compare(value1, value2, conf);
}
stats
.addFieldStats(
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
new FieldStats(
weight,
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")),
result,
fieldConf.isCountIfUndefined(),
value1,
value2));
}
stats.addFieldStats(
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
new FieldStats(
weight,
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")),
result,
fieldConf.isCountIfUndefined(),
value1,
value2
));
}
return stats;
}
return stats;
}
public Object getJavaValue(Row row, String name) {
int pos = row.fieldIndex(name);
if (pos >= 0) {
DataType dt = row.schema().fields()[pos].dataType();
if (dt instanceof StringType) {
return row.getString(pos);
} else if (dt instanceof ArrayType) {
return row.getList(pos);
}
}
public Object getJavaValue(Row row, String name) {
int pos = row.fieldIndex(name);
if (pos >= 0) {
DataType dt = row.schema().fields()[pos].dataType();
if (dt instanceof StringType) {
return row.getString(pos);
} else if (dt instanceof ArrayType) {
return row.getList(pos);
}
}
return null;
}
return null;
}
private Comparator comparator(final FieldConf field) {
private Comparator comparator(final FieldConf field){
return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams());
}
return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams());
}
public List<FieldConf> getFields() {
return fields;
}
public List<FieldConf> getFields() {
return fields;
}
public void setFields(List<FieldConf> fields) {
this.fields = fields;
}
public void setFields(List<FieldConf> fields) {
this.fields = fields;
}
public AggType getAggregation() {
return aggregation;
}
public AggType getAggregation() {
return aggregation;
}
public void setAggregation(AggType aggregation) {
this.aggregation = aggregation;
}
public void setAggregation(AggType aggregation) {
this.aggregation = aggregation;
}
public double getThreshold() {
return threshold;
}
public double getThreshold() {
return threshold;
}
public void setThreshold(double threshold) {
this.threshold = threshold;
}
public void setThreshold(double threshold) {
this.threshold = threshold;
}
public String getPositive() {
return positive;
}
public String getPositive() {
return positive;
}
public void setPositive(String positive) {
this.positive = positive;
}
public void setPositive(String positive) {
this.positive = positive;
}
public String getNegative() {
return negative;
}
public String getNegative() {
return negative;
}
public void setNegative(String negative) {
this.negative = negative;
}
public void setNegative(String negative) {
this.negative = negative;
}
public String getUndefined() {
return undefined;
}
public String getUndefined() {
return undefined;
}
public void setUndefined(String undefined) {
this.undefined = undefined;
}
public void setUndefined(String undefined) {
this.undefined = undefined;
}
public boolean isIgnoreUndefined() {
return ignoreUndefined;
}
public boolean isIgnoreUndefined() {
return ignoreUndefined;
}
public void setIgnoreUndefined(boolean ignoreUndefined) {
this.ignoreUndefined = ignoreUndefined;
}
public void setIgnoreUndefined(boolean ignoreUndefined) {
this.ignoreUndefined = ignoreUndefined;
}
@Override
public String toString() {
try {
return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) {
throw new PaceException("Impossible to convert to JSON: ", e);
}
}
/**
 * Serializes this node definition to JSON using a freshly created Jackson {@code ObjectMapper}.
 *
 * @throws PaceException if the serialization fails
 */
@Override
public String toString() {
	final ObjectMapper mapper = new ObjectMapper();
	try {
		final String json = mapper.writeValueAsString(this);
		return json;
	} catch (IOException e) {
		throw new PaceException("Impossible to convert to JSON: ", e);
	}
}
}

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.tree.support;
import java.io.Serializable;
@ -6,129 +7,128 @@ import java.util.Map;
public class TreeNodeStats implements Serializable {
private Map<String, FieldStats> results; //this is an accumulator for the results of the node
private Map<String, FieldStats> results; // this is an accumulator for the results of the node
public TreeNodeStats(){
this.results = new HashMap<>();
}
public TreeNodeStats() {
this.results = new HashMap<>();
}
public Map<String, FieldStats> getResults() {
return results;
}
public Map<String, FieldStats> getResults() {
return results;
}
public void addFieldStats(String id, FieldStats fieldStats){
this.results.put(id, fieldStats);
}
public void addFieldStats(String id, FieldStats fieldStats) {
this.results.put(id, fieldStats);
}
public int fieldsCount(){
return this.results.size();
}
public int fieldsCount() {
return this.results.size();
}
public int undefinedCount(){
int undefinedCount = 0;
for(FieldStats fs: this.results.values()){
if(fs.getResult() == -1)
undefinedCount ++;
}
return undefinedCount;
}
public int undefinedCount() {
int undefinedCount = 0;
for (FieldStats fs : this.results.values()) {
if (fs.getResult() == -1)
undefinedCount++;
}
return undefinedCount;
}
public double scoreSum(){
double scoreSum = 0.0;
for(FieldStats fs: this.results.values()){
if(fs.getResult()>=0.0) {
scoreSum += fs.getResult();
}
}
return scoreSum;
}
public double scoreSum() {
double scoreSum = 0.0;
for (FieldStats fs : this.results.values()) {
if (fs.getResult() >= 0.0) {
scoreSum += fs.getResult();
}
}
return scoreSum;
}
//return the sum of the weights without considering the fields with countIfMissing=false && result=-1
public double weightSum(){
double weightSum = 0.0;
for(FieldStats fs: this.results.values()){
if(fs.getResult()>=0.0 || (fs.getResult()<0.0 && fs.isCountIfUndefined())) {
weightSum += fs.getWeight();
}
}
return weightSum;
}
// return the sum of the weights without considering the fields with countIfMissing=false && result=-1
public double weightSum() {
double weightSum = 0.0;
for (FieldStats fs : this.results.values()) {
if (fs.getResult() >= 0.0 || (fs.getResult() < 0.0 && fs.isCountIfUndefined())) {
weightSum += fs.getWeight();
}
}
return weightSum;
}
public double weightedScoreSum(){
double weightedScoreSum = 0.0;
for(FieldStats fs: this.results.values()){
if(fs.getResult()>=0.0) {
weightedScoreSum += fs.getResult()*fs.getWeight();
}
}
return weightedScoreSum;
}
public double weightedScoreSum() {
double weightedScoreSum = 0.0;
for (FieldStats fs : this.results.values()) {
if (fs.getResult() >= 0.0) {
weightedScoreSum += fs.getResult() * fs.getWeight();
}
}
return weightedScoreSum;
}
public double max(){
double max = -1.0;
for(FieldStats fs: this.results.values()){
if(fs.getResult()>max)
max = fs.getResult();
}
return max;
}
public double max() {
double max = -1.0;
for (FieldStats fs : this.results.values()) {
if (fs.getResult() > max)
max = fs.getResult();
}
return max;
}
public double min(){
double min = 100.0; //random high value
for(FieldStats fs: this.results.values()){
if(fs.getResult()<min) {
if (fs.getResult()>=0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
min = fs.getResult();
}
}
return min;
}
public double min() {
double min = 100.0; // random high value
for (FieldStats fs : this.results.values()) {
if (fs.getResult() < min) {
if (fs.getResult() >= 0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
min = fs.getResult();
}
}
return min;
}
//if at least one is true, return 1.0
public double or(){
for (FieldStats fieldStats : this.results.values()) {
if (fieldStats.getResult() >= fieldStats.getThreshold())
return 1.0;
}
return 0.0;
}
// if at least one is true, return 1.0
public double or() {
for (FieldStats fieldStats : this.results.values()) {
if (fieldStats.getResult() >= fieldStats.getThreshold())
return 1.0;
}
return 0.0;
}
//if at least one is false, return 0.0
public double and() {
for (FieldStats fieldStats : this.results.values()) {
// if at least one is false, return 0.0
public double and() {
for (FieldStats fieldStats : this.results.values()) {
if (fieldStats.getResult() == -1) {
if (fieldStats.isCountIfUndefined())
return 0.0;
}
else {
if (fieldStats.getResult() < fieldStats.getThreshold())
return 0.0;
}
if (fieldStats.getResult() == -1) {
if (fieldStats.isCountIfUndefined())
return 0.0;
} else {
if (fieldStats.getResult() < fieldStats.getThreshold())
return 0.0;
}
}
return 1.0;
}
}
return 1.0;
}
public double getFinalScore(AggType aggregation){
public double getFinalScore(AggType aggregation) {
switch (aggregation){
case AVG:
return scoreSum()/fieldsCount();
case SUM:
return scoreSum();
case MAX:
return max();
case MIN:
return min();
case W_MEAN:
return weightedScoreSum()/weightSum();
case OR:
return or();
case AND:
return and();
default:
return 0.0;
}
}
switch (aggregation) {
case AVG:
return scoreSum() / fieldsCount();
case SUM:
return scoreSum();
case MAX:
return max();
case MIN:
return min();
case W_MEAN:
return weightedScoreSum() / weightSum();
case OR:
return or();
case AND:
return and();
default:
return 0.0;
}
}
}

View File

@ -1,11 +1,12 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.Row;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.util.PaceException;
/**
* The compare between two documents is given by the weighted mean of the field distances
@ -23,11 +24,11 @@ public class TreeProcessor {
// row based copies
/**
 * Tells whether the decision tree classifies the two rows as a match.
 *
 * @param a the first row
 * @param b the second row
 * @return {@code true} only when the tree evaluation ends with {@code MatchType.MATCH}
 */
public boolean compare(final Row a, final Row b) {
	// walk the decision tree and keep only its final verdict
	final MatchType verdict = evaluateTree(a, b).getResult();
	return verdict == MatchType.MATCH;
}
public TreeStats evaluateTree(final Row doc1, final Row doc2){
public TreeStats evaluateTree(final Row doc1, final Row doc2) {
TreeStats treeStats = new TreeStats();
@ -36,26 +37,25 @@ public class TreeProcessor {
do {
TreeNodeDef currentNode = config.decisionTree().get(nextNodeName);
//throw an exception if the node doesn't exist
// throw an exception if the node doesn't exist
if (currentNode == null)
throw new PaceException("Missing tree node: " + nextNodeName);
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
treeStats.addNodeStats(nextNodeName, stats);
//if ignoreUndefined=false the miss is considered as undefined
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) {
// if ignoreUndefined=false the miss is considered as undefined
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
nextNodeName = currentNode.getUndefined();
}
//if ignoreUndefined=true the miss is ignored and the score computed anyway
// if ignoreUndefined=true the miss is ignored and the score computed anyway
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
nextNodeName = currentNode.getPositive();
}
else {
} else {
nextNodeName = currentNode.getNegative();
}
} while (MatchType.parse(nextNodeName)==MatchType.UNDEFINED);
} while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED);
treeStats.setResult(MatchType.parse(nextNodeName));
return treeStats;
@ -68,25 +68,24 @@ public class TreeProcessor {
do {
TreeNodeDef currentNode = config.decisionTree().get(nextNodeName);
//throw an exception if the node doesn't exist
// throw an exception if the node doesn't exist
if (currentNode == null)
throw new PaceException("The Tree Node doesn't exist: " + nextNodeName);
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
score = stats.getFinalScore(currentNode.getAggregation());
//if ignoreUndefined=false the miss is considered as undefined
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) {
// if ignoreUndefined=false the miss is considered as undefined
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
nextNodeName = currentNode.getUndefined();
}
//if ignoreUndefined=true the miss is ignored and the score computed anyway
// if ignoreUndefined=true the miss is ignored and the score computed anyway
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
nextNodeName = currentNode.getPositive();
}
else {
} else {
nextNodeName = currentNode.getNegative();
}
} while (MatchType.parse(nextNodeName)==MatchType.UNDEFINED);
} while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED);
return score;
}

View File

@ -1,51 +1,52 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException;
import com.fasterxml.jackson.databind.ObjectMapper;
package eu.dnetlib.pace.tree.support;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.util.PaceException;
/**
 * Outcome of a decision tree evaluation: the statistics collected for each visited node plus
 * the final match result.
 */
public class TreeStats {

	// <layer_id, <field:comparator, result>>
	Map<String, TreeNodeStats> stats;
	MatchType result;

	/** Creates an empty instance whose result is initialised to {@code MatchType.NO_MATCH}. */
	public TreeStats() {
		this.stats = new HashMap<>();
		this.result = MatchType.NO_MATCH;
	}

	/** @return the current match result */
	public MatchType getResult() {
		return this.result;
	}

	/** @param result the match result to store */
	public void setResult(MatchType result) {
		this.result = result;
	}

	/** @return the live map of node statistics, keyed by layer id */
	public Map<String, TreeNodeStats> getStats() {
		return stats;
	}

	/** @param stats the map of node statistics to use from now on */
	public void setStats(Map<String, TreeNodeStats> stats) {
		this.stats = stats;
	}

	/**
	 * Stores the statistics of a node, replacing any previous entry with the same id.
	 *
	 * @param layerID       the key under which the statistics are stored
	 * @param treeNodeStats the statistics of the node
	 */
	public void addNodeStats(String layerID, TreeNodeStats treeNodeStats) {
		this.stats.put(layerID, treeNodeStats);
	}

	/**
	 * Renders this object as pretty-printed JSON.
	 *
	 * @throws PaceException if the serialization fails; the original {@code IOException} is kept as cause
	 */
	@Override
	public String toString() {
		try {
			return new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
		} catch (IOException e) {
			throw new PaceException("Impossible to convert to JSON: ", e);
		}
	}
}

View File

@ -1,8 +1,11 @@
package eu.dnetlib.pace.util;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.WfConfig;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -12,127 +15,137 @@ import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StringType;
import org.apache.spark.sql.types.StructType;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.WfConfig;
import eu.dnetlib.pace.tree.support.TreeProcessor;
public class BlockProcessor {
public static final List<String> accumulators= new ArrayList<>();
public static final List<String> accumulators = new ArrayList<>();
private static final Log log = LogFactory.getLog(BlockProcessor.class);
private static final Log log = LogFactory.getLog(BlockProcessor.class);
private DedupConfig dedupConf;
private DedupConfig dedupConf;
private final int identifierFieldPos;
private final int orderFieldPos;
private final int identifierFieldPos;
private final int orderFieldPos;
public static void constructAccumulator( final DedupConfig dedupConf) {
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1"));
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())));
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "skip list"));
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"));
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));
}
public static void constructAccumulator(final DedupConfig dedupConf) {
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"));
accumulators
.add(
String
.format(
"%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
accumulators
.add(
String
.format(
"%s::%s", dedupConf.getWf().getEntityType(),
String
.format(
"Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(),
dedupConf.getWf().getGroupMaxSize())));
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"));
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"));
accumulators
.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));
}
public BlockProcessor(DedupConfig dedupConf, int identifierFieldPos, int orderFieldPos) {
this.dedupConf = dedupConf;
this.identifierFieldPos = identifierFieldPos;
this.orderFieldPos = orderFieldPos;
}
public BlockProcessor(DedupConfig dedupConf, int identifierFieldPos, int orderFieldPos) {
this.dedupConf = dedupConf;
this.identifierFieldPos = identifierFieldPos;
this.orderFieldPos = orderFieldPos;
}
public void processSortedRows(final Collection<Row> documents, final Reporter context) {
if (documents.size() > 1) {
public void processSortedRows(final Collection<Row> documents, final Reporter context) {
if (documents.size() > 1) {
// log.info("reducing key: '" + key + "' records: " + q.size());
processRows(documents, context);
processRows(documents, context);
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
}
}
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
}
}
private void processRows(final Collection<Row> queue, final Reporter context) {
private void processRows(final Collection<Row> queue, final Reporter context) {
Iterator<Row> it = queue.iterator();
while (it.hasNext()) {
Iterator<Row> it = queue.iterator();
while (it.hasNext()) {
final Row pivot = it.next();
it.remove();
final Row pivot = it.next();
it.remove();
final String idPivot = pivot.getString(identifierFieldPos); // identifier
final Object fieldsPivot = getJavaValue(pivot, orderFieldPos);
final String fieldPivot = (fieldsPivot == null) ? "" : fieldsPivot.toString();
final WfConfig wf = dedupConf.getWf();
if (fieldPivot != null) {
int i = 0;
for (final Row curr : queue) {
final String idCurr = curr.getString(identifierFieldPos); // identifier
final String idPivot = pivot.getString(identifierFieldPos); //identifier
final Object fieldsPivot = getJavaValue(pivot, orderFieldPos);
final String fieldPivot = (fieldsPivot == null) ? "" : fieldsPivot.toString();
final WfConfig wf = dedupConf.getWf();
if (mustSkip(idCurr)) {
if (fieldPivot != null) {
int i = 0;
for (final Row curr : queue) {
final String idCurr = curr.getString(identifierFieldPos); //identifier
context.incrementCounter(wf.getEntityType(), "skip list", 1);
if (mustSkip(idCurr)) {
break;
}
context.incrementCounter(wf.getEntityType(), "skip list", 1);
if (i > wf.getSlidingWindowSize()) {
break;
}
break;
}
final Object fieldsCurr = getJavaValue(curr, orderFieldPos);
final String fieldCurr = (fieldsCurr == null) ? null : fieldsCurr.toString();
if (i > wf.getSlidingWindowSize()) {
break;
}
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
final Object fieldsCurr = getJavaValue(curr, orderFieldPos);
final String fieldCurr = (fieldsCurr == null) ? null : fieldsCurr.toString();
final TreeProcessor treeProcessor = new TreeProcessor(dedupConf);
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context);
final TreeProcessor treeProcessor = new TreeProcessor(dedupConf);
}
}
}
}
}
emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context);
public Object getJavaValue(Row row, int pos) {
DataType dt = row.schema().fields()[pos].dataType();
if (dt instanceof StringType) {
return row.getString(pos);
} else if (dt instanceof ArrayType) {
return row.getList(pos);
}
}
}
}
}
}
return null;
}
public Object getJavaValue(Row row, int pos) {
DataType dt = row.schema().fields()[pos].dataType();
if (dt instanceof StringType) {
return row.getString(pos);
} else if (dt instanceof ArrayType) {
return row.getList(pos);
}
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
return null;
}
if (result) {
writeSimilarity(context, idPivot, idCurr);
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
}
}
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
private boolean mustSkip(final String idPivot) {
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
}
if (result) {
writeSimilarity(context, idPivot, idCurr);
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
}
}
private String getNsPrefix(final String id) {
return StringUtils.substringBetween(id, "|", "::");
}
private boolean mustSkip(final String idPivot) {
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
}
private void writeSimilarity(final Reporter context, final String from, final String to) {
final String type = dedupConf.getWf().getEntityType();
private String getNsPrefix(final String id) {
return StringUtils.substringBetween(id, "|", "::");
}
private void writeSimilarity(final Reporter context, final String from, final String to) {
final String type = dedupConf.getWf().getEntityType();
context.emit(type, from, to);
context.emit(type, to, from);
}
context.emit(type, from, to);
context.emit(type, to, from);
}
}

View File

@ -1,15 +1,18 @@
package eu.dnetlib.pace.util;
import org.apache.commons.lang3.text.WordUtils;
import com.google.common.base.Function;
import org.apache.commons.lang3.text.WordUtils;
/**
 * Function that lower-cases a string and then capitalises the first letter of every word,
 * where words are separated by a space or a hyphen.
 */
public class Capitalise implements Function<String, String> {

	// word delimiters: shared constant, never modified
	private static final char[] DELIM = {
		' ', '-'
	};

	/**
	 * @param s the string to transform; must not be {@code null}
	 * @return the capitalised version of {@code s}
	 */
	@Override
	public String apply(final String s) {
		return WordUtils.capitalize(s.toLowerCase(), DELIM);
	}
}

View File

@ -1,3 +1,4 @@
package eu.dnetlib.pace.util;
import com.google.common.base.Function;
@ -7,4 +8,4 @@ public class DotAbbreviations implements Function<String, String> {
/**
 * Appends a dot to one-character strings, leaving any other string untouched.
 *
 * @param s the string to process; must not be {@code null}
 * @return {@code s + "."} when {@code s} has exactly one character, {@code s} otherwise
 */
public String apply(String s) {
	if (s.length() != 1) {
		return s;
	}
	return s + ".";
}
};
};

View File

@ -1,117 +1,172 @@
package eu.dnetlib.pace.util;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.Option;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.*;
import net.minidev.json.JSONArray;
package eu.dnetlib.pace.util;
import java.math.BigDecimal;
import java.util.*;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.Option;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.*;
import net.minidev.json.JSONArray;
public class MapDocumentUtil {
public static final String URL_REGEX = "^(http|https|ftp)\\://.*";
public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX);
public static final String URL_REGEX = "^(http|https|ftp)\\://.*";
public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX);
public static List<String> getJPathList(String path, String json, Type type) {
if (type == Type.List)
return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path);
Object jresult;
List<String> result = new ArrayList<>();
try {
jresult = JsonPath.read(json, path);
} catch (Throwable e) {
return result;
}
if (jresult instanceof JSONArray) {
public static List<String> getJPathList(String path, String json, Type type) {
if (type == Type.List)
return JsonPath
.using(
Configuration
.defaultConfiguration()
.addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS))
.parse(json)
.read(path);
Object jresult;
List<String> result = new ArrayList<>();
try {
jresult = JsonPath.read(json, path);
} catch (Throwable e) {
return result;
}
if (jresult instanceof JSONArray) {
((JSONArray) jresult).forEach(it -> {
((JSONArray) jresult).forEach(it -> {
try {
result.add(new ObjectMapper().writeValueAsString(it));
} catch (JsonProcessingException e) {
try {
result.add(new ObjectMapper().writeValueAsString(it));
} catch (JsonProcessingException e) {
}
}
);
return result;
}
}
});
return result;
}
if (jresult instanceof LinkedHashMap) {
try {
result.add(new ObjectMapper().writeValueAsString(jresult));
} catch (JsonProcessingException e) {
if (jresult instanceof LinkedHashMap) {
try {
result.add(new ObjectMapper().writeValueAsString(jresult));
} catch (JsonProcessingException e) {
}
return result;
}
if (jresult instanceof String) {
result.add((String) jresult);
}
return result;
}
}
return result;
}
if (jresult instanceof String) {
result.add((String) jresult);
}
return result;
}
public static String getJPathString(final String jsonPath, final String json) {
try {
Object o = JsonPath.read(json, jsonPath);
if (o instanceof String)
return (String) o;
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
return (String) ((JSONArray) o).get(0);
return "";
} catch (Exception e) {
return "";
}
}
public static String getJPathString(final String jsonPath, final String json) {
try {
Object o = JsonPath.read(json, jsonPath);
if (o instanceof String)
return (String)o;
if (o instanceof JSONArray && ((JSONArray)o).size()>0)
return (String)((JSONArray)o).get(0);
return "";
} catch (Exception e) {
return "";
}
}
public static double[] getJPathArray(final String jsonPath, final String json) {
try {
Object o = JsonPath.read(json, jsonPath);
if (o instanceof double[])
return (double[]) o;
if (o instanceof JSONArray) {
Object[] objects = ((JSONArray) o).toArray();
double[] array = new double[objects.length];
for (int i = 0; i < objects.length; i++) {
if (objects[i] instanceof BigDecimal)
array[i] = ((BigDecimal) objects[i]).doubleValue();
else
array[i] = (double) objects[i];
}
return array;
}
return new double[0];
} catch (Exception e) {
e.printStackTrace();
return new double[0];
}
}
public static double[] getJPathArray(final String jsonPath, final String json) {
try {
Object o = JsonPath.read(json, jsonPath);
if (o instanceof double[])
return (double[]) o;
if (o instanceof JSONArray) {
Object[] objects = ((JSONArray) o).toArray();
double[] array = new double[objects.length];
for (int i = 0; i < objects.length; i++) {
if (objects[i] instanceof BigDecimal)
array[i] = ((BigDecimal)objects[i]).doubleValue();
else
array[i] = (double) objects[i];
}
return array;
}
return new double[0];
}
catch (Exception e) {
e.printStackTrace();
return new double[0];
}
}
public static String truncateValue(String value, int length) {
if (value == null)
return "";
if (length == -1 || length > value.length())
return value;
public static String truncateValue(String value, int length) {
if (value == null)
return "";
return value.substring(0, length);
}
if (length == -1 || length > value.length())
return value;
public static List<String> truncateList(List<String> list, int size) {
if (size == -1 || size > list.size())
return list;
return value.substring(0, length);
}
return list.subList(0, size);
}
public static List<String> truncateList(List<String> list, int size) {
if (size == -1 || size > list.size())
return list;
public static String getJPathString(final String jsonPath, final DocumentContext json) {
try {
Object o = json.read(jsonPath);
if (o instanceof String)
return (String) o;
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
return (String) ((JSONArray) o).get(0);
return "";
} catch (Exception e) {
return "";
}
}
return list.subList(0, size);
}
public static List<String> getJPathList(String path, DocumentContext json, Type type) {
// if (type == Type.List)
// return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST,
// Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path);
Object jresult;
List<String> result = new ArrayList<>();
try {
jresult = json.read(path);
} catch (Throwable e) {
return result;
}
if (jresult instanceof JSONArray) {
((JSONArray) jresult).forEach(it -> {
try {
result.add(new ObjectMapper().writeValueAsString(it));
} catch (JsonProcessingException e) {
}
});
return result;
}
if (jresult instanceof LinkedHashMap) {
try {
result.add(new ObjectMapper().writeValueAsString(jresult));
} catch (JsonProcessingException e) {
}
return result;
}
if (jresult instanceof String) {
result.add((String) jresult);
}
return result;
}
}

View File

@ -1,13 +1,14 @@
package eu.dnetlib.pace.util;
/**
 * Unchecked exception raised by the pace utilities.
 */
public class PaceException extends RuntimeException {

	/**
	 * @param s the detail message
	 * @param e the cause of the failure
	 */
	public PaceException(String s, Throwable e) {
		super(s, e);
	}

	/**
	 * @param s the detail message
	 */
	public PaceException(String s) {
		super(s);
	}

}

View File

@ -1,49 +1,61 @@
package eu.dnetlib.pace.util;
import eu.dnetlib.pace.clustering.ClusteringClass;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.reflections.Reflections;
package eu.dnetlib.pace.util;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.Map;
import java.util.stream.Collectors;
import org.reflections.Reflections;
import eu.dnetlib.pace.clustering.ClusteringClass;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
public class PaceResolver implements Serializable {
public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering");
public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree");
public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering");
public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree");
private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
private final Map<String, Class<Comparator>> comparators;
private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
private final Map<String, Class<Comparator>> comparators;
public PaceResolver() {
public PaceResolver() {
this.clusteringFunctions = CLUSTERING_RESOLVER.getTypesAnnotatedWith(ClusteringClass.class).stream()
.filter(ClusteringFunction.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>)cl));
this.clusteringFunctions = CLUSTERING_RESOLVER
.getTypesAnnotatedWith(ClusteringClass.class)
.stream()
.filter(ClusteringFunction.class::isAssignableFrom)
.collect(
Collectors
.toMap(
cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>) cl));
this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream()
.filter(Comparator.class::isAssignableFrom)
.collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>)cl));
}
this.comparators = COMPARATOR_RESOLVER
.getTypesAnnotatedWith(ComparatorClass.class)
.stream()
.filter(Comparator.class::isAssignableFrom)
.collect(
Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>) cl));
}
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
try {
return clusteringFunctions.get(name).getDeclaredConstructor(Map.class).newInstance(params);
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
throw new PaceException(name + " not found ", e);
}
}
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
try {
return clusteringFunctions.get(name).getDeclaredConstructor(Map.class).newInstance(params);
} catch (InstantiationException | IllegalAccessException | InvocationTargetException
| NoSuchMethodException e) {
throw new PaceException(name + " not found ", e);
}
}
public Comparator getComparator(String name, Map<String, String> params) throws PaceException {
try {
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) {
throw new PaceException(name + " not found ", e);
}
}
public Comparator getComparator(String name, Map<String, String> params) throws PaceException {
try {
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException
| NullPointerException e) {
throw new PaceException(name + " not found ", e);
}
}
}

View File

@ -1,11 +1,11 @@
package eu.dnetlib.pace.util;
package eu.dnetlib.pace.util;
import java.io.Serializable;
/**
 * Sink for the counters and the relations produced while processing a block.
 */
public interface Reporter extends Serializable {

	/**
	 * Adds a delta to a counter.
	 *
	 * @param counterGroup the group of the counter
	 * @param counterName  the name of the counter within the group
	 * @param delta        the amount to add
	 */
	void incrementCounter(String counterGroup, String counterName, long delta);

	/**
	 * Emits a relation between two identifiers.
	 *
	 * @param type the type associated with the relation
	 * @param from the source identifier
	 * @param to   the target identifier
	 */
	void emit(String type, String from, String to);
}

View File

@ -0,0 +1,86 @@
package eu.dnetlib.pace.util;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.spark.SparkContext;
import org.apache.spark.util.LongAccumulator;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.Reporter;
import scala.Serializable;
import scala.Tuple2;
/**
 * {@link Reporter} that keeps the emitted relations in memory and forwards the counters to
 * Spark long accumulators, looked up by the name {@code <counterGroup>::<counterName>}.
 */
public class SparkReporter implements Serializable, Reporter {

	private final List<Tuple2<String, String>> relations = new ArrayList<>();

	private final Map<String, LongAccumulator> accumulators;

	/**
	 * @param accumulators the accumulators to update, keyed by {@code <counterGroup>::<counterName>}
	 */
	public SparkReporter(Map<String, LongAccumulator> accumulators) {
		this.accumulators = accumulators;
	}

	/**
	 * Adds the delta to the accumulator registered for the counter in the given map; counters
	 * without a registered accumulator are ignored.
	 *
	 * @param counterGroup the group of the counter
	 * @param counterName  the name of the counter within the group
	 * @param delta        the amount to add
	 * @param accumulators the map in which the accumulator is looked up
	 */
	public void incrementCounter(
		String counterGroup,
		String counterName,
		long delta,
		Map<String, LongAccumulator> accumulators) {

		final String key = String.format("%s::%s", counterGroup, counterName);
		if (!accumulators.containsKey(key)) {
			return;
		}
		accumulators.get(key).add(delta);
	}

	/** Adds the delta to the accumulator registered for the counter in this reporter, if any. */
	@Override
	public void incrementCounter(String counterGroup, String counterName, long delta) {
		incrementCounter(counterGroup, counterName, delta, this.accumulators);
	}

	/** Records the pair (from, to); the type is not stored. */
	@Override
	public void emit(String type, String from, String to) {
		final Tuple2<String, String> relation = new Tuple2<>(from, to);
		relations.add(relation);
	}

	/** @return the live list of the relations emitted so far */
	public List<Tuple2<String, String>> getRelations() {
		return relations;
	}

	/**
	 * Creates one long accumulator for each counter related to the given configuration.
	 *
	 * @param dedupConf the configuration providing entity type, order field, group size and threshold
	 * @param context   the Spark context used to create the accumulators
	 * @return the accumulators keyed by their name, in the form {@code <entityType>::<counter name>}
	 */
	public static Map<String, LongAccumulator> constructAccumulator(
		final DedupConfig dedupConf, final SparkContext context) {

		final Map<String, LongAccumulator> registry = new HashMap<>();

		register(
			registry, context,
			String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"));
		register(
			registry, context,
			String
				.format(
					"%s::%s",
					dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
		register(
			registry, context,
			String
				.format(
					"%s::%s",
					dedupConf.getWf().getEntityType(),
					String
						.format(
							"Skipped records for count(%s) >= %s",
							dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())));
		register(registry, context, String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"));
		register(
			registry, context,
			String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"));
		register(
			registry, context,
			String
				.format(
					"%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));

		return registry;
	}

	/** Creates the accumulator with the given name and stores it in the registry under that name. */
	private static void register(
		final Map<String, LongAccumulator> registry, final SparkContext context, final String name) {
		registry.put(name, context.longAccumulator(name));
	}

}

View File

@ -1,12 +1,14 @@
package eu.dnetlib.pace;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import org.apache.commons.io.IOUtils;
package eu.dnetlib.pace;
import java.io.IOException;
import java.io.StringWriter;
import java.util.List;
import org.apache.commons.io.IOUtils;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
public abstract class AbstractPaceTest extends AbstractPaceFunctions {
protected String readFromClasspath(final String filename) {
@ -35,7 +37,7 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions {
return a;
}
protected List<String> createFieldList(List<String> strings, String fieldName){
protected List<String> createFieldList(List<String> strings, String fieldName) {
return strings;
}

View File

@ -1,17 +1,20 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.DedupConfig;
import org.junit.jupiter.api.*;
package eu.dnetlib.pace.clustering;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.junit.jupiter.api.*;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.DedupConfig;
public class ClusteringFunctionTest extends AbstractPaceTest {
private static Map<String, Integer> params;
@ -20,7 +23,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
@BeforeAll
public static void setUp() throws Exception {
params = Maps.newHashMap();
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ClusteringFunctionTest.class));
conf = DedupConfig
.load(
AbstractPaceFunctions
.readFromClasspath(
"/eu/dnetlib/pace/config/organization.current.conf.json", ClusteringFunctionTest.class));
}
@Test
@ -210,7 +217,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
}
@Test
public void testPersonClustering(){
public void testPersonClustering() {
final ClusteringFunction cf = new PersonClustering(params);
final String s = "Abd-Alla, Abo-el-nour N.";
@ -224,7 +231,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
}
@Test
public void testPersonHash(){
public void testPersonHash() {
final ClusteringFunction cf = new PersonHash(params);
final String s = "Manghi, Paolo";
@ -238,7 +245,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
}
@Test
public void testLastNameFirstInitial(){
public void testLastNameFirstInitial() {
final ClusteringFunction cf = new LastNameFirstInitial(params);
final String s = "LI Yonghong";
@ -246,4 +253,4 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
}
}
}

View File

@ -1,56 +1,57 @@
package eu.dnetlib.pace.common;
import org.junit.jupiter.api.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.*;
public class PaceFunctionTest extends AbstractPaceFunctions {
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
@Test
public void normalizePidTest(){
@Test
public void normalizePidTest() {
assertEquals("identifier", normalizePid("IdentifIer"));
assertEquals("10.1109/tns.2015.2493347", normalizePid("10.1109/TNS.2015.2493347"));
assertEquals("10.0001/testdoi", normalizePid("http://dx.doi.org/10.0001/testDOI"));
assertEquals("10.0001/testdoi", normalizePid("https://dx.doi.org/10.0001/testDOI"));
}
assertEquals("identifier", normalizePid("IdentifIer"));
assertEquals("10.1109/tns.2015.2493347", normalizePid("10.1109/TNS.2015.2493347"));
assertEquals("10.0001/testdoi", normalizePid("http://dx.doi.org/10.0001/testDOI"));
assertEquals("10.0001/testdoi", normalizePid("https://dx.doi.org/10.0001/testDOI"));
}
@Test
public void filterAllStopwordsTest(){
@Test
public void filterAllStopwordsTest() {
assertEquals("universita politecnica marche", filterAllStopWords("universita politecnica delle marche"));
}
assertEquals("universita politecnica marche", filterAllStopWords("universita politecnica delle marche"));
}
@Test
public void normalizeTest() {
assertEquals("universitat", normalize("Universität"));
@Test
public void normalizeTest() {
assertEquals("universitat", normalize("Universität"));
System.out.println(normalize("İstanbul Ticarət Universiteti"));
}
System.out.println(normalize("İstanbul Ticarət Universiteti"));
}
@Test
public void cleanupTest() {
assertEquals("istanbul ticaret universiteti", cleanup("İstanbul Ticarət Universiteti"));
@Test
public void cleanupTest() {
assertEquals("istanbul ticaret universiteti", cleanup("İstanbul Ticarət Universiteti"));
System.out.println("cleaned up : " + cleanup(TEST_STRING));
}
System.out.println("cleaned up : " + cleanup(TEST_STRING));
}
@Test
public void testGetNumbers() {
System.out.println("Numbers : " + getNumbers(TEST_STRING));
}
@Test
public void testGetNumbers() {
System.out.println("Numbers : " + getNumbers(TEST_STRING));
}
@Test
public void testRemoveSymbols() {
System.out.println("Without symbols: " + removeSymbols(TEST_STRING));
}
@Test
public void testRemoveSymbols() {
System.out.println("Without symbols: " + removeSymbols(TEST_STRING));
}
@Test
public void testFixAliases() {
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
}
@Test
public void testFixAliases() {
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
}
}

View File

@ -1,16 +1,18 @@
package eu.dnetlib.pace.comparators;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.*;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInstance;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.tree.*;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInstance;
import java.util.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
public class ComparatorTest extends AbstractPaceTest {
@ -26,7 +28,8 @@ public class ComparatorTest extends AbstractPaceTest {
params.put("name_th", "0.95");
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
conf = DedupConfig
.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
}
@Test
@ -39,32 +42,38 @@ public class ComparatorTest extends AbstractPaceTest {
public void cityMatchTest() {
final CityMatch cityMatch = new CityMatch(params);
//both names with no cities
// both names with no cities
assertEquals(1.0, cityMatch.distance("Università", "Centro di ricerca", conf));
//one of the two names with no cities
// one of the two names with no cities
assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf));
//both names with cities (same)
// both names with cities (same)
assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf));
//both names with cities (different)
// both names with cities (different)
assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf));
assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf));
//particular cases
// particular cases
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf));
assertEquals(
1.0,
cityMatch
.distance(
"Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology",
conf));
assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
}
@Test
public void keywordMatchTest(){
public void keywordMatchTest() {
params.put("threshold", "0.5");
final KeywordMatch keywordMatch = new KeywordMatch(params);
assertEquals(0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
assertEquals(
0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
@ -77,7 +86,7 @@ public class ComparatorTest extends AbstractPaceTest {
}
@Test
public void listContainsMatchTest(){
public void listContainsMatchTest() {
List<String> a = createFieldList(Arrays.asList("Article", "Publication", "ORP"), "instanceType");
List<String> b = createFieldList(Arrays.asList("Publication", "Article", "ORP"), "instanceType");
@ -100,7 +109,7 @@ public class ComparatorTest extends AbstractPaceTest {
}
@Test
public void stringContainsMatchTest(){
public void stringContainsMatchTest() {
params.put("string", "openorgs");
params.put("bool", "XOR");
@ -120,7 +129,7 @@ public class ComparatorTest extends AbstractPaceTest {
}
@Test
public void numbersMatchTest(){
public void numbersMatchTest() {
final NumbersMatch numbersMatch = new NumbersMatch(params);
assertEquals(0.0, numbersMatch.distance("University of Rennes 2", "Universita di Rennes 7", conf));
@ -128,7 +137,7 @@ public class ComparatorTest extends AbstractPaceTest {
}
@Test
public void romansMatchTest(){
public void romansMatchTest() {
final RomansMatch romansMatch = new RomansMatch(params);
@ -142,8 +151,9 @@ public class ComparatorTest extends AbstractPaceTest {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf);
System.out.println("result = " + result);
double result = jaroWinklerNormalizedName
.distance("AT&T (United States)", "United States Military Academy", conf);
System.out.println("result = " + result);
result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
System.out.println("result = " + result);
@ -171,7 +181,11 @@ public class ComparatorTest extends AbstractPaceTest {
final LevensteinTitle levensteinTitle = new LevensteinTitle(params);
double result = levensteinTitle.distance("Degradation of lignin βaryl ether units in Arabidopsis thaliana expressing LigD, LigF and LigG from Sphingomonas paucimobilis SYK6", "Degradation of lignin β-aryl ether units in <i>Arabidopsis thaliana</i> expressing <i>LigD</i>, <i>LigF</i> and <i>LigG</i> from <i>Sphingomonas paucimobilis</i> SYK-6", conf);
double result = levensteinTitle
.distance(
"Degradation of lignin βaryl ether units in Arabidopsis thaliana expressing LigD, LigF and LigG from Sphingomonas paucimobilis SYK6",
"Degradation of lignin β-aryl ether units in <i>Arabidopsis thaliana</i> expressing <i>LigD</i>, <i>LigF</i> and <i>LigG</i> from <i>Sphingomonas paucimobilis</i> SYK-6",
conf);
System.out.println("result = " + result);
}
@ -195,13 +209,16 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(1.0, result);
List<String> c = createFieldList(Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType");
List<String> c = createFieldList(
Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType");
result = instanceTypeMatch.compare(c, b, conf);
assertEquals(1.0, result);
List<String> d = createFieldList(Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType");
List<String> e = createFieldList(Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType");
List<String> d = createFieldList(
Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType");
List<String> e = createFieldList(
Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType");
result = instanceTypeMatch.compare(d, e, conf);
assertEquals(1.0, result);
@ -222,7 +239,8 @@ public class ComparatorTest extends AbstractPaceTest {
AuthorsMatch authorsMatch = new AuthorsMatch(params);
List<String> a = createFieldList(Arrays.asList("La Bruzzo, Sandro", "Atzori, Claudio", "De Bonis, Michele"), "authors");
List<String> a = createFieldList(
Arrays.asList("La Bruzzo, Sandro", "Atzori, Claudio", "De Bonis, Michele"), "authors");
List<String> b = createFieldList(Arrays.asList("Atzori, C.", "La Bruzzo, S.", "De Bonis, M."), "authors");
double result = authorsMatch.compare(a, b, conf);
@ -232,7 +250,7 @@ public class ComparatorTest extends AbstractPaceTest {
List<String> d = createFieldList(Arrays.asList("Manghi, Pasquale"), "authors");
result = authorsMatch.compare(c, d, conf);
assertEquals(0.0, result) ;
assertEquals(0.0, result);
params.put("mode", "surname");
authorsMatch = new AuthorsMatch(params);
@ -246,7 +264,7 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(0.25, result);
List<String> f = createFieldList(new ArrayList<>(), "authors");
result = authorsMatch.compare(f,f, conf);
result = authorsMatch.compare(f, f, conf);
System.out.println("result = " + result);
}
@ -256,8 +274,19 @@ public class ComparatorTest extends AbstractPaceTest {
JsonListMatch jsonListMatch = new JsonListMatch(params);
List<String> a = createFieldList(Arrays.asList("{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"), "authors");
List<String> b = createFieldList(Arrays.asList("{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"), "authors");
List<String> a = createFieldList(
Arrays
.asList(
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"),
"authors");
List<String> b = createFieldList(
Arrays
.asList(
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}",
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}",
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}",
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"),
"authors");
double result = jsonListMatch.compare(a, b, conf);
@ -287,13 +316,16 @@ public class ComparatorTest extends AbstractPaceTest {
CosineSimilarity cosineSimilarity = new CosineSimilarity(params);
double[] a = new double[]{1,2,3};
double[] b = new double[]{1,2,3};
double[] a = new double[] {
1, 2, 3
};
double[] b = new double[] {
1, 2, 3
};
double compare = cosineSimilarity.compare(a, b, conf);
System.out.println("compare = " + compare);
}
}

View File

@ -1,17 +1,17 @@
package eu.dnetlib.pace.config;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import java.util.HashMap;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.util.HashMap;
import java.util.Map;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.util.MapDocumentUtil;
public class ConfigTest extends AbstractPaceTest {
@ -56,7 +56,7 @@ public class ConfigTest extends AbstractPaceTest {
System.out.println("translationMap = " + translationMap.size());
for (String key: translationMap.keySet()) {
for (String key : translationMap.keySet()) {
if (translationMap.get(key).equals("key::1"))
System.out.println("key = " + key);
}
@ -70,13 +70,13 @@ public class ConfigTest extends AbstractPaceTest {
assertEquals(0, load.getPace().translationMap().keySet().size());
}
@Test
public void testJPath() {
final String json = readFromClasspath("organization.json");
@Test
public void testJPath() {
final String json = readFromClasspath("organization.json");
final String jpath ="$.id";
final String jpath = "$.id";
System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json));
}
System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json));
}
}

View File

@ -1,40 +1,43 @@
package eu.dnetlib.pace.util;
import eu.dnetlib.pace.model.Person;
import jdk.nashorn.internal.ir.annotations.Ignore;
import org.junit.jupiter.api.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.HashMap;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
import org.junit.jupiter.api.*;
import eu.dnetlib.pace.model.Person;
import jdk.nashorn.internal.ir.annotations.Ignore;
public class UtilTest {
static Map<String, String> params;
static Map<String, String> params;
@BeforeAll
public static void setUp(){
params = new HashMap<>();
}
@BeforeAll
public static void setUp() {
params = new HashMap<>();
}
@Test
@Ignore
public void paceResolverTest() {
PaceResolver paceResolver = new PaceResolver();
paceResolver.getComparator("keywordMatch", params);
}
/**
 * Resolves the "keywordMatch" comparator through {@link PaceResolver}; currently disabled.
 *
 * The previous {@code @Ignore} resolved to {@code jdk.nashorn.internal.ir.annotations.Ignore}, which JUnit 5
 * does not recognise, so the test was never actually skipped. {@code @Disabled} is the Jupiter annotation
 * and is covered by the existing {@code org.junit.jupiter.api.*} import.
 */
@Test
@Disabled
public void paceResolverTest() {
PaceResolver paceResolver = new PaceResolver();
paceResolver.getComparator("keywordMatch", params);
}
@Test
public void personTest() {
Person p = new Person("j. f. kennedy", false);
@Test
public void personTest() {
Person p = new Person("j. f. kennedy", false);
assertEquals("kennedy", p.getSurnameString());
assertEquals("j f", p.getNameString());
assertEquals("kennedy", p.getSurnameString());
assertEquals("j f", p.getNameString());
p = new Person("Guan-Hua Du", false);
p = new Person("Guan-Hua Du", false);
System.out.println("surname = " + p.getSurnameString());
System.out.println("name = " + p.getNameString());
}
System.out.println("surname = " + p.getSurnameString());
System.out.println("name = " + p.getNameString());
}
}

View File

@ -1,16 +1,18 @@
package eu.dnetlib.dhp.broker.oa.util;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import eu.dnetlib.pace.util.MapDocumentUtil;
import java.io.IOException;
import org.apache.spark.sql.Row;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.SparkDedupConfig;
import eu.dnetlib.pace.tree.support.TreeProcessor;
public class TrustUtils {
@ -18,13 +20,18 @@ public class TrustUtils {
private static DedupConfig dedupConfig;
private static SparkDedupConfig sparkDedupConfig;
private static final ObjectMapper mapper;
static {
final ObjectMapper mapper = new ObjectMapper();
mapper = new ObjectMapper();
try {
dedupConfig = mapper
.readValue(
DedupConfig.class.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"),
DedupConfig.class);
sparkDedupConfig = new SparkDedupConfig(dedupConfig, 1);
} catch (final IOException e) {
log.error("Error loading dedupConfig, e");
}
@ -40,11 +47,8 @@ public class TrustUtils {
}
try {
final ObjectMapper objectMapper = new ObjectMapper();
final Row doc1 = MapDocumentUtil
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1));
final Row doc2 = MapDocumentUtil
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2));
final Row doc1 = sparkDedupConfig.rowFromJson().apply(mapper.writeValueAsString(r1));
final Row doc2 = sparkDedupConfig.rowFromJson().apply(mapper.writeValueAsString(r2));
final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);

View File

@ -53,13 +53,17 @@
</dependencyManagement>
<dependencies>
<dependency>
<groupId>io.opentelemetry</groupId>
<artifactId>opentelemetry-api</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>io.opentelemetry</groupId>
<artifactId>opentelemetry-sdk</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-pace-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
@ -83,31 +87,21 @@
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.arakelian</groupId>
<artifactId>java-jq</artifactId>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
</dependency>
<dependency>
<groupId>jaxen</groupId>
<artifactId>jaxen</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-pace-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-graphx_2.11</artifactId>
@ -141,12 +135,7 @@
<version>1.4.200</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-catalyst_2.11</artifactId>
<version>2.4.0.cloudera2</version>
<scope>compile</scope>
</dependency>
</dependencies>

View File

@ -3,29 +3,20 @@ package eu.dnetlib.dhp.oa.dedup
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.oa.dedup.dsl.{Clustering, Deduper}
import eu.dnetlib.dhp.oa.dedup.model.BlockStats
import eu.dnetlib.dhp.oa.dedup.model.SparkDedupConfig
import eu.dnetlib.dhp.schema.oaf.Relation
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService
import eu.dnetlib.pace.config.DedupConfig
import eu.dnetlib.pace.model.RowDataOrderingComparator
import eu.dnetlib.enabling.is.lookup.rmi.{ISLookUpException, ISLookUpService}
import eu.dnetlib.pace.model.{RowDataOrderingComparator, SparkDedupConfig}
import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.DataTypes
import org.dom4j.DocumentException
import org.slf4j.Logger
import org.slf4j.LoggerFactory
import org.xml.sax.SAXException
import java.io.IOException
import java.util
import java.util.Optional
import java.util.stream.Collectors
import scala.collection.Seq
object DSLExample {
private val log = LoggerFactory.getLogger(classOf[DSLExample])
@ -64,15 +55,15 @@ class DSLExample(parser: ArgumentApplicationParser, spark: SparkSession) extends
DSLExample.log.info("isLookUpUrl: '{}'", isLookUpUrl)
DSLExample.log.info("actionSetId: '{}'", actionSetId)
DSLExample.log.info("workingPath: '{}'", workingPath)
// for each dedup configuration
// for each dedup configuration
import scala.collection.JavaConversions._
for (dedupConf <- getConfigurations(isLookUpService, actionSetId).subList(0, 1)) {
val subEntity = dedupConf.getWf.getSubEntityValue
DSLExample.log.info("Creating blockstats for: '{}'", subEntity)
val outputPath = DedupUtility.createBlockStatsPath(workingPath, actionSetId, subEntity)
AbstractSparkAction.removeOutputDir(spark, outputPath)
val sc = JavaSparkContext.fromSparkContext(spark.sparkContext)
val sparkConfig = new SparkDedupConfig(dedupConf, numPartitions)
val sparkConfig = SparkDedupConfig(dedupConf, numPartitions)
val inputDF = spark.read
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
@ -87,8 +78,7 @@ class DSLExample(parser: ArgumentApplicationParser, spark: SparkSession) extends
Clustering("suffixprefix", Seq("legalname"), Map("max" -> 1, "len" -> 3)),
Clustering("urlclustering", Seq("websiteurl")),
Clustering("keywordsclustering", Seq("fields"), Map("max" -> 2, "windowSize" -> 4))
);
)
simRels
.map[BlockStats](

Some files were not shown because too many files have changed in this diff Show More