forked from D-Net/dnet-hadoop
WIP: various refactors
This commit is contained in:
parent
4c2dfcbdf7
commit
649679de8d
|
@ -81,9 +81,12 @@
|
|||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-catalyst_2.11</artifactId>
|
||||
<version>2.4.0.cloudera2</version>
|
||||
<scope>compile</scope>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
|
|
@ -1,8 +1,5 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
|
@ -10,32 +7,39 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
|
||||
|
||||
protected Map<String, Integer> params;
|
||||
|
||||
|
||||
public AbstractClusteringFunction(final Map<String, Integer> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
protected abstract Collection<String> doApply(Config conf, String s);
|
||||
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(Config conf, List<String> fields) {
|
||||
return fields.stream().filter(f -> !f.isEmpty())
|
||||
.map(this::normalize)
|
||||
.map(s -> filterAllStopWords(s))
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
return fields
|
||||
.stream()
|
||||
.filter(f -> !f.isEmpty())
|
||||
.map(this::normalize)
|
||||
.map(s -> filterAllStopWords(s))
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
public Map<String, Integer> getParams() {
|
||||
return params;
|
||||
}
|
||||
|
||||
|
||||
protected Integer param(String name) {
|
||||
return params.get(name);
|
||||
}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
|
@ -6,6 +7,7 @@ import java.util.Set;
|
|||
import java.util.StringTokenizer;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("acronyms")
|
||||
|
@ -19,16 +21,16 @@ public class Acronyms extends AbstractClusteringFunction {
|
|||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
|
||||
}
|
||||
|
||||
|
||||
private Set<String> extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) {
|
||||
|
||||
|
||||
final Set<String> acronyms = Sets.newLinkedHashSet();
|
||||
|
||||
|
||||
for (int i = 0; i < maxAcronyms; i++) {
|
||||
|
||||
|
||||
final StringTokenizer st = new StringTokenizer(s);
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
|
||||
while (st.hasMoreTokens()) {
|
||||
final String token = st.nextToken();
|
||||
if (sb.length() > maxLen) {
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
|
@ -9,5 +10,5 @@ import java.lang.annotation.Target;
|
|||
@Target(ElementType.TYPE)
|
||||
public @interface ClusteringClass {
|
||||
|
||||
public String value();
|
||||
}
|
||||
public String value();
|
||||
}
|
||||
|
|
|
@ -1,15 +1,16 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
public interface ClusteringFunction {
|
||||
|
||||
|
||||
public Collection<String> apply(Config config, List<String> fields);
|
||||
|
||||
|
||||
public Map<String, Integer> getParams();
|
||||
|
||||
}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
|
@ -5,6 +6,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("immutablefieldvalue")
|
||||
|
|
|
@ -1,50 +1,54 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("keywordsclustering")
|
||||
public class KeywordsClustering extends AbstractClusteringFunction {
|
||||
|
||||
public KeywordsClustering(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
public KeywordsClustering(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, String s) {
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, String s) {
|
||||
|
||||
//takes city codes and keywords codes without duplicates
|
||||
Set<String> keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
|
||||
Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4));
|
||||
// takes city codes and keywords codes without duplicates
|
||||
Set<String> keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
|
||||
Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4));
|
||||
|
||||
//list of combination to return as result
|
||||
final Collection<String> combinations = new LinkedHashSet<String>();
|
||||
// list of combination to return as result
|
||||
final Collection<String> combinations = new LinkedHashSet<String>();
|
||||
|
||||
for (String keyword: keywordsToCodes(keywords, conf.translationMap())){
|
||||
for (String city: citiesToCodes(cities)) {
|
||||
combinations.add(keyword+"-"+city);
|
||||
if (combinations.size()>=params.getOrDefault("max", 2)) {
|
||||
return combinations;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (String keyword : keywordsToCodes(keywords, conf.translationMap())) {
|
||||
for (String city : citiesToCodes(cities)) {
|
||||
combinations.add(keyword + "-" + city);
|
||||
if (combinations.size() >= params.getOrDefault("max", 2)) {
|
||||
return combinations;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return combinations;
|
||||
}
|
||||
return combinations;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(final Config conf, List<String> fields) {
|
||||
return fields.stream().filter(f -> !f.isEmpty())
|
||||
.map(this::cleanup)
|
||||
.map(this::normalize)
|
||||
.map(s -> filterAllStopWords(s))
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public Collection<String> apply(final Config conf, List<String> fields) {
|
||||
return fields
|
||||
.stream()
|
||||
.filter(f -> !f.isEmpty())
|
||||
.map(this::cleanup)
|
||||
.map(this::normalize)
|
||||
.map(s -> filterAllStopWords(s))
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,75 +1,79 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
|
||||
@ClusteringClass("lnfi")
|
||||
public class LastNameFirstInitial extends AbstractClusteringFunction{
|
||||
public class LastNameFirstInitial extends AbstractClusteringFunction {
|
||||
|
||||
private boolean DEFAULT_AGGRESSIVE = true;
|
||||
private boolean DEFAULT_AGGRESSIVE = true;
|
||||
|
||||
public LastNameFirstInitial(final Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
public LastNameFirstInitial(final Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(Config conf, List<String> fields) {
|
||||
return fields.stream().filter(f -> !f.isEmpty())
|
||||
.map(this::normalize)
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
@Override
|
||||
public Collection<String> apply(Config conf, List<String> fields) {
|
||||
return fields
|
||||
.stream()
|
||||
.filter(f -> !f.isEmpty())
|
||||
.map(this::normalize)
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String normalize(final String s) {
|
||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
@Override
|
||||
protected String normalize(final String s) {
|
||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
||||
// strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
|
||||
final List<String> res = Lists.newArrayList();
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
|
||||
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive")
|
||||
: DEFAULT_AGGRESSIVE);
|
||||
|
||||
Person p = new Person(s, aggressive);
|
||||
Person p = new Person(s, aggressive);
|
||||
|
||||
if (p.isAccurate()) {
|
||||
String lastName = p.getNormalisedSurname().toLowerCase();
|
||||
String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1);
|
||||
if (p.isAccurate()) {
|
||||
String lastName = p.getNormalisedSurname().toLowerCase();
|
||||
String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0, 1);
|
||||
|
||||
res.add(firstInitial.concat(lastName));
|
||||
}
|
||||
else { // is not accurate, meaning it has no defined name and surname
|
||||
List<String> fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
|
||||
if (fullname.size() == 1) {
|
||||
res.add(p.getNormalisedFullname().toLowerCase());
|
||||
}
|
||||
else if (fullname.size() == 2) {
|
||||
res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase());
|
||||
res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase());
|
||||
}
|
||||
else {
|
||||
res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase());
|
||||
res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase());
|
||||
}
|
||||
}
|
||||
res.add(firstInitial.concat(lastName));
|
||||
} else { // is not accurate, meaning it has no defined name and surname
|
||||
List<String> fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
|
||||
if (fullname.size() == 1) {
|
||||
res.add(p.getNormalisedFullname().toLowerCase());
|
||||
} else if (fullname.size() == 2) {
|
||||
res.add(fullname.get(0).substring(0, 1).concat(fullname.get(1)).toLowerCase());
|
||||
res.add(fullname.get(1).substring(0, 1).concat(fullname.get(0)).toLowerCase());
|
||||
} else {
|
||||
res.add(fullname.get(0).substring(0, 1).concat(fullname.get(fullname.size() - 1)).toLowerCase());
|
||||
res.add(fullname.get(fullname.size() - 1).substring(0, 1).concat(fullname.get(0)).toLowerCase());
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,14 +1,17 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("lowercase")
|
||||
public class LowercaseClustering extends AbstractClusteringFunction {
|
||||
|
||||
|
@ -19,7 +22,7 @@ public class LowercaseClustering extends AbstractClusteringFunction {
|
|||
@Override
|
||||
public Collection<String> apply(Config conf, List<String> fields) {
|
||||
Collection<String> c = Sets.newLinkedHashSet();
|
||||
for(String f : fields) {
|
||||
for (String f : fields) {
|
||||
c.addAll(doApply(conf, f));
|
||||
}
|
||||
return c;
|
||||
|
@ -27,7 +30,7 @@ public class LowercaseClustering extends AbstractClusteringFunction {
|
|||
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
if(StringUtils.isBlank(s)) {
|
||||
if (StringUtils.isBlank(s)) {
|
||||
return Lists.newArrayList();
|
||||
}
|
||||
return Lists.newArrayList(s.toLowerCase().trim());
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Set;
|
||||
|
@ -11,7 +12,8 @@ public class NGramUtils extends AbstractPaceFunctions {
|
|||
|
||||
private static final int SIZE = 100;
|
||||
|
||||
private static final Set<String> stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||
private static final Set<String> stopwords = AbstractPaceFunctions
|
||||
.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||
|
||||
public static String cleanupForOrdering(String s) {
|
||||
String result = NGRAMUTILS.filterStopWords(NGRAMUTILS.normalize(s), stopwords);
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
|
@ -6,6 +7,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("ngrampairs")
|
||||
|
@ -32,7 +34,7 @@ public class NgramPairs extends Ngrams {
|
|||
break;
|
||||
}
|
||||
res.add(ngrams.get(i) + ngrams.get(j));
|
||||
//System.out.println("-- " + concatNgrams);
|
||||
// System.out.println("-- " + concatNgrams);
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("ngrams")
|
||||
public class Ngrams extends AbstractClusteringFunction {
|
||||
|
||||
|
@ -44,7 +45,7 @@ public class Ngrams extends AbstractClusteringFunction {
|
|||
}
|
||||
}
|
||||
}
|
||||
//System.out.println(ngrams + " n: " + ngrams.size());
|
||||
// System.out.println(ngrams + " n: " + ngrams.size());
|
||||
return ngrams;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,16 +1,19 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
|
||||
@ClusteringClass("personClustering")
|
||||
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
||||
|
||||
|
@ -30,7 +33,8 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
|
|||
|
||||
final Person person = new Person(f, false);
|
||||
|
||||
if (StringUtils.isNotBlank(person.getNormalisedFirstName()) && StringUtils.isNotBlank(person.getNormalisedSurname())) {
|
||||
if (StringUtils.isNotBlank(person.getNormalisedFirstName())
|
||||
&& StringUtils.isNotBlank(person.getNormalisedSurname())) {
|
||||
hashes.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase());
|
||||
} else {
|
||||
for (final String token1 : tokens(f, MAX_TOKENS)) {
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
|
@ -22,7 +23,8 @@ public class PersonHash extends AbstractClusteringFunction {
|
|||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
|
||||
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive")
|
||||
: DEFAULT_AGGRESSIVE);
|
||||
|
||||
res.add(new Person(s, aggressive).hash());
|
||||
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
public class RandomClusteringFunction extends AbstractClusteringFunction {
|
||||
|
||||
public RandomClusteringFunction(Map<String, Integer> params) {
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.*;
|
||||
|
@ -5,6 +6,7 @@ import java.util.*;
|
|||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("sortedngrampairs")
|
||||
|
|
|
@ -1,15 +1,17 @@
|
|||
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import org.apache.commons.lang3.RandomStringUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("spacetrimmingfieldvalue")
|
||||
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
|
||||
|
||||
|
@ -21,7 +23,10 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
|
|||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));
|
||||
res
|
||||
.add(
|
||||
StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength"))
|
||||
: s.toLowerCase().replaceAll("\\s+", ""));
|
||||
|
||||
return res;
|
||||
}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
|
@ -5,6 +6,7 @@ import java.util.Map;
|
|||
import java.util.Set;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("suffixprefix")
|
||||
|
@ -18,7 +20,7 @@ public class SuffixPrefix extends AbstractClusteringFunction {
|
|||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return suffixPrefix(s, param("len"), param("max"));
|
||||
}
|
||||
|
||||
|
||||
private Collection<String> suffixPrefix(String s, int len, int max) {
|
||||
final Set<String> bigrams = Sets.newLinkedHashSet();
|
||||
int i = 0;
|
||||
|
|
|
@ -1,7 +1,5 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
|
@ -11,42 +9,44 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("urlclustering")
|
||||
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
||||
|
||||
protected Map<String, Integer> params;
|
||||
protected Map<String, Integer> params;
|
||||
|
||||
public UrlClustering(final Map<String, Integer> params) {
|
||||
this.params = params;
|
||||
}
|
||||
public UrlClustering(final Map<String, Integer> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(final Config conf, List<String> fields) {
|
||||
try {
|
||||
return fields.stream()
|
||||
.filter(f -> !f.isEmpty())
|
||||
.map(this::asUrl)
|
||||
.map(URL::getHost)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
catch (IllegalStateException e){
|
||||
return new HashSet<>();
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public Collection<String> apply(final Config conf, List<String> fields) {
|
||||
try {
|
||||
return fields
|
||||
.stream()
|
||||
.filter(f -> !f.isEmpty())
|
||||
.map(this::asUrl)
|
||||
.map(URL::getHost)
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
} catch (IllegalStateException e) {
|
||||
return new HashSet<>();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Integer> getParams() {
|
||||
return null;
|
||||
}
|
||||
|
||||
private URL asUrl(String value) {
|
||||
try {
|
||||
return new URL(value);
|
||||
} catch (MalformedURLException e) {
|
||||
// should not happen as checked by pace typing
|
||||
throw new IllegalStateException("invalid URL: " + value);
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public Map<String, Integer> getParams() {
|
||||
return null;
|
||||
}
|
||||
|
||||
private URL asUrl(String value) {
|
||||
try {
|
||||
return new URL(value);
|
||||
} catch (MalformedURLException e) {
|
||||
// should not happen as checked by pace typing
|
||||
throw new IllegalStateException("invalid URL: " + value);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,90 +1,91 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("wordsStatsSuffixPrefixChain")
|
||||
public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
|
||||
|
||||
public WordsStatsSuffixPrefixChain(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
public WordsStatsSuffixPrefixChain(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return suffixPrefixChain(s, param("mod"));
|
||||
}
|
||||
@Override
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return suffixPrefixChain(s, param("mod"));
|
||||
}
|
||||
|
||||
private Collection<String> suffixPrefixChain(String s, int mod) {
|
||||
private Collection<String> suffixPrefixChain(String s, int mod) {
|
||||
|
||||
//create the list of words from the string (remove short words)
|
||||
List<String> wordsList =
|
||||
Arrays.stream(s.split(" "))
|
||||
.filter(si -> si.length() > 3)
|
||||
.collect(Collectors.toList());
|
||||
// create the list of words from the string (remove short words)
|
||||
List<String> wordsList = Arrays
|
||||
.stream(s.split(" "))
|
||||
.filter(si -> si.length() > 3)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
final int words = wordsList.size();
|
||||
final int letters = s.length();
|
||||
final int words = wordsList.size();
|
||||
final int letters = s.length();
|
||||
|
||||
//create the prefix: number of words + number of letters/mod
|
||||
String prefix = words + "-" + letters/mod + "-";
|
||||
// create the prefix: number of words + number of letters/mod
|
||||
String prefix = words + "-" + letters / mod + "-";
|
||||
|
||||
return doSuffixPrefixChain(wordsList, prefix);
|
||||
return doSuffixPrefixChain(wordsList, prefix);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private Collection<String> doSuffixPrefixChain(List<String> wordsList, String prefix) {
|
||||
private Collection<String> doSuffixPrefixChain(List<String> wordsList, String prefix) {
|
||||
|
||||
Set<String> set = Sets.newLinkedHashSet();
|
||||
switch(wordsList.size()){
|
||||
case 0:
|
||||
case 1:
|
||||
break;
|
||||
case 2:
|
||||
set.add(
|
||||
prefix +
|
||||
suffix(wordsList.get(0), 3) +
|
||||
prefix(wordsList.get(1), 3)
|
||||
);
|
||||
Set<String> set = Sets.newLinkedHashSet();
|
||||
switch (wordsList.size()) {
|
||||
case 0:
|
||||
case 1:
|
||||
break;
|
||||
case 2:
|
||||
set
|
||||
.add(
|
||||
prefix +
|
||||
suffix(wordsList.get(0), 3) +
|
||||
prefix(wordsList.get(1), 3));
|
||||
|
||||
set.add(
|
||||
prefix +
|
||||
prefix(wordsList.get(0), 3) +
|
||||
suffix(wordsList.get(1), 3)
|
||||
);
|
||||
set
|
||||
.add(
|
||||
prefix +
|
||||
prefix(wordsList.get(0), 3) +
|
||||
suffix(wordsList.get(1), 3));
|
||||
|
||||
break;
|
||||
default:
|
||||
set.add(
|
||||
prefix +
|
||||
suffix(wordsList.get(0), 3) +
|
||||
prefix(wordsList.get(1), 3) +
|
||||
suffix(wordsList.get(2), 3)
|
||||
);
|
||||
break;
|
||||
default:
|
||||
set
|
||||
.add(
|
||||
prefix +
|
||||
suffix(wordsList.get(0), 3) +
|
||||
prefix(wordsList.get(1), 3) +
|
||||
suffix(wordsList.get(2), 3));
|
||||
|
||||
set.add(
|
||||
prefix +
|
||||
prefix(wordsList.get(0), 3) +
|
||||
suffix(wordsList.get(1), 3) +
|
||||
prefix(wordsList.get(2), 3)
|
||||
);
|
||||
break;
|
||||
}
|
||||
set
|
||||
.add(
|
||||
prefix +
|
||||
prefix(wordsList.get(0), 3) +
|
||||
suffix(wordsList.get(1), 3) +
|
||||
prefix(wordsList.get(2), 3));
|
||||
break;
|
||||
}
|
||||
|
||||
return set;
|
||||
return set;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private String suffix(String s, int len) {
|
||||
return s.substring(s.length() - len);
|
||||
}
|
||||
|
||||
private String suffix(String s, int len) {
|
||||
return s.substring(s.length()-len);
|
||||
}
|
||||
|
||||
private String prefix(String s, int len) {
|
||||
return s.substring(0, len);
|
||||
}
|
||||
private String prefix(String s, int len) {
|
||||
return s.substring(0, len);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Collection;
|
||||
|
@ -5,53 +6,54 @@ import java.util.Map;
|
|||
import java.util.Set;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("wordssuffixprefix")
|
||||
public class WordsSuffixPrefix extends AbstractClusteringFunction {
|
||||
|
||||
public WordsSuffixPrefix(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
public WordsSuffixPrefix(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return suffixPrefix(s, param("len"), param("max"));
|
||||
}
|
||||
@Override
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return suffixPrefix(s, param("len"), param("max"));
|
||||
}
|
||||
|
||||
private Collection<String> suffixPrefix(String s, int len, int max) {
|
||||
private Collection<String> suffixPrefix(String s, int len, int max) {
|
||||
|
||||
final int words = s.split(" ").length;
|
||||
final int words = s.split(" ").length;
|
||||
|
||||
// adjust the token length according to the number of words
|
||||
switch (words) {
|
||||
case 1:
|
||||
return Sets.newLinkedHashSet();
|
||||
case 2:
|
||||
return doSuffixPrefix(s, len+2, max, words);
|
||||
case 3:
|
||||
return doSuffixPrefix(s, len+1, max, words);
|
||||
default:
|
||||
return doSuffixPrefix(s, len, max, words);
|
||||
}
|
||||
}
|
||||
// adjust the token length according to the number of words
|
||||
switch (words) {
|
||||
case 1:
|
||||
return Sets.newLinkedHashSet();
|
||||
case 2:
|
||||
return doSuffixPrefix(s, len + 2, max, words);
|
||||
case 3:
|
||||
return doSuffixPrefix(s, len + 1, max, words);
|
||||
default:
|
||||
return doSuffixPrefix(s, len, max, words);
|
||||
}
|
||||
}
|
||||
|
||||
private Collection<String> doSuffixPrefix(String s, int len, int max, int words) {
|
||||
final Set<String> bigrams = Sets.newLinkedHashSet();
|
||||
int i = 0;
|
||||
while (++i < s.length() && bigrams.size() < max) {
|
||||
int j = s.indexOf(" ", i);
|
||||
private Collection<String> doSuffixPrefix(String s, int len, int max, int words) {
|
||||
final Set<String> bigrams = Sets.newLinkedHashSet();
|
||||
int i = 0;
|
||||
while (++i < s.length() && bigrams.size() < max) {
|
||||
int j = s.indexOf(" ", i);
|
||||
|
||||
int offset = j + len + 1 < s.length() ? j + len + 1 : s.length();
|
||||
int offset = j + len + 1 < s.length() ? j + len + 1 : s.length();
|
||||
|
||||
if (j - len > 0) {
|
||||
String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
|
||||
if (bigram.length() >= 4) {
|
||||
bigrams.add(words+bigram);
|
||||
}
|
||||
}
|
||||
}
|
||||
return bigrams;
|
||||
}
|
||||
if (j - len > 0) {
|
||||
String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim();
|
||||
if (bigram.length() >= 4) {
|
||||
bigrams.add(words + bigram);
|
||||
}
|
||||
}
|
||||
}
|
||||
return bigrams;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,14 +1,5 @@
|
|||
package eu.dnetlib.pace.common;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
package eu.dnetlib.pace.common;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
|
@ -19,6 +10,18 @@ import java.util.regex.Matcher;
|
|||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
|
||||
/**
|
||||
* Set of common functions for the framework
|
||||
*
|
||||
|
@ -26,321 +29,325 @@ import java.util.stream.Collectors;
|
|||
*/
|
||||
public abstract class AbstractPaceFunctions {
|
||||
|
||||
//city map to be used when translating the city names into codes
|
||||
private static Map<String, String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||
// city map to be used when translating the city names into codes
|
||||
private static Map<String, String> cityMap = AbstractPaceFunctions
|
||||
.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||
|
||||
//list of stopwords in different languages
|
||||
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
|
||||
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
|
||||
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
|
||||
protected static Set<String> stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
|
||||
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
||||
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
||||
// list of stopwords in different languages
|
||||
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
|
||||
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
|
||||
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
|
||||
protected static Set<String> stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
|
||||
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
||||
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
||||
|
||||
//transliterator
|
||||
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
// transliterator
|
||||
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
//blacklist of ngrams: to avoid generic keys
|
||||
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
||||
// blacklist of ngrams: to avoid generic keys
|
||||
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
||||
|
||||
//html regex for normalization
|
||||
public final String HTML_REGEX = "<[^>]*>";
|
||||
// html regex for normalization
|
||||
public final String HTML_REGEX = "<[^>]*>";
|
||||
|
||||
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
|
||||
//doi prefix for normalization
|
||||
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||
// doi prefix for normalization
|
||||
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||
|
||||
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
|
||||
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
|
||||
|
||||
private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
||||
private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
||||
|
||||
protected String concat(final List<String> l) {
|
||||
return Joiner.on(" ").skipNulls().join(l);
|
||||
}
|
||||
protected String concat(final List<String> l) {
|
||||
return Joiner.on(" ").skipNulls().join(l);
|
||||
}
|
||||
|
||||
protected String cleanup(final String s) {
|
||||
protected String cleanup(final String s) {
|
||||
|
||||
final String s1 = s.replaceAll(HTML_REGEX, "");
|
||||
final String s2 = unicodeNormalization(s1.toLowerCase());
|
||||
final String s3 = nfd(s2);
|
||||
final String s4 = fixXML(s3);
|
||||
final String s5 = s4.replaceAll("([0-9]+)", " $1 ");
|
||||
final String s6 = transliterate(s5);
|
||||
final String s7 = fixAliases(s6);
|
||||
final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
|
||||
final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
|
||||
final String s10 = s9.replaceAll("\\n", " ");
|
||||
final String s11 = s10.replaceAll("(?m)\\s+", " ");
|
||||
final String s12 = s11.trim();
|
||||
return s12;
|
||||
}
|
||||
final String s1 = s.replaceAll(HTML_REGEX, "");
|
||||
final String s2 = unicodeNormalization(s1.toLowerCase());
|
||||
final String s3 = nfd(s2);
|
||||
final String s4 = fixXML(s3);
|
||||
final String s5 = s4.replaceAll("([0-9]+)", " $1 ");
|
||||
final String s6 = transliterate(s5);
|
||||
final String s7 = fixAliases(s6);
|
||||
final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
|
||||
final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
|
||||
final String s10 = s9.replaceAll("\\n", " ");
|
||||
final String s11 = s10.replaceAll("(?m)\\s+", " ");
|
||||
final String s12 = s11.trim();
|
||||
return s12;
|
||||
}
|
||||
|
||||
protected String fixXML(final String a){
|
||||
protected String fixXML(final String a) {
|
||||
|
||||
return a.replaceAll("–", " ")
|
||||
.replaceAll("&", " ")
|
||||
.replaceAll(""", " ")
|
||||
.replaceAll("−", " ");
|
||||
}
|
||||
return a
|
||||
.replaceAll("–", " ")
|
||||
.replaceAll("&", " ")
|
||||
.replaceAll(""", " ")
|
||||
.replaceAll("−", " ");
|
||||
}
|
||||
|
||||
protected boolean checkNumbers(final String a, final String b) {
|
||||
final String numbersA = getNumbers(a);
|
||||
final String numbersB = getNumbers(b);
|
||||
final String romansA = getRomans(a);
|
||||
final String romansB = getRomans(b);
|
||||
return !numbersA.equals(numbersB) || !romansA.equals(romansB);
|
||||
}
|
||||
protected boolean checkNumbers(final String a, final String b) {
|
||||
final String numbersA = getNumbers(a);
|
||||
final String numbersB = getNumbers(b);
|
||||
final String romansA = getRomans(a);
|
||||
final String romansB = getRomans(b);
|
||||
return !numbersA.equals(numbersB) || !romansA.equals(romansB);
|
||||
}
|
||||
|
||||
protected String getRomans(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final String t : s.split(" ")) {
|
||||
sb.append(isRoman(t) ? t : "");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
protected String getRomans(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final String t : s.split(" ")) {
|
||||
sb.append(isRoman(t) ? t : "");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected boolean isRoman(final String s) {
|
||||
return s.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop").equals("qwertyuiop");
|
||||
}
|
||||
protected boolean isRoman(final String s) {
|
||||
return s
|
||||
.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop")
|
||||
.equals("qwertyuiop");
|
||||
}
|
||||
|
||||
protected String getNumbers(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final String t : s.split(" ")) {
|
||||
sb.append(isNumber(t) ? t : "");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
protected String getNumbers(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final String t : s.split(" ")) {
|
||||
sb.append(isNumber(t) ? t : "");
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public boolean isNumber(String strNum) {
|
||||
if (strNum == null) {
|
||||
return false;
|
||||
}
|
||||
return numberPattern.matcher(strNum).matches();
|
||||
}
|
||||
public boolean isNumber(String strNum) {
|
||||
if (strNum == null) {
|
||||
return false;
|
||||
}
|
||||
return numberPattern.matcher(strNum).matches();
|
||||
}
|
||||
|
||||
protected static String fixAliases(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final char ch : Lists.charactersOf(s)) {
|
||||
final int i = StringUtils.indexOf(aliases_from, ch);
|
||||
sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
protected static String fixAliases(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final char ch : Lists.charactersOf(s)) {
|
||||
final int i = StringUtils.indexOf(aliases_from, ch);
|
||||
sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected static String transliterate(final String s) {
|
||||
try {
|
||||
return transliterator.transliterate(s);
|
||||
}
|
||||
catch(Exception e) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
protected static String transliterate(final String s) {
|
||||
try {
|
||||
return transliterator.transliterate(s);
|
||||
} catch (Exception e) {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
protected String removeSymbols(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
protected String removeSymbols(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
for (final char ch : Lists.charactersOf(s)) {
|
||||
sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
|
||||
}
|
||||
return sb.toString().replaceAll("\\s+", " ");
|
||||
}
|
||||
|
||||
protected boolean notNull(final String s) {
|
||||
return s != null;
|
||||
}
|
||||
for (final char ch : Lists.charactersOf(s)) {
|
||||
sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
|
||||
}
|
||||
return sb.toString().replaceAll("\\s+", " ");
|
||||
}
|
||||
|
||||
protected String normalize(final String s) {
|
||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
protected boolean notNull(final String s) {
|
||||
return s != null;
|
||||
}
|
||||
|
||||
public String nfd(final String s) {
|
||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
protected String normalize(final String s) {
|
||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
||||
// strings
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
.replaceAll("(\\n)+", " ")
|
||||
.trim();
|
||||
}
|
||||
|
||||
public String utf8(final String s) {
|
||||
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
|
||||
return new String(bytes, StandardCharsets.UTF_8);
|
||||
}
|
||||
public String nfd(final String s) {
|
||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
|
||||
public String unicodeNormalization(final String s) {
|
||||
public String utf8(final String s) {
|
||||
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
|
||||
return new String(bytes, StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
Matcher m = hexUnicodePattern.matcher(s);
|
||||
StringBuffer buf = new StringBuffer(s.length());
|
||||
while (m.find()) {
|
||||
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
|
||||
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
|
||||
}
|
||||
m.appendTail(buf);
|
||||
return buf.toString();
|
||||
}
|
||||
public String unicodeNormalization(final String s) {
|
||||
|
||||
protected String filterStopWords(final String s, final Set<String> stopwords) {
|
||||
final StringTokenizer st = new StringTokenizer(s);
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
while (st.hasMoreTokens()) {
|
||||
final String token = st.nextToken();
|
||||
if (!stopwords.contains(token)) {
|
||||
sb.append(token);
|
||||
sb.append(" ");
|
||||
}
|
||||
}
|
||||
return sb.toString().trim();
|
||||
}
|
||||
Matcher m = hexUnicodePattern.matcher(s);
|
||||
StringBuffer buf = new StringBuffer(s.length());
|
||||
while (m.find()) {
|
||||
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
|
||||
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
|
||||
}
|
||||
m.appendTail(buf);
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
public String filterAllStopWords(String s) {
|
||||
protected String filterStopWords(final String s, final Set<String> stopwords) {
|
||||
final StringTokenizer st = new StringTokenizer(s);
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
while (st.hasMoreTokens()) {
|
||||
final String token = st.nextToken();
|
||||
if (!stopwords.contains(token)) {
|
||||
sb.append(token);
|
||||
sb.append(" ");
|
||||
}
|
||||
}
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
s = filterStopWords(s, stopwords_en);
|
||||
s = filterStopWords(s, stopwords_de);
|
||||
s = filterStopWords(s, stopwords_it);
|
||||
s = filterStopWords(s, stopwords_fr);
|
||||
s = filterStopWords(s, stopwords_pt);
|
||||
s = filterStopWords(s, stopwords_es);
|
||||
s = filterStopWords(s, stopwords_gr);
|
||||
public String filterAllStopWords(String s) {
|
||||
|
||||
return s;
|
||||
}
|
||||
s = filterStopWords(s, stopwords_en);
|
||||
s = filterStopWords(s, stopwords_de);
|
||||
s = filterStopWords(s, stopwords_it);
|
||||
s = filterStopWords(s, stopwords_fr);
|
||||
s = filterStopWords(s, stopwords_pt);
|
||||
s = filterStopWords(s, stopwords_es);
|
||||
s = filterStopWords(s, stopwords_gr);
|
||||
|
||||
protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
|
||||
final Set<String> newset = Sets.newLinkedHashSet();
|
||||
for (final String s : set) {
|
||||
if (!ngramBlacklist.contains(s)) {
|
||||
newset.add(s);
|
||||
}
|
||||
}
|
||||
return newset;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
public static Set<String> loadFromClasspath(final String classpath) {
|
||||
protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
|
||||
final Set<String> newset = Sets.newLinkedHashSet();
|
||||
for (final String s : set) {
|
||||
if (!ngramBlacklist.contains(s)) {
|
||||
newset.add(s);
|
||||
}
|
||||
}
|
||||
return newset;
|
||||
}
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
public static Set<String> loadFromClasspath(final String classpath) {
|
||||
|
||||
final Set<String> h = Sets.newHashSet();
|
||||
try {
|
||||
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
|
||||
h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return Sets.newHashSet();
|
||||
}
|
||||
return h;
|
||||
}
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
||||
final Set<String> h = Sets.newHashSet();
|
||||
try {
|
||||
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
|
||||
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return Sets.newHashSet();
|
||||
}
|
||||
return h;
|
||||
}
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
||||
|
||||
final Map<String, String> m = new HashMap<>();
|
||||
try {
|
||||
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
|
||||
//string is like this: code;word1;word2;word3
|
||||
String[] line = s.split(";");
|
||||
String value = line[0];
|
||||
for (int i = 1; i < line.length; i++) {
|
||||
m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value);
|
||||
}
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return new HashMap<>();
|
||||
}
|
||||
return m;
|
||||
}
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
public String removeKeywords(String s, Set<String> keywords) {
|
||||
final Map<String, String> m = new HashMap<>();
|
||||
try {
|
||||
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
|
||||
// string is like this: code;word1;word2;word3
|
||||
String[] line = s.split(";");
|
||||
String value = line[0];
|
||||
for (int i = 1; i < line.length; i++) {
|
||||
m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value);
|
||||
}
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return new HashMap<>();
|
||||
}
|
||||
return m;
|
||||
}
|
||||
|
||||
s = " " + s + " ";
|
||||
for (String k : keywords) {
|
||||
s = s.replaceAll(k.toLowerCase(), "");
|
||||
}
|
||||
public String removeKeywords(String s, Set<String> keywords) {
|
||||
|
||||
return s.trim();
|
||||
}
|
||||
s = " " + s + " ";
|
||||
for (String k : keywords) {
|
||||
s = s.replaceAll(k.toLowerCase(), "");
|
||||
}
|
||||
|
||||
public double commonElementsPercentage(Set<String> s1, Set<String> s2) {
|
||||
return s.trim();
|
||||
}
|
||||
|
||||
double longer = Math.max(s1.size(), s2.size());
|
||||
return (double) s1.stream().filter(s2::contains).count() / longer;
|
||||
}
|
||||
public double commonElementsPercentage(Set<String> s1, Set<String> s2) {
|
||||
|
||||
//convert the set of keywords to codes
|
||||
public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
|
||||
}
|
||||
double longer = Math.max(s1.size(), s2.size());
|
||||
return (double) s1.stream().filter(s2::contains).count() / longer;
|
||||
}
|
||||
|
||||
public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||
return toCodes(keywords, translationMap);
|
||||
}
|
||||
// convert the set of keywords to codes
|
||||
public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
public Set<String> citiesToCodes(Set<String> keywords) {
|
||||
return toCodes(keywords, cityMap);
|
||||
}
|
||||
public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||
return toCodes(keywords, translationMap);
|
||||
}
|
||||
|
||||
protected String firstLC(final String s) {
|
||||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||
}
|
||||
public Set<String> citiesToCodes(Set<String> keywords) {
|
||||
return toCodes(keywords, cityMap);
|
||||
}
|
||||
|
||||
protected Iterable<String> tokens(final String s, final int maxTokens) {
|
||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||
}
|
||||
protected String firstLC(final String s) {
|
||||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||
}
|
||||
|
||||
public String normalizePid(String pid) {
|
||||
return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
|
||||
}
|
||||
protected Iterable<String> tokens(final String s, final int maxTokens) {
|
||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||
}
|
||||
|
||||
//get the list of keywords into the input string
|
||||
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize) {
|
||||
public String normalizePid(String pid) {
|
||||
return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
|
||||
}
|
||||
|
||||
String s = s1;
|
||||
// get the list of keywords into the input string
|
||||
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize) {
|
||||
|
||||
List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));
|
||||
String s = s1;
|
||||
|
||||
Set<String> codes = new HashSet<>();
|
||||
List<String> tokens = Arrays.asList(s.toLowerCase().split(" "));
|
||||
|
||||
if (tokens.size() < windowSize)
|
||||
windowSize = tokens.size();
|
||||
Set<String> codes = new HashSet<>();
|
||||
|
||||
int length = windowSize;
|
||||
if (tokens.size() < windowSize)
|
||||
windowSize = tokens.size();
|
||||
|
||||
while (length != 0) {
|
||||
int length = windowSize;
|
||||
|
||||
for (int i = 0; i <= tokens.size() - length; i++) {
|
||||
String candidate = concat(tokens.subList(i, i + length));
|
||||
if (translationMap.containsKey(candidate)) {
|
||||
codes.add(candidate);
|
||||
s = s.replace(candidate, "").trim();
|
||||
}
|
||||
}
|
||||
while (length != 0) {
|
||||
|
||||
tokens = Arrays.asList(s.split(" "));
|
||||
length -= 1;
|
||||
}
|
||||
for (int i = 0; i <= tokens.size() - length; i++) {
|
||||
String candidate = concat(tokens.subList(i, i + length));
|
||||
if (translationMap.containsKey(candidate)) {
|
||||
codes.add(candidate);
|
||||
s = s.replace(candidate, "").trim();
|
||||
}
|
||||
}
|
||||
|
||||
return codes;
|
||||
}
|
||||
tokens = Arrays.asList(s.split(" "));
|
||||
length -= 1;
|
||||
}
|
||||
|
||||
public Set<String> getCities(String s1, int windowSize) {
|
||||
return getKeywords(s1, cityMap, windowSize);
|
||||
}
|
||||
return codes;
|
||||
}
|
||||
|
||||
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
|
||||
final StringWriter sw = new StringWriter();
|
||||
try {
|
||||
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
|
||||
return sw.toString();
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||
}
|
||||
}
|
||||
public Set<String> getCities(String s1, int windowSize) {
|
||||
return getKeywords(s1, cityMap, windowSize);
|
||||
}
|
||||
|
||||
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
|
||||
final StringWriter sw = new StringWriter();
|
||||
try {
|
||||
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
|
||||
return sw.toString();
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.config;
|
||||
|
||||
import java.util.List;
|
||||
|
@ -44,7 +45,6 @@ public interface Config {
|
|||
*/
|
||||
public Map<String, Predicate<String>> blacklists();
|
||||
|
||||
|
||||
/**
|
||||
* Translation map.
|
||||
*
|
||||
|
|
|
@ -1,16 +1,5 @@
|
|||
package eu.dnetlib.pace.config;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Maps;
|
||||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.antlr.stringtemplate.StringTemplate;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
package eu.dnetlib.pace.config;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
|
@ -25,139 +14,167 @@ import java.util.regex.Pattern;
|
|||
import java.util.regex.PatternSyntaxException;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.antlr.stringtemplate.StringTemplate;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
public class DedupConfig implements Config, Serializable {
|
||||
private static String CONFIG_TEMPLATE = "dedupConfig.st";
|
||||
private static String CONFIG_TEMPLATE = "dedupConfig.st";
|
||||
|
||||
private PaceConfig pace;
|
||||
private PaceConfig pace;
|
||||
|
||||
private WfConfig wf;
|
||||
private WfConfig wf;
|
||||
|
||||
@JsonIgnore
|
||||
private Map<String, Predicate<String>> blacklists;
|
||||
@JsonIgnore
|
||||
private Map<String, Predicate<String>> blacklists;
|
||||
|
||||
private static Map<String, String> defaults = Maps.newHashMap();
|
||||
private static Map<String, String> defaults = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
defaults.put("dedupRun", "001");
|
||||
defaults.put("entityType", "result");
|
||||
defaults.put("subEntityType", "resulttype");
|
||||
defaults.put("subEntityValue", "publication");
|
||||
defaults.put("orderField", "title");
|
||||
defaults.put("queueMaxSize", "2000");
|
||||
defaults.put("groupMaxSize", "10");
|
||||
defaults.put("slidingWindowSize", "200");
|
||||
defaults.put("rootBuilder", "result");
|
||||
defaults.put("includeChildren", "true");
|
||||
defaults.put("maxIterations", "20");
|
||||
defaults.put("idPath", "$.id");
|
||||
}
|
||||
static {
|
||||
defaults.put("dedupRun", "001");
|
||||
defaults.put("entityType", "result");
|
||||
defaults.put("subEntityType", "resulttype");
|
||||
defaults.put("subEntityValue", "publication");
|
||||
defaults.put("orderField", "title");
|
||||
defaults.put("queueMaxSize", "2000");
|
||||
defaults.put("groupMaxSize", "10");
|
||||
defaults.put("slidingWindowSize", "200");
|
||||
defaults.put("rootBuilder", "result");
|
||||
defaults.put("includeChildren", "true");
|
||||
defaults.put("maxIterations", "20");
|
||||
defaults.put("idPath", "$.id");
|
||||
}
|
||||
|
||||
/**
 * Creates an empty configuration; {@code pace} and {@code wf} are left unset.
 */
public DedupConfig() {
    super();
}
|
||||
/**
 * Creates an empty configuration; {@code pace} and {@code wf} are left unset.
 */
public DedupConfig() {
    super();
}
|
||||
|
||||
public static DedupConfig load(final String json) {
|
||||
public static DedupConfig load(final String json) {
|
||||
|
||||
final DedupConfig config;
|
||||
try {
|
||||
config = new ObjectMapper().readValue(json, DedupConfig.class);
|
||||
config.getPace().initModel();
|
||||
config.getPace().initTranslationMap();
|
||||
final DedupConfig config;
|
||||
try {
|
||||
config = new ObjectMapper().readValue(json, DedupConfig.class);
|
||||
config.getPace().initModel();
|
||||
config.getPace().initTranslationMap();
|
||||
|
||||
config.blacklists = config.getPace().getBlacklists().entrySet()
|
||||
.stream()
|
||||
.map(e -> new AbstractMap.SimpleEntry<String, List<Pattern>>(e.getKey(), e.getValue().stream().filter(s -> !StringUtils.isBlank(s)).map(Pattern::compile).collect(Collectors.toList())))
|
||||
.collect(Collectors.toMap(e -> e.getKey(),
|
||||
e -> (Predicate<String> & Serializable) s -> e.getValue().stream().filter(p -> p.matcher(s).matches()).findFirst().isPresent()))
|
||||
config.blacklists = config
|
||||
.getPace()
|
||||
.getBlacklists()
|
||||
.entrySet()
|
||||
.stream()
|
||||
.map(
|
||||
e -> new AbstractMap.SimpleEntry<String, List<Pattern>>(e.getKey(),
|
||||
e
|
||||
.getValue()
|
||||
.stream()
|
||||
.filter(s -> !StringUtils.isBlank(s))
|
||||
.map(Pattern::compile)
|
||||
.collect(Collectors.toList())))
|
||||
.collect(
|
||||
Collectors
|
||||
.toMap(
|
||||
e -> e.getKey(),
|
||||
e -> (Predicate<String> & Serializable) s -> e
|
||||
.getValue()
|
||||
.stream()
|
||||
.filter(p -> p.matcher(s).matches())
|
||||
.findFirst()
|
||||
.isPresent()))
|
||||
|
||||
;
|
||||
;
|
||||
|
||||
return config;
|
||||
} catch (IOException |
|
||||
PatternSyntaxException e) {
|
||||
throw new PaceException("Error in parsing configuration json", e);
|
||||
}
|
||||
return config;
|
||||
} catch (IOException | PatternSyntaxException e) {
|
||||
throw new PaceException("Error in parsing configuration json", e);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
public static DedupConfig loadDefault() throws IOException {
|
||||
return loadDefault(new HashMap<String, String>());
|
||||
}
|
||||
public static DedupConfig loadDefault() throws IOException {
|
||||
return loadDefault(new HashMap<String, String>());
|
||||
}
|
||||
|
||||
public static DedupConfig loadDefault(final Map<String, String> params) throws IOException {
|
||||
public static DedupConfig loadDefault(final Map<String, String> params) throws IOException {
|
||||
|
||||
final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE));
|
||||
final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE));
|
||||
|
||||
for (final Entry<String, String> e : defaults.entrySet()) {
|
||||
template.setAttribute(e.getKey(), e.getValue());
|
||||
}
|
||||
for (final Entry<String, String> e : params.entrySet()) {
|
||||
if (template.getAttribute(e.getKey()) != null) {
|
||||
template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue());
|
||||
} else {
|
||||
template.setAttribute(e.getKey(), e.getValue());
|
||||
}
|
||||
}
|
||||
for (final Entry<String, String> e : defaults.entrySet()) {
|
||||
template.setAttribute(e.getKey(), e.getValue());
|
||||
}
|
||||
for (final Entry<String, String> e : params.entrySet()) {
|
||||
if (template.getAttribute(e.getKey()) != null) {
|
||||
template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue());
|
||||
} else {
|
||||
template.setAttribute(e.getKey(), e.getValue());
|
||||
}
|
||||
}
|
||||
|
||||
final String json = template.toString();
|
||||
return load(json);
|
||||
}
|
||||
final String json = template.toString();
|
||||
return load(json);
|
||||
}
|
||||
|
||||
private String readFromClasspath(final String resource) throws IOException {
|
||||
return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8);
|
||||
}
|
||||
private String readFromClasspath(final String resource) throws IOException {
|
||||
return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
public PaceConfig getPace() {
|
||||
return pace;
|
||||
}
|
||||
public PaceConfig getPace() {
|
||||
return pace;
|
||||
}
|
||||
|
||||
public void setPace(final PaceConfig pace) {
|
||||
this.pace = pace;
|
||||
}
|
||||
public void setPace(final PaceConfig pace) {
|
||||
this.pace = pace;
|
||||
}
|
||||
|
||||
public WfConfig getWf() {
|
||||
return wf;
|
||||
}
|
||||
public WfConfig getWf() {
|
||||
return wf;
|
||||
}
|
||||
|
||||
public void setWf(final WfConfig wf) {
|
||||
this.wf = wf;
|
||||
}
|
||||
public void setWf(final WfConfig wf) {
|
||||
this.wf = wf;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("unable to serialise configuration", e);
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("unable to serialise configuration", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, TreeNodeDef> decisionTree() {
|
||||
return getPace().getDecisionTree();
|
||||
}
|
||||
@Override
|
||||
public Map<String, TreeNodeDef> decisionTree() {
|
||||
return getPace().getDecisionTree();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<FieldDef> model() {
|
||||
return getPace().getModel();
|
||||
}
|
||||
@Override
|
||||
public List<FieldDef> model() {
|
||||
return getPace().getModel();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<ClusteringDef> clusterings() {
|
||||
return getPace().getClustering();
|
||||
}
|
||||
@Override
|
||||
public List<ClusteringDef> clusterings() {
|
||||
return getPace().getClustering();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, Predicate<String>> blacklists() {
|
||||
return blacklists;
|
||||
}
|
||||
@Override
|
||||
public Map<String, Predicate<String>> blacklists() {
|
||||
return blacklists;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> translationMap() {
|
||||
return getPace().translationMap();
|
||||
}
|
||||
@Override
|
||||
public Map<String, String> translationMap() {
|
||||
return getPace().translationMap();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,19 +1,20 @@
|
|||
|
||||
package eu.dnetlib.pace.config;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.ibm.icu.text.Transliterator;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
import eu.dnetlib.pace.util.PaceResolver;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class PaceConfig extends AbstractPaceFunctions implements Serializable {
|
||||
|
||||
private List<FieldDef> model;
|
||||
|
@ -37,7 +38,8 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable {
|
|||
@JsonIgnore
|
||||
public static PaceResolver resolver = new PaceResolver();
|
||||
|
||||
/**
 * Creates an empty configuration; no field is initialised here.
 * NOTE(review): presumably kept for reflective instantiation (e.g. JSON binding) - confirm.
 */
public PaceConfig() {
	super();
}
|
||||
|
||||
public void initModel() {
|
||||
modelMap = Maps.newHashMap();
|
||||
|
@ -46,20 +48,21 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable {
|
|||
}
|
||||
}
|
||||
|
||||
public void initTranslationMap(){
|
||||
public void initTranslationMap() {
|
||||
translationMap = Maps.newHashMap();
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
for (String key : synonyms.keySet()) {
|
||||
for (String term : synonyms.get(key)){
|
||||
translationMap.put(
|
||||
for (String term : synonyms.get(key)) {
|
||||
translationMap
|
||||
.put(
|
||||
fixAliases(transliterator.transliterate(term.toLowerCase())),
|
||||
key);
|
||||
key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Map<String, String> translationMap(){
|
||||
public Map<String, String> translationMap() {
|
||||
return translationMap;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.config;
|
||||
|
||||
public enum Type {
|
||||
|
|
|
@ -1,10 +1,5 @@
|
|||
package eu.dnetlib.pace.config;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
package eu.dnetlib.pace.config;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
|
@ -12,6 +7,13 @@ import java.util.HashSet;
|
|||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
public class WfConfig implements Serializable {
|
||||
|
||||
|
@ -76,7 +78,6 @@ public class WfConfig implements Serializable {
|
|||
/** Maximum number of allowed children. */
|
||||
private int maxChildren = MAX_CHILDREN;
|
||||
|
||||
|
||||
/** Default maximum number of iterations. */
|
||||
private final static int MAX_ITERATIONS = 20;
|
||||
|
||||
|
@ -84,9 +85,10 @@ public class WfConfig implements Serializable {
|
|||
private int maxIterations = MAX_ITERATIONS;
|
||||
|
||||
/** The Jquery path to retrieve the identifier */
|
||||
private String idPath = "$.id";
|
||||
private String idPath = "$.id";
|
||||
|
||||
/**
 * Creates a configuration in which every field keeps its declared initial value.
 */
public WfConfig() {
	super();
}
|
||||
|
||||
/**
|
||||
* Instantiates a new dedup config.
|
||||
|
@ -114,8 +116,10 @@ public class WfConfig implements Serializable {
|
|||
* @param idPath
|
||||
* the path for the id of the entity
|
||||
*/
|
||||
public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder, final String dedupRun,
|
||||
final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations, final String idPath) {
|
||||
public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder,
|
||||
final String dedupRun,
|
||||
final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize,
|
||||
final boolean includeChildren, final int maxIterations, final String idPath) {
|
||||
super();
|
||||
this.entityType = entityType;
|
||||
this.orderField = orderField;
|
||||
|
@ -257,7 +261,6 @@ public class WfConfig implements Serializable {
|
|||
this.maxChildren = maxChildren;
|
||||
}
|
||||
|
||||
|
||||
public int getMaxIterations() {
|
||||
return maxIterations;
|
||||
}
|
||||
|
@ -277,7 +280,6 @@ public class WfConfig implements Serializable {
|
|||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.lang.Object#toString()
|
||||
*/
|
||||
@Override
|
||||
|
|
|
@ -1,15 +1,16 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.pace.clustering.ClusteringFunction;
|
||||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.pace.clustering.ClusteringFunction;
|
||||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
public class ClusteringDef implements Serializable {
|
||||
|
||||
|
@ -19,7 +20,8 @@ public class ClusteringDef implements Serializable {
|
|||
|
||||
private Map<String, Integer> params;
|
||||
|
||||
/**
 * Creates an empty clustering definition; no field is initialised here.
 * NOTE(review): presumably kept for reflective instantiation (e.g. JSON binding) - confirm.
 */
public ClusteringDef() {
	super();
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
|
||||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
|
||||
/**
|
||||
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm.
|
||||
|
@ -34,7 +36,8 @@ public class FieldDef implements Serializable {
|
|||
*/
|
||||
private int length = -1;
|
||||
|
||||
/**
 * Creates a field definition in which every field keeps its declared initial value.
 * NOTE(review): presumably kept for reflective instantiation (e.g. JSON binding) - confirm.
 */
public FieldDef() {
	super();
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
|
@ -43,7 +44,7 @@ public class Person {
|
|||
// s = s.replaceAll("[\\W&&[^,-]]", "");
|
||||
}
|
||||
|
||||
if (s.contains(",")) { //if the name contains a comma it is easy derivable the name and the surname
|
||||
if (s.contains(",")) { // if the name contains a comma it is easy derivable the name and the surname
|
||||
final String[] arr = s.split(",");
|
||||
if (arr.length == 1) {
|
||||
fullname = splitTerms(arr[0]);
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
@ -57,7 +58,7 @@ public class PersonComparatorUtils {
|
|||
|
||||
private static boolean verifyNames(List<String> list1, List<String> list2) {
|
||||
return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2))
|
||||
&& verifySimilarity(extractInitials(list1), extractInitials(list2));
|
||||
&& verifySimilarity(extractInitials(list1), extractInitials(list2));
|
||||
}
|
||||
|
||||
private static boolean verifySurnames(List<String> list1, List<String> list2) {
|
||||
|
@ -76,7 +77,7 @@ public class PersonComparatorUtils {
|
|||
Collections.sort(list1);
|
||||
Collections.sort(list2);
|
||||
return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2))
|
||||
&& verifySimilarity(extractInitials(list1), extractInitials(list2));
|
||||
&& verifySimilarity(extractInitials(list1), extractInitials(list2));
|
||||
}
|
||||
|
||||
private static List<String> extractExtendedNames(List<String> list) {
|
||||
|
@ -107,7 +108,7 @@ public class PersonComparatorUtils {
|
|||
for (String s : list1) {
|
||||
int curr = list2.indexOf(s);
|
||||
if (curr > pos) {
|
||||
list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm"
|
||||
list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm"
|
||||
pos = curr;
|
||||
} else {
|
||||
return false;
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
|
||||
package eu.dnetlib.pace.model;
|
||||
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
import java.util.Comparator;
|
||||
|
||||
import org.apache.spark.sql.Row;
|
||||
|
||||
import java.util.Comparator;
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
|
||||
/**
|
||||
* The Class MapDocumentComparator.
|
||||
|
@ -25,13 +27,12 @@ public class RowDataOrderingComparator implements Comparator<Row> {
|
|||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
|
||||
*/
|
||||
@Override
|
||||
public int compare(final Row d1, final Row d2) {
|
||||
if (d1 == null)
|
||||
return d2==null ? 0: -1;
|
||||
return d2 == null ? 0 : -1;
|
||||
else if (d2 == null) {
|
||||
return 1;
|
||||
}
|
||||
|
@ -40,7 +41,7 @@ public class RowDataOrderingComparator implements Comparator<Row> {
|
|||
final String o2 = d2.getString(comparatorField);
|
||||
|
||||
if (o1 == null)
|
||||
return o2==null ? 0: -1;
|
||||
return o2 == null ? 0 : -1;
|
||||
else if (o2 == null) {
|
||||
return 1;
|
||||
}
|
||||
|
|
|
@ -1,32 +1,30 @@
|
|||
package eu.dnetlib.dhp.oa.dedup.model
|
||||
package eu.dnetlib.pace.model
|
||||
|
||||
import com.jayway.jsonpath.{Configuration, JsonPath, Option}
|
||||
import eu.dnetlib.dhp.oa.dedup.{DedupUtility, SparkReporter}
|
||||
import eu.dnetlib.pace.config.{DedupConfig, Type}
|
||||
import eu.dnetlib.pace.model.{ClusteringDef, FieldDef}
|
||||
import eu.dnetlib.pace.tree.support.TreeProcessor
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil.truncateValue
|
||||
import eu.dnetlib.pace.util.{BlockProcessor, MapDocumentUtil}
|
||||
import eu.dnetlib.pace.util.{BlockProcessor, MapDocumentUtil, SparkReporter}
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.sql.{Column, Dataset, Row, functions}
|
||||
import org.apache.spark.sql.catalyst.expressions.{GenericRowWithSchema, Literal}
|
||||
import org.apache.spark.sql.expressions.{UserDefinedFunction, Window}
|
||||
import org.apache.spark.sql.functions.{col, lit, udf}
|
||||
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
|
||||
import org.apache.spark.sql.{Column, Dataset, Row, functions}
|
||||
|
||||
import java.util
|
||||
import java.util.function.Predicate
|
||||
import java.util.regex.Pattern
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.collection.mutable
|
||||
import org.apache.spark.sql.functions.{col, lit, udf}
|
||||
|
||||
class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializable {
|
||||
case class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializable {
|
||||
|
||||
private val URL_REGEX: Pattern = Pattern.compile("^\\s*(http|https|ftp)\\://.*")
|
||||
|
||||
private val CONCAT_REGEX: Pattern = Pattern.compile("\\|\\|\\|")
|
||||
|
||||
private var urlFilter = (s: String) => URL_REGEX.matcher(s).matches
|
||||
private val urlFilter = (s: String) => URL_REGEX.matcher(s).matches
|
||||
|
||||
val modelExtractor: (Dataset[String] => Dataset[Row]) = df => {
|
||||
df.withColumn("mapDocument", rowFromJsonUDF.apply(df.col(df.columns(0))))
|
||||
|
@ -226,60 +224,59 @@ class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializab
|
|||
|
||||
val orderingFieldPosition: Int = rowDataType.fieldIndex(conf.getWf.getOrderField)
|
||||
|
||||
/** Parses a JSON document and projects it onto a Row laid out according to `rowDataType`.
  *
  * The identifier is read with the configured id path; every other column whose name has a
  * field definition in the pace model map is filled according to the type of that definition.
  * Columns without a definition keep the array's initial value. JSONPath evaluation runs with
  * `Option.SUPPRESS_EXCEPTIONS`.
  */
val rowFromJson = (json: String) => {
  val jsonPathConf = Configuration.defaultConfiguration.addOptions(Option.SUPPRESS_EXCEPTIONS)
  val documentContext = JsonPath.using(jsonPathConf).parse(json)
  val values = new Array[Any](rowDataType.size)

  values(identityFieldPosition) = MapDocumentUtil.getJPathString(conf.getWf.getIdPath, documentContext)

  rowDataType.fieldNames.zipWithIndex.foreach { case (fname, index) =>
    val fdef = conf.getPace.getModelMap.get(fname)

    if (fdef != null) {
      values(index) = fdef.getType match {
        case Type.String | Type.Int =>
          val raw = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
          MapDocumentUtil.truncateValue(raw, fdef.getLength)

        case Type.URL =>
          // values rejected by the URL filter are replaced by the empty string
          val candidate = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
          if (urlFilter(candidate)) candidate else ""

        case Type.List | Type.JSON =>
          val items = MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType)
          MapDocumentUtil.truncateList(items, fdef.getSize)

        case Type.StringConcat =>
          // the path holds several JSONPaths separated by "|||"; their values are joined by a space
          val joined = CONCAT_REGEX
            .split(fdef.getPath)
            .map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
            .mkString(" ")
          truncateValue(joined, fdef.getLength)

        case Type.DoubleArray =>
          MapDocumentUtil.getJPathArray(fdef.getPath, json)
      }
    }
  }

  new GenericRowWithSchema(values, rowDataType)
}

/** Spark UDF wrapping `rowFromJson`, declared with `rowDataType` as its result type. */
val rowFromJsonUDF = udf(rowFromJson, rowDataType)
|
||||
|
||||
def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = {
|
||||
|
||||
|
@ -310,7 +307,7 @@ class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializab
|
|||
}
|
||||
|
||||
def processBlock(implicit sc: SparkContext) = {
|
||||
val accumulators = DedupUtility.constructAccumulator(conf, sc)
|
||||
val accumulators = SparkReporter.constructAccumulator(conf, sc)
|
||||
|
||||
udf[Array[Tuple2[String, String]], mutable.WrappedArray[Row]](block => {
|
||||
val reporter = new SparkReporter(accumulators)
|
|
@ -1,41 +1,42 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("alwaysMatch")
|
||||
public class AlwaysMatch<T> extends AbstractComparator<T> {
|
||||
|
||||
public AlwaysMatch(final Map<String, String> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
public AlwaysMatch(final Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public AlwaysMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
public AlwaysMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Object a, final Object b, final Config conf) {
|
||||
return 1.0;
|
||||
}
|
||||
@Override
|
||||
public double compare(final Object a, final Object b, final Config conf) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -1,148 +1,157 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("authorsMatch")
|
||||
public class AuthorsMatch extends AbstractListComparator {
|
||||
|
||||
Map<String, String> params;
|
||||
Map<String, String> params;
|
||||
|
||||
private double SURNAME_THRESHOLD;
|
||||
private double NAME_THRESHOLD;
|
||||
private double FULLNAME_THRESHOLD;
|
||||
private String MODE; //full or surname
|
||||
private int SIZE_THRESHOLD;
|
||||
private String TYPE; //count or percentage
|
||||
private int common;
|
||||
private double SURNAME_THRESHOLD;
|
||||
private double NAME_THRESHOLD;
|
||||
private double FULLNAME_THRESHOLD;
|
||||
private String MODE; // full or surname
|
||||
private int SIZE_THRESHOLD;
|
||||
private String TYPE; // count or percentage
|
||||
private int common;
|
||||
|
||||
public AuthorsMatch(Map<String, String> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
this.params = params;
|
||||
public AuthorsMatch(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
this.params = params;
|
||||
|
||||
MODE = params.getOrDefault("mode", "full");
|
||||
SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
|
||||
NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
|
||||
FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
|
||||
SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
|
||||
TYPE = params.getOrDefault("type", "percentage");
|
||||
common = 0;
|
||||
}
|
||||
MODE = params.getOrDefault("mode", "full");
|
||||
SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
|
||||
NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
|
||||
FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
|
||||
SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
|
||||
TYPE = params.getOrDefault("type", "percentage");
|
||||
common = 0;
|
||||
}
|
||||
|
||||
protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||
@Override
|
||||
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
|
||||
if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
|
||||
return 1.0;
|
||||
if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
|
||||
return 1.0;
|
||||
|
||||
List<Person> aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
||||
List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
||||
List<Person> aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
||||
List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
||||
|
||||
common = 0;
|
||||
//compare each element of List1 with each element of List2
|
||||
for (Person p1 : aList)
|
||||
common = 0;
|
||||
// compare each element of List1 with each element of List2
|
||||
for (Person p1 : aList)
|
||||
|
||||
for (Person p2 : bList) {
|
||||
for (Person p2 : bList) {
|
||||
|
||||
//both persons are inaccurate
|
||||
if (!p1.isAccurate() && !p2.isAccurate()) {
|
||||
//compare just normalized fullnames
|
||||
String fullname1 = normalization(p1.getNormalisedFullname().isEmpty()? p1.getOriginal() : p1.getNormalisedFullname());
|
||||
String fullname2 = normalization(p2.getNormalisedFullname().isEmpty()? p2.getOriginal() : p2.getNormalisedFullname());
|
||||
// both persons are inaccurate
|
||||
if (!p1.isAccurate() && !p2.isAccurate()) {
|
||||
// compare just normalized fullnames
|
||||
String fullname1 = normalization(
|
||||
p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname());
|
||||
String fullname2 = normalization(
|
||||
p2.getNormalisedFullname().isEmpty() ? p2.getOriginal() : p2.getNormalisedFullname());
|
||||
|
||||
if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) {
|
||||
common += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) {
|
||||
common += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//one person is inaccurate
|
||||
if (p1.isAccurate() ^ p2.isAccurate()) {
|
||||
//prepare data
|
||||
//data for the accurate person
|
||||
String name = normalization(p1.isAccurate()? p1.getNormalisedFirstName() : p2.getNormalisedFirstName());
|
||||
String surname = normalization(p1.isAccurate()? p1.getNormalisedSurname() : p2.getNormalisedSurname());
|
||||
// one person is inaccurate
|
||||
if (p1.isAccurate() ^ p2.isAccurate()) {
|
||||
// prepare data
|
||||
// data for the accurate person
|
||||
String name = normalization(
|
||||
p1.isAccurate() ? p1.getNormalisedFirstName() : p2.getNormalisedFirstName());
|
||||
String surname = normalization(
|
||||
p1.isAccurate() ? p1.getNormalisedSurname() : p2.getNormalisedSurname());
|
||||
|
||||
//data for the inaccurate person
|
||||
String fullname = normalization(
|
||||
p1.isAccurate() ? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname()) : (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname())
|
||||
);
|
||||
// data for the inaccurate person
|
||||
String fullname = normalization(
|
||||
p1.isAccurate()
|
||||
? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname())
|
||||
: (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname()));
|
||||
|
||||
if (fullname.contains(surname)) {
|
||||
if (MODE.equals("full")) {
|
||||
if (fullname.contains(name)) {
|
||||
common += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else { //MODE equals "surname"
|
||||
common += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (fullname.contains(surname)) {
|
||||
if (MODE.equals("full")) {
|
||||
if (fullname.contains(name)) {
|
||||
common += 1;
|
||||
break;
|
||||
}
|
||||
} else { // MODE equals "surname"
|
||||
common += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//both persons are accurate
|
||||
if (p1.isAccurate() && p2.isAccurate()) {
|
||||
// both persons are accurate
|
||||
if (p1.isAccurate() && p2.isAccurate()) {
|
||||
|
||||
if (compareSurname(p1, p2)) {
|
||||
if (MODE.equals("full")) {
|
||||
if(compareFirstname(p1, p2)) {
|
||||
common += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else { //MODE equals "surname"
|
||||
common += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (compareSurname(p1, p2)) {
|
||||
if (MODE.equals("full")) {
|
||||
if (compareFirstname(p1, p2)) {
|
||||
common += 1;
|
||||
break;
|
||||
}
|
||||
} else { // MODE equals "surname"
|
||||
common += 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
//normalization factor to compute the score
|
||||
int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
|
||||
// normalization factor to compute the score
|
||||
int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
|
||||
|
||||
if(TYPE.equals("percentage")) {
|
||||
return (double) common / normFactor;
|
||||
}
|
||||
else {
|
||||
return (double) common;
|
||||
}
|
||||
}
|
||||
if (TYPE.equals("percentage")) {
|
||||
return (double) common / normFactor;
|
||||
} else {
|
||||
return (double) common;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean compareSurname(Person p1, Person p2) {
|
||||
return ssalgo.score(normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD;
|
||||
}
|
||||
public boolean compareSurname(Person p1, Person p2) {
|
||||
return ssalgo
|
||||
.score(
|
||||
normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD;
|
||||
}
|
||||
|
||||
public boolean compareFirstname(Person p1, Person p2) {
|
||||
public boolean compareFirstname(Person p1, Person p2) {
|
||||
|
||||
if(p1.getNormalisedFirstName().length()<=2 || p2.getNormalisedFirstName().length()<=2) {
|
||||
if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName())))
|
||||
return true;
|
||||
}
|
||||
if (p1.getNormalisedFirstName().length() <= 2 || p2.getNormalisedFirstName().length() <= 2) {
|
||||
if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName())))
|
||||
return true;
|
||||
}
|
||||
|
||||
return ssalgo.score(normalization(p1.getNormalisedFirstName()), normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD;
|
||||
}
|
||||
return ssalgo
|
||||
.score(
|
||||
normalization(p1.getNormalisedFirstName()),
|
||||
normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD;
|
||||
}
|
||||
|
||||
public String normalization(String s) {
|
||||
return normalize(utf8(cleanup(s)));
|
||||
}
|
||||
public String normalization(String s) {
|
||||
return normalize(utf8(cleanup(s)));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,47 +1,48 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
@ComparatorClass("cityMatch")
|
||||
public class CityMatch extends AbstractStringComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
private Map<String, String> params;
|
||||
|
||||
public CityMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
public CityMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
Set<String> codes1 = citiesToCodes(cities1);
|
||||
Set<String> codes2 = citiesToCodes(cities2);
|
||||
Set<String> codes1 = citiesToCodes(cities1);
|
||||
Set<String> codes2 = citiesToCodes(cities2);
|
||||
|
||||
//if no cities are detected, the comparator gives 1.0
|
||||
if (codes1.isEmpty() && codes2.isEmpty())
|
||||
return 1.0;
|
||||
else {
|
||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||
return -1; //undefined if one of the two has no cities
|
||||
return commonElementsPercentage(codes1, codes2);
|
||||
}
|
||||
}
|
||||
// if no cities are detected, the comparator gives 1.0
|
||||
if (codes1.isEmpty() && codes2.isEmpty())
|
||||
return 1.0;
|
||||
else {
|
||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||
return -1; // undefined if one of the two has no cities
|
||||
return commonElementsPercentage(codes1, codes2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,47 +1,47 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("cosineSimilarity")
|
||||
public class CosineSimilarity extends AbstractComparator<double[]> {
|
||||
|
||||
Map<String, String> params;
|
||||
Map<String, String> params;
|
||||
|
||||
public CosineSimilarity(Map<String,String> params) {
|
||||
super(params);
|
||||
}
|
||||
public CosineSimilarity(Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(Object a, Object b, Config config) {
|
||||
return compare((double[])a, (double[])b, config);
|
||||
}
|
||||
@Override
|
||||
public double compare(Object a, Object b, Config config) {
|
||||
return compare((double[]) a, (double[]) b, config);
|
||||
}
|
||||
|
||||
public double compare(final double[] a, final double[] b, final Config conf) {
|
||||
public double compare(final double[] a, final double[] b, final Config conf) {
|
||||
|
||||
if (a.length == 0 || b.length == 0)
|
||||
return -1;
|
||||
if (a.length == 0 || b.length == 0)
|
||||
return -1;
|
||||
|
||||
return cosineSimilarity(a, b);
|
||||
}
|
||||
return cosineSimilarity(a, b);
|
||||
}
|
||||
|
||||
double cosineSimilarity(double[] a, double[] b) {
|
||||
double dotProduct = 0;
|
||||
double normASum = 0;
|
||||
double normBSum = 0;
|
||||
double cosineSimilarity(double[] a, double[] b) {
|
||||
double dotProduct = 0;
|
||||
double normASum = 0;
|
||||
double normBSum = 0;
|
||||
|
||||
for(int i = 0; i < a.length; i ++) {
|
||||
dotProduct += a[i] * b[i];
|
||||
normASum += a[i] * a[i];
|
||||
normBSum += b[i] * b[i];
|
||||
}
|
||||
|
||||
double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum);
|
||||
return dotProduct / eucledianDist;
|
||||
}
|
||||
for (int i = 0; i < a.length; i++) {
|
||||
dotProduct += a[i] * b[i];
|
||||
normASum += a[i] * a[i];
|
||||
normBSum += b[i] * b[i];
|
||||
}
|
||||
|
||||
double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum);
|
||||
return dotProduct / eucledianDist;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,9 +1,10 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
/**
|
||||
* The Class ExactMatch.
|
||||
*
|
||||
|
@ -12,15 +13,15 @@ import java.util.Map;
|
|||
@ComparatorClass("doiExactMatch")
|
||||
public class DoiExactMatch extends ExactMatchIgnoreCase {
|
||||
|
||||
public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||
public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||
|
||||
public DoiExactMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
public DoiExactMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String toString(final Object f) {
|
||||
return super.toString(f).replaceAll(PREFIX, "");
|
||||
}
|
||||
@Override
|
||||
protected String toString(final Object f) {
|
||||
return super.toString(f).replaceAll(PREFIX, "");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,29 +1,30 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("domainExactMatch")
|
||||
public class DomainExactMatch extends ExactMatchIgnoreCase {
|
||||
|
||||
public DomainExactMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
public DomainExactMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected String toString(final Object f) {
|
||||
@Override
|
||||
protected String toString(final Object f) {
|
||||
|
||||
try {
|
||||
return asUrl(super.toString(f)).getHost();
|
||||
} catch (MalformedURLException e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
try {
|
||||
return asUrl(super.toString(f)).getHost();
|
||||
} catch (MalformedURLException e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
private URL asUrl(final String value) throws MalformedURLException {
|
||||
return new URL(value);
|
||||
}
|
||||
private URL asUrl(final String value) throws MalformedURLException {
|
||||
return new URL(value);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,42 +1,44 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("exactMatch")
|
||||
public class ExactMatch extends AbstractStringComparator {
|
||||
|
||||
public ExactMatch(Map<String, String> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
public ExactMatch(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public ExactMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
public ExactMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1.0; //return -1 if a field is missing
|
||||
}
|
||||
return a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1.0; // return -1 if a field is missing
|
||||
}
|
||||
return a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,30 +1,32 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("exactMatchIgnoreCase")
|
||||
public class ExactMatchIgnoreCase extends AbstractStringComparator {
|
||||
|
||||
public ExactMatchIgnoreCase(Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
public ExactMatchIgnoreCase(Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(String a, String b, final Config conf) {
|
||||
@Override
|
||||
public double compare(String a, String b, final Config conf) {
|
||||
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
|
||||
return a.equalsIgnoreCase(b) ? 1 : 0;
|
||||
}
|
||||
return a.equalsIgnoreCase(b) ? 1 : 0;
|
||||
}
|
||||
|
||||
protected String toString(final Object object) {
|
||||
return toFirstString(object);
|
||||
}
|
||||
}
|
||||
protected String toString(final Object object) {
|
||||
return toFirstString(object);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,9 +1,5 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
@ -11,70 +7,74 @@ import java.util.Map;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("instanceTypeMatch")
|
||||
public class InstanceTypeMatch extends AbstractListComparator {
|
||||
|
||||
final Map<String, String> translationMap = new HashMap<>();
|
||||
final Map<String, String> translationMap = new HashMap<>();
|
||||
|
||||
public InstanceTypeMatch(Map<String, String> params){
|
||||
super(params);
|
||||
public InstanceTypeMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
|
||||
//jolly types
|
||||
translationMap.put("Conference object", "*");
|
||||
translationMap.put("Other literature type", "*");
|
||||
translationMap.put("Unknown", "*");
|
||||
// jolly types
|
||||
translationMap.put("Conference object", "*");
|
||||
translationMap.put("Other literature type", "*");
|
||||
translationMap.put("Unknown", "*");
|
||||
|
||||
//article types
|
||||
translationMap.put("Article", "Article");
|
||||
translationMap.put("Data Paper", "Article");
|
||||
translationMap.put("Software Paper", "Article");
|
||||
translationMap.put("Preprint", "Article");
|
||||
// article types
|
||||
translationMap.put("Article", "Article");
|
||||
translationMap.put("Data Paper", "Article");
|
||||
translationMap.put("Software Paper", "Article");
|
||||
translationMap.put("Preprint", "Article");
|
||||
|
||||
//thesis types
|
||||
translationMap.put("Thesis", "Thesis");
|
||||
translationMap.put("Master thesis", "Thesis");
|
||||
translationMap.put("Bachelor thesis", "Thesis");
|
||||
translationMap.put("Doctoral thesis", "Thesis");
|
||||
}
|
||||
// thesis types
|
||||
translationMap.put("Thesis", "Thesis");
|
||||
translationMap.put("Master thesis", "Thesis");
|
||||
translationMap.put("Bachelor thesis", "Thesis");
|
||||
translationMap.put("Doctoral thesis", "Thesis");
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||
|
||||
@Override
|
||||
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||
if (a == null || b == null) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (a == null || b == null) {
|
||||
return -1;
|
||||
}
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
final Set<String> ca = a.stream().map(this::translate).collect(Collectors.toSet());
|
||||
final Set<String> cb = b.stream().map(this::translate).collect(Collectors.toSet());
|
||||
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1;
|
||||
}
|
||||
// if at least one is a jolly type, it must produce a match
|
||||
if (ca.contains("*") || cb.contains("*"))
|
||||
return 1.0;
|
||||
|
||||
final Set<String> ca = a.stream().map(this::translate).collect(Collectors.toSet());
|
||||
final Set<String> cb = b.stream().map(this::translate).collect(Collectors.toSet());
|
||||
int incommon = Sets.intersection(ca, cb).size();
|
||||
|
||||
//if at least one is a jolly type, it must produce a match
|
||||
if (ca.contains("*") || cb.contains("*"))
|
||||
return 1.0;
|
||||
// if at least one is in common, it must produce a match
|
||||
return incommon >= 1 ? 1 : 0;
|
||||
}
|
||||
|
||||
int incommon = Sets.intersection(ca, cb).size();
|
||||
public String translate(String term) {
|
||||
return translationMap.getOrDefault(term, term);
|
||||
}
|
||||
|
||||
//if at least one is in common, it must produce a match
|
||||
return incommon >= 1 ? 1 : 0;
|
||||
}
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
public String translate(String term){
|
||||
return translationMap.getOrDefault(term, term);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,44 +1,46 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||
@ComparatorClass("jaroWinkler")
|
||||
public class JaroWinkler extends AbstractStringComparator {
|
||||
|
||||
public JaroWinkler(Map<String, String> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
public JaroWinkler(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
public JaroWinkler(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
public JaroWinkler(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinkler(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
protected JaroWinkler(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
@Override
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
return normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
return normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,70 +1,74 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("jaroWinklerNormalizedName")
|
||||
public class JaroWinklerNormalizedName extends AbstractStringComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
private Map<String, String> params;
|
||||
|
||||
public JaroWinklerNormalizedName(Map<String, String> params){
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
this.params = params;
|
||||
}
|
||||
public JaroWinklerNormalizedName(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public JaroWinklerNormalizedName(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
public JaroWinklerNormalizedName(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
@Override
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> keywords1 = getKeywords(
|
||||
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> keywords2 = getKeywords(
|
||||
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
ca = removeKeywords(ca, keywords1);
|
||||
ca = removeKeywords(ca, cities1);
|
||||
cb = removeKeywords(cb, keywords2);
|
||||
cb = removeKeywords(cb, cities2);
|
||||
ca = removeKeywords(ca, keywords1);
|
||||
ca = removeKeywords(ca, cities1);
|
||||
cb = removeKeywords(cb, keywords2);
|
||||
cb = removeKeywords(cb, cities2);
|
||||
|
||||
ca = ca.replaceAll("[ ]{2,}", " ");
|
||||
cb = cb.replaceAll("[ ]{2,}", " ");
|
||||
ca = ca.replaceAll("[ ]{2,}", " ");
|
||||
cb = cb.replaceAll("[ ]{2,}", " ");
|
||||
|
||||
if (ca.isEmpty() && cb.isEmpty())
|
||||
return 1.0;
|
||||
else
|
||||
return normalize(ssalgo.score(ca,cb));
|
||||
}
|
||||
if (ca.isEmpty() && cb.isEmpty())
|
||||
return 1.0;
|
||||
else
|
||||
return normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,17 +1,19 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||
@ComparatorClass("jaroWinklerTitle")
|
||||
public class JaroWinklerTitle extends AbstractStringComparator {
|
||||
|
||||
public JaroWinklerTitle(Map<String, String> params){
|
||||
public JaroWinklerTitle(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
|
@ -22,7 +24,7 @@ public class JaroWinklerTitle extends AbstractStringComparator {
|
|||
protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
|
@ -30,7 +32,7 @@ public class JaroWinklerTitle extends AbstractStringComparator {
|
|||
|
||||
boolean check = checkNumbers(ca, cb);
|
||||
return check ? 0.5 : normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
|
|
|
@ -1,72 +1,76 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
|
||||
@ComparatorClass("jsonListMatch")
|
||||
public class JsonListMatch extends AbstractListComparator {
|
||||
|
||||
private static final Log log = LogFactory.getLog(JsonListMatch.class);
|
||||
private Map<String, String> params;
|
||||
private static final Log log = LogFactory.getLog(JsonListMatch.class);
|
||||
private Map<String, String> params;
|
||||
|
||||
private String MODE; //"percentage" or "count"
|
||||
private String MODE; // "percentage" or "count"
|
||||
|
||||
public JsonListMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
public JsonListMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
|
||||
MODE = params.getOrDefault("mode", "percentage");
|
||||
}
|
||||
MODE = params.getOrDefault("mode", "percentage");
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final List<String> sa, final List<String> sb, final Config conf) {
|
||||
if (sa.isEmpty() || sb.isEmpty()) {
|
||||
return -1;
|
||||
}
|
||||
@Override
|
||||
public double compare(final List<String> sa, final List<String> sb, final Config conf) {
|
||||
if (sa.isEmpty() || sb.isEmpty()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||
|
||||
int incommon = Sets.intersection(ca, cb).size();
|
||||
int simDiff = Sets.symmetricDifference(ca, cb).size();
|
||||
int incommon = Sets.intersection(ca, cb).size();
|
||||
int simDiff = Sets.symmetricDifference(ca, cb).size();
|
||||
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
if (MODE.equals("percentage"))
|
||||
return (double)incommon / (incommon + simDiff);
|
||||
else
|
||||
return incommon;
|
||||
if (MODE.equals("percentage"))
|
||||
return (double) incommon / (incommon + simDiff);
|
||||
else
|
||||
return incommon;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
//converts every json into a comparable string basing on parameters
|
||||
private String toComparableString(String json){
|
||||
// converts every json into a comparable string basing on parameters
|
||||
private String toComparableString(String json) {
|
||||
|
||||
StringBuilder st = new StringBuilder(); //to build the string used for comparisons basing on the jpath into parameters
|
||||
StringBuilder st = new StringBuilder(); // to build the string used for comparisons basing on the jpath into
|
||||
// parameters
|
||||
|
||||
//for each path in the param list
|
||||
for (String key: params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
|
||||
String path = params.get(key);
|
||||
String value = MapDocumentUtil.getJPathString(path, json);
|
||||
if (value == null || value.isEmpty())
|
||||
value = "";
|
||||
st.append(value);
|
||||
st.append("::");
|
||||
}
|
||||
// for each path in the param list
|
||||
for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
|
||||
String path = params.get(key);
|
||||
String value = MapDocumentUtil.getJPathString(path, json);
|
||||
if (value == null || value.isEmpty())
|
||||
value = "";
|
||||
st.append(value);
|
||||
st.append("::");
|
||||
}
|
||||
|
||||
st.setLength(st.length()-2);
|
||||
return st.toString();
|
||||
}
|
||||
st.setLength(st.length() - 2);
|
||||
return st.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,47 +1,50 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
@ComparatorClass("keywordMatch")
|
||||
public class KeywordMatch extends AbstractStringComparator {
|
||||
|
||||
Map<String, String> params;
|
||||
Map<String, String> params;
|
||||
|
||||
public KeywordMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
public KeywordMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> keywords1 = getKeywords(
|
||||
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> keywords2 = getKeywords(
|
||||
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
Set<String> codes1 = toCodes(keywords1, conf.translationMap());
|
||||
Set<String> codes2 = toCodes(keywords2, conf.translationMap());
|
||||
Set<String> codes1 = toCodes(keywords1, conf.translationMap());
|
||||
Set<String> codes2 = toCodes(keywords2, conf.translationMap());
|
||||
|
||||
//if no cities are detected, the comparator gives 1.0
|
||||
if (codes1.isEmpty() && codes2.isEmpty())
|
||||
return 1.0;
|
||||
else {
|
||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||
return -1.0; //undefined if one of the two has no keywords
|
||||
return commonElementsPercentage(codes1, codes2);
|
||||
}
|
||||
}
|
||||
// if no cities are detected, the comparator gives 1.0
|
||||
if (codes1.isEmpty() && codes2.isEmpty())
|
||||
return 1.0;
|
||||
else {
|
||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||
return -1.0; // undefined if one of the two has no keywords
|
||||
return commonElementsPercentage(codes1, codes2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,16 +1,18 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("level2JaroWinkler")
|
||||
public class Level2JaroWinkler extends AbstractStringComparator {
|
||||
|
||||
public Level2JaroWinkler(Map<String, String> params){
|
||||
public Level2JaroWinkler(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
|
|
|
@ -1,16 +1,18 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("level2JaroWinklerTitle")
|
||||
public class Level2JaroWinklerTitle extends AbstractStringComparator {
|
||||
|
||||
public Level2JaroWinklerTitle(Map<String,String> params){
|
||||
public Level2JaroWinklerTitle(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
|
@ -29,7 +31,8 @@ public class Level2JaroWinklerTitle extends AbstractStringComparator {
|
|||
|
||||
final boolean check = checkNumbers(ca, cb);
|
||||
|
||||
if (check) return 0.5;
|
||||
if (check)
|
||||
return 0.5;
|
||||
|
||||
return ssalgo.score(ca, cb);
|
||||
}
|
||||
|
|
|
@ -1,15 +1,17 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("level2Levenstein")
|
||||
public class Level2Levenstein extends AbstractStringComparator {
|
||||
|
||||
public Level2Levenstein(Map<String,String> params){
|
||||
public Level2Levenstein(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.Level2Levenstein());
|
||||
}
|
||||
|
||||
|
|
|
@ -1,15 +1,17 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("levenstein")
|
||||
public class Levenstein extends AbstractStringComparator {
|
||||
|
||||
public Levenstein(Map<String,String> params){
|
||||
public Levenstein(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
|
|
|
@ -1,20 +1,23 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.util.Map;
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("levensteinTitle")
|
||||
public class LevensteinTitle extends AbstractStringComparator {
|
||||
|
||||
private static final Log log = LogFactory.getLog(LevensteinTitle.class);
|
||||
|
||||
public LevensteinTitle(Map<String,String> params){
|
||||
public LevensteinTitle(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
|
@ -33,7 +36,8 @@ public class LevensteinTitle extends AbstractStringComparator {
|
|||
|
||||
final boolean check = checkNumbers(ca, cb);
|
||||
|
||||
if (check) return 0.5;
|
||||
if (check)
|
||||
return 0.5;
|
||||
|
||||
return normalize(ssalgo.score(ca, cb), ca.length(), cb.length());
|
||||
}
|
||||
|
|
|
@ -1,19 +1,21 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
 * Compares two titles, ignoring version numbers. Suitable for Software entities.
|
||||
*/
|
||||
@ComparatorClass("levensteinTitleIgnoreVersion")
|
||||
public class LevensteinTitleIgnoreVersion extends AbstractStringComparator {
|
||||
|
||||
public LevensteinTitleIgnoreVersion(Map<String,String> params){
|
||||
public LevensteinTitleIgnoreVersion(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
|
|
|
@ -1,13 +1,14 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
/**
|
||||
* The Class Contains match
|
||||
*
|
||||
|
@ -16,51 +17,50 @@ import java.util.stream.Collectors;
|
|||
@ComparatorClass("listContainsMatch")
|
||||
public class ListContainsMatch extends AbstractListComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
private boolean CASE_SENSITIVE;
|
||||
private String STRING;
|
||||
private String AGGREGATOR;
|
||||
private Map<String, String> params;
|
||||
private boolean CASE_SENSITIVE;
|
||||
private String STRING;
|
||||
private String AGGREGATOR;
|
||||
|
||||
public ListContainsMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
public ListContainsMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
|
||||
//read parameters
|
||||
CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
|
||||
STRING = params.get("string");
|
||||
AGGREGATOR = params.get("bool");
|
||||
}
|
||||
// read parameters
|
||||
CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
|
||||
STRING = params.get("string");
|
||||
AGGREGATOR = params.get("bool");
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(List<String> sa, List<String> sb, Config conf) {
|
||||
if (sa.isEmpty() || sb.isEmpty()) {
|
||||
return -1;
|
||||
}
|
||||
@Override
|
||||
public double compare(List<String> sa, List<String> sb, Config conf) {
|
||||
if (sa.isEmpty() || sb.isEmpty()) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (!CASE_SENSITIVE) {
|
||||
sa = sa.stream().map(String::toLowerCase).collect(Collectors.toList());
|
||||
sb = sb.stream().map(String::toLowerCase).collect(Collectors.toList());
|
||||
STRING = STRING.toLowerCase();
|
||||
}
|
||||
if (!CASE_SENSITIVE) {
|
||||
sa = sa.stream().map(String::toLowerCase).collect(Collectors.toList());
|
||||
sb = sb.stream().map(String::toLowerCase).collect(Collectors.toList());
|
||||
STRING = STRING.toLowerCase();
|
||||
}
|
||||
|
||||
switch(AGGREGATOR) {
|
||||
case "AND":
|
||||
if(sa.contains(STRING) && sb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
case "OR":
|
||||
if(sa.contains(STRING) || sb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
case "XOR":
|
||||
if(sa.contains(STRING) ^ sb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
default:
|
||||
return 0.0;
|
||||
}
|
||||
return 0.0;
|
||||
switch (AGGREGATOR) {
|
||||
case "AND":
|
||||
if (sa.contains(STRING) && sb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
case "OR":
|
||||
if (sa.contains(STRING) || sb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
case "XOR":
|
||||
if (sa.contains(STRING) ^ sb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
default:
|
||||
return 0.0;
|
||||
}
|
||||
return 0.0;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,16 +1,18 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("mustBeDifferent")
|
||||
public class MustBeDifferent extends AbstractStringComparator {
|
||||
|
||||
public MustBeDifferent(Map<String,String> params){
|
||||
public MustBeDifferent(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.Comparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
 * Not all fields of a document need to participate in the compare measure. We model those fields as having a
|
||||
* NullDistanceAlgo.
|
||||
|
@ -13,7 +14,7 @@ import java.util.Map;
|
|||
@ComparatorClass("null")
|
||||
public class NullDistanceAlgo<T> implements Comparator<T> {
|
||||
|
||||
public NullDistanceAlgo(Map<String, String> params){
|
||||
public NullDistanceAlgo(Map<String, String> params) {
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -1,34 +1,35 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("numbersComparator")
|
||||
public class NumbersComparator extends AbstractStringComparator {
|
||||
|
||||
Map<String, String> params;
|
||||
Map<String, String> params;
|
||||
|
||||
public NumbersComparator(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
public NumbersComparator(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, Config conf) {
|
||||
@Override
|
||||
public double distance(String a, String b, Config conf) {
|
||||
|
||||
//extracts numbers from the field
|
||||
String numbers1 = getNumbers(nfd(a));
|
||||
String numbers2 = getNumbers(nfd(b));
|
||||
// extracts numbers from the field
|
||||
String numbers1 = getNumbers(nfd(a));
|
||||
String numbers2 = getNumbers(nfd(b));
|
||||
|
||||
if (numbers1.isEmpty() || numbers2.isEmpty())
|
||||
return -1.0;
|
||||
if (numbers1.isEmpty() || numbers2.isEmpty())
|
||||
return -1.0;
|
||||
|
||||
int n1 = Integer.parseInt(numbers1);
|
||||
int n2 = Integer.parseInt(numbers2);
|
||||
int n1 = Integer.parseInt(numbers1);
|
||||
int n2 = Integer.parseInt(numbers2);
|
||||
|
||||
return Math.abs(n1 - n2);
|
||||
}
|
||||
return Math.abs(n1 - n2);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,36 +1,36 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("numbersMatch")
|
||||
public class NumbersMatch extends AbstractStringComparator {
|
||||
|
||||
public NumbersMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
public NumbersMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
@Override
|
||||
public double distance(String a, String b, Config conf) {
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, Config conf) {
|
||||
// extracts numbers from the field
|
||||
String numbers1 = getNumbers(nfd(a));
|
||||
String numbers2 = getNumbers(nfd(b));
|
||||
|
||||
//extracts numbers from the field
|
||||
String numbers1 = getNumbers(nfd(a));
|
||||
String numbers2 = getNumbers(nfd(b));
|
||||
if (numbers1.isEmpty() && numbers2.isEmpty())
|
||||
return 1.0;
|
||||
|
||||
if (numbers1.isEmpty() && numbers2.isEmpty())
|
||||
return 1.0;
|
||||
if (numbers1.isEmpty() || numbers2.isEmpty())
|
||||
return -1.0;
|
||||
|
||||
if (numbers1.isEmpty() || numbers2.isEmpty())
|
||||
return -1.0;
|
||||
if (numbers1.equals(numbers2))
|
||||
return 1.0;
|
||||
|
||||
if (numbers1.equals(numbers2))
|
||||
return 1.0;
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,36 +1,36 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@ComparatorClass("romansMatch")
|
||||
public class RomansMatch extends AbstractStringComparator {
|
||||
|
||||
public RomansMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
public RomansMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
@Override
|
||||
public double distance(String a, String b, Config conf) {
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, Config conf) {
|
||||
// extracts romans from the field
|
||||
String romans1 = getRomans(nfd(a));
|
||||
String romans2 = getRomans(nfd(b));
|
||||
|
||||
//extracts romans from the field
|
||||
String romans1 = getRomans(nfd(a));
|
||||
String romans2 = getRomans(nfd(b));
|
||||
if (romans1.isEmpty() && romans2.isEmpty())
|
||||
return 1.0;
|
||||
|
||||
if (romans1.isEmpty() && romans2.isEmpty())
|
||||
return 1.0;
|
||||
if (romans1.isEmpty() || romans2.isEmpty())
|
||||
return -1.0;
|
||||
|
||||
if (romans1.isEmpty() || romans2.isEmpty())
|
||||
return -1.0;
|
||||
if (romans1.equals(romans2))
|
||||
return 1.0;
|
||||
|
||||
if (romans1.equals(romans2))
|
||||
return 1.0;
|
||||
|
||||
return 0.0;
|
||||
}
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
/**
|
||||
* Returns true if the number of values in the fields is the same.
|
||||
*
|
||||
|
@ -16,23 +18,23 @@ import java.util.Map;
|
|||
@ComparatorClass("sizeMatch")
|
||||
public class SizeMatch extends AbstractListComparator {
|
||||
|
||||
/**
|
||||
* Instantiates a new size match.
|
||||
*
|
||||
* @param params
|
||||
* the parameters
|
||||
*/
|
||||
public SizeMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
/**
|
||||
* Instantiates a new size match.
|
||||
*
|
||||
* @param params
|
||||
* the parameters
|
||||
*/
|
||||
public SizeMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||
@Override
|
||||
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1.0;
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1.0;
|
||||
|
||||
return a.size() == b.size() ? 1.0 : 0.0;
|
||||
}
|
||||
return a.size() == b.size() ? 1.0 : 0.0;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,18 +1,20 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class SortedJaroWinkler.
|
||||
*/
|
||||
@ComparatorClass("sortedJaroWinkler")
|
||||
public class SortedJaroWinkler extends AbstractSortedComparator {
|
||||
|
||||
public SortedJaroWinkler(Map<String,String> params){
|
||||
public SortedJaroWinkler(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
|
@ -40,7 +42,6 @@ public class SortedJaroWinkler extends AbstractSortedComparator {
|
|||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
|
@ -50,7 +51,6 @@ public class SortedJaroWinkler extends AbstractSortedComparator {
|
|||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
 * The Class SortedLevel2JaroWinkler.
|
||||
*/
|
||||
|
@ -22,7 +24,7 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator {
|
|||
super(weight, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
public SortedLevel2JaroWinkler(final Map<String, String> params){
|
||||
public SortedLevel2JaroWinkler(final Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||
}
|
||||
|
||||
|
@ -40,7 +42,6 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator {
|
|||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
|
@ -50,7 +51,6 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator {
|
|||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
|
|
|
@ -1,12 +1,13 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class Contains match
|
||||
*
|
||||
|
@ -15,50 +16,50 @@ import java.util.Map;
|
|||
@ComparatorClass("stringContainsMatch")
|
||||
public class StringContainsMatch extends AbstractStringComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
private Map<String, String> params;
|
||||
|
||||
private boolean CASE_SENSITIVE;
|
||||
private String STRING;
|
||||
private String AGGREGATOR;
|
||||
private boolean CASE_SENSITIVE;
|
||||
private String STRING;
|
||||
private String AGGREGATOR;
|
||||
|
||||
public StringContainsMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
public StringContainsMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
|
||||
//read parameters
|
||||
CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
|
||||
STRING = params.get("string");
|
||||
AGGREGATOR = params.get("aggregator");
|
||||
// read parameters
|
||||
CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
|
||||
STRING = params.get("string");
|
||||
AGGREGATOR = params.get("aggregator");
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
String ca = a;
|
||||
String cb = b;
|
||||
if (!CASE_SENSITIVE) {
|
||||
ca = a.toLowerCase();
|
||||
cb = b.toLowerCase();
|
||||
STRING = STRING.toLowerCase();
|
||||
}
|
||||
String ca = a;
|
||||
String cb = b;
|
||||
if (!CASE_SENSITIVE) {
|
||||
ca = a.toLowerCase();
|
||||
cb = b.toLowerCase();
|
||||
STRING = STRING.toLowerCase();
|
||||
}
|
||||
|
||||
switch(AGGREGATOR) {
|
||||
case "AND":
|
||||
if(ca.contains(STRING) && cb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
case "OR":
|
||||
if(ca.contains(STRING) || cb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
case "XOR":
|
||||
if(ca.contains(STRING) ^ cb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
default:
|
||||
return 0.0;
|
||||
}
|
||||
return 0.0;
|
||||
}
|
||||
switch (AGGREGATOR) {
|
||||
case "AND":
|
||||
if (ca.contains(STRING) && cb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
case "OR":
|
||||
if (ca.contains(STRING) || cb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
case "XOR":
|
||||
if (ca.contains(STRING) ^ cb.contains(STRING))
|
||||
return 1.0;
|
||||
break;
|
||||
default:
|
||||
return 0.0;
|
||||
}
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,53 +1,56 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("stringListMatch")
|
||||
public class StringListMatch extends AbstractListComparator {
|
||||
|
||||
private static final Log log = LogFactory.getLog(StringListMatch.class);
|
||||
private Map<String, String> params;
|
||||
private static final Log log = LogFactory.getLog(StringListMatch.class);
|
||||
private Map<String, String> params;
|
||||
|
||||
final private String TYPE; //percentage or count
|
||||
final private String TYPE; // percentage or count
|
||||
|
||||
public StringListMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
public StringListMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
|
||||
TYPE = params.getOrDefault("type", "percentage");
|
||||
}
|
||||
TYPE = params.getOrDefault("type", "percentage");
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||
@Override
|
||||
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||
|
||||
final Set<String> pa = new HashSet<>(a);
|
||||
final Set<String> pb = new HashSet<>(b);
|
||||
final Set<String> pa = new HashSet<>(a);
|
||||
final Set<String> pb = new HashSet<>(b);
|
||||
|
||||
if (pa.isEmpty() || pb.isEmpty()) {
|
||||
return -1; //return undefined if one of the two lists is empty
|
||||
}
|
||||
if (pa.isEmpty() || pb.isEmpty()) {
|
||||
return -1; // return undefined if one of the two lists is empty
|
||||
}
|
||||
|
||||
int incommon = Sets.intersection(pa, pb).size();
|
||||
int simDiff = Sets.symmetricDifference(pa, pb).size();
|
||||
int incommon = Sets.intersection(pa, pb).size();
|
||||
int simDiff = Sets.symmetricDifference(pa, pb).size();
|
||||
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
if (incommon + simDiff == 0) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
if(TYPE.equals("percentage"))
|
||||
return (double)incommon / (incommon + simDiff);
|
||||
else
|
||||
return incommon;
|
||||
if (TYPE.equals("percentage"))
|
||||
return (double) incommon / (incommon + simDiff);
|
||||
else
|
||||
return incommon;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class SubStringLevenstein.
|
||||
|
@ -14,76 +17,74 @@ import java.util.Map;
|
|||
@ComparatorClass("subStringLevenstein")
|
||||
public class SubStringLevenstein extends AbstractStringComparator {
|
||||
|
||||
/**
|
||||
* The limit.
|
||||
*/
|
||||
protected int limit;
|
||||
/**
|
||||
* The limit.
|
||||
*/
|
||||
protected int limit;
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w the w
|
||||
*/
|
||||
public SubStringLevenstein(final double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w the w
|
||||
*/
|
||||
public SubStringLevenstein(final double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public SubStringLevenstein(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
this.limit = Integer.parseInt(params.getOrDefault("limit", "1"));
|
||||
}
|
||||
public SubStringLevenstein(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.Levenstein());
|
||||
this.limit = Integer.parseInt(params.getOrDefault("limit", "1"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w the w
|
||||
* @param limit the limit
|
||||
*/
|
||||
public SubStringLevenstein(final double w, final int limit) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
this.limit = limit;
|
||||
}
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w the w
|
||||
* @param limit the limit
|
||||
*/
|
||||
public SubStringLevenstein(final double w, final int limit) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
this.limit = limit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w the w
|
||||
* @param limit the limit
|
||||
* @param ssalgo the ssalgo
|
||||
*/
|
||||
protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
this.limit = limit;
|
||||
}
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
* @param w the w
|
||||
* @param limit the limit
|
||||
* @param ssalgo the ssalgo
|
||||
*/
|
||||
protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
|
||||
super(w, ssalgo);
|
||||
this.limit = limit;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
return distance(StringUtils.left(a, limit), StringUtils.left(b, limit), conf);
|
||||
}
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field,
|
||||
* eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
return distance(StringUtils.left(a, limit), StringUtils.left(b, limit), conf);
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
* @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight()
|
||||
*/
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double)
|
||||
*/
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Returns true if the titles in the given documents contains the same numbers, false otherwise.
|
||||
*
|
||||
|
@ -15,24 +16,24 @@ import java.util.Map;
|
|||
@ComparatorClass("titleVersionMatch")
|
||||
public class TitleVersionMatch extends AbstractStringComparator {
|
||||
|
||||
public TitleVersionMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
public TitleVersionMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final String valueA, final String valueB, final Config conf) {
|
||||
if (valueA.isEmpty() || valueB.isEmpty())
|
||||
return -1;
|
||||
@Override
|
||||
public double compare(final String valueA, final String valueB, final Config conf) {
|
||||
if (valueA.isEmpty() || valueB.isEmpty())
|
||||
return -1;
|
||||
|
||||
return notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 1 : 0;
|
||||
}
|
||||
return notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 1 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName() + ":" + super.toString();
|
||||
}
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName() + ":" + super.toString();
|
||||
}
|
||||
|
||||
protected String toString(final Object object) {
|
||||
return toFirstString(object);
|
||||
}
|
||||
}
|
||||
protected String toString(final Object object) {
|
||||
return toFirstString(object);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,61 +1,63 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("urlMatcher")
|
||||
public class UrlMatcher extends Levenstein {
|
||||
|
||||
private Map<String, String> params;
|
||||
private Map<String, String> params;
|
||||
|
||||
public UrlMatcher(Map<String, String> params){
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
public UrlMatcher(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public UrlMatcher(double weight, Map<String, String> params) {
|
||||
super(weight);
|
||||
this.params = params;
|
||||
}
|
||||
public UrlMatcher(double weight, Map<String, String> params) {
|
||||
super(weight);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public void setParams(Map<String, String> params) {
|
||||
this.params = params;
|
||||
}
|
||||
public void setParams(Map<String, String> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
final URL urlA = asUrl(a);
|
||||
final URL urlB = asUrl(b);
|
||||
@Override
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
final URL urlA = asUrl(a);
|
||||
final URL urlB = asUrl(b);
|
||||
|
||||
if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
|
||||
return 0.0;
|
||||
}
|
||||
if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
Double hostW = Double.parseDouble(params.getOrDefault("host", "0.5"));
|
||||
Double pathW = Double.parseDouble(params.getOrDefault("path", "0.5"));
|
||||
Double hostW = Double.parseDouble(params.getOrDefault("host", "0.5"));
|
||||
Double pathW = Double.parseDouble(params.getOrDefault("path", "0.5"));
|
||||
|
||||
if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) {
|
||||
return hostW * 0.5;
|
||||
}
|
||||
if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) {
|
||||
return hostW * 0.5;
|
||||
}
|
||||
|
||||
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf);
|
||||
}
|
||||
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf);
|
||||
}
|
||||
|
||||
private URL asUrl(final String value) {
|
||||
try {
|
||||
return new URL(value);
|
||||
} catch (MalformedURLException e) {
|
||||
// should not happen as checked by pace typing
|
||||
throw new IllegalStateException("invalid URL: " + value);
|
||||
}
|
||||
}
|
||||
private URL asUrl(final String value) {
|
||||
try {
|
||||
return new URL(value);
|
||||
} catch (MalformedURLException e) {
|
||||
// should not happen as checked by pace typing
|
||||
throw new IllegalStateException("invalid URL: " + value);
|
||||
}
|
||||
}
|
||||
|
||||
protected String toString(final Object object) {
|
||||
return toFirstString(object);
|
||||
}
|
||||
protected String toString(final Object object) {
|
||||
return toFirstString(object);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Returns true if the years of the date fields in the given documents are the same, false when either of the two is invalid or missing.
|
||||
|
@ -15,36 +17,36 @@ import java.util.Map;
|
|||
@ComparatorClass("yearMatch")
|
||||
public class YearMatch extends AbstractStringComparator {
|
||||
|
||||
private int limit = 4;
|
||||
private int limit = 4;
|
||||
|
||||
public YearMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
public YearMatch(final Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final String a, final String b, final Config conf) {
|
||||
final String valueA = getNumbers(getFirstValue(a));
|
||||
final String valueB = getNumbers(getFirstValue(b));
|
||||
@Override
|
||||
public double compare(final String a, final String b, final Config conf) {
|
||||
final String valueA = getNumbers(getFirstValue(a));
|
||||
final String valueB = getNumbers(getFirstValue(b));
|
||||
|
||||
if (valueA.isEmpty() || valueB.isEmpty())
|
||||
return -1;
|
||||
if (valueA.isEmpty() || valueB.isEmpty())
|
||||
return -1;
|
||||
|
||||
final boolean lengthMatch = checkLength(valueA) && checkLength(valueB);
|
||||
final boolean onemissing = valueA.isEmpty() || valueB.isEmpty();
|
||||
final boolean lengthMatch = checkLength(valueA) && checkLength(valueB);
|
||||
final boolean onemissing = valueA.isEmpty() || valueB.isEmpty();
|
||||
|
||||
return lengthMatch && valueA.equals(valueB) || onemissing ? 1 : 0;
|
||||
}
|
||||
return lengthMatch && valueA.equals(valueB) || onemissing ? 1 : 0;
|
||||
}
|
||||
|
||||
protected boolean checkLength(final String s) {
|
||||
return s.length() == limit;
|
||||
}
|
||||
protected boolean checkLength(final String s) {
|
||||
return s.length() == limit;
|
||||
}
|
||||
|
||||
protected String getFirstValue(final String value) {
|
||||
return (value != null) && !value.isEmpty() ? StringUtils.left(value, limit) : "";
|
||||
}
|
||||
protected String getFirstValue(final String value) {
|
||||
return (value != null) && !value.isEmpty() ? StringUtils.left(value, limit) : "";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName() + ":" + super.toString();
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName() + ":" + super.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,130 +1,131 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
public abstract class AbstractComparator<T> extends AbstractPaceFunctions implements Comparator<T> {
|
||||
|
||||
/** The ssalgo. */
|
||||
protected AbstractStringDistance ssalgo;
|
||||
/** The ssalgo. */
|
||||
protected AbstractStringDistance ssalgo;
|
||||
|
||||
/** The weight. */
|
||||
protected double weight = 0.0;
|
||||
/** The weight. */
|
||||
protected double weight = 0.0;
|
||||
|
||||
private Map<String, String> params;
|
||||
private Map<String, String> params;
|
||||
|
||||
protected AbstractComparator(Map<String, String> params) {
|
||||
this.params = params;
|
||||
}
|
||||
protected AbstractComparator(Map<String, String> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
protected AbstractComparator(Map<String, String> params, final AbstractStringDistance ssalgo){
|
||||
this.params = params;
|
||||
this.weight = 1.0;
|
||||
this.ssalgo = ssalgo;
|
||||
}
|
||||
protected AbstractComparator(Map<String, String> params, final AbstractStringDistance ssalgo) {
|
||||
this.params = params;
|
||||
this.weight = 1.0;
|
||||
this.ssalgo = ssalgo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new second string compare algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) {
|
||||
this.ssalgo = ssalgo;
|
||||
this.weight = weight;
|
||||
}
|
||||
/**
|
||||
* Instantiates a new second string compare algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) {
|
||||
this.ssalgo = ssalgo;
|
||||
this.weight = weight;
|
||||
}
|
||||
|
||||
protected AbstractComparator(final AbstractStringDistance ssalgo){
|
||||
this.ssalgo = ssalgo;
|
||||
}
|
||||
protected AbstractComparator(final AbstractStringDistance ssalgo) {
|
||||
this.ssalgo = ssalgo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize.
|
||||
*
|
||||
* @param d
|
||||
* the d
|
||||
* @return the double
|
||||
*/
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
/**
|
||||
* Normalize.
|
||||
*
|
||||
* @param d
|
||||
* the d
|
||||
* @return the double
|
||||
*/
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
/**
|
||||
* Distance.
|
||||
*
|
||||
* @param a
|
||||
* the a
|
||||
* @param b
|
||||
* the b
|
||||
* @return the double
|
||||
*/
|
||||
/**
|
||||
* Distance.
|
||||
*
|
||||
* @param a
|
||||
* the a
|
||||
* @param b
|
||||
* the b
|
||||
* @return the double
|
||||
*/
|
||||
|
||||
protected double distance(final String a, final String b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1; //return -1 if a field is missing
|
||||
}
|
||||
double score = ssalgo.score(a, b);
|
||||
return normalize(score);
|
||||
}
|
||||
protected double distance(final String a, final String b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1; // return -1 if a field is missing
|
||||
}
|
||||
double score = ssalgo.score(a, b);
|
||||
return normalize(score);
|
||||
}
|
||||
|
||||
protected double compare(final String a, final String b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
return distance(a, b, conf);
|
||||
}
|
||||
protected double compare(final String a, final String b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
return distance(a, b, conf);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert the given argument to a List of Strings
|
||||
*
|
||||
* @param object
|
||||
* function argument
|
||||
* @return the list
|
||||
*/
|
||||
protected List<String> toList(final Object object) {
|
||||
if (object instanceof List) {
|
||||
return (List<String>)object;
|
||||
}
|
||||
/**
|
||||
* Convert the given argument to a List of Strings
|
||||
*
|
||||
* @param object
|
||||
* function argument
|
||||
* @return the list
|
||||
*/
|
||||
protected List<String> toList(final Object object) {
|
||||
if (object instanceof List) {
|
||||
return (List<String>) object;
|
||||
}
|
||||
|
||||
return Lists.newArrayList(object.toString());
|
||||
}
|
||||
return Lists.newArrayList(object.toString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert the given argument to a String
|
||||
*
|
||||
* @param object
|
||||
* function argument
|
||||
* @return the string
|
||||
*/
|
||||
protected String toString(final Object object) {
|
||||
if (object instanceof List) {
|
||||
List<String> l = (List<String>) object;
|
||||
return Joiner.on(" ").join(l);
|
||||
}
|
||||
/**
|
||||
* Convert the given argument to a String
|
||||
*
|
||||
* @param object
|
||||
* function argument
|
||||
* @return the string
|
||||
*/
|
||||
protected String toString(final Object object) {
|
||||
if (object instanceof List) {
|
||||
List<String> l = (List<String>) object;
|
||||
return Joiner.on(" ").join(l);
|
||||
}
|
||||
|
||||
return object.toString();
|
||||
}
|
||||
return object.toString();
|
||||
}
|
||||
|
||||
protected String toFirstString(final Object object) {
|
||||
if (object instanceof List) {
|
||||
List<String> l = (List<String>) object;
|
||||
return l.isEmpty() ? "" : l.get(0);
|
||||
}
|
||||
protected String toFirstString(final Object object) {
|
||||
if (object instanceof List) {
|
||||
List<String> l = (List<String>) object;
|
||||
return l.isEmpty() ? "" : l.get(0);
|
||||
}
|
||||
|
||||
return object.toString();
|
||||
}
|
||||
return object.toString();
|
||||
}
|
||||
|
||||
|
||||
public double getWeight(){
|
||||
return this.weight;
|
||||
}
|
||||
public double getWeight() {
|
||||
return this.weight;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,39 +1,41 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
abstract public class AbstractListComparator extends AbstractComparator<List<String>>{
|
||||
protected AbstractListComparator(Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
import com.google.common.collect.Lists;
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
protected AbstractListComparator(Map<String, String> params, AbstractStringDistance ssalgo) {
|
||||
super(params, ssalgo);
|
||||
}
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
|
||||
protected AbstractListComparator(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
abstract public class AbstractListComparator extends AbstractComparator<List<String>> {
|
||||
protected AbstractListComparator(Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
protected AbstractListComparator(AbstractStringDistance ssalgo) {
|
||||
super(ssalgo);
|
||||
}
|
||||
protected AbstractListComparator(Map<String, String> params, AbstractStringDistance ssalgo) {
|
||||
super(params, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(Object a, Object b, Config conf) {
|
||||
return compare(toList(a), toList(b), conf);
|
||||
}
|
||||
protected AbstractListComparator(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
protected AbstractListComparator(AbstractStringDistance ssalgo) {
|
||||
super(ssalgo);
|
||||
}
|
||||
|
||||
return distance(concat(a), concat(b), conf);
|
||||
}
|
||||
@Override
|
||||
public double compare(Object a, Object b, Config conf) {
|
||||
return compare(toList(a), toList(b), conf);
|
||||
}
|
||||
|
||||
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
|
||||
return distance(concat(a), concat(b), conf);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,40 +1,41 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import java.util.AbstractList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
public abstract class AbstractSortedComparator extends AbstractListComparator {
|
||||
|
||||
/**
|
||||
* Instantiates a new sorted second string compare algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
/**
|
||||
* Instantiates a new sorted second string compare algo.
|
||||
*
|
||||
* @param weight
|
||||
* the weight
|
||||
* @param ssalgo
|
||||
* the ssalgo
|
||||
*/
|
||||
protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
protected AbstractSortedComparator(final Map<String, String> params, final AbstractStringDistance ssalgo){
|
||||
super(Double.parseDouble(params.get("weight")), ssalgo);
|
||||
}
|
||||
protected AbstractSortedComparator(final Map<String, String> params, final AbstractStringDistance ssalgo) {
|
||||
super(Double.parseDouble(params.get("weight")), ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> toList(final Object object) {
|
||||
if (object instanceof List) {
|
||||
List<String> fl = (List<String>) object;
|
||||
List<String> values = Lists.newArrayList(fl);
|
||||
Collections.sort(values);
|
||||
return values;
|
||||
}
|
||||
@Override
|
||||
protected List<String> toList(final Object object) {
|
||||
if (object instanceof List) {
|
||||
List<String> fl = (List<String>) object;
|
||||
List<String> values = Lists.newArrayList(fl);
|
||||
Collections.sort(values);
|
||||
return values;
|
||||
}
|
||||
|
||||
return Lists.newArrayList(object.toString());
|
||||
}
|
||||
return Lists.newArrayList(object.toString());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,44 +1,46 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
public abstract class AbstractStringComparator extends AbstractComparator<String>{
|
||||
protected AbstractStringComparator(Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
protected AbstractStringComparator(Map<String, String> params, AbstractStringDistance ssalgo) {
|
||||
super(params, ssalgo);
|
||||
}
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
protected AbstractStringComparator(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
public abstract class AbstractStringComparator extends AbstractComparator<String> {
|
||||
protected AbstractStringComparator(Map<String, String> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
protected AbstractStringComparator(AbstractStringDistance ssalgo) {
|
||||
super(ssalgo);
|
||||
}
|
||||
protected AbstractStringComparator(Map<String, String> params, AbstractStringDistance ssalgo) {
|
||||
super(params, ssalgo);
|
||||
}
|
||||
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1; //return -1 if a field is missing
|
||||
}
|
||||
double score = ssalgo.score(a, b);
|
||||
return normalize(score);
|
||||
}
|
||||
protected AbstractStringComparator(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(Object a, Object b, Config conf) {
|
||||
return compare(toString(a), toString(b), conf);
|
||||
}
|
||||
protected AbstractStringComparator(AbstractStringDistance ssalgo) {
|
||||
super(ssalgo);
|
||||
}
|
||||
|
||||
public double compare(final String a, final String b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
return distance(a, b, conf);
|
||||
}
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1; // return -1 if a field is missing
|
||||
}
|
||||
double score = ssalgo.score(a, b);
|
||||
return normalize(score);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(Object a, Object b, Config conf) {
|
||||
return compare(toString(a), toString(b), conf);
|
||||
}
|
||||
|
||||
public double compare(final String a, final String b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
return distance(a, b, conf);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,24 +1,21 @@
|
|||
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
public enum AggType {
|
||||
|
||||
W_MEAN, //weighted mean
|
||||
AVG, //average
|
||||
SUM,
|
||||
MAX,
|
||||
MIN,
|
||||
AND, //used for necessary conditions
|
||||
OR; //used for sufficient conditions
|
||||
W_MEAN, // weighted mean
|
||||
AVG, // average
|
||||
SUM, MAX, MIN, AND, // used for necessary conditions
|
||||
OR; // used for sufficient conditions
|
||||
|
||||
public static AggType getEnum(String value) {
|
||||
public static AggType getEnum(String value) {
|
||||
|
||||
try {
|
||||
return AggType.valueOf(value);
|
||||
}
|
||||
catch (IllegalArgumentException e) {
|
||||
throw new PaceException("Undefined aggregation type", e);
|
||||
}
|
||||
}
|
||||
try {
|
||||
return AggType.valueOf(value);
|
||||
} catch (IllegalArgumentException e) {
|
||||
throw new PaceException("Undefined aggregation type", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
public interface Comparator<T> {
|
||||
|
||||
/*
|
||||
* return : -1 -> can't decide (i.e. missing field)
|
||||
* >0 -> similarity degree (depends on the algorithm)
|
||||
* */
|
||||
public double compare(Object a, Object b, Config conf);
|
||||
/*
|
||||
* return : -1 -> can't decide (i.e. missing field) >0 -> similarity degree (depends on the algorithm)
|
||||
*/
|
||||
public double compare(Object a, Object b, Config conf);
|
||||
}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
|
@ -9,5 +10,5 @@ import java.lang.annotation.Target;
|
|||
@Target(ElementType.TYPE)
|
||||
public @interface ComparatorClass {
|
||||
|
||||
public String value();
|
||||
public String value();
|
||||
}
|
||||
|
|
|
@ -1,82 +1,84 @@
|
|||
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
/**
|
||||
* The class that defines the configuration of each field in the decision tree.
|
||||
* */
|
||||
public class FieldConf implements Serializable {
|
||||
|
||||
private String field; //name of the field on which apply the comparator
|
||||
private String comparator; //comparator name
|
||||
private double weight = 1.0; //weight for the field (to be used in the aggregation)
|
||||
private Map<String,String> params; //parameters
|
||||
private String field; // name of the field on which apply the comparator
|
||||
private String comparator; // comparator name
|
||||
private double weight = 1.0; // weight for the field (to be used in the aggregation)
|
||||
private Map<String, String> params; // parameters
|
||||
|
||||
private boolean countIfUndefined;
|
||||
private boolean countIfUndefined;
|
||||
|
||||
public boolean isCountIfUndefined() {
|
||||
return countIfUndefined;
|
||||
}
|
||||
public boolean isCountIfUndefined() {
|
||||
return countIfUndefined;
|
||||
}
|
||||
|
||||
public void setCountIfUndefined(boolean countIfUndefined) {
|
||||
this.countIfUndefined = countIfUndefined;
|
||||
}
|
||||
public void setCountIfUndefined(boolean countIfUndefined) {
|
||||
this.countIfUndefined = countIfUndefined;
|
||||
}
|
||||
|
||||
public FieldConf() {
|
||||
}
|
||||
public FieldConf() {
|
||||
}
|
||||
|
||||
public FieldConf(String field, String comparator, double weight, Map<String, String> params, boolean countIfUndefined) {
|
||||
this.field = field;
|
||||
this.comparator = comparator;
|
||||
this.weight = weight;
|
||||
this.params = params;
|
||||
this.countIfUndefined = countIfUndefined;
|
||||
}
|
||||
public FieldConf(String field, String comparator, double weight, Map<String, String> params,
|
||||
boolean countIfUndefined) {
|
||||
this.field = field;
|
||||
this.comparator = comparator;
|
||||
this.weight = weight;
|
||||
this.params = params;
|
||||
this.countIfUndefined = countIfUndefined;
|
||||
}
|
||||
|
||||
public String getField() {
|
||||
return field;
|
||||
}
|
||||
public String getField() {
|
||||
return field;
|
||||
}
|
||||
|
||||
public void setField(String field) {
|
||||
this.field = field;
|
||||
}
|
||||
public void setField(String field) {
|
||||
this.field = field;
|
||||
}
|
||||
|
||||
public String getComparator() {
|
||||
return comparator;
|
||||
}
|
||||
public String getComparator() {
|
||||
return comparator;
|
||||
}
|
||||
|
||||
public void setComparator(String comparator) {
|
||||
this.comparator = comparator;
|
||||
}
|
||||
public void setComparator(String comparator) {
|
||||
this.comparator = comparator;
|
||||
}
|
||||
|
||||
public double getWeight() {
|
||||
return weight;
|
||||
}
|
||||
public double getWeight() {
|
||||
return weight;
|
||||
}
|
||||
|
||||
public void setWeight(double weight) {
|
||||
this.weight = weight;
|
||||
}
|
||||
public void setWeight(double weight) {
|
||||
this.weight = weight;
|
||||
}
|
||||
|
||||
public Map<String, String> getParams() {
|
||||
return params;
|
||||
}
|
||||
public Map<String, String> getParams() {
|
||||
return params;
|
||||
}
|
||||
|
||||
public void setParams(Map<String, String> params) {
|
||||
this.params = params;
|
||||
}
|
||||
public void setParams(Map<String, String> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,87 +1,89 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
/**
|
||||
* The class that contains the result of each comparison in the decision tree
|
||||
* */
|
||||
public class FieldStats implements Serializable {
|
||||
|
||||
private double weight; //weight for the field (to be used in the aggregation)
|
||||
private double threshold; //threshold for the field (to be used in some kind of aggregations)
|
||||
private double result; //the result of the comparison
|
||||
private Object a;
|
||||
private Object b;
|
||||
private double weight; // weight for the field (to be used in the aggregation)
|
||||
private double threshold; // threshold for the field (to be used in some kind of aggregations)
|
||||
private double result; // the result of the comparison
|
||||
private Object a;
|
||||
private Object b;
|
||||
|
||||
private boolean countIfUndefined;
|
||||
private boolean countIfUndefined;
|
||||
|
||||
public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Object a, Object b) {
|
||||
this.weight = weight;
|
||||
this.threshold = threshold;
|
||||
this.result = result;
|
||||
this.countIfUndefined = countIfUndefined;
|
||||
this.a = a;
|
||||
this.b = b;
|
||||
}
|
||||
public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Object a, Object b) {
|
||||
this.weight = weight;
|
||||
this.threshold = threshold;
|
||||
this.result = result;
|
||||
this.countIfUndefined = countIfUndefined;
|
||||
this.a = a;
|
||||
this.b = b;
|
||||
}
|
||||
|
||||
public double getThreshold() {
|
||||
return threshold;
|
||||
}
|
||||
public double getThreshold() {
|
||||
return threshold;
|
||||
}
|
||||
|
||||
public void setThreshold(double threshold) {
|
||||
this.threshold = threshold;
|
||||
}
|
||||
public void setThreshold(double threshold) {
|
||||
this.threshold = threshold;
|
||||
}
|
||||
|
||||
public double getWeight() {
|
||||
return weight;
|
||||
}
|
||||
public double getWeight() {
|
||||
return weight;
|
||||
}
|
||||
|
||||
public void setWeight(double weight) {
|
||||
this.weight = weight;
|
||||
}
|
||||
public void setWeight(double weight) {
|
||||
this.weight = weight;
|
||||
}
|
||||
|
||||
public double getResult() {
|
||||
return result;
|
||||
}
|
||||
public double getResult() {
|
||||
return result;
|
||||
}
|
||||
|
||||
public void setResult(double result) {
|
||||
this.result = result;
|
||||
}
|
||||
public void setResult(double result) {
|
||||
this.result = result;
|
||||
}
|
||||
|
||||
public boolean isCountIfUndefined() {
|
||||
return countIfUndefined;
|
||||
}
|
||||
public boolean isCountIfUndefined() {
|
||||
return countIfUndefined;
|
||||
}
|
||||
|
||||
public void setCountIfUndefined(boolean countIfUndefined) {
|
||||
this.countIfUndefined = countIfUndefined;
|
||||
}
|
||||
public void setCountIfUndefined(boolean countIfUndefined) {
|
||||
this.countIfUndefined = countIfUndefined;
|
||||
}
|
||||
|
||||
public Object getA() {
|
||||
return a;
|
||||
}
|
||||
public Object getA() {
|
||||
return a;
|
||||
}
|
||||
|
||||
public void setA(Object a) {
|
||||
this.a = a;
|
||||
}
|
||||
public void setA(Object a) {
|
||||
this.a = a;
|
||||
}
|
||||
|
||||
public Object getB() {
|
||||
return b;
|
||||
}
|
||||
public Object getB() {
|
||||
return b;
|
||||
}
|
||||
|
||||
public void setB(Object b) {
|
||||
this.b = b;
|
||||
}
|
||||
public void setB(Object b) {
|
||||
this.b = b;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(){
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,20 +1,19 @@
|
|||
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
public enum MatchType {
|
||||
|
||||
MATCH,
|
||||
NO_MATCH,
|
||||
UNDEFINED;
|
||||
MATCH, NO_MATCH, UNDEFINED;
|
||||
|
||||
public static MatchType parse(String value) {
|
||||
public static MatchType parse(String value) {
|
||||
|
||||
if (MATCH.name().equals(value)) {
|
||||
return MATCH;
|
||||
} else if (NO_MATCH.name().equals(value)) {
|
||||
return NO_MATCH;
|
||||
} else {
|
||||
return UNDEFINED;
|
||||
}
|
||||
if (MATCH.name().equals(value)) {
|
||||
return MATCH;
|
||||
} else if (NO_MATCH.name().equals(value)) {
|
||||
return NO_MATCH;
|
||||
} else {
|
||||
return UNDEFINED;
|
||||
}
|
||||
|
||||
// try {
|
||||
// return MatchType.valueOf(value);
|
||||
|
@ -22,5 +21,5 @@ public enum MatchType {
|
|||
// catch (IllegalArgumentException e) {
|
||||
// return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable
|
||||
// }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,166 +1,170 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.types.ArrayType;
|
||||
import org.apache.spark.sql.types.DataType;
|
||||
import org.apache.spark.sql.types.StringType;
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.types.ArrayType;
|
||||
import org.apache.spark.sql.types.DataType;
|
||||
import org.apache.spark.sql.types.StringType;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
public class TreeNodeDef implements Serializable {
|
||||
|
||||
final static String CROSS_COMPARE = "crossCompare";
|
||||
final static String CROSS_COMPARE = "crossCompare";
|
||||
|
||||
private List<FieldConf> fields;
|
||||
private AggType aggregation;
|
||||
private List<FieldConf> fields;
|
||||
private AggType aggregation;
|
||||
|
||||
private double threshold;
|
||||
private double threshold;
|
||||
|
||||
private String positive;
|
||||
private String negative;
|
||||
private String undefined;
|
||||
private String positive;
|
||||
private String negative;
|
||||
private String undefined;
|
||||
|
||||
boolean ignoreUndefined;
|
||||
boolean ignoreUndefined;
|
||||
|
||||
public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreUndefined) {
|
||||
this.fields = fields;
|
||||
this.aggregation = aggregation;
|
||||
this.threshold = threshold;
|
||||
this.positive = positive;
|
||||
this.negative = negative;
|
||||
this.undefined = undefined;
|
||||
this.ignoreUndefined = ignoreUndefined;
|
||||
}
|
||||
public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative,
|
||||
String undefined, boolean ignoreUndefined) {
|
||||
this.fields = fields;
|
||||
this.aggregation = aggregation;
|
||||
this.threshold = threshold;
|
||||
this.positive = positive;
|
||||
this.negative = negative;
|
||||
this.undefined = undefined;
|
||||
this.ignoreUndefined = ignoreUndefined;
|
||||
}
|
||||
|
||||
public TreeNodeDef() {}
|
||||
public TreeNodeDef() {
|
||||
}
|
||||
|
||||
//function for the evaluation of the node
|
||||
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
|
||||
// function for the evaluation of the node
|
||||
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
|
||||
|
||||
TreeNodeStats stats = new TreeNodeStats();
|
||||
TreeNodeStats stats = new TreeNodeStats();
|
||||
|
||||
//for each field in the node, it computes the
|
||||
for (FieldConf fieldConf : fields) {
|
||||
double weight = fieldConf.getWeight();
|
||||
double result;
|
||||
// for each field in the node, it computes the
|
||||
for (FieldConf fieldConf : fields) {
|
||||
double weight = fieldConf.getWeight();
|
||||
double result;
|
||||
|
||||
Object value1 = getJavaValue(doc1,fieldConf.getField());
|
||||
Object value2 = getJavaValue(doc2,fieldConf.getField());
|
||||
Object value1 = getJavaValue(doc1, fieldConf.getField());
|
||||
Object value2 = getJavaValue(doc2, fieldConf.getField());
|
||||
|
||||
// if the param specifies a cross comparison (i.e. compare elements from different fields), compute the
|
||||
// result for both sides and return the maximum
|
||||
String crossField = fieldConf.getParams().get(CROSS_COMPARE);
|
||||
if (crossField != null) {
|
||||
double result1 = comparator(fieldConf).compare(value1, getJavaValue(doc2, crossField), conf);
|
||||
double result2 = comparator(fieldConf).compare(getJavaValue(doc1, crossField), value2, conf);
|
||||
result = Math.max(result1, result2);
|
||||
} else {
|
||||
result = comparator(fieldConf).compare(value1, value2, conf);
|
||||
}
|
||||
|
||||
//if the param specifies a cross comparison (i.e. compare elements from different fields), compute the result for both sides and return the maximum
|
||||
String crossField = fieldConf.getParams().get(CROSS_COMPARE);
|
||||
if (crossField != null) {
|
||||
double result1 = comparator(fieldConf).compare(value1, getJavaValue(doc2,crossField), conf);
|
||||
double result2 = comparator(fieldConf).compare(getJavaValue(doc1,crossField), value2, conf);
|
||||
result = Math.max(result1,result2);
|
||||
}
|
||||
else {
|
||||
result = comparator(fieldConf).compare(value1, value2, conf);
|
||||
}
|
||||
stats
|
||||
.addFieldStats(
|
||||
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
|
||||
new FieldStats(
|
||||
weight,
|
||||
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")),
|
||||
result,
|
||||
fieldConf.isCountIfUndefined(),
|
||||
value1,
|
||||
value2));
|
||||
}
|
||||
|
||||
stats.addFieldStats(
|
||||
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),
|
||||
new FieldStats(
|
||||
weight,
|
||||
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")),
|
||||
result,
|
||||
fieldConf.isCountIfUndefined(),
|
||||
value1,
|
||||
value2
|
||||
));
|
||||
}
|
||||
return stats;
|
||||
}
|
||||
|
||||
return stats;
|
||||
}
|
||||
public Object getJavaValue(Row row, String name) {
|
||||
int pos = row.fieldIndex(name);
|
||||
if (pos >= 0) {
|
||||
DataType dt = row.schema().fields()[pos].dataType();
|
||||
if (dt instanceof StringType) {
|
||||
return row.getString(pos);
|
||||
} else if (dt instanceof ArrayType) {
|
||||
return row.getList(pos);
|
||||
}
|
||||
}
|
||||
|
||||
public Object getJavaValue(Row row, String name) {
|
||||
int pos = row.fieldIndex(name);
|
||||
if (pos >= 0) {
|
||||
DataType dt = row.schema().fields()[pos].dataType();
|
||||
if (dt instanceof StringType) {
|
||||
return row.getString(pos);
|
||||
} else if (dt instanceof ArrayType) {
|
||||
return row.getList(pos);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
private Comparator comparator(final FieldConf field) {
|
||||
|
||||
private Comparator comparator(final FieldConf field){
|
||||
return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams());
|
||||
}
|
||||
|
||||
return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams());
|
||||
}
|
||||
public List<FieldConf> getFields() {
|
||||
return fields;
|
||||
}
|
||||
|
||||
public List<FieldConf> getFields() {
|
||||
return fields;
|
||||
}
|
||||
public void setFields(List<FieldConf> fields) {
|
||||
this.fields = fields;
|
||||
}
|
||||
|
||||
public void setFields(List<FieldConf> fields) {
|
||||
this.fields = fields;
|
||||
}
|
||||
public AggType getAggregation() {
|
||||
return aggregation;
|
||||
}
|
||||
|
||||
public AggType getAggregation() {
|
||||
return aggregation;
|
||||
}
|
||||
public void setAggregation(AggType aggregation) {
|
||||
this.aggregation = aggregation;
|
||||
}
|
||||
|
||||
public void setAggregation(AggType aggregation) {
|
||||
this.aggregation = aggregation;
|
||||
}
|
||||
public double getThreshold() {
|
||||
return threshold;
|
||||
}
|
||||
|
||||
public double getThreshold() {
|
||||
return threshold;
|
||||
}
|
||||
public void setThreshold(double threshold) {
|
||||
this.threshold = threshold;
|
||||
}
|
||||
|
||||
public void setThreshold(double threshold) {
|
||||
this.threshold = threshold;
|
||||
}
|
||||
public String getPositive() {
|
||||
return positive;
|
||||
}
|
||||
|
||||
public String getPositive() {
|
||||
return positive;
|
||||
}
|
||||
public void setPositive(String positive) {
|
||||
this.positive = positive;
|
||||
}
|
||||
|
||||
public void setPositive(String positive) {
|
||||
this.positive = positive;
|
||||
}
|
||||
public String getNegative() {
|
||||
return negative;
|
||||
}
|
||||
|
||||
public String getNegative() {
|
||||
return negative;
|
||||
}
|
||||
public void setNegative(String negative) {
|
||||
this.negative = negative;
|
||||
}
|
||||
|
||||
public void setNegative(String negative) {
|
||||
this.negative = negative;
|
||||
}
|
||||
public String getUndefined() {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
public String getUndefined() {
|
||||
return undefined;
|
||||
}
|
||||
public void setUndefined(String undefined) {
|
||||
this.undefined = undefined;
|
||||
}
|
||||
|
||||
public void setUndefined(String undefined) {
|
||||
this.undefined = undefined;
|
||||
}
|
||||
public boolean isIgnoreUndefined() {
|
||||
return ignoreUndefined;
|
||||
}
|
||||
|
||||
public boolean isIgnoreUndefined() {
|
||||
return ignoreUndefined;
|
||||
}
|
||||
public void setIgnoreUndefined(boolean ignoreUndefined) {
|
||||
this.ignoreUndefined = ignoreUndefined;
|
||||
}
|
||||
|
||||
public void setIgnoreUndefined(boolean ignoreUndefined) {
|
||||
this.ignoreUndefined = ignoreUndefined;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||
}
|
||||
}
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
@ -6,129 +7,128 @@ import java.util.Map;
|
|||
|
||||
public class TreeNodeStats implements Serializable {
|
||||
|
||||
private Map<String, FieldStats> results; //this is an accumulator for the results of the node
|
||||
private Map<String, FieldStats> results; // this is an accumulator for the results of the node
|
||||
|
||||
public TreeNodeStats(){
|
||||
this.results = new HashMap<>();
|
||||
}
|
||||
public TreeNodeStats() {
|
||||
this.results = new HashMap<>();
|
||||
}
|
||||
|
||||
public Map<String, FieldStats> getResults() {
|
||||
return results;
|
||||
}
|
||||
public Map<String, FieldStats> getResults() {
|
||||
return results;
|
||||
}
|
||||
|
||||
public void addFieldStats(String id, FieldStats fieldStats){
|
||||
this.results.put(id, fieldStats);
|
||||
}
|
||||
public void addFieldStats(String id, FieldStats fieldStats) {
|
||||
this.results.put(id, fieldStats);
|
||||
}
|
||||
|
||||
public int fieldsCount(){
|
||||
return this.results.size();
|
||||
}
|
||||
public int fieldsCount() {
|
||||
return this.results.size();
|
||||
}
|
||||
|
||||
public int undefinedCount(){
|
||||
int undefinedCount = 0;
|
||||
for(FieldStats fs: this.results.values()){
|
||||
if(fs.getResult() == -1)
|
||||
undefinedCount ++;
|
||||
}
|
||||
return undefinedCount;
|
||||
}
|
||||
public int undefinedCount() {
|
||||
int undefinedCount = 0;
|
||||
for (FieldStats fs : this.results.values()) {
|
||||
if (fs.getResult() == -1)
|
||||
undefinedCount++;
|
||||
}
|
||||
return undefinedCount;
|
||||
}
|
||||
|
||||
public double scoreSum(){
|
||||
double scoreSum = 0.0;
|
||||
for(FieldStats fs: this.results.values()){
|
||||
if(fs.getResult()>=0.0) {
|
||||
scoreSum += fs.getResult();
|
||||
}
|
||||
}
|
||||
return scoreSum;
|
||||
}
|
||||
public double scoreSum() {
|
||||
double scoreSum = 0.0;
|
||||
for (FieldStats fs : this.results.values()) {
|
||||
if (fs.getResult() >= 0.0) {
|
||||
scoreSum += fs.getResult();
|
||||
}
|
||||
}
|
||||
return scoreSum;
|
||||
}
|
||||
|
||||
//return the sum of the weights without considering the fields with countIfMissing=false && result=-1
|
||||
public double weightSum(){
|
||||
double weightSum = 0.0;
|
||||
for(FieldStats fs: this.results.values()){
|
||||
if(fs.getResult()>=0.0 || (fs.getResult()<0.0 && fs.isCountIfUndefined())) {
|
||||
weightSum += fs.getWeight();
|
||||
}
|
||||
}
|
||||
return weightSum;
|
||||
}
|
||||
// return the sum of the weights without considering the fields with countIfMissing=false && result=-1
|
||||
public double weightSum() {
|
||||
double weightSum = 0.0;
|
||||
for (FieldStats fs : this.results.values()) {
|
||||
if (fs.getResult() >= 0.0 || (fs.getResult() < 0.0 && fs.isCountIfUndefined())) {
|
||||
weightSum += fs.getWeight();
|
||||
}
|
||||
}
|
||||
return weightSum;
|
||||
}
|
||||
|
||||
public double weightedScoreSum(){
|
||||
double weightedScoreSum = 0.0;
|
||||
for(FieldStats fs: this.results.values()){
|
||||
if(fs.getResult()>=0.0) {
|
||||
weightedScoreSum += fs.getResult()*fs.getWeight();
|
||||
}
|
||||
}
|
||||
return weightedScoreSum;
|
||||
}
|
||||
public double weightedScoreSum() {
|
||||
double weightedScoreSum = 0.0;
|
||||
for (FieldStats fs : this.results.values()) {
|
||||
if (fs.getResult() >= 0.0) {
|
||||
weightedScoreSum += fs.getResult() * fs.getWeight();
|
||||
}
|
||||
}
|
||||
return weightedScoreSum;
|
||||
}
|
||||
|
||||
public double max(){
|
||||
double max = -1.0;
|
||||
for(FieldStats fs: this.results.values()){
|
||||
if(fs.getResult()>max)
|
||||
max = fs.getResult();
|
||||
}
|
||||
return max;
|
||||
}
|
||||
public double max() {
|
||||
double max = -1.0;
|
||||
for (FieldStats fs : this.results.values()) {
|
||||
if (fs.getResult() > max)
|
||||
max = fs.getResult();
|
||||
}
|
||||
return max;
|
||||
}
|
||||
|
||||
public double min(){
|
||||
double min = 100.0; //random high value
|
||||
for(FieldStats fs: this.results.values()){
|
||||
if(fs.getResult()<min) {
|
||||
if (fs.getResult()>=0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
|
||||
min = fs.getResult();
|
||||
}
|
||||
}
|
||||
return min;
|
||||
}
|
||||
public double min() {
|
||||
double min = 100.0; // random high value
|
||||
for (FieldStats fs : this.results.values()) {
|
||||
if (fs.getResult() < min) {
|
||||
if (fs.getResult() >= 0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
|
||||
min = fs.getResult();
|
||||
}
|
||||
}
|
||||
return min;
|
||||
}
|
||||
|
||||
//if at least one is true, return 1.0
|
||||
public double or(){
|
||||
for (FieldStats fieldStats : this.results.values()) {
|
||||
if (fieldStats.getResult() >= fieldStats.getThreshold())
|
||||
return 1.0;
|
||||
}
|
||||
return 0.0;
|
||||
}
|
||||
// if at least one is true, return 1.0
|
||||
public double or() {
|
||||
for (FieldStats fieldStats : this.results.values()) {
|
||||
if (fieldStats.getResult() >= fieldStats.getThreshold())
|
||||
return 1.0;
|
||||
}
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
//if at least one is false, return 0.0
|
||||
public double and() {
|
||||
for (FieldStats fieldStats : this.results.values()) {
|
||||
// if at least one is false, return 0.0
|
||||
public double and() {
|
||||
for (FieldStats fieldStats : this.results.values()) {
|
||||
|
||||
if (fieldStats.getResult() == -1) {
|
||||
if (fieldStats.isCountIfUndefined())
|
||||
return 0.0;
|
||||
}
|
||||
else {
|
||||
if (fieldStats.getResult() < fieldStats.getThreshold())
|
||||
return 0.0;
|
||||
}
|
||||
if (fieldStats.getResult() == -1) {
|
||||
if (fieldStats.isCountIfUndefined())
|
||||
return 0.0;
|
||||
} else {
|
||||
if (fieldStats.getResult() < fieldStats.getThreshold())
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
}
|
||||
return 1.0;
|
||||
}
|
||||
}
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
public double getFinalScore(AggType aggregation){
|
||||
public double getFinalScore(AggType aggregation) {
|
||||
|
||||
switch (aggregation){
|
||||
case AVG:
|
||||
return scoreSum()/fieldsCount();
|
||||
case SUM:
|
||||
return scoreSum();
|
||||
case MAX:
|
||||
return max();
|
||||
case MIN:
|
||||
return min();
|
||||
case W_MEAN:
|
||||
return weightedScoreSum()/weightSum();
|
||||
case OR:
|
||||
return or();
|
||||
case AND:
|
||||
return and();
|
||||
default:
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
switch (aggregation) {
|
||||
case AVG:
|
||||
return scoreSum() / fieldsCount();
|
||||
case SUM:
|
||||
return scoreSum();
|
||||
case MAX:
|
||||
return max();
|
||||
case MIN:
|
||||
return min();
|
||||
case W_MEAN:
|
||||
return weightedScoreSum() / weightSum();
|
||||
case OR:
|
||||
return or();
|
||||
case AND:
|
||||
return and();
|
||||
default:
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,11 +1,12 @@
|
|||
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.spark.sql.Row;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
/**
|
||||
* The compare between two documents is given by the weighted mean of the field distances
|
||||
|
@ -23,11 +24,11 @@ public class TreeProcessor {
|
|||
// row based copies
|
||||
|
||||
public boolean compare(final Row a, final Row b) {
|
||||
//evaluate the decision tree
|
||||
// evaluate the decision tree
|
||||
return evaluateTree(a, b).getResult() == MatchType.MATCH;
|
||||
}
|
||||
|
||||
public TreeStats evaluateTree(final Row doc1, final Row doc2){
|
||||
public TreeStats evaluateTree(final Row doc1, final Row doc2) {
|
||||
|
||||
TreeStats treeStats = new TreeStats();
|
||||
|
||||
|
@ -36,26 +37,25 @@ public class TreeProcessor {
|
|||
do {
|
||||
|
||||
TreeNodeDef currentNode = config.decisionTree().get(nextNodeName);
|
||||
//throw an exception if the node doesn't exist
|
||||
// throw an exception if the node doesn't exist
|
||||
if (currentNode == null)
|
||||
throw new PaceException("Missing tree node: " + nextNodeName);
|
||||
|
||||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||
treeStats.addNodeStats(nextNodeName, stats);
|
||||
|
||||
//if ignoreUndefined=false the miss is considered as undefined
|
||||
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) {
|
||||
// if ignoreUndefined=false the miss is considered as undefined
|
||||
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
|
||||
nextNodeName = currentNode.getUndefined();
|
||||
}
|
||||
//if ignoreUndefined=true the miss is ignored and the score computed anyway
|
||||
// if ignoreUndefined=true the miss is ignored and the score computed anyway
|
||||
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
||||
nextNodeName = currentNode.getPositive();
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
nextNodeName = currentNode.getNegative();
|
||||
}
|
||||
|
||||
} while (MatchType.parse(nextNodeName)==MatchType.UNDEFINED);
|
||||
} while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED);
|
||||
|
||||
treeStats.setResult(MatchType.parse(nextNodeName));
|
||||
return treeStats;
|
||||
|
@ -68,25 +68,24 @@ public class TreeProcessor {
|
|||
do {
|
||||
|
||||
TreeNodeDef currentNode = config.decisionTree().get(nextNodeName);
|
||||
//throw an exception if the node doesn't exist
|
||||
// throw an exception if the node doesn't exist
|
||||
if (currentNode == null)
|
||||
throw new PaceException("The Tree Node doesn't exist: " + nextNodeName);
|
||||
|
||||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||
|
||||
score = stats.getFinalScore(currentNode.getAggregation());
|
||||
//if ignoreUndefined=false the miss is considered as undefined
|
||||
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) {
|
||||
// if ignoreUndefined=false the miss is considered as undefined
|
||||
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
|
||||
nextNodeName = currentNode.getUndefined();
|
||||
}
|
||||
//if ignoreUndefined=true the miss is ignored and the score computed anyway
|
||||
// if ignoreUndefined=true the miss is ignored and the score computed anyway
|
||||
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
||||
nextNodeName = currentNode.getPositive();
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
nextNodeName = currentNode.getNegative();
|
||||
}
|
||||
} while (MatchType.parse(nextNodeName)==MatchType.UNDEFINED);
|
||||
} while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED);
|
||||
|
||||
return score;
|
||||
}
|
||||
|
|
|
@ -1,51 +1,52 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
|
||||
public class TreeStats {
|
||||
|
||||
//<layer_id, <field:comparator, result>>
|
||||
Map<String, TreeNodeStats> stats;
|
||||
MatchType result;
|
||||
// <layer_id, <field:comparator, result>>
|
||||
Map<String, TreeNodeStats> stats;
|
||||
MatchType result;
|
||||
|
||||
public TreeStats(){
|
||||
this.stats = new HashMap<>();
|
||||
this.result = MatchType.NO_MATCH;
|
||||
}
|
||||
public TreeStats() {
|
||||
this.stats = new HashMap<>();
|
||||
this.result = MatchType.NO_MATCH;
|
||||
}
|
||||
|
||||
public MatchType getResult(){
|
||||
return this.result;
|
||||
}
|
||||
public MatchType getResult() {
|
||||
return this.result;
|
||||
}
|
||||
|
||||
public void setResult(MatchType result){
|
||||
this.result = result;
|
||||
}
|
||||
public void setResult(MatchType result) {
|
||||
this.result = result;
|
||||
}
|
||||
|
||||
public Map<String, TreeNodeStats> getStats() {
|
||||
return stats;
|
||||
}
|
||||
public Map<String, TreeNodeStats> getStats() {
|
||||
return stats;
|
||||
}
|
||||
|
||||
public void setStats(Map<String, TreeNodeStats> stats) {
|
||||
this.stats = stats;
|
||||
}
|
||||
public void setStats(Map<String, TreeNodeStats> stats) {
|
||||
this.stats = stats;
|
||||
}
|
||||
|
||||
public void addNodeStats(String layerID, TreeNodeStats treeNodeStats){
|
||||
this.stats.put(layerID, treeNodeStats);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(){
|
||||
try {
|
||||
return new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||
}
|
||||
}
|
||||
public void addNodeStats(String layerID, TreeNodeStats treeNodeStats) {
|
||||
this.stats.put(layerID, treeNodeStats);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
return new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Impossible to convert to JSON: ", e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
|
||||
package eu.dnetlib.pace.util;
|
||||
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.config.WfConfig;
|
||||
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
@ -12,127 +15,137 @@ import org.apache.spark.sql.types.DataType;
|
|||
import org.apache.spark.sql.types.StringType;
|
||||
import org.apache.spark.sql.types.StructType;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.config.WfConfig;
|
||||
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
||||
|
||||
public class BlockProcessor {
|
||||
|
||||
public static final List<String> accumulators= new ArrayList<>();
|
||||
public static final List<String> accumulators = new ArrayList<>();
|
||||
|
||||
private static final Log log = LogFactory.getLog(BlockProcessor.class);
|
||||
private static final Log log = LogFactory.getLog(BlockProcessor.class);
|
||||
|
||||
private DedupConfig dedupConf;
|
||||
private DedupConfig dedupConf;
|
||||
|
||||
private final int identifierFieldPos;
|
||||
private final int orderFieldPos;
|
||||
private final int identifierFieldPos;
|
||||
private final int orderFieldPos;
|
||||
|
||||
public static void constructAccumulator( final DedupConfig dedupConf) {
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1"));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "skip list"));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));
|
||||
}
|
||||
public static void constructAccumulator(final DedupConfig dedupConf) {
|
||||
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"));
|
||||
accumulators
|
||||
.add(
|
||||
String
|
||||
.format(
|
||||
"%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
|
||||
accumulators
|
||||
.add(
|
||||
String
|
||||
.format(
|
||||
"%s::%s", dedupConf.getWf().getEntityType(),
|
||||
String
|
||||
.format(
|
||||
"Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(),
|
||||
dedupConf.getWf().getGroupMaxSize())));
|
||||
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"));
|
||||
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"));
|
||||
accumulators
|
||||
.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));
|
||||
}
|
||||
|
||||
public BlockProcessor(DedupConfig dedupConf, int identifierFieldPos, int orderFieldPos) {
|
||||
this.dedupConf = dedupConf;
|
||||
this.identifierFieldPos = identifierFieldPos;
|
||||
this.orderFieldPos = orderFieldPos;
|
||||
}
|
||||
public BlockProcessor(DedupConfig dedupConf, int identifierFieldPos, int orderFieldPos) {
|
||||
this.dedupConf = dedupConf;
|
||||
this.identifierFieldPos = identifierFieldPos;
|
||||
this.orderFieldPos = orderFieldPos;
|
||||
}
|
||||
|
||||
public void processSortedRows(final Collection<Row> documents, final Reporter context) {
|
||||
if (documents.size() > 1) {
|
||||
public void processSortedRows(final Collection<Row> documents, final Reporter context) {
|
||||
if (documents.size() > 1) {
|
||||
// log.info("reducing key: '" + key + "' records: " + q.size());
|
||||
processRows(documents, context);
|
||||
processRows(documents, context);
|
||||
|
||||
} else {
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
|
||||
}
|
||||
}
|
||||
|
||||
private void processRows(final Collection<Row> queue, final Reporter context) {
|
||||
|
||||
private void processRows(final Collection<Row> queue, final Reporter context) {
|
||||
Iterator<Row> it = queue.iterator();
|
||||
while (it.hasNext()) {
|
||||
|
||||
Iterator<Row> it = queue.iterator();
|
||||
while (it.hasNext()) {
|
||||
final Row pivot = it.next();
|
||||
it.remove();
|
||||
|
||||
final Row pivot = it.next();
|
||||
it.remove();
|
||||
final String idPivot = pivot.getString(identifierFieldPos); // identifier
|
||||
final Object fieldsPivot = getJavaValue(pivot, orderFieldPos);
|
||||
final String fieldPivot = (fieldsPivot == null) ? "" : fieldsPivot.toString();
|
||||
final WfConfig wf = dedupConf.getWf();
|
||||
|
||||
if (fieldPivot != null) {
|
||||
int i = 0;
|
||||
for (final Row curr : queue) {
|
||||
final String idCurr = curr.getString(identifierFieldPos); // identifier
|
||||
|
||||
final String idPivot = pivot.getString(identifierFieldPos); //identifier
|
||||
final Object fieldsPivot = getJavaValue(pivot, orderFieldPos);
|
||||
final String fieldPivot = (fieldsPivot == null) ? "" : fieldsPivot.toString();
|
||||
final WfConfig wf = dedupConf.getWf();
|
||||
if (mustSkip(idCurr)) {
|
||||
|
||||
if (fieldPivot != null) {
|
||||
int i = 0;
|
||||
for (final Row curr : queue) {
|
||||
final String idCurr = curr.getString(identifierFieldPos); //identifier
|
||||
context.incrementCounter(wf.getEntityType(), "skip list", 1);
|
||||
|
||||
if (mustSkip(idCurr)) {
|
||||
break;
|
||||
}
|
||||
|
||||
context.incrementCounter(wf.getEntityType(), "skip list", 1);
|
||||
if (i > wf.getSlidingWindowSize()) {
|
||||
break;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
final Object fieldsCurr = getJavaValue(curr, orderFieldPos);
|
||||
final String fieldCurr = (fieldsCurr == null) ? null : fieldsCurr.toString();
|
||||
|
||||
if (i > wf.getSlidingWindowSize()) {
|
||||
break;
|
||||
}
|
||||
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
|
||||
|
||||
final Object fieldsCurr = getJavaValue(curr, orderFieldPos);
|
||||
final String fieldCurr = (fieldsCurr == null) ? null : fieldsCurr.toString();
|
||||
final TreeProcessor treeProcessor = new TreeProcessor(dedupConf);
|
||||
|
||||
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
|
||||
emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context);
|
||||
|
||||
final TreeProcessor treeProcessor = new TreeProcessor(dedupConf);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context);
|
||||
public Object getJavaValue(Row row, int pos) {
|
||||
DataType dt = row.schema().fields()[pos].dataType();
|
||||
if (dt instanceof StringType) {
|
||||
return row.getString(pos);
|
||||
} else if (dt instanceof ArrayType) {
|
||||
return row.getList(pos);
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public Object getJavaValue(Row row, int pos) {
|
||||
DataType dt = row.schema().fields()[pos].dataType();
|
||||
if (dt instanceof StringType) {
|
||||
return row.getString(pos);
|
||||
} else if (dt instanceof ArrayType) {
|
||||
return row.getList(pos);
|
||||
}
|
||||
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
||||
|
||||
return null;
|
||||
}
|
||||
if (result) {
|
||||
writeSimilarity(context, idPivot, idCurr);
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
|
||||
} else {
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
||||
private boolean mustSkip(final String idPivot) {
|
||||
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
|
||||
}
|
||||
|
||||
if (result) {
|
||||
writeSimilarity(context, idPivot, idCurr);
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
|
||||
} else {
|
||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
|
||||
}
|
||||
}
|
||||
private String getNsPrefix(final String id) {
|
||||
return StringUtils.substringBetween(id, "|", "::");
|
||||
}
|
||||
|
||||
private boolean mustSkip(final String idPivot) {
|
||||
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
|
||||
}
|
||||
private void writeSimilarity(final Reporter context, final String from, final String to) {
|
||||
final String type = dedupConf.getWf().getEntityType();
|
||||
|
||||
private String getNsPrefix(final String id) {
|
||||
return StringUtils.substringBetween(id, "|", "::");
|
||||
}
|
||||
|
||||
private void writeSimilarity(final Reporter context, final String from, final String to) {
|
||||
final String type = dedupConf.getWf().getEntityType();
|
||||
|
||||
context.emit(type, from, to);
|
||||
context.emit(type, to, from);
|
||||
}
|
||||
context.emit(type, from, to);
|
||||
context.emit(type, to, from);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,15 +1,18 @@
|
|||
|
||||
package eu.dnetlib.pace.util;
|
||||
|
||||
import org.apache.commons.lang3.text.WordUtils;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
import org.apache.commons.lang3.text.WordUtils;
|
||||
|
||||
public class Capitalise implements Function<String, String> {
|
||||
|
||||
private final char[] DELIM = {' ', '-'};
|
||||
private final char[] DELIM = {
|
||||
' ', '-'
|
||||
};
|
||||
|
||||
@Override
|
||||
public String apply(final String s) {
|
||||
return WordUtils.capitalize(s.toLowerCase(), DELIM);
|
||||
}
|
||||
@Override
|
||||
public String apply(final String s) {
|
||||
return WordUtils.capitalize(s.toLowerCase(), DELIM);
|
||||
}
|
||||
};
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,3 +1,4 @@
|
|||
|
||||
package eu.dnetlib.pace.util;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
|
@ -7,4 +8,4 @@ public class DotAbbreviations implements Function<String, String> {
|
|||
public String apply(String s) {
|
||||
return s.length() == 1 ? s + "." : s;
|
||||
}
|
||||
};
|
||||
};
|
||||
|
|
|
@ -1,117 +1,172 @@
|
|||
package eu.dnetlib.pace.util;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.jayway.jsonpath.Configuration;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
import com.jayway.jsonpath.Option;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.*;
|
||||
import net.minidev.json.JSONArray;
|
||||
package eu.dnetlib.pace.util;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
import java.util.*;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.jayway.jsonpath.Configuration;
|
||||
import com.jayway.jsonpath.DocumentContext;
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
import com.jayway.jsonpath.Option;
|
||||
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.*;
|
||||
import net.minidev.json.JSONArray;
|
||||
|
||||
public class MapDocumentUtil {
|
||||
|
||||
public static final String URL_REGEX = "^(http|https|ftp)\\://.*";
|
||||
public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX);
|
||||
public static final String URL_REGEX = "^(http|https|ftp)\\://.*";
|
||||
public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX);
|
||||
|
||||
public static List<String> getJPathList(String path, String json, Type type) {
|
||||
if (type == Type.List)
|
||||
return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path);
|
||||
Object jresult;
|
||||
List<String> result = new ArrayList<>();
|
||||
try {
|
||||
jresult = JsonPath.read(json, path);
|
||||
} catch (Throwable e) {
|
||||
return result;
|
||||
}
|
||||
if (jresult instanceof JSONArray) {
|
||||
public static List<String> getJPathList(String path, String json, Type type) {
|
||||
if (type == Type.List)
|
||||
return JsonPath
|
||||
.using(
|
||||
Configuration
|
||||
.defaultConfiguration()
|
||||
.addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS))
|
||||
.parse(json)
|
||||
.read(path);
|
||||
Object jresult;
|
||||
List<String> result = new ArrayList<>();
|
||||
try {
|
||||
jresult = JsonPath.read(json, path);
|
||||
} catch (Throwable e) {
|
||||
return result;
|
||||
}
|
||||
if (jresult instanceof JSONArray) {
|
||||
|
||||
((JSONArray) jresult).forEach(it -> {
|
||||
((JSONArray) jresult).forEach(it -> {
|
||||
|
||||
try {
|
||||
result.add(new ObjectMapper().writeValueAsString(it));
|
||||
} catch (JsonProcessingException e) {
|
||||
try {
|
||||
result.add(new ObjectMapper().writeValueAsString(it));
|
||||
} catch (JsonProcessingException e) {
|
||||
|
||||
}
|
||||
}
|
||||
);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
if (jresult instanceof LinkedHashMap) {
|
||||
try {
|
||||
result.add(new ObjectMapper().writeValueAsString(jresult));
|
||||
} catch (JsonProcessingException e) {
|
||||
if (jresult instanceof LinkedHashMap) {
|
||||
try {
|
||||
result.add(new ObjectMapper().writeValueAsString(jresult));
|
||||
} catch (JsonProcessingException e) {
|
||||
|
||||
}
|
||||
return result;
|
||||
}
|
||||
if (jresult instanceof String) {
|
||||
result.add((String) jresult);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
if (jresult instanceof String) {
|
||||
result.add((String) jresult);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public static String getJPathString(final String jsonPath, final String json) {
|
||||
try {
|
||||
Object o = JsonPath.read(json, jsonPath);
|
||||
if (o instanceof String)
|
||||
return (String) o;
|
||||
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
|
||||
return (String) ((JSONArray) o).get(0);
|
||||
return "";
|
||||
} catch (Exception e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
public static String getJPathString(final String jsonPath, final String json) {
|
||||
try {
|
||||
Object o = JsonPath.read(json, jsonPath);
|
||||
if (o instanceof String)
|
||||
return (String)o;
|
||||
if (o instanceof JSONArray && ((JSONArray)o).size()>0)
|
||||
return (String)((JSONArray)o).get(0);
|
||||
return "";
|
||||
} catch (Exception e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
public static double[] getJPathArray(final String jsonPath, final String json) {
|
||||
try {
|
||||
Object o = JsonPath.read(json, jsonPath);
|
||||
if (o instanceof double[])
|
||||
return (double[]) o;
|
||||
if (o instanceof JSONArray) {
|
||||
Object[] objects = ((JSONArray) o).toArray();
|
||||
double[] array = new double[objects.length];
|
||||
for (int i = 0; i < objects.length; i++) {
|
||||
if (objects[i] instanceof BigDecimal)
|
||||
array[i] = ((BigDecimal) objects[i]).doubleValue();
|
||||
else
|
||||
array[i] = (double) objects[i];
|
||||
}
|
||||
return array;
|
||||
}
|
||||
return new double[0];
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
return new double[0];
|
||||
}
|
||||
}
|
||||
|
||||
public static double[] getJPathArray(final String jsonPath, final String json) {
|
||||
try {
|
||||
Object o = JsonPath.read(json, jsonPath);
|
||||
if (o instanceof double[])
|
||||
return (double[]) o;
|
||||
if (o instanceof JSONArray) {
|
||||
Object[] objects = ((JSONArray) o).toArray();
|
||||
double[] array = new double[objects.length];
|
||||
for (int i = 0; i < objects.length; i++) {
|
||||
if (objects[i] instanceof BigDecimal)
|
||||
array[i] = ((BigDecimal)objects[i]).doubleValue();
|
||||
else
|
||||
array[i] = (double) objects[i];
|
||||
}
|
||||
return array;
|
||||
}
|
||||
return new double[0];
|
||||
}
|
||||
catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
return new double[0];
|
||||
}
|
||||
}
|
||||
public static String truncateValue(String value, int length) {
|
||||
if (value == null)
|
||||
return "";
|
||||
|
||||
if (length == -1 || length > value.length())
|
||||
return value;
|
||||
|
||||
public static String truncateValue(String value, int length) {
|
||||
if (value == null)
|
||||
return "";
|
||||
return value.substring(0, length);
|
||||
}
|
||||
|
||||
if (length == -1 || length > value.length())
|
||||
return value;
|
||||
public static List<String> truncateList(List<String> list, int size) {
|
||||
if (size == -1 || size > list.size())
|
||||
return list;
|
||||
|
||||
return value.substring(0, length);
|
||||
}
|
||||
return list.subList(0, size);
|
||||
}
|
||||
|
||||
public static List<String> truncateList(List<String> list, int size) {
|
||||
if (size == -1 || size > list.size())
|
||||
return list;
|
||||
public static String getJPathString(final String jsonPath, final DocumentContext json) {
|
||||
try {
|
||||
Object o = json.read(jsonPath);
|
||||
if (o instanceof String)
|
||||
return (String) o;
|
||||
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
|
||||
return (String) ((JSONArray) o).get(0);
|
||||
return "";
|
||||
} catch (Exception e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
return list.subList(0, size);
|
||||
}
|
||||
public static List<String> getJPathList(String path, DocumentContext json, Type type) {
|
||||
// if (type == Type.List)
|
||||
// return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST,
|
||||
// Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path);
|
||||
Object jresult;
|
||||
List<String> result = new ArrayList<>();
|
||||
try {
|
||||
jresult = json.read(path);
|
||||
} catch (Throwable e) {
|
||||
return result;
|
||||
}
|
||||
|
||||
if (jresult instanceof JSONArray) {
|
||||
((JSONArray) jresult).forEach(it -> {
|
||||
try {
|
||||
result.add(new ObjectMapper().writeValueAsString(it));
|
||||
} catch (JsonProcessingException e) {
|
||||
|
||||
}
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
if (jresult instanceof LinkedHashMap) {
|
||||
try {
|
||||
result.add(new ObjectMapper().writeValueAsString(jresult));
|
||||
} catch (JsonProcessingException e) {
|
||||
|
||||
}
|
||||
return result;
|
||||
}
|
||||
if (jresult instanceof String) {
|
||||
result.add((String) jresult);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,13 +1,14 @@
|
|||
|
||||
package eu.dnetlib.pace.util;
|
||||
|
||||
public class PaceException extends RuntimeException {
|
||||
|
||||
public PaceException(String s, Throwable e){
|
||||
super(s, e);
|
||||
}
|
||||
public PaceException(String s, Throwable e) {
|
||||
super(s, e);
|
||||
}
|
||||
|
||||
public PaceException(String s){
|
||||
super(s);
|
||||
}
|
||||
public PaceException(String s) {
|
||||
super(s);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,49 +1,61 @@
|
|||
package eu.dnetlib.pace.util;
|
||||
|
||||
import eu.dnetlib.pace.clustering.ClusteringClass;
|
||||
import eu.dnetlib.pace.clustering.ClusteringFunction;
|
||||
import eu.dnetlib.pace.tree.support.Comparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.reflections.Reflections;
|
||||
package eu.dnetlib.pace.util;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.reflections.Reflections;
|
||||
|
||||
import eu.dnetlib.pace.clustering.ClusteringClass;
|
||||
import eu.dnetlib.pace.clustering.ClusteringFunction;
|
||||
import eu.dnetlib.pace.tree.support.Comparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
public class PaceResolver implements Serializable {
|
||||
|
||||
public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering");
|
||||
public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree");
|
||||
public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering");
|
||||
public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree");
|
||||
|
||||
private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
|
||||
private final Map<String, Class<Comparator>> comparators;
|
||||
private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
|
||||
private final Map<String, Class<Comparator>> comparators;
|
||||
|
||||
public PaceResolver() {
|
||||
public PaceResolver() {
|
||||
|
||||
this.clusteringFunctions = CLUSTERING_RESOLVER.getTypesAnnotatedWith(ClusteringClass.class).stream()
|
||||
.filter(ClusteringFunction.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>)cl));
|
||||
this.clusteringFunctions = CLUSTERING_RESOLVER
|
||||
.getTypesAnnotatedWith(ClusteringClass.class)
|
||||
.stream()
|
||||
.filter(ClusteringFunction.class::isAssignableFrom)
|
||||
.collect(
|
||||
Collectors
|
||||
.toMap(
|
||||
cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>) cl));
|
||||
|
||||
this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream()
|
||||
.filter(Comparator.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>)cl));
|
||||
}
|
||||
this.comparators = COMPARATOR_RESOLVER
|
||||
.getTypesAnnotatedWith(ComparatorClass.class)
|
||||
.stream()
|
||||
.filter(Comparator.class::isAssignableFrom)
|
||||
.collect(
|
||||
Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>) cl));
|
||||
}
|
||||
|
||||
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
|
||||
try {
|
||||
return clusteringFunctions.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
|
||||
throw new PaceException(name + " not found ", e);
|
||||
}
|
||||
}
|
||||
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
|
||||
try {
|
||||
return clusteringFunctions.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException
|
||||
| NoSuchMethodException e) {
|
||||
throw new PaceException(name + " not found ", e);
|
||||
}
|
||||
}
|
||||
|
||||
public Comparator getComparator(String name, Map<String, String> params) throws PaceException {
|
||||
try {
|
||||
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) {
|
||||
throw new PaceException(name + " not found ", e);
|
||||
}
|
||||
}
|
||||
public Comparator getComparator(String name, Map<String, String> params) throws PaceException {
|
||||
try {
|
||||
return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException
|
||||
| NullPointerException e) {
|
||||
throw new PaceException(name + " not found ", e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
package eu.dnetlib.pace.util;
|
||||
|
||||
package eu.dnetlib.pace.util;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public interface Reporter extends Serializable {
|
||||
|
||||
void incrementCounter(String counterGroup, String counterName, long delta);
|
||||
void incrementCounter(String counterGroup, String counterName, long delta);
|
||||
|
||||
void emit(String type, String from, String to);
|
||||
void emit(String type, String from, String to);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
|
||||
package eu.dnetlib.pace.util;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.spark.SparkContext;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.util.Reporter;
|
||||
import scala.Serializable;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkReporter implements Serializable, Reporter {
|
||||
|
||||
private final List<Tuple2<String, String>> relations = new ArrayList<>();
|
||||
|
||||
private final Map<String, LongAccumulator> accumulators;
|
||||
|
||||
public SparkReporter(Map<String, LongAccumulator> accumulators) {
|
||||
this.accumulators = accumulators;
|
||||
}
|
||||
|
||||
public void incrementCounter(
|
||||
String counterGroup,
|
||||
String counterName,
|
||||
long delta,
|
||||
Map<String, LongAccumulator> accumulators) {
|
||||
|
||||
final String accumulatorName = String.format("%s::%s", counterGroup, counterName);
|
||||
if (accumulators.containsKey(accumulatorName)) {
|
||||
accumulators.get(accumulatorName).add(delta);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void incrementCounter(String counterGroup, String counterName, long delta) {
|
||||
|
||||
incrementCounter(counterGroup, counterName, delta, accumulators);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void emit(String type, String from, String to) {
|
||||
relations.add(new Tuple2<>(from, to));
|
||||
}
|
||||
|
||||
public List<Tuple2<String, String>> getRelations() {
|
||||
return relations;
|
||||
}
|
||||
|
||||
public static Map<String, LongAccumulator> constructAccumulator(
|
||||
final DedupConfig dedupConf, final SparkContext context) {
|
||||
|
||||
Map<String, LongAccumulator> accumulators = new HashMap<>();
|
||||
|
||||
String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1");
|
||||
accumulators.put(acc1, context.longAccumulator(acc1));
|
||||
String acc2 = String
|
||||
.format(
|
||||
"%s::%s",
|
||||
dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField());
|
||||
accumulators.put(acc2, context.longAccumulator(acc2));
|
||||
String acc3 = String
|
||||
.format(
|
||||
"%s::%s",
|
||||
dedupConf.getWf().getEntityType(),
|
||||
String
|
||||
.format(
|
||||
"Skipped records for count(%s) >= %s",
|
||||
dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()));
|
||||
accumulators.put(acc3, context.longAccumulator(acc3));
|
||||
String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list");
|
||||
accumulators.put(acc4, context.longAccumulator(acc4));
|
||||
String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)");
|
||||
accumulators.put(acc5, context.longAccumulator(acc5));
|
||||
String acc6 = String
|
||||
.format(
|
||||
"%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold());
|
||||
accumulators.put(acc6, context.longAccumulator(acc6));
|
||||
|
||||
return accumulators;
|
||||
}
|
||||
}
|
|
@ -1,12 +1,14 @@
|
|||
package eu.dnetlib.pace;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
package eu.dnetlib.pace;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
|
||||
public abstract class AbstractPaceTest extends AbstractPaceFunctions {
|
||||
|
||||
protected String readFromClasspath(final String filename) {
|
||||
|
@ -35,7 +37,7 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions {
|
|||
return a;
|
||||
}
|
||||
|
||||
protected List<String> createFieldList(List<String> strings, String fieldName){
|
||||
protected List<String> createFieldList(List<String> strings, String fieldName) {
|
||||
return strings;
|
||||
|
||||
}
|
||||
|
|
|
@ -1,17 +1,20 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.AbstractPaceTest;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import org.junit.jupiter.api.*;
|
||||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.*;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.pace.AbstractPaceTest;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
|
||||
public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||
|
||||
private static Map<String, Integer> params;
|
||||
|
@ -20,7 +23,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
@BeforeAll
|
||||
public static void setUp() throws Exception {
|
||||
params = Maps.newHashMap();
|
||||
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ClusteringFunctionTest.class));
|
||||
conf = DedupConfig
|
||||
.load(
|
||||
AbstractPaceFunctions
|
||||
.readFromClasspath(
|
||||
"/eu/dnetlib/pace/config/organization.current.conf.json", ClusteringFunctionTest.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -210,7 +217,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testPersonClustering(){
|
||||
public void testPersonClustering() {
|
||||
|
||||
final ClusteringFunction cf = new PersonClustering(params);
|
||||
final String s = "Abd-Alla, Abo-el-nour N.";
|
||||
|
@ -224,7 +231,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testPersonHash(){
|
||||
public void testPersonHash() {
|
||||
|
||||
final ClusteringFunction cf = new PersonHash(params);
|
||||
final String s = "Manghi, Paolo";
|
||||
|
@ -238,7 +245,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testLastNameFirstInitial(){
|
||||
public void testLastNameFirstInitial() {
|
||||
|
||||
final ClusteringFunction cf = new LastNameFirstInitial(params);
|
||||
final String s = "LI Yonghong";
|
||||
|
@ -246,4 +253,4 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,56 +1,57 @@
|
|||
|
||||
package eu.dnetlib.pace.common;
|
||||
|
||||
import org.junit.jupiter.api.*;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import org.junit.jupiter.api.*;
|
||||
|
||||
public class PaceFunctionTest extends AbstractPaceFunctions {
|
||||
|
||||
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
|
||||
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
|
||||
|
||||
@Test
|
||||
public void normalizePidTest(){
|
||||
@Test
|
||||
public void normalizePidTest() {
|
||||
|
||||
assertEquals("identifier", normalizePid("IdentifIer"));
|
||||
assertEquals("10.1109/tns.2015.2493347", normalizePid("10.1109/TNS.2015.2493347"));
|
||||
assertEquals("10.0001/testdoi", normalizePid("http://dx.doi.org/10.0001/testDOI"));
|
||||
assertEquals("10.0001/testdoi", normalizePid("https://dx.doi.org/10.0001/testDOI"));
|
||||
}
|
||||
assertEquals("identifier", normalizePid("IdentifIer"));
|
||||
assertEquals("10.1109/tns.2015.2493347", normalizePid("10.1109/TNS.2015.2493347"));
|
||||
assertEquals("10.0001/testdoi", normalizePid("http://dx.doi.org/10.0001/testDOI"));
|
||||
assertEquals("10.0001/testdoi", normalizePid("https://dx.doi.org/10.0001/testDOI"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void filterAllStopwordsTest(){
|
||||
@Test
|
||||
public void filterAllStopwordsTest() {
|
||||
|
||||
assertEquals("universita politecnica marche", filterAllStopWords("universita politecnica delle marche"));
|
||||
}
|
||||
assertEquals("universita politecnica marche", filterAllStopWords("universita politecnica delle marche"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void normalizeTest() {
|
||||
assertEquals("universitat", normalize("Universität"));
|
||||
@Test
|
||||
public void normalizeTest() {
|
||||
assertEquals("universitat", normalize("Universität"));
|
||||
|
||||
System.out.println(normalize("İstanbul Ticarət Universiteti"));
|
||||
}
|
||||
System.out.println(normalize("İstanbul Ticarət Universiteti"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void cleanupTest() {
|
||||
assertEquals("istanbul ticaret universiteti", cleanup("İstanbul Ticarət Universiteti"));
|
||||
@Test
|
||||
public void cleanupTest() {
|
||||
assertEquals("istanbul ticaret universiteti", cleanup("İstanbul Ticarət Universiteti"));
|
||||
|
||||
System.out.println("cleaned up : " + cleanup(TEST_STRING));
|
||||
}
|
||||
|
||||
System.out.println("cleaned up : " + cleanup(TEST_STRING));
|
||||
}
|
||||
@Test
|
||||
public void testGetNumbers() {
|
||||
System.out.println("Numbers : " + getNumbers(TEST_STRING));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetNumbers() {
|
||||
System.out.println("Numbers : " + getNumbers(TEST_STRING));
|
||||
}
|
||||
@Test
|
||||
public void testRemoveSymbols() {
|
||||
System.out.println("Without symbols: " + removeSymbols(TEST_STRING));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRemoveSymbols() {
|
||||
System.out.println("Without symbols: " + removeSymbols(TEST_STRING));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFixAliases() {
|
||||
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
|
||||
}
|
||||
@Test
|
||||
public void testFixAliases() {
|
||||
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,16 +1,18 @@
|
|||
|
||||
package eu.dnetlib.pace.comparators;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.TestInstance;
|
||||
|
||||
import eu.dnetlib.pace.AbstractPaceTest;
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.tree.*;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.TestInstance;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||
public class ComparatorTest extends AbstractPaceTest {
|
||||
|
@ -26,7 +28,8 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
params.put("name_th", "0.95");
|
||||
params.put("jpath_value", "$.value");
|
||||
params.put("jpath_classid", "$.qualifier.classid");
|
||||
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
|
||||
conf = DedupConfig
|
||||
.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -39,32 +42,38 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
public void cityMatchTest() {
|
||||
final CityMatch cityMatch = new CityMatch(params);
|
||||
|
||||
//both names with no cities
|
||||
// both names with no cities
|
||||
assertEquals(1.0, cityMatch.distance("Università", "Centro di ricerca", conf));
|
||||
|
||||
//one of the two names with no cities
|
||||
// one of the two names with no cities
|
||||
assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf));
|
||||
|
||||
//both names with cities (same)
|
||||
// both names with cities (same)
|
||||
assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf));
|
||||
|
||||
//both names with cities (different)
|
||||
// both names with cities (different)
|
||||
assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf));
|
||||
assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf));
|
||||
|
||||
//particular cases
|
||||
// particular cases
|
||||
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
|
||||
assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf));
|
||||
assertEquals(
|
||||
1.0,
|
||||
cityMatch
|
||||
.distance(
|
||||
"Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology",
|
||||
conf));
|
||||
assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void keywordMatchTest(){
|
||||
public void keywordMatchTest() {
|
||||
params.put("threshold", "0.5");
|
||||
|
||||
final KeywordMatch keywordMatch = new KeywordMatch(params);
|
||||
|
||||
assertEquals(0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
|
||||
assertEquals(
|
||||
0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
|
||||
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
|
||||
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
|
||||
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
|
||||
|
@ -77,7 +86,7 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void listContainsMatchTest(){
|
||||
public void listContainsMatchTest() {
|
||||
|
||||
List<String> a = createFieldList(Arrays.asList("Article", "Publication", "ORP"), "instanceType");
|
||||
List<String> b = createFieldList(Arrays.asList("Publication", "Article", "ORP"), "instanceType");
|
||||
|
@ -100,7 +109,7 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void stringContainsMatchTest(){
|
||||
public void stringContainsMatchTest() {
|
||||
|
||||
params.put("string", "openorgs");
|
||||
params.put("bool", "XOR");
|
||||
|
@ -120,7 +129,7 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void numbersMatchTest(){
|
||||
public void numbersMatchTest() {
|
||||
final NumbersMatch numbersMatch = new NumbersMatch(params);
|
||||
|
||||
assertEquals(0.0, numbersMatch.distance("University of Rennes 2", "Universita di Rennes 7", conf));
|
||||
|
@ -128,7 +137,7 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void romansMatchTest(){
|
||||
public void romansMatchTest() {
|
||||
|
||||
final RomansMatch romansMatch = new RomansMatch(params);
|
||||
|
||||
|
@ -142,8 +151,9 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
|
||||
double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf);
|
||||
System.out.println("result = " + result);
|
||||
double result = jaroWinklerNormalizedName
|
||||
.distance("AT&T (United States)", "United States Military Academy", conf);
|
||||
System.out.println("result = " + result);
|
||||
|
||||
result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
|
||||
System.out.println("result = " + result);
|
||||
|
@ -171,7 +181,11 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
|
||||
final LevensteinTitle levensteinTitle = new LevensteinTitle(params);
|
||||
|
||||
double result = levensteinTitle.distance("Degradation of lignin β‐aryl ether units in Arabidopsis thaliana expressing LigD, LigF and LigG from Sphingomonas paucimobilis SYK‐6", "Degradation of lignin β-aryl ether units in <i>Arabidopsis thaliana</i> expressing <i>LigD</i>, <i>LigF</i> and <i>LigG</i> from <i>Sphingomonas paucimobilis</i> SYK-6", conf);
|
||||
double result = levensteinTitle
|
||||
.distance(
|
||||
"Degradation of lignin β‐aryl ether units in Arabidopsis thaliana expressing LigD, LigF and LigG from Sphingomonas paucimobilis SYK‐6",
|
||||
"Degradation of lignin β-aryl ether units in <i>Arabidopsis thaliana</i> expressing <i>LigD</i>, <i>LigF</i> and <i>LigG</i> from <i>Sphingomonas paucimobilis</i> SYK-6",
|
||||
conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
}
|
||||
|
@ -195,13 +209,16 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
|
||||
assertEquals(1.0, result);
|
||||
|
||||
List<String> c = createFieldList(Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType");
|
||||
List<String> c = createFieldList(
|
||||
Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType");
|
||||
result = instanceTypeMatch.compare(c, b, conf);
|
||||
|
||||
assertEquals(1.0, result);
|
||||
|
||||
List<String> d = createFieldList(Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType");
|
||||
List<String> e = createFieldList(Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType");
|
||||
List<String> d = createFieldList(
|
||||
Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType");
|
||||
List<String> e = createFieldList(
|
||||
Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType");
|
||||
result = instanceTypeMatch.compare(d, e, conf);
|
||||
|
||||
assertEquals(1.0, result);
|
||||
|
@ -222,7 +239,8 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
|
||||
AuthorsMatch authorsMatch = new AuthorsMatch(params);
|
||||
|
||||
List<String> a = createFieldList(Arrays.asList("La Bruzzo, Sandro", "Atzori, Claudio", "De Bonis, Michele"), "authors");
|
||||
List<String> a = createFieldList(
|
||||
Arrays.asList("La Bruzzo, Sandro", "Atzori, Claudio", "De Bonis, Michele"), "authors");
|
||||
List<String> b = createFieldList(Arrays.asList("Atzori, C.", "La Bruzzo, S.", "De Bonis, M."), "authors");
|
||||
double result = authorsMatch.compare(a, b, conf);
|
||||
|
||||
|
@ -232,7 +250,7 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
List<String> d = createFieldList(Arrays.asList("Manghi, Pasquale"), "authors");
|
||||
result = authorsMatch.compare(c, d, conf);
|
||||
|
||||
assertEquals(0.0, result) ;
|
||||
assertEquals(0.0, result);
|
||||
|
||||
params.put("mode", "surname");
|
||||
authorsMatch = new AuthorsMatch(params);
|
||||
|
@ -246,7 +264,7 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
assertEquals(0.25, result);
|
||||
|
||||
List<String> f = createFieldList(new ArrayList<>(), "authors");
|
||||
result = authorsMatch.compare(f,f, conf);
|
||||
result = authorsMatch.compare(f, f, conf);
|
||||
System.out.println("result = " + result);
|
||||
|
||||
}
|
||||
|
@ -256,8 +274,19 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
|
||||
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
||||
|
||||
List<String> a = createFieldList(Arrays.asList("{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"), "authors");
|
||||
List<String> b = createFieldList(Arrays.asList("{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"), "authors");
|
||||
List<String> a = createFieldList(
|
||||
Arrays
|
||||
.asList(
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"),
|
||||
"authors");
|
||||
List<String> b = createFieldList(
|
||||
Arrays
|
||||
.asList(
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}",
|
||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"),
|
||||
"authors");
|
||||
|
||||
double result = jsonListMatch.compare(a, b, conf);
|
||||
|
||||
|
@ -287,13 +316,16 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
|
||||
CosineSimilarity cosineSimilarity = new CosineSimilarity(params);
|
||||
|
||||
double[] a = new double[]{1,2,3};
|
||||
double[] b = new double[]{1,2,3};
|
||||
double[] a = new double[] {
|
||||
1, 2, 3
|
||||
};
|
||||
double[] b = new double[] {
|
||||
1, 2, 3
|
||||
};
|
||||
|
||||
double compare = cosineSimilarity.compare(a, b, conf);
|
||||
|
||||
System.out.println("compare = " + compare);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -1,17 +1,17 @@
|
|||
|
||||
package eu.dnetlib.pace.config;
|
||||
|
||||
|
||||
import eu.dnetlib.pace.AbstractPaceTest;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.pace.AbstractPaceTest;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
|
||||
public class ConfigTest extends AbstractPaceTest {
|
||||
|
||||
|
@ -56,7 +56,7 @@ public class ConfigTest extends AbstractPaceTest {
|
|||
|
||||
System.out.println("translationMap = " + translationMap.size());
|
||||
|
||||
for (String key: translationMap.keySet()) {
|
||||
for (String key : translationMap.keySet()) {
|
||||
if (translationMap.get(key).equals("key::1"))
|
||||
System.out.println("key = " + key);
|
||||
}
|
||||
|
@ -70,13 +70,13 @@ public class ConfigTest extends AbstractPaceTest {
|
|||
assertEquals(0, load.getPace().translationMap().keySet().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJPath() {
|
||||
final String json = readFromClasspath("organization.json");
|
||||
@Test
|
||||
public void testJPath() {
|
||||
final String json = readFromClasspath("organization.json");
|
||||
|
||||
final String jpath ="$.id";
|
||||
final String jpath = "$.id";
|
||||
|
||||
System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json));
|
||||
}
|
||||
System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,40 +1,43 @@
|
|||
|
||||
package eu.dnetlib.pace.util;
|
||||
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||
import org.junit.jupiter.api.*;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import org.junit.jupiter.api.*;
|
||||
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||
|
||||
public class UtilTest {
|
||||
|
||||
static Map<String, String> params;
|
||||
static Map<String, String> params;
|
||||
|
||||
@BeforeAll
|
||||
public static void setUp(){
|
||||
params = new HashMap<>();
|
||||
}
|
||||
@BeforeAll
|
||||
public static void setUp() {
|
||||
params = new HashMap<>();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Ignore
|
||||
public void paceResolverTest() {
|
||||
PaceResolver paceResolver = new PaceResolver();
|
||||
paceResolver.getComparator("keywordMatch", params);
|
||||
}
|
||||
@Test
|
||||
@Ignore
|
||||
public void paceResolverTest() {
|
||||
PaceResolver paceResolver = new PaceResolver();
|
||||
paceResolver.getComparator("keywordMatch", params);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void personTest() {
|
||||
Person p = new Person("j. f. kennedy", false);
|
||||
@Test
|
||||
public void personTest() {
|
||||
Person p = new Person("j. f. kennedy", false);
|
||||
|
||||
assertEquals("kennedy", p.getSurnameString());
|
||||
assertEquals("j f", p.getNameString());
|
||||
assertEquals("kennedy", p.getSurnameString());
|
||||
assertEquals("j f", p.getNameString());
|
||||
|
||||
p = new Person("Guan-Hua Du", false);
|
||||
p = new Person("Guan-Hua Du", false);
|
||||
|
||||
System.out.println("surname = " + p.getSurnameString());
|
||||
System.out.println("name = " + p.getNameString());
|
||||
}
|
||||
System.out.println("surname = " + p.getSurnameString());
|
||||
System.out.println("name = " + p.getNameString());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,16 +1,18 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.model.SparkDedupConfig;
|
||||
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
||||
|
||||
public class TrustUtils {
|
||||
|
||||
|
@ -18,13 +20,18 @@ public class TrustUtils {
|
|||
|
||||
private static DedupConfig dedupConfig;
|
||||
|
||||
private static SparkDedupConfig sparkDedupConfig;
|
||||
|
||||
private static final ObjectMapper mapper;
|
||||
|
||||
static {
|
||||
final ObjectMapper mapper = new ObjectMapper();
|
||||
mapper = new ObjectMapper();
|
||||
try {
|
||||
dedupConfig = mapper
|
||||
.readValue(
|
||||
DedupConfig.class.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"),
|
||||
DedupConfig.class);
|
||||
sparkDedupConfig = new SparkDedupConfig(dedupConfig, 1);
|
||||
} catch (final IOException e) {
|
||||
log.error("Error loading dedupConfig, e");
|
||||
}
|
||||
|
@ -40,11 +47,8 @@ public class TrustUtils {
|
|||
}
|
||||
|
||||
try {
|
||||
final ObjectMapper objectMapper = new ObjectMapper();
|
||||
final Row doc1 = MapDocumentUtil
|
||||
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1));
|
||||
final Row doc2 = MapDocumentUtil
|
||||
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2));
|
||||
final Row doc1 = sparkDedupConfig.rowFromJson().apply(mapper.writeValueAsString(r1));
|
||||
final Row doc2 = sparkDedupConfig.rowFromJson().apply(mapper.writeValueAsString(r2));
|
||||
|
||||
final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
|
||||
|
||||
|
|
|
@ -53,13 +53,17 @@
|
|||
</dependencyManagement>
|
||||
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>io.opentelemetry</groupId>
|
||||
<artifactId>opentelemetry-api</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>io.opentelemetry</groupId>
|
||||
<artifactId>opentelemetry-sdk</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-pace-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
@ -83,31 +87,21 @@
|
|||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.arakelian</groupId>
|
||||
<artifactId>java-jq</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>dom4j</groupId>
|
||||
<artifactId>dom4j</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>jaxen</groupId>
|
||||
<artifactId>jaxen</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-pace-core</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-graphx_2.11</artifactId>
|
||||
|
@ -141,12 +135,7 @@
|
|||
<version>1.4.200</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-catalyst_2.11</artifactId>
|
||||
<version>2.4.0.cloudera2</version>
|
||||
<scope>compile</scope>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
||||
|
|
|
@ -3,29 +3,20 @@ package eu.dnetlib.dhp.oa.dedup
|
|||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.oa.dedup.dsl.{Clustering, Deduper}
|
||||
import eu.dnetlib.dhp.oa.dedup.model.BlockStats
|
||||
import eu.dnetlib.dhp.oa.dedup.model.SparkDedupConfig
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService
|
||||
import eu.dnetlib.pace.config.DedupConfig
|
||||
import eu.dnetlib.pace.model.RowDataOrderingComparator
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.{ISLookUpException, ISLookUpService}
|
||||
import eu.dnetlib.pace.model.{RowDataOrderingComparator, SparkDedupConfig}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.api.java.JavaSparkContext
|
||||
import org.apache.spark.sql._
|
||||
import org.apache.spark.sql.catalyst.expressions.Literal
|
||||
import org.apache.spark.sql.types.DataTypes
|
||||
import org.dom4j.DocumentException
|
||||
import org.slf4j.Logger
|
||||
import org.slf4j.LoggerFactory
|
||||
import org.xml.sax.SAXException
|
||||
|
||||
import java.io.IOException
|
||||
import java.util
|
||||
import java.util.Optional
|
||||
import java.util.stream.Collectors
|
||||
import scala.collection.Seq
|
||||
|
||||
object DSLExample {
|
||||
private val log = LoggerFactory.getLogger(classOf[DSLExample])
|
||||
|
@ -64,15 +55,15 @@ class DSLExample(parser: ArgumentApplicationParser, spark: SparkSession) extends
|
|||
DSLExample.log.info("isLookUpUrl: '{}'", isLookUpUrl)
|
||||
DSLExample.log.info("actionSetId: '{}'", actionSetId)
|
||||
DSLExample.log.info("workingPath: '{}'", workingPath)
|
||||
// for each dedup configuration
|
||||
// for each dedup configuration
|
||||
import scala.collection.JavaConversions._
|
||||
for (dedupConf <- getConfigurations(isLookUpService, actionSetId).subList(0, 1)) {
|
||||
val subEntity = dedupConf.getWf.getSubEntityValue
|
||||
DSLExample.log.info("Creating blockstats for: '{}'", subEntity)
|
||||
val outputPath = DedupUtility.createBlockStatsPath(workingPath, actionSetId, subEntity)
|
||||
AbstractSparkAction.removeOutputDir(spark, outputPath)
|
||||
val sc = JavaSparkContext.fromSparkContext(spark.sparkContext)
|
||||
val sparkConfig = new SparkDedupConfig(dedupConf, numPartitions)
|
||||
|
||||
val sparkConfig = SparkDedupConfig(dedupConf, numPartitions)
|
||||
|
||||
val inputDF = spark.read
|
||||
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
|
||||
|
@ -87,8 +78,7 @@ class DSLExample(parser: ArgumentApplicationParser, spark: SparkSession) extends
|
|||
Clustering("suffixprefix", Seq("legalname"), Map("max" -> 1, "len" -> 3)),
|
||||
Clustering("urlclustering", Seq("websiteurl")),
|
||||
Clustering("keywordsclustering", Seq("fields"), Map("max" -> 2, "windowSize" -> 4))
|
||||
);
|
||||
|
||||
)
|
||||
|
||||
simRels
|
||||
.map[BlockStats](
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue