forked from D-Net/dnet-hadoop
Merged the latest changes from the master branch into tree2. Added the configuration as a parameter of the comparator, so that the comparator can access it.
This commit is contained in:
commit
30a873265f
|
@ -6,7 +6,7 @@
|
||||||
<parent>
|
<parent>
|
||||||
<groupId>eu.dnetlib</groupId>
|
<groupId>eu.dnetlib</groupId>
|
||||||
<artifactId>dnet-dedup</artifactId>
|
<artifactId>dnet-dedup</artifactId>
|
||||||
<version>3.0.14-SNAPSHOT</version>
|
<version>3.0.15-SNAPSHOT</version>
|
||||||
<relativePath>../pom.xml</relativePath>
|
<relativePath>../pom.xml</relativePath>
|
||||||
</parent>
|
</parent>
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
|
@ -18,15 +19,15 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected abstract Collection<String> doApply(String s);
|
protected abstract Collection<String> doApply(Config conf, String s);
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(List<Field> fields) {
|
public Collection<String> apply(Config conf, List<Field> fields) {
|
||||||
return fields.stream().filter(f -> !f.isEmpty())
|
return fields.stream().filter(f -> !f.isEmpty())
|
||||||
.map(Field::stringValue)
|
.map(Field::stringValue)
|
||||||
.map(this::normalize)
|
.map(this::normalize)
|
||||||
.map(s -> filterAllStopWords(s))
|
.map(s -> filterAllStopWords(s))
|
||||||
.map(this::doApply)
|
.map(s -> doApply(conf, s))
|
||||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||||
.flatMap(c -> c.stream())
|
.flatMap(c -> c.stream())
|
||||||
.filter(StringUtils::isNotBlank)
|
.filter(StringUtils::isNotBlank)
|
||||||
|
|
|
@ -6,6 +6,7 @@ import java.util.Set;
|
||||||
import java.util.StringTokenizer;
|
import java.util.StringTokenizer;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
@ClusteringClass("acronyms")
|
@ClusteringClass("acronyms")
|
||||||
public class Acronyms extends AbstractClusteringFunction {
|
public class Acronyms extends AbstractClusteringFunction {
|
||||||
|
@ -15,7 +16,7 @@ public class Acronyms extends AbstractClusteringFunction {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(Config conf, String s) {
|
||||||
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
|
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -13,15 +13,15 @@ import eu.dnetlib.pace.model.Field;
|
||||||
public class ClusteringCombiner {
|
public class ClusteringCombiner {
|
||||||
|
|
||||||
public static Collection<String> combine(final Document a, final Config conf) {
|
public static Collection<String> combine(final Document a, final Config conf) {
|
||||||
return new ClusteringCombiner().doCombine(a, conf.clusterings());
|
return new ClusteringCombiner().doCombine(a, conf);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Collection<String> doCombine(final Document a, final List<ClusteringDef> defs) {
|
private Collection<String> doCombine(final Document a, final Config conf) {
|
||||||
final Collection<String> res = Sets.newLinkedHashSet();
|
final Collection<String> res = Sets.newLinkedHashSet();
|
||||||
for (final ClusteringDef cd : defs) {
|
for (final ClusteringDef cd : conf.clusterings()) {
|
||||||
for (final String fieldName : cd.getFields()) {
|
for (final String fieldName : cd.getFields()) {
|
||||||
final Field values = a.values(fieldName);
|
final Field values = a.values(fieldName);
|
||||||
res.addAll(cd.clusteringFunction().apply((List<Field>) values));
|
res.addAll(cd.clusteringFunction().apply(conf, (List<Field>) values));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
|
|
|
@ -4,11 +4,12 @@ import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
|
||||||
public interface ClusteringFunction {
|
public interface ClusteringFunction {
|
||||||
|
|
||||||
public Collection<String> apply(List<Field> fields);
|
public Collection<String> apply(Config config, List<Field> fields);
|
||||||
|
|
||||||
public Map<String, Integer> getParams();
|
public Map<String, Integer> getParams();
|
||||||
|
|
||||||
|
|
|
@ -5,6 +5,7 @@ import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
@ClusteringClass("immutablefieldvalue")
|
@ClusteringClass("immutablefieldvalue")
|
||||||
public class ImmutableFieldValue extends AbstractClusteringFunction {
|
public class ImmutableFieldValue extends AbstractClusteringFunction {
|
||||||
|
@ -14,7 +15,7 @@ public class ImmutableFieldValue extends AbstractClusteringFunction {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(final String s) {
|
protected Collection<String> doApply(final Config conf, final String s) {
|
||||||
final List<String> res = Lists.newArrayList();
|
final List<String> res = Lists.newArrayList();
|
||||||
|
|
||||||
res.add(s);
|
res.add(s);
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
|
@ -15,16 +16,16 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(final Config conf, String s) {
|
||||||
|
|
||||||
//takes city codes and keywords codes without duplicates
|
//takes city codes and keywords codes without duplicates
|
||||||
Set<String> keywords = getKeywords(s, params.getOrDefault("windowSize", 4));
|
Set<String> keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
|
||||||
Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4));
|
Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4));
|
||||||
|
|
||||||
//list of combination to return as result
|
//list of combination to return as result
|
||||||
final Collection<String> combinations = new LinkedHashSet<String>();
|
final Collection<String> combinations = new LinkedHashSet<String>();
|
||||||
|
|
||||||
for (String keyword: keywordsToCodes(keywords)){
|
for (String keyword: keywordsToCodes(keywords, conf.translationMap())){
|
||||||
for (String city: citiesToCodes(cities)) {
|
for (String city: citiesToCodes(cities)) {
|
||||||
combinations.add(keyword+"-"+city);
|
combinations.add(keyword+"-"+city);
|
||||||
if (combinations.size()>=params.getOrDefault("max", 2)) {
|
if (combinations.size()>=params.getOrDefault("max", 2)) {
|
||||||
|
@ -37,13 +38,13 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(List<Field> fields) {
|
public Collection<String> apply(final Config conf, List<Field> fields) {
|
||||||
return fields.stream().filter(f -> !f.isEmpty())
|
return fields.stream().filter(f -> !f.isEmpty())
|
||||||
.map(Field::stringValue)
|
.map(Field::stringValue)
|
||||||
.map(this::cleanup) //TODO can I add this to the AbstractClusteringFunction without overriding the method here?
|
.map(this::cleanup) //TODO can I add this to the AbstractClusteringFunction without overriding the method here?
|
||||||
.map(this::normalize)
|
.map(this::normalize)
|
||||||
.map(s -> filterAllStopWords(s))
|
.map(s -> filterAllStopWords(s))
|
||||||
.map(this::doApply)
|
.map(s -> doApply(conf, s))
|
||||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||||
.flatMap(c -> c.stream())
|
.flatMap(c -> c.stream())
|
||||||
.filter(StringUtils::isNotBlank)
|
.filter(StringUtils::isNotBlank)
|
||||||
|
|
|
@ -6,6 +6,7 @@ import java.util.Map;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
|
@ -17,16 +18,16 @@ public class LowercaseClustering extends AbstractClusteringFunction {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(List<Field> fields) {
|
public Collection<String> apply(Config conf, List<Field> fields) {
|
||||||
Collection<String> c = Sets.newLinkedHashSet();
|
Collection<String> c = Sets.newLinkedHashSet();
|
||||||
for(Field f : fields) {
|
for(Field f : fields) {
|
||||||
c.addAll(doApply(f.stringValue()));
|
c.addAll(doApply(conf, f.stringValue()));
|
||||||
}
|
}
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(final String s) {
|
protected Collection<String> doApply(final Config conf, final String s) {
|
||||||
if(StringUtils.isBlank(s)) {
|
if(StringUtils.isBlank(s)) {
|
||||||
return Lists.newArrayList();
|
return Lists.newArrayList();
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,6 +6,7 @@ import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
@ClusteringClass("ngrampairs")
|
@ClusteringClass("ngrampairs")
|
||||||
public class NgramPairs extends Ngrams {
|
public class NgramPairs extends Ngrams {
|
||||||
|
@ -15,7 +16,7 @@ public class NgramPairs extends Ngrams {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(Config conf, String s) {
|
||||||
return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max"));
|
return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
@ClusteringClass("ngrams")
|
@ClusteringClass("ngrams")
|
||||||
|
@ -10,7 +12,7 @@ public class Ngrams extends AbstractClusteringFunction {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(Config conf, String s) {
|
||||||
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
|
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.Person;
|
import eu.dnetlib.pace.model.Person;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
@ -23,7 +24,7 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(final List<Field> fields) {
|
public Collection<String> apply(final Config conf, final List<Field> fields) {
|
||||||
final Set<String> hashes = Sets.newHashSet();
|
final Set<String> hashes = Sets.newHashSet();
|
||||||
|
|
||||||
for (final Field f : fields) {
|
for (final Field f : fields) {
|
||||||
|
|
|
@ -6,6 +6,7 @@ import java.util.Map;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Person;
|
import eu.dnetlib.pace.model.Person;
|
||||||
|
|
||||||
@ClusteringClass("personhash")
|
@ClusteringClass("personhash")
|
||||||
|
@ -18,7 +19,7 @@ public class PersonHash extends AbstractClusteringFunction {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(final String s) {
|
protected Collection<String> doApply(final Config conf, final String s) {
|
||||||
final List<String> res = Lists.newArrayList();
|
final List<String> res = Lists.newArrayList();
|
||||||
|
|
||||||
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
|
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
@ -10,7 +12,7 @@ public class RandomClusteringFunction extends AbstractClusteringFunction {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(final Config conf, String s) {
|
||||||
// TODO Auto-generated method stub
|
// TODO Auto-generated method stub
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,6 +5,7 @@ import java.util.*;
|
||||||
import com.google.common.base.Joiner;
|
import com.google.common.base.Joiner;
|
||||||
import com.google.common.base.Splitter;
|
import com.google.common.base.Splitter;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
@ClusteringClass("sortedngrampairs")
|
@ClusteringClass("sortedngrampairs")
|
||||||
public class SortedNgramPairs extends NgramPairs {
|
public class SortedNgramPairs extends NgramPairs {
|
||||||
|
@ -14,7 +15,7 @@ public class SortedNgramPairs extends NgramPairs {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(Config conf, String s) {
|
||||||
|
|
||||||
final List<String> tokens = Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().trimResults().split(s));
|
final List<String> tokens = Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().trimResults().split(s));
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,7 @@ import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import org.apache.commons.lang.RandomStringUtils;
|
import org.apache.commons.lang.RandomStringUtils;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
|
@ -17,7 +18,7 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(final String s) {
|
protected Collection<String> doApply(final Config conf, final String s) {
|
||||||
final List<String> res = Lists.newArrayList();
|
final List<String> res = Lists.newArrayList();
|
||||||
|
|
||||||
res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));
|
res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));
|
||||||
|
|
|
@ -5,6 +5,7 @@ import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
@ClusteringClass("suffixprefix")
|
@ClusteringClass("suffixprefix")
|
||||||
public class SuffixPrefix extends AbstractClusteringFunction {
|
public class SuffixPrefix extends AbstractClusteringFunction {
|
||||||
|
@ -14,7 +15,7 @@ public class SuffixPrefix extends AbstractClusteringFunction {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(Config conf, String s) {
|
||||||
return suffixPrefix(s, param("len"), param("max"));
|
return suffixPrefix(s, param("len"), param("max"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
|
||||||
import java.net.MalformedURLException;
|
import java.net.MalformedURLException;
|
||||||
|
@ -21,7 +22,7 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(List<Field> fields) {
|
public Collection<String> apply(final Config conf, List<Field> fields) {
|
||||||
try {
|
try {
|
||||||
return fields.stream()
|
return fields.stream()
|
||||||
.filter(f -> !f.isEmpty())
|
.filter(f -> !f.isEmpty())
|
||||||
|
|
|
@ -13,6 +13,8 @@ import org.apache.commons.collections.CollectionUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.StringWriter;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
@ -25,7 +27,6 @@ import java.util.stream.Collectors;
|
||||||
*/
|
*/
|
||||||
public abstract class AbstractPaceFunctions {
|
public abstract class AbstractPaceFunctions {
|
||||||
|
|
||||||
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
|
||||||
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||||
|
|
||||||
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||||
|
@ -238,10 +239,10 @@ public abstract class AbstractPaceFunctions {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public double keywordsCompare(Set<String> s1, Set<String> s2){
|
public double keywordsCompare(Set<String> s1, Set<String> s2, Map<String, String> translationMap){
|
||||||
|
|
||||||
Set<String> k1 = keywordsToCodes(s1);
|
Set<String> k1 = keywordsToCodes(s1, translationMap);
|
||||||
Set<String> k2 = keywordsToCodes(s2);
|
Set<String> k2 = keywordsToCodes(s2, translationMap);
|
||||||
|
|
||||||
int longer = (k1.size()>k2.size())?k1.size():k2.size();
|
int longer = (k1.size()>k2.size())?k1.size():k2.size();
|
||||||
|
|
||||||
|
@ -273,7 +274,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
|
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
|
||||||
}
|
}
|
||||||
|
|
||||||
public Set<String> keywordsToCodes(Set<String> keywords) {
|
public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||||
return toCodes(keywords, translationMap);
|
return toCodes(keywords, translationMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -324,12 +325,17 @@ public abstract class AbstractPaceFunctions {
|
||||||
return codes;
|
return codes;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Set<String> getKeywords(String s1, int windowSize) {
|
|
||||||
return getKeywords(s1, translationMap, windowSize);
|
|
||||||
}
|
|
||||||
|
|
||||||
public Set<String> getCities(String s1, int windowSize) {
|
public Set<String> getCities(String s1, int windowSize) {
|
||||||
return getKeywords(s1, cityMap, windowSize);
|
return getKeywords(s1, cityMap, windowSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
|
||||||
|
final StringWriter sw = new StringWriter();
|
||||||
|
try {
|
||||||
|
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
|
||||||
|
return sw.toString();
|
||||||
|
} catch (final IOException e) {
|
||||||
|
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -49,4 +49,6 @@ public interface Config {
|
||||||
*/
|
*/
|
||||||
public Map<String, List<String>> blacklists();
|
public Map<String, List<String>> blacklists();
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Translation map of this configuration.
 *
 * @return a string-to-string map.
 *         NOTE(review): the PaceConfig implementation builds it from the "synonyms" section
 *         (normalized term -> synonym key); confirm the same contract for other implementations.
 */
public Map<String, String> translationMap();
|
||||||
}
|
}
|
||||||
|
|
|
@ -55,6 +55,7 @@ public class DedupConfig implements Config, Serializable {
|
||||||
try {
|
try {
|
||||||
config = new ObjectMapper().readValue(json, DedupConfig.class);
|
config = new ObjectMapper().readValue(json, DedupConfig.class);
|
||||||
config.getPace().initModel();
|
config.getPace().initModel();
|
||||||
|
config.getPace().initTranslationMap();
|
||||||
return config;
|
return config;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
throw new PaceException("Error in parsing configuration json", e);
|
throw new PaceException("Error in parsing configuration json", e);
|
||||||
|
@ -139,4 +140,9 @@ public class DedupConfig implements Config, Serializable {
|
||||||
return getPace().getBlacklists();
|
return getPace().getBlacklists();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<String, String> translationMap() {
|
||||||
|
return getPace().translationMap();
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package eu.dnetlib.pace.config;
|
package eu.dnetlib.pace.config;
|
||||||
|
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
|
|
||||||
import eu.dnetlib.pace.model.ClusteringDef;
|
import eu.dnetlib.pace.model.ClusteringDef;
|
||||||
import eu.dnetlib.pace.model.FieldDef;
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||||
|
@ -8,6 +9,7 @@ import eu.dnetlib.pace.util.PaceResolver;
|
||||||
import org.codehaus.jackson.annotate.JsonIgnore;
|
import org.codehaus.jackson.annotate.JsonIgnore;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.text.Normalizer;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
@ -19,6 +21,10 @@ public class PaceConfig implements Serializable {
|
||||||
private Map<String, TreeNodeDef> decisionTree;
|
private Map<String, TreeNodeDef> decisionTree;
|
||||||
|
|
||||||
private Map<String, List<String>> blacklists;
|
private Map<String, List<String>> blacklists;
|
||||||
|
private Map<String, List<String>> synonyms;
|
||||||
|
|
||||||
|
@JsonIgnore
|
||||||
|
private Map<String, String> translationMap;
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
private Map<String, FieldDef> modelMap;
|
private Map<String, FieldDef> modelMap;
|
||||||
|
@ -30,11 +36,26 @@ public class PaceConfig implements Serializable {
|
||||||
|
|
||||||
public void initModel() {
|
public void initModel() {
|
||||||
modelMap = Maps.newHashMap();
|
modelMap = Maps.newHashMap();
|
||||||
for(FieldDef fd : getModel()) {
|
for (FieldDef fd : getModel()) {
|
||||||
modelMap.put(fd.getName(), fd);
|
modelMap.put(fd.getName(), fd);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void initTranslationMap(){
|
||||||
|
translationMap = Maps.newHashMap();
|
||||||
|
for (String key : synonyms.keySet()) {
|
||||||
|
for (String term : synonyms.get(key)){
|
||||||
|
translationMap.put(
|
||||||
|
Normalizer.normalize(term.toLowerCase(), Normalizer.Form.NFD),
|
||||||
|
key);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<String, String> translationMap(){
|
||||||
|
return translationMap;
|
||||||
|
}
|
||||||
|
|
||||||
public List<FieldDef> getModel() {
|
public List<FieldDef> getModel() {
|
||||||
return model;
|
return model;
|
||||||
}
|
}
|
||||||
|
@ -67,6 +88,14 @@ public class PaceConfig implements Serializable {
|
||||||
this.blacklists = blacklists;
|
this.blacklists = blacklists;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Map<String, List<String>> getSynonyms() {
|
||||||
|
return synonyms;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSynonyms(Map<String, List<String>> synonyms) {
|
||||||
|
this.synonyms = synonyms;
|
||||||
|
}
|
||||||
|
|
||||||
public Map<String, FieldDef> getModelMap() {
|
public Map<String, FieldDef> getModelMap() {
|
||||||
return modelMap;
|
return modelMap;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
@ -23,7 +24,7 @@ public class AlwaysMatch extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b) {
|
public double compare(final Field a, final Field b, final Config conf) {
|
||||||
return 1.0;
|
return 1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
|
@ -22,7 +23,7 @@ public class ExactMatch extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double distance(final String a, final String b) {
|
public double distance(final String a, final String b, final Config conf) {
|
||||||
return a.equals(b) ? 1.0 : 0;
|
return a.equals(b) ? 1.0 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
@ -14,7 +15,7 @@ public class ExactMatchIgnoreCase extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(Field a, Field b) {
|
public double compare(Field a, Field b, final Config conf) {
|
||||||
|
|
||||||
final String fa = getValue(a);
|
final String fa = getValue(a);
|
||||||
final String fb = getValue(b);
|
final String fb = getValue(b);
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
|
@ -23,7 +24,7 @@ public class JaroWinkler extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double distance(String a, String b) {
|
public double distance(String a, String b, final Config conf) {
|
||||||
String ca = cleanup(a);
|
String ca = cleanup(a);
|
||||||
String cb = cleanup(b);
|
String cb = cleanup(b);
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,8 @@ import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
@ -26,7 +28,7 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double distance(String a, String b) {
|
public double distance(String a, String b, final Config conf) {
|
||||||
String ca = cleanup(a);
|
String ca = cleanup(a);
|
||||||
String cb = cleanup(b);
|
String cb = cleanup(b);
|
||||||
|
|
||||||
|
@ -36,15 +38,15 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
|
||||||
ca = filterAllStopWords(ca);
|
ca = filterAllStopWords(ca);
|
||||||
cb = filterAllStopWords(cb);
|
cb = filterAllStopWords(cb);
|
||||||
|
|
||||||
Set<String> keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue());
|
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());
|
||||||
Set<String> keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue());
|
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());
|
||||||
|
|
||||||
Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
|
Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
|
||||||
Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
|
Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
|
||||||
|
|
||||||
if (sameCity(cities1,cities2)) {
|
if (sameCity(cities1,cities2)) {
|
||||||
|
|
||||||
if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) {
|
if (keywordsCompare(keywords1, keywords2, conf.translationMap())>params.getOrDefault("threshold", 0.5).doubleValue()) {
|
||||||
|
|
||||||
ca = removeKeywords(ca, keywords1);
|
ca = removeKeywords(ca, keywords1);
|
||||||
ca = removeKeywords(ca, cities1);
|
ca = removeKeywords(ca, cities1);
|
||||||
|
|
|
@ -4,6 +4,8 @@ import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||||
|
@ -23,7 +25,7 @@ public class JaroWinklerTitle extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double distance(String a, String b) {
|
public double distance(String a, String b, final Config conf) {
|
||||||
String ca = cleanup(a);
|
String ca = cleanup(a);
|
||||||
String cb = cleanup(b);
|
String cb = cleanup(b);
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
@ -22,7 +23,7 @@ public class Level2JaroWinklerTitle extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double distance(final String a, final String b) {
|
public double distance(final String a, final String b, final Config conf) {
|
||||||
final String ca = cleanup(a);
|
final String ca = cleanup(a);
|
||||||
final String cb = cleanup(b);
|
final String cb = cleanup(b);
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,8 @@ package eu.dnetlib.pace.tree;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
|
@ -26,7 +28,7 @@ public class LevensteinTitle extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double distance(final String a, final String b) {
|
public double distance(final String a, final String b, final Config conf) {
|
||||||
final String ca = cleanup(a);
|
final String ca = cleanup(a);
|
||||||
final String cb = cleanup(b);
|
final String cb = cleanup(b);
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,8 @@ package eu.dnetlib.pace.tree;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
@ -25,7 +27,7 @@ public class LevensteinTitleIgnoreVersion extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double distance(final String a, final String b) {
|
public double distance(final String a, final String b, final Config conf) {
|
||||||
String ca = cleanup(a);
|
String ca = cleanup(a);
|
||||||
String cb = cleanup(b);
|
String cb = cleanup(b);
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
@ -22,7 +23,7 @@ public class MustBeDifferent extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double distance(final String a, final String b) {
|
public double distance(final String a, final String b, final Config conf) {
|
||||||
return !a.equals(b) ? 1.0 : 0;
|
return !a.equals(b) ? 1.0 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.tree.support.Comparator;
|
import eu.dnetlib.pace.tree.support.Comparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
@ -17,7 +18,7 @@ public class NullDistanceAlgo implements Comparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(Field a, Field b) {
|
public double compare(Field a, Field b, Config config) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.FieldList;
|
import eu.dnetlib.pace.model.FieldList;
|
||||||
import eu.dnetlib.pace.model.adaptor.Pid;
|
import eu.dnetlib.pace.model.adaptor.Pid;
|
||||||
|
@ -27,7 +28,7 @@ public class PidMatch extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b) {
|
public double compare(final Field a, final Field b, final Config conf) {
|
||||||
|
|
||||||
final List<String> sa = ((FieldList) a).stringList();
|
final List<String> sa = ((FieldList) a).stringList();
|
||||||
final List<String> sb = ((FieldList) b).stringList();
|
final List<String> sb = ((FieldList) b).stringList();
|
||||||
|
|
|
@ -5,6 +5,7 @@ import java.util.Map;
|
||||||
|
|
||||||
import com.google.common.collect.Iterables;
|
import com.google.common.collect.Iterables;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
@ -28,7 +29,7 @@ public class SizeMatch extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b) {
|
public double compare(final Field a, final Field b, final Config conf) {
|
||||||
|
|
||||||
if (a.isEmpty() || b.isEmpty())
|
if (a.isEmpty() || b.isEmpty())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.config.Type;
|
import eu.dnetlib.pace.config.Type;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
@ -67,9 +70,9 @@ public class SubStringLevenstein extends AbstractComparator {
|
||||||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b) {
|
public double distance(final Field a, final Field b, final Config conf) {
|
||||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
|
if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
|
||||||
return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit));
|
return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit), conf);
|
||||||
|
|
||||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
@ -21,7 +22,7 @@ public class TitleVersionMatch extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b) {
|
public double compare(final Field a, final Field b, final Config conf) {
|
||||||
final String valueA = getFirstValue(a);
|
final String valueA = getFirstValue(a);
|
||||||
final String valueB = getFirstValue(b);
|
final String valueB = getFirstValue(b);
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
@ -28,8 +29,7 @@ public class UrlMatcher extends Levenstein {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(Field a, Field b) {
|
public double distance(Field a, Field b, final Config conf) {
|
||||||
|
|
||||||
final URL urlA = asUrl(getFirstValue(a));
|
final URL urlA = asUrl(getFirstValue(a));
|
||||||
final URL urlB = asUrl(getFirstValue(b));
|
final URL urlB = asUrl(getFirstValue(b));
|
||||||
|
|
||||||
|
@ -44,7 +44,7 @@ public class UrlMatcher extends Levenstein {
|
||||||
return hostW * 0.5;
|
return hostW * 0.5;
|
||||||
}
|
}
|
||||||
|
|
||||||
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath());
|
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf);
|
||||||
}
|
}
|
||||||
|
|
||||||
private URL asUrl(final String value) {
|
private URL asUrl(final String value) {
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
@ -22,7 +23,7 @@ public class YearMatch extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b) {
|
public double compare(final Field a, final Field b, final Config conf) {
|
||||||
final String valueA = getNumbers(getFirstValue(a));
|
final String valueA = getNumbers(getFirstValue(a));
|
||||||
final String valueB = getNumbers(getFirstValue(b));
|
final String valueB = getNumbers(getFirstValue(b));
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.tree.support;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.config.Type;
|
import eu.dnetlib.pace.config.Type;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.FieldList;
|
import eu.dnetlib.pace.model.FieldList;
|
||||||
|
@ -66,7 +67,7 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
|
||||||
* the b
|
* the b
|
||||||
* @return the double
|
* @return the double
|
||||||
*/
|
*/
|
||||||
public double distance(final String a, final String b) {
|
public double distance(final String a, final String b, final Config conf) {
|
||||||
|
|
||||||
if (a.isEmpty() || b.isEmpty()) {
|
if (a.isEmpty() || b.isEmpty()) {
|
||||||
return -1; //return -1 if a field is missing
|
return -1; //return -1 if a field is missing
|
||||||
|
@ -84,16 +85,23 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
|
||||||
* the b
|
* the b
|
||||||
* @return the double
|
* @return the double
|
||||||
*/
|
*/
|
||||||
protected double distance(final List<String> a, final List<String> b) {
|
protected double distance(final List<String> a, final List<String> b, final Config conf) {
|
||||||
return distance(concat(a), concat(b));
|
return distance(concat(a), concat(b), conf);
|
||||||
|
}
|
||||||
|
|
||||||
|
public double distance(final Field a, final Field b, final Config conf) {
|
||||||
|
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf);
|
||||||
|
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf);
|
||||||
|
|
||||||
|
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b) {
|
public double compare(final Field a, final Field b, final Config conf) {
|
||||||
if (a.isEmpty() || b.isEmpty())
|
if (a.isEmpty() || b.isEmpty())
|
||||||
return -1;
|
return -1;
|
||||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue());
|
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf);
|
||||||
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b));
|
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf);
|
||||||
|
|
||||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
package eu.dnetlib.pace.tree.support;
|
package eu.dnetlib.pace.tree.support;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
|
||||||
public interface Comparator {
|
public interface Comparator {
|
||||||
|
@ -8,6 +9,6 @@ public interface Comparator {
|
||||||
* return : -1 -> can't decide (missing field)
|
* return : -1 -> can't decide (missing field)
|
||||||
* >0 -> similarity degree (depends on the algorithm)
|
* >0 -> similarity degree (depends on the algorithm)
|
||||||
* */
|
* */
|
||||||
public double compare(Field a, Field b);
|
public double compare(Field a, Field b, Config conf);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
package eu.dnetlib.pace.tree.support;
|
package eu.dnetlib.pace.tree.support;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.config.PaceConfig;
|
import eu.dnetlib.pace.config.PaceConfig;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
import eu.dnetlib.pace.model.MapDocument;
|
||||||
import eu.dnetlib.pace.util.PaceException;
|
import eu.dnetlib.pace.util.PaceException;
|
||||||
|
import org.codehaus.jackson.annotate.JsonIgnore;
|
||||||
import org.codehaus.jackson.map.ObjectMapper;
|
import org.codehaus.jackson.map.ObjectMapper;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -35,7 +37,7 @@ public class TreeNodeDef implements Serializable {
|
||||||
public TreeNodeDef() {
|
public TreeNodeDef() {
|
||||||
}
|
}
|
||||||
|
|
||||||
public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2) {
|
public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) {
|
||||||
|
|
||||||
TreeNodeStats stats = new TreeNodeStats();
|
TreeNodeStats stats = new TreeNodeStats();
|
||||||
stats.setFieldsCount(fields.size());
|
stats.setFieldsCount(fields.size());
|
||||||
|
@ -44,7 +46,7 @@ public class TreeNodeDef implements Serializable {
|
||||||
|
|
||||||
double weight = fieldConf.getWeight();
|
double weight = fieldConf.getWeight();
|
||||||
|
|
||||||
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()));
|
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||||
|
|
||||||
if (result == -1) { //if the field is missing
|
if (result == -1) { //if the field is missing
|
||||||
stats.incrementMissCount();
|
stats.incrementMissCount();
|
||||||
|
|
|
@ -38,7 +38,7 @@ public class TreeProcessor {
|
||||||
if (currentNode == null)
|
if (currentNode == null)
|
||||||
throw new PaceException("The Tree Node doesn't exist: " + current);
|
throw new PaceException("The Tree Node doesn't exist: " + current);
|
||||||
|
|
||||||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2);
|
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||||
|
|
||||||
if (!currentNode.isIgnoreMissing() && stats.getMissCount()>0) {
|
if (!currentNode.isIgnoreMissing() && stats.getMissCount()>0) {
|
||||||
current = currentNode.getUndefined();
|
current = currentNode.getUndefined();
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti
|
key::1;university;universita;universita studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti
|
||||||
key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
|
key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
|
||||||
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza
|
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza
|
||||||
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
|
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
|
||||||
|
@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο
|
||||||
key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
|
key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
|
||||||
key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
|
key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
|
||||||
key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
|
key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
|
||||||
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology
|
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;universidad politecnica;universitat politecnica;politechnika;politechniki;university technology;university science technology
|
||||||
key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
|
key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
|
||||||
key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
|
key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
|
||||||
key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
|
key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
|
||||||
|
|
|
|
@ -1,22 +1,24 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
import eu.dnetlib.pace.AbstractPaceTest;
|
import eu.dnetlib.pace.AbstractPaceTest;
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
public class ClusteringFunctionTest extends AbstractPaceTest {
|
public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
private Map<String, Integer> params;
|
private Map<String, Integer> params;
|
||||||
|
DedupConfig conf;
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void setUp() throws Exception {
|
public void setUp() throws Exception {
|
||||||
params = Maps.newHashMap();
|
params = Maps.newHashMap();
|
||||||
|
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", ClusteringFunctionTest.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -26,7 +28,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s = "http://www.test.it/path/to/resource";
|
final String s = "http://www.test.it/path/to/resource";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(urlClustering.apply(Lists.newArrayList(url(s))));
|
System.out.println(urlClustering.apply(conf, Lists.newArrayList(url(s))));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -40,7 +42,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s = "Search for the Standard Model Higgs Boson";
|
final String s = "Search for the Standard Model Higgs Boson";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(ngram.apply(Lists.newArrayList(title(s))));
|
System.out.println(ngram.apply(conf, Lists.newArrayList(title(s))));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -52,7 +54,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s = "Search for the Standard Model Higgs Boson";
|
final String s = "Search for the Standard Model Higgs Boson";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(np.apply(Lists.newArrayList(title(s))));
|
System.out.println(np.apply(conf, Lists.newArrayList(title(s))));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -64,11 +66,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s1 = "University of Pisa";
|
final String s1 = "University of Pisa";
|
||||||
System.out.println(s1);
|
System.out.println(s1);
|
||||||
System.out.println(np.apply(Lists.newArrayList(title(s1))));
|
System.out.println(np.apply(conf, Lists.newArrayList(title(s1))));
|
||||||
|
|
||||||
final String s2 = "Pisa University";
|
final String s2 = "Pisa University";
|
||||||
System.out.println(s2);
|
System.out.println(s2);
|
||||||
System.out.println(np.apply(Lists.newArrayList(title(s2))));
|
System.out.println(np.apply(conf, Lists.newArrayList(title(s2))));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -81,7 +83,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s = "Search for the Standard Model Higgs Boson";
|
final String s = "Search for the Standard Model Higgs Boson";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(acro.apply(Lists.newArrayList(title(s))));
|
System.out.println(acro.apply(conf, Lists.newArrayList(title(s))));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -93,7 +95,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s = "Search for the Standard Model Higgs Boson";
|
final String s = "Search for the Standard Model Higgs Boson";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(sp.apply(Lists.newArrayList(title(s))));
|
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -105,7 +107,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s = "Search for the Standard Model Higgs Boson";
|
final String s = "Search for the Standard Model Higgs Boson";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(sp.apply(Lists.newArrayList(title(s))));
|
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -114,7 +116,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s = readFromClasspath("gt.author.json");
|
final String s = readFromClasspath("gt.author.json");
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(cf.apply(Lists.newArrayList(person(s))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(person(s))));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -123,27 +125,27 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
final ClusteringFunction cf = new KeywordsClustering(params);
|
final ClusteringFunction cf = new KeywordsClustering(params);
|
||||||
final String s = "Polytechnic University of Turin";
|
final String s = "Polytechnic University of Turin";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(cf.apply(Lists.newArrayList(title(s))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
|
||||||
|
|
||||||
final String s1 = "POLITECNICO DI TORINO";
|
final String s1 = "POLITECNICO DI TORINO";
|
||||||
System.out.println(s1);
|
System.out.println(s1);
|
||||||
System.out.println(cf.apply(Lists.newArrayList(title(s1))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
|
||||||
|
|
||||||
final String s2 = "Universita farmaceutica culturale di milano bergamo";
|
final String s2 = "Universita farmaceutica culturale di milano bergamo";
|
||||||
System.out.println("s2 = " + s2);
|
System.out.println("s2 = " + s2);
|
||||||
System.out.println(cf.apply(Lists.newArrayList(title(s2))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(title(s2))));
|
||||||
|
|
||||||
final String s3 = "universita universita milano milano";
|
final String s3 = "universita universita milano milano";
|
||||||
System.out.println("s3 = " + s3);
|
System.out.println("s3 = " + s3);
|
||||||
System.out.println(cf.apply(Lists.newArrayList(title(s3))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(title(s3))));
|
||||||
|
|
||||||
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
|
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
|
||||||
System.out.println("s4 = " + s4);
|
System.out.println("s4 = " + s4);
|
||||||
System.out.println(cf.apply(Lists.newArrayList(title(s4))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(title(s4))));
|
||||||
|
|
||||||
final String s5 = "İstanbul Ticarət Universiteti";
|
final String s5 = "İstanbul Ticarət Universiteti";
|
||||||
System.out.println("s5 = " + s5);
|
System.out.println("s5 = " + s5);
|
||||||
System.out.println(cf.apply(Lists.newArrayList(title(s5))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(title(s5))));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.comparators;
|
||||||
|
|
||||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||||
import eu.dnetlib.pace.tree.JaroWinklerNormalizedName;
|
import eu.dnetlib.pace.tree.JaroWinklerNormalizedName;
|
||||||
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
@ -17,13 +18,13 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||||
|
|
||||||
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
|
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
|
||||||
private Map<String, Number> params;
|
private Map<String, Number> params;
|
||||||
|
private DedupConfig conf;
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void setup() {
|
public void setup() {
|
||||||
System.out.println("****************************************************************");
|
|
||||||
System.out.println("Test String : " + TEST_STRING);
|
|
||||||
params = new HashMap<>();
|
params = new HashMap<>();
|
||||||
params.put("weight", 1.0);
|
params.put("weight", 1.0);
|
||||||
|
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", DistanceAlgoTest.class));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -55,7 +56,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||||
@Test
|
@Test
|
||||||
public void testJaroWinklerNormalizedName() {
|
public void testJaroWinklerNormalizedName() {
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State");
|
double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State", conf);
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
assertEquals(0.0, result);
|
assertEquals(0.0, result);
|
||||||
|
@ -65,49 +66,49 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||||
public void testJaroWinklerNormalizedName2() {
|
public void testJaroWinklerNormalizedName2() {
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York");
|
double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York", conf);
|
||||||
|
|
||||||
assertEquals(result, 1.0);
|
assertEquals(1.0, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testJaroWinklerNormalizedName3() {
|
public void testJaroWinklerNormalizedName3() {
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna");
|
double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf);
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
assertEquals(result, 0.0);
|
assertEquals(0.0, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testJaroWinklerNormalizedName4() {
|
public void testJaroWinklerNormalizedName4() {
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa");
|
double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa", conf);
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
assertEquals(result, 1.0);
|
assertEquals(1.0, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testJaroWinklerNormalizedName5() {
|
public void testJaroWinklerNormalizedName5() {
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS");
|
double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS", conf);
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
assertEquals(result, 1.0);
|
assertEquals(1.0, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testJaroWinklerNormalizedName6() {
|
public void testJaroWinklerNormalizedName6() {
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung");
|
double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf);
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
assertTrue(result> 0.9);
|
assertTrue(result > 0.9);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -115,17 +116,17 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||||
public void testJaroWinklerNormalizedName7() {
|
public void testJaroWinklerNormalizedName7() {
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO");
|
double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf);
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
assertTrue(result> 0.9);
|
assertTrue(result > 0.9);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testJaroWinklerNormalizedName8() {
|
public void testJaroWinklerNormalizedName8() {
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
|
|
||||||
double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology");
|
double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf);
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
}
|
}
|
||||||
|
@ -134,7 +135,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||||
public void testJaroWinklerNormalizedName9() {
|
public void testJaroWinklerNormalizedName9() {
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
|
|
||||||
double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti");
|
double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf);
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
}
|
}
|
||||||
|
@ -144,7 +145,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||||
|
|
||||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||||
|
|
||||||
double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence");
|
double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf);
|
||||||
|
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,12 +5,13 @@ import org.junit.Test;
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertNotNull;
|
import static org.junit.Assert.assertNotNull;
|
||||||
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
public class ConfigTest extends AbstractPaceTest {
|
public class ConfigTest extends AbstractPaceTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void dedupConfigSerializationTest() {
|
public void dedupConfigSerializationTest() {
|
||||||
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("result.pace.conf.json"));
|
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("org.curr.conf"));
|
||||||
|
|
||||||
final String conf = cfgFromClasspath.toString();
|
final String conf = cfgFromClasspath.toString();
|
||||||
|
|
||||||
|
@ -37,4 +38,20 @@ public class ConfigTest extends AbstractPaceTest {
|
||||||
System.out.println(load.toString());
|
System.out.println(load.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void translationMapTest() {
|
||||||
|
|
||||||
|
DedupConfig load = DedupConfig.load(readFromClasspath("org.curr.conf"));
|
||||||
|
|
||||||
|
System.out.println("translationMap = " + load.getPace().translationMap().toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void emptyTranslationMapTest() {
|
||||||
|
|
||||||
|
DedupConfig load = DedupConfig.load(readFromClasspath("org.test.conf"));
|
||||||
|
|
||||||
|
assertEquals(0, load.getPace().translationMap().keySet().size());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,32 +5,152 @@
|
||||||
"entityType" : "organization",
|
"entityType" : "organization",
|
||||||
"orderField" : "legalname",
|
"orderField" : "legalname",
|
||||||
"queueMaxSize" : "2000",
|
"queueMaxSize" : "2000",
|
||||||
"groupMaxSize" : "10",
|
"groupMaxSize" : "50",
|
||||||
"slidingWindowSize" : "200",
|
"slidingWindowSize" : "200",
|
||||||
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||||
"includeChildren" : "true"
|
"includeChildren" : "true"
|
||||||
},
|
},
|
||||||
"pace" : {
|
"pace" : {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
|
<<<<<<< HEAD
|
||||||
{ "name" : "sortedngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
{ "name" : "sortedngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||||
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||||
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
|
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
|
||||||
|
=======
|
||||||
|
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||||
|
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||||
|
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
||||||
|
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
||||||
|
>>>>>>> origin/master
|
||||||
],
|
],
|
||||||
"sufficientConditions" : [
|
"sufficientConditions" : [
|
||||||
{ "name" : "exactMatch", "fieldsCount" : [ "gridid" ] }
|
{ "name" : "exactMatch", "fieldsCount" : [ "gridid" ] }
|
||||||
],
|
],
|
||||||
|
<<<<<<< HEAD
|
||||||
"necessaryConditions" : [
|
"necessaryConditions" : [
|
||||||
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] },
|
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] },
|
||||||
{ "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] }
|
{ "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] }
|
||||||
|
=======
|
||||||
|
"conditions" : [
|
||||||
|
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
|
||||||
|
{ "name" : "exactMatch", "fields" : [ "country" ] }
|
||||||
|
>>>>>>> origin/master
|
||||||
],
|
],
|
||||||
"model" : [
|
"model" : [
|
||||||
{ "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
|
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
|
||||||
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
|
|
||||||
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
|
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
|
||||||
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} },
|
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
|
||||||
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
|
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
|
||||||
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
|
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
|
||||||
],
|
],
|
||||||
"blacklists" : { }
|
"blacklists" : {
|
||||||
|
"legalname" : []
|
||||||
|
},
|
||||||
|
"synonyms": {
|
||||||
|
"key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
|
||||||
|
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
|
||||||
|
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
|
||||||
|
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
|
||||||
|
"key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
|
||||||
|
"key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
|
||||||
|
"key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"],
|
||||||
|
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
|
||||||
|
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
|
||||||
|
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
|
||||||
|
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
|
||||||
|
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
|
||||||
|
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
|
||||||
|
"key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],
|
||||||
|
"key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],
|
||||||
|
"key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],
|
||||||
|
"key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],
|
||||||
|
"key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],
|
||||||
|
"key::19": ["museum","museo","musée","museo","museu","museum","muzeum","музей","museum","μουσείο"],
|
||||||
|
"key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],
|
||||||
|
"key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],
|
||||||
|
"key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],
|
||||||
|
"key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],
|
||||||
|
"key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],
|
||||||
|
"key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],
|
||||||
|
"key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","центральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],
|
||||||
|
"key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],
|
||||||
|
"key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],
|
||||||
|
"key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],
|
||||||
|
"key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],
|
||||||
|
"key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
|
||||||
|
"key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"],
|
||||||
|
"key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],
|
||||||
|
"key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],
|
||||||
|
"key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],
|
||||||
|
"key::36": ["authority","autorità","autorité","авторитет","autoriteit"],
|
||||||
|
"key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],
|
||||||
|
"key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],
|
||||||
|
"key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"],
|
||||||
|
"key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],
|
||||||
|
"key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],
|
||||||
|
"key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],
|
||||||
|
"key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],
|
||||||
|
"key::44": ["academic","accademico","académique","universitaire","академический","academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],
|
||||||
|
"key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],
|
||||||
|
"key::46": ["division","divisione","division","отделение","divisie","τμήμα"],
|
||||||
|
"key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"],
|
||||||
|
"key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
|
||||||
|
"key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
|
||||||
|
"key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
|
||||||
|
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"],
|
||||||
|
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"],
|
||||||
|
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"],
|
||||||
|
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"],
|
||||||
|
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"],
|
||||||
|
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"],
|
||||||
|
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"],
|
||||||
|
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"],
|
||||||
|
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"],
|
||||||
|
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"],
|
||||||
|
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"],
|
||||||
|
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"],
|
||||||
|
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"],
|
||||||
|
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"],
|
||||||
|
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"],
|
||||||
|
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"],
|
||||||
|
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"],
|
||||||
|
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"],
|
||||||
|
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"],
|
||||||
|
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"],
|
||||||
|
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"],
|
||||||
|
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"],
|
||||||
|
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"],
|
||||||
|
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"],
|
||||||
|
"key::75": ["theological","teologia","teologico","teológico","teológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","teológiai","teološki","teoloogia","usuteadus","teoloogiline"],
|
||||||
|
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"],
|
||||||
|
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"],
|
||||||
|
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"],
|
||||||
|
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"],
|
||||||
|
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"],
|
||||||
|
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"],
|
||||||
|
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"],
|
||||||
|
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"],
|
||||||
|
"key::84": ["communication","comunicazione","comunicación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"],
|
||||||
|
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"],
|
||||||
|
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"],
|
||||||
|
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δερματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"],
|
||||||
|
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"],
|
||||||
|
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"],
|
||||||
|
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"],
|
||||||
|
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"],
|
||||||
|
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"],
|
||||||
|
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"],
|
||||||
|
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"],
|
||||||
|
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"],
|
||||||
|
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria"],
|
||||||
|
"key::97": ["psychology","psicologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia"],
|
||||||
|
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"],
|
||||||
|
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"],
|
||||||
|
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"],
|
||||||
|
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"],
|
||||||
|
"key::102": ["informatics","informatica","informática","informática","informatica"],
|
||||||
|
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
|
||||||
|
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1,40 @@
|
||||||
|
{
|
||||||
|
"wf" : {
|
||||||
|
"threshold" : "0.9",
|
||||||
|
"dedupRun" : "001",
|
||||||
|
"entityType" : "organization",
|
||||||
|
"orderField" : "legalname",
|
||||||
|
"queueMaxSize" : "2000",
|
||||||
|
"groupMaxSize" : "50",
|
||||||
|
"slidingWindowSize" : "200",
|
||||||
|
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||||
|
"includeChildren" : "true"
|
||||||
|
},
|
||||||
|
"pace" : {
|
||||||
|
"clustering" : [
|
||||||
|
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||||
|
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||||
|
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
||||||
|
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
||||||
|
],
|
||||||
|
"strictConditions" : [
|
||||||
|
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
|
||||||
|
],
|
||||||
|
"conditions" : [
|
||||||
|
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
|
||||||
|
{ "name" : "exactMatch", "fields" : [ "country" ] }
|
||||||
|
],
|
||||||
|
"model" : [
|
||||||
|
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
|
||||||
|
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
|
||||||
|
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
|
||||||
|
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
|
||||||
|
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
|
||||||
|
],
|
||||||
|
"blacklists" : {
|
||||||
|
"legalname" : []
|
||||||
|
},
|
||||||
|
"synonyms": {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,11 +0,0 @@
|
||||||
#release configuration
|
|
||||||
#Mon Jul 08 10:03:15 CEST 2019
|
|
||||||
scm.tagNameFormat=@{project.artifactId}-@{project.version}
|
|
||||||
pushChanges=true
|
|
||||||
scm.url=scm\:git\:https\://github.com/dnet-team/dnet-dedup.git
|
|
||||||
preparationGoals=clean verify
|
|
||||||
projectVersionPolicyId=default
|
|
||||||
remoteTagging=true
|
|
||||||
scm.commentPrefix=[maven-release-plugin]
|
|
||||||
exec.snapshotReleasePluginAllowed=false
|
|
||||||
completedPhase=create-backup-poms
|
|
Loading…
Reference in New Issue