Merge the latest changes from the master branch into tree2. Add the configuration as a parameter of the comparators (and of the clustering functions) so that they can access it.

This commit is contained in:
miconis 2019-10-29 16:38:42 +01:00
commit 30a873265f
49 changed files with 388 additions and 126 deletions

View File

@ -6,7 +6,7 @@
<parent> <parent>
<groupId>eu.dnetlib</groupId> <groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId> <artifactId>dnet-dedup</artifactId>
<version>3.0.14-SNAPSHOT</version> <version>3.0.15-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath> <relativePath>../pom.xml</relativePath>
</parent> </parent>

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
@ -18,15 +19,15 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
this.params = params; this.params = params;
} }
protected abstract Collection<String> doApply(String s); protected abstract Collection<String> doApply(Config conf, String s);
@Override @Override
public Collection<String> apply(List<Field> fields) { public Collection<String> apply(Config conf, List<Field> fields) {
return fields.stream().filter(f -> !f.isEmpty()) return fields.stream().filter(f -> !f.isEmpty())
.map(Field::stringValue) .map(Field::stringValue)
.map(this::normalize) .map(this::normalize)
.map(s -> filterAllStopWords(s)) .map(s -> filterAllStopWords(s))
.map(this::doApply) .map(s -> doApply(conf, s))
.map(c -> filterBlacklisted(c, ngramBlacklist)) .map(c -> filterBlacklisted(c, ngramBlacklist))
.flatMap(c -> c.stream()) .flatMap(c -> c.stream())
.filter(StringUtils::isNotBlank) .filter(StringUtils::isNotBlank)

View File

@ -6,6 +6,7 @@ import java.util.Set;
import java.util.StringTokenizer; import java.util.StringTokenizer;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("acronyms") @ClusteringClass("acronyms")
public class Acronyms extends AbstractClusteringFunction { public class Acronyms extends AbstractClusteringFunction {
@ -15,7 +16,7 @@ public class Acronyms extends AbstractClusteringFunction {
} }
@Override @Override
protected Collection<String> doApply(String s) { protected Collection<String> doApply(Config conf, String s) {
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen")); return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
} }

View File

@ -13,15 +13,15 @@ import eu.dnetlib.pace.model.Field;
public class ClusteringCombiner { public class ClusteringCombiner {
public static Collection<String> combine(final Document a, final Config conf) { public static Collection<String> combine(final Document a, final Config conf) {
return new ClusteringCombiner().doCombine(a, conf.clusterings()); return new ClusteringCombiner().doCombine(a, conf);
} }
private Collection<String> doCombine(final Document a, final List<ClusteringDef> defs) { private Collection<String> doCombine(final Document a, final Config conf) {
final Collection<String> res = Sets.newLinkedHashSet(); final Collection<String> res = Sets.newLinkedHashSet();
for (final ClusteringDef cd : defs) { for (final ClusteringDef cd : conf.clusterings()) {
for (final String fieldName : cd.getFields()) { for (final String fieldName : cd.getFields()) {
final Field values = a.values(fieldName); final Field values = a.values(fieldName);
res.addAll(cd.clusteringFunction().apply((List<Field>) values)); res.addAll(cd.clusteringFunction().apply(conf, (List<Field>) values));
} }
} }
return res; return res;

View File

@ -4,11 +4,12 @@ import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
public interface ClusteringFunction { public interface ClusteringFunction {
public Collection<String> apply(List<Field> fields); public Collection<String> apply(Config config, List<Field> fields);
public Map<String, Integer> getParams(); public Map<String, Integer> getParams();

View File

@ -5,6 +5,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("immutablefieldvalue") @ClusteringClass("immutablefieldvalue")
public class ImmutableFieldValue extends AbstractClusteringFunction { public class ImmutableFieldValue extends AbstractClusteringFunction {
@ -14,7 +15,7 @@ public class ImmutableFieldValue extends AbstractClusteringFunction {
} }
@Override @Override
protected Collection<String> doApply(final String s) { protected Collection<String> doApply(final Config conf, final String s) {
final List<String> res = Lists.newArrayList(); final List<String> res = Lists.newArrayList();
res.add(s); res.add(s);

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
@ -15,16 +16,16 @@ public class KeywordsClustering extends AbstractClusteringFunction {
} }
@Override @Override
protected Collection<String> doApply(String s) { protected Collection<String> doApply(final Config conf, String s) {
//takes city codes and keywords codes without duplicates //takes city codes and keywords codes without duplicates
Set<String> keywords = getKeywords(s, params.getOrDefault("windowSize", 4)); Set<String> keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4)); Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4));
//list of combination to return as result //list of combination to return as result
final Collection<String> combinations = new LinkedHashSet<String>(); final Collection<String> combinations = new LinkedHashSet<String>();
for (String keyword: keywordsToCodes(keywords)){ for (String keyword: keywordsToCodes(keywords, conf.translationMap())){
for (String city: citiesToCodes(cities)) { for (String city: citiesToCodes(cities)) {
combinations.add(keyword+"-"+city); combinations.add(keyword+"-"+city);
if (combinations.size()>=params.getOrDefault("max", 2)) { if (combinations.size()>=params.getOrDefault("max", 2)) {
@ -37,13 +38,13 @@ public class KeywordsClustering extends AbstractClusteringFunction {
} }
@Override @Override
public Collection<String> apply(List<Field> fields) { public Collection<String> apply(final Config conf, List<Field> fields) {
return fields.stream().filter(f -> !f.isEmpty()) return fields.stream().filter(f -> !f.isEmpty())
.map(Field::stringValue) .map(Field::stringValue)
.map(this::cleanup) //TODO can I add this to the AbstractClusteringFunction without overriding the method here? .map(this::cleanup) //TODO can I add this to the AbstractClusteringFunction without overriding the method here?
.map(this::normalize) .map(this::normalize)
.map(s -> filterAllStopWords(s)) .map(s -> filterAllStopWords(s))
.map(this::doApply) .map(s -> doApply(conf, s))
.map(c -> filterBlacklisted(c, ngramBlacklist)) .map(c -> filterBlacklisted(c, ngramBlacklist))
.flatMap(c -> c.stream()) .flatMap(c -> c.stream())
.filter(StringUtils::isNotBlank) .filter(StringUtils::isNotBlank)

View File

@ -6,6 +6,7 @@ import java.util.Map;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
@ -17,16 +18,16 @@ public class LowercaseClustering extends AbstractClusteringFunction {
} }
@Override @Override
public Collection<String> apply(List<Field> fields) { public Collection<String> apply(Config conf, List<Field> fields) {
Collection<String> c = Sets.newLinkedHashSet(); Collection<String> c = Sets.newLinkedHashSet();
for(Field f : fields) { for(Field f : fields) {
c.addAll(doApply(f.stringValue())); c.addAll(doApply(conf, f.stringValue()));
} }
return c; return c;
} }
@Override @Override
protected Collection<String> doApply(final String s) { protected Collection<String> doApply(final Config conf, final String s) {
if(StringUtils.isBlank(s)) { if(StringUtils.isBlank(s)) {
return Lists.newArrayList(); return Lists.newArrayList();
} }

View File

@ -6,6 +6,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("ngrampairs") @ClusteringClass("ngrampairs")
public class NgramPairs extends Ngrams { public class NgramPairs extends Ngrams {
@ -15,7 +16,7 @@ public class NgramPairs extends Ngrams {
} }
@Override @Override
protected Collection<String> doApply(String s) { protected Collection<String> doApply(Config conf, String s) {
return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max")); return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max"));
} }

View File

@ -1,5 +1,7 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config;
import java.util.*; import java.util.*;
@ClusteringClass("ngrams") @ClusteringClass("ngrams")
@ -10,7 +12,7 @@ public class Ngrams extends AbstractClusteringFunction {
} }
@Override @Override
protected Collection<String> doApply(String s) { protected Collection<String> doApply(Config conf, String s) {
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen")); return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
} }

View File

@ -2,6 +2,7 @@ package eu.dnetlib.pace.clustering;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.model.Person;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
@ -23,7 +24,7 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
} }
@Override @Override
public Collection<String> apply(final List<Field> fields) { public Collection<String> apply(final Config conf, final List<Field> fields) {
final Set<String> hashes = Sets.newHashSet(); final Set<String> hashes = Sets.newHashSet();
for (final Field f : fields) { for (final Field f : fields) {

View File

@ -6,6 +6,7 @@ import java.util.Map;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.model.Person;
@ClusteringClass("personhash") @ClusteringClass("personhash")
@ -18,7 +19,7 @@ public class PersonHash extends AbstractClusteringFunction {
} }
@Override @Override
protected Collection<String> doApply(final String s) { protected Collection<String> doApply(final Config conf, final String s) {
final List<String> res = Lists.newArrayList(); final List<String> res = Lists.newArrayList();
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE); final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);

View File

@ -1,5 +1,7 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config;
import java.util.Collection; import java.util.Collection;
import java.util.Map; import java.util.Map;
@ -10,7 +12,7 @@ public class RandomClusteringFunction extends AbstractClusteringFunction {
} }
@Override @Override
protected Collection<String> doApply(String s) { protected Collection<String> doApply(final Config conf, String s) {
// TODO Auto-generated method stub // TODO Auto-generated method stub
return null; return null;
} }

View File

@ -5,6 +5,7 @@ import java.util.*;
import com.google.common.base.Joiner; import com.google.common.base.Joiner;
import com.google.common.base.Splitter; import com.google.common.base.Splitter;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("sortedngrampairs") @ClusteringClass("sortedngrampairs")
public class SortedNgramPairs extends NgramPairs { public class SortedNgramPairs extends NgramPairs {
@ -14,7 +15,7 @@ public class SortedNgramPairs extends NgramPairs {
} }
@Override @Override
protected Collection<String> doApply(String s) { protected Collection<String> doApply(Config conf, String s) {
final List<String> tokens = Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().trimResults().split(s)); final List<String> tokens = Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().trimResults().split(s));

View File

@ -4,6 +4,7 @@ import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.lang.RandomStringUtils; import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
@ -17,7 +18,7 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
} }
@Override @Override
protected Collection<String> doApply(final String s) { protected Collection<String> doApply(final Config conf, final String s) {
final List<String> res = Lists.newArrayList(); final List<String> res = Lists.newArrayList();
res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", "")); res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));

View File

@ -5,6 +5,7 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("suffixprefix") @ClusteringClass("suffixprefix")
public class SuffixPrefix extends AbstractClusteringFunction { public class SuffixPrefix extends AbstractClusteringFunction {
@ -14,7 +15,7 @@ public class SuffixPrefix extends AbstractClusteringFunction {
} }
@Override @Override
protected Collection<String> doApply(String s) { protected Collection<String> doApply(Config conf, String s) {
return suffixPrefix(s, param("len"), param("max")); return suffixPrefix(s, param("len"), param("max"));
} }

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import java.net.MalformedURLException; import java.net.MalformedURLException;
@ -21,7 +22,7 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu
} }
@Override @Override
public Collection<String> apply(List<Field> fields) { public Collection<String> apply(final Config conf, List<Field> fields) {
try { try {
return fields.stream() return fields.stream()
.filter(f -> !f.isEmpty()) .filter(f -> !f.isEmpty())

View File

@ -13,6 +13,8 @@ import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import java.io.IOException;
import java.io.StringWriter;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -25,7 +27,6 @@ import java.util.stream.Collectors;
*/ */
public abstract class AbstractPaceFunctions { public abstract class AbstractPaceFunctions {
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
@ -238,10 +239,10 @@ public abstract class AbstractPaceFunctions {
} }
public double keywordsCompare(Set<String> s1, Set<String> s2){ public double keywordsCompare(Set<String> s1, Set<String> s2, Map<String, String> translationMap){
Set<String> k1 = keywordsToCodes(s1); Set<String> k1 = keywordsToCodes(s1, translationMap);
Set<String> k2 = keywordsToCodes(s2); Set<String> k2 = keywordsToCodes(s2, translationMap);
int longer = (k1.size()>k2.size())?k1.size():k2.size(); int longer = (k1.size()>k2.size())?k1.size():k2.size();
@ -273,7 +274,7 @@ public abstract class AbstractPaceFunctions {
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet()); return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
} }
public Set<String> keywordsToCodes(Set<String> keywords) { public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
return toCodes(keywords, translationMap); return toCodes(keywords, translationMap);
} }
@ -324,12 +325,17 @@ public abstract class AbstractPaceFunctions {
return codes; return codes;
} }
public Set<String> getKeywords(String s1, int windowSize) {
return getKeywords(s1, translationMap, windowSize);
}
public Set<String> getCities(String s1, int windowSize) { public Set<String> getCities(String s1, int windowSize) {
return getKeywords(s1, cityMap, windowSize); return getKeywords(s1, cityMap, windowSize);
} }
/**
 * Reads a classpath resource and returns its whole content as a string.
 *
 * @param filename name of the resource, resolved with {@code clazz.getResourceAsStream(filename)}
 * @param clazz    class used to locate the resource
 * @return the content of the resource
 * @throws RuntimeException if the resource does not exist or cannot be read; in the latter case
 *                          the original {@link IOException} is attached as the cause
 */
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
    final StringWriter sw = new StringWriter();
    // try-with-resources closes the stream on every path; a null resource is simply not closed.
    try (final java.io.InputStream in = clazz.getResourceAsStream(filename)) {
        if (in == null) {
            // getResourceAsStream returns null for a missing resource: report it explicitly
            // instead of letting the copy fail with an anonymous NullPointerException.
            throw new RuntimeException("cannot load resource from classpath: " + filename);
        }
        // NOTE(review): this overload decodes bytes with the platform default charset;
        // consider passing an explicit encoding if the resources are known to be UTF-8.
        IOUtils.copy(in, sw);
        return sw.toString();
    } catch (final IOException e) {
        // Keep the cause so the real I/O failure is visible in the stack trace.
        throw new RuntimeException("cannot load resource from classpath: " + filename, e);
    }
}
} }

View File

@ -49,4 +49,6 @@ public interface Config {
*/ */
public Map<String, List<String>> blacklists(); public Map<String, List<String>> blacklists();
/**
 * Translation map made available to clustering functions and comparators through the
 * configuration (for example, it is passed to {@code getKeywords}, {@code keywordsToCodes}
 * and {@code keywordsCompare}).
 *
 * @return the translation map
 */
public Map<String, String> translationMap();
} }

View File

@ -55,6 +55,7 @@ public class DedupConfig implements Config, Serializable {
try { try {
config = new ObjectMapper().readValue(json, DedupConfig.class); config = new ObjectMapper().readValue(json, DedupConfig.class);
config.getPace().initModel(); config.getPace().initModel();
config.getPace().initTranslationMap();
return config; return config;
} catch (IOException e) { } catch (IOException e) {
throw new PaceException("Error in parsing configuration json", e); throw new PaceException("Error in parsing configuration json", e);
@ -139,4 +140,9 @@ public class DedupConfig implements Config, Serializable {
return getPace().getBlacklists(); return getPace().getBlacklists();
} }
/**
 * Exposes the translation map held by the pace section of this configuration.
 *
 * @return whatever {@code getPace().translationMap()} returns
 */
@Override
public Map<String, String> translationMap() {
    return this.getPace().translationMap();
}
} }

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.config; package eu.dnetlib.pace.config;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef; import eu.dnetlib.pace.tree.support.TreeNodeDef;
@ -8,6 +9,7 @@ import eu.dnetlib.pace.util.PaceResolver;
import org.codehaus.jackson.annotate.JsonIgnore; import org.codehaus.jackson.annotate.JsonIgnore;
import java.io.Serializable; import java.io.Serializable;
import java.text.Normalizer;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -19,6 +21,10 @@ public class PaceConfig implements Serializable {
private Map<String, TreeNodeDef> decisionTree; private Map<String, TreeNodeDef> decisionTree;
private Map<String, List<String>> blacklists; private Map<String, List<String>> blacklists;
private Map<String, List<String>> synonyms;
@JsonIgnore
private Map<String, String> translationMap;
@JsonIgnore @JsonIgnore
private Map<String, FieldDef> modelMap; private Map<String, FieldDef> modelMap;
@ -30,11 +36,26 @@ public class PaceConfig implements Serializable {
public void initModel() { public void initModel() {
modelMap = Maps.newHashMap(); modelMap = Maps.newHashMap();
for(FieldDef fd : getModel()) { for (FieldDef fd : getModel()) {
modelMap.put(fd.getName(), fd); modelMap.put(fd.getName(), fd);
} }
} }
/**
 * Builds the translation map from the configured synonyms.
 *
 * Each term listed under a synonym key is lower-cased, NFD-normalized and mapped to that key.
 * When no synonyms have been set, or a key has no term list, the map is simply left without
 * the corresponding entries instead of failing with a NullPointerException, so that a
 * configuration that does not define synonyms can still be initialized.
 */
public void initTranslationMap() {
    translationMap = Maps.newHashMap();
    if (synonyms == null) {
        // No synonyms in the configuration: keep an empty (non-null) translation map.
        return;
    }
    for (Map.Entry<String, List<String>> entry : synonyms.entrySet()) {
        final List<String> terms = entry.getValue();
        if (terms == null) {
            continue;
        }
        for (String term : terms) {
            translationMap.put(
                    Normalizer.normalize(term.toLowerCase(), Normalizer.Form.NFD),
                    entry.getKey());
        }
    }
}
/**
 * Returns the translation map built by {@code initTranslationMap()}.
 *
 * @return the term-to-key map; presumably {@code null} until {@code initTranslationMap()}
 *         has been called (the field has no initializer in the visible code)
 */
public Map<String, String> translationMap() {
    return this.translationMap;
}
public List<FieldDef> getModel() { public List<FieldDef> getModel() {
return model; return model;
} }
@ -67,6 +88,14 @@ public class PaceConfig implements Serializable {
this.blacklists = blacklists; this.blacklists = blacklists;
} }
/**
 * Returns the configured synonyms.
 *
 * @return map from a key to the list of terms that {@code initTranslationMap()} maps to that key
 */
public Map<String, List<String>> getSynonyms() {
    return this.synonyms;
}
/**
 * Sets the configured synonyms.
 *
 * This only stores the given map: the translation map is not rebuilt here, so
 * {@code initTranslationMap()} must be called afterwards for the change to be reflected.
 *
 * @param synonyms map from a key to the list of terms to be translated to that key
 */
public void setSynonyms(Map<String, List<String>> synonyms) {
this.synonyms = synonyms;
}
public Map<String, FieldDef> getModelMap() { public Map<String, FieldDef> getModelMap() {
return modelMap; return modelMap;
} }

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
@ -23,7 +24,7 @@ public class AlwaysMatch extends AbstractComparator {
} }
@Override @Override
public double compare(final Field a, final Field b) { public double compare(final Field a, final Field b, final Config conf) {
return 1.0; return 1.0;
} }

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
@ -22,7 +23,7 @@ public class ExactMatch extends AbstractComparator {
} }
@Override @Override
public double distance(final String a, final String b) { public double distance(final String a, final String b, final Config conf) {
return a.equals(b) ? 1.0 : 0; return a.equals(b) ? 1.0 : 0;
} }

View File

@ -1,5 +1,6 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
@ -14,7 +15,7 @@ public class ExactMatchIgnoreCase extends AbstractComparator {
} }
@Override @Override
public double compare(Field a, Field b) { public double compare(Field a, Field b, final Config conf) {
final String fa = getValue(a); final String fa = getValue(a);
final String fb = getValue(b); final String fb = getValue(b);

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
@ -23,7 +24,7 @@ public class JaroWinkler extends AbstractComparator {
} }
@Override @Override
public double distance(String a, String b) { public double distance(String a, String b, final Config conf) {
String ca = cleanup(a); String ca = cleanup(a);
String cb = cleanup(b); String cb = cleanup(b);

View File

@ -4,6 +4,8 @@ import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.config.Config;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
@ -26,7 +28,7 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
} }
@Override @Override
public double distance(String a, String b) { public double distance(String a, String b, final Config conf) {
String ca = cleanup(a); String ca = cleanup(a);
String cb = cleanup(b); String cb = cleanup(b);
@ -36,15 +38,15 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
ca = filterAllStopWords(ca); ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb); cb = filterAllStopWords(cb);
Set<String> keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue()); Set<String> keywords1 = getKeywords(ca, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());
Set<String> keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue()); Set<String> keywords2 = getKeywords(cb, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());
Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue()); Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue()); Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
if (sameCity(cities1,cities2)) { if (sameCity(cities1,cities2)) {
if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) { if (keywordsCompare(keywords1, keywords2, conf.translationMap())>params.getOrDefault("threshold", 0.5).doubleValue()) {
ca = removeKeywords(ca, keywords1); ca = removeKeywords(ca, keywords1);
ca = removeKeywords(ca, cities1); ca = removeKeywords(ca, cities1);

View File

@ -4,6 +4,8 @@ import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.config.Config;
import java.util.Map; import java.util.Map;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
@ -23,7 +25,7 @@ public class JaroWinklerTitle extends AbstractComparator {
} }
@Override @Override
public double distance(String a, String b) { public double distance(String a, String b, final Config conf) {
String ca = cleanup(a); String ca = cleanup(a);
String cb = cleanup(b); String cb = cleanup(b);

View File

@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.config.Config;
import java.util.Map; import java.util.Map;
@ -22,7 +23,7 @@ public class Level2JaroWinklerTitle extends AbstractComparator {
} }
@Override @Override
public double distance(final String a, final String b) { public double distance(final String a, final String b, final Config conf) {
final String ca = cleanup(a); final String ca = cleanup(a);
final String cb = cleanup(b); final String cb = cleanup(b);

View File

@ -3,6 +3,8 @@ package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
@ -26,7 +28,7 @@ public class LevensteinTitle extends AbstractComparator {
} }
@Override @Override
public double distance(final String a, final String b) { public double distance(final String a, final String b, final Config conf) {
final String ca = cleanup(a); final String ca = cleanup(a);
final String cb = cleanup(b); final String cb = cleanup(b);

View File

@ -3,6 +3,8 @@ package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.config.Config;
import java.util.Map; import java.util.Map;
@ -25,7 +27,7 @@ public class LevensteinTitleIgnoreVersion extends AbstractComparator {
} }
@Override @Override
public double distance(final String a, final String b) { public double distance(final String a, final String b, final Config conf) {
String ca = cleanup(a); String ca = cleanup(a);
String cb = cleanup(b); String cb = cleanup(b);

View File

@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.config.Config;
import java.util.Map; import java.util.Map;
@ -22,7 +23,7 @@ public class MustBeDifferent extends AbstractComparator {
} }
@Override @Override
public double distance(final String a, final String b) { public double distance(final String a, final String b, final Config conf) {
return !a.equals(b) ? 1.0 : 0; return !a.equals(b) ? 1.0 : 0;
} }

View File

@ -1,5 +1,6 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.Comparator; import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
@ -17,7 +18,7 @@ public class NullDistanceAlgo implements Comparator {
} }
@Override @Override
public double compare(Field a, Field b) { public double compare(Field a, Field b, Config config) {
return 0; return 0;
} }
} }

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList; import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.adaptor.Pid; import eu.dnetlib.pace.model.adaptor.Pid;
@ -27,7 +28,7 @@ public class PidMatch extends AbstractComparator {
} }
@Override @Override
public double compare(final Field a, final Field b) { public double compare(final Field a, final Field b, final Config conf) {
final List<String> sa = ((FieldList) a).stringList(); final List<String> sa = ((FieldList) a).stringList();
final List<String> sb = ((FieldList) b).stringList(); final List<String> sb = ((FieldList) b).stringList();

View File

@ -5,6 +5,7 @@ import java.util.Map;
import com.google.common.collect.Iterables; import com.google.common.collect.Iterables;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
@ -28,7 +29,7 @@ public class SizeMatch extends AbstractComparator {
} }
@Override @Override
public double compare(final Field a, final Field b) { public double compare(final Field a, final Field b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) if (a.isEmpty() || b.isEmpty())
return -1; return -1;

View File

@ -1,5 +1,8 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.lang.StringUtils;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
@ -67,9 +70,9 @@ public class SubStringLevenstein extends AbstractComparator {
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
*/ */
@Override @Override
public double compare(final Field a, final Field b) { public double distance(final Field a, final Field b, final Config conf) {
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit)); return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit), conf);
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
} }

View File

@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
@ -21,7 +22,7 @@ public class TitleVersionMatch extends AbstractComparator {
} }
@Override @Override
public double compare(final Field a, final Field b) { public double compare(final Field a, final Field b, final Config conf) {
final String valueA = getFirstValue(a); final String valueA = getFirstValue(a);
final String valueB = getFirstValue(b); final String valueB = getFirstValue(b);

View File

@ -1,5 +1,6 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
@ -28,8 +29,7 @@ public class UrlMatcher extends Levenstein {
} }
@Override @Override
public double compare(Field a, Field b) { public double distance(Field a, Field b, final Config conf) {
final URL urlA = asUrl(getFirstValue(a)); final URL urlA = asUrl(getFirstValue(a));
final URL urlB = asUrl(getFirstValue(b)); final URL urlB = asUrl(getFirstValue(b));
@ -44,7 +44,7 @@ public class UrlMatcher extends Levenstein {
return hostW * 0.5; return hostW * 0.5;
} }
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath()); return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf);
} }
private URL asUrl(final String value) { private URL asUrl(final String value) {

View File

@ -1,5 +1,6 @@
package eu.dnetlib.pace.tree; package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.tree.support.ComparatorClass;
@ -22,7 +23,7 @@ public class YearMatch extends AbstractComparator {
} }
@Override @Override
public double compare(final Field a, final Field b) { public double compare(final Field a, final Field b, final Config conf) {
final String valueA = getNumbers(getFirstValue(a)); final String valueA = getNumbers(getFirstValue(a));
final String valueB = getNumbers(getFirstValue(b)); final String valueB = getNumbers(getFirstValue(b));

View File

@ -2,6 +2,7 @@ package eu.dnetlib.pace.tree.support;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList; import eu.dnetlib.pace.model.FieldList;
@ -66,7 +67,7 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
* the b * the b
* @return the double * @return the double
*/ */
public double distance(final String a, final String b) { public double distance(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) { if (a.isEmpty() || b.isEmpty()) {
return -1; //return -1 if a field is missing return -1; //return -1 if a field is missing
@ -84,16 +85,23 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
* the b * the b
* @return the double * @return the double
*/ */
protected double distance(final List<String> a, final List<String> b) { protected double distance(final List<String> a, final List<String> b, final Config conf) {
return distance(concat(a), concat(b)); return distance(concat(a), concat(b), conf);
}
public double distance(final Field a, final Field b, final Config conf) {
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf);
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf);
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
} }
@Override @Override
public double compare(final Field a, final Field b) { public double compare(final Field a, final Field b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) if (a.isEmpty() || b.isEmpty())
return -1; return -1;
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue()); if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf);
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b)); if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf);
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
} }

View File

@ -1,5 +1,6 @@
package eu.dnetlib.pace.tree.support; package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
public interface Comparator { public interface Comparator {
@ -8,6 +9,6 @@ public interface Comparator {
* return : -1 -> can't decide (missing field) * return : -1 -> can't decide (missing field)
* >0 -> similarity degree (depends on the algorithm) * >0 -> similarity degree (depends on the algorithm)
* */ * */
public double compare(Field a, Field b); public double compare(Field a, Field b, Config conf);
} }

View File

@ -1,8 +1,10 @@
package eu.dnetlib.pace.tree.support; package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig; import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.PaceException; import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.annotate.JsonIgnore;
import org.codehaus.jackson.map.ObjectMapper; import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException; import java.io.IOException;
@ -35,7 +37,7 @@ public class TreeNodeDef implements Serializable {
public TreeNodeDef() { public TreeNodeDef() {
} }
public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2) { public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) {
TreeNodeStats stats = new TreeNodeStats(); TreeNodeStats stats = new TreeNodeStats();
stats.setFieldsCount(fields.size()); stats.setFieldsCount(fields.size());
@ -44,7 +46,7 @@ public class TreeNodeDef implements Serializable {
double weight = fieldConf.getWeight(); double weight = fieldConf.getWeight();
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField())); double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
if (result == -1) { //if the field is missing if (result == -1) { //if the field is missing
stats.incrementMissCount(); stats.incrementMissCount();

View File

@ -38,7 +38,7 @@ public class TreeProcessor {
if (currentNode == null) if (currentNode == null)
throw new PaceException("The Tree Node doesn't exist: " + current); throw new PaceException("The Tree Node doesn't exist: " + current);
TreeNodeStats stats = currentNode.evaluate(doc1, doc2); TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
if (!currentNode.isIgnoreMissing() && stats.getMissCount()>0) { if (!currentNode.isIgnoreMissing() && stats.getMissCount()>0) {
current = currentNode.getUndefined(); current = currentNode.getUndefined();

View File

@ -1,4 +1,4 @@
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti key::1;university;universita;universita studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti
key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο
key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
key::39;bureau;ufficio;bureau;офис;bureau;γραφείο key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;universidad politecnica;universitat politecnica;politechnika;politechniki;university technology;university science technology
key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί

1 key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti key::1;university;universita;universita studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti
2 key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
3 key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza
4 key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
38 key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
39 key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
40 key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
41 key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;universidad politecnica;universitat politecnica;politechnika;politechniki;university technology;university science technology
42 key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
43 key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
44 key::44;academic;accademico;académique;universitaire;акадеческий academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί

View File

@ -1,22 +1,24 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import java.util.Map;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import eu.dnetlib.pace.AbstractPaceTest; import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.config.DedupConfig;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import java.util.Map;
public class ClusteringFunctionTest extends AbstractPaceTest { public class ClusteringFunctionTest extends AbstractPaceTest {
private Map<String, Integer> params; private Map<String, Integer> params;
DedupConfig conf;
@Before @Before
public void setUp() throws Exception { public void setUp() throws Exception {
params = Maps.newHashMap(); params = Maps.newHashMap();
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", ClusteringFunctionTest.class));
} }
@Test @Test
@ -26,7 +28,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "http://www.test.it/path/to/resource"; final String s = "http://www.test.it/path/to/resource";
System.out.println(s); System.out.println(s);
System.out.println(urlClustering.apply(Lists.newArrayList(url(s)))); System.out.println(urlClustering.apply(conf, Lists.newArrayList(url(s))));
} }
@Test @Test
@ -40,7 +42,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson"; final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s); System.out.println(s);
System.out.println(ngram.apply(Lists.newArrayList(title(s)))); System.out.println(ngram.apply(conf, Lists.newArrayList(title(s))));
} }
@Test @Test
@ -52,7 +54,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson"; final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s); System.out.println(s);
System.out.println(np.apply(Lists.newArrayList(title(s)))); System.out.println(np.apply(conf, Lists.newArrayList(title(s))));
} }
@Test @Test
@ -64,11 +66,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s1 = "University of Pisa"; final String s1 = "University of Pisa";
System.out.println(s1); System.out.println(s1);
System.out.println(np.apply(Lists.newArrayList(title(s1)))); System.out.println(np.apply(conf, Lists.newArrayList(title(s1))));
final String s2 = "Pisa University"; final String s2 = "Pisa University";
System.out.println(s2); System.out.println(s2);
System.out.println(np.apply(Lists.newArrayList(title(s2)))); System.out.println(np.apply(conf, Lists.newArrayList(title(s2))));
} }
@Test @Test
@ -81,7 +83,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson"; final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s); System.out.println(s);
System.out.println(acro.apply(Lists.newArrayList(title(s)))); System.out.println(acro.apply(conf, Lists.newArrayList(title(s))));
} }
@Test @Test
@ -93,7 +95,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson"; final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s); System.out.println(s);
System.out.println(sp.apply(Lists.newArrayList(title(s)))); System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
} }
@Test @Test
@ -105,7 +107,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson"; final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s); System.out.println(s);
System.out.println(sp.apply(Lists.newArrayList(title(s)))); System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
} }
@Test @Test
@ -114,7 +116,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = readFromClasspath("gt.author.json"); final String s = readFromClasspath("gt.author.json");
System.out.println(s); System.out.println(s);
System.out.println(cf.apply(Lists.newArrayList(person(s)))); System.out.println(cf.apply(conf, Lists.newArrayList(person(s))));
} }
@Test @Test
@ -123,27 +125,27 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final ClusteringFunction cf = new KeywordsClustering(params); final ClusteringFunction cf = new KeywordsClustering(params);
final String s = "Polytechnic University of Turin"; final String s = "Polytechnic University of Turin";
System.out.println(s); System.out.println(s);
System.out.println(cf.apply(Lists.newArrayList(title(s)))); System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
final String s1 = "POLITECNICO DI TORINO"; final String s1 = "POLITECNICO DI TORINO";
System.out.println(s1); System.out.println(s1);
System.out.println(cf.apply(Lists.newArrayList(title(s1)))); System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
final String s2 = "Universita farmaceutica culturale di milano bergamo"; final String s2 = "Universita farmaceutica culturale di milano bergamo";
System.out.println("s2 = " + s2); System.out.println("s2 = " + s2);
System.out.println(cf.apply(Lists.newArrayList(title(s2)))); System.out.println(cf.apply(conf, Lists.newArrayList(title(s2))));
final String s3 = "universita universita milano milano"; final String s3 = "universita universita milano milano";
System.out.println("s3 = " + s3); System.out.println("s3 = " + s3);
System.out.println(cf.apply(Lists.newArrayList(title(s3)))); System.out.println(cf.apply(conf, Lists.newArrayList(title(s3))));
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)"; final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
System.out.println("s4 = " + s4); System.out.println("s4 = " + s4);
System.out.println(cf.apply(Lists.newArrayList(title(s4)))); System.out.println(cf.apply(conf, Lists.newArrayList(title(s4))));
final String s5 = "İstanbul Ticarət Universiteti"; final String s5 = "İstanbul Ticarət Universiteti";
System.out.println("s5 = " + s5); System.out.println("s5 = " + s5);
System.out.println(cf.apply(Lists.newArrayList(title(s5)))); System.out.println(cf.apply(conf, Lists.newArrayList(title(s5))));
} }

View File

@ -2,6 +2,7 @@ package eu.dnetlib.pace.comparators;
import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.tree.JaroWinklerNormalizedName; import eu.dnetlib.pace.tree.JaroWinklerNormalizedName;
import eu.dnetlib.pace.config.DedupConfig;
import org.junit.Before; import org.junit.Before;
import org.junit.Test; import org.junit.Test;
@ -17,13 +18,13 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾."; private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
private Map<String, Number> params; private Map<String, Number> params;
private DedupConfig conf;
@Before @Before
public void setup() { public void setup() {
System.out.println("****************************************************************");
System.out.println("Test String : " + TEST_STRING);
params = new HashMap<>(); params = new HashMap<>();
params.put("weight", 1.0); params.put("weight", 1.0);
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", DistanceAlgoTest.class));
} }
@Test @Test
@ -55,7 +56,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
@Test @Test
public void testJaroWinklerNormalizedName() { public void testJaroWinklerNormalizedName() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State"); double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
assertEquals(0.0, result); assertEquals(0.0, result);
@ -65,49 +66,49 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
public void testJaroWinklerNormalizedName2() { public void testJaroWinklerNormalizedName2() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York"); double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York", conf);
assertEquals(result, 1.0); assertEquals(1.0, result);
} }
@Test @Test
public void testJaroWinklerNormalizedName3() { public void testJaroWinklerNormalizedName3() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna"); double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
assertEquals(result, 0.0); assertEquals(0.0, result);
} }
@Test @Test
public void testJaroWinklerNormalizedName4() { public void testJaroWinklerNormalizedName4() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa"); double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
assertEquals(result, 1.0); assertEquals(1.0, result);
} }
@Test @Test
public void testJaroWinklerNormalizedName5() { public void testJaroWinklerNormalizedName5() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS"); double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
assertEquals(result, 1.0); assertEquals(1.0, result);
} }
@Test @Test
public void testJaroWinklerNormalizedName6() { public void testJaroWinklerNormalizedName6() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung"); double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
assertTrue(result> 0.9); assertTrue(result > 0.9);
} }
@ -115,17 +116,17 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
public void testJaroWinklerNormalizedName7() { public void testJaroWinklerNormalizedName7() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO"); double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
assertTrue(result> 0.9); assertTrue(result > 0.9);
} }
@Test @Test
public void testJaroWinklerNormalizedName8() { public void testJaroWinklerNormalizedName8() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology"); double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
} }
@ -134,7 +135,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
public void testJaroWinklerNormalizedName9() { public void testJaroWinklerNormalizedName9() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti"); double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
} }
@ -144,7 +145,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence"); double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
} }

View File

@ -5,12 +5,13 @@ import org.junit.Test;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
public class ConfigTest extends AbstractPaceTest { public class ConfigTest extends AbstractPaceTest {
@Test @Test
public void dedupConfigSerializationTest() { public void dedupConfigSerializationTest() {
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("result.pace.conf.json")); final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("org.curr.conf"));
final String conf = cfgFromClasspath.toString(); final String conf = cfgFromClasspath.toString();
@ -37,4 +38,20 @@ public class ConfigTest extends AbstractPaceTest {
System.out.println(load.toString()); System.out.println(load.toString());
} }
/**
 * Smoke test: loads the "org.curr.conf" dedup configuration from the classpath
 * and prints the string form of the translation map exposed by its pace section.
 * Makes no assertions; it fails only if loading or rendering throws.
 */
@Test
public void translationMapTest() {
    final DedupConfig dedupConfig = DedupConfig.load(readFromClasspath("org.curr.conf"));
    final String renderedMap = dedupConfig.getPace().translationMap().toString();
    System.out.println("translationMap = " + renderedMap);
}
/**
 * Loads the "org.test.conf" dedup configuration from the classpath and checks
 * that the translation map of its pace section has no keys.
 */
@Test
public void emptyTranslationMapTest() {
    final DedupConfig dedupConfig = DedupConfig.load(readFromClasspath("org.test.conf"));
    final int translationKeyCount = dedupConfig.getPace().translationMap().keySet().size();
    assertEquals(0, translationKeyCount);
}
} }

View File

@ -5,32 +5,152 @@
"entityType" : "organization", "entityType" : "organization",
"orderField" : "legalname", "orderField" : "legalname",
"queueMaxSize" : "2000", "queueMaxSize" : "2000",
"groupMaxSize" : "10", "groupMaxSize" : "50",
"slidingWindowSize" : "200", "slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true" "includeChildren" : "true"
}, },
"pace" : { "pace" : {
"clustering" : [ "clustering" : [
<<<<<<< HEAD
{ "name" : "sortedngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, { "name" : "sortedngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, { "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } } { "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
=======
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
>>>>>>> origin/master
], ],
"sufficientConditions" : [ "sufficientConditions" : [
{ "name" : "exactMatch", "fieldsCount" : [ "gridid" ] } { "name" : "exactMatch", "fieldsCount" : [ "gridid" ] }
], ],
<<<<<<< HEAD
"necessaryConditions" : [ "necessaryConditions" : [
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] }, { "name" : "exactMatch", "fieldsCount" : [ "country" ] },
{ "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] } { "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] }
=======
"conditions" : [
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
{ "name" : "exactMatch", "fields" : [ "country" ] }
>>>>>>> origin/master
], ],
"model" : [ "model" : [
{ "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} }, { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
], ],
"blacklists" : { } "blacklists" : {
"legalname" : []
},
"synonyms": {
"key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
"key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
"key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
"key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"],
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
"key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],
"key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],
"key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],
"key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],
"key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],
"key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"],
"key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],
"key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],
"key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],
"key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],
"key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],
"key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],
"key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],
"key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],
"key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],
"key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],
"key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],
"key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
"key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"],
"key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],
"key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],
"key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],
"key::36": ["authority","autorità","autorité","авторитет","autoriteit"],
"key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],
"key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],
"key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"],
"key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],
"key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],
"key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],
"key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],
"key::44": ["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],
"key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],
"key::46": ["division","divisione","division","отделение","divisie","τμήμα"],
"key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"],
"key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
"key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
"key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"],
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"],
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"],
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"],
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"],
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"],
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"],
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"],
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"],
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"],
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"],
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"],
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"],
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"],
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"],
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"],
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"],
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"],
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"],
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"],
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"],
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"],
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"],
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"],
"key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline"],
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"],
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"],
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"],
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"],
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"],
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"],
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"],
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"],
"key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"],
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"],
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"],
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"],
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"],
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"],
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"],
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"],
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"],
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"],
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"],
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"],
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria"],
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia"],
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"],
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"],
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"],
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"],
"key::102": ["informatics","informatica","informática","informática","informatica"],
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"]
}
} }
} }

View File

@ -0,0 +1,40 @@
{
"wf" : {
"threshold" : "0.9",
"dedupRun" : "001",
"entityType" : "organization",
"orderField" : "legalname",
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
],
"strictConditions" : [
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
],
"conditions" : [
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
{ "name" : "exactMatch", "fields" : [ "country" ] }
],
"model" : [
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
],
"blacklists" : {
"legalname" : []
},
"synonyms": {
}
}
}

View File

@ -1,11 +0,0 @@
#release configuration
#Mon Jul 08 10:03:15 CEST 2019
scm.tagNameFormat=@{project.artifactId}-@{project.version}
pushChanges=true
scm.url=scm\:git\:https\://github.com/dnet-team/dnet-dedup.git
preparationGoals=clean verify
projectVersionPolicyId=default
remoteTagging=true
scm.commentPrefix=[maven-release-plugin]
exec.snapshotReleasePluginAllowed=false
completedPhase=create-backup-poms