forked from D-Net/dnet-hadoop
Merge the latest changes from the master branch into tree2. Add the configuration as a parameter of the comparators, so that they can access it.
This commit is contained in:
commit
30a873265f
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-dedup</artifactId>
|
||||
<version>3.0.14-SNAPSHOT</version>
|
||||
<version>3.0.15-SNAPSHOT</version>
|
||||
<relativePath>../pom.xml</relativePath>
|
||||
</parent>
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
@ -18,15 +19,15 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
|
|||
this.params = params;
|
||||
}
|
||||
|
||||
protected abstract Collection<String> doApply(String s);
|
||||
protected abstract Collection<String> doApply(Config conf, String s);
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(List<Field> fields) {
|
||||
public Collection<String> apply(Config conf, List<Field> fields) {
|
||||
return fields.stream().filter(f -> !f.isEmpty())
|
||||
.map(Field::stringValue)
|
||||
.map(this::normalize)
|
||||
.map(s -> filterAllStopWords(s))
|
||||
.map(this::doApply)
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.Set;
|
|||
import java.util.StringTokenizer;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("acronyms")
|
||||
public class Acronyms extends AbstractClusteringFunction {
|
||||
|
@ -15,7 +16,7 @@ public class Acronyms extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
|
||||
}
|
||||
|
||||
|
|
|
@ -13,15 +13,15 @@ import eu.dnetlib.pace.model.Field;
|
|||
public class ClusteringCombiner {
|
||||
|
||||
public static Collection<String> combine(final Document a, final Config conf) {
|
||||
return new ClusteringCombiner().doCombine(a, conf.clusterings());
|
||||
return new ClusteringCombiner().doCombine(a, conf);
|
||||
}
|
||||
|
||||
private Collection<String> doCombine(final Document a, final List<ClusteringDef> defs) {
|
||||
private Collection<String> doCombine(final Document a, final Config conf) {
|
||||
final Collection<String> res = Sets.newLinkedHashSet();
|
||||
for (final ClusteringDef cd : defs) {
|
||||
for (final ClusteringDef cd : conf.clusterings()) {
|
||||
for (final String fieldName : cd.getFields()) {
|
||||
final Field values = a.values(fieldName);
|
||||
res.addAll(cd.clusteringFunction().apply((List<Field>) values));
|
||||
res.addAll(cd.clusteringFunction().apply(conf, (List<Field>) values));
|
||||
}
|
||||
}
|
||||
return res;
|
||||
|
|
|
@ -4,11 +4,12 @@ import java.util.Collection;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
public interface ClusteringFunction {
|
||||
|
||||
public Collection<String> apply(List<Field> fields);
|
||||
public Collection<String> apply(Config config, List<Field> fields);
|
||||
|
||||
public Map<String, Integer> getParams();
|
||||
|
||||
|
|
|
@ -5,6 +5,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("immutablefieldvalue")
|
||||
public class ImmutableFieldValue extends AbstractClusteringFunction {
|
||||
|
@ -14,7 +15,7 @@ public class ImmutableFieldValue extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final String s) {
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
res.add(s);
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
@ -15,16 +16,16 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
protected Collection<String> doApply(final Config conf, String s) {
|
||||
|
||||
//takes city codes and keywords codes without duplicates
|
||||
Set<String> keywords = getKeywords(s, params.getOrDefault("windowSize", 4));
|
||||
Set<String> keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
|
||||
Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4));
|
||||
|
||||
//list of combination to return as result
|
||||
final Collection<String> combinations = new LinkedHashSet<String>();
|
||||
|
||||
for (String keyword: keywordsToCodes(keywords)){
|
||||
for (String keyword: keywordsToCodes(keywords, conf.translationMap())){
|
||||
for (String city: citiesToCodes(cities)) {
|
||||
combinations.add(keyword+"-"+city);
|
||||
if (combinations.size()>=params.getOrDefault("max", 2)) {
|
||||
|
@ -37,13 +38,13 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(List<Field> fields) {
|
||||
public Collection<String> apply(final Config conf, List<Field> fields) {
|
||||
return fields.stream().filter(f -> !f.isEmpty())
|
||||
.map(Field::stringValue)
|
||||
.map(this::cleanup) //TODO can I add this to the AbstractClusteringFunction without overriding the method here?
|
||||
.map(this::normalize)
|
||||
.map(s -> filterAllStopWords(s))
|
||||
.map(this::doApply)
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.Map;
|
|||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
@ -17,16 +18,16 @@ public class LowercaseClustering extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(List<Field> fields) {
|
||||
public Collection<String> apply(Config conf, List<Field> fields) {
|
||||
Collection<String> c = Sets.newLinkedHashSet();
|
||||
for(Field f : fields) {
|
||||
c.addAll(doApply(f.stringValue()));
|
||||
c.addAll(doApply(conf, f.stringValue()));
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final String s) {
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
if(StringUtils.isBlank(s)) {
|
||||
return Lists.newArrayList();
|
||||
}
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("ngrampairs")
|
||||
public class NgramPairs extends Ngrams {
|
||||
|
@ -15,7 +16,7 @@ public class NgramPairs extends Ngrams {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max"));
|
||||
}
|
||||
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
@ClusteringClass("ngrams")
|
||||
|
@ -10,7 +12,7 @@ public class Ngrams extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
|
||||
}
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.clustering;
|
|||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
@ -23,7 +24,7 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
|
|||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(final List<Field> fields) {
|
||||
public Collection<String> apply(final Config conf, final List<Field> fields) {
|
||||
final Set<String> hashes = Sets.newHashSet();
|
||||
|
||||
for (final Field f : fields) {
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.Map;
|
|||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
|
||||
@ClusteringClass("personhash")
|
||||
|
@ -18,7 +19,7 @@ public class PersonHash extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final String s) {
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -10,7 +12,7 @@ public class RandomClusteringFunction extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
protected Collection<String> doApply(final Config conf, String s) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -5,6 +5,7 @@ import java.util.*;
|
|||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("sortedngrampairs")
|
||||
public class SortedNgramPairs extends NgramPairs {
|
||||
|
@ -14,7 +15,7 @@ public class SortedNgramPairs extends NgramPairs {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
|
||||
final List<String> tokens = Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().trimResults().split(s));
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ import java.util.Collection;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import org.apache.commons.lang.RandomStringUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
@ -17,7 +18,7 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final String s) {
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));
|
||||
|
|
|
@ -5,6 +5,7 @@ import java.util.Map;
|
|||
import java.util.Set;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("suffixprefix")
|
||||
public class SuffixPrefix extends AbstractClusteringFunction {
|
||||
|
@ -14,7 +15,7 @@ public class SuffixPrefix extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return suffixPrefix(s, param("len"), param("max"));
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
|
@ -21,7 +22,7 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu
|
|||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(List<Field> fields) {
|
||||
public Collection<String> apply(final Config conf, List<Field> fields) {
|
||||
try {
|
||||
return fields.stream()
|
||||
.filter(f -> !f.isEmpty())
|
||||
|
|
|
@ -13,6 +13,8 @@ import org.apache.commons.collections.CollectionUtils;
|
|||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
@ -25,7 +27,6 @@ import java.util.stream.Collectors;
|
|||
*/
|
||||
public abstract class AbstractPaceFunctions {
|
||||
|
||||
private static Map<String,String> translationMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
||||
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||
|
||||
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||
|
@ -238,10 +239,10 @@ public abstract class AbstractPaceFunctions {
|
|||
}
|
||||
|
||||
|
||||
public double keywordsCompare(Set<String> s1, Set<String> s2){
|
||||
public double keywordsCompare(Set<String> s1, Set<String> s2, Map<String, String> translationMap){
|
||||
|
||||
Set<String> k1 = keywordsToCodes(s1);
|
||||
Set<String> k2 = keywordsToCodes(s2);
|
||||
Set<String> k1 = keywordsToCodes(s1, translationMap);
|
||||
Set<String> k2 = keywordsToCodes(s2, translationMap);
|
||||
|
||||
int longer = (k1.size()>k2.size())?k1.size():k2.size();
|
||||
|
||||
|
@ -273,7 +274,7 @@ public abstract class AbstractPaceFunctions {
|
|||
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
public Set<String> keywordsToCodes(Set<String> keywords) {
|
||||
public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||
return toCodes(keywords, translationMap);
|
||||
}
|
||||
|
||||
|
@ -324,12 +325,17 @@ public abstract class AbstractPaceFunctions {
|
|||
return codes;
|
||||
}
|
||||
|
||||
public Set<String> getKeywords(String s1, int windowSize) {
|
||||
return getKeywords(s1, translationMap, windowSize);
|
||||
}
|
||||
|
||||
public Set<String> getCities(String s1, int windowSize) {
|
||||
return getKeywords(s1, cityMap, windowSize);
|
||||
}
|
||||
|
||||
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
|
||||
final StringWriter sw = new StringWriter();
|
||||
try {
|
||||
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
|
||||
return sw.toString();
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -49,4 +49,6 @@ public interface Config {
|
|||
*/
|
||||
public Map<String, List<String>> blacklists();
|
||||
|
||||
|
||||
public Map<String, String> translationMap();
|
||||
}
|
||||
|
|
|
@ -55,6 +55,7 @@ public class DedupConfig implements Config, Serializable {
|
|||
try {
|
||||
config = new ObjectMapper().readValue(json, DedupConfig.class);
|
||||
config.getPace().initModel();
|
||||
config.getPace().initTranslationMap();
|
||||
return config;
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Error in parsing configuration json", e);
|
||||
|
@ -139,4 +140,9 @@ public class DedupConfig implements Config, Serializable {
|
|||
return getPace().getBlacklists();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> translationMap() {
|
||||
return getPace().translationMap();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.config;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
|
@ -8,6 +9,7 @@ import eu.dnetlib.pace.util.PaceResolver;
|
|||
import org.codehaus.jackson.annotate.JsonIgnore;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.text.Normalizer;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -19,6 +21,10 @@ public class PaceConfig implements Serializable {
|
|||
private Map<String, TreeNodeDef> decisionTree;
|
||||
|
||||
private Map<String, List<String>> blacklists;
|
||||
private Map<String, List<String>> synonyms;
|
||||
|
||||
@JsonIgnore
|
||||
private Map<String, String> translationMap;
|
||||
|
||||
@JsonIgnore
|
||||
private Map<String, FieldDef> modelMap;
|
||||
|
@ -30,11 +36,26 @@ public class PaceConfig implements Serializable {
|
|||
|
||||
public void initModel() {
|
||||
modelMap = Maps.newHashMap();
|
||||
for(FieldDef fd : getModel()) {
|
||||
for (FieldDef fd : getModel()) {
|
||||
modelMap.put(fd.getName(), fd);
|
||||
}
|
||||
}
|
||||
|
||||
public void initTranslationMap(){
|
||||
translationMap = Maps.newHashMap();
|
||||
for (String key : synonyms.keySet()) {
|
||||
for (String term : synonyms.get(key)){
|
||||
translationMap.put(
|
||||
Normalizer.normalize(term.toLowerCase(), Normalizer.Form.NFD),
|
||||
key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Map<String, String> translationMap(){
|
||||
return translationMap;
|
||||
}
|
||||
|
||||
public List<FieldDef> getModel() {
|
||||
return model;
|
||||
}
|
||||
|
@ -67,6 +88,14 @@ public class PaceConfig implements Serializable {
|
|||
this.blacklists = blacklists;
|
||||
}
|
||||
|
||||
public Map<String, List<String>> getSynonyms() {
|
||||
return synonyms;
|
||||
}
|
||||
|
||||
public void setSynonyms(Map<String, List<String>> synonyms) {
|
||||
this.synonyms = synonyms;
|
||||
}
|
||||
|
||||
public Map<String, FieldDef> getModelMap() {
|
||||
return modelMap;
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
@ -23,7 +24,7 @@ public class AlwaysMatch extends AbstractComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b) {
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
|
@ -22,7 +23,7 @@ public class ExactMatch extends AbstractComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
return a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
@ -14,7 +15,7 @@ public class ExactMatchIgnoreCase extends AbstractComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double compare(Field a, Field b) {
|
||||
public double compare(Field a, Field b, final Config conf) {
|
||||
|
||||
final String fa = getValue(a);
|
||||
final String fb = getValue(b);
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
|
@ -23,7 +24,7 @@ public class JaroWinkler extends AbstractComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
|
|
|
@ -4,6 +4,8 @@ import com.wcohen.ss.AbstractStringDistance;
|
|||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -26,7 +28,7 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
|
@ -36,15 +38,15 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
|
|||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue());
|
||||
Set<String> keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue());
|
||||
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());
|
||||
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());
|
||||
|
||||
Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
|
||||
Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
|
||||
|
||||
if (sameCity(cities1,cities2)) {
|
||||
|
||||
if (keywordsCompare(keywords1, keywords2)>params.getOrDefault("threshold", 0.5).doubleValue()) {
|
||||
if (keywordsCompare(keywords1, keywords2, conf.translationMap())>params.getOrDefault("threshold", 0.5).doubleValue()) {
|
||||
|
||||
ca = removeKeywords(ca, keywords1);
|
||||
ca = removeKeywords(ca, cities1);
|
||||
|
|
|
@ -4,6 +4,8 @@ import com.wcohen.ss.AbstractStringDistance;
|
|||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||
|
@ -23,7 +25,7 @@ public class JaroWinklerTitle extends AbstractComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree;
|
|||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -22,7 +23,7 @@ public class Level2JaroWinklerTitle extends AbstractComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
final String ca = cleanup(a);
|
||||
final String cb = cleanup(b);
|
||||
|
||||
|
|
|
@ -3,6 +3,8 @@ package eu.dnetlib.pace.tree;
|
|||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
|
@ -26,7 +28,7 @@ public class LevensteinTitle extends AbstractComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
final String ca = cleanup(a);
|
||||
final String cb = cleanup(b);
|
||||
|
||||
|
|
|
@ -3,6 +3,8 @@ package eu.dnetlib.pace.tree;
|
|||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -25,7 +27,7 @@ public class LevensteinTitleIgnoreVersion extends AbstractComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree;
|
|||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -22,7 +23,7 @@ public class MustBeDifferent extends AbstractComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
return !a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.Comparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
@ -17,7 +18,7 @@ public class NullDistanceAlgo implements Comparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double compare(Field a, Field b) {
|
||||
public double compare(Field a, Field b, Config config) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
import eu.dnetlib.pace.model.adaptor.Pid;
|
||||
|
@ -27,7 +28,7 @@ public class PidMatch extends AbstractComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b) {
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
|
||||
final List<String> sa = ((FieldList) a).stringList();
|
||||
final List<String> sb = ((FieldList) b).stringList();
|
||||
|
|
|
@ -5,6 +5,7 @@ import java.util.Map;
|
|||
|
||||
import com.google.common.collect.Iterables;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
@ -28,7 +29,7 @@ public class SizeMatch extends AbstractComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b) {
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
|
|
|
@ -1,5 +1,8 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
@ -67,9 +70,9 @@ public class SubStringLevenstein extends AbstractComparator {
|
|||
* @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
public double compare(final Field a, final Field b) {
|
||||
public double distance(final Field a, final Field b, final Config conf) {
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
|
||||
return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit));
|
||||
return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit), conf);
|
||||
|
||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||
}
|
||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
@ -21,7 +22,7 @@ public class TitleVersionMatch extends AbstractComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b) {
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
final String valueA = getFirstValue(a);
|
||||
final String valueB = getFirstValue(b);
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
@ -28,8 +29,7 @@ public class UrlMatcher extends Levenstein {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double compare(Field a, Field b) {
|
||||
|
||||
public double distance(Field a, Field b, final Config conf) {
|
||||
final URL urlA = asUrl(getFirstValue(a));
|
||||
final URL urlB = asUrl(getFirstValue(b));
|
||||
|
||||
|
@ -44,7 +44,7 @@ public class UrlMatcher extends Levenstein {
|
|||
return hostW * 0.5;
|
||||
}
|
||||
|
||||
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath());
|
||||
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf);
|
||||
}
|
||||
|
||||
private URL asUrl(final String value) {
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
@ -22,7 +23,7 @@ public class YearMatch extends AbstractComparator {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b) {
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
final String valueA = getNumbers(getFirstValue(a));
|
||||
final String valueB = getNumbers(getFirstValue(b));
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.tree.support;
|
|||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
|
@ -66,7 +67,7 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
|
|||
* the b
|
||||
* @return the double
|
||||
*/
|
||||
public double distance(final String a, final String b) {
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1; //return -1 if a field is missing
|
||||
|
@ -84,16 +85,23 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
|
|||
* the b
|
||||
* @return the double
|
||||
*/
|
||||
protected double distance(final List<String> a, final List<String> b) {
|
||||
return distance(concat(a), concat(b));
|
||||
protected double distance(final List<String> a, final List<String> b, final Config conf) {
|
||||
return distance(concat(a), concat(b), conf);
|
||||
}
|
||||
|
||||
public double distance(final Field a, final Field b, final Config conf) {
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf);
|
||||
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf);
|
||||
|
||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||
}
|
||||
|
||||
@Override
|
||||
public double compare(final Field a, final Field b) {
|
||||
public double compare(final Field a, final Field b, final Config conf) {
|
||||
if (a.isEmpty() || b.isEmpty())
|
||||
return -1;
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue());
|
||||
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b));
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf);
|
||||
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf);
|
||||
|
||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
public interface Comparator {
|
||||
|
@ -8,6 +9,6 @@ public interface Comparator {
|
|||
* return : -1 -> can't decide (missing field)
|
||||
* >0 -> similarity degree (depends on the algorithm)
|
||||
* */
|
||||
public double compare(Field a, Field b);
|
||||
public double compare(Field a, Field b, Config conf);
|
||||
|
||||
}
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
package eu.dnetlib.pace.tree.support;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.codehaus.jackson.annotate.JsonIgnore;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -35,7 +37,7 @@ public class TreeNodeDef implements Serializable {
|
|||
public TreeNodeDef() {
|
||||
}
|
||||
|
||||
public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2) {
|
||||
public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) {
|
||||
|
||||
TreeNodeStats stats = new TreeNodeStats();
|
||||
stats.setFieldsCount(fields.size());
|
||||
|
@ -44,7 +46,7 @@ public class TreeNodeDef implements Serializable {
|
|||
|
||||
double weight = fieldConf.getWeight();
|
||||
|
||||
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()));
|
||||
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||
|
||||
if (result == -1) { //if the field is missing
|
||||
stats.incrementMissCount();
|
||||
|
|
|
@ -38,7 +38,7 @@ public class TreeProcessor {
|
|||
if (currentNode == null)
|
||||
throw new PaceException("The Tree Node doesn't exist: " + current);
|
||||
|
||||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2);
|
||||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||
|
||||
if (!currentNode.isIgnoreMissing() && stats.getMissCount()>0) {
|
||||
current = currentNode.getUndefined();
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti
|
||||
key::1;university;universita;universita studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο;universitesi;universiteti
|
||||
key::2;studies;studi;études;estudios;estudos;Studien;studia;исследования;studies;σπουδές
|
||||
key::3;advanced;superiore;supérieur;supérieure;supérieurs;supérieures;avancado;avancados;fortgeschrittene;fortgeschritten;zaawansowany;передовой;gevorderd;gevorderde;προχωρημένος;προχωρημένη;προχωρημένο;προχωρημένες;προχωρημένα;wyzsza
|
||||
key::4;institute;istituto;institut;instituto;instituto;Institut;instytut;институт;instituut;ινστιτούτο
|
||||
|
@ -38,7 +38,7 @@ key::37;federation;federazione;fédération;федерация;federatie;ομο
|
|||
key::38;observatory;osservatorio;observatoire;обсерватория;observatorium;αστεροσκοπείο
|
||||
key::39;bureau;ufficio;bureau;офис;bureau;γραφείο
|
||||
key::40;company;impresa;compagnie;société;компания;bedrijf;εταιρία
|
||||
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;politechnika;politechniki;university technology;university science technology
|
||||
key::41;polytechnic;politecnico;polytechnique;политехника;polytechnisch;πολυτεχνείο;universita politecnica;polytechnic university;universidad politecnica;universitat politecnica;politechnika;politechniki;university technology;university science technology
|
||||
key::42;coalition;coalizione;coalition;коалиция;coalitie;συνασπισμός
|
||||
key::43;initiative;iniziativa;initiative;инициатива;initiatief;πρωτοβουλία
|
||||
key::44;academic;accademico;académique;universitaire;академический;academisch;ακαδημαϊκός;ακαδημαϊκή;ακαδημαϊκό;ακαδημαϊκές;ακαδημαϊκοί
|
||||
|
|
|
|
@ -1,22 +1,24 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import eu.dnetlib.pace.AbstractPaceTest;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||
|
||||
private Map<String, Integer> params;
|
||||
DedupConfig conf;
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
params = Maps.newHashMap();
|
||||
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", ClusteringFunctionTest.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -26,7 +28,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s = "http://www.test.it/path/to/resource";
|
||||
System.out.println(s);
|
||||
System.out.println(urlClustering.apply(Lists.newArrayList(url(s))));
|
||||
System.out.println(urlClustering.apply(conf, Lists.newArrayList(url(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -40,7 +42,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s = "Search for the Standard Model Higgs Boson";
|
||||
System.out.println(s);
|
||||
System.out.println(ngram.apply(Lists.newArrayList(title(s))));
|
||||
System.out.println(ngram.apply(conf, Lists.newArrayList(title(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -52,7 +54,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s = "Search for the Standard Model Higgs Boson";
|
||||
System.out.println(s);
|
||||
System.out.println(np.apply(Lists.newArrayList(title(s))));
|
||||
System.out.println(np.apply(conf, Lists.newArrayList(title(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -64,11 +66,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s1 = "University of Pisa";
|
||||
System.out.println(s1);
|
||||
System.out.println(np.apply(Lists.newArrayList(title(s1))));
|
||||
System.out.println(np.apply(conf, Lists.newArrayList(title(s1))));
|
||||
|
||||
final String s2 = "Pisa University";
|
||||
System.out.println(s2);
|
||||
System.out.println(np.apply(Lists.newArrayList(title(s2))));
|
||||
System.out.println(np.apply(conf, Lists.newArrayList(title(s2))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -81,7 +83,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s = "Search for the Standard Model Higgs Boson";
|
||||
System.out.println(s);
|
||||
System.out.println(acro.apply(Lists.newArrayList(title(s))));
|
||||
System.out.println(acro.apply(conf, Lists.newArrayList(title(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -93,7 +95,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s = "Search for the Standard Model Higgs Boson";
|
||||
System.out.println(s);
|
||||
System.out.println(sp.apply(Lists.newArrayList(title(s))));
|
||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -105,7 +107,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s = "Search for the Standard Model Higgs Boson";
|
||||
System.out.println(s);
|
||||
System.out.println(sp.apply(Lists.newArrayList(title(s))));
|
||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -114,7 +116,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s = readFromClasspath("gt.author.json");
|
||||
System.out.println(s);
|
||||
System.out.println(cf.apply(Lists.newArrayList(person(s))));
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(person(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -123,27 +125,27 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
final ClusteringFunction cf = new KeywordsClustering(params);
|
||||
final String s = "Polytechnic University of Turin";
|
||||
System.out.println(s);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s))));
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
|
||||
|
||||
final String s1 = "POLITECNICO DI TORINO";
|
||||
System.out.println(s1);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s1))));
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
|
||||
|
||||
final String s2 = "Universita farmaceutica culturale di milano bergamo";
|
||||
System.out.println("s2 = " + s2);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s2))));
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s2))));
|
||||
|
||||
final String s3 = "universita universita milano milano";
|
||||
System.out.println("s3 = " + s3);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s3))));
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s3))));
|
||||
|
||||
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
|
||||
System.out.println("s4 = " + s4);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s4))));
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s4))));
|
||||
|
||||
final String s5 = "İstanbul Ticarət Universiteti";
|
||||
System.out.println("s5 = " + s5);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s5))));
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s5))));
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.comparators;
|
|||
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
import eu.dnetlib.pace.tree.JaroWinklerNormalizedName;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
|
@ -17,13 +18,13 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
|
||||
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
|
||||
private Map<String, Number> params;
|
||||
private DedupConfig conf;
|
||||
|
||||
@Before
|
||||
public void setup() {
|
||||
System.out.println("****************************************************************");
|
||||
System.out.println("Test String : " + TEST_STRING);
|
||||
params = new HashMap<>();
|
||||
params.put("weight", 1.0);
|
||||
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", DistanceAlgoTest.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -55,7 +56,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
@Test
|
||||
public void testJaroWinklerNormalizedName() {
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State");
|
||||
double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
assertEquals(0.0, result);
|
||||
|
@ -65,49 +66,49 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
public void testJaroWinklerNormalizedName2() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York");
|
||||
double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York", conf);
|
||||
|
||||
assertEquals(result, 1.0);
|
||||
assertEquals(1.0, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJaroWinklerNormalizedName3() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna");
|
||||
double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
assertEquals(result, 0.0);
|
||||
assertEquals(0.0, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJaroWinklerNormalizedName4() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa");
|
||||
double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
assertEquals(result, 1.0);
|
||||
assertEquals(1.0, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJaroWinklerNormalizedName5() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS");
|
||||
double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
assertEquals(result, 1.0);
|
||||
assertEquals(1.0, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJaroWinklerNormalizedName6() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung");
|
||||
double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
assertTrue(result> 0.9);
|
||||
assertTrue(result > 0.9);
|
||||
|
||||
}
|
||||
|
||||
|
@ -115,17 +116,17 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
public void testJaroWinklerNormalizedName7() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO");
|
||||
double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
assertTrue(result> 0.9);
|
||||
assertTrue(result > 0.9);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJaroWinklerNormalizedName8() {
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
|
||||
double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology");
|
||||
double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
}
|
||||
|
@ -134,7 +135,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
public void testJaroWinklerNormalizedName9() {
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
|
||||
double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti");
|
||||
double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
}
|
||||
|
@ -144,7 +145,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
|
||||
double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence");
|
||||
double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
}
|
||||
|
|
|
@ -5,12 +5,13 @@ import org.junit.Test;
|
|||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
public class ConfigTest extends AbstractPaceTest {
|
||||
|
||||
@Test
|
||||
public void dedupConfigSerializationTest() {
|
||||
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("result.pace.conf.json"));
|
||||
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("org.curr.conf"));
|
||||
|
||||
final String conf = cfgFromClasspath.toString();
|
||||
|
||||
|
@ -37,4 +38,20 @@ public class ConfigTest extends AbstractPaceTest {
|
|||
System.out.println(load.toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void translationMapTest() {
|
||||
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("org.curr.conf"));
|
||||
|
||||
System.out.println("translationMap = " + load.getPace().translationMap().toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void emptyTranslationMapTest() {
|
||||
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("org.test.conf"));
|
||||
|
||||
assertEquals(0, load.getPace().translationMap().keySet().size());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,32 +5,152 @@
|
|||
"entityType" : "organization",
|
||||
"orderField" : "legalname",
|
||||
"queueMaxSize" : "2000",
|
||||
"groupMaxSize" : "10",
|
||||
"groupMaxSize" : "50",
|
||||
"slidingWindowSize" : "200",
|
||||
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||
"includeChildren" : "true"
|
||||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
<<<<<<< HEAD
|
||||
{ "name" : "sortedngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
|
||||
=======
|
||||
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
||||
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
||||
>>>>>>> origin/master
|
||||
],
|
||||
"sufficientConditions" : [
|
||||
{ "name" : "exactMatch", "fieldsCount" : [ "gridid" ] }
|
||||
],
|
||||
<<<<<<< HEAD
|
||||
"necessaryConditions" : [
|
||||
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] },
|
||||
{ "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] }
|
||||
=======
|
||||
"conditions" : [
|
||||
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
|
||||
{ "name" : "exactMatch", "fields" : [ "country" ] }
|
||||
>>>>>>> origin/master
|
||||
],
|
||||
"model" : [
|
||||
{ "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
|
||||
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
|
||||
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
|
||||
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
|
||||
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} },
|
||||
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
|
||||
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
|
||||
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
|
||||
],
|
||||
"blacklists" : { }
|
||||
"blacklists" : {
|
||||
"legalname" : []
|
||||
},
|
||||
"synonyms": {
|
||||
"key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
|
||||
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
|
||||
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
|
||||
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
|
||||
"key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
|
||||
"key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
|
||||
"key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"],
|
||||
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
|
||||
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
|
||||
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
|
||||
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
|
||||
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
|
||||
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
|
||||
"key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],
|
||||
"key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],
|
||||
"key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],
|
||||
"key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],
|
||||
"key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],
|
||||
"key::19": ["museum","museo","musée","museo","museu","museum","muzeum","музей","museum","μουσείο"],
|
||||
"key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],
|
||||
"key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],
|
||||
"key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],
|
||||
"key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],
|
||||
"key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],
|
||||
"key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],
|
||||
"key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","центральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],
|
||||
"key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],
|
||||
"key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],
|
||||
"key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],
|
||||
"key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],
|
||||
"key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
|
||||
"key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"],
|
||||
"key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],
|
||||
"key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],
|
||||
"key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],
|
||||
"key::36": ["authority","autorità","autorité","авторитет","autoriteit"],
|
||||
"key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],
|
||||
"key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],
|
||||
"key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"],
|
||||
"key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],
|
||||
"key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],
|
||||
"key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],
|
||||
"key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],
|
||||
"key::44": ["academic","accademico","académique","universitaire","академический","academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],
|
||||
"key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],
|
||||
"key::46": ["division","divisione","division","отделение","divisie","τμήμα"],
|
||||
"key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"],
|
||||
"key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
|
||||
"key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
|
||||
"key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
|
||||
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"],
|
||||
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"],
|
||||
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"],
|
||||
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"],
|
||||
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"],
|
||||
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"],
|
||||
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"],
|
||||
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"],
|
||||
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"],
|
||||
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"],
|
||||
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"],
|
||||
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"],
|
||||
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"],
|
||||
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"],
|
||||
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"],
|
||||
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"],
|
||||
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"],
|
||||
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"],
|
||||
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"],
|
||||
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärztlich","veterinair","veeartsenijkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"],
|
||||
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"],
|
||||
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"],
|
||||
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"],
|
||||
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"],
|
||||
"key::75": ["theological","teologia","teologico","teológico","teológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","teológiai","teološki","teoloogia","usuteadus","teoloogiline"],
|
||||
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"],
|
||||
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"],
|
||||
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"],
|
||||
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"],
|
||||
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"],
|
||||
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"],
|
||||
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"],
|
||||
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"],
|
||||
"key::84": ["communication","comunicazione","comunicación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"],
|
||||
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"],
|
||||
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"],
|
||||
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δερματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"],
|
||||
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"],
|
||||
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"],
|
||||
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"],
|
||||
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"],
|
||||
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"],
|
||||
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"],
|
||||
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"],
|
||||
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"],
|
||||
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria"],
|
||||
"key::97": ["psychology","psicologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia"],
|
||||
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"],
|
||||
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"],
|
||||
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"],
|
||||
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"],
|
||||
"key::102": ["informatics","informatica","informática","informática","informatica"],
|
||||
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
|
||||
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"]
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
{
|
||||
"wf" : {
|
||||
"threshold" : "0.9",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "organization",
|
||||
"orderField" : "legalname",
|
||||
"queueMaxSize" : "2000",
|
||||
"groupMaxSize" : "50",
|
||||
"slidingWindowSize" : "200",
|
||||
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||
"includeChildren" : "true"
|
||||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
||||
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
||||
],
|
||||
"strictConditions" : [
|
||||
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
|
||||
],
|
||||
"conditions" : [
|
||||
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
|
||||
{ "name" : "exactMatch", "fields" : [ "country" ] }
|
||||
],
|
||||
"model" : [
|
||||
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
|
||||
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
|
||||
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
|
||||
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
|
||||
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
|
||||
],
|
||||
"blacklists" : {
|
||||
"legalname" : []
|
||||
},
|
||||
"synonyms": {
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,11 +0,0 @@
|
|||
#release configuration
|
||||
#Mon Jul 08 10:03:15 CEST 2019
|
||||
scm.tagNameFormat=@{project.artifactId}-@{project.version}
|
||||
pushChanges=true
|
||||
scm.url=scm\:git\:https\://github.com/dnet-team/dnet-dedup.git
|
||||
preparationGoals=clean verify
|
||||
projectVersionPolicyId=default
|
||||
remoteTagging=true
|
||||
scm.commentPrefix=[maven-release-plugin]
|
||||
exec.snapshotReleasePluginAllowed=false
|
||||
completedPhase=create-backup-poms
|
Loading…
Reference in New Issue