From 26b383fea259c8ad460d657d6d50a5adce113a2c Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 8 Oct 2019 14:53:52 +0200 Subject: [PATCH] translation map moved in json configuration, support for synonyms added in the configuration, now the configuration is argument of conditions, distancealgos and clusteringfunctions --- .../AbstractClusteringFunction.java | 7 +- .../eu/dnetlib/pace/clustering/Acronyms.java | 3 +- .../pace/clustering/ClusteringCombiner.java | 8 +- .../pace/clustering/ClusteringFunction.java | 3 +- .../pace/clustering/ImmutableFieldValue.java | 3 +- .../pace/clustering/KeywordsClustering.java | 9 +- .../pace/clustering/LowercaseClustering.java | 7 +- .../dnetlib/pace/clustering/NgramPairs.java | 3 +- .../eu/dnetlib/pace/clustering/Ngrams.java | 4 +- .../pace/clustering/PersonClustering.java | 3 +- .../dnetlib/pace/clustering/PersonHash.java | 3 +- .../clustering/RandomClusteringFunction.java | 4 +- .../pace/clustering/SortedNgramPairs.java | 3 +- .../clustering/SpaceTrimmingFieldValue.java | 3 +- .../dnetlib/pace/clustering/SuffixPrefix.java | 3 +- .../pace/clustering/UrlClustering.java | 3 +- .../pace/common/AbstractPaceFunctions.java | 15 ++- .../pace/condition/AbstractCondition.java | 9 +- .../pace/condition/AlwaysTrueCondition.java | 4 +- .../dnetlib/pace/condition/ConditionAlgo.java | 4 +- .../eu/dnetlib/pace/condition/ExactMatch.java | 3 +- .../pace/condition/ExactMatchIgnoreCase.java | 3 +- .../pace/condition/MustBeDifferent.java | 4 +- .../eu/dnetlib/pace/condition/PidMatch.java | 4 +- .../eu/dnetlib/pace/condition/SizeMatch.java | 3 +- .../pace/condition/TitleVersionMatch.java | 3 +- .../eu/dnetlib/pace/condition/YearMatch.java | 3 +- .../java/eu/dnetlib/pace/config/Config.java | 2 + .../eu/dnetlib/pace/config/DedupConfig.java | 6 + .../eu/dnetlib/pace/config/PaceConfig.java | 27 +++- .../dnetlib/pace/distance/DistanceAlgo.java | 3 +- .../dnetlib/pace/distance/DistanceScorer.java | 4 +- .../distance/SecondStringDistanceAlgo.java | 13 +- 
.../pace/distance/algo/AlwaysMatch.java | 3 +- .../pace/distance/algo/ExactMatch.java | 3 +- .../pace/distance/algo/JaroWinkler.java | 3 +- .../algo/JaroWinklerNormalizedName.java | 7 +- .../pace/distance/algo/JaroWinklerTitle.java | 3 +- .../distance/algo/Level2JaroWinklerTitle.java | 3 +- .../pace/distance/algo/LevensteinTitle.java | 3 +- .../algo/LevensteinTitleIgnoreVersion.java | 3 +- .../pace/distance/algo/MustBeDifferent.java | 3 +- .../pace/distance/algo/NullDistanceAlgo.java | 3 +- .../distance/algo/SubStringLevenstein.java | 5 +- .../pace/distance/algo/UrlMatcher.java | 5 +- .../clustering/ClusteringFunctionTest.java | 39 +++--- .../pace/distance/DistanceAlgoTest.java | 25 ++-- .../eu/dnetlib/pace/config/org.curr.conf | 124 ++++++++++++++++-- 48 files changed, 304 insertions(+), 107 deletions(-) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java index 1782b87613..7fdcce497d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.clustering; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; @@ -18,15 +19,15 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i this.params = params; } - protected abstract Collection doApply(String s); + protected abstract Collection doApply(Config conf, String s); @Override - public Collection apply(List fields) { + public Collection apply(Config conf, List fields) { return fields.stream().filter(f -> !f.isEmpty()) .map(Field::stringValue) .map(this::normalize) .map(s -> filterAllStopWords(s)) - .map(this::doApply) + .map(s -> doApply(conf, s)) .map(c -> 
filterBlacklisted(c, ngramBlacklist)) .flatMap(c -> c.stream()) .filter(StringUtils::isNotBlank) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java index ee5efc9671..d3008332db 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java @@ -6,6 +6,7 @@ import java.util.Set; import java.util.StringTokenizer; import com.google.common.collect.Sets; +import eu.dnetlib.pace.config.Config; @ClusteringClass("acronyms") public class Acronyms extends AbstractClusteringFunction { @@ -15,7 +16,7 @@ public class Acronyms extends AbstractClusteringFunction { } @Override - protected Collection doApply(String s) { + protected Collection doApply(Config conf, String s) { return extractAcronyms(s, param("max"), param("minLen"), param("maxLen")); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java index a4b58aa81f..52859b4b8b 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java @@ -13,15 +13,15 @@ import eu.dnetlib.pace.model.Field; public class ClusteringCombiner { public static Collection combine(final Document a, final Config conf) { - return new ClusteringCombiner().doCombine(a, conf.clusterings()); + return new ClusteringCombiner().doCombine(a, conf); } - private Collection doCombine(final Document a, final List defs) { + private Collection doCombine(final Document a, final Config conf) { final Collection res = Sets.newLinkedHashSet(); - for (final ClusteringDef cd : defs) { + for (final ClusteringDef cd : conf.clusterings()) { for (final String fieldName : cd.getFields()) { final Field values = a.values(fieldName); - 
res.addAll(cd.clusteringFunction().apply((List) values)); + res.addAll(cd.clusteringFunction().apply(conf, (List) values)); } } return res; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java index 4fe1b596ed..0554d27a19 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java @@ -4,11 +4,12 @@ import java.util.Collection; import java.util.List; import java.util.Map; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; public interface ClusteringFunction { - public Collection apply(List fields); + public Collection apply(Config config, List fields); public Map getParams(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java index fab8e989d1..7f342f69c4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java @@ -5,6 +5,7 @@ import java.util.List; import java.util.Map; import com.google.common.collect.Lists; +import eu.dnetlib.pace.config.Config; @ClusteringClass("immutablefieldvalue") public class ImmutableFieldValue extends AbstractClusteringFunction { @@ -14,7 +15,7 @@ public class ImmutableFieldValue extends AbstractClusteringFunction { } @Override - protected Collection doApply(final String s) { + protected Collection doApply(final Config conf, final String s) { final List res = Lists.newArrayList(); res.add(s); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java index 1cabecd600..1680ab0f1a 100644 --- 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.clustering; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; @@ -15,10 +16,10 @@ public class KeywordsClustering extends AbstractClusteringFunction { } @Override - protected Collection doApply(String s) { + protected Collection doApply(final Config conf, String s) { //takes city codes and keywords codes without duplicates - Set keywords = getKeywords(s, params.getOrDefault("windowSize", 4)); + Set keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4)); Set cities = getCities(s, params.getOrDefault("windowSize", 4)); //list of combination to return as result @@ -37,13 +38,13 @@ public class KeywordsClustering extends AbstractClusteringFunction { } @Override - public Collection apply(List fields) { + public Collection apply(final Config conf, List fields) { return fields.stream().filter(f -> !f.isEmpty()) .map(Field::stringValue) .map(this::cleanup) //TODO can I add this to the AbstractClusteringFunction without overriding the method here? 
.map(this::normalize) .map(s -> filterAllStopWords(s)) - .map(this::doApply) + .map(s -> doApply(conf, s)) .map(c -> filterBlacklisted(c, ngramBlacklist)) .flatMap(c -> c.stream()) .filter(StringUtils::isNotBlank) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java index 5ec8590aa6..6fe525f16e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java @@ -6,6 +6,7 @@ import java.util.Map; import com.google.common.collect.Lists; import com.google.common.collect.Sets; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; @@ -17,16 +18,16 @@ public class LowercaseClustering extends AbstractClusteringFunction { } @Override - public Collection apply(List fields) { + public Collection apply(Config conf, List fields) { Collection c = Sets.newLinkedHashSet(); for(Field f : fields) { - c.addAll(doApply(f.stringValue())); + c.addAll(doApply(conf, f.stringValue())); } return c; } @Override - protected Collection doApply(final String s) { + protected Collection doApply(final Config conf, final String s) { if(StringUtils.isBlank(s)) { return Lists.newArrayList(); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java index 06885be9fb..baa30d7471 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java @@ -6,6 +6,7 @@ import java.util.List; import java.util.Map; import com.google.common.collect.Lists; +import eu.dnetlib.pace.config.Config; @ClusteringClass("ngrampairs") public class NgramPairs extends Ngrams { @@ -15,7 +16,7 @@ public class NgramPairs extends 
Ngrams { } @Override - protected Collection doApply(String s) { + protected Collection doApply(Config conf, String s) { return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max")); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java index 8549468dbc..214b1451fa 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java @@ -1,5 +1,7 @@ package eu.dnetlib.pace.clustering; +import eu.dnetlib.pace.config.Config; + import java.util.*; @ClusteringClass("ngrams") @@ -10,7 +12,7 @@ public class Ngrams extends AbstractClusteringFunction { } @Override - protected Collection doApply(String s) { + protected Collection doApply(Config conf, String s) { return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen")); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java index 718b88d14b..26b07f0203 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.clustering; import com.google.common.collect.Sets; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Person; import org.apache.commons.lang.StringUtils; @@ -23,7 +24,7 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin } @Override - public Collection apply(final List fields) { + public Collection apply(final Config conf, final List fields) { final Set hashes = Sets.newHashSet(); for (final Field f : fields) { diff --git 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java index fcb01b994b..2020a662fd 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java @@ -6,6 +6,7 @@ import java.util.Map; import com.google.common.collect.Lists; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Person; @ClusteringClass("personhash") @@ -18,7 +19,7 @@ public class PersonHash extends AbstractClusteringFunction { } @Override - protected Collection doApply(final String s) { + protected Collection doApply(final Config conf, final String s) { final List res = Lists.newArrayList(); final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java index f012aacabe..c485fcb9aa 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java @@ -1,5 +1,7 @@ package eu.dnetlib.pace.clustering; +import eu.dnetlib.pace.config.Config; + import java.util.Collection; import java.util.Map; @@ -10,7 +12,7 @@ public class RandomClusteringFunction extends AbstractClusteringFunction { } @Override - protected Collection doApply(String s) { + protected Collection doApply(final Config conf, String s) { // TODO Auto-generated method stub return null; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java index 2f475fe716..55b203d7a2 100644 --- 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java @@ -5,6 +5,7 @@ import java.util.*; import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Lists; +import eu.dnetlib.pace.config.Config; @ClusteringClass("sortedngrampairs") public class SortedNgramPairs extends NgramPairs { @@ -14,7 +15,7 @@ public class SortedNgramPairs extends NgramPairs { } @Override - protected Collection doApply(String s) { + protected Collection doApply(Config conf, String s) { final List tokens = Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().trimResults().split(s)); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java index 22dc4906b3..fd8e7a3cc0 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java @@ -4,6 +4,7 @@ import java.util.Collection; import java.util.List; import java.util.Map; +import eu.dnetlib.pace.config.Config; import org.apache.commons.lang.RandomStringUtils; import org.apache.commons.lang.StringUtils; @@ -17,7 +18,7 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { } @Override - protected Collection doApply(final String s) { + protected Collection doApply(final Config conf, final String s) { final List res = Lists.newArrayList(); res.add(StringUtils.isBlank(s) ? 
RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", "")); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java index 3960331c9f..fa1f643621 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java @@ -5,6 +5,7 @@ import java.util.Map; import java.util.Set; import com.google.common.collect.Sets; +import eu.dnetlib.pace.config.Config; @ClusteringClass("suffixprefix") public class SuffixPrefix extends AbstractClusteringFunction { @@ -14,7 +15,7 @@ public class SuffixPrefix extends AbstractClusteringFunction { } @Override - protected Collection doApply(String s) { + protected Collection doApply(Config conf, String s) { return suffixPrefix(s, param("len"), param("max")); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java index 9955d5fbec..feb60a221d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.clustering; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import java.net.MalformedURLException; @@ -21,7 +22,7 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu } @Override - public Collection apply(List fields) { + public Collection apply(final Config conf, List fields) { try { return fields.stream() .filter(f -> !f.isEmpty()) diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 24379c677e..3050293a0c 100644 
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -13,6 +13,8 @@ import org.apache.commons.collections.CollectionUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; +import java.io.IOException; +import java.io.StringWriter; import java.text.Normalizer; import java.util.*; import java.util.regex.Matcher; @@ -327,12 +329,17 @@ public abstract class AbstractPaceFunctions { return codes; } - public Set getKeywords(String s1, int windowSize) { - return getKeywords(s1, translationMap, windowSize); - } - public Set getCities(String s1, int windowSize) { return getKeywords(s1, cityMap, windowSize); } + public static String readFromClasspath(final String filename, final Class clazz) { + final StringWriter sw = new StringWriter(); + try { + IOUtils.copy(clazz.getResourceAsStream(filename), sw); + return sw.toString(); + } catch (final IOException e) { + throw new RuntimeException("cannot load resource from classpath: " + filename); + } + } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java index 2b4aa29a18..8f6bf3e088 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.condition; import java.util.List; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.distance.eval.ConditionEvalMap; import eu.dnetlib.pace.model.Document; @@ -25,10 +26,10 @@ public abstract class AbstractCondition extends AbstractPaceFunctions implements this.fields = fields; } - protected abstract ConditionEval verify(FieldDef fd, Field a, Field b); + 
protected abstract ConditionEval verify(FieldDef fd, Field a, Field b, Config conf); @Override - public ConditionEvalMap verify(final Document a, final Document b) { + public ConditionEvalMap verify(final Document a, final Document b, final Config conf) { final ConditionEvalMap res = new ConditionEvalMap(); for (final FieldDef fd : getFields()) { @@ -36,12 +37,12 @@ public abstract class AbstractCondition extends AbstractPaceFunctions implements final Field vb = b.values(fd.getName()); if (fd.isIgnoreMissing()) { - res.put(fd.getName(), verify(fd, va, vb)); + res.put(fd.getName(), verify(fd, va, vb, conf)); } else { if (va.isEmpty() || vb.isEmpty()) { res.put(fd.getName(), new ConditionEval(cond, va, vb, -1)); } else { - res.put(fd.getName(), verify(fd, va, vb)); + res.put(fd.getName(), verify(fd, va, vb, conf)); } } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java index 2274da5d5e..633ade3307 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java @@ -1,6 +1,8 @@ package eu.dnetlib.pace.condition; import java.util.List; + +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -18,7 +20,7 @@ public class AlwaysTrueCondition extends AbstractCondition { } @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) { return new ConditionEval(cond, a, b, 1); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java index 787ad9af10..34e6de9271 100644 --- 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java @@ -1,6 +1,8 @@ package eu.dnetlib.pace.condition; import java.util.List; + +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEvalMap; import eu.dnetlib.pace.model.Document; import eu.dnetlib.pace.model.FieldDef; @@ -22,6 +24,6 @@ public interface ConditionAlgo { * @return 0 when condition cannot be verified (ignoremissing = true). Positive int when the condition is verified. Negative int when * the condition is not verified. */ - public abstract ConditionEvalMap verify(Document a, Document b); + public abstract ConditionEvalMap verify(Document a, Document b, Config conf); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java index a4cd847922..755e8153d6 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.condition; import java.util.List; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -20,7 +21,7 @@ public class ExactMatch extends AbstractCondition { } @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) { final String fa = getValue(a); final String fb = getValue(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java index e9925ec6d5..672980ccc4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java +++ 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.condition; import java.util.List; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -20,7 +21,7 @@ public class ExactMatchIgnoreCase extends AbstractCondition { } @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) { final String fa = getValue(a); final String fb = getValue(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java index f2b3bdba4d..630e2341c6 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java @@ -3,6 +3,7 @@ package eu.dnetlib.pace.condition; import java.util.List; import com.google.common.collect.Iterables; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -30,7 +31,8 @@ public class MustBeDifferent extends AbstractCondition { * @see eu.dnetlib.pace.condition.AbstractCondition#verify(eu.dnetlib.pace.model.FieldDef, java.util.List, java.util.List) */ @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) + { final String fa = getValue(a); final String fb = getValue(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java index c15729ed31..92378f30c4 100644 --- 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java @@ -6,6 +6,7 @@ import java.util.Set; import java.util.stream.Collectors; import com.google.common.collect.Sets; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -29,7 +30,8 @@ public class PidMatch extends AbstractCondition { } @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf + ) { final List sa = ((FieldList) a).stringList(); final List sb = ((FieldList) b).stringList(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java index afd0a8eaaf..634320194a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java @@ -4,6 +4,7 @@ import java.util.List; import com.google.common.collect.Iterables; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -32,7 +33,7 @@ public class SizeMatch extends AbstractCondition { * @see eu.dnetlib.pace.condition.AbstractCondition#verify(eu.dnetlib.pace.model.FieldDef, java.util.List, java.util.List) */ @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) { // if (a.isEmpty() & b.isEmpty()) return 1; // diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java index 
4b94a04598..844cbf8421 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.condition; import java.util.List; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -20,7 +21,7 @@ public class TitleVersionMatch extends AbstractCondition { } @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) { final String valueA = getFirstValue(a); final String valueB = getFirstValue(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java index 71bb6cfd6e..af8635cef8 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java @@ -3,6 +3,7 @@ package eu.dnetlib.pace.condition; import java.time.Year; import java.util.List; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.eval.ConditionEval; import org.apache.commons.lang.StringUtils; @@ -34,7 +35,7 @@ public class YearMatch extends AbstractCondition { // } @Override - protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) { final String valueA = getNumbers(getFirstValue(a)); final String valueB = getNumbers(getFirstValue(b)); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java index 7498c23cf7..3ff299ef02 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java +++ 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java @@ -56,4 +56,6 @@ public interface Config { */ public Map> blacklists(); + + public Map translationMap(); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index 1cfcb089c7..f252414dd4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -55,6 +55,7 @@ public class DedupConfig implements Config, Serializable { try { config = new ObjectMapper().readValue(json, DedupConfig.class); config.getPace().initModel(); + config.getPace().initTranslationMap(); return config; } catch (IOException e) { throw new PaceException("Error in parsing configuration json", e); @@ -144,4 +145,9 @@ public class DedupConfig implements Config, Serializable { return getPace().getBlacklists(); } + @Override + public Map translationMap() { + return getPace().translationMap(); + } + } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 4fde1dee95..490fbafc4f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -22,6 +22,10 @@ public class PaceConfig implements Serializable { private List conditions; private List clustering; private Map> blacklists; + private Map> synonyms; + + @JsonIgnore + private Map translationMap; @JsonIgnore private Map modelMap; @@ -33,11 +37,24 @@ public class PaceConfig implements Serializable { public void initModel() { modelMap = Maps.newHashMap(); - for(FieldDef fd : getModel()) { + for (FieldDef fd : getModel()) { modelMap.put(fd.getName(), fd); } } + public void initTranslationMap(){ + translationMap = Maps.newHashMap(); + for (String key : synonyms.keySet()) { + for (String term : 
synonyms.get(key)){ + translationMap.put(term.toLowerCase(), key); + } + } + } + + public Map translationMap(){ + return translationMap; + } + public List getModel() { return model; } @@ -88,6 +105,14 @@ public class PaceConfig implements Serializable { this.blacklists = blacklists; } + public Map> getSynonyms() { + return synonyms; + } + + public void setSynonyms(Map> synonyms) { + this.synonyms = synonyms; + } + public Map getModelMap() { return modelMap; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java index 5e4f69f518..8d2b9bdbb7 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.distance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import java.util.Map; @@ -10,7 +11,7 @@ import java.util.Map; */ public interface DistanceAlgo { - public abstract double distance(Field a, Field b); + public abstract double distance(Field a, Field b, Config conf); public double getWeight(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java index bb3c37ed6e..2f10aca81d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java @@ -49,7 +49,7 @@ public class DistanceScorer { final ConditionEvalMap res = new ConditionEvalMap(); for (final ConditionAlgo cd : conditions) { - final ConditionEvalMap map = cd.verify(a, b); + final ConditionEvalMap map = cd.verify(a, b, config); res.mergeFrom(map); // commented out shortcuts @@ -82,7 +82,7 @@ public class DistanceScorer { } } else { if (va.getType().equals(vb.getType())) { - de.setDistance(w * fd.distanceAlgo().distance(va, vb)); + 
de.setDistance(w * fd.distanceAlgo().distance(va, vb, config)); } else { throw new PaceException(String.format("Types are different: %s:%s - %s:%s", va, va.getType(), vb, vb.getType())); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java index 9cc35298f1..b710ccf4c5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java @@ -7,6 +7,7 @@ import java.util.Map; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; @@ -69,7 +70,7 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp * the b * @return the double */ - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { double score = ssalgo.score(a, b); return normalize(score); } @@ -83,8 +84,8 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp * the b * @return the double */ - protected double distance(final List a, final List b) { - return distance(concat(a), concat(b)); + protected double distance(final List a, final List b, final Config conf) { + return distance(concat(a), concat(b), conf); } /* @@ -93,9 +94,9 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp * @see eu.dnetlib.pace.distance.DistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) */ @Override - public double distance(final Field a, final Field b) { - if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue()); - if 
(a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b)); + public double distance(final Field a, final Field b, final Config conf) { + if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf); + if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf); throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java index 503235c139..bab477e105 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -22,7 +23,7 @@ public class AlwaysMatch extends SecondStringDistanceAlgo { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { return 1.0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java index 44d881e553..66ff3c5d65 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -22,7 
+23,7 @@ public class ExactMatch extends SecondStringDistanceAlgo { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { return a.equals(b) ? 1.0 : 0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java index 20c09121da..9a89fe9eec 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -24,7 +25,7 @@ public class JaroWinkler extends SecondStringDistanceAlgo { } @Override - public double distance(String a, String b) { + public double distance(String a, String b, final Config conf) { String ca = cleanup(a); String cb = cleanup(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java index 546629b087..64ab74c221 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerNormalizedName.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -27,7 +28,7 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo { } @Override - public double distance(String a, String b) { + public double 
distance(String a, String b, final Config conf) { String ca = cleanup(a); String cb = cleanup(b); @@ -37,8 +38,8 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo { ca = filterAllStopWords(ca); cb = filterAllStopWords(cb); - Set keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue()); - Set keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue()); + Set keywords1 = getKeywords(ca, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue()); + Set keywords2 = getKeywords(cb, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue()); Set cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue()); Set cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue()); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java index ff4d6de1f0..134f9726cf 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -23,7 +24,7 @@ public class JaroWinklerTitle extends SecondStringDistanceAlgo { } @Override - public double distance(String a, String b) { + public double distance(String a, String b, final Config conf) { String ca = cleanup(a); String cb = cleanup(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java index 2d05a00844..7b002aeae9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java +++ 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -22,7 +23,7 @@ public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { final String ca = cleanup(a); final String cb = cleanup(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java index 503dc33b2f..f43d319ebc 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.DistanceScorer; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -27,7 +28,7 @@ public class LevensteinTitle extends SecondStringDistanceAlgo { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { final String ca = cleanup(a); final String cb = cleanup(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java index ff8b34bf33..956538ea71 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java +++ 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitleIgnoreVersion.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -25,7 +26,7 @@ public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { String ca = cleanup(a); String cb = cleanup(b); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java index e794f025f9..a1f555f80c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; @@ -22,7 +23,7 @@ public class MustBeDifferent extends SecondStringDistanceAlgo { } @Override - public double distance(final String a, final String b) { + public double distance(final String a, final String b, final Config conf) { return !a.equals(b) ? 
1.0 : 0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java index 8afc45fd6b..16dca1c3de 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.distance.algo; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceAlgo; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.model.Field; @@ -17,7 +18,7 @@ public class NullDistanceAlgo implements DistanceAlgo { } @Override - public double distance(Field a, Field b) { + public double distance(Field a, Field b, final Config conf) { return 0.0; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java index 8f0c024c7f..e2c0007d07 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.distance.algo; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import org.apache.commons.lang.StringUtils; @@ -69,9 +70,9 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo { * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) */ @Override - public double distance(final Field a, final Field b) { + public double distance(final Field a, final Field b, final Config conf) { if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) - return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), 
limit)); + return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit), conf); throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java index eacfdc08fe..5fd05fe0f8 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.distance.algo; +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; @@ -28,7 +29,7 @@ public class UrlMatcher extends Levenstein { } @Override - public double distance(Field a, Field b) { + public double distance(Field a, Field b, final Config conf) { final URL urlA = asUrl(getFirstValue(a)); final URL urlB = asUrl(getFirstValue(b)); @@ -44,7 +45,7 @@ public class UrlMatcher extends Levenstein { return hostW * 0.5; } - return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath()); + return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf); } private URL asUrl(final String value) { diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index 84ec090067..a71894138a 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -1,22 +1,25 @@ package eu.dnetlib.pace.clustering; -import java.util.Map; - import com.google.common.collect.Lists; import com.google.common.collect.Maps; import eu.dnetlib.pace.AbstractPaceTest; import 
eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.distance.DistanceAlgoTest; import org.junit.Before; import org.junit.Test; +import java.util.Map; + public class ClusteringFunctionTest extends AbstractPaceTest { private Map params; + DedupConfig conf; @Before public void setUp() throws Exception { params = Maps.newHashMap(); + conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", ClusteringFunctionTest.class)); } @Test @@ -26,7 +29,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "http://www.test.it/path/to/resource"; System.out.println(s); - System.out.println(urlClustering.apply(Lists.newArrayList(url(s)))); + System.out.println(urlClustering.apply(conf, Lists.newArrayList(url(s)))); } @Test @@ -40,7 +43,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(ngram.apply(Lists.newArrayList(title(s)))); + System.out.println(ngram.apply(conf, Lists.newArrayList(title(s)))); } @Test @@ -52,7 +55,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(np.apply(Lists.newArrayList(title(s)))); + System.out.println(np.apply(conf, Lists.newArrayList(title(s)))); } @Test @@ -64,11 +67,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s1 = "University of Pisa"; System.out.println(s1); - System.out.println(np.apply(Lists.newArrayList(title(s1)))); + System.out.println(np.apply(conf, Lists.newArrayList(title(s1)))); final String s2 = "Pisa University"; System.out.println(s2); - System.out.println(np.apply(Lists.newArrayList(title(s2)))); + System.out.println(np.apply(conf, Lists.newArrayList(title(s2)))); } @Test @@ -81,7 
+84,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(acro.apply(Lists.newArrayList(title(s)))); + System.out.println(acro.apply(conf, Lists.newArrayList(title(s)))); } @Test @@ -93,7 +96,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(sp.apply(Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); } @Test @@ -105,7 +108,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = "Search for the Standard Model Higgs Boson"; System.out.println(s); - System.out.println(sp.apply(Lists.newArrayList(title(s)))); + System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); } @Test @@ -114,7 +117,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final String s = readFromClasspath("gt.author.json"); System.out.println(s); - System.out.println(cf.apply(Lists.newArrayList(person(s)))); + System.out.println(cf.apply(conf, Lists.newArrayList(person(s)))); } @Test @@ -123,27 +126,27 @@ public class ClusteringFunctionTest extends AbstractPaceTest { final ClusteringFunction cf = new KeywordsClustering(params); final String s = "Polytechnic University of Turin"; System.out.println(s); - System.out.println(cf.apply(Lists.newArrayList(title(s)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s)))); final String s1 = "POLITECNICO DI TORINO"; System.out.println(s1); - System.out.println(cf.apply(Lists.newArrayList(title(s1)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s1)))); final String s2 = "Universita farmaceutica culturale di milano bergamo"; System.out.println("s2 = " + s2); - System.out.println(cf.apply(Lists.newArrayList(title(s2)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s2)))); 
final String s3 = "universita universita milano milano"; System.out.println("s3 = " + s3); - System.out.println(cf.apply(Lists.newArrayList(title(s3)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s3)))); final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)"; System.out.println("s4 = " + s4); - System.out.println(cf.apply(Lists.newArrayList(title(s4)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s4)))); final String s5 = "İstanbul Ticarət Universiteti"; System.out.println("s5 = " + s5); - System.out.println(cf.apply(Lists.newArrayList(title(s5)))); + System.out.println(cf.apply(conf, Lists.newArrayList(title(s5)))); } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java index 3bf300bae8..97773f157a 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.distance; import eu.dnetlib.pace.clustering.NGramUtils; +import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName; import org.junit.Before; import org.junit.Test; @@ -17,13 +18,13 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾."; private Map params; + private DedupConfig conf; @Before public void setup() { - System.out.println("****************************************************************"); - System.out.println("Test String : " + TEST_STRING); params = new HashMap<>(); params.put("weight", 1.0); + conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", DistanceAlgoTest.class)); } @Test @@ -55,7 +56,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { @Test public void 
testJaroWinklerNormalizedName() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State"); + double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State", conf); System.out.println("result = " + result); assertEquals(0.0, result); @@ -65,7 +66,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName2() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York"); + double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York", conf); assertEquals(1.0, result); } @@ -74,7 +75,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName3() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna"); + double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf); System.out.println("result = " + result); assertEquals(0.0, result); @@ -84,7 +85,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName4() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa"); + double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa", conf); System.out.println("result = " + result); assertEquals(1.0, result); @@ -94,7 
+95,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName5() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS"); + double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS", conf); System.out.println("result = " + result); assertEquals(1.0, result); @@ -104,7 +105,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName6() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung"); + double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf); System.out.println("result = " + result); assertTrue(result > 0.9); @@ -115,7 +116,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName7() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO"); + double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf); System.out.println("result = " + result); assertTrue(result > 0.9); @@ -125,7 +126,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName8() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = 
jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology"); + double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf); System.out.println("result = " + result); } @@ -134,7 +135,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { public void testJaroWinklerNormalizedName9() { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti"); + double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf); System.out.println("result = " + result); } @@ -144,7 +145,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence"); + double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf); System.out.println("result = " + result); } diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf index fd4fbbe799..0293680514 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/org.curr.conf @@ -5,7 +5,7 @@ "entityType" : "organization", "orderField" : "legalname", "queueMaxSize" : "2000", - "groupMaxSize" : "10", + "groupMaxSize" : "50", "slidingWindowSize" : "200", "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], "includeChildren" : "true" @@ -14,23 
+14,131 @@ "clustering" : [ { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, - { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }, + { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } ], "strictConditions" : [ { "name" : "exactMatch", "fields" : [ "gridid" ] } ], "conditions" : [ - { "name" : "exactMatch", "fields" : [ "country" ] }, - { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] } + { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }, + { "name" : "exactMatch", "fields" : [ "country" ] } ], "model" : [ - { "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, - { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" }, + { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" }, { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, - { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} }, + { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} }, { "name" : "websiteurl", "algo" : "Null", "type" : 
"URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } ], - "blacklists" : { } + "blacklists" : { + "legalname" : [] + }, + "synonyms": { + "key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"], + "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], + "key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], + "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], + "key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"], + "key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"], + "key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"], + "key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"], + "key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"], + "key::10": 
["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"], + "key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"], + "key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"], + "key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"], + "key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"], + "key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"], + "key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"], + "key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"], + "key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"], + "key::19": ["museum","museo","musée","museo","museu","museum","muzeum","музей","museum","μουσείο"], + "key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"], + "key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"], + "key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"], + "key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"], + "key::24": 
["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"], + "key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"], + "key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","центральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"], + "key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"], + "key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"], + "key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"], + "key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"], + "key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"], + "key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"], + "key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"], + "key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"], + "key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"], + "key::36": ["authority","autorità","autorité","авторитет","autoriteit"], + "key::37": 
["federation","federazione","fédération","федерация","federatie","ομοσπονδία"], + "key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"], + "key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"], + "key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"], + "key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"], + "key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"], + "key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"], + "key::44": ["academic","accademico","académique","universitaire","академический","academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"], + "key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"], + "key::46": ["division","divisione","division","отделение","divisie","τμήμα"], + "key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"], + "key::48": ["promotion","promozione","продвижение","proothisis","forderung"], + "key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"], + "key::50": 
["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"], + "key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"], + "key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"], + "key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"], + "key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"], + "key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"], + "key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"], + "key::57": 
["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"], + "key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"], + "key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"], + "key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"], + "key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"], + "key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"], + "key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"], + "key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"], + "key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"], + "key::66": 
["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"], + "key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"], + "key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"], + "key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"], + "key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärztlich","veterinair","veeartsenijkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"], + "key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"], + "key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"], + "key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"], + "key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"], + "key::75": ["theological","teologia","teologico","teológico","teológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","teológiai","teološki","teoloogia","usuteadus","teoloogiline"], + "key::76": 
["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"], + "key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"], + "key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"], + "key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"], + "key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"], + "key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"], + "key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"], + "key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"], + "key::84": ["communication","comunicazione","comunicación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"], + "key::85": 
["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"], + "key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"], + "key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δερματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"], + "key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"], + "key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"], + "key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"], + "key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"], + "key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"], + "key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"], + "key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"], + "key::95": 
["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"], + "key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszichiátria","psihiatrija","psühhaatria"], + "key::97": ["psychology","psicologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszichológia","psihologija","psühholoogia"], + "key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"], + "key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"], + "key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"], + "key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"], + "key::102": ["informatics","informatica","informática","informática","informatica"], + "key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"], + "key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"] + } } } \ No newline at end of file