forked from D-Net/dnet-hadoop
Translation map moved into the JSON configuration; support for synonyms added to the configuration; the configuration is now passed as an argument to conditions, distance algorithms and clustering functions
This commit is contained in:
parent
07355d2811
commit
26b383fea2
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
@ -18,15 +19,15 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
|
|||
this.params = params;
|
||||
}
|
||||
|
||||
protected abstract Collection<String> doApply(String s);
|
||||
protected abstract Collection<String> doApply(Config conf, String s);
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(List<Field> fields) {
|
||||
public Collection<String> apply(Config conf, List<Field> fields) {
|
||||
return fields.stream().filter(f -> !f.isEmpty())
|
||||
.map(Field::stringValue)
|
||||
.map(this::normalize)
|
||||
.map(s -> filterAllStopWords(s))
|
||||
.map(this::doApply)
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.Set;
|
|||
import java.util.StringTokenizer;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("acronyms")
|
||||
public class Acronyms extends AbstractClusteringFunction {
|
||||
|
@ -15,7 +16,7 @@ public class Acronyms extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
|
||||
}
|
||||
|
||||
|
|
|
@ -13,15 +13,15 @@ import eu.dnetlib.pace.model.Field;
|
|||
public class ClusteringCombiner {
|
||||
|
||||
public static Collection<String> combine(final Document a, final Config conf) {
|
||||
return new ClusteringCombiner().doCombine(a, conf.clusterings());
|
||||
return new ClusteringCombiner().doCombine(a, conf);
|
||||
}
|
||||
|
||||
private Collection<String> doCombine(final Document a, final List<ClusteringDef> defs) {
|
||||
private Collection<String> doCombine(final Document a, final Config conf) {
|
||||
final Collection<String> res = Sets.newLinkedHashSet();
|
||||
for (final ClusteringDef cd : defs) {
|
||||
for (final ClusteringDef cd : conf.clusterings()) {
|
||||
for (final String fieldName : cd.getFields()) {
|
||||
final Field values = a.values(fieldName);
|
||||
res.addAll(cd.clusteringFunction().apply((List<Field>) values));
|
||||
res.addAll(cd.clusteringFunction().apply(conf, (List<Field>) values));
|
||||
}
|
||||
}
|
||||
return res;
|
||||
|
|
|
@ -4,11 +4,12 @@ import java.util.Collection;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
public interface ClusteringFunction {
|
||||
|
||||
public Collection<String> apply(List<Field> fields);
|
||||
public Collection<String> apply(Config config, List<Field> fields);
|
||||
|
||||
public Map<String, Integer> getParams();
|
||||
|
||||
|
|
|
@ -5,6 +5,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("immutablefieldvalue")
|
||||
public class ImmutableFieldValue extends AbstractClusteringFunction {
|
||||
|
@ -14,7 +15,7 @@ public class ImmutableFieldValue extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final String s) {
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
res.add(s);
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
@ -15,10 +16,10 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
protected Collection<String> doApply(final Config conf, String s) {
|
||||
|
||||
//takes city codes and keywords codes without duplicates
|
||||
Set<String> keywords = getKeywords(s, params.getOrDefault("windowSize", 4));
|
||||
Set<String> keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
|
||||
Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4));
|
||||
|
||||
//list of combination to return as result
|
||||
|
@ -37,13 +38,13 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(List<Field> fields) {
|
||||
public Collection<String> apply(final Config conf, List<Field> fields) {
|
||||
return fields.stream().filter(f -> !f.isEmpty())
|
||||
.map(Field::stringValue)
|
||||
.map(this::cleanup) //TODO can I add this to the AbstractClusteringFunction without overriding the method here?
|
||||
.map(this::normalize)
|
||||
.map(s -> filterAllStopWords(s))
|
||||
.map(this::doApply)
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
||||
.filter(StringUtils::isNotBlank)
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.Map;
|
|||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
@ -17,16 +18,16 @@ public class LowercaseClustering extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(List<Field> fields) {
|
||||
public Collection<String> apply(Config conf, List<Field> fields) {
|
||||
Collection<String> c = Sets.newLinkedHashSet();
|
||||
for(Field f : fields) {
|
||||
c.addAll(doApply(f.stringValue()));
|
||||
c.addAll(doApply(conf, f.stringValue()));
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final String s) {
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
if(StringUtils.isBlank(s)) {
|
||||
return Lists.newArrayList();
|
||||
}
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("ngrampairs")
|
||||
public class NgramPairs extends Ngrams {
|
||||
|
@ -15,7 +16,7 @@ public class NgramPairs extends Ngrams {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max"));
|
||||
}
|
||||
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
@ClusteringClass("ngrams")
|
||||
|
@ -10,7 +12,7 @@ public class Ngrams extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
|
||||
}
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.clustering;
|
|||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
@ -23,7 +24,7 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
|
|||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(final List<Field> fields) {
|
||||
public Collection<String> apply(final Config conf, final List<Field> fields) {
|
||||
final Set<String> hashes = Sets.newHashSet();
|
||||
|
||||
for (final Field f : fields) {
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.Map;
|
|||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
|
||||
@ClusteringClass("personhash")
|
||||
|
@ -18,7 +19,7 @@ public class PersonHash extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final String s) {
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -10,7 +12,7 @@ public class RandomClusteringFunction extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
protected Collection<String> doApply(final Config conf, String s) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -5,6 +5,7 @@ import java.util.*;
|
|||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("sortedngrampairs")
|
||||
public class SortedNgramPairs extends NgramPairs {
|
||||
|
@ -14,7 +15,7 @@ public class SortedNgramPairs extends NgramPairs {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
|
||||
final List<String> tokens = Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().trimResults().split(s));
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ import java.util.Collection;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import org.apache.commons.lang.RandomStringUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
@ -17,7 +18,7 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final String s) {
|
||||
protected Collection<String> doApply(final Config conf, final String s) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", ""));
|
||||
|
|
|
@ -5,6 +5,7 @@ import java.util.Map;
|
|||
import java.util.Set;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("suffixprefix")
|
||||
public class SuffixPrefix extends AbstractClusteringFunction {
|
||||
|
@ -14,7 +15,7 @@ public class SuffixPrefix extends AbstractClusteringFunction {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
protected Collection<String> doApply(Config conf, String s) {
|
||||
return suffixPrefix(s, param("len"), param("max"));
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
|
@ -21,7 +22,7 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu
|
|||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(List<Field> fields) {
|
||||
public Collection<String> apply(final Config conf, List<Field> fields) {
|
||||
try {
|
||||
return fields.stream()
|
||||
.filter(f -> !f.isEmpty())
|
||||
|
|
|
@ -13,6 +13,8 @@ import org.apache.commons.collections.CollectionUtils;
|
|||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
|
@ -327,12 +329,17 @@ public abstract class AbstractPaceFunctions {
|
|||
return codes;
|
||||
}
|
||||
|
||||
public Set<String> getKeywords(String s1, int windowSize) {
|
||||
return getKeywords(s1, translationMap, windowSize);
|
||||
}
|
||||
|
||||
public Set<String> getCities(String s1, int windowSize) {
|
||||
return getKeywords(s1, cityMap, windowSize);
|
||||
}
|
||||
|
||||
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
|
||||
final StringWriter sw = new StringWriter();
|
||||
try {
|
||||
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
|
||||
return sw.toString();
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.condition;
|
|||
|
||||
import java.util.List;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
||||
import eu.dnetlib.pace.model.Document;
|
||||
|
@ -25,10 +26,10 @@ public abstract class AbstractCondition extends AbstractPaceFunctions implements
|
|||
this.fields = fields;
|
||||
}
|
||||
|
||||
protected abstract ConditionEval verify(FieldDef fd, Field a, Field b);
|
||||
protected abstract ConditionEval verify(FieldDef fd, Field a, Field b, Config conf);
|
||||
|
||||
@Override
|
||||
public ConditionEvalMap verify(final Document a, final Document b) {
|
||||
public ConditionEvalMap verify(final Document a, final Document b, final Config conf) {
|
||||
final ConditionEvalMap res = new ConditionEvalMap();
|
||||
for (final FieldDef fd : getFields()) {
|
||||
|
||||
|
@ -36,12 +37,12 @@ public abstract class AbstractCondition extends AbstractPaceFunctions implements
|
|||
final Field vb = b.values(fd.getName());
|
||||
|
||||
if (fd.isIgnoreMissing()) {
|
||||
res.put(fd.getName(), verify(fd, va, vb));
|
||||
res.put(fd.getName(), verify(fd, va, vb, conf));
|
||||
} else {
|
||||
if (va.isEmpty() || vb.isEmpty()) {
|
||||
res.put(fd.getName(), new ConditionEval(cond, va, vb, -1));
|
||||
} else {
|
||||
res.put(fd.getName(), verify(fd, va, vb));
|
||||
res.put(fd.getName(), verify(fd, va, vb, conf));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
@ -18,7 +20,7 @@ public class AlwaysTrueCondition extends AbstractCondition {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) {
|
||||
return new ConditionEval(cond, a, b, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
||||
import eu.dnetlib.pace.model.Document;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
@ -22,6 +24,6 @@ public interface ConditionAlgo {
|
|||
* @return 0 when condition cannot be verified (ignoremissing = true). Positive int when the condition is verified. Negative int when
|
||||
* the condition is not verified.
|
||||
*/
|
||||
public abstract ConditionEvalMap verify(Document a, Document b);
|
||||
public abstract ConditionEvalMap verify(Document a, Document b, Config conf);
|
||||
|
||||
}
|
||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.condition;
|
|||
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
@ -20,7 +21,7 @@ public class ExactMatch extends AbstractCondition {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) {
|
||||
|
||||
final String fa = getValue(a);
|
||||
final String fb = getValue(b);
|
||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.condition;
|
|||
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
@ -20,7 +21,7 @@ public class ExactMatchIgnoreCase extends AbstractCondition {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) {
|
||||
|
||||
final String fa = getValue(a);
|
||||
final String fb = getValue(b);
|
||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.pace.condition;
|
|||
import java.util.List;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
@ -30,7 +31,8 @@ public class MustBeDifferent extends AbstractCondition {
|
|||
* @see eu.dnetlib.pace.condition.AbstractCondition#verify(eu.dnetlib.pace.model.FieldDef, eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field, eu.dnetlib.pace.config.Config)
|
||||
*/
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf)
|
||||
{
|
||||
|
||||
final String fa = getValue(a);
|
||||
final String fb = getValue(b);
|
||||
|
|
|
@ -6,6 +6,7 @@ import java.util.Set;
|
|||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
@ -29,7 +30,8 @@ public class PidMatch extends AbstractCondition {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf
|
||||
) {
|
||||
|
||||
final List<String> sa = ((FieldList) a).stringList();
|
||||
final List<String> sb = ((FieldList) b).stringList();
|
||||
|
|
|
@ -4,6 +4,7 @@ import java.util.List;
|
|||
|
||||
import com.google.common.collect.Iterables;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
@ -32,7 +33,7 @@ public class SizeMatch extends AbstractCondition {
|
|||
* @see eu.dnetlib.pace.condition.AbstractCondition#verify(eu.dnetlib.pace.model.FieldDef, eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field, eu.dnetlib.pace.config.Config)
|
||||
*/
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) {
|
||||
|
||||
// if (a.isEmpty() & b.isEmpty()) return 1;
|
||||
//
|
||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.condition;
|
|||
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
@ -20,7 +21,7 @@ public class TitleVersionMatch extends AbstractCondition {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) {
|
||||
final String valueA = getFirstValue(a);
|
||||
final String valueB = getFirstValue(b);
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.pace.condition;
|
|||
import java.time.Year;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
@ -34,7 +35,7 @@ public class YearMatch extends AbstractCondition {
|
|||
// }
|
||||
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b, final Config conf) {
|
||||
final String valueA = getNumbers(getFirstValue(a));
|
||||
final String valueB = getNumbers(getFirstValue(b));
|
||||
|
||||
|
|
|
@ -56,4 +56,6 @@ public interface Config {
|
|||
*/
|
||||
public Map<String, List<String>> blacklists();
|
||||
|
||||
|
||||
public Map<String, String> translationMap();
|
||||
}
|
||||
|
|
|
@ -55,6 +55,7 @@ public class DedupConfig implements Config, Serializable {
|
|||
try {
|
||||
config = new ObjectMapper().readValue(json, DedupConfig.class);
|
||||
config.getPace().initModel();
|
||||
config.getPace().initTranslationMap();
|
||||
return config;
|
||||
} catch (IOException e) {
|
||||
throw new PaceException("Error in parsing configuration json", e);
|
||||
|
@ -144,4 +145,9 @@ public class DedupConfig implements Config, Serializable {
|
|||
return getPace().getBlacklists();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String> translationMap() {
|
||||
return getPace().translationMap();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -22,6 +22,10 @@ public class PaceConfig implements Serializable {
|
|||
private List<CondDef> conditions;
|
||||
private List<ClusteringDef> clustering;
|
||||
private Map<String, List<String>> blacklists;
|
||||
private Map<String, List<String>> synonyms;
|
||||
|
||||
@JsonIgnore
|
||||
private Map<String, String> translationMap;
|
||||
|
||||
@JsonIgnore
|
||||
private Map<String, FieldDef> modelMap;
|
||||
|
@ -33,11 +37,24 @@ public class PaceConfig implements Serializable {
|
|||
|
||||
public void initModel() {
|
||||
modelMap = Maps.newHashMap();
|
||||
for(FieldDef fd : getModel()) {
|
||||
for (FieldDef fd : getModel()) {
|
||||
modelMap.put(fd.getName(), fd);
|
||||
}
|
||||
}
|
||||
|
||||
public void initTranslationMap(){
|
||||
translationMap = Maps.newHashMap();
|
||||
for (String key : synonyms.keySet()) {
|
||||
for (String term : synonyms.get(key)){
|
||||
translationMap.put(term.toLowerCase(), key);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Map<String, String> translationMap(){
|
||||
return translationMap;
|
||||
}
|
||||
|
||||
public List<FieldDef> getModel() {
|
||||
return model;
|
||||
}
|
||||
|
@ -88,6 +105,14 @@ public class PaceConfig implements Serializable {
|
|||
this.blacklists = blacklists;
|
||||
}
|
||||
|
||||
public Map<String, List<String>> getSynonyms() {
|
||||
return synonyms;
|
||||
}
|
||||
|
||||
public void setSynonyms(Map<String, List<String>> synonyms) {
|
||||
this.synonyms = synonyms;
|
||||
}
|
||||
|
||||
public Map<String, FieldDef> getModelMap() {
|
||||
return modelMap;
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
import java.util.Map;
|
||||
|
@ -10,7 +11,7 @@ import java.util.Map;
|
|||
*/
|
||||
public interface DistanceAlgo {
|
||||
|
||||
public abstract double distance(Field a, Field b);
|
||||
public abstract double distance(Field a, Field b, Config conf);
|
||||
|
||||
public double getWeight();
|
||||
|
||||
|
|
|
@ -49,7 +49,7 @@ public class DistanceScorer {
|
|||
final ConditionEvalMap res = new ConditionEvalMap();
|
||||
|
||||
for (final ConditionAlgo cd : conditions) {
|
||||
final ConditionEvalMap map = cd.verify(a, b);
|
||||
final ConditionEvalMap map = cd.verify(a, b, config);
|
||||
res.mergeFrom(map);
|
||||
|
||||
// commented out shortcuts
|
||||
|
@ -82,7 +82,7 @@ public class DistanceScorer {
|
|||
}
|
||||
} else {
|
||||
if (va.getType().equals(vb.getType())) {
|
||||
de.setDistance(w * fd.distanceAlgo().distance(va, vb));
|
||||
de.setDistance(w * fd.distanceAlgo().distance(va, vb, config));
|
||||
} else {
|
||||
throw new PaceException(String.format("Types are different: %s:%s - %s:%s", va, va.getType(), vb, vb.getType()));
|
||||
}
|
||||
|
|
|
@ -7,6 +7,7 @@ import java.util.Map;
|
|||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldList;
|
||||
|
@ -69,7 +70,7 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
|
|||
* the b
|
||||
* @return the double
|
||||
*/
|
||||
public double distance(final String a, final String b) {
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
double score = ssalgo.score(a, b);
|
||||
return normalize(score);
|
||||
}
|
||||
|
@ -83,8 +84,8 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
|
|||
* the b
|
||||
* @return the double
|
||||
*/
|
||||
protected double distance(final List<String> a, final List<String> b) {
|
||||
return distance(concat(a), concat(b));
|
||||
protected double distance(final List<String> a, final List<String> b, final Config conf) {
|
||||
return distance(concat(a), concat(b), conf);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -93,9 +94,9 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
|
|||
* @see eu.dnetlib.pace.distance.DistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field, eu.dnetlib.pace.config.Config)
|
||||
*/
|
||||
@Override
|
||||
public double distance(final Field a, final Field b) {
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue());
|
||||
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b));
|
||||
public double distance(final Field a, final Field b, final Config conf) {
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf);
|
||||
if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf);
|
||||
|
||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
|
@ -22,7 +23,7 @@ public class AlwaysMatch extends SecondStringDistanceAlgo {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
|
@ -22,7 +23,7 @@ public class ExactMatch extends SecondStringDistanceAlgo {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
return a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
|
@ -24,7 +25,7 @@ public class JaroWinkler extends SecondStringDistanceAlgo {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.distance.algo;
|
|||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
|
@ -27,7 +28,7 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
|
@ -37,8 +38,8 @@ public class JaroWinklerNormalizedName extends SecondStringDistanceAlgo {
|
|||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> keywords1 = getKeywords(ca, params.getOrDefault("windowSize", 4).intValue());
|
||||
Set<String> keywords2 = getKeywords(cb, params.getOrDefault("windowSize", 4).intValue());
|
||||
Set<String> keywords1 = getKeywords(ca, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());
|
||||
Set<String> keywords2 = getKeywords(cb, conf.translationMap(), params.getOrDefault("windowSize", 4).intValue());
|
||||
|
||||
Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
|
||||
Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
|
@ -23,7 +24,7 @@ public class JaroWinklerTitle extends SecondStringDistanceAlgo {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
|
@ -22,7 +23,7 @@ public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
final String ca = cleanup(a);
|
||||
final String cb = cleanup(b);
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.DistanceScorer;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
@ -27,7 +28,7 @@ public class LevensteinTitle extends SecondStringDistanceAlgo {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
final String ca = cleanup(a);
|
||||
final String cb = cleanup(b);
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
|
@ -25,7 +26,7 @@ public class LevensteinTitleIgnoreVersion extends SecondStringDistanceAlgo {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
|
@ -22,7 +23,7 @@ public class MustBeDifferent extends SecondStringDistanceAlgo {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b) {
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
return !a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.DistanceAlgo;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
@ -17,7 +18,7 @@ public class NullDistanceAlgo implements DistanceAlgo {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(Field a, Field b) {
|
||||
public double distance(Field a, Field b, final Config conf) {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
@ -69,9 +70,9 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
|||
* @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field)
|
||||
*/
|
||||
@Override
|
||||
public double distance(final Field a, final Field b) {
|
||||
public double distance(final Field a, final Field b, final Config conf) {
|
||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
|
||||
return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit));
|
||||
return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit), conf);
|
||||
|
||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
@ -28,7 +29,7 @@ public class UrlMatcher extends Levenstein {
|
|||
}
|
||||
|
||||
@Override
|
||||
public double distance(Field a, Field b) {
|
||||
public double distance(Field a, Field b, final Config conf) {
|
||||
|
||||
final URL urlA = asUrl(getFirstValue(a));
|
||||
final URL urlB = asUrl(getFirstValue(b));
|
||||
|
@ -44,7 +45,7 @@ public class UrlMatcher extends Levenstein {
|
|||
return hostW * 0.5;
|
||||
}
|
||||
|
||||
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath());
|
||||
return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf);
|
||||
}
|
||||
|
||||
private URL asUrl(final String value) {
|
||||
|
|
|
@ -1,22 +1,25 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import eu.dnetlib.pace.AbstractPaceTest;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.distance.DistanceAlgoTest;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||
|
||||
private Map<String, Integer> params;
|
||||
DedupConfig conf;
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
params = Maps.newHashMap();
|
||||
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", ClusteringFunctionTest.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -26,7 +29,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s = "http://www.test.it/path/to/resource";
|
||||
System.out.println(s);
|
||||
System.out.println(urlClustering.apply(Lists.newArrayList(url(s))));
|
||||
System.out.println(urlClustering.apply(conf, Lists.newArrayList(url(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -40,7 +43,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s = "Search for the Standard Model Higgs Boson";
|
||||
System.out.println(s);
|
||||
System.out.println(ngram.apply(Lists.newArrayList(title(s))));
|
||||
System.out.println(ngram.apply(conf, Lists.newArrayList(title(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -52,7 +55,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s = "Search for the Standard Model Higgs Boson";
|
||||
System.out.println(s);
|
||||
System.out.println(np.apply(Lists.newArrayList(title(s))));
|
||||
System.out.println(np.apply(conf, Lists.newArrayList(title(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -64,11 +67,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s1 = "University of Pisa";
|
||||
System.out.println(s1);
|
||||
System.out.println(np.apply(Lists.newArrayList(title(s1))));
|
||||
System.out.println(np.apply(conf, Lists.newArrayList(title(s1))));
|
||||
|
||||
final String s2 = "Pisa University";
|
||||
System.out.println(s2);
|
||||
System.out.println(np.apply(Lists.newArrayList(title(s2))));
|
||||
System.out.println(np.apply(conf, Lists.newArrayList(title(s2))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -81,7 +84,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s = "Search for the Standard Model Higgs Boson";
|
||||
System.out.println(s);
|
||||
System.out.println(acro.apply(Lists.newArrayList(title(s))));
|
||||
System.out.println(acro.apply(conf, Lists.newArrayList(title(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -93,7 +96,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s = "Search for the Standard Model Higgs Boson";
|
||||
System.out.println(s);
|
||||
System.out.println(sp.apply(Lists.newArrayList(title(s))));
|
||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -105,7 +108,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s = "Search for the Standard Model Higgs Boson";
|
||||
System.out.println(s);
|
||||
System.out.println(sp.apply(Lists.newArrayList(title(s))));
|
||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -114,7 +117,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
|
||||
final String s = readFromClasspath("gt.author.json");
|
||||
System.out.println(s);
|
||||
System.out.println(cf.apply(Lists.newArrayList(person(s))));
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(person(s))));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -123,27 +126,27 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
final ClusteringFunction cf = new KeywordsClustering(params);
|
||||
final String s = "Polytechnic University of Turin";
|
||||
System.out.println(s);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s))));
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
|
||||
|
||||
final String s1 = "POLITECNICO DI TORINO";
|
||||
System.out.println(s1);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s1))));
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
|
||||
|
||||
final String s2 = "Universita farmaceutica culturale di milano bergamo";
|
||||
System.out.println("s2 = " + s2);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s2))));
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s2))));
|
||||
|
||||
final String s3 = "universita universita milano milano";
|
||||
System.out.println("s3 = " + s3);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s3))));
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s3))));
|
||||
|
||||
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
|
||||
System.out.println("s4 = " + s4);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s4))));
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s4))));
|
||||
|
||||
final String s5 = "İstanbul Ticarət Universiteti";
|
||||
System.out.println("s5 = " + s5);
|
||||
System.out.println(cf.apply(Lists.newArrayList(title(s5))));
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s5))));
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
@ -17,13 +18,13 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
|
||||
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
|
||||
private Map<String, Number> params;
|
||||
private DedupConfig conf;
|
||||
|
||||
@Before
|
||||
public void setup() {
|
||||
System.out.println("****************************************************************");
|
||||
System.out.println("Test String : " + TEST_STRING);
|
||||
params = new HashMap<>();
|
||||
params.put("weight", 1.0);
|
||||
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", DistanceAlgoTest.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -55,7 +56,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
@Test
|
||||
public void testJaroWinklerNormalizedName() {
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State");
|
||||
double result = jaroWinklerNormalizedName.distance("Free University of Bozen-Bolzano", "University of the Free State", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
assertEquals(0.0, result);
|
||||
|
@ -65,7 +66,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
public void testJaroWinklerNormalizedName2() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York");
|
||||
double result = jaroWinklerNormalizedName.distance("University of New York", "Università di New York", conf);
|
||||
|
||||
assertEquals(1.0, result);
|
||||
}
|
||||
|
@ -74,7 +75,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
public void testJaroWinklerNormalizedName3() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna");
|
||||
double result = jaroWinklerNormalizedName.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
assertEquals(0.0, result);
|
||||
|
@ -84,7 +85,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
public void testJaroWinklerNormalizedName4() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa");
|
||||
double result = jaroWinklerNormalizedName.distance("Universita degli studi di Pisa", "Universita di Pisa", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
assertEquals(1.0, result);
|
||||
|
@ -94,7 +95,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
public void testJaroWinklerNormalizedName5() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS");
|
||||
double result = jaroWinklerNormalizedName.distance("RESEARCH PROMOTION FOUNDATION", "IDRYMA PROOTHISIS EREVNAS", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
assertEquals(1.0, result);
|
||||
|
@ -104,7 +105,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
public void testJaroWinklerNormalizedName6() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung");
|
||||
double result = jaroWinklerNormalizedName.distance("Fonds zur Förderung der wissenschaftlichen Forschung (Austrian Science Fund)", "Fonds zur Förderung der wissenschaftlichen Forschung", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
assertTrue(result > 0.9);
|
||||
|
@ -115,7 +116,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
public void testJaroWinklerNormalizedName7() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO");
|
||||
double result = jaroWinklerNormalizedName.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
assertTrue(result > 0.9);
|
||||
|
@ -125,7 +126,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
public void testJaroWinklerNormalizedName8() {
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
|
||||
double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology");
|
||||
double result = jaroWinklerNormalizedName.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
}
|
||||
|
@ -134,7 +135,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
public void testJaroWinklerNormalizedName9() {
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
|
||||
double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti");
|
||||
double result = jaroWinklerNormalizedName.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
}
|
||||
|
@ -144,7 +145,7 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
|
||||
double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence");
|
||||
double result = jaroWinklerNormalizedName.distance("Firenze University Press", "University of Florence", conf);
|
||||
|
||||
System.out.println("result = " + result);
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
"entityType" : "organization",
|
||||
"orderField" : "legalname",
|
||||
"queueMaxSize" : "2000",
|
||||
"groupMaxSize" : "10",
|
||||
"groupMaxSize" : "50",
|
||||
"slidingWindowSize" : "200",
|
||||
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||
"includeChildren" : "true"
|
||||
|
@ -14,23 +14,131 @@
|
|||
"clustering" : [
|
||||
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
|
||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
||||
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
||||
],
|
||||
"strictConditions" : [
|
||||
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
|
||||
],
|
||||
"conditions" : [
|
||||
{ "name" : "exactMatch", "fields" : [ "country" ] },
|
||||
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }
|
||||
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
|
||||
{ "name" : "exactMatch", "fields" : [ "country" ] }
|
||||
],
|
||||
"model" : [
|
||||
{ "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
|
||||
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
|
||||
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
|
||||
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
|
||||
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} },
|
||||
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
|
||||
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
|
||||
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
|
||||
],
|
||||
"blacklists" : { }
|
||||
"blacklists" : {
|
||||
"legalname" : []
|
||||
},
|
||||
"synonyms": {
|
||||
"key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
|
||||
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
|
||||
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
|
||||
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
|
||||
"key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
|
||||
"key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
|
||||
"key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"],
|
||||
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
|
||||
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
|
||||
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
|
||||
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
|
||||
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
|
||||
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
|
||||
"key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],
|
||||
"key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],
|
||||
"key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],
|
||||
"key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],
|
||||
"key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],
|
||||
"key::19": ["museum","museo","musée","museo","museu","museum","muzeum","музей","museum","μουσείο"],
|
||||
"key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],
|
||||
"key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],
|
||||
"key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],
|
||||
"key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],
|
||||
"key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],
|
||||
"key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],
|
||||
"key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","центральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],
|
||||
"key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],
|
||||
"key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],
|
||||
"key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],
|
||||
"key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],
|
||||
"key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
|
||||
"key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"],
|
||||
"key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],
|
||||
"key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],
|
||||
"key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],
|
||||
"key::36": ["authority","autorità","autorité","авторитет","autoriteit"],
|
||||
"key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],
|
||||
"key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],
|
||||
"key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"],
|
||||
"key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],
|
||||
"key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],
|
||||
"key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],
|
||||
"key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],
|
||||
"key::44": ["academic","accademico","académique","universitaire","академический","academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],
|
||||
"key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],
|
||||
"key::46": ["division","divisione","division","отделение","divisie","τμήμα"],
|
||||
"key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"],
|
||||
"key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
|
||||
"key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
|
||||
"key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
|
||||
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"],
|
||||
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"],
|
||||
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"],
|
||||
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"],
|
||||
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"],
|
||||
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"],
|
||||
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"],
|
||||
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"],
|
||||
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"],
|
||||
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"],
|
||||
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"],
|
||||
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"],
|
||||
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"],
|
||||
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"],
|
||||
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"],
|
||||
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"],
|
||||
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"],
|
||||
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"],
|
||||
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"],
|
||||
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärztlich","veterinair","veeartsenijkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"],
|
||||
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"],
|
||||
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"],
|
||||
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"],
|
||||
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"],
|
||||
"key::75": ["theological","teologia","teologico","teológico","teológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","teológiai","teološki","teoloogia","usuteadus","teoloogiline"],
|
||||
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"],
|
||||
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"],
|
||||
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"],
|
||||
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"],
|
||||
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"],
|
||||
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"],
|
||||
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"],
|
||||
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"],
|
||||
"key::84": ["communication","comunicazione","comunicación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"],
|
||||
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"],
|
||||
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"],
|
||||
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δερματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"],
|
||||
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"],
|
||||
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"],
|
||||
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"],
|
||||
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"],
|
||||
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"],
|
||||
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"],
|
||||
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"],
|
||||
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"],
|
||||
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria"],
|
||||
"key::97": ["psychology","psicologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia"],
|
||||
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"],
|
||||
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"],
|
||||
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"],
|
||||
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"],
|
||||
"key::102": ["informatics","informatica","informática","informática","informatica"],
|
||||
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
|
||||
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"]
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue