forked from D-Net/dnet-hadoop
code cleaning, distribution of the classes in packages and implementation of the new configuration
This commit is contained in:
parent
30a873265f
commit
0973899865
|
@ -20,15 +20,17 @@ import java.util.*;
|
|||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Set of common functions
|
||||
* Set of common functions for the framework
|
||||
*
|
||||
* @author claudio
|
||||
*
|
||||
*/
|
||||
public abstract class AbstractPaceFunctions {
|
||||
|
||||
//city map to be used when translating the city names into codes
|
||||
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||
|
||||
//list of stopwords in different languages
|
||||
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
|
||||
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
|
||||
|
@ -36,15 +38,14 @@ public abstract class AbstractPaceFunctions {
|
|||
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
||||
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
||||
|
||||
//blacklist of ngrams: to avoid generic keys
|
||||
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
||||
|
||||
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
|
||||
private static final String special_from = "İə";
|
||||
private static final String special_to = "Ie";
|
||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||
|
||||
//doi prefix for normalization
|
||||
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||
|
||||
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
|
||||
|
@ -54,8 +55,7 @@ public abstract class AbstractPaceFunctions {
|
|||
}
|
||||
|
||||
protected String cleanup(final String s) {
|
||||
final String ss = fixSpecial(s); //TODO is there something implemented to replace strange symbols with latin letters?
|
||||
final String s0 = ss.toLowerCase();
|
||||
final String s0 = s.toLowerCase();
|
||||
final String s1 = fixAliases(s0);
|
||||
final String s2 = nfd(s1);
|
||||
final String s3 = s2.replaceAll("–", " ");
|
||||
|
@ -63,15 +63,12 @@ public abstract class AbstractPaceFunctions {
|
|||
final String s5 = s4.replaceAll(""", " ");
|
||||
final String s6 = s5.replaceAll("−", " ");
|
||||
final String s7 = s6.replaceAll("([0-9]+)", " $1 ");
|
||||
final String s8 = s7.replaceAll("[^\\p{ASCII}]|\\p{Punct}", " ");
|
||||
final String s9 = s8.replaceAll("\\n", " ");
|
||||
final String s10 = s9.replaceAll("(?m)\\s+", " ");
|
||||
final String s11 = s10.trim();
|
||||
return s11;
|
||||
}
|
||||
|
||||
protected String finalCleanup(final String s) {
|
||||
return s.toLowerCase();
|
||||
final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
|
||||
final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
|
||||
final String s10 = s9.replaceAll("\\n", " ");
|
||||
final String s11 = s10.replaceAll("(?m)\\s+", " ");
|
||||
final String s12 = s11.trim();
|
||||
return s12;
|
||||
}
|
||||
|
||||
protected boolean checkNumbers(final String a, final String b) {
|
||||
|
@ -98,16 +95,6 @@ public abstract class AbstractPaceFunctions {
|
|||
return s.replaceAll("\\D", "");
|
||||
}
|
||||
|
||||
//sometimes the toLowerCase() produces error, this is meant to prevent them by replacing special character before the lowercase function
|
||||
protected static String fixSpecial(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final char ch : Lists.charactersOf(s)) {
|
||||
final int i = StringUtils.indexOf(special_from, ch);
|
||||
sb.append(i >= 0 ? special_to.charAt(i) : ch);
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
protected static String fixAliases(final String s) {
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
for (final char ch : Lists.charactersOf(s)) {
|
||||
|
@ -134,12 +121,11 @@ public abstract class AbstractPaceFunctions {
|
|||
return s != null;
|
||||
}
|
||||
|
||||
// ///////////////////////
|
||||
|
||||
protected String normalize(final String s) {
|
||||
return nfd(s).toLowerCase()
|
||||
return nfd(s)
|
||||
.toLowerCase()
|
||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
|
||||
.replaceAll("(\\W)+", " ")
|
||||
.replaceAll("[^ \\w]+", "")
|
||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||
.replaceAll("(\\p{Punct})+", " ")
|
||||
.replaceAll("(\\d)+", " ")
|
||||
|
@ -147,7 +133,7 @@ public abstract class AbstractPaceFunctions {
|
|||
.trim();
|
||||
}
|
||||
|
||||
private String nfd(final String s) {
|
||||
public String nfd(final String s) {
|
||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
}
|
||||
|
||||
|
@ -186,8 +172,6 @@ public abstract class AbstractPaceFunctions {
|
|||
return newset;
|
||||
}
|
||||
|
||||
// ////////////////////
|
||||
|
||||
public static Set<String> loadFromClasspath(final String classpath) {
|
||||
final Set<String> h = Sets.newHashSet();
|
||||
try {
|
||||
|
@ -217,17 +201,6 @@ public abstract class AbstractPaceFunctions {
|
|||
return m;
|
||||
}
|
||||
|
||||
//translate the string: replace the keywords with the code
|
||||
public String translate(String s1, Map<String, String> translationMap){
|
||||
final StringTokenizer st = new StringTokenizer(s1);
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
while (st.hasMoreTokens()){
|
||||
final String token = st.nextToken();
|
||||
sb.append(translationMap.getOrDefault(token,token) + " ");
|
||||
}
|
||||
return sb.toString().trim();
|
||||
}
|
||||
|
||||
public String removeKeywords(String s, Set<String> keywords) {
|
||||
|
||||
s = " " + s + " ";
|
||||
|
@ -238,7 +211,6 @@ public abstract class AbstractPaceFunctions {
|
|||
return s.trim();
|
||||
}
|
||||
|
||||
|
||||
public double keywordsCompare(Set<String> s1, Set<String> s2, Map<String, String> translationMap){
|
||||
|
||||
Set<String> k1 = keywordsToCodes(s1, translationMap);
|
||||
|
@ -252,23 +224,6 @@ public abstract class AbstractPaceFunctions {
|
|||
return (double)CollectionUtils.intersection(k1,k2).size()/(double)longer;
|
||||
}
|
||||
|
||||
//returns true if at least 1 city is in common
|
||||
//returns true if no cities are contained in names
|
||||
//returns false if one of the two names have no city
|
||||
public boolean sameCity(Set<String> s1, Set<String> s2){
|
||||
|
||||
Set<String> c1 = citiesToCodes(s1);
|
||||
Set<String> c2 = citiesToCodes(s2);
|
||||
|
||||
if (c1.isEmpty() && c2.isEmpty())
|
||||
return true;
|
||||
else {
|
||||
if (c1.isEmpty() ^ c2.isEmpty())
|
||||
return false;
|
||||
return CollectionUtils.intersection(c1, c2).size() > 0;
|
||||
}
|
||||
}
|
||||
|
||||
//convert the set of keywords to codes
|
||||
public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
|
||||
|
@ -294,7 +249,7 @@ public abstract class AbstractPaceFunctions {
|
|||
return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
|
||||
}
|
||||
|
||||
//get the list of codes into the input string
|
||||
//get the list of keywords found in the input string
|
||||
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize){
|
||||
|
||||
String s = s1;
|
||||
|
@ -311,10 +266,10 @@ public abstract class AbstractPaceFunctions {
|
|||
while (length != 0) {
|
||||
|
||||
for (int i = 0; i<=tokens.size()-length; i++){
|
||||
String candidate = Joiner.on(" ").join(tokens.subList(i, i + length));
|
||||
String candidate = concat(tokens.subList(i, i + length));
|
||||
if (translationMap.containsKey(candidate)) {
|
||||
codes.add(candidate);
|
||||
s = s.replace(candidate, "");
|
||||
s = s.replace(candidate, "").trim();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -45,6 +45,7 @@ public class DedupConfig implements Config, Serializable {
|
|||
defaults.put("slidingWindowSize", "200");
|
||||
defaults.put("rootBuilder", "result");
|
||||
defaults.put("includeChildren", "true");
|
||||
defaults.put("maxIterations", "20");
|
||||
}
|
||||
|
||||
public DedupConfig() {}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
package eu.dnetlib.pace.config;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.model.ClusteringDef;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||
|
@ -9,11 +9,10 @@ import eu.dnetlib.pace.util.PaceResolver;
|
|||
import org.codehaus.jackson.annotate.JsonIgnore;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.text.Normalizer;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class PaceConfig implements Serializable {
|
||||
public class PaceConfig extends AbstractPaceFunctions implements Serializable {
|
||||
|
||||
private List<FieldDef> model;
|
||||
|
||||
|
@ -46,7 +45,7 @@ public class PaceConfig implements Serializable {
|
|||
for (String key : synonyms.keySet()) {
|
||||
for (String term : synonyms.get(key)){
|
||||
translationMap.put(
|
||||
Normalizer.normalize(term.toLowerCase(), Normalizer.Form.NFD),
|
||||
normalize(term.toLowerCase()),
|
||||
key);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -76,6 +76,12 @@ public class WfConfig implements Serializable {
|
|||
/** Maximum number of allowed children. */
|
||||
private int maxChildren = MAX_CHILDREN;
|
||||
|
||||
/** Default maximum number of iterations. */
|
||||
private final static int MAX_ITERATIONS = 20;
|
||||
|
||||
/** Maximum number of iterations */
|
||||
private int maxIterations = MAX_ITERATIONS;
|
||||
|
||||
public WfConfig() {}
|
||||
|
||||
/**
|
||||
|
@ -104,7 +110,7 @@ public class WfConfig implements Serializable {
|
|||
*/
|
||||
public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder, final String dedupRun,
|
||||
final double threshold,
|
||||
final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren) {
|
||||
final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations) {
|
||||
super();
|
||||
this.entityType = entityType;
|
||||
this.orderField = orderField;
|
||||
|
@ -116,6 +122,7 @@ public class WfConfig implements Serializable {
|
|||
this.groupMaxSize = groupMaxSize;
|
||||
this.slidingWindowSize = slidingWindowSize;
|
||||
this.includeChildren = includeChildren;
|
||||
this.maxIterations = maxIterations;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -245,6 +252,15 @@ public class WfConfig implements Serializable {
|
|||
this.maxChildren = maxChildren;
|
||||
}
|
||||
|
||||
/**
 * Returns the maximum number of iterations.
 *
 * @return the current value of the {@code maxIterations} field
 */
public int getMaxIterations() {
|
||||
return maxIterations;
|
||||
}
|
||||
|
||||
/**
 * Sets the maximum number of iterations.
 *
 * @param maxIterations the new value of the {@code maxIterations} field
 * @return this instance, so that calls can be chained
 */
public WfConfig setMaxIterations(int maxIterations) {
|
||||
this.maxIterations = maxIterations;
|
||||
return this;
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
|
|
|
@ -5,6 +5,7 @@ import eu.dnetlib.pace.tree.support.AbstractComparator;
|
|||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
@ -44,7 +45,7 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
|
|||
Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
|
||||
Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
|
||||
|
||||
if (sameCity(cities1,cities2)) {
|
||||
if (checkCities(cities1,cities2)) {
|
||||
|
||||
if (keywordsCompare(keywords1, keywords2, conf.translationMap())>params.getOrDefault("threshold", 0.5).doubleValue()) {
|
||||
|
||||
|
@ -64,6 +65,22 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
|
|||
return 0.0;
|
||||
}
|
||||
|
||||
//returns true if at least 1 city is in common
|
||||
//returns true if no cities are contained in names
|
||||
//returns false if one of the two names have no city
|
||||
public boolean checkCities(Set<String> s1, Set<String> s2){
|
||||
Set<String> c1 = citiesToCodes(s1);
|
||||
Set<String> c2 = citiesToCodes(s2);
|
||||
|
||||
if (c1.isEmpty() && c2.isEmpty())
|
||||
return true;
|
||||
else {
|
||||
if (c1.isEmpty() ^ c2.isEmpty())
|
||||
return false;
|
||||
return CollectionUtils.intersection(c1, c2).size() > 0;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
|
|
|
@ -31,10 +31,7 @@ public class Level2JaroWinklerTitle extends AbstractComparator {
|
|||
|
||||
if (check) return 0.5;
|
||||
|
||||
final String cca = finalCleanup(ca);
|
||||
final String ccb = finalCleanup(cb);
|
||||
|
||||
return ssalgo.score(cca, ccb);
|
||||
return ssalgo.score(ca, cb);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -36,10 +36,7 @@ public class LevensteinTitle extends AbstractComparator {
|
|||
|
||||
if (check) return 0.5;
|
||||
|
||||
final String cca = finalCleanup(ca);
|
||||
final String ccb = finalCleanup(cb);
|
||||
|
||||
return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
|
||||
return normalize(ssalgo.score(ca, cb), ca.length(), cb.length());
|
||||
}
|
||||
|
||||
private double normalize(final double score, final int la, final int lb) {
|
||||
|
|
|
@ -37,10 +37,7 @@ public class LevensteinTitleIgnoreVersion extends AbstractComparator {
|
|||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
final String cca = finalCleanup(ca);
|
||||
final String ccb = finalCleanup(cb);
|
||||
|
||||
return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
|
||||
return normalize(ssalgo.score(ca, cb), ca.length(), cb.length());
|
||||
}
|
||||
|
||||
private double normalize(final double score, final int la, final int lb) {
|
||||
|
|
|
@ -4,11 +4,15 @@ import eu.dnetlib.pace.util.PaceException;
|
|||
|
||||
public enum AggType {
|
||||
|
||||
WEIGHTED_MEAN,
|
||||
AVG,
|
||||
W_MEAN, //weighted mean
|
||||
AVG, //average
|
||||
SUM,
|
||||
MAX,
|
||||
MIN;
|
||||
MIN,
|
||||
NC, //necessary condition
|
||||
SC, //sufficient condition
|
||||
AND,
|
||||
OR;
|
||||
|
||||
public static AggType getEnum(String value) {
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ import eu.dnetlib.pace.model.Field;
|
|||
public interface Comparator {
|
||||
|
||||
/*
|
||||
* return : -1 -> can't decide (missing field)
|
||||
* return : -1 -> can't decide (i.e. missing field)
|
||||
* >0 -> similarity degree (depends on the algorithm)
|
||||
* */
|
||||
public double compare(Field a, Field b, Config conf);
|
||||
|
|
|
@ -14,25 +14,25 @@ public class FieldConf implements Serializable {
|
|||
private double weight = 1.0; //weight for the field (to be used in the aggregation)
|
||||
private Map<String,Number> params; //parameters
|
||||
|
||||
private boolean ignoreMissing;
|
||||
private boolean countIfUndefined;
|
||||
|
||||
public boolean isIgnoreMissing() {
|
||||
return ignoreMissing;
|
||||
public boolean isCountIfUndefined() {
|
||||
return countIfUndefined;
|
||||
}
|
||||
|
||||
public void setIgnoreMissing(boolean ignoreMissing) {
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
public void setCountIfUndefined(boolean countIfUndefined) {
|
||||
this.countIfUndefined = countIfUndefined;
|
||||
}
|
||||
|
||||
public FieldConf() {
|
||||
}
|
||||
|
||||
public FieldConf(String field, String comparator, double weight, Map<String, Number> params, boolean ignoreMissing) {
|
||||
public FieldConf(String field, String comparator, double weight, Map<String, Number> params, boolean countIfUndefined) {
|
||||
this.field = field;
|
||||
this.comparator = comparator;
|
||||
this.weight = weight;
|
||||
this.params = params;
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
this.countIfUndefined = countIfUndefined;
|
||||
}
|
||||
|
||||
public String getField() {
|
||||
|
|
|
@ -4,7 +4,6 @@ import eu.dnetlib.pace.config.Config;
|
|||
import eu.dnetlib.pace.config.PaceConfig;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.util.PaceException;
|
||||
import org.codehaus.jackson.annotate.JsonIgnore;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import java.io.IOException;
|
||||
|
@ -22,16 +21,16 @@ public class TreeNodeDef implements Serializable {
|
|||
private String negative;
|
||||
private String undefined;
|
||||
|
||||
boolean ignoreMissing;
|
||||
boolean ignoreUndefined;
|
||||
|
||||
public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreMissing) {
|
||||
public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreUndefined) {
|
||||
this.fields = fields;
|
||||
this.aggregation = aggregation;
|
||||
this.threshold = threshold;
|
||||
this.positive = positive;
|
||||
this.negative = negative;
|
||||
this.undefined = undefined;
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
this.ignoreUndefined = ignoreUndefined;
|
||||
}
|
||||
|
||||
public TreeNodeDef() {
|
||||
|
@ -48,9 +47,9 @@ public class TreeNodeDef implements Serializable {
|
|||
|
||||
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
||||
|
||||
if (result == -1) { //if the field is missing
|
||||
stats.incrementMissCount();
|
||||
if (!fieldConf.isIgnoreMissing()) {
|
||||
if (result == -1) { //if the comparison is undefined
|
||||
stats.incrementUndefinedCount();
|
||||
if (fieldConf.isCountIfUndefined()) { //if it must be taken into account, increment weights (i.e. the average would be lower)
|
||||
stats.incrementWeightsSum(weight);
|
||||
}
|
||||
}
|
||||
|
@ -117,12 +116,12 @@ public class TreeNodeDef implements Serializable {
|
|||
this.undefined = undefined;
|
||||
}
|
||||
|
||||
public boolean isIgnoreMissing() {
|
||||
return ignoreMissing;
|
||||
public boolean isIgnoreUndefined() {
|
||||
return ignoreUndefined;
|
||||
}
|
||||
|
||||
public void setIgnoreMissing(boolean ignoreMissing) {
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
public void setIgnoreUndefined(boolean ignoreUndefined) {
|
||||
this.ignoreUndefined = ignoreUndefined;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -7,7 +7,7 @@ import java.io.Serializable;
|
|||
public class TreeNodeStats implements Serializable {
|
||||
|
||||
private DescriptiveStatistics stats;
|
||||
private int missCount = 0;
|
||||
private int undefinedCount = 0; //counter for the number of undefined comparisons between the fields in the tree node
|
||||
private int fieldsCount = 0;
|
||||
private double weightsSum = 0.0;
|
||||
|
||||
|
@ -15,8 +15,8 @@ public class TreeNodeStats implements Serializable {
|
|||
this.stats = new DescriptiveStatistics();
|
||||
}
|
||||
|
||||
public TreeNodeStats(int missCount, int fieldsCount, double weightsSum) {
|
||||
this.missCount = missCount;
|
||||
public TreeNodeStats(int undefinedCount, int fieldsCount, double weightsSum) {
|
||||
this.undefinedCount = undefinedCount;
|
||||
this.fieldsCount = fieldsCount;
|
||||
this.weightsSum = weightsSum;
|
||||
}
|
||||
|
@ -29,12 +29,12 @@ public class TreeNodeStats implements Serializable {
|
|||
this.stats = stats;
|
||||
}
|
||||
|
||||
public int getMissCount() {
|
||||
return missCount;
|
||||
public int getUndefinedCount() {
|
||||
return undefinedCount;
|
||||
}
|
||||
|
||||
public void setMissCount(int missCount) {
|
||||
this.missCount = missCount;
|
||||
public void setUndefinedCount(int undefinedCount) {
|
||||
this.undefinedCount = undefinedCount;
|
||||
}
|
||||
|
||||
public int getFieldsCount() {
|
||||
|
@ -57,8 +57,8 @@ public class TreeNodeStats implements Serializable {
|
|||
this.weightsSum += delta;
|
||||
}
|
||||
|
||||
public void incrementMissCount(){
|
||||
this.missCount += 1;
|
||||
public void incrementUndefinedCount(){
|
||||
this.undefinedCount += 1;
|
||||
}
|
||||
|
||||
public void incrementScoresSum(double delta){
|
||||
|
@ -72,11 +72,15 @@ public class TreeNodeStats implements Serializable {
|
|||
return stats.getMean();
|
||||
case SUM:
|
||||
return stats.getSum();
|
||||
case SC:
|
||||
case OR:
|
||||
case MAX:
|
||||
return stats.getMax();
|
||||
case NC:
|
||||
case AND:
|
||||
case MIN:
|
||||
return stats.getMin();
|
||||
case WEIGHTED_MEAN:
|
||||
case W_MEAN:
|
||||
return stats.getSum()/weightsSum;
|
||||
default:
|
||||
return 0.0;
|
||||
|
|
|
@ -40,9 +40,11 @@ public class TreeProcessor {
|
|||
|
||||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||
|
||||
if (!currentNode.isIgnoreMissing() && stats.getMissCount()>0) {
|
||||
//if ignoreUndefined=false the miss is considered as undefined
|
||||
if (!currentNode.isIgnoreUndefined() && stats.getUndefinedCount()>0) {
|
||||
current = currentNode.getUndefined();
|
||||
}
|
||||
//if ignoreUndefined=true the miss is ignored and the score computed anyway
|
||||
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
||||
current = currentNode.getPositive();
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.pace;
|
||||
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldListImpl;
|
||||
|
@ -11,7 +12,7 @@ import java.io.StringWriter;
|
|||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public abstract class AbstractPaceTest {
|
||||
public abstract class AbstractPaceTest extends AbstractPaceFunctions {
|
||||
|
||||
protected String readFromClasspath(final String filename) {
|
||||
final StringWriter sw = new StringWriter();
|
||||
|
|
|
@ -18,7 +18,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
@Before
|
||||
public void setUp() throws Exception {
|
||||
params = Maps.newHashMap();
|
||||
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", ClusteringFunctionTest.class));
|
||||
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ClusteringFunctionTest.class));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.pace.common;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import static junit.framework.Assert.assertEquals;
|
||||
|
@ -7,6 +8,8 @@ import static junit.framework.Assert.assertTrue;
|
|||
|
||||
public class PaceFunctionTest extends AbstractPaceFunctions {
|
||||
|
||||
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
|
||||
|
||||
@Test
|
||||
public void normalizePidTest(){
|
||||
|
||||
|
@ -14,7 +17,6 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
|
|||
assertEquals("10.1109/tns.2015.2493347", normalizePid("10.1109/TNS.2015.2493347"));
|
||||
assertEquals("10.0001/testdoi", normalizePid("http://dx.doi.org/10.0001/testDOI"));
|
||||
assertEquals("10.0001/testdoi", normalizePid("https://dx.doi.org/10.0001/testDOI"));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -22,4 +24,35 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
|
|||
|
||||
assertEquals("universita politecnica marche", filterAllStopWords("universita politecnica delle marche"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void normalizeTest() {
|
||||
Assert.assertEquals("universitat", normalize("Universität"));
|
||||
|
||||
System.out.println(normalize("İstanbul Ticarət Universiteti"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void cleanupTest() {
|
||||
assertEquals("istanbul ticaret universiteti", cleanup("İstanbul Ticarət Universiteti"));
|
||||
|
||||
|
||||
System.out.println("cleaned up : " + cleanup(TEST_STRING));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetNumbers() {
|
||||
System.out.println("Numbers : " + getNumbers(TEST_STRING));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRemoveSymbols() {
|
||||
System.out.println("Without symbols: " + removeSymbols(TEST_STRING));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFixAliases() {
|
||||
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -14,9 +14,8 @@ import java.util.Map;
|
|||
import static junit.framework.Assert.assertEquals;
|
||||
import static junit.framework.Assert.assertTrue;
|
||||
|
||||
public class DistanceAlgoTest extends AbstractPaceFunctions {
|
||||
public class ComparatorTest extends AbstractPaceFunctions {
|
||||
|
||||
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
|
||||
private Map<String, Number> params;
|
||||
private DedupConfig conf;
|
||||
|
||||
|
@ -24,7 +23,8 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
public void setup() {
|
||||
params = new HashMap<>();
|
||||
params.put("weight", 1.0);
|
||||
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", DistanceAlgoTest.class));
|
||||
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ComparatorTest.class));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -33,26 +33,6 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
|
|||
System.out.println("utils = " + utils.cleanupForOrdering("University of Pisa"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetNumbers() {
|
||||
System.out.println("Numbers : " + getNumbers(TEST_STRING));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRemoveSymbols() {
|
||||
System.out.println("Without symbols: " + removeSymbols(TEST_STRING));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFixAliases() {
|
||||
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCleanup() {
|
||||
System.out.println("cleaned up : " + cleanup(TEST_STRING));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJaroWinklerNormalizedName() {
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
|
@ -3,6 +3,8 @@ package eu.dnetlib.pace.config;
|
|||
import eu.dnetlib.pace.AbstractPaceTest;
|
||||
import org.junit.Test;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
@ -11,15 +13,10 @@ public class ConfigTest extends AbstractPaceTest {
|
|||
|
||||
@Test
|
||||
public void dedupConfigSerializationTest() {
|
||||
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("org.curr.conf"));
|
||||
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf"));
|
||||
|
||||
final String conf = cfgFromClasspath.toString();
|
||||
|
||||
// System.out.println("*****SERIALIZED*****");
|
||||
// System.out.println(conf);
|
||||
// System.out.println("*****FROM CLASSPATH*****");
|
||||
// System.out.println(readFromClasspath("result.pace.conf.json"));
|
||||
|
||||
final DedupConfig cfgFromSerialization = DedupConfig.load(conf);
|
||||
|
||||
assertEquals(cfgFromClasspath.toString(), cfgFromSerialization.toString());
|
||||
|
@ -27,29 +24,36 @@ public class ConfigTest extends AbstractPaceTest {
|
|||
assertNotNull(cfgFromClasspath);
|
||||
assertNotNull(cfgFromSerialization);
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void dedupConfigTest() {
|
||||
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("org.curr.conf"));
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf"));
|
||||
|
||||
System.out.println(load.toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void translationMapTest() {
|
||||
public void initTranslationMapTest() {
|
||||
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("org.curr.conf"));
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf"));
|
||||
|
||||
Map<String, String> translationMap = load.translationMap();
|
||||
|
||||
System.out.println("translationMap = " + translationMap.size());
|
||||
|
||||
for (String key: translationMap.keySet()) {
|
||||
if (translationMap.get(key).equals("key::1"))
|
||||
System.out.println("key = " + key);
|
||||
}
|
||||
|
||||
System.out.println("translationMap = " + load.getPace().translationMap().toString());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void emptyTranslationMapTest() {
|
||||
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("org.test.conf"));
|
||||
DedupConfig load = DedupConfig.load(readFromClasspath("organization.no_synonyms.conf"));
|
||||
|
||||
assertEquals(0, load.getPace().translationMap().keySet().size());
|
||||
}
|
||||
|
|
|
@ -1,40 +0,0 @@
|
|||
{
|
||||
"wf" : {
|
||||
"threshold" : "0.9",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "organization",
|
||||
"orderField" : "legalname",
|
||||
"queueMaxSize" : "2000",
|
||||
"groupMaxSize" : "50",
|
||||
"slidingWindowSize" : "200",
|
||||
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||
"includeChildren" : "true"
|
||||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
||||
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
||||
],
|
||||
"strictConditions" : [
|
||||
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
|
||||
],
|
||||
"conditions" : [
|
||||
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
|
||||
{ "name" : "exactMatch", "fields" : [ "country" ] }
|
||||
],
|
||||
"model" : [
|
||||
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
|
||||
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
|
||||
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
|
||||
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
|
||||
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
|
||||
],
|
||||
"blacklists" : {
|
||||
"legalname" : []
|
||||
},
|
||||
"synonyms": {
|
||||
}
|
||||
}
|
||||
}
|
|
@ -12,42 +12,28 @@
|
|||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
<<<<<<< HEAD
|
||||
{ "name" : "sortedngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
|
||||
=======
|
||||
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
||||
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
||||
>>>>>>> origin/master
|
||||
],
|
||||
"sufficientConditions" : [
|
||||
{ "name" : "exactMatch", "fieldsCount" : [ "gridid" ] }
|
||||
],
|
||||
<<<<<<< HEAD
|
||||
"necessaryConditions" : [
|
||||
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] },
|
||||
{ "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] }
|
||||
=======
|
||||
"conditions" : [
|
||||
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
|
||||
{ "name" : "exactMatch", "fields" : [ "country" ] }
|
||||
>>>>>>> origin/master
|
||||
],
|
||||
"decisionTree" : {
|
||||
"start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"},
|
||||
"layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"},
|
||||
"layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"}
|
||||
},
|
||||
"model" : [
|
||||
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
|
||||
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
|
||||
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
|
||||
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
|
||||
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
|
||||
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},
|
||||
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"},
|
||||
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" },
|
||||
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" },
|
||||
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"}
|
||||
],
|
||||
"blacklists" : {
|
||||
"legalname" : []
|
||||
},
|
||||
"synonyms": {
|
||||
"key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
|
||||
"key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
|
||||
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
|
||||
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
|
||||
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
|
||||
|
@ -97,58 +83,58 @@
|
|||
"key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
|
||||
"key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
|
||||
"key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
|
||||
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"],
|
||||
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"],
|
||||
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"],
|
||||
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"],
|
||||
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"],
|
||||
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"],
|
||||
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"],
|
||||
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"],
|
||||
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"],
|
||||
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"],
|
||||
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"],
|
||||
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"],
|
||||
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"],
|
||||
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"],
|
||||
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"],
|
||||
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"],
|
||||
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"],
|
||||
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"],
|
||||
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"],
|
||||
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"],
|
||||
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"],
|
||||
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"],
|
||||
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"],
|
||||
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"],
|
||||
"key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline"],
|
||||
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"],
|
||||
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"],
|
||||
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"],
|
||||
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"],
|
||||
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"],
|
||||
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"],
|
||||
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"],
|
||||
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"],
|
||||
"key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"],
|
||||
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"],
|
||||
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"],
|
||||
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"],
|
||||
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"],
|
||||
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"],
|
||||
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"],
|
||||
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"],
|
||||
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"],
|
||||
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"],
|
||||
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"],
|
||||
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"],
|
||||
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria"],
|
||||
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia"],
|
||||
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"],
|
||||
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"],
|
||||
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"],
|
||||
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"],
|
||||
"key::102": ["informatics","informatica","informática","informática","informatica"],
|
||||
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""],
|
||||
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""],
|
||||
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""],
|
||||
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""],
|
||||
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""],
|
||||
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""],
|
||||
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""],
|
||||
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""],
|
||||
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""],
|
||||
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""],
|
||||
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""],
|
||||
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""],
|
||||
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""],
|
||||
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""],
|
||||
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""],
|
||||
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""],
|
||||
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""],
|
||||
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""],
|
||||
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""],
|
||||
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""],
|
||||
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""],
|
||||
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""],
|
||||
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""],
|
||||
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""],
|
||||
"key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline",""],
|
||||
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""],
|
||||
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""],
|
||||
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""],
|
||||
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""],
|
||||
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""],
|
||||
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""],
|
||||
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""],
|
||||
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""],
|
||||
"key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""],
|
||||
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""],
|
||||
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""],
|
||||
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""],
|
||||
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""],
|
||||
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""],
|
||||
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""],
|
||||
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""],
|
||||
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
|
||||
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
|
||||
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
|
||||
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
|
||||
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
|
||||
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],
|
||||
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
|
||||
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""],
|
||||
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""],
|
||||
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],
|
||||
"key::102": ["informatics","informatica","informática","informática","informatica",""],
|
||||
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
|
||||
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"]
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
{
|
||||
"wf" : {
|
||||
"threshold" : "0.9",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "organization",
|
||||
"orderField" : "legalname",
|
||||
"queueMaxSize" : "2000",
|
||||
"groupMaxSize" : "50",
|
||||
"slidingWindowSize" : "200",
|
||||
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
|
||||
"includeChildren" : "true"
|
||||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
||||
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
||||
],
|
||||
"decisionTree" : {
|
||||
"start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"},
|
||||
"layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"},
|
||||
"layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"}
|
||||
},
|
||||
"model" : [
|
||||
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},
|
||||
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"},
|
||||
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" },
|
||||
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" },
|
||||
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"}
|
||||
],
|
||||
"blacklists" : {
|
||||
"legalname" : []
|
||||
},
|
||||
"synonyms": {
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,53 +0,0 @@
|
|||
{
|
||||
"wf" : {
|
||||
"threshold" : "0.99",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "result",
|
||||
"orderField" : "title",
|
||||
"queueMaxSize" : "2000",
|
||||
"groupMaxSize" : "10",
|
||||
"slidingWindowSize" : "200",
|
||||
"rootBuilder" : [ "result" ],
|
||||
"includeChildren" : "true"
|
||||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
|
||||
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
|
||||
],
|
||||
"strictConditions" : [
|
||||
{ "name" : "exactMatch", "fields" : [ "pid" ] }
|
||||
],
|
||||
"conditions" : [
|
||||
{ "name" : "yearMatch", "fields" : [ "dateofacceptance" ] },
|
||||
{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
|
||||
{ "name" : "sizeMatch", "fields" : [ "authors" ] }
|
||||
],
|
||||
"model" : [
|
||||
{ "name" : "pid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" },
|
||||
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
|
||||
{ "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } ,
|
||||
{ "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" }
|
||||
],
|
||||
"blacklists" : {
|
||||
"title" : [
|
||||
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
|
||||
"^(Kiri Karl Morgensternile).*$",
|
||||
"^(\\[Eksliibris Aleksandr).*\\]$",
|
||||
"^(\\[Eksliibris Aleksandr).*$",
|
||||
"^(Eksliibris Aleksandr).*$",
|
||||
"^(Kiri A\\. de Vignolles).*$",
|
||||
"^(2 kirja Karl Morgensternile).*$",
|
||||
"^(Pirita kloostri idaosa arheoloogilised).*$",
|
||||
"^(Kiri tundmatule).*$",
|
||||
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
|
||||
"^(Eksliibris Nikolai Birukovile).*$",
|
||||
"^(Eksliibris Nikolai Issakovile).*$",
|
||||
"^(WHP Cruise Summary Information of section).*$",
|
||||
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
|
||||
"^(Measurement of the spin\\-dependent structure function).*"
|
||||
] }
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"wf" : {
|
||||
"threshold" : "0.99",
|
||||
"dedupRun" : "001",
|
||||
"entityType" : "result",
|
||||
"orderField" : "title",
|
||||
"queueMaxSize" : "2000",
|
||||
"groupMaxSize" : "10",
|
||||
"slidingWindowSize" : "200",
|
||||
"rootBuilder" : [ "result" ],
|
||||
"includeChildren" : "true"
|
||||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
|
||||
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
|
||||
],
|
||||
"decisionTree": {
|
||||
"start": {"fields": [{"field":"pid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"},
|
||||
"layer2": {"fields": [{"field":"dateofacceptance", "comparator":"yearMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"title", "comparator":"titleVersionMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}, {"field":"authors", "comparator":"sizeMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"},
|
||||
"layer3": {"fields": [{"field":"title", "comparator":"JaroWinkler", "weight":1.0, "countIfUndefined":"false", "params":{}}], "threshold": 0.99, "aggregation": "MAX", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "false"}
|
||||
},
|
||||
"model" : [
|
||||
{ "name" : "pid", "type" : "String", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" },
|
||||
{ "name" : "title", "type" : "String", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
|
||||
{ "name" : "dateofacceptance", "type" : "String", "path" : "result/metadata/dateofacceptance/value" } ,
|
||||
{ "name" : "authors", "type" : "List", "path" : "result/author/metadata/fullname/value" }
|
||||
],
|
||||
"blacklists" : {
|
||||
"title" : [
|
||||
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
|
||||
"^(Kiri Karl Morgensternile).*$",
|
||||
"^(\\[Eksliibris Aleksandr).*\\]$",
|
||||
"^(\\[Eksliibris Aleksandr).*$",
|
||||
"^(Eksliibris Aleksandr).*$",
|
||||
"^(Kiri A\\. de Vignolles).*$",
|
||||
"^(2 kirja Karl Morgensternile).*$",
|
||||
"^(Pirita kloostri idaosa arheoloogilised).*$",
|
||||
"^(Kiri tundmatule).*$",
|
||||
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
|
||||
"^(Eksliibris Nikolai Birukovile).*$",
|
||||
"^(Eksliibris Nikolai Issakovile).*$",
|
||||
"^(WHP Cruise Summary Information of section).*$",
|
||||
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
|
||||
"^(Measurement of the spin\\-dependent structure function).*"
|
||||
] },
|
||||
"synonyms": {}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue