Added first implementation using UDF function

This commit is contained in:
Sandro La Bruzzo 2023-06-07 12:15:35 +02:00
parent 9963fd6d29
commit 4c2dfcbdf7
143 changed files with 25346 additions and 381 deletions

View File

@ -143,8 +143,8 @@
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-pace-core</artifactId>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-pace-core</artifactId>
</dependency>
<dependency>

91
dhp-pace-core/pom.xml Normal file
View File

@ -0,0 +1,91 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Module descriptor for dhp-pace-core, a child of the eu.dnetlib.dhp:dhp aggregator. -->
    <parent>
        <groupId>eu.dnetlib.dhp</groupId>
        <artifactId>dhp</artifactId>
        <version>1.2.5-SNAPSHOT</version>
        <relativePath>../pom.xml</relativePath>
    </parent>

    <groupId>eu.dnetlib.dhp</groupId>
    <artifactId>dhp-pace-core</artifactId>
    <version>1.2.5-SNAPSHOT</version>
    <packaging>jar</packaging>

    <!--
        Dependencies declared without a <version> presumably take it from the
        parent's dependencyManagement section (TODO confirm against ../pom.xml).
    -->
    <dependencies>
        <dependency>
            <groupId>edu.cmu</groupId>
            <artifactId>secondstring</artifactId>
        </dependency>
        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
        </dependency>
        <dependency>
            <groupId>commons-logging</groupId>
            <artifactId>commons-logging</artifactId>
        </dependency>
        <dependency>
            <groupId>org.junit.jupiter</groupId>
            <artifactId>junit-jupiter</artifactId>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.reflections</groupId>
            <artifactId>reflections</artifactId>
        </dependency>
        <dependency>
            <groupId>com.fasterxml.jackson.core</groupId>
            <artifactId>jackson-databind</artifactId>
        </dependency>
        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-math3</artifactId>
        </dependency>
        <dependency>
            <groupId>com.jayway.jsonpath</groupId>
            <artifactId>json-path</artifactId>
        </dependency>
        <dependency>
            <groupId>com.ibm.icu</groupId>
            <artifactId>icu4j</artifactId>
        </dependency>
        <!--
            stringtemplate used to be declared twice (once without a version, once
            pinned to 3.2.1). Maven warns about duplicate declarations and keeps the
            last one, so only the pinned declaration is retained here.
        -->
        <dependency>
            <groupId>org.antlr</groupId>
            <artifactId>stringtemplate</artifactId>
            <version>3.2.1</version>
        </dependency>
        <!-- NOTE(review): version and compile scope are hard-coded here; confirm they should not be managed by the parent. -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-catalyst_2.11</artifactId>
            <version>2.4.0.cloudera2</version>
            <scope>compile</scope>
        </dependency>
    </dependencies>
</project>

View File

@ -0,0 +1,42 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.lang3.StringUtils;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Base class for clustering functions that turn each field value into a set of keys.
 *
 * <p>Subclasses implement {@link #doApply(Config, String)}; {@link #apply(Config, List)}
 * takes care of normalising the input and of filtering the produced keys.
 */
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {

    /** Named integer parameters handed over at construction time. */
    protected Map<String, Integer> params;

    /**
     * @param params named integer parameters, stored as-is (not copied)
     */
    public AbstractClusteringFunction(final Map<String, Integer> params) {
        this.params = params;
    }

    /**
     * Produces the keys for a single, already normalised field value.
     *
     * @param conf the configuration passed to {@link #apply(Config, List)}
     * @param s    the normalised, stop-word-free field value
     * @return the keys generated for {@code s}
     */
    protected abstract Collection<String> doApply(Config conf, String s);

    /**
     * Generates the keys for all the given field values.
     *
     * <p>Empty values are skipped; every other value is normalised, stripped of stop words and
     * passed to {@link #doApply(Config, String)}. Blacklisted and blank keys are discarded.
     *
     * @param conf   the configuration forwarded to {@code doApply}
     * @param fields the raw field values
     * @return a {@link HashSet} with the distinct keys
     */
    @Override
    public Collection<String> apply(Config conf, List<String> fields) {
        final Collection<String> keys = new HashSet<>();
        for (final String field : fields) {
            if (field.isEmpty()) {
                continue;
            }
            final String cleaned = filterAllStopWords(normalize(field));
            for (final String key : filterBlacklisted(doApply(conf, cleaned), ngramBlacklist)) {
                if (StringUtils.isNotBlank(key)) {
                    keys.add(key);
                }
            }
        }
        return keys;
    }

    /** @return the parameter map received by the constructor */
    public Map<String, Integer> getParams() {
        return params;
    }

    /**
     * @param name the parameter name
     * @return the value stored under {@code name}, or {@code null} when absent
     */
    protected Integer param(String name) {
        return params.get(name);
    }
}

View File

@ -0,0 +1,49 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
/**
 * Clustering function building keys from the characters found at the same position
 * in every word of the input (first letters, second letters, and so on).
 *
 * <p>Reads the {@code max}, {@code minLen} and {@code maxLen} parameters.
 */
@ClusteringClass("acronyms")
public class Acronyms extends AbstractClusteringFunction {

    public Acronyms(Map<String, Integer> params) {
        super(params);
    }

    @Override
    protected Collection<String> doApply(Config conf, String s) {
        return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
    }

    /**
     * Builds up to {@code maxAcronyms} candidates; candidate number {@code i} concatenates the
     * i-th character of every word longer than one character that has such a character.
     *
     * @param s           whitespace separated words
     * @param maxAcronyms number of character positions to try
     * @param minLen      a candidate is kept only when strictly longer than this
     * @param maxLen      no further characters are appended once the candidate is longer than this
     * @return the candidates, in generation order, without duplicates
     */
    private Set<String> extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) {
        final Set<String> result = Sets.newLinkedHashSet();

        for (int position = 0; position < maxAcronyms; position++) {
            final StringBuilder candidate = new StringBuilder();
            final StringTokenizer words = new StringTokenizer(s);

            // stop scanning as soon as the candidate has grown beyond maxLen
            while (words.hasMoreTokens() && candidate.length() <= maxLen) {
                final String word = words.nextToken();
                final boolean usable = word.length() > 1 && position < word.length();
                if (usable) {
                    candidate.append(word.charAt(position));
                }
            }

            if (candidate.length() > minLen) {
                result.add(candidate.toString());
            }
        }
        return result;
    }
}

View File

@ -0,0 +1,13 @@
package eu.dnetlib.pace.clustering;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Type-level annotation, retained at runtime, that attaches a name to a clustering
 * function class (e.g. {@code @ClusteringClass("ngrams")}).
 */
@Retention(RetentionPolicy.RUNTIME)
@Target(ElementType.TYPE)
public @interface ClusteringClass {

    /** @return the name associated with the annotated class */
    String value();
}

View File

@ -0,0 +1,15 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config;
import java.util.Collection;
import java.util.List;
import java.util.Map;
/**
 * Contract of a clustering function: it maps the values of a field to a collection of keys.
 */
public interface ClusteringFunction {

    /**
     * @param config the configuration made available to the function
     * @param fields the field values to process
     * @return the keys generated from {@code fields}
     */
    Collection<String> apply(Config config, List<String> fields);

    /** @return the named integer parameters of this function */
    Map<String, Integer> getParams();
}

View File

@ -0,0 +1,26 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
/**
 * Clustering function whose only key is the value it receives from
 * {@link AbstractClusteringFunction#apply}, unchanged.
 */
@ClusteringClass("immutablefieldvalue")
public class ImmutableFieldValue extends AbstractClusteringFunction {

    public ImmutableFieldValue(final Map<String, Integer> params) {
        super(params);
    }

    /**
     * @param conf unused
     * @param s    the value to use as key
     * @return a mutable single-element list containing {@code s}
     */
    @Override
    protected Collection<String> doApply(final Config conf, final String s) {
        final List<String> singleKey = Lists.newArrayList();
        singleKey.add(s);
        return singleKey;
    }
}

View File

@ -0,0 +1,50 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Clustering function producing keys of the form {@code <keyword code>-<city code>}.
 *
 * <p>Reads the optional {@code windowSize} (default 4) and {@code max} (default 2) parameters.
 */
@ClusteringClass("keywordsclustering")
public class KeywordsClustering extends AbstractClusteringFunction {

    public KeywordsClustering(Map<String, Integer> params) {
        super(params);
    }

    /**
     * Combines the codes of the keywords and of the cities found in {@code s}.
     *
     * @param conf supplies the keyword translation map
     * @param s    the cleaned-up field value
     * @return at most {@code max} combinations, in generation order
     */
    @Override
    protected Collection<String> doApply(final Config conf, String s) {

        // keywords and cities found in the string, without duplicates
        final Set<String> foundKeywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
        final Set<String> foundCities = getCities(s, params.getOrDefault("windowSize", 4));

        // combinations returned as result
        final Collection<String> result = new LinkedHashSet<String>();

        for (final String keywordCode : keywordsToCodes(foundKeywords, conf.translationMap())) {
            for (final String cityCode : citiesToCodes(foundCities)) {
                result.add(keywordCode + "-" + cityCode);
                final boolean limitReached = result.size() >= params.getOrDefault("max", 2);
                if (limitReached) {
                    return result;
                }
            }
        }
        return result;
    }

    /**
     * Same pipeline as the parent class, with an additional {@code cleanup} step
     * applied before normalisation.
     */
    @Override
    public Collection<String> apply(final Config conf, List<String> fields) {
        final Collection<String> keys = new HashSet<>();
        for (final String field : fields) {
            if (field.isEmpty()) {
                continue;
            }
            final String prepared = filterAllStopWords(normalize(cleanup(field)));
            for (final String key : filterBlacklisted(doApply(conf, prepared), ngramBlacklist)) {
                if (StringUtils.isNotBlank(key)) {
                    keys.add(key);
                }
            }
        }
        return keys;
    }
}

View File

@ -0,0 +1,75 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Clustering function for person names: the key is the first initial followed by the last name.
 *
 * <p>Reads the optional {@code aggressive} parameter (default {@code true}), which is forwarded
 * to the {@link Person} constructor.
 */
@ClusteringClass("lnfi")
public class LastNameFirstInitial extends AbstractClusteringFunction {

    private static final boolean DEFAULT_AGGRESSIVE = true;

    public LastNameFirstInitial(final Map<String, Integer> params) {
        super(params);
    }

    /**
     * Same pipeline as the parent class, except that stop words are not removed.
     */
    @Override
    public Collection<String> apply(Config conf, List<String> fields) {
        return fields.stream().filter(f -> !f.isEmpty())
            .map(this::normalize)
            .map(s -> doApply(conf, s))
            .map(c -> filterBlacklisted(c, ngramBlacklist))
            .flatMap(c -> c.stream())
            .filter(StringUtils::isNotBlank)
            .collect(Collectors.toCollection(HashSet::new));
    }

    /**
     * Variant of the parent normalisation that does not lower-case the input.
     */
    @Override
    protected String normalize(final String s) {
        return fixAliases(transliterate(nfd(unicodeNormalization(s))))
            // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
            .replaceAll("[^ \\w]+", "")
            .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
            .replaceAll("(\\p{Punct})+", " ")
            .replaceAll("(\\d)+", " ")
            .replaceAll("(\\n)+", " ")
            .trim();
    }

    /**
     * Builds the keys for one name.
     *
     * <p>When {@link Person#isAccurate()} holds, the single key is first initial + surname.
     * Otherwise the full name is split on spaces: a single token is used as is, while for two
     * or more tokens both "initial of first + last" and "initial of last + first" are emitted.
     *
     * @param conf unused
     * @param s    the normalised name
     * @return the lower-cased keys
     */
    @Override
    protected Collection<String> doApply(final Config conf, final String s) {

        final List<String> res = Lists.newArrayList();

        final Person p = new Person(s, isAggressive());

        if (p.isAccurate()) {
            // NOTE(review): assumes an accurate Person has a non-empty first name - confirm in Person
            final String lastName = p.getNormalisedSurname().toLowerCase();
            final String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0, 1);
            res.add(firstInitial.concat(lastName));
        } else { // is not accurate, meaning it has no defined name and surname
            // empty tokens (leading or repeated spaces) are dropped: they would make substring(0, 1) fail
            final List<String> fullname = Arrays.stream(p.getNormalisedFullname().split(" "))
                .filter(token -> !token.isEmpty())
                .collect(Collectors.toList());

            if (fullname.size() <= 1) {
                res.add(p.getNormalisedFullname().toLowerCase());
            } else {
                // with exactly two tokens "first" and "last" are simply the two tokens
                final String first = fullname.get(0);
                final String last = fullname.get(fullname.size() - 1);
                res.add(first.substring(0, 1).concat(last).toLowerCase());
                res.add(last.substring(0, 1).concat(first).toLowerCase());
            }
        }

        return res;
    }

    /**
     * Reads the {@code aggressive} flag without assuming the runtime type of the stored value:
     * the map is declared with Integer values, so a blind cast to Boolean fails whenever the
     * parameter is actually an Integer.
     *
     * @return the Boolean value, {@code true} for a non-zero number, the default otherwise
     */
    private boolean isAggressive() {
        final Map<String, ?> rawParams = getParams();
        final Object value = rawParams == null ? null : rawParams.get("aggressive");
        if (value instanceof Boolean) {
            return (Boolean) value;
        }
        if (value instanceof Number) {
            return ((Number) value).intValue() != 0;
        }
        return DEFAULT_AGGRESSIVE;
    }
}

View File

@ -0,0 +1,35 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.lang3.StringUtils;
import java.util.Collection;
import java.util.List;
import java.util.Map;
/**
 * Clustering function whose keys are the lower-cased, trimmed field values.
 */
@ClusteringClass("lowercase")
public class LowercaseClustering extends AbstractClusteringFunction {

    public LowercaseClustering(final Map<String, Integer> params) {
        super(params);
    }

    /**
     * Bypasses the parent pipeline: each value goes straight to {@code doApply}.
     *
     * @return the distinct keys, in the order of the input values
     */
    @Override
    public Collection<String> apply(Config conf, List<String> fields) {
        final Collection<String> keys = Sets.newLinkedHashSet();
        fields.forEach(field -> keys.addAll(doApply(conf, field)));
        return keys;
    }

    /**
     * @param conf unused
     * @param s    the raw field value
     * @return an empty list for a blank value, otherwise the lower-cased and trimmed value
     */
    @Override
    protected Collection<String> doApply(final Config conf, final String s) {
        return StringUtils.isBlank(s)
            ? Lists.newArrayList()
            : Lists.newArrayList(s.toLowerCase().trim());
    }
}

View File

@ -0,0 +1,21 @@
package eu.dnetlib.pace.clustering;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
/**
 * Static helper exposing a clean-up routine built on top of {@link AbstractPaceFunctions}.
 */
public class NGramUtils extends AbstractPaceFunctions {

    // instance used to reach the inherited (non-static) helpers from static code
    static private final NGramUtils NGRAMUTILS = new NGramUtils();

    private static final int SIZE = 100;

    private static final Set<String> stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");

    /**
     * Normalises the string, removes the English stop words and deletes every space.
     *
     * @param s the string to clean
     * @return the compacted string, possibly empty
     */
    public static String cleanupForOrdering(String s) {
        final String normalized = NGRAMUTILS.normalize(s);
        final String withoutStopWords = NGRAMUTILS.filterStopWords(normalized, stopwords);
        if (withoutStopWords.isEmpty()) {
            return withoutStopWords;
        }
        return withoutStopWords.replace(" ", "");
    }
}

View File

@ -0,0 +1,40 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
/**
 * Clustering function whose keys are the concatenation of consecutive n-grams.
 *
 * <p>Reads the {@code ngramLen} and {@code max} parameters.
 */
@ClusteringClass("ngrampairs")
public class NgramPairs extends Ngrams {

    public NgramPairs(Map<String, Integer> params) {
        super(params, false);
    }

    public NgramPairs(Map<String, Integer> params, boolean sorted) {
        super(params, sorted);
    }

    @Override
    protected Collection<String> doApply(Config conf, String s) {
        // twice as many n-grams as requested pairs, one per token, at least two characters long
        final Collection<String> ngrams = getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2);
        return ngramPairs(Lists.newArrayList(ngrams), param("max"));
    }

    /**
     * Joins every n-gram with its successor.
     *
     * @param ngrams    the n-grams, in order
     * @param maxNgrams maximum number of pairs to produce
     * @return the pairs {@code ngrams[i] + ngrams[i + 1]}, in order
     */
    protected Collection<String> ngramPairs(final List<String> ngrams, int maxNgrams) {
        final Collection<String> pairs = Lists.newArrayList();
        for (int left = 0; left + 1 < ngrams.size() && pairs.size() < maxNgrams; left++) {
            pairs.add(ngrams.get(left) + ngrams.get(left + 1));
        }
        return pairs;
    }
}

View File

@ -0,0 +1,51 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config;
import java.util.*;
/**
 * Clustering function whose keys are the leading n-grams of the words of the input.
 *
 * <p>Reads the {@code ngramLen}, {@code max}, {@code maxPerToken} and {@code minNgramLen} parameters.
 */
@ClusteringClass("ngrams")
public class Ngrams extends AbstractClusteringFunction {

    // when true the n-grams are collected in natural order instead of insertion order
    private final boolean sorted;

    public Ngrams(Map<String, Integer> params) {
        this(params, false);
    }

    public Ngrams(Map<String, Integer> params, boolean sorted) {
        super(params);
        this.sorted = sorted;
    }

    @Override
    protected Collection<String> doApply(Config conf, String s) {
        return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
    }

    /**
     * Extracts n-grams from the whitespace separated tokens of {@code s}.
     *
     * @param s           the input string
     * @param ngramLen    length of each n-gram
     * @param max         extraction stops as soon as this many distinct n-grams are collected
     * @param maxPerToken number of start offsets (0, 1, ...) tried for every token
     * @param minNgramLen minimum length of a trimmed n-gram to be retained
     * @return a {@link TreeSet} when sorted, a {@link LinkedHashSet} otherwise
     */
    protected Collection<String> getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) {

        final Collection<String> collected;
        if (sorted) {
            collected = new TreeSet<>();
        } else {
            collected = new LinkedHashSet<String>();
        }

        // StringTokenizer never yields empty tokens
        final StringTokenizer tokenizer = new StringTokenizer(s);
        while (tokenizer.hasMoreTokens()) {
            final String token = tokenizer.nextToken();

            // the loop condition guarantees that the n-gram fits inside the token
            for (int start = 0; start < maxPerToken && ngramLen + start <= token.length(); start++) {
                final String ngram = token.substring(start, ngramLen + start).trim();
                if (ngram.length() < minNgramLen) {
                    continue;
                }
                collected.add(ngram);
                if (collected.size() >= max) {
                    return collected;
                }
            }
        }
        return collected;
    }
}

View File

@ -0,0 +1,80 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.lang3.StringUtils;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Clustering function for person names, with keys made of a lower-cased initial followed by a name part.
 */
@ClusteringClass("personClustering")
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {

    private Map<String, Integer> params;

    // number of leading tokens considered when the name cannot be split into first name and surname
    private static final int MAX_TOKENS = 5;

    public PersonClustering(final Map<String, Integer> params) {
        this.params = params;
    }

    /**
     * For every value: when the parsed {@link Person} has both a first name and a surname, the key
     * is the initial of the first name followed by the lower-cased surname; otherwise one key is
     * produced for each ordered pair of distinct tokens among the first {@code MAX_TOKENS} ones.
     *
     * @param conf   unused
     * @param fields the person names
     * @return the distinct keys
     */
    @Override
    public Collection<String> apply(final Config conf, final List<String> fields) {
        final Set<String> keys = Sets.newHashSet();

        for (final String name : fields) {
            final Person person = new Person(name, false);

            final boolean hasBothParts = StringUtils.isNotBlank(person.getNormalisedFirstName())
                && StringUtils.isNotBlank(person.getNormalisedSurname());

            if (hasBothParts) {
                keys.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase());
                continue;
            }

            for (final String initialSource : tokens(name, MAX_TOKENS)) {
                for (final String other : tokens(name, MAX_TOKENS)) {
                    if (initialSource.equals(other)) {
                        continue;
                    }
                    keys.add(firstLC(initialSource) + other);
                }
            }
        }
        return keys;
    }

    @Override
    public Map<String, Integer> getParams() {
        return params;
    }
}

View File

@ -0,0 +1,32 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
/**
 * Clustering function whose single key is {@link Person#hash()} of the parsed name.
 *
 * <p>Reads the optional {@code aggressive} parameter (default {@code false}), which is forwarded
 * to the {@link Person} constructor.
 */
@ClusteringClass("personHash")
public class PersonHash extends AbstractClusteringFunction {

    private static final boolean DEFAULT_AGGRESSIVE = false;

    public PersonHash(final Map<String, Integer> params) {
        super(params);
    }

    /**
     * @param conf unused
     * @param s    the normalised person name
     * @return a single-element list holding the person hash
     */
    @Override
    protected Collection<String> doApply(final Config conf, final String s) {
        final List<String> res = Lists.newArrayList();
        res.add(new Person(s, isAggressive()).hash());
        return res;
    }

    /**
     * Reads the {@code aggressive} flag without assuming the runtime type of the stored value:
     * the map is declared with Integer values, so a blind cast to Boolean fails whenever the
     * parameter is actually an Integer.
     *
     * @return the Boolean value, {@code true} for a non-zero number, the default otherwise
     */
    private boolean isAggressive() {
        final Map<String, ?> rawParams = getParams();
        final Object value = rawParams == null ? null : rawParams.get("aggressive");
        if (value instanceof Boolean) {
            return (Boolean) value;
        }
        if (value instanceof Number) {
            return ((Number) value).intValue() != 0;
        }
        return DEFAULT_AGGRESSIVE;
    }
}

View File

@ -0,0 +1,19 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.config.Config;
import java.util.Collection;
import java.util.Map;
/**
 * Placeholder clustering function that generates no keys.
 */
public class RandomClusteringFunction extends AbstractClusteringFunction {

    public RandomClusteringFunction(Map<String, Integer> params) {
        super(params);
    }

    /**
     * @param conf unused
     * @param s    unused
     * @return always an empty collection; returning {@code null} would make the inherited
     *         {@code apply} fail with a NullPointerException while filtering the keys
     */
    @Override
    protected Collection<String> doApply(final Config conf, String s) {
        return java.util.Collections.emptyList();
    }
}

View File

@ -0,0 +1,17 @@
package eu.dnetlib.pace.clustering;
import java.util.*;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
/**
 * {@link NgramPairs} variant that asks the parent to collect the n-grams in sorted order.
 */
@ClusteringClass("sortedngrampairs")
public class SortedNgramPairs extends NgramPairs {

    /**
     * @param params the parameters forwarded to {@link NgramPairs}
     */
    public SortedNgramPairs(final Map<String, Integer> params) {
        super(params, true);
    }
}

View File

@ -0,0 +1,29 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.lang3.RandomStringUtils;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Lists;
/**
 * Clustering function whose key is the lower-cased value with all whitespace removed.
 *
 * <p>Reads the {@code randomLength} parameter, used only for blank values.
 */
@ClusteringClass("spacetrimmingfieldvalue")
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {

    public SpaceTrimmingFieldValue(final Map<String, Integer> params) {
        super(params);
    }

    /**
     * @param conf unused
     * @param s    the normalised field value
     * @return a single-element list: a random string of {@code randomLength} characters when
     *         {@code s} is blank, the compacted lower-case value otherwise
     */
    @Override
    protected Collection<String> doApply(final Config conf, final String s) {
        final String key;
        if (StringUtils.isBlank(s)) {
            key = RandomStringUtils.random(getParams().get("randomLength"));
        } else {
            key = s.toLowerCase().replaceAll("\\s+", "");
        }

        final List<String> keys = Lists.newArrayList();
        keys.add(key);
        return keys;
    }
}

View File

@ -0,0 +1,40 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
/**
 * Clustering function whose keys join the end of a word with the beginning of the next one.
 *
 * <p>Reads the {@code len} and {@code max} parameters.
 */
@ClusteringClass("suffixprefix")
public class SuffixPrefix extends AbstractClusteringFunction {

    public SuffixPrefix(Map<String, Integer> params) {
        super(params);
    }

    @Override
    protected Collection<String> doApply(Config conf, String s) {
        return suffixPrefix(s, param("len"), param("max"));
    }

    /**
     * For every space located at index 1 or later, takes {@code len} characters before it and up
     * to {@code len} characters after it, removes the spaces and keeps the result when it has at
     * least 4 characters.
     *
     * @param s   the input string
     * @param len number of characters taken on each side of a space
     * @param max maximum number of keys
     * @return the keys, in order of appearance, without duplicates
     */
    private Collection<String> suffixPrefix(String s, int len, int max) {
        final Set<String> keys = Sets.newLinkedHashSet();

        // jump from space to space: every position between two spaces yields the same candidate
        int space = s.indexOf(" ", 1);
        while (space >= 0 && keys.size() < max) {
            if (space - len > 0) {
                final int end = Math.min(space + len + 1, s.length());
                final String candidate = s.substring(space - len, end).replaceAll(" ", "").trim();
                if (candidate.length() >= 4) {
                    keys.add(candidate);
                }
            }
            space = s.indexOf(" ", space + 1);
        }
        return keys;
    }
}

View File

@ -0,0 +1,52 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Clustering function whose keys are the host names of the URLs found in the field values.
 */
@ClusteringClass("urlclustering")
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {

    protected Map<String, Integer> params;

    public UrlClustering(final Map<String, Integer> params) {
        this.params = params;
    }

    /**
     * @param conf   unused
     * @param fields the URLs, as strings; empty values are skipped
     * @return the distinct hosts, or an empty set as soon as one value is not a valid URL
     */
    @Override
    public Collection<String> apply(final Config conf, List<String> fields) {
        try {
            return fields.stream()
                .filter(f -> !f.isEmpty())
                .map(this::asUrl)
                .map(URL::getHost)
                .collect(Collectors.toCollection(HashSet::new));
        } catch (IllegalStateException e) {
            // a single malformed value discards the keys of all the values
            return new HashSet<>();
        }
    }

    /**
     * @return the parameters received by the constructor, consistently with the other
     *         clustering functions (this used to return {@code null})
     */
    @Override
    public Map<String, Integer> getParams() {
        return params;
    }

    /**
     * @param value the URL string
     * @return the parsed URL
     * @throws IllegalStateException when {@code value} is malformed; the original exception is kept as cause
     */
    private URL asUrl(String value) {
        try {
            return new URL(value);
        } catch (MalformedURLException e) {
            // should not happen as checked by pace typing
            throw new IllegalStateException("invalid URL: " + value, e);
        }
    }
}

View File

@ -0,0 +1,90 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Clustering function whose keys combine word/letter statistics with a chain of suffixes and
 * prefixes taken from the first words longer than three characters.
 *
 * <p>Reads the {@code mod} parameter.
 */
@ClusteringClass("wordsStatsSuffixPrefixChain")
public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {

    public WordsStatsSuffixPrefixChain(Map<String, Integer> params) {
        super(params);
    }

    @Override
    protected Collection<String> doApply(Config conf, String s) {
        return suffixPrefixChain(s, param("mod"));
    }

    /**
     * @param s   the input string
     * @param mod divisor applied to the length of {@code s}
     * @return the keys, each starting with {@code <long words>-<letters / mod>-}
     */
    private Collection<String> suffixPrefixChain(String s, int mod) {

        // words of the string, short ones (3 characters or less) excluded
        final List<String> longWords = new ArrayList<>();
        for (final String word : s.split(" ")) {
            if (word.length() > 3) {
                longWords.add(word);
            }
        }

        // prefix: number of words + number of letters/mod
        final String statsPrefix = longWords.size() + "-" + s.length() / mod + "-";

        return doSuffixPrefixChain(longWords, statsPrefix);
    }

    /**
     * Emits two keys, alternating suffix/prefix and prefix/suffix over the first two words,
     * or over the first three when available. Fewer than two words produce no key.
     */
    private Collection<String> doSuffixPrefixChain(List<String> wordsList, String prefix) {

        final Set<String> keys = Sets.newLinkedHashSet();
        final int available = wordsList.size();
        if (available < 2) {
            return keys;
        }

        final String first = wordsList.get(0);
        final String second = wordsList.get(1);

        String suffixLed = prefix + suffix(first, 3) + prefix(second, 3);
        String prefixLed = prefix + prefix(first, 3) + suffix(second, 3);

        if (available > 2) {
            final String third = wordsList.get(2);
            suffixLed = suffixLed + suffix(third, 3);
            prefixLed = prefixLed + prefix(third, 3);
        }

        keys.add(suffixLed);
        keys.add(prefixLed);
        return keys;
    }

    /** @return the last {@code len} characters of {@code s} */
    private String suffix(String s, int len) {
        final int from = s.length() - len;
        return s.substring(from);
    }

    /** @return the first {@code len} characters of {@code s} */
    private String prefix(String s, int len) {
        return s.substring(0, len);
    }
}

View File

@ -0,0 +1,57 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
/**
 * Clustering function like the suffix/prefix one, with the number of words prepended to every
 * key and a window that widens for short inputs.
 *
 * <p>Reads the {@code len} and {@code max} parameters.
 */
@ClusteringClass("wordssuffixprefix")
public class WordsSuffixPrefix extends AbstractClusteringFunction {

    public WordsSuffixPrefix(Map<String, Integer> params) {
        super(params);
    }

    @Override
    protected Collection<String> doApply(Config conf, String s) {
        return suffixPrefix(s, param("len"), param("max"));
    }

    /**
     * Picks the window length from the number of space separated words: no key at all for a
     * single word, {@code len + 2} for two words, {@code len + 1} for three, {@code len} otherwise.
     */
    private Collection<String> suffixPrefix(String s, int len, int max) {

        final int wordCount = s.split(" ").length;

        if (wordCount == 1) {
            return Sets.newLinkedHashSet();
        }

        // adjust the token length according to the number of words
        final int window;
        if (wordCount == 2) {
            window = len + 2;
        } else if (wordCount == 3) {
            window = len + 1;
        } else {
            window = len;
        }
        return doSuffixPrefix(s, window, max, wordCount);
    }

    /**
     * For every space located at index 1 or later, takes {@code len} characters before it and up
     * to {@code len} after it, removes the spaces and, when at least 4 characters remain, emits
     * the result prefixed with {@code words}.
     */
    private Collection<String> doSuffixPrefix(String s, int len, int max, int words) {
        final Set<String> keys = Sets.newLinkedHashSet();

        // jump from space to space: every position between two spaces yields the same candidate
        int space = s.indexOf(" ", 1);
        while (space >= 0 && keys.size() < max) {
            if (space - len > 0) {
                final int end = Math.min(space + len + 1, s.length());
                final String candidate = s.substring(space - len, end).replaceAll(" ", "").trim();
                if (candidate.length() >= 4) {
                    keys.add(words + candidate);
                }
            }
            space = s.indexOf(" ", space + 1);
        }
        return keys;
    }
}

View File

@ -0,0 +1,346 @@
package eu.dnetlib.pace.common;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.clustering.NGramUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
* Set of common functions for the framework
*
* @author claudio
*/
public abstract class AbstractPaceFunctions {
// Map from transliterated, lower-cased city names to their code, loaded from city_map.csv.
// It is initialised before the static transliterator declared below, which is why the
// loader methods create their own Transliterator instance.
private static Map<String, String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
// Stop words, one set per language (gr, en, de, es, fr, it, pt); all of them are applied by filterAllStopWords.
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
protected static Set<String> stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt");
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
// ICU "Any-Eng" transliterator shared by transliterate()
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
// Blacklist of n-grams: keys found here are discarded to avoid generic keys
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
// Regex matching an HTML/XML tag; cleanup() removes every match
public final String HTML_REGEX = "<[^>]*>";
// Characters preserved by removeSymbols(); any other character becomes a space
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
// Positional mapping used by fixAliases(): the i-th character of aliases_from is replaced by the
// i-th character of aliases_to, so the two strings must have the same length
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
// Regex matching the DOI prefixes removed by normalizePid()
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
// Optionally signed integer or decimal number; used by isNumber()
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
// Literal backslash-u escape followed by four hex digits; used by unicodeNormalization()
private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
/**
 * Joins the non-null elements of the list with a single space.
 *
 * @param l the strings to join
 * @return the joined string
 */
protected String concat(final List<String> l) {
    final Joiner spaceJoiner = Joiner.on(" ").skipNulls();
    return spaceJoiner.join(l);
}
/**
 * Cleans a raw string: strips tags, lower-cases, resolves unicode escapes and a few XML
 * entities, isolates digit runs, transliterates, drops non-ASCII characters, turns punctuation
 * and line feeds into spaces and finally collapses whitespace.
 *
 * @param s the raw string
 * @return the cleaned, trimmed string
 */
protected String cleanup(final String s) {
    String text = s.replaceAll(HTML_REGEX, "");
    text = unicodeNormalization(text.toLowerCase());
    text = fixXML(nfd(text));
    // surround every run of digits with spaces
    text = text.replaceAll("([0-9]+)", " $1 ");
    text = fixAliases(transliterate(text));
    text = text.replaceAll("[^\\p{ASCII}]", "");
    text = text.replaceAll("[\\p{Punct}]", " ");
    text = text.replaceAll("\\n", " ");
    text = text.replaceAll("(?m)\\s+", " ");
    return text.trim();
}
/**
 * Replaces the entities {@code &ndash;}, {@code &amp;}, {@code &quot;} and {@code &minus;}
 * with a space, in this order.
 *
 * @param a the input string
 * @return the string without those entities
 */
protected String fixXML(final String a) {
    String result = a;
    for (final String entity : new String[] {"&ndash;", "&amp;", "&quot;", "&minus;"}) {
        // the entities contain no regex metacharacter: literal replacement is equivalent
        result = result.replace(entity, " ");
    }
    return result;
}
/**
 * Tells whether the two strings differ in their numeric or roman-numeral tokens.
 *
 * @param a first string
 * @param b second string
 * @return {@code true} when the numbers or the roman numerals extracted from the two strings
 *         are not the same, {@code false} when both match
 */
protected boolean checkNumbers(final String a, final String b) {
    final boolean sameNumbers = getNumbers(a).equals(getNumbers(b));
    final boolean sameRomans = getRomans(a).equals(getRomans(b));
    return !(sameNumbers && sameRomans);
}
/**
 * Concatenates, without separator, the space separated tokens recognised by {@code isRoman}.
 *
 * @param s the input string
 * @return the concatenation of the roman-numeral tokens, possibly empty
 */
protected String getRomans(final String s) {
    return Arrays.stream(s.split(" "))
        .filter(this::isRoman)
        .collect(Collectors.joining());
}
/**
 * Tells whether the whole string is an upper-case roman numeral.
 *
 * <p>The empty string is accepted, since every part of the pattern is optional. The check is
 * a plain full match: the earlier replace-with-a-sentinel approach wrongly accepted an input
 * equal to the sentinel text itself.
 *
 * @param s the token to test
 * @return {@code true} when {@code s} matches the roman numeral pattern
 */
protected boolean isRoman(final String s) {
    return s.matches("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$");
}
/**
 * Concatenates, without separator, the space separated tokens recognised by {@code isNumber}.
 *
 * @param s the input string
 * @return the concatenation of the numeric tokens, possibly empty
 */
protected String getNumbers(final String s) {
    return Arrays.stream(s.split(" "))
        .filter(this::isNumber)
        .collect(Collectors.joining());
}
/**
 * Tells whether the string is an optionally signed integer or decimal number.
 *
 * @param strNum the string to test, may be {@code null}
 * @return {@code true} when the whole string matches the number pattern
 */
public boolean isNumber(String strNum) {
    return strNum != null && numberPattern.matcher(strNum).matches();
}
/**
 * Replaces every character listed in {@code aliases_from} with the character found at the
 * same position in {@code aliases_to}; other characters are copied unchanged.
 *
 * @param s the input string
 * @return the string with the aliases replaced
 */
protected static String fixAliases(final String s) {
    final StringBuilder fixed = new StringBuilder();
    for (final char current : s.toCharArray()) {
        final int position = aliases_from.indexOf(current);
        if (position >= 0) {
            fixed.append(aliases_to.charAt(position));
        } else {
            fixed.append(current);
        }
    }
    return fixed.toString();
}
/**
 * Applies the shared transliterator to the string.
 *
 * @param s the input string
 * @return the transliterated string, or {@code s} itself when the transliteration fails
 */
protected static String transliterate(final String s) {
    String result;
    try {
        result = transliterator.transliterate(s);
    } catch (Exception e) {
        // best effort: keep the input untouched
        result = s;
    }
    return result;
}
/**
 * Turns every character not listed in {@code alpha} into a space and collapses each run of
 * whitespace into a single space.
 *
 * @param s the input string
 * @return the string without symbols
 */
protected String removeSymbols(final String s) {
    final StringBuilder kept = new StringBuilder();
    for (final char current : s.toCharArray()) {
        if (alpha.indexOf(current) >= 0) {
            kept.append(current);
        } else {
            kept.append(" ");
        }
    }
    return kept.toString().replaceAll("\\s+", " ");
}
/**
 * @param s the string to test
 * @return {@code true} unless {@code s} is {@code null}
 */
protected boolean notNull(final String s) {
    return Objects.nonNull(s);
}
/**
 * Normalises a string: resolves unicode escapes, decomposes (NFD), transliterates, replaces
 * the aliases, lower-cases, and then removes symbols, diacritical marks, punctuation, digits
 * and line feeds.
 *
 * @param s the input string
 * @return the normalised, trimmed string
 */
protected String normalize(final String s) {
    final String decoded = nfd(unicodeNormalization(s));
    final String latin = fixAliases(transliterate(decoded)).toLowerCase();

    // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
    final String wordCharsOnly = latin.replaceAll("[^ \\w]+", "");
    final String withoutMarks = wordCharsOnly.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "");
    final String withoutPunct = withoutMarks.replaceAll("(\\p{Punct})+", " ");
    final String withoutDigits = withoutPunct.replaceAll("(\\d)+", " ");
    final String singleLine = withoutDigits.replaceAll("(\\n)+", " ");
    return singleLine.trim();
}
/**
 * @param s the input string
 * @return {@code s} in Unicode canonical decomposition (NFD) form
 */
public String nfd(final String s) {
    final Normalizer.Form decomposition = Normalizer.Form.NFD;
    return Normalizer.normalize(s, decomposition);
}
/**
 * Encodes the string to UTF-8 bytes and decodes it back.
 *
 * @param s the input string
 * @return the string obtained from the UTF-8 round trip
 */
public String utf8(final String s) {
    return new String(s.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8);
}
/**
 * Replaces every textual escape made of a backslash, {@code u} and four hex digits with the
 * character it denotes.
 *
 * @param s the input string
 * @return the string with the escapes resolved
 */
public String unicodeNormalization(final String s) {
    final Matcher escapes = hexUnicodePattern.matcher(s);
    final StringBuffer decoded = new StringBuffer(s.length());
    while (escapes.find()) {
        final int codeUnit = Integer.parseInt(escapes.group(1), 16);
        final String replacement = String.valueOf((char) codeUnit);
        // quote the replacement: the decoded character may be '$' or '\'
        escapes.appendReplacement(decoded, Matcher.quoteReplacement(replacement));
    }
    escapes.appendTail(decoded);
    return decoded.toString();
}
/**
 * Removes from the string the whitespace separated tokens contained in the stop word set.
 *
 * @param s         the input string
 * @param stopwords the tokens to drop
 * @return the remaining tokens joined by a single space, trimmed
 */
protected String filterStopWords(final String s, final Set<String> stopwords) {
    final StringJoiner kept = new StringJoiner(" ");
    final StringTokenizer words = new StringTokenizer(s);
    while (words.hasMoreTokens()) {
        final String word = words.nextToken();
        if (stopwords.contains(word)) {
            continue;
        }
        kept.add(word);
    }
    return kept.toString().trim();
}
/**
 * Removes the stop words of every supported language, in the order
 * en, de, it, fr, pt, es, gr.
 *
 * @param s the input string
 * @return the string without stop words
 */
public String filterAllStopWords(String s) {
    final List<Set<String>> languages = Arrays.asList(
        stopwords_en, stopwords_de, stopwords_it, stopwords_fr, stopwords_pt, stopwords_es, stopwords_gr);

    String filtered = s;
    for (final Set<String> languageStopWords : languages) {
        filtered = filterStopWords(filtered, languageStopWords);
    }
    return filtered;
}
/**
 * Copies the given strings, leaving out those contained in the blacklist.
 *
 * @param set            the candidate strings
 * @param ngramBlacklist the strings to discard
 * @return a new insertion-ordered set with the surviving strings
 */
protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
    return set.stream()
        .filter(candidate -> !ngramBlacklist.contains(candidate))
        .collect(Collectors.toCollection(LinkedHashSet::new));
}
/**
 * Loads a classpath resource made of one word per line; every line is transliterated and its
 * aliases replaced before being stored.
 *
 * <p>The resource is read as UTF-8 (instead of the platform default charset) and the stream
 * is always closed. Loading is best effort: a missing or unreadable resource yields an empty set.
 *
 * @param classpath the absolute classpath location of the resource
 * @return the set of loaded words, empty on failure
 */
public static Set<String> loadFromClasspath(final String classpath) {

    // a dedicated instance is needed: this method runs while the static fields declared
    // before the shared transliterator are being initialised
    final Transliterator transliterator = Transliterator.getInstance("Any-Eng");

    final Set<String> h = Sets.newHashSet();
    try (java.io.InputStream in = NGramUtils.class.getResourceAsStream(classpath)) {
        if (in == null) {
            return Sets.newHashSet();
        }
        for (final String s : IOUtils.readLines(in, StandardCharsets.UTF_8)) {
            h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords
        }
    } catch (final Exception e) {
        // deliberately lenient: callers get an empty set instead of a failure
        return Sets.newHashSet();
    }
    return h;
}
/**
 * Loads a classpath resource whose lines look like {@code code;word1;word2;word3} into a map
 * from each lower-cased, transliterated word to its code.
 *
 * <p>The resource is read as UTF-8 (instead of the platform default charset) and the stream
 * is always closed. Loading is best effort: a missing or unreadable resource yields an empty map.
 *
 * @param classpath the absolute classpath location of the resource
 * @return the word-to-code map, empty on failure
 */
public static Map<String, String> loadMapFromClasspath(final String classpath) {

    // a dedicated instance is needed: this method runs while the static fields declared
    // before the shared transliterator are being initialised
    final Transliterator transliterator = Transliterator.getInstance("Any-Eng");

    final Map<String, String> m = new HashMap<>();
    try (java.io.InputStream in = AbstractPaceFunctions.class.getResourceAsStream(classpath)) {
        if (in == null) {
            return new HashMap<>();
        }
        for (final String s : IOUtils.readLines(in, StandardCharsets.UTF_8)) {
            //string is like this: code;word1;word2;word3
            final String[] line = s.split(";");
            final String value = line[0];
            for (int i = 1; i < line.length; i++) {
                m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value);
            }
        }
    } catch (final Exception e) {
        // deliberately lenient: callers get an empty map instead of a failure
        return new HashMap<>();
    }
    return m;
}
/**
 * Removes every occurrence of the given keywords (lower-cased) from the string.
 *
 * <p>Keywords are matched literally: they used to be interpreted as regular expressions, so
 * a keyword containing characters such as {@code .} or {@code (} could match unrelated text
 * or raise a {@code PatternSyntaxException}.
 *
 * @param s        the input string
 * @param keywords the keywords to remove
 * @return the trimmed string without the keywords
 */
public String removeKeywords(String s, Set<String> keywords) {

    s = " " + s + " ";
    for (final String k : keywords) {
        s = s.replace(k.toLowerCase(), "");
    }

    return s.trim();
}
/**
 * Ratio between the number of elements of {@code s1} also contained in {@code s2} and the
 * size of the larger set.
 *
 * @param s1 first set
 * @param s2 second set
 * @return the ratio in [0, 1]; {@code NaN} when both sets are empty (0 / 0)
 */
public double commonElementsPercentage(Set<String> s1, Set<String> s2) {

    long shared = 0;
    for (final String element : s1) {
        if (s2.contains(element)) {
            shared++;
        }
    }

    final double largerSize = Math.max(s1.size(), s2.size());
    return (double) shared / largerSize;
}
/**
 * Converts a set of keywords into the set of their codes.
 * <p>
 * A keyword that has no entry in the translation map contributes a {@code null} element.
 *
 * @param keywords       the keywords to translate
 * @param translationMap the keyword-to-code map
 * @return the set of codes looked up for the keywords
 */
public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
    final Set<String> codes = new HashSet<>();
    for (final String keyword : keywords) {
        codes.add(translationMap.get(keyword));
    }
    return codes;
}
/**
 * Translates keywords into codes using the given translation map.
 *
 * @param keywords       the keywords to translate
 * @param translationMap the keyword-to-code map
 * @return the result of {@code toCodes(keywords, translationMap)}
 */
public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
    final Set<String> codes = toCodes(keywords, translationMap);
    return codes;
}
/**
 * Translates city names into codes using the {@code cityMap} of this class.
 *
 * @param keywords the city names to translate
 * @return the result of {@code toCodes(keywords, cityMap)}
 */
public Set<String> citiesToCodes(Set<String> keywords) {
    final Set<String> cityCodes = toCodes(keywords, cityMap);
    return cityCodes;
}
/**
 * Returns the first character of a string, lower-cased.
 *
 * @param s the input string
 * @return the lower-cased result of {@code StringUtils.substring(s, 0, 1)}
 */
protected String firstLC(final String s) {
    final String head = StringUtils.substring(s, 0, 1);
    return head.toLowerCase();
}
/**
 * Splits a string on single spaces and returns at most {@code maxTokens} tokens.
 * <p>
 * Tokens are trimmed and empty tokens are dropped.
 *
 * @param s         the string to tokenize
 * @param maxTokens the maximum number of tokens to return
 * @return the first {@code maxTokens} tokens of the string
 */
protected Iterable<String> tokens(final String s, final int maxTokens) {
    final Splitter bySpace = Splitter.on(" ").omitEmptyStrings().trimResults();
    final Iterable<String> allTokens = bySpace.split(s);
    return Iterables.limit(allTokens, maxTokens);
}
/**
 * Normalizes a persistent identifier: lower-cases it and strips whatever matches the
 * {@code DOI_PREFIX} regular expression.
 *
 * @param pid the identifier to normalize
 * @return the normalized identifier
 */
public String normalizePid(String pid) {
    final String lowered = pid.toLowerCase();
    return lowered.replaceAll(DOI_PREFIX, "");
}
/**
 * Extracts the keywords of the translation map that appear in the input string.
 * <p>
 * The string is lower-cased and split on spaces; windows of consecutive tokens are then
 * tested against the keys of the translation map, starting from {@code windowSize} tokens
 * and shrinking down to one. Every match is recorded and removed from the working string
 * before the next (shorter) pass.
 *
 * @param s1             the string to scan
 * @param translationMap the map whose keys are the known keywords
 * @param windowSize     the maximum number of consecutive tokens forming a keyword;
 *                       values larger than the number of tokens are capped, values
 *                       lower than 1 yield an empty result
 * @return the set of keywords (map keys) found in the string
 */
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize) {

    // lower-case the working string once, so that the tokens of every pass (not only the
    // first one) and the replacement of the matched candidates are case-consistent
    String s = s1.toLowerCase();

    List<String> tokens = Arrays.asList(s.split(" "));

    Set<String> codes = new HashSet<>();

    // the window cannot be larger than the number of tokens
    int length = Math.min(windowSize, tokens.size());

    while (length > 0) {

        for (int i = 0; i <= tokens.size() - length; i++) {
            String candidate = concat(tokens.subList(i, i + length));
            if (translationMap.containsKey(candidate)) {
                codes.add(candidate);
                s = s.replace(candidate, "").trim();
            }
        }

        // re-tokenize what is left after removing the matches of this pass
        tokens = Arrays.asList(s.split(" "));
        length -= 1;
    }

    return codes;
}
/**
 * Extracts the city names contained in the input string, using the {@code cityMap} of
 * this class as dictionary.
 *
 * @param s1         the string to scan
 * @param windowSize the maximum number of consecutive tokens forming a city name
 * @return the result of {@code getKeywords(s1, cityMap, windowSize)}
 */
public Set<String> getCities(String s1, int windowSize) {
    final Set<String> cities = getKeywords(s1, cityMap, windowSize);
    return cities;
}
/**
 * Reads a classpath resource into a string (decoded as UTF-8).
 *
 * @param filename the resource name, resolved through {@code clazz}
 * @param clazz    the class used to locate the resource
 * @param <T>      the type of the class
 * @return the content of the resource
 * @throws RuntimeException when the resource does not exist or cannot be read
 */
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
    final StringWriter sw = new StringWriter();
    // try-with-resources: IOUtils.copy does not close the stream it is given
    try (java.io.InputStream in = clazz.getResourceAsStream(filename)) {
        if (in == null) {
            // fail with an explicit message instead of a bare NullPointerException
            throw new RuntimeException("cannot load resource from classpath: " + filename);
        }
        IOUtils.copy(in, sw, java.nio.charset.StandardCharsets.UTF_8);
        return sw.toString();
    } catch (final IOException e) {
        // keep the original cause attached
        throw new RuntimeException("cannot load resource from classpath: " + filename, e);
    }
}
}

View File

@ -0,0 +1,54 @@
package eu.dnetlib.pace.config;
import java.util.List;
import java.util.Map;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
/**
 * Interface for PACE configuration bean.
 *
 * @author claudio
 */
public interface Config {

    /**
     * Field configuration definitions.
     *
     * @return the list of field definitions
     */
    List<FieldDef> model();

    /**
     * Decision tree definition.
     *
     * @return the map representing the decision tree, as tree node definitions by name
     */
    Map<String, TreeNodeDef> decisionTree();

    /**
     * Clustering definitions.
     *
     * @return the list of clustering definitions
     */
    List<ClusteringDef> clusterings();

    /**
     * Blacklists.
     *
     * @return the map of blacklist predicates by name
     */
    Map<String, Predicate<String>> blacklists();

    /**
     * Translation map.
     *
     * @return the map used to translate terms
     */
    Map<String, String> translationMap();
}

View File

@ -0,0 +1,163 @@
package eu.dnetlib.pace.config;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.util.PaceException;
import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.AbstractMap;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.stream.Collectors;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
/**
 * Deduplication configuration, composed by the PACE section ({@link PaceConfig}) and the
 * workflow section ({@link WfConfig}).
 * <p>
 * Instances are created from JSON through {@link #load(String)} or from the bundled
 * template through {@link #loadDefault(Map)}.
 */
public class DedupConfig implements Config, Serializable {

    /** Name of the StringTemplate resource (resolved relative to this class) of the default configuration. */
    private static final String CONFIG_TEMPLATE = "dedupConfig.st";

    private PaceConfig pace;

    private WfConfig wf;

    /** Compiled blacklists, built by {@link #load(String)}; not part of the JSON representation. */
    @JsonIgnore
    private Map<String, Predicate<String>> blacklists;

    /** Default values for the attributes of the configuration template. */
    private static final Map<String, String> defaults = Maps.newHashMap();

    static {
        defaults.put("dedupRun", "001");
        defaults.put("entityType", "result");
        defaults.put("subEntityType", "resulttype");
        defaults.put("subEntityValue", "publication");
        defaults.put("orderField", "title");
        defaults.put("queueMaxSize", "2000");
        defaults.put("groupMaxSize", "10");
        defaults.put("slidingWindowSize", "200");
        defaults.put("rootBuilder", "result");
        defaults.put("includeChildren", "true");
        defaults.put("maxIterations", "20");
        defaults.put("idPath", "$.id");
    }

    public DedupConfig() {
    }

    /**
     * Parses a configuration from its JSON representation and initialises the derived
     * structures (model map, translation map, compiled blacklists).
     *
     * @param json the JSON configuration
     * @return the configuration
     * @throws PaceException when the JSON cannot be parsed or a blacklist entry is not a
     *                       valid regular expression
     */
    public static DedupConfig load(final String json) {

        final DedupConfig config;
        try {
            config = new ObjectMapper().readValue(json, DedupConfig.class);
            config.getPace().initModel();
            config.getPace().initTranslationMap();
            config.blacklists = compileBlacklists(config.getPace().getBlacklists());
            return config;
        } catch (IOException | PatternSyntaxException e) {
            throw new PaceException("Error in parsing configuration json", e);
        }
    }

    /**
     * Compiles the blacklist expressions of each field into a single predicate, which holds
     * when the whole input matches at least one of the expressions. Blank expressions are
     * ignored.
     *
     * @param raw the blacklist expressions by field name, possibly {@code null}
     * @return the compiled predicates by field name (empty when no blacklist is configured)
     */
    private static Map<String, Predicate<String>> compileBlacklists(final Map<String, List<String>> raw) {
        final Map<String, Predicate<String>> compiled = new HashMap<>();
        if (raw == null) {
            // a configuration without the "blacklists" section has no blacklist at all
            return compiled;
        }
        for (final Entry<String, List<String>> e : raw.entrySet()) {
            final List<Pattern> patterns = e.getValue()
                .stream()
                .filter(s -> !StringUtils.isBlank(s))
                .map(Pattern::compile)
                .collect(Collectors.toList());
            // the intersection cast keeps the predicate serialisable together with the configuration
            compiled.put(
                e.getKey(),
                (Predicate<String> & Serializable) s -> patterns.stream().anyMatch(p -> p.matcher(s).matches()));
        }
        return compiled;
    }

    /**
     * Loads the default configuration, using only the built-in default values.
     *
     * @return the default configuration
     * @throws IOException when the configuration template cannot be read
     */
    public static DedupConfig loadDefault() throws IOException {
        return loadDefault(new HashMap<String, String>());
    }

    /**
     * Loads the default configuration template, overriding the built-in default values with
     * the given parameters.
     *
     * @param params the template attributes to set or override
     * @return the resulting configuration
     * @throws IOException when the configuration template cannot be read
     */
    public static DedupConfig loadDefault(final Map<String, String> params) throws IOException {

        final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE));

        for (final Entry<String, String> e : defaults.entrySet()) {
            template.setAttribute(e.getKey(), e.getValue());
        }
        for (final Entry<String, String> e : params.entrySet()) {
            if (template.getAttribute(e.getKey()) != null) {
                // the attribute already has a value: replace it in the attribute map
                template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue());
            } else {
                template.setAttribute(e.getKey(), e.getValue());
            }
        }

        final String json = template.toString();
        return load(json);
    }

    /**
     * Reads a resource, resolved relative to this class, as a UTF-8 string.
     *
     * @param resource the resource name
     * @return the content of the resource
     * @throws IOException when the resource does not exist or cannot be read
     */
    private String readFromClasspath(final String resource) throws IOException {
        final java.net.URL url = getClass().getResource(resource);
        if (url == null) {
            // report the missing resource instead of failing with a bare NullPointerException
            throw new IOException("cannot find resource on classpath: " + resource);
        }
        return IOUtils.toString(url, StandardCharsets.UTF_8);
    }

    public PaceConfig getPace() {
        return pace;
    }

    public void setPace(final PaceConfig pace) {
        this.pace = pace;
    }

    public WfConfig getWf() {
        return wf;
    }

    public void setWf(final WfConfig wf) {
        this.wf = wf;
    }

    /**
     * @return the JSON representation of this configuration
     * @throws PaceException when the configuration cannot be serialised
     */
    @Override
    public String toString() {
        try {
            return new ObjectMapper().writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("unable to serialise configuration", e);
        }
    }

    @Override
    public Map<String, TreeNodeDef> decisionTree() {
        return getPace().getDecisionTree();
    }

    @Override
    public List<FieldDef> model() {
        return getPace().getModel();
    }

    @Override
    public List<ClusteringDef> clusterings() {
        return getPace().getClustering();
    }

    @Override
    public Map<String, Predicate<String>> blacklists() {
        return blacklists;
    }

    @Override
    public Map<String, String> translationMap() {
        return getPace().translationMap();
    }
}

View File

@ -0,0 +1,105 @@
package eu.dnetlib.pace.config;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.google.common.collect.Maps;
import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
import eu.dnetlib.pace.util.PaceResolver;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
/**
 * PACE section of the deduplication configuration: field model, clustering definitions,
 * decision tree, blacklists and synonyms.
 * <p>
 * {@link #initModel()} and {@link #initTranslationMap()} build the derived structures
 * ({@code modelMap}, {@code translationMap}) from the values set on the bean.
 */
public class PaceConfig extends AbstractPaceFunctions implements Serializable {

    private List<FieldDef> model;

    private List<ClusteringDef> clustering;
    private Map<String, TreeNodeDef> decisionTree;

    private Map<String, List<String>> blacklists;
    private Map<String, List<String>> synonyms;

    /** Term-to-key map derived from {@code synonyms} by {@link #initTranslationMap()}. */
    @JsonIgnore
    private Map<String, String> translationMap;

    /**
     * @return the field definitions indexed by field name, as built by {@link #initModel()}
     */
    public Map<String, FieldDef> getModelMap() {
        return modelMap;
    }

    @JsonIgnore
    private Map<String, FieldDef> modelMap;

    @JsonIgnore
    public static PaceResolver resolver = new PaceResolver();

    public PaceConfig() {}

    /**
     * Indexes the field definitions of the model by their name.
     */
    public void initModel() {
        modelMap = Maps.newHashMap();
        for (FieldDef fd : getModel()) {
            modelMap.put(fd.getName(), fd);
        }
    }

    /**
     * Builds the translation map from the synonyms: every term (lower-cased, transliterated
     * with {@code Any-Eng} and passed through {@code fixAliases}) is mapped to the key it is
     * listed under. When no synonyms are configured the translation map is left empty.
     */
    public void initTranslationMap(){
        translationMap = Maps.newHashMap();

        if (synonyms == null) {
            // configuration without the "synonyms" section: nothing to translate
            return;
        }

        Transliterator transliterator = Transliterator.getInstance("Any-Eng");
        for (Map.Entry<String, List<String>> entry : synonyms.entrySet()) {
            if (entry.getValue() == null) {
                continue;
            }
            for (String term : entry.getValue()) {
                translationMap.put(
                    fixAliases(transliterator.transliterate(term.toLowerCase())),
                    entry.getKey());
            }
        }
    }

    /**
     * @return the translation map built by {@link #initTranslationMap()}
     */
    public Map<String, String> translationMap(){
        return translationMap;
    }

    public List<FieldDef> getModel() {
        return model;
    }

    public void setModel(final List<FieldDef> model) {
        this.model = model;
    }

    public List<ClusteringDef> getClustering() {
        return clustering;
    }

    public void setClustering(final List<ClusteringDef> clustering) {
        this.clustering = clustering;
    }

    public Map<String, TreeNodeDef> getDecisionTree() {
        return decisionTree;
    }

    public void setDecisionTree(Map<String, TreeNodeDef> decisionTree) {
        this.decisionTree = decisionTree;
    }

    public Map<String, List<String>> getBlacklists() {
        return blacklists;
    }

    public void setBlacklists(final Map<String, List<String>> blacklists) {
        this.blacklists = blacklists;
    }

    public Map<String, List<String>> getSynonyms() {
        return synonyms;
    }

    public void setSynonyms(Map<String, List<String>> synonyms) {
        this.synonyms = synonyms;
    }

}

View File

@ -0,0 +1,5 @@
package eu.dnetlib.pace.config;
/**
 * Types that can be declared for a field of the model.
 */
public enum Type {
    String,
    Int,
    List,
    JSON,
    URL,
    StringConcat,
    DoubleArray
}

View File

@ -0,0 +1,292 @@
package eu.dnetlib.pace.config;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.io.Serializable;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Workflow section of the deduplication configuration.
 */
public class WfConfig implements Serializable {

    /** Entity type. */
    private String entityType = "";

    /** Sub-entity type: refers to one of the fields declared in the model (see eu.dnetlib.pace.config.PaceConfig.modelMap). */
    private String subEntityType = "";

    /** Sub-entity value: the value of the sub-entity type to be considered. */
    private String subEntityValue = "";

    /** Name of the field used to sort the values in the reducer phase. */
    private String orderField = "";

    /** Column families involved in the relations redirection. */
    private List<String> rootBuilder = Lists.newArrayList();

    /** Datasource namespace prefixes that won't be deduplicated. */
    private Set<String> skipList = Sets.newHashSet();

    /** Subprefix used to build the root id; allows multiple dedup runs. */
    private String dedupRun = "";

    /** Similarity threshold. */
    private double threshold = 0;

    /** Maximum size of the queue. */
    private int queueMaxSize = 2000;

    /** Maximum size of a group. */
    private int groupMaxSize;

    /** Size of the sliding window. */
    private int slidingWindowSize;

    /** Identifier of the configuration. */
    private String configurationId;

    /** Whether the children are included. */
    private boolean includeChildren;

    /** Default maximum number of allowed children. */
    private static final int MAX_CHILDREN = 10;

    /** Maximum number of allowed children. */
    private int maxChildren = MAX_CHILDREN;

    /** Default maximum number of iterations. */
    private static final int MAX_ITERATIONS = 20;

    /** Maximum number of iterations. */
    private int maxIterations = MAX_ITERATIONS;

    /** Path expression used to retrieve the identifier of an entity. */
    private String idPath = "$.id";

    public WfConfig() {}

    /**
     * Creates a workflow configuration from explicit values.
     *
     * @param entityType        the entity type
     * @param orderField        the order field
     * @param rootBuilder       the root builder families
     * @param dedupRun          the dedup run; single quotes are stripped from it
     * @param skipList          the skip list
     * @param queueMaxSize      the queue max size
     * @param groupMaxSize      the group max size
     * @param slidingWindowSize the sliding window size
     * @param includeChildren   whether the children are included in the representative records
     * @param maxIterations     the maximum number of iterations
     * @param idPath            the path for the id of the entity
     */
    public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder, final String dedupRun,
            final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations, final String idPath) {
        this.entityType = entityType;
        this.orderField = orderField;
        this.rootBuilder = rootBuilder;
        this.dedupRun = cleanupStringNumber(dedupRun);
        this.skipList = skipList;
        this.queueMaxSize = queueMaxSize;
        this.groupMaxSize = groupMaxSize;
        this.slidingWindowSize = slidingWindowSize;
        this.includeChildren = includeChildren;
        this.maxIterations = maxIterations;
        this.idPath = idPath;
    }

    /**
     * Strips the single quotes from a string.
     *
     * @param s the string to clean
     * @return the string without single quotes
     */
    private String cleanupStringNumber(final String s) {
        if (!s.contains("'")) {
            return s;
        }
        return s.replaceAll("'", "");
    }

    /**
     * @return true when both the sub-entity type and the sub-entity value are not blank
     */
    public boolean hasSubType() {
        final boolean typeDeclared = StringUtils.isNotBlank(getSubEntityType());
        return typeDeclared && StringUtils.isNotBlank(getSubEntityValue());
    }

    public String getEntityType() {
        return entityType;
    }

    public void setEntityType(final String entityType) {
        this.entityType = entityType;
    }

    public String getSubEntityType() {
        return subEntityType;
    }

    public void setSubEntityType(final String subEntityType) {
        this.subEntityType = subEntityType;
    }

    public String getSubEntityValue() {
        return subEntityValue;
    }

    public void setSubEntityValue(final String subEntityValue) {
        this.subEntityValue = subEntityValue;
    }

    public String getOrderField() {
        return orderField;
    }

    public void setOrderField(final String orderField) {
        this.orderField = orderField;
    }

    public List<String> getRootBuilder() {
        return rootBuilder;
    }

    public void setRootBuilder(final List<String> rootBuilder) {
        this.rootBuilder = rootBuilder;
    }

    /**
     * @return the skip list, or a new empty set when none has been set
     */
    public Set<String> getSkipList() {
        if (skipList == null) {
            return new HashSet<String>();
        }
        return skipList;
    }

    public void setSkipList(final Set<String> skipList) {
        this.skipList = skipList;
    }

    public String getDedupRun() {
        return dedupRun;
    }

    public void setDedupRun(final String dedupRun) {
        this.dedupRun = dedupRun;
    }

    public double getThreshold() {
        return threshold;
    }

    public void setThreshold(final double threshold) {
        this.threshold = threshold;
    }

    public int getQueueMaxSize() {
        return queueMaxSize;
    }

    public void setQueueMaxSize(final int queueMaxSize) {
        this.queueMaxSize = queueMaxSize;
    }

    public int getGroupMaxSize() {
        return groupMaxSize;
    }

    public void setGroupMaxSize(final int groupMaxSize) {
        this.groupMaxSize = groupMaxSize;
    }

    public int getSlidingWindowSize() {
        return slidingWindowSize;
    }

    public void setSlidingWindowSize(final int slidingWindowSize) {
        this.slidingWindowSize = slidingWindowSize;
    }

    public String getConfigurationId() {
        return configurationId;
    }

    public void setConfigurationId(final String configurationId) {
        this.configurationId = configurationId;
    }

    public boolean isIncludeChildren() {
        return includeChildren;
    }

    public void setIncludeChildren(final boolean includeChildren) {
        this.includeChildren = includeChildren;
    }

    public int getMaxChildren() {
        return maxChildren;
    }

    public void setMaxChildren(final int maxChildren) {
        this.maxChildren = maxChildren;
    }

    public int getMaxIterations() {
        return maxIterations;
    }

    public void setMaxIterations(int maxIterations) {
        this.maxIterations = maxIterations;
    }

    public String getIdPath() {
        return idPath;
    }

    public void setIdPath(String idPath) {
        this.idPath = idPath;
    }

    /**
     * @return the JSON representation of this configuration
     * @throws PaceException when the configuration cannot be serialised
     */
    @Override
    public String toString() {
        try {
            final ObjectMapper mapper = new ObjectMapper();
            return mapper.writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("unable to serialise " + this.getClass().getName(), e);
        }
    }

}

View File

@ -0,0 +1,61 @@
package eu.dnetlib.pace.model;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.util.PaceException;
import java.io.IOException;
import java.io.Serializable;
import java.util.List;
import java.util.Map;
/**
 * Definition of a clustering: the name of the clustering function, the fields it is
 * applied to and its integer parameters.
 */
public class ClusteringDef implements Serializable {

    /** Name of the clustering function. */
    private String name;

    /** Fields the clustering is applied to. */
    private List<String> fields;

    /** Parameters handed to the clustering function. */
    private Map<String, Integer> params;

    public ClusteringDef() {}

    public String getName() {
        return name;
    }

    public void setName(final String name) {
        this.name = name;
    }

    /**
     * Resolves the clustering function through {@code PaceConfig.resolver}.
     *
     * @return the clustering function obtained for this name and these parameters
     */
    public ClusteringFunction clusteringFunction() {
        final String functionName = getName();
        return PaceConfig.resolver.getClusteringFunction(functionName, params);
    }

    public List<String> getFields() {
        return fields;
    }

    public void setFields(final List<String> fields) {
        this.fields = fields;
    }

    public Map<String, Integer> getParams() {
        return params;
    }

    public void setParams(final Map<String, Integer> params) {
        this.params = params;
    }

    /**
     * @return the JSON representation of this definition
     * @throws PaceException when the definition cannot be serialised
     */
    @Override
    public String toString() {
        try {
            final ObjectMapper mapper = new ObjectMapper();
            return mapper.writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("unable to serialise " + this.getClass().getName(), e);
        }
    }

}

View File

@ -0,0 +1,100 @@
package eu.dnetlib.pace.model;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Type;
import java.io.Serializable;
import java.util.List;
/**
 * The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm.
 */
public class FieldDef implements Serializable {

    /** Separator of the steps of a field path. */
    public final static String PATH_SEPARATOR = "/";

    private String name;

    private String path;

    private Type type;

    private boolean overrideMatch;

    /**
     * Sets maximum size for the repeatable fields in the model. -1 for unbounded size.
     */
    private int size = -1;

    /**
     * Sets maximum length for field values in the model. -1 for unbounded length.
     */
    private int length = -1;

    public FieldDef() {}

    public String getName() {
        return name;
    }

    public String getPath() {
        return path;
    }

    /**
     * @return the path split on {@link #PATH_SEPARATOR}
     */
    public List<String> getPathList() {
        return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath()));
    }

    public Type getType() {
        return type;
    }

    public void setType(final Type type) {
        this.type = type;
    }

    public boolean isOverrideMatch() {
        return overrideMatch;
    }

    public void setOverrideMatch(final boolean overrideMatch) {
        this.overrideMatch = overrideMatch;
    }

    public int getSize() {
        return size;
    }

    public void setSize(int size) {
        this.size = size;
    }

    public int getLength() {
        return length;
    }

    public void setLength(int length) {
        this.length = length;
    }

    public void setName(String name) {
        this.name = name;
    }

    public void setPath(String path) {
        this.path = path;
    }

    /**
     * Returns the JSON representation of this definition. When the serialisation fails a
     * plain description of the main attributes is returned instead, so that the result is
     * never {@code null}.
     *
     * @return a non-null textual representation of this definition
     */
    @Override
    public String toString() {
        try {
            return new ObjectMapper().writeValueAsString(this);
        } catch (JsonProcessingException e) {
            // toString() must not return null: fall back to a plain description
            return "FieldDef{name=" + name + ", path=" + path + ", type=" + type + "}";
        }
    }

}

View File

@ -0,0 +1,155 @@
package eu.dnetlib.pace.model;
import java.nio.charset.Charset;
import java.text.Normalizer;
import java.util.List;
import java.util.Set;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.hash.Hashing;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.util.Capitalise;
import eu.dnetlib.pace.util.DotAbbreviations;
/**
 * Parses a person name given as a single string into name, surname and fullname tokens.
 * <p>
 * The constructor normalises the string and then tries to tell name from surname; when it
 * cannot, only the fullname tokens are filled and {@link #isAccurate()} returns false.
 */
public class Person {
// charset name used by hash()
private static final String UTF8 = "UTF-8";
// tokens recognised as first name (may stay empty)
private List<String> name = Lists.newArrayList();
// tokens recognised as surname (may stay empty)
private List<String> surname = Lists.newArrayList();
// all the tokens of the name
private List<String> fullname = Lists.newArrayList();
// the string handed to the constructor, untouched
private final String original;
// name particles skipped by splitTerms; loaded from the classpath on first use
// NOTE(review): the lazy initialisation in splitTerms is not synchronised — confirm this is acceptable for the callers
private static Set<String> particles = null;
/**
 * Parses the given string.
 *
 * @param s          the person name
 * @param aggressive when true the combining diacritical marks are removed as well
 */
public Person(String s, final boolean aggressive) {
original = s;
// NFD decomposition: accented characters become base character + combining mark
s = Normalizer.normalize(s, Normalizer.Form.NFD);
// drop the parts enclosed in (), [] and {}
s = s.replaceAll("\\(.+\\)", "");
s = s.replaceAll("\\[.+\\]", "");
s = s.replaceAll("\\{.+\\}", "");
// glue the two sides of a dash surrounded by spaces
s = s.replaceAll("\\s+-\\s+", "-");
// punctuation (but comma and dash), digits, newlines and dots become spaces
s = s.replaceAll("[\\p{Punct}&&[^,-]]", " ");
s = s.replaceAll("\\d", " ");
s = s.replaceAll("\\n", " ");
s = s.replaceAll("\\.", " ");
// collapse the runs of whitespace
s = s.replaceAll("\\s+", " ");
if (aggressive) {
s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", "");
// s = s.replaceAll("[\\W&&[^,-]]", "");
}
if (s.contains(",")) { //if the name contains a comma it is easy derivable the name and the surname
final String[] arr = s.split(",");
if (arr.length == 1) {
fullname = splitTerms(arr[0]);
} else if (arr.length > 1) {
// "surname, name": the text before the first comma is the surname, the next chunk is the name
surname = splitTerms(arr[0]);
name = splitTerms(arr[1]);
fullname.addAll(surname);
fullname.addAll(name);
}
} else {
fullname = splitTerms(s);
// position of the last one-letter token (an initial); fullname.size() when there is none
int lastInitialPosition = fullname.size();
boolean hasSurnameInUpperCase = false;
for (int i = 0; i < fullname.size(); i++) {
final String term = fullname.get(i);
if (term.length() == 1) {
lastInitialPosition = i;
} else if (term.equals(term.toUpperCase())) {
hasSurnameInUpperCase = true;
}
}
if (lastInitialPosition < (fullname.size() - 1)) { // Case: Michele G. Artini
// name and surname are views (subList) backed by fullname
name = fullname.subList(0, lastInitialPosition + 1);
surname = fullname.subList(lastInitialPosition + 1, fullname.size());
} else if (hasSurnameInUpperCase) { // Case: Michele ARTINI
for (final String term : fullname) {
if ((term.length() > 1) && term.equals(term.toUpperCase())) {
surname.add(term);
} else {
name.add(term);
}
}
}
}
}
/**
 * Splits a string on spaces, dropping the empty tokens and the name particles listed in
 * the name_particles.txt classpath resource.
 *
 * @param s the string to split
 * @return the remaining tokens, in their original order
 */
private List<String> splitTerms(final String s) {
if (particles == null) {
particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
}
final List<String> list = Lists.newArrayList();
for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
if (!particles.contains(part.toLowerCase())) {
list.add(part);
}
}
return list;
}
/** @return the tokens recognised as first name */
public List<String> getName() {
return name;
}
/** @return the name tokens joined by a space */
public String getNameString() {
return Joiner.on(" ").join(getName());
}
/** @return the tokens recognised as surname */
public List<String> getSurname() {
return surname;
}
/** @return all the tokens of the name */
public List<String> getFullname() {
return fullname;
}
/** @return the string originally handed to the constructor */
public String getOriginal() {
return original;
}
/** @return the murmur3 128-bit hash of the normalised fullname, as a string */
public String hash() {
return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString();
}
/** @return the name tokens, transformed by DotAbbreviations and Capitalise and joined by a space */
public String getNormalisedFirstName() {
return Joiner.on(" ").join(getCapitalFirstnames());
}
/** @return the surname tokens, transformed by Capitalise and joined by a space */
public String getNormalisedSurname() {
return Joiner.on(" ").join(getCapitalSurname());
}
/** @return the surname tokens joined by a space */
public String getSurnameString() {
return Joiner.on(" ").join(getSurname());
}
/**
 * @return "surname, name" (normalised) when the person is accurate, the fullname tokens
 *         joined by a space otherwise
 */
public String getNormalisedFullname() {
return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname);
}
/** @return the name tokens with abbreviations, each transformed by Capitalise */
public List<String> getCapitalFirstnames() {
return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), new Capitalise()));
}
/** @return the surname tokens, each transformed by Capitalise */
public List<String> getCapitalSurname() {
return Lists.newArrayList(Iterables.transform(surname, new Capitalise()));
}
/** @return the name tokens, each transformed by DotAbbreviations */
public List<String> getNameWithAbbreviations() {
return Lists.newArrayList(Iterables.transform(name, new DotAbbreviations()));
}
/** @return true when both name and surname have been recognised (both non-empty) */
public boolean isAccurate() {
return ((name != null) && (surname != null) && !name.isEmpty() && !surname.isEmpty());
}
}

View File

@ -0,0 +1,118 @@
package eu.dnetlib.pace.model;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
/**
 * Utilities to generate n-grams for person names and to check whether two person names
 * are similar.
 */
public class PersonComparatorUtils {

    /** Names longer than this number of characters produce no n-grams. */
    private static final int MAX_FULLNAME_LENGTH = 50;

    /**
     * Generates the n-grams of a person name, in the form {@code initial_term} (lower-cased).
     * <p>
     * For an accurate person every name initial is combined with every surname; otherwise
     * every token longer than one character is combined with the initial of each other token.
     *
     * @param fullname the person name
     * @return the n-grams; empty when the name is longer than {@code MAX_FULLNAME_LENGTH}
     */
    public static Set<String> getNgramsForPerson(String fullname) {

        Set<String> set = Sets.newHashSet();

        if (fullname.length() > MAX_FULLNAME_LENGTH) {
            return set;
        }

        Person p = new Person(fullname, true);

        if (p.isAccurate()) {
            for (String name : p.getName()) {
                for (String surname : p.getSurname()) {
                    set.add((name.charAt(0) + "_" + surname).toLowerCase());
                }
            }
        } else {
            List<String> list = p.getFullname();
            for (int i = 0; i < list.size(); i++) {
                if (list.get(i).length() > 1) {
                    for (int j = 0; j < list.size(); j++) {
                        if (i != j) {
                            set.add((list.get(j).charAt(0) + "_" + list.get(i)).toLowerCase());
                        }
                    }
                }
            }
        }

        return set;
    }

    /**
     * Checks whether two person names are similar.
     * <p>
     * When both names are accurate, names and surnames are verified separately; otherwise
     * the fullname tokens are compared.
     *
     * @param s1 the first person name
     * @param s2 the second person name
     * @return true when the two names are considered similar
     */
    public static boolean areSimilar(String s1, String s2) {
        Person p1 = new Person(s1, true);
        Person p2 = new Person(s2, true);

        if (p1.isAccurate() && p2.isAccurate()) {
            return verifyNames(p1.getName(), p2.getName()) && verifySurnames(p1.getSurname(), p2.getSurname());
        } else {
            return verifyFullnames(p1.getFullname(), p2.getFullname());
        }
    }

    /** Names are similar when both their extended names and their initials are compatible. */
    private static boolean verifyNames(List<String> list1, List<String> list2) {
        return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2))
                && verifySimilarity(extractInitials(list1), extractInitials(list2));
    }

    /** Surnames must have the same number of tokens, equal position by position (ignoring case). */
    private static boolean verifySurnames(List<String> list1, List<String> list2) {
        if (list1.size() != list2.size()) {
            return false;
        }
        for (int i = 0; i < list1.size(); i++) {
            if (!list1.get(i).equalsIgnoreCase(list2.get(i))) {
                return false;
            }
        }
        return true;
    }

    /** Fullnames are compared after sorting their tokens, so that the token order is irrelevant. */
    private static boolean verifyFullnames(List<String> list1, List<String> list2) {
        // sort copies: the arguments are the internal lists of the Person instances and
        // must not be reordered as a side effect of the comparison
        final List<String> sorted1 = new ArrayList<>(list1);
        final List<String> sorted2 = new ArrayList<>(list2);
        Collections.sort(sorted1);
        Collections.sort(sorted2);
        return verifySimilarity(extractExtendedNames(sorted1), extractExtendedNames(sorted2))
                && verifySimilarity(extractInitials(sorted1), extractInitials(sorted2));
    }

    /** Returns the tokens longer than one character, lower-cased, in a new list. */
    private static List<String> extractExtendedNames(List<String> list) {
        List<String> res = Lists.newArrayList();
        for (String s : list) {
            if (s.length() > 1) {
                res.add(s.toLowerCase());
            }
        }
        return res;
    }

    /** Returns the lower-cased first character of every token, in a new list. */
    private static List<String> extractInitials(List<String> list) {
        List<String> res = Lists.newArrayList();
        for (String s : list) {
            res.add(s.substring(0, 1).toLowerCase());
        }
        return res;
    }

    /**
     * Checks that every element of the shorter list is found in the longer one, at strictly
     * increasing positions. The longer list is modified: the matched elements are overwritten.
     */
    private static boolean verifySimilarity(List<String> list1, List<String> list2) {
        if (list1.size() > list2.size()) {
            return verifySimilarity(list2, list1);
        }

        // NB: List2 is greater than list1 (or equal)
        int pos = -1;
        for (String s : list1) {
            int curr = list2.indexOf(s);
            if (curr > pos) {
                list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm"
                pos = curr;
            } else {
                return false;
            }
        }
        return true;
    }
}

View File

@ -0,0 +1,54 @@
package eu.dnetlib.pace.model;
import eu.dnetlib.pace.clustering.NGramUtils;
import org.apache.spark.sql.Row;
import java.util.Comparator;
/**
 * Orders {@link Row}s by the string value of one of their columns, after cleaning the
 * values with {@code NGramUtils.cleanupForOrdering}. Null rows and null values are sorted
 * first.
 */
public class RowDataOrderingComparator implements Comparator<Row> {

    /** Index of the column holding the value to compare. */
    private int comparatorField;

    /**
     * Creates a comparator working on the given column.
     *
     * @param comparatorField
     *            the index of the column to compare
     */
    public RowDataOrderingComparator(final int comparatorField) {
        this.comparatorField = comparatorField;
    }

    /**
     * Compares two rows on the configured column.
     *
     * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
     */
    @Override
    public int compare(final Row d1, final Row d2) {
        if (d1 == null || d2 == null) {
            return compareWithNull(d1, d2);
        }

        final String o1 = d1.getString(comparatorField);
        final String o2 = d2.getString(comparatorField);
        if (o1 == null || o2 == null) {
            return compareWithNull(o1, o2);
        }

        final String cleaned1 = NGramUtils.cleanupForOrdering(o1);
        final String cleaned2 = NGramUtils.cleanupForOrdering(o2);
        return cleaned1.compareTo(cleaned2);
    }

    /**
     * Orders two references of which at least one is null: null comes first, two nulls are equal.
     */
    private static int compareWithNull(final Object first, final Object second) {
        if (first != null) {
            return 1;
        }
        return second == null ? 0 : -1;
    }

}

View File

@ -0,0 +1,41 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator registered as {@code alwaysMatch}: whatever the values are, the comparison
 * result is 1.0.
 */
@ComparatorClass("alwaysMatch")
public class AlwaysMatch<T> extends AbstractComparator<T> {

    /**
     * @param params the comparator parameters, handed to the superclass
     */
    public AlwaysMatch(final Map<String, String> params) {
        super(params, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * @param weight the weight of the comparator
     */
    public AlwaysMatch(final double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * @param weight the weight of the comparator
     * @param ssalgo the string distance handed to the superclass
     */
    protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * @return always 1.0, the arguments are not inspected
     */
    @Override
    public double compare(final Object a, final Object b, final Config conf) {
        return 1.0;
    }

    @Override
    public double getWeight() {
        return weight;
    }

    /**
     * @return the value itself, unchanged
     */
    @Override
    protected double normalize(final double d) {
        return d;
    }

}

View File

@ -0,0 +1,148 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@ComparatorClass("authorsMatch")
public class AuthorsMatch extends AbstractListComparator {
Map<String, String> params;
private double SURNAME_THRESHOLD;
private double NAME_THRESHOLD;
private double FULLNAME_THRESHOLD;
private String MODE; //full or surname
private int SIZE_THRESHOLD;
private String TYPE; //count or percentage
private int common;
public AuthorsMatch(Map<String, String> params){
super(params, new com.wcohen.ss.JaroWinkler());
this.params = params;
MODE = params.getOrDefault("mode", "full");
SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
TYPE = params.getOrDefault("type", "percentage");
common = 0;
}
protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
super(w, ssalgo);
}
@Override
public double compare(final List<String> a, final List<String> b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1;
if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
return 1.0;
List<Person> aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
common = 0;
//compare each element of List1 with each element of List2
for (Person p1 : aList)
for (Person p2 : bList) {
//both persons are inaccurate
if (!p1.isAccurate() && !p2.isAccurate()) {
//compare just normalized fullnames
String fullname1 = normalization(p1.getNormalisedFullname().isEmpty()? p1.getOriginal() : p1.getNormalisedFullname());
String fullname2 = normalization(p2.getNormalisedFullname().isEmpty()? p2.getOriginal() : p2.getNormalisedFullname());
if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) {
common += 1;
break;
}
}
//one person is inaccurate
if (p1.isAccurate() ^ p2.isAccurate()) {
//prepare data
//data for the accurate person
String name = normalization(p1.isAccurate()? p1.getNormalisedFirstName() : p2.getNormalisedFirstName());
String surname = normalization(p1.isAccurate()? p1.getNormalisedSurname() : p2.getNormalisedSurname());
//data for the inaccurate person
String fullname = normalization(
p1.isAccurate() ? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname()) : (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname())
);
if (fullname.contains(surname)) {
if (MODE.equals("full")) {
if (fullname.contains(name)) {
common += 1;
break;
}
}
else { //MODE equals "surname"
common += 1;
break;
}
}
}
//both persons are accurate
if (p1.isAccurate() && p2.isAccurate()) {
if (compareSurname(p1, p2)) {
if (MODE.equals("full")) {
if(compareFirstname(p1, p2)) {
common += 1;
break;
}
}
else { //MODE equals "surname"
common += 1;
break;
}
}
}
}
//normalization factor to compute the score
int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
if(TYPE.equals("percentage")) {
return (double) common / normFactor;
}
else {
return (double) common;
}
}
public boolean compareSurname(Person p1, Person p2) {
return ssalgo.score(normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD;
}
public boolean compareFirstname(Person p1, Person p2) {
if(p1.getNormalisedFirstName().length()<=2 || p2.getNormalisedFirstName().length()<=2) {
if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName())))
return true;
}
return ssalgo.score(normalization(p1.getNormalisedFirstName()), normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD;
}
/**
 * Prepares a name for comparison by applying, in this order, the {@code cleanup},
 * {@code utf8} and {@code normalize} helpers (all defined outside this block).
 *
 * @param s the raw string
 * @return the string after the three transformations
 */
public String normalization(String s) {
    final String cleaned = cleanup(s);
    return normalize(utf8(cleaned));
}
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
import java.util.Set;
/**
 * Comparator that scores two strings by the city codes detected in them.
 *
 * <p>The size of the detection window is read from the {@code windowSize} parameter
 * (default {@code 4}).
 */
@ComparatorClass("cityMatch")
public class CityMatch extends AbstractStringComparator {

    private Map<String, String> params;

    public CityMatch(Map<String, String> params) {
        super(params);
        this.params = params;
    }

    /**
     * Compares the sets of city codes found in the two strings.
     *
     * @return {@code 1.0} when neither string contains a city, {@code -1} (undefined) when
     *         only one of them does, otherwise the value of {@code commonElementsPercentage}
     *         for the two code sets
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {
        // same pipeline for both inputs: cleanup -> normalize -> stop-word filtering
        final String textA = filterAllStopWords(normalize(cleanup(a)));
        final String textB = filterAllStopWords(normalize(cleanup(b)));

        final int window = Integer.parseInt(params.getOrDefault("windowSize", "4"));

        final Set<String> codesA = citiesToCodes(getCities(textA, window));
        final Set<String> codesB = citiesToCodes(getCities(textB, window));

        final boolean noneInA = codesA.isEmpty();
        final boolean noneInB = codesB.isEmpty();

        // if no cities are detected, the comparator gives 1.0
        if (noneInA && noneInB) {
            return 1.0;
        }
        // undefined if exactly one of the two has no cities
        if (noneInA || noneInB) {
            return -1;
        }
        return commonElementsPercentage(codesA, codesB);
    }
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator computing the cosine similarity between two numeric vectors.
 *
 * <p>The result is {@code -1} (the value this comparator already used for "undefined")
 * whenever the similarity cannot be computed: a missing or empty vector, vectors of
 * different length, or a vector whose norm is zero.
 */
@ComparatorClass("cosineSimilarity")
public class CosineSimilarity extends AbstractComparator<double[]> {

    Map<String, String> params;

    public CosineSimilarity(Map<String,String> params) {
        super(params);
        // keep the configuration reachable through the field instead of leaving it null
        this.params = params;
    }

    @Override
    public double compare(Object a, Object b, Config config) {
        return compare((double[]) a, (double[]) b, config);
    }

    /**
     * Compares two vectors.
     *
     * @param a    the first vector
     * @param b    the second vector
     * @param conf the configuration (not used here)
     * @return the cosine similarity, or {@code -1} when it is undefined
     */
    public double compare(final double[] a, final double[] b, final Config conf) {
        if (a == null || b == null || a.length == 0 || b.length == 0) {
            return -1;
        }
        // vectors of different dimensionality cannot be compared component-wise;
        // previously this either read past the end of b or silently ignored its tail
        if (a.length != b.length) {
            return -1;
        }
        final double similarity = cosineSimilarity(a, b);
        // a zero-norm vector produces 0/0 = NaN: report it as undefined instead
        return Double.isNaN(similarity) ? -1 : similarity;
    }

    /**
     * Computes {@code (a . b) / (|a| * |b|)} over the components of {@code a}.
     *
     * @return the cosine similarity; {@code NaN} when one of the norms is zero
     */
    double cosineSimilarity(double[] a, double[] b) {
        double dotProduct = 0;
        double normASum = 0;
        double normBSum = 0;

        for (int i = 0; i < a.length; i++) {
            dotProduct += a[i] * b[i];
            normASum += a[i] * a[i];
            normBSum += b[i] * b[i];
        }

        final double normProduct = Math.sqrt(normASum) * Math.sqrt(normBSum);
        return dotProduct / normProduct;
    }
}

View File

@ -0,0 +1,26 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Case-insensitive exact match for DOIs: the {@code http://dx.doi.org/} and {@code doi:}
 * prefixes are stripped from the value before the inherited comparison is applied.
 *
 * @author claudio
 */
@ComparatorClass("doiExactMatch")
public class DoiExactMatch extends ExactMatchIgnoreCase {

    /** Regular expression matching the prefixes removed from a DOI before comparing. */
    public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";

    public DoiExactMatch(final Map<String, String> params) {
        super(params);
    }

    /**
     * Converts the field to its string form and removes every occurrence of {@link #PREFIX}.
     */
    @Override
    protected String toString(final Object f) {
        final String raw = super.toString(f);
        return raw.replaceAll(PREFIX, "");
    }
}

View File

@ -0,0 +1,29 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
/**
 * Case-insensitive exact match on the host part of a URL.
 */
@ComparatorClass("domainExactMatch")
public class DomainExactMatch extends ExactMatchIgnoreCase {

    public DomainExactMatch(final Map<String, String> params) {
        super(params);
    }

    /**
     * Reduces the field to the host of the URL it contains.
     *
     * @return the host, or the empty string when the value is not a well-formed URL
     */
    @Override
    protected String toString(final Object f) {
        try {
            final String raw = super.toString(f);
            final URL url = new URL(raw);
            return url.getHost();
        } catch (MalformedURLException e) {
            // not a URL: fall back to the empty string
            return "";
        }
    }
}

View File

@ -0,0 +1,42 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator returning a match only when the two strings are exactly equal (case-sensitive).
 */
@ComparatorClass("exactMatch")
public class ExactMatch extends AbstractStringComparator {

    public ExactMatch(Map<String, String> params){
        super(params, new com.wcohen.ss.JaroWinkler());
    }

    public ExactMatch(final double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * @return {@code -1.0} if a field is missing (empty), {@code 1.0} when the strings are
     *         equal, {@code 0} otherwise
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {
        final boolean fieldMissing = a.isEmpty() || b.isEmpty();
        if (fieldMissing) {
            return -1.0;
        }
        if (a.equals(b)) {
            return 1.0;
        }
        return 0;
    }

    /** @return the weight held by the superclass */
    @Override
    public double getWeight() {
        return this.weight;
    }

    /** Identity normalisation: the score is returned untouched. */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.pace.tree;
import com.google.common.base.Joiner;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.List;
import java.util.Map;
/**
 * Comparator returning a match only when the two strings are equal ignoring case.
 */
@ComparatorClass("exactMatchIgnoreCase")
public class ExactMatchIgnoreCase extends AbstractStringComparator {

    public ExactMatchIgnoreCase(Map<String, String> params) {
        super(params);
    }

    /**
     * @return {@code -1} when one of the strings is empty, {@code 1} when they are equal
     *         ignoring case, {@code 0} otherwise
     */
    @Override
    public double compare(String a, String b, final Config conf) {
        final boolean fieldMissing = a.isEmpty() || b.isEmpty();
        if (fieldMissing) {
            return -1;
        }
        if (a.equalsIgnoreCase(b)) {
            return 1;
        }
        return 0;
    }

    /** Converts a field to the string used for the comparison (delegates to {@code toFirstString}). */
    protected String toString(final Object object) {
        final String text = toFirstString(object);
        return text;
    }
}

View File

@ -0,0 +1,80 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Comparator matching two lists of instance types after mapping each type to a coarser
 * category. Types mapped to {@code "*"} ("jolly" types) match anything.
 */
@ComparatorClass("instanceTypeMatch")
public class InstanceTypeMatch extends AbstractListComparator {

    /** Maps a specific instance type to its category; unmapped types are their own category. */
    final Map<String, String> translationMap = new HashMap<>();

    public InstanceTypeMatch(Map<String, String> params){
        super(params);

        // jolly types
        register("*", "Conference object", "Other literature type", "Unknown");

        // article types
        register("Article", "Article", "Data Paper", "Software Paper", "Preprint");

        // thesis types
        register("Thesis", "Thesis", "Master thesis", "Bachelor thesis", "Doctoral thesis");
    }

    /** Associates every given term with the same category. */
    private void register(final String category, final String... terms) {
        for (final String term : terms) {
            translationMap.put(term, category);
        }
    }

    /**
     * @return {@code -1} when a list is null or empty; {@code 1} when at least one side holds
     *         a jolly type or the two sides share a category; {@code 0} otherwise
     */
    @Override
    public double compare(final List<String> a, final List<String> b, final Config conf) {

        if (a == null || b == null || a.isEmpty() || b.isEmpty()) {
            return -1;
        }

        final Set<String> categoriesA = a.stream().map(this::translate).collect(Collectors.toSet());
        final Set<String> categoriesB = b.stream().map(this::translate).collect(Collectors.toSet());

        // if at least one is a jolly type, it must produce a match
        final boolean jolly = categoriesA.contains("*") || categoriesB.contains("*");
        if (jolly) {
            return 1.0;
        }

        // if at least one is in common, it must produce a match
        final boolean shared = !Sets.intersection(categoriesA, categoriesB).isEmpty();
        return shared ? 1 : 0;
    }

    /** Returns the category of a term, or the term itself when it has no mapping. */
    public String translate(String term){
        return translationMap.getOrDefault(term, term);
    }

    /** @return the weight held by the superclass */
    @Override
    public double getWeight() {
        return this.weight;
    }

    /** Identity normalisation: the score is returned untouched. */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -0,0 +1,44 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
/**
 * Comparator scoring two strings with the SecondString Jaro-Winkler algorithm.
 */
@ComparatorClass("jaroWinkler")
public class JaroWinkler extends AbstractStringComparator {

    public JaroWinkler(Map<String, String> params){
        super(params, new com.wcohen.ss.JaroWinkler());
    }

    public JaroWinkler(double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    protected JaroWinkler(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * Scores the two strings after passing each of them through {@code cleanup}.
     */
    @Override
    public double distance(String a, String b, final Config conf) {
        final String cleanedA = cleanup(a);
        final String cleanedB = cleanup(b);

        final double rawScore = ssalgo.score(cleanedA, cleanedB);
        return normalize(rawScore);
    }

    /** @return the weight held by the superclass */
    @Override
    public double getWeight() {
        return this.weight;
    }

    /** Identity normalisation: the score is returned untouched. */
    @Override
    protected double normalize(double d) {
        return d;
    }
}

View File

@ -0,0 +1,70 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
import java.util.Set;
/**
 * Jaro-Winkler comparator for names: keywords and city names detected in the two strings
 * are removed before the remaining text is scored.
 *
 * <p>The detection window is read from the {@code windowSize} parameter (default {@code 4}).
 */
@ComparatorClass("jaroWinklerNormalizedName")
public class JaroWinklerNormalizedName extends AbstractStringComparator {

    /** Window size used when no parameter map or no {@code windowSize} entry is available. */
    private static final String DEFAULT_WINDOW_SIZE = "4";

    /** Configuration parameters; {@code null} when the instance was built from a bare weight. */
    private Map<String, String> params;

    public JaroWinklerNormalizedName(Map<String, String> params){
        super(params, new com.wcohen.ss.JaroWinkler());
        this.params = params;
    }

    public JaroWinklerNormalizedName(double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * Returns the configured window size.
     *
     * <p>The weight-based constructors never set {@code params}; reading it unguarded made
     * {@link #distance} fail with a {@code NullPointerException} for such instances, so the
     * default is used in that case.
     */
    private int windowSize() {
        final String configured = params == null
            ? DEFAULT_WINDOW_SIZE
            : params.getOrDefault("windowSize", DEFAULT_WINDOW_SIZE);
        return Integer.parseInt(configured);
    }

    /**
     * @return {@code 1.0} when nothing is left of either string once keywords and cities
     *         are removed, otherwise the Jaro-Winkler score of what remains
     */
    @Override
    public double distance(String a, String b, final Config conf) {
        String ca = cleanup(a);
        String cb = cleanup(b);

        ca = normalize(ca);
        cb = normalize(cb);

        ca = filterAllStopWords(ca);
        cb = filterAllStopWords(cb);

        // parsed once instead of four times
        final int window = windowSize();

        Set<String> keywords1 = getKeywords(ca, conf.translationMap(), window);
        Set<String> keywords2 = getKeywords(cb, conf.translationMap(), window);

        Set<String> cities1 = getCities(ca, window);
        Set<String> cities2 = getCities(cb, window);

        ca = removeKeywords(ca, keywords1);
        ca = removeKeywords(ca, cities1);
        cb = removeKeywords(cb, keywords2);
        cb = removeKeywords(cb, cities2);

        // collapse the runs of blanks left behind by the removals
        ca = ca.replaceAll("[ ]{2,}", " ");
        cb = cb.replaceAll("[ ]{2,}", " ");

        if (ca.isEmpty() && cb.isEmpty()) {
            return 1.0;
        }
        return normalize(ssalgo.score(ca, cb));
    }

    /** @return the weight held by the superclass */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /** Identity normalisation: the score is returned untouched. */
    @Override
    protected double normalize(double d) {
        return d;
    }
}

View File

@ -0,0 +1,45 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
/**
 * Jaro-Winkler comparator for titles, with a dedicated outcome when {@code checkNumbers}
 * flags the pair.
 */
@ComparatorClass("jaroWinklerTitle")
public class JaroWinklerTitle extends AbstractStringComparator {

    public JaroWinklerTitle(Map<String, String> params){
        super(params, new com.wcohen.ss.JaroWinkler());
    }

    public JaroWinklerTitle(double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * @return {@code 0.5} when {@code checkNumbers} is true for the cleaned titles
     *         (NOTE(review): presumably meaning the numbers in the titles differ — confirm
     *         against the helper), otherwise their Jaro-Winkler score
     */
    @Override
    public double distance(String a, String b, final Config conf) {
        final String titleA = cleanup(a);
        final String titleB = cleanup(b);

        if (checkNumbers(titleA, titleB)) {
            return 0.5;
        }
        return normalize(ssalgo.score(titleA, titleB));
    }

    /** @return the weight held by the superclass */
    @Override
    public double getWeight() {
        return this.weight;
    }

    /** Identity normalisation: the score is returned untouched. */
    @Override
    protected double normalize(double d) {
        return d;
    }
}

View File

@ -0,0 +1,72 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Comparator for lists of JSON strings. Every JSON is reduced to a comparable string built
 * from the values found at the JSON paths configured in the parameters (every parameter whose
 * key contains {@code jpath}); the two resulting sets are then compared.
 *
 * <p>The {@code mode} parameter selects the result: {@code percentage} (default) gives the
 * ratio between common elements and all distinct elements, any other value gives the number
 * of common elements.
 */
@ComparatorClass("jsonListMatch")
public class JsonListMatch extends AbstractListComparator {

    private static final Log log = LogFactory.getLog(JsonListMatch.class);

    private final Map<String, String> params;

    private final String MODE; //"percentage" or "count"

    public JsonListMatch(final Map<String, String> params) {
        super(params);
        this.params = params;

        MODE = params.getOrDefault("mode", "percentage");
    }

    /**
     * @return {@code -1} when one of the lists is missing or empty, otherwise the percentage
     *         or the count of common elements depending on the configured mode
     */
    @Override
    public double compare(final List<String> sa, final List<String> sb, final Config conf) {
        if (sa == null || sb == null || sa.isEmpty() || sb.isEmpty()) {
            return -1;
        }

        final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
        final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());

        final int incommon = Sets.intersection(ca, cb).size();
        final int simDiff = Sets.symmetricDifference(ca, cb).size();

        // both sets come from non-empty lists, so incommon + simDiff is at least 1
        // and the division below is always defined
        if (MODE.equals("percentage")) {
            return (double) incommon / (incommon + simDiff);
        }
        return incommon;
    }

    /**
     * Converts a JSON into a comparable string: the values found at the configured paths,
     * joined by {@code ::}. A missing value contributes an empty string.
     */
    private String toComparableString(String json){

        final StringBuilder st = new StringBuilder();

        // for each path in the param list
        for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
            final String path = params.get(key);
            final String value = MapDocumentUtil.getJPathString(path, json);

            st.append(value == null ? "" : value);
            st.append("::");
        }

        // drop the trailing separator; with no jpath parameter nothing was appended and
        // truncating unconditionally threw a StringIndexOutOfBoundsException
        if (st.length() >= 2) {
            st.setLength(st.length() - 2);
        }
        return st.toString();
    }
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
import java.util.Set;
/**
 * Comparator that scores two strings by the keyword codes detected in them, using the
 * translation map provided by the configuration.
 *
 * <p>The size of the detection window is read from the {@code windowSize} parameter
 * (default {@code 4}).
 */
@ComparatorClass("keywordMatch")
public class KeywordMatch extends AbstractStringComparator {

    Map<String, String> params;

    public KeywordMatch(Map<String, String> params) {
        super(params);
        this.params = params;
    }

    /**
     * @return {@code 1.0} when neither string contains a keyword, {@code -1.0} (undefined)
     *         when only one of them does, otherwise the value of
     *         {@code commonElementsPercentage} for the two code sets
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {

        // same pipeline for both inputs: cleanup -> normalize -> stop-word filtering
        final String textA = filterAllStopWords(normalize(cleanup(a)));
        final String textB = filterAllStopWords(normalize(cleanup(b)));

        final int window = Integer.parseInt(params.getOrDefault("windowSize", "4"));

        final Set<String> keywordsA = getKeywords(textA, conf.translationMap(), window);
        final Set<String> keywordsB = getKeywords(textB, conf.translationMap(), window);

        final Set<String> codesA = toCodes(keywordsA, conf.translationMap());
        final Set<String> codesB = toCodes(keywordsB, conf.translationMap());

        final boolean noneInA = codesA.isEmpty();
        final boolean noneInB = codesB.isEmpty();

        // if no keywords are detected, the comparator gives 1.0
        if (noneInA && noneInB) {
            return 1.0;
        }
        // undefined if exactly one of the two has no keywords
        if (noneInA || noneInB) {
            return -1.0;
        }
        return commonElementsPercentage(codesA, codesB);
    }
}

View File

@ -0,0 +1,35 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator backed by the SecondString {@code Level2JaroWinkler} algorithm. The comparison
 * itself is inherited; this class only selects the algorithm and leaves scores unchanged.
 */
@ComparatorClass("level2JaroWinkler")
public class Level2JaroWinkler extends AbstractStringComparator {

    public Level2JaroWinkler(Map<String, String> params){
        super(params, new com.wcohen.ss.Level2JaroWinkler());
    }

    public Level2JaroWinkler(double w) {
        super(w, new com.wcohen.ss.Level2JaroWinkler());
    }

    protected Level2JaroWinkler(double w, AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    /** @return the weight held by the superclass */
    @Override
    public double getWeight() {
        return this.weight;
    }

    /** Identity normalisation: the score is returned untouched. */
    @Override
    protected double normalize(double d) {
        final double unchanged = d;
        return unchanged;
    }
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Level2 Jaro-Winkler comparator for titles, with a dedicated outcome when
 * {@code checkNumbers} flags the pair.
 */
@ComparatorClass("level2JaroWinklerTitle")
public class Level2JaroWinklerTitle extends AbstractStringComparator {

    public Level2JaroWinklerTitle(Map<String,String> params){
        super(params, new com.wcohen.ss.Level2JaroWinkler());
    }

    public Level2JaroWinklerTitle(final double w) {
        super(w, new com.wcohen.ss.Level2JaroWinkler());
    }

    protected Level2JaroWinklerTitle(final double w, final AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    /**
     * @return {@code 0.5} when {@code checkNumbers} is true for the cleaned titles
     *         (NOTE(review): presumably meaning the numbers in the titles differ — confirm
     *         against the helper), otherwise the raw score of the algorithm
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {
        final String titleA = cleanup(a);
        final String titleB = cleanup(b);

        return checkNumbers(titleA, titleB) ? 0.5 : ssalgo.score(titleA, titleB);
    }

    /** @return the weight held by the superclass */
    @Override
    public double getWeight() {
        return this.weight;
    }

    /** Identity normalisation: the score is returned untouched. */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -0,0 +1,34 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator backed by the SecondString {@code Level2Levenstein} algorithm. The comparison
 * itself is inherited; this class selects the algorithm and defines how a raw score is
 * normalised.
 */
@ComparatorClass("level2Levenstein")
public class Level2Levenstein extends AbstractStringComparator {

    public Level2Levenstein(Map<String,String> params){
        super(params, new com.wcohen.ss.Level2Levenstein());
    }

    public Level2Levenstein(double w) {
        super(w, new com.wcohen.ss.Level2Levenstein());
    }

    protected Level2Levenstein(double w, AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    /** @return the weight held by the superclass */
    @Override
    public double getWeight() {
        return this.weight;
    }

    /**
     * Maps a raw score to {@code 1 / (|d| + 1)^0.1}: {@code 1} for a score of zero,
     * decreasing as the magnitude of the score grows.
     */
    @Override
    protected double normalize(double d) {
        final double magnitude = Math.abs(d) + 1;
        return 1 / Math.pow(magnitude, 0.1);
    }
}

View File

@ -0,0 +1,34 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator backed by the SecondString {@code Levenstein} algorithm. The comparison itself
 * is inherited; this class selects the algorithm and defines how a raw score is normalised.
 */
@ComparatorClass("levenstein")
public class Levenstein extends AbstractStringComparator {

    public Levenstein(Map<String,String> params){
        super(params, new com.wcohen.ss.Levenstein());
    }

    public Levenstein(double w) {
        super(w, new com.wcohen.ss.Levenstein());
    }

    protected Levenstein(double w, AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    /** @return the weight held by the superclass */
    @Override
    public double getWeight() {
        return this.weight;
    }

    /**
     * Maps a raw score to {@code 1 / (|d| + 1)^0.1}: {@code 1} for a score of zero,
     * decreasing as the magnitude of the score grows.
     */
    @Override
    protected double normalize(double d) {
        final double magnitude = Math.abs(d) + 1;
        return 1 / Math.pow(magnitude, 0.1);
    }
}

View File

@ -0,0 +1,55 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.Map;
/**
 * Levenstein comparator for titles: the edit score of the cleaned titles is scaled by the
 * length of the longer one, with a dedicated outcome when {@code checkNumbers} flags the pair.
 */
@ComparatorClass("levensteinTitle")
public class LevensteinTitle extends AbstractStringComparator {

    private static final Log log = LogFactory.getLog(LevensteinTitle.class);

    public LevensteinTitle(Map<String,String> params){
        super(params, new com.wcohen.ss.Levenstein());
    }

    public LevensteinTitle(final double w) {
        super(w, new com.wcohen.ss.Levenstein());
    }

    protected LevensteinTitle(final double w, final AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    /**
     * @return {@code 0.5} when {@code checkNumbers} is true for the cleaned titles
     *         (NOTE(review): presumably meaning the numbers in the titles differ — confirm
     *         against the helper), otherwise the length-normalised score
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {
        final String ca = cleanup(a);
        final String cb = cleanup(b);

        final boolean check = checkNumbers(ca, cb);

        if (check) return 0.5;

        return normalize(ssalgo.score(ca, cb), ca.length(), cb.length());
    }

    /**
     * Scales the magnitude of the score by the length of the longer string.
     *
     * @return {@code 1 - |score| / max(la, lb)}; {@code 1.0} when both strings are empty
     */
    private double normalize(final double score, final int la, final int lb) {
        final int longest = Math.max(la, lb);
        if (longest == 0) {
            // both strings are empty after cleanup, hence identical: without this guard
            // the division below is 0/0 and the comparator returned NaN
            return 1.0;
        }
        return 1 - (Math.abs(score) / longest);
    }

    /** @return the weight held by the superclass */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /** Maps a raw score to {@code 1 / (|d| + 1)^0.1}. */
    @Override
    protected double normalize(final double d) {
        return 1 / Math.pow(Math.abs(d) + 1, 0.1);
    }
}

View File

@ -0,0 +1,56 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Compares two titles ignoring version numbers: digits and the roman numerals returned by
 * {@code getRomans} are removed before the comparison. Suitable for Software entities.
 */
@ComparatorClass("levensteinTitleIgnoreVersion")
public class LevensteinTitleIgnoreVersion extends AbstractStringComparator {

    public LevensteinTitleIgnoreVersion(Map<String,String> params){
        super(params, new com.wcohen.ss.Levenstein());
    }

    public LevensteinTitleIgnoreVersion(final double w) {
        super(w, new com.wcohen.ss.Levenstein());
    }

    protected LevensteinTitleIgnoreVersion(final double w, final AbstractStringDistance ssalgo) {
        super(w, ssalgo);
    }

    /**
     * Scores the two titles once digits, roman numerals and stop words have been removed.
     *
     * @return the edit score scaled by the length of the longer remaining title
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {
        String ca = cleanup(a);
        String cb = cleanup(b);

        // getRomans is evaluated on the cleaned title, before its digits are removed
        ca = ca.replaceAll("\\d", "").replaceAll(getRomans(ca), "").trim();
        cb = cb.replaceAll("\\d", "").replaceAll(getRomans(cb), "").trim();

        ca = filterAllStopWords(ca);
        cb = filterAllStopWords(cb);

        return normalize(ssalgo.score(ca, cb), ca.length(), cb.length());
    }

    /**
     * Scales the magnitude of the score by the length of the longer string.
     *
     * @return {@code 1 - |score| / max(la, lb)}; {@code 1.0} when both strings are empty
     */
    private double normalize(final double score, final int la, final int lb) {
        final int longest = Math.max(la, lb);
        if (longest == 0) {
            // nothing is left of either title (e.g. they only held a version number):
            // they are identical, and without this guard the division below is 0/0 = NaN
            return 1.0;
        }
        return 1 - (Math.abs(score) / longest);
    }

    /** @return the weight held by the superclass */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /** Maps a raw score to {@code 1 / (|d| + 1)^0.1}. */
    @Override
    protected double normalize(final double d) {
        return 1 / Math.pow(Math.abs(d) + 1, 0.1);
    }
}

View File

@ -0,0 +1,66 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Comparator checking whether two lists contain a configured string.
 *
 * <p>Parameters: {@code string} is the value to look for, {@code bool} the way the two
 * outcomes are combined ({@code AND}, {@code OR} or {@code XOR}), {@code caseSensitive}
 * (default {@code false}) whether the lookup is case-sensitive.
 *
 * @author miconis
 */
@ComparatorClass("listContainsMatch")
public class ListContainsMatch extends AbstractListComparator {

    private final Map<String, String> params;
    private final boolean CASE_SENSITIVE;
    /** The searched string, already lower-cased when the lookup is case-insensitive. */
    private final String STRING;
    private final String AGGREGATOR;

    public ListContainsMatch(Map<String, String> params) {
        super(params);
        this.params = params;

        //read parameters
        CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
        final String searched = params.get("string");
        // lower-case once here rather than rewriting the field on every comparison
        STRING = (searched == null || CASE_SENSITIVE) ? searched : searched.toLowerCase();
        AGGREGATOR = params.get("bool");
    }

    /**
     * @return {@code -1} when one of the lists is empty; {@code 1.0} when the configured
     *         combination of "list contains the string" holds; {@code 0.0} otherwise,
     *         including when the string or the aggregator is missing or unknown
     */
    @Override
    public double compare(List<String> sa, List<String> sb, Config conf) {
        if (sa.isEmpty() || sb.isEmpty()) {
            return -1;
        }
        // a missing parameter used to surface as a NullPointerException (switch on null,
        // toLowerCase on null); treat it like an unknown aggregator instead
        if (STRING == null || AGGREGATOR == null) {
            return 0.0;
        }
        if (!CASE_SENSITIVE) {
            sa = sa.stream().map(String::toLowerCase).collect(Collectors.toList());
            sb = sb.stream().map(String::toLowerCase).collect(Collectors.toList());
        }

        final boolean inA = sa.contains(STRING);
        final boolean inB = sb.contains(STRING);

        switch (AGGREGATOR) {
            case "AND":
                return (inA && inB) ? 1.0 : 0.0;
            case "OR":
                return (inA || inB) ? 1.0 : 0.0;
            case "XOR":
                return (inA ^ inB) ? 1.0 : 0.0;
            default:
                return 0.0;
        }
    }
}

View File

@ -0,0 +1,40 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator that is satisfied only when the two strings differ.
 *
 * <p>The string-distance algorithm handed to the superclass is not used by
 * {@link #distance}.
 */
@ComparatorClass("mustBeDifferent")
public class MustBeDifferent extends AbstractStringComparator {

    public MustBeDifferent(Map<String,String> params){
        super(params, new com.wcohen.ss.Levenstein());
    }

    public MustBeDifferent(final double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    protected MustBeDifferent(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /** @return {@code 0} when the strings are equal, {@code 1.0} when they differ */
    @Override
    public double distance(final String a, final String b, final Config conf) {
        if (a.equals(b)) {
            return 0;
        }
        return 1.0;
    }

    /** @return the weight held by the superclass */
    @Override
    public double getWeight() {
        return this.weight;
    }

    /** Identity normalisation: the score is returned untouched. */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -0,0 +1,23 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator for fields that do not take part in the comparison: not every field of a
 * document needs to participate in the compare measure, and such fields are modelled with a
 * {@code NullDistanceAlgo}, whose result is always {@code 0}.
 */
@ComparatorClass("null")
public class NullDistanceAlgo<T> implements Comparator<T> {

    /**
     * @param params the comparator parameters (ignored)
     */
    public NullDistanceAlgo(Map<String, String> params){
        // nothing to configure
    }

    /** @return always {@code 0}, whatever the two values are */
    @Override
    public double compare(Object a, Object b, Config config) {
        final double noContribution = 0;
        return noContribution;
    }
}

View File

@ -0,0 +1,34 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator returning the absolute difference between the numbers extracted from two strings.
 */
@ComparatorClass("numbersComparator")
public class NumbersComparator extends AbstractStringComparator {

    Map<String, String> params;

    public NumbersComparator(Map<String, String> params) {
        super(params);
        this.params = params;
    }

    /**
     * @return the absolute difference between the two extracted numbers, or {@code -1.0}
     *         (undefined) when a string holds no number or the extracted text cannot be
     *         read as an {@code int}
     */
    @Override
    public double distance(String a, String b, Config conf) {

        //extracts numbers from the field
        final String numbers1 = getNumbers(nfd(a));
        final String numbers2 = getNumbers(nfd(b));

        if (numbers1.isEmpty() || numbers2.isEmpty()) {
            return -1.0;
        }

        try {
            // widen to long so that the subtraction cannot overflow
            final long n1 = Integer.parseInt(numbers1);
            final long n2 = Integer.parseInt(numbers2);
            return Math.abs(n1 - n2);
        } catch (NumberFormatException e) {
            // the extracted text is not a valid int (e.g. too many digits): the difference
            // is undefined rather than a reason to abort the whole comparison
            return -1.0;
        }
    }
}

View File

@ -0,0 +1,36 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator checking that the numbers extracted from two strings are the same.
 */
@ComparatorClass("numbersMatch")
public class NumbersMatch extends AbstractStringComparator {

    public NumbersMatch(Map<String, String> params) {
        super(params);
    }

    /**
     * @return {@code 1.0} when neither string holds numbers or both hold the same ones,
     *         {@code -1.0} (undefined) when only one of them holds numbers, {@code 0.0}
     *         when the numbers differ
     */
    @Override
    public double distance(String a, String b, Config conf) {

        // extracts numbers from the field
        final String fromA = getNumbers(nfd(a));
        final String fromB = getNumbers(nfd(b));

        final boolean noneInA = fromA.isEmpty();
        final boolean noneInB = fromB.isEmpty();

        if (noneInA && noneInB) {
            return 1.0;
        }
        if (noneInA || noneInB) {
            return -1.0;
        }
        return fromA.equals(fromB) ? 1.0 : 0.0;
    }
}

View File

@ -0,0 +1,36 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator checking that the roman numerals extracted from two strings are the same.
 */
@ComparatorClass("romansMatch")
public class RomansMatch extends AbstractStringComparator {

    public RomansMatch(Map<String, String> params) {
        super(params);
    }

    /**
     * @return {@code 1.0} when neither string holds roman numerals or both hold the same
     *         ones, {@code -1.0} (undefined) when only one of them does, {@code 0.0} when
     *         they differ
     */
    @Override
    public double distance(String a, String b, Config conf) {

        // extracts romans from the field
        final String fromA = getRomans(nfd(a));
        final String fromB = getRomans(nfd(b));

        final boolean noneInA = fromA.isEmpty();
        final boolean noneInB = fromB.isEmpty();

        if (noneInA && noneInB) {
            return 1.0;
        }
        if (noneInA || noneInB) {
            return -1.0;
        }
        return fromA.equals(fromB) ? 1.0 : 0.0;
    }
}

View File

@ -0,0 +1,38 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.List;
import java.util.Map;
/**
 * Comparator checking that two lists hold the same number of values.
 *
 * @author claudio
 */
@ComparatorClass("sizeMatch")
public class SizeMatch extends AbstractListComparator {

    /**
     * Instantiates a new size match.
     *
     * @param params
     *            the parameters
     */
    public SizeMatch(final Map<String, String> params) {
        super(params);
    }

    /**
     * @return {@code -1.0} when one of the lists is empty, {@code 1.0} when the two lists
     *         have the same size, {@code 0.0} otherwise
     */
    @Override
    public double compare(final List<String> a, final List<String> b, final Config conf) {
        final boolean valueMissing = a.isEmpty() || b.isEmpty();
        if (valueMissing) {
            return -1.0;
        }
        if (a.size() == b.size()) {
            return 1.0;
        }
        return 0.0;
    }
}

View File

@ -0,0 +1,61 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Sorted comparator backed by the SecondString {@code JaroWinkler} algorithm. The comparison
 * itself is inherited from {@code AbstractSortedComparator}; this class selects the algorithm
 * and leaves scores unchanged.
 */
@ComparatorClass("sortedJaroWinkler")
public class SortedJaroWinkler extends AbstractSortedComparator {

    /**
     * Instantiates a new sorted jaro winkler from the comparator parameters.
     *
     * @param params
     *            the parameters
     */
    public SortedJaroWinkler(Map<String,String> params){
        // was building a Levenstein algorithm, unlike the weight-based constructor below:
        // a comparator named (and registered as) sortedJaroWinkler must score with JaroWinkler
        super(params, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * Instantiates a new sorted jaro winkler.
     *
     * @param weight
     *            the weight
     */
    public SortedJaroWinkler(final double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    /**
     * Instantiates a new sorted jaro winkler.
     *
     * @param weight
     *            the weight
     * @param ssalgo
     *            the ssalgo
     */
    protected SortedJaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /** @return the weight held by the superclass */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /** Identity normalisation: the score is returned untouched. */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -0,0 +1,61 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.tree.support.AbstractSortedComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Sorted comparator backed by the SecondString {@code Level2JaroWinkler} algorithm. The
 * comparison itself is inherited from {@code AbstractSortedComparator}; this class selects
 * the algorithm and leaves scores unchanged.
 */
@ComparatorClass("sortedLevel2JaroWinkler")
public class SortedLevel2JaroWinkler extends AbstractSortedComparator {

    /**
     * Builds the comparator from a weight.
     *
     * @param weight
     *            the weight
     */
    public SortedLevel2JaroWinkler(final double weight) {
        super(weight, new com.wcohen.ss.Level2JaroWinkler());
    }

    /**
     * Builds the comparator from the comparator parameters.
     *
     * @param params
     *            the parameters
     */
    public SortedLevel2JaroWinkler(final Map<String, String> params){
        super(params, new com.wcohen.ss.Level2JaroWinkler());
    }

    /**
     * Builds the comparator from a weight and an explicit algorithm.
     *
     * @param weight
     *            the weight
     * @param ssalgo
     *            the string-distance algorithm to use
     */
    protected SortedLevel2JaroWinkler(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /** @return the weight held by the superclass */
    @Override
    public double getWeight() {
        return this.weight;
    }

    /** Identity normalisation: the score is returned untouched. */
    @Override
    protected double normalize(final double d) {
        final double unchanged = d;
        return unchanged;
    }
}

View File

@ -0,0 +1,64 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator checking whether two strings contain a configured substring.
 *
 * <p>Parameters: {@code string} is the substring to look for, {@code aggregator} the way the
 * two outcomes are combined ({@code AND}, {@code OR} or {@code XOR}), {@code caseSensitive}
 * (default {@code false}) whether the lookup is case-sensitive.
 *
 * @author miconis
 */
@ComparatorClass("stringContainsMatch")
public class StringContainsMatch extends AbstractStringComparator {

    private final Map<String, String> params;

    private final boolean CASE_SENSITIVE;
    /** The searched substring, already lower-cased when the lookup is case-insensitive. */
    private final String STRING;
    private final String AGGREGATOR;

    public StringContainsMatch(Map<String, String> params) {
        super(params);
        this.params = params;

        //read parameters
        CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
        final String searched = params.get("string");
        // lower-case once here rather than rewriting the field on every comparison
        STRING = (searched == null || CASE_SENSITIVE) ? searched : searched.toLowerCase();
        AGGREGATOR = params.get("aggregator");
    }

    /**
     * @return {@code 1.0} when the configured combination of "string contains the substring"
     *         holds; {@code 0.0} otherwise, including when the substring or the aggregator
     *         is missing or unknown
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {

        // a missing parameter used to surface as a NullPointerException (switch on null,
        // toLowerCase on null); treat it like an unknown aggregator instead
        if (STRING == null || AGGREGATOR == null) {
            return 0.0;
        }

        final String ca = CASE_SENSITIVE ? a : a.toLowerCase();
        final String cb = CASE_SENSITIVE ? b : b.toLowerCase();

        final boolean inA = ca.contains(STRING);
        final boolean inB = cb.contains(STRING);

        switch (AGGREGATOR) {
            case "AND":
                return (inA && inB) ? 1.0 : 0.0;
            case "OR":
                return (inA || inB) ? 1.0 : 0.0;
            case "XOR":
                return (inA ^ inB) ? 1.0 : 0.0;
            default:
                return 0.0;
        }
    }
}

View File

@ -0,0 +1,53 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractListComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Comparator for lists of strings, based on the distinct values the two lists share.
 *
 * <p>The {@code type} parameter selects the result: {@code percentage} (default) gives the
 * ratio between common values and all distinct values, any other value gives the number of
 * common values.
 */
@ComparatorClass("stringListMatch")
public class StringListMatch extends AbstractListComparator {

    private static final Log log = LogFactory.getLog(StringListMatch.class);
    private Map<String, String> params;

    final private String TYPE; //percentage or count

    public StringListMatch(final Map<String, String> params) {
        super(params);
        this.params = params;

        TYPE = params.getOrDefault("type", "percentage");
    }

    /**
     * @return {@code -1} (undefined) when one of the two lists is empty, otherwise the
     *         percentage or the count of common values depending on the configured type
     */
    @Override
    public double compare(final List<String> a, final List<String> b, final Config conf) {

        final Set<String> distinctA = new HashSet<>(a);
        final Set<String> distinctB = new HashSet<>(b);

        final boolean oneIsEmpty = distinctA.isEmpty() || distinctB.isEmpty();
        if (oneIsEmpty) {
            return -1; //return undefined if one of the two lists is empty
        }

        final int shared = Sets.intersection(distinctA, distinctB).size();
        final int total = shared + Sets.symmetricDifference(distinctA, distinctB).size();

        if (total == 0) {
            return 0.0;
        }

        if (!TYPE.equals("percentage")) {
            return shared;
        }
        return (double) shared / total;
    }
}

View File

@ -0,0 +1,89 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang3.StringUtils;
import java.util.Map;
/**
 * Levenstein-based comparator that only considers the first {@code limit}
 * characters of each value.
 */
@ComparatorClass("subStringLevenstein")
public class SubStringLevenstein extends AbstractStringComparator {

    /** Number of leading characters of each value that take part in the comparison. */
    protected int limit;

    /**
     * Instantiates a new sub string levenstein; {@code limit} keeps its field default (0).
     *
     * @param w the weight
     */
    public SubStringLevenstein(final double w) {
        super(w, new com.wcohen.ss.Levenstein());
    }

    /**
     * Instantiates a new sub string levenstein from configuration parameters.
     *
     * @param params comparator parameters; reads "limit" (default "1")
     */
    public SubStringLevenstein(Map<String, String> params) {
        super(params, new com.wcohen.ss.Levenstein());
        this.limit = Integer.parseInt(params.getOrDefault("limit", "1"));
    }

    /**
     * Instantiates a new sub string levenstein.
     *
     * @param w     the weight
     * @param limit the number of leading characters to compare
     */
    public SubStringLevenstein(final double w, final int limit) {
        super(w, new com.wcohen.ss.Levenstein());
        this.limit = limit;
    }

    /**
     * Instantiates a new sub string levenstein with a custom distance algorithm.
     *
     * @param w      the weight
     * @param limit  the number of leading characters to compare
     * @param ssalgo the string distance algorithm
     */
    protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
        super(w, ssalgo);
        this.limit = limit;
    }

    /**
     * Truncates both values to {@code limit} characters and scores them with the
     * inherited distance.
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {
        // Must be super.distance: an unqualified call re-enters this override
        // with the same signature and recurses until the stack overflows.
        return super.distance(StringUtils.left(a, limit), StringUtils.left(b, limit), conf);
    }

    /** @return the weight configured for this comparator */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /**
     * Maps a raw score {@code d} to {@code 1 / (|d| + 1)^0.1}.
     */
    @Override
    protected double normalize(final double d) {
        return 1 / Math.pow(Math.abs(d) + 1, 0.1);
    }
}

View File

@ -0,0 +1,38 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Compares the numbers found in two titles.
 * NOTE(review): the outcome depends on the inherited {@code checkNumbers}; presumably it
 * reports whether the numbers differ, so 1 would mean "same numbers" — confirm.
 *
 * @author claudio
 */
@ComparatorClass("titleVersionMatch")
public class TitleVersionMatch extends AbstractStringComparator {

    public TitleVersionMatch(final Map<String, String> params) {
        super(params);
    }

    /**
     * @return -1 when either value is empty; 1 when both values pass {@code notNull}
     *         and {@code checkNumbers(valueA, valueB)} is false; 0 otherwise
     */
    @Override
    public double compare(final String valueA, final String valueB, final Config conf) {
        final boolean missing = valueA.isEmpty() || valueB.isEmpty();
        if (missing) {
            return -1;
        }

        if (notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB)) {
            return 1;
        }
        return 0;
    }

    @Override
    public String toString() {
        final String simpleName = getClass().getSimpleName();
        return simpleName + ":" + super.toString();
    }

    /** Uses only the first element when the compared value is a list. */
    protected String toString(final Object object) {
        return toFirstString(object);
    }
}

View File

@ -0,0 +1,61 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang3.StringUtils;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
/**
 * Compares two URLs: the hosts must be equal (ignoring case), after which the paths
 * are scored with the inherited distance and combined with the host weight.
 */
@ComparatorClass("urlMatcher")
public class UrlMatcher extends Levenstein {

    /** Comparator parameters; "host" and "path" hold the respective weights (default "0.5" each). */
    private Map<String, String> params;

    public UrlMatcher(Map<String, String> params) {
        super(params);
        this.params = params;
    }

    public UrlMatcher(double weight, Map<String, String> params) {
        super(weight);
        this.params = params;
    }

    public void setParams(Map<String, String> params) {
        this.params = params;
    }

    /**
     * @param a first URL
     * @param b second URL
     * @return 0.0 when the hosts differ; half of the host weight when either path is blank;
     *         otherwise host weight + path weight * inherited distance of the two paths
     * @throws IllegalStateException when either value is not a valid URL
     */
    @Override
    public double distance(String a, String b, final Config conf) {
        final URL urlA = asUrl(a);
        final URL urlB = asUrl(b);

        if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
            return 0.0;
        }

        // primitives: no need to box the parsed weights
        final double hostW = Double.parseDouble(params.getOrDefault("host", "0.5"));
        final double pathW = Double.parseDouble(params.getOrDefault("path", "0.5"));

        if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) {
            return hostW * 0.5;
        }

        return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf);
    }

    /** Parses the value as a URL, keeping the parser's exception as the cause on failure. */
    private URL asUrl(final String value) {
        try {
            return new URL(value);
        } catch (MalformedURLException e) {
            // should not happen as checked by pace typing
            throw new IllegalStateException("invalid URL: " + value, e);
        }
    }

    /** Uses only the first element when the compared value is a list. */
    protected String toString(final Object object) {
        return toFirstString(object);
    }
}

View File

@ -0,0 +1,50 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.lang3.StringUtils;
import java.util.Map;
/**
 * Compares the year of two date values. Only the first {@code limit} characters of
 * each value are considered, reduced by the inherited {@code getNumbers}
 * (presumably to their digits — confirm against AbstractPaceFunctions).
 *
 * @author claudio
 */
@ComparatorClass("yearMatch")
public class YearMatch extends AbstractStringComparator {

    /** Expected length of a year and number of leading characters inspected. */
    private int limit = 4;

    public YearMatch(final Map<String, String> params) {
        super(params);
    }

    /**
     * @return -1 when either extracted year is empty; 1 when both have exactly
     *         {@code limit} characters and are equal; 0 otherwise
     */
    @Override
    public double compare(final String a, final String b, final Config conf) {
        final String valueA = getNumbers(getFirstValue(a));
        final String valueB = getNumbers(getFirstValue(b));

        if (valueA.isEmpty() || valueB.isEmpty())
            return -1;

        // Both values are non-empty from here on, so there is no "one missing" case left to handle.
        final boolean lengthMatch = checkLength(valueA) && checkLength(valueB);

        return lengthMatch && valueA.equals(valueB) ? 1 : 0;
    }

    protected boolean checkLength(final String s) {
        return s.length() == limit;
    }

    /** @return the first {@code limit} characters of the value, or "" when it is null or empty */
    protected String getFirstValue(final String value) {
        return (value != null) && !value.isEmpty() ? StringUtils.left(value, limit) : "";
    }

    @Override
    public String toString() {
        return getClass().getSimpleName() + ":" + super.toString();
    }
}

View File

@ -0,0 +1,130 @@
package eu.dnetlib.pace.tree.support;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Config;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
 * Base class of the comparators: holds the string distance algorithm and the weight,
 * and provides the conversions from the raw compared values to strings and lists.
 *
 * @param <T> the type the concrete comparator works on
 */
public abstract class AbstractComparator<T> extends AbstractPaceFunctions implements Comparator<T> {

    /** The ssalgo. */
    protected AbstractStringDistance ssalgo;

    /** The weight. */
    protected double weight = 0.0;

    private Map<String, String> params;

    protected AbstractComparator(Map<String, String> params) {
        this.params = params;
    }

    /** Uses a weight of 1.0 together with the given distance algorithm. */
    protected AbstractComparator(Map<String, String> params, final AbstractStringDistance ssalgo) {
        this.params = params;
        this.weight = 1.0;
        this.ssalgo = ssalgo;
    }

    /**
     * Instantiates a new second string compare algo.
     *
     * @param weight
     *            the weight
     * @param ssalgo
     *            the ssalgo
     */
    protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) {
        this.ssalgo = ssalgo;
        this.weight = weight;
    }

    protected AbstractComparator(final AbstractStringDistance ssalgo) {
        this.ssalgo = ssalgo;
    }

    /**
     * Normalize; the default implementation returns the score unchanged.
     *
     * @param d
     *            the raw score
     * @return the normalized score
     */
    protected double normalize(double d) {
        return d;
    }

    /**
     * Distance.
     *
     * @param a
     *            the a
     * @param b
     *            the b
     * @return -1 when either value is empty, otherwise the normalized score of the distance algorithm
     */
    protected double distance(final String a, final String b, final Config conf) {
        if (a.isEmpty() || b.isEmpty()) {
            return -1; // return -1 if a field is missing
        }
        double score = ssalgo.score(a, b);
        return normalize(score);
    }

    /**
     * @return -1 when either value is empty, otherwise {@link #distance(String, String, Config)}
     */
    protected double compare(final String a, final String b, final Config conf) {
        if (a.isEmpty() || b.isEmpty())
            return -1;
        return distance(a, b, conf);
    }

    /**
     * Convert the given argument to a List of Strings
     *
     * @param object
     *            function argument; null is treated as a missing value
     * @return the list (empty for null)
     */
    @SuppressWarnings("unchecked") // values handed to comparators are lists of strings or scalars
    protected List<String> toList(final Object object) {
        if (object == null) {
            // a missing value must surface as "undefined" (-1), not as a NullPointerException
            return Lists.newArrayList();
        }
        if (object instanceof List) {
            return (List<String>) object;
        }
        return Lists.newArrayList(object.toString());
    }

    /**
     * Convert the given argument to a String
     *
     * @param object
     *            function argument; null is treated as a missing value
     * @return the elements joined by a space for a list, "" for null, otherwise the value's string form
     */
    @SuppressWarnings("unchecked")
    protected String toString(final Object object) {
        if (object == null) {
            return "";
        }
        if (object instanceof List) {
            List<String> l = (List<String>) object;
            return Joiner.on(" ").join(l);
        }
        return object.toString();
    }

    /**
     * Convert the given argument to its first String
     *
     * @param object
     *            function argument; null is treated as a missing value
     * @return the first element for a list ("" when the list is empty), "" for null,
     *         otherwise the value's string form
     */
    @SuppressWarnings("unchecked")
    protected String toFirstString(final Object object) {
        if (object == null) {
            return "";
        }
        if (object instanceof List) {
            List<String> l = (List<String>) object;
            return l.isEmpty() ? "" : l.get(0);
        }
        return object.toString();
    }

    public double getWeight() {
        return this.weight;
    }
}

View File

@ -0,0 +1,39 @@
package eu.dnetlib.pace.tree.support;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.Type;
import java.util.List;
import java.util.Map;
/**
 * Base class of the comparators that work on lists of strings.
 */
abstract public class AbstractListComparator extends AbstractComparator<List<String>> {

    protected AbstractListComparator(Map<String, String> params) {
        super(params);
    }

    protected AbstractListComparator(Map<String, String> params, AbstractStringDistance ssalgo) {
        super(params, ssalgo);
    }

    protected AbstractListComparator(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    protected AbstractListComparator(AbstractStringDistance ssalgo) {
        super(ssalgo);
    }

    /** Converts both raw values to lists and compares them. */
    @Override
    public double compare(Object a, Object b, Config conf) {
        final List<String> left = toList(a);
        final List<String> right = toList(b);
        return compare(left, right, conf);
    }

    /**
     * @return -1 when either list is empty, otherwise the distance between the
     *         concatenations of the two lists
     */
    public double compare(final List<String> a, final List<String> b, final Config conf) {
        final boolean bothPresent = !a.isEmpty() && !b.isEmpty();
        if (bothPresent) {
            return distance(concat(a), concat(b), conf);
        }
        return -1;
    }
}

View File

@ -0,0 +1,40 @@
package eu.dnetlib.pace.tree.support;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
import java.util.AbstractList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
 * List comparator that sorts the values before they are compared.
 */
public abstract class AbstractSortedComparator extends AbstractListComparator {

    /**
     * Instantiates a new sorted second string compare algo.
     *
     * @param weight
     *            the weight
     * @param ssalgo
     *            the ssalgo
     */
    protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * Instantiates the comparator from configuration parameters.
     *
     * @param params
     *            comparator parameters; reads "weight", falling back to 1.0 when it is not configured
     * @param ssalgo
     *            the ssalgo
     */
    protected AbstractSortedComparator(final Map<String, String> params, final AbstractStringDistance ssalgo) {
        // a missing "weight" used to reach parseDouble as null and fail with a NullPointerException
        super(Double.parseDouble(params.getOrDefault("weight", "1.0")), ssalgo);
    }

    /**
     * @return a sorted copy when the value is a list, an empty list for null,
     *         otherwise a single-element list holding the value's string form
     */
    @Override
    @SuppressWarnings("unchecked") // values handed to comparators are lists of strings or scalars
    protected List<String> toList(final Object object) {
        if (object == null) {
            return Lists.newArrayList();
        }
        if (object instanceof List) {
            List<String> fl = (List<String>) object;
            // copy first: the caller's list must not be reordered
            List<String> values = Lists.newArrayList(fl);
            Collections.sort(values);
            return values;
        }
        return Lists.newArrayList(object.toString());
    }
}

View File

@ -0,0 +1,44 @@
package eu.dnetlib.pace.tree.support;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import java.util.Map;
/**
 * Base class of the comparators that work on single strings.
 */
public abstract class AbstractStringComparator extends AbstractComparator<String> {

    protected AbstractStringComparator(Map<String, String> params) {
        super(params);
    }

    protected AbstractStringComparator(Map<String, String> params, AbstractStringDistance ssalgo) {
        super(params, ssalgo);
    }

    protected AbstractStringComparator(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    protected AbstractStringComparator(AbstractStringDistance ssalgo) {
        super(ssalgo);
    }

    /**
     * @return -1 when either value is empty, otherwise the normalized score of the distance algorithm
     */
    public double distance(final String a, final String b, final Config conf) {
        final boolean missing = a.isEmpty() || b.isEmpty();
        if (missing) {
            return -1; // a field is missing
        }
        return normalize(ssalgo.score(a, b));
    }

    /** Converts both raw values to strings and compares them. */
    @Override
    public double compare(Object a, Object b, Config conf) {
        final String left = toString(a);
        final String right = toString(b);
        return compare(left, right, conf);
    }

    /**
     * @return -1 when either value is empty, otherwise {@link #distance(String, String, Config)}
     */
    public double compare(final String a, final String b, final Config conf) {
        final boolean bothPresent = !a.isEmpty() && !b.isEmpty();
        if (bothPresent) {
            return distance(a, b, conf);
        }
        return -1;
    }
}

View File

@ -0,0 +1,24 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException;
public enum AggType {
W_MEAN, //weighted mean
AVG, //average
SUM,
MAX,
MIN,
AND, //used for necessary conditions
OR; //used for sufficient conditions
public static AggType getEnum(String value) {
try {
return AggType.valueOf(value);
}
catch (IllegalArgumentException e) {
throw new PaceException("Undefined aggregation type", e);
}
}
}

View File

@ -0,0 +1,12 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.Config;
/**
 * Contract of the comparators used by the decision tree.
 *
 * @param <T> the type the comparator works on
 */
public interface Comparator<T> {

    /**
     * Compares two raw values.
     *
     * @return -1 when it can't decide (i.e. missing field);
     *         &gt;0 as similarity degree (depends on the algorithm)
     */
    double compare(Object a, Object b, Config conf);
}

View File

@ -0,0 +1,13 @@
package eu.dnetlib.pace.tree.support;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Type-level annotation, retained at runtime, that carries a string value for the annotated class.
 */
@Target(ElementType.TYPE)
@Retention(RetentionPolicy.RUNTIME)
public @interface ComparatorClass {

    /** The value associated with the annotated class. */
    String value();
}

View File

@ -0,0 +1,82 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.util.PaceException;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
/**
 * Configuration of a single field comparison inside a decision tree node.
 * Member order is kept as declared, since the JSON produced by {@link #toString()}
 * may follow it.
 */
public class FieldConf implements Serializable {

    /** Name of the field the comparator is applied to. */
    private String field;

    /** Name of the comparator. */
    private String comparator;

    /** Weight of the field, used by the aggregation. */
    private double weight = 1.0;

    /** Parameters handed to the comparator. */
    private Map<String, String> params;

    private boolean countIfUndefined;

    public boolean isCountIfUndefined() {
        return this.countIfUndefined;
    }

    public void setCountIfUndefined(boolean countIfUndefined) {
        this.countIfUndefined = countIfUndefined;
    }

    /** Creates an empty configuration with the default weight of 1.0. */
    public FieldConf() {
    }

    public FieldConf(String field, String comparator, double weight, Map<String, String> params, boolean countIfUndefined) {
        this.countIfUndefined = countIfUndefined;
        this.params = params;
        this.weight = weight;
        this.comparator = comparator;
        this.field = field;
    }

    public String getField() {
        return this.field;
    }

    public void setField(String field) {
        this.field = field;
    }

    public String getComparator() {
        return this.comparator;
    }

    public void setComparator(String comparator) {
        this.comparator = comparator;
    }

    public double getWeight() {
        return this.weight;
    }

    public void setWeight(double weight) {
        this.weight = weight;
    }

    public Map<String, String> getParams() {
        return this.params;
    }

    public void setParams(Map<String, String> params) {
        this.params = params;
    }

    /**
     * @return this configuration serialized as JSON
     * @throws PaceException when the serialization fails
     */
    @Override
    public String toString() {
        final ObjectMapper mapper = new ObjectMapper();
        try {
            return mapper.writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("Impossible to convert to JSON: ", e);
        }
    }
}

View File

@ -0,0 +1,87 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.util.PaceException;
import java.io.IOException;
import java.io.Serializable;
/**
 * Outcome of a single field comparison in the decision tree, together with the
 * settings and the two values it was computed from.
 * Member order is kept as declared, since the JSON produced by {@link #toString()}
 * may follow it.
 */
public class FieldStats implements Serializable {

    /** Weight of the field, used by the aggregation. */
    private double weight;

    /** Threshold of the field, used by some kinds of aggregation. */
    private double threshold;

    /** Result of the comparison. */
    private double result;

    /** First compared value. */
    private Object a;

    /** Second compared value. */
    private Object b;

    private boolean countIfUndefined;

    public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Object a, Object b) {
        this.a = a;
        this.b = b;
        this.countIfUndefined = countIfUndefined;
        this.result = result;
        this.threshold = threshold;
        this.weight = weight;
    }

    public double getThreshold() {
        return this.threshold;
    }

    public void setThreshold(double threshold) {
        this.threshold = threshold;
    }

    public double getWeight() {
        return this.weight;
    }

    public void setWeight(double weight) {
        this.weight = weight;
    }

    public double getResult() {
        return this.result;
    }

    public void setResult(double result) {
        this.result = result;
    }

    public boolean isCountIfUndefined() {
        return this.countIfUndefined;
    }

    public void setCountIfUndefined(boolean countIfUndefined) {
        this.countIfUndefined = countIfUndefined;
    }

    public Object getA() {
        return this.a;
    }

    public void setA(Object a) {
        this.a = a;
    }

    public Object getB() {
        return this.b;
    }

    public void setB(Object b) {
        this.b = b;
    }

    /**
     * @return these stats serialized as JSON
     * @throws PaceException when the serialization fails
     */
    @Override
    public String toString() {
        final ObjectMapper mapper = new ObjectMapper();
        try {
            return mapper.writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("Impossible to convert to JSON: ", e);
        }
    }
}

View File

@ -0,0 +1,26 @@
package eu.dnetlib.pace.tree.support;
/**
 * Possible outcomes of a decision tree evaluation.
 */
public enum MatchType {

    MATCH,
    NO_MATCH,
    UNDEFINED;

    /**
     * Lenient parser: never throws.
     *
     * @param value the text to parse, may be null
     * @return MATCH or NO_MATCH when the value is exactly that constant's name,
     *         UNDEFINED for anything else (including null)
     */
    public static MatchType parse(String value) {
        if (NO_MATCH.name().equals(value)) {
            return NO_MATCH;
        }
        return MATCH.name().equals(value) ? MATCH : UNDEFINED;
    }
}

View File

@ -0,0 +1,166 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.util.PaceException;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StringType;
import java.io.IOException;
import java.io.Serializable;
import java.util.List;
/**
 * Definition of a decision tree node: the field comparisons to run, how their
 * results are aggregated, and the names of the nodes to continue with.
 * Member order is kept as declared, since the JSON produced by {@link #toString()}
 * may follow it.
 */
public class TreeNodeDef implements Serializable {

    /** Comparator parameter naming a second field to cross-compare with. */
    final static String CROSS_COMPARE = "crossCompare";

    private List<FieldConf> fields;
    private AggType aggregation;

    private double threshold;

    private String positive;
    private String negative;
    private String undefined;

    boolean ignoreUndefined;

    public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreUndefined) {
        this.fields = fields;
        this.aggregation = aggregation;
        this.threshold = threshold;
        this.positive = positive;
        this.negative = negative;
        this.undefined = undefined;
        this.ignoreUndefined = ignoreUndefined;
    }

    public TreeNodeDef() {}

    /**
     * Evaluates the node: runs the configured comparator on every field of the two rows.
     *
     * @param doc1 first row
     * @param doc2 second row
     * @param conf configuration handed to the comparators
     * @return the stats holding one result per configured field, keyed by
     *         "&lt;comparator&gt; on &lt;field&gt; &lt;position&gt;"
     */
    public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {

        TreeNodeStats stats = new TreeNodeStats();

        // Running position of the field in the list; replaces an indexOf lookup per field.
        int position = 0;

        for (FieldConf fieldConf : fields) {

            double weight = fieldConf.getWeight();

            double result;

            Object value1 = getJavaValue(doc1, fieldConf.getField());
            Object value2 = getJavaValue(doc2, fieldConf.getField());

            // if the param specifies a cross comparison (i.e. compare elements from different fields),
            // compute the result for both sides and return the maximum
            String crossField = fieldConf.getParams().get(CROSS_COMPARE);
            if (crossField != null) {
                double result1 = comparator(fieldConf).compare(value1, getJavaValue(doc2, crossField), conf);
                double result2 = comparator(fieldConf).compare(getJavaValue(doc1, crossField), value2, conf);
                result = Math.max(result1, result2);
            }
            else {
                result = comparator(fieldConf).compare(value1, value2, conf);
            }

            stats.addFieldStats(
                    fieldConf.getComparator() + " on " + fieldConf.getField() + " " + position,
                    new FieldStats(
                            weight,
                            Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")),
                            result,
                            fieldConf.isCountIfUndefined(),
                            value1,
                            value2
                    ));
            position++;
        }

        return stats;
    }

    /**
     * Reads a column of the row as a Java value.
     *
     * @param row  the row
     * @param name the column name
     * @return the string for a string column, the list for an array column, null for any other type
     */
    public Object getJavaValue(Row row, String name) {
        // NOTE(review): Row.fieldIndex presumably throws for an unknown name rather than
        // returning a negative index, which would make the guard below always true — confirm.
        int pos = row.fieldIndex(name);
        if (pos >= 0) {
            DataType dt = row.schema().fields()[pos].dataType();
            if (dt instanceof StringType) {
                return row.getString(pos);
            } else if (dt instanceof ArrayType) {
                return row.getList(pos);
            }
        }

        return null;
    }

    /** Resolves the comparator configured for the field through the PaceConfig resolver. */
    private Comparator comparator(final FieldConf field) {

        return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams());
    }

    public List<FieldConf> getFields() {
        return fields;
    }

    public void setFields(List<FieldConf> fields) {
        this.fields = fields;
    }

    public AggType getAggregation() {
        return aggregation;
    }

    public void setAggregation(AggType aggregation) {
        this.aggregation = aggregation;
    }

    public double getThreshold() {
        return threshold;
    }

    public void setThreshold(double threshold) {
        this.threshold = threshold;
    }

    public String getPositive() {
        return positive;
    }

    public void setPositive(String positive) {
        this.positive = positive;
    }

    public String getNegative() {
        return negative;
    }

    public void setNegative(String negative) {
        this.negative = negative;
    }

    public String getUndefined() {
        return undefined;
    }

    public void setUndefined(String undefined) {
        this.undefined = undefined;
    }

    public boolean isIgnoreUndefined() {
        return ignoreUndefined;
    }

    public void setIgnoreUndefined(boolean ignoreUndefined) {
        this.ignoreUndefined = ignoreUndefined;
    }

    /**
     * @return this node definition serialized as JSON
     * @throws PaceException when the serialization fails
     */
    @Override
    public String toString() {
        try {
            return new ObjectMapper().writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("Impossible to convert to JSON: ", e);
        }
    }
}

View File

@ -0,0 +1,134 @@
package eu.dnetlib.pace.tree.support;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
/**
 * Accumulates the field results of a tree node and computes the aggregated scores.
 * A field result of -1 means "undefined".
 */
public class TreeNodeStats implements Serializable {

    /** Value returned by {@link #min()} when no field result is eligible. */
    private static final double NO_MIN = 100.0;

    private Map<String, FieldStats> results; // this is an accumulator for the results of the node

    public TreeNodeStats() {
        this.results = new HashMap<>();
    }

    public Map<String, FieldStats> getResults() {
        return results;
    }

    public void addFieldStats(String id, FieldStats fieldStats) {
        this.results.put(id, fieldStats);
    }

    public int fieldsCount() {
        return this.results.size();
    }

    /** @return the number of fields whose result is -1 */
    public int undefinedCount() {
        int undefinedCount = 0;
        for (FieldStats fs : this.results.values()) {
            if (fs.getResult() == -1)
                undefinedCount++;
        }
        return undefinedCount;
    }

    /** @return the sum of the non-negative results */
    public double scoreSum() {
        double scoreSum = 0.0;
        for (FieldStats fs : this.results.values()) {
            if (fs.getResult() >= 0.0) {
                scoreSum += fs.getResult();
            }
        }
        return scoreSum;
    }

    // return the sum of the weights without considering the fields with countIfMissing=false && result=-1
    public double weightSum() {
        double weightSum = 0.0;
        for (FieldStats fs : this.results.values()) {
            if (fs.getResult() >= 0.0 || (fs.getResult() < 0.0 && fs.isCountIfUndefined())) {
                weightSum += fs.getWeight();
            }
        }
        return weightSum;
    }

    /** @return the sum of result * weight over the non-negative results */
    public double weightedScoreSum() {
        double weightedScoreSum = 0.0;
        for (FieldStats fs : this.results.values()) {
            if (fs.getResult() >= 0.0) {
                weightedScoreSum += fs.getResult() * fs.getWeight();
            }
        }
        return weightedScoreSum;
    }

    /** @return the highest result, or -1.0 when no result exceeds it */
    public double max() {
        double max = -1.0;
        for (FieldStats fs : this.results.values()) {
            if (fs.getResult() > max)
                max = fs.getResult();
        }
        return max;
    }

    /**
     * @return the lowest eligible result, where a result is eligible when it is
     *         non-negative, or when it is -1 and the field counts if undefined;
     *         100.0 when no result is eligible
     */
    public double min() {
        // Start from +infinity so that results above 100 are not silently ignored.
        double min = Double.POSITIVE_INFINITY;
        for (FieldStats fs : this.results.values()) {
            final double r = fs.getResult();
            final boolean eligible = r >= 0.0 || (r == -1 && fs.isCountIfUndefined());
            if (eligible && r < min) {
                min = r;
            }
        }
        // keep the historical return value for the "nothing eligible" case
        return min == Double.POSITIVE_INFINITY ? NO_MIN : min;
    }

    // if at least one is true, return 1.0
    public double or() {
        for (FieldStats fieldStats : this.results.values()) {
            if (fieldStats.getResult() >= fieldStats.getThreshold())
                return 1.0;
        }
        return 0.0;
    }

    // if at least one is false, return 0.0
    public double and() {
        for (FieldStats fieldStats : this.results.values()) {

            if (fieldStats.getResult() == -1) {
                // an undefined result only fails the condition when the field counts if undefined
                if (fieldStats.isCountIfUndefined())
                    return 0.0;
            }
            else {
                if (fieldStats.getResult() < fieldStats.getThreshold())
                    return 0.0;
            }

        }
        return 1.0;
    }

    /**
     * @param aggregation the aggregation to apply
     * @return the aggregated score of the node; AVG and W_MEAN are plain divisions,
     *         so they are not a number when the divisor is zero
     */
    public double getFinalScore(AggType aggregation) {

        switch (aggregation) {
            case AVG:
                return scoreSum() / fieldsCount();
            case SUM:
                return scoreSum();
            case MAX:
                return max();
            case MIN:
                return min();
            case W_MEAN:
                return weightedScoreSum() / weightSum();
            case OR:
                return or();
            case AND:
                return and();
            default:
                return 0.0;
        }
    }
}

View File

@ -0,0 +1,93 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.util.PaceException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.Row;
/**
* The compare between two documents is given by the weighted mean of the field distances
*/
public class TreeProcessor {
private static final Log log = LogFactory.getLog(TreeProcessor.class);
private Config config;
public TreeProcessor(final Config config) {
this.config = config;
}
// row based copies
public boolean compare(final Row a, final Row b) {
//evaluate the decision tree
return evaluateTree(a, b).getResult() == MatchType.MATCH;
}
public TreeStats evaluateTree(final Row doc1, final Row doc2){
TreeStats treeStats = new TreeStats();
String nextNodeName = "start";
do {
TreeNodeDef currentNode = config.decisionTree().get(nextNodeName);
//throw an exception if the node doesn't exist
if (currentNode == null)
throw new PaceException("Missing tree node: " + nextNodeName);
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
treeStats.addNodeStats(nextNodeName, stats);
//if ignoreUndefined=false the miss is considered as undefined
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) {
nextNodeName = currentNode.getUndefined();
}
//if ignoreUndefined=true the miss is ignored and the score computed anyway
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
nextNodeName = currentNode.getPositive();
}
else {
nextNodeName = currentNode.getNegative();
}
} while (MatchType.parse(nextNodeName)==MatchType.UNDEFINED);
treeStats.setResult(MatchType.parse(nextNodeName));
return treeStats;
}
public double computeScore(final Row doc1, final Row doc2) {
String nextNodeName = "start";
double score = 0.0;
do {
TreeNodeDef currentNode = config.decisionTree().get(nextNodeName);
//throw an exception if the node doesn't exist
if (currentNode == null)
throw new PaceException("The Tree Node doesn't exist: " + nextNodeName);
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
score = stats.getFinalScore(currentNode.getAggregation());
//if ignoreUndefined=false the miss is considered as undefined
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) {
nextNodeName = currentNode.getUndefined();
}
//if ignoreUndefined=true the miss is ignored and the score computed anyway
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
nextNodeName = currentNode.getPositive();
}
else {
nextNodeName = currentNode.getNegative();
}
} while (MatchType.parse(nextNodeName)==MatchType.UNDEFINED);
return score;
}
}

View File

@ -0,0 +1,51 @@
package eu.dnetlib.pace.tree.support;
import eu.dnetlib.pace.util.PaceException;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
 * Result of a decision tree evaluation: the final outcome plus the stats of every visited node.
 * Member order is kept as declared, since the JSON produced by {@link #toString()}
 * may follow it.
 */
public class TreeStats {

    // <layer_id, <field:comparator, result>>
    Map<String, TreeNodeStats> stats;

    MatchType result;

    /** Starts with no node stats and a NO_MATCH result. */
    public TreeStats() {
        this.result = MatchType.NO_MATCH;
        this.stats = new HashMap<>();
    }

    public MatchType getResult() {
        return result;
    }

    public void setResult(MatchType result) {
        this.result = result;
    }

    public Map<String, TreeNodeStats> getStats() {
        return this.stats;
    }

    public void setStats(Map<String, TreeNodeStats> stats) {
        this.stats = stats;
    }

    public void addNodeStats(String layerID, TreeNodeStats treeNodeStats) {
        stats.put(layerID, treeNodeStats);
    }

    /**
     * @return these stats serialized as pretty-printed JSON
     * @throws PaceException when the serialization fails
     */
    @Override
    public String toString() {
        final ObjectMapper mapper = new ObjectMapper();
        try {
            return mapper.writerWithDefaultPrettyPrinter().writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("Impossible to convert to JSON: ", e);
        }
    }
}

View File

@ -0,0 +1,138 @@
package eu.dnetlib.pace.util;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.WfConfig;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.types.ArrayType;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StringType;
import org.apache.spark.sql.types.StructType;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
/**
 * Compares the rows of a block pairwise with the decision tree and reports the
 * matching pairs, together with some counters, to a {@link Reporter}.
 */
public class BlockProcessor {

    /** Names of the counters used by the processor; filled by {@link #constructAccumulator(DedupConfig)}. */
    public static final List<String> accumulators = new ArrayList<>();

    private static final Log log = LogFactory.getLog(BlockProcessor.class);

    private DedupConfig dedupConf;

    /** Position of the identifier column in the rows. */
    private final int identifierFieldPos;
    /** Position of the order field column in the rows. */
    private final int orderFieldPos;

    /** Appends the counter names for the given configuration to {@link #accumulators}. */
    public static void constructAccumulator(final DedupConfig dedupConf) {
        accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"));
        accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
        accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())));
        accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"));
        accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"));
        accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));
    }

    public BlockProcessor(DedupConfig dedupConf, int identifierFieldPos, int orderFieldPos) {
        this.dedupConf = dedupConf;
        this.identifierFieldPos = identifierFieldPos;
        this.orderFieldPos = orderFieldPos;
    }

    /**
     * Processes a block of rows; a block with a single row is only counted.
     *
     * @param documents the rows of the block; rows are removed from it while processing,
     *                  so it must support removal through its iterator
     * @param context   receives the matching pairs and the counters
     */
    public void processSortedRows(final Collection<Row> documents, final Reporter context) {
        if (documents.size() > 1) {
            processRows(documents, context);
        } else {
            context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
        }
    }

    /** Takes each row in turn as pivot, removes it from the queue and compares it with the remaining rows. */
    private void processRows(final Collection<Row> queue, final Reporter context) {

        final WfConfig wf = dedupConf.getWf();
        // one processor is enough: it only wraps the configuration
        final TreeProcessor treeProcessor = new TreeProcessor(dedupConf);

        final Iterator<Row> it = queue.iterator();
        while (it.hasNext()) {

            final Row pivot = it.next();
            it.remove();

            final String idPivot = pivot.getString(identifierFieldPos); // identifier

            // number of candidates already examined for this pivot
            int i = 0;
            for (final Row curr : queue) {
                final String idCurr = curr.getString(identifierFieldPos); // identifier

                if (mustSkip(idCurr)) {
                    context.incrementCounter(wf.getEntityType(), "skip list", 1);
                    break;
                }

                if (i > wf.getSlidingWindowSize()) {
                    break;
                }
                // The counter has to advance, otherwise the sliding window never closes
                // and every pivot is compared with the whole remaining queue.
                i++;

                final Object fieldsCurr = getJavaValue(curr, orderFieldPos);
                final String fieldCurr = (fieldsCurr == null) ? null : fieldsCurr.toString();

                if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
                    emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context);
                }
            }
        }
    }

    /**
     * Reads a column of the row as a Java value.
     *
     * @return the string for a string column, the list for an array column, null for any other type
     */
    public Object getJavaValue(Row row, int pos) {
        DataType dt = row.schema().fields()[pos].dataType();
        if (dt instanceof StringType) {
            return row.getString(pos);
        } else if (dt instanceof ArrayType) {
            return row.getList(pos);
        }

        return null;
    }

    /** Reports a matching pair (in both directions) or counts the pair as below threshold. */
    private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {

        if (result) {
            writeSimilarity(context, idPivot, idCurr);
            context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
        } else {
            context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
        }
    }

    /** @return true when the namespace prefix of the identifier is in the configured skip list */
    private boolean mustSkip(final String idPivot) {
        return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
    }

    /** @return the part of the identifier between "|" and "::" */
    private String getNsPrefix(final String id) {
        return StringUtils.substringBetween(id, "|", "::");
    }

    private void writeSimilarity(final Reporter context, final String from, final String to) {
        final String type = dedupConf.getWf().getEntityType();

        context.emit(type, from, to);
        context.emit(type, to, from);
    }
}

View File

@ -0,0 +1,15 @@
package eu.dnetlib.pace.util;
import com.google.common.base.Function;
import org.apache.commons.lang3.text.WordUtils;
/**
 * Function that lower-cases a string and then capitalizes each of its words,
 * where words are delimited by a space or a hyphen.
 */
public class Capitalise implements Function<String, String> {

    /** Characters after which a new word starts. */
    private final char[] DELIM = {' ', '-'};

    @Override
    public String apply(final String s) {
        final String lowered = s.toLowerCase();
        return WordUtils.capitalize(lowered, DELIM);
    }
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,10 @@
package eu.dnetlib.pace.util;
import com.google.common.base.Function;
/**
 * Function that turns a single-character string into an abbreviation by appending a dot;
 * any other string is returned unchanged.
 */
public class DotAbbreviations implements Function<String, String> {

    /**
     * @param s the token to inspect
     * @return {@code s + "."} when {@code s} has exactly one character, otherwise {@code s}
     */
    @Override
    public String apply(String s) {
        if (s.length() != 1) {
            return s;
        }
        return s + ".";
    }
}

View File

@ -0,0 +1,117 @@
package eu.dnetlib.pace.util;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.Option;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.*;
import net.minidev.json.JSONArray;
import java.math.BigDecimal;
import java.util.*;
import java.util.function.Predicate;
import java.util.stream.Collectors;
/**
 * Static helpers that extract values from a JSON document through JsonPath expressions and
 * truncate strings and lists to a maximum size.
 */
public class MapDocumentUtil {

    public static final String URL_REGEX = "^(http|https|ftp)\\://.*";

    public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX);

    /** Serializer shared by all calls instead of allocating a new mapper per serialized element. */
    private static final ObjectMapper MAPPER = new ObjectMapper();

    /**
     * Evaluates {@code path} against {@code json} and returns the matches as strings.
     *
     * When {@code type} is {@code Type.List} the path is read with the
     * {@code ALWAYS_RETURN_LIST} and {@code SUPPRESS_EXCEPTIONS} options and the result is
     * returned as produced by JsonPath. Otherwise: array elements and map results are
     * serialized to JSON strings, a string result is returned as a single element, and any
     * other result (or any failure while reading the path) yields an empty list.
     *
     * @param path the JsonPath expression
     * @param json the JSON document
     * @param type the declared type of the field being read
     * @return the extracted values; empty when nothing could be extracted
     */
    public static List<String> getJPathList(String path, String json, Type type) {
        if (type == Type.List)
            return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path);

        final List<String> result = new ArrayList<>();
        final Object jresult;
        try {
            jresult = JsonPath.read(json, path);
        } catch (Throwable e) {
            // Any failure while evaluating the path is reported as "no values".
            return result;
        }

        if (jresult instanceof JSONArray) {
            for (Object item : (JSONArray) jresult) {
                appendAsJson(result, item);
            }
            return result;
        }

        if (jresult instanceof LinkedHashMap) {
            appendAsJson(result, jresult);
            return result;
        }

        if (jresult instanceof String) {
            result.add((String) jresult);
        }
        return result;
    }

    /** Serializes {@code value} to JSON and appends it to {@code target}; unserializable values are skipped. */
    private static void appendAsJson(final List<String> target, final Object value) {
        try {
            target.add(MAPPER.writeValueAsString(value));
        } catch (JsonProcessingException ignored) {
            // Best effort: a value that cannot be serialized is left out of the result.
        }
    }

    /**
     * Evaluates {@code jsonPath} against {@code json} and returns a single string.
     *
     * @param jsonPath the JsonPath expression
     * @param json     the JSON document
     * @return the string result, or the first element of a non-empty array result;
     *         the empty string for any other result or when evaluation fails
     */
    public static String getJPathString(final String jsonPath, final String json) {
        try {
            Object o = JsonPath.read(json, jsonPath);
            if (o instanceof String)
                return (String) o;
            if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
                return (String) ((JSONArray) o).get(0);
            return "";
        } catch (Exception e) {
            return "";
        }
    }

    /**
     * Evaluates {@code jsonPath} against {@code json} and returns the result as a double array.
     *
     * Every numeric element type (Integer, Long, Double, BigDecimal, ...) is converted through
     * {@link Number#doubleValue()}, so an array mixing integers and decimals is accepted.
     *
     * @param jsonPath the JsonPath expression
     * @param json     the JSON document
     * @return the numeric values; an empty array when the result is not an array or when
     *         evaluation or conversion fails
     */
    public static double[] getJPathArray(final String jsonPath, final String json) {
        try {
            Object o = JsonPath.read(json, jsonPath);
            if (o instanceof double[])
                return (double[]) o;
            if (o instanceof JSONArray) {
                Object[] objects = ((JSONArray) o).toArray();
                double[] array = new double[objects.length];
                for (int i = 0; i < objects.length; i++) {
                    // A non-numeric or null element fails here and is handled by the catch below.
                    array[i] = ((Number) objects[i]).doubleValue();
                }
                return array;
            }
            return new double[0];
        } catch (Exception e) {
            e.printStackTrace();
            return new double[0];
        }
    }

    /**
     * Truncates {@code value} to at most {@code length} characters.
     *
     * @param value  the string to truncate; {@code null} is treated as the empty string
     * @param length maximum number of characters, or {@code -1} for no limit
     * @return the possibly shortened string, never {@code null}
     */
    public static String truncateValue(String value, int length) {
        if (value == null)
            return "";

        if (length == -1 || length > value.length())
            return value;

        return value.substring(0, length);
    }

    /**
     * Truncates {@code list} to at most {@code size} elements.
     *
     * @param list the list to truncate
     * @param size maximum number of elements, or {@code -1} for no limit
     * @return the list itself when it is short enough, otherwise a {@code subList} view of its head
     */
    public static List<String> truncateList(List<String> list, int size) {
        if (size == -1 || size > list.size())
            return list;

        return list.subList(0, size);
    }
}

View File

@ -0,0 +1,13 @@
package eu.dnetlib.pace.util;
/**
 * Unchecked exception type of the {@code eu.dnetlib.pace.util} package.
 */
public class PaceException extends RuntimeException {

    /**
     * Creates an exception carrying only a message.
     *
     * @param s the detail message
     */
    public PaceException(String s) {
        super(s);
    }

    /**
     * Creates an exception carrying a message and the underlying cause.
     *
     * @param s the detail message
     * @param e the cause of this exception
     */
    public PaceException(String s, Throwable e) {
        super(s, e);
    }
}

View File

@ -0,0 +1,49 @@
package eu.dnetlib.pace.util;
import eu.dnetlib.pace.clustering.ClusteringClass;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.tree.support.Comparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.reflections.Reflections;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Registry of the clustering functions and comparators available on the classpath.
 *
 * On construction the packages {@code eu.dnetlib.pace.clustering} and {@code eu.dnetlib.pace.tree}
 * are scanned for classes annotated with {@link ClusteringClass} / {@link ComparatorClass};
 * each class is registered under the name given by its annotation value.
 */
public class PaceResolver implements Serializable {

    public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering");
    public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree");

    private final Map<String, Class<ClusteringFunction>> clusteringFunctions;
    private final Map<String, Class<Comparator>> comparators;

    /** Builds both name-to-class registries from the annotated classes found by the scanners. */
    public PaceResolver() {
        this.clusteringFunctions = CLUSTERING_RESOLVER.getTypesAnnotatedWith(ClusteringClass.class).stream()
                .filter(ClusteringFunction.class::isAssignableFrom)
                .collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>) cl));

        this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream()
                .filter(Comparator.class::isAssignableFrom)
                .collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>) cl));
    }

    /**
     * Instantiates the clustering function registered under {@code name}.
     *
     * @param name   registered name of the clustering function
     * @param params parameters passed to the function's {@code Map} constructor
     * @return a new instance of the clustering function
     * @throws PaceException when no function is registered under {@code name} or it cannot be instantiated
     */
    public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
        return instantiate(clusteringFunctions, name, params);
    }

    /**
     * Instantiates the comparator registered under {@code name}.
     *
     * @param name   registered name of the comparator
     * @param params parameters passed to the comparator's {@code Map} constructor
     * @return a new instance of the comparator
     * @throws PaceException when no comparator is registered under {@code name} or it cannot be instantiated
     */
    public Comparator getComparator(String name, Map<String, String> params) throws PaceException {
        return instantiate(comparators, name, params);
    }

    /** Looks up {@code name} in {@code registry} and creates an instance through its {@code Map} constructor. */
    private static <T> T instantiate(final Map<String, Class<T>> registry, final String name, final Map<String, ?> params) {
        final Class<T> type = registry.get(name);
        if (type == null) {
            // Unknown names are reported uniformly instead of surfacing as a NullPointerException.
            throw new PaceException(name + " not found ");
        }
        try {
            return type.getDeclaredConstructor(Map.class).newInstance(params);
        } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
            throw new PaceException(name + " not found ", e);
        }
    }
}

View File

@ -0,0 +1,11 @@
package eu.dnetlib.pace.util;
import java.io.Serializable;
/**
 * Serializable callback through which processing code reports named counters and emits
 * (type, from, to) triples.
 */
public interface Reporter extends Serializable {
/**
 * Adds {@code delta} to the counter identified by group and name.
 *
 * @param counterGroup group the counter belongs to
 * @param counterName  name of the counter inside the group
 * @param delta        amount to add to the counter
 */
void incrementCounter(String counterGroup, String counterName, long delta);
/**
 * Emits a directed pair of identifiers tagged with a type.
 *
 * @param type type tag of the emitted pair
 * @param from first identifier of the pair
 * @param to   second identifier of the pair
 */
void emit(String type, String from, String to);
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,27 @@
{
"wf" : {
"threshold" : "$threshold$",
"dedupRun" : "$run$",
"entityType" : "$entityType$",
"subEntityType" : "$subEntityType$",
"subEntityValue" : "$subEntityValue$",
"orderField" : "$orderField$",
"queueMaxSize" : "$queueMaxSize$",
"groupMaxSize" : "$groupMaxSize$",
"slidingWindowSize" : "$slidingWindowSize$",
"rootBuilder" : [ $rootBuilder:{"$it$"};separator=", "$ ],
"includeChildren" : "$includeChildren$",
"configurationId" : "$configurationId$"
},
"pace" : {
"clustering" : [
],
"sufficientConditions" : [
],
"necessaryConditions" : [
],
"model" : [
],
"blacklists" : { }
}
}

View File

@ -0,0 +1,7 @@
van
der
de
dell
sig
mr
mrs

View File

@ -0,0 +1,620 @@
a
ab
aber
ach
acht
achte
achten
achter
achtes
ag
alle
allein
allem
aller
allerdings
alles
allgemeinen
als
also
am
an
ander
andere
anderem
anderen
anderer
anderes
anderm
andern
anderr
anders
au
auch
auf
aus
ausser
ausserdem
außer
außerdem
b
bald
bei
beide
beiden
beim
beispiel
bekannt
bereits
besonders
besser
besten
bin
bis
bisher
bist
c
d
d.h
da
dabei
dadurch
dafür
dagegen
daher
dahin
dahinter
damals
damit
danach
daneben
dank
dann
daran
darauf
daraus
darf
darfst
darin
darum
darunter
darüber
das
dasein
daselbst
dass
dasselbe
davon
davor
dazu
dazwischen
daß
dein
deine
deinem
deinen
deiner
deines
dem
dementsprechend
demgegenüber
demgemäss
demgemäß
demselben
demzufolge
den
denen
denn
denselben
der
deren
derer
derjenige
derjenigen
dermassen
dermaßen
derselbe
derselben
des
deshalb
desselben
dessen
deswegen
dich
die
diejenige
diejenigen
dies
diese
dieselbe
dieselben
diesem
diesen
dieser
dieses
dir
doch
dort
drei
drin
dritte
dritten
dritter
drittes
du
durch
durchaus
durfte
durften
dürfen
dürft
e
eben
ebenso
ehrlich
ei
ei,
eigen
eigene
eigenen
eigener
eigenes
ein
einander
eine
einem
einen
einer
eines
einig
einige
einigem
einigen
einiger
einiges
einmal
eins
elf
en
ende
endlich
entweder
er
ernst
erst
erste
ersten
erster
erstes
es
etwa
etwas
euch
euer
eure
eurem
euren
eurer
eures
f
folgende
früher
fünf
fünfte
fünften
fünfter
fünftes
für
g
gab
ganz
ganze
ganzen
ganzer
ganzes
gar
gedurft
gegen
gegenüber
gehabt
gehen
geht
gekannt
gekonnt
gemacht
gemocht
gemusst
genug
gerade
gern
gesagt
geschweige
gewesen
gewollt
geworden
gibt
ging
gleich
gott
gross
grosse
grossen
grosser
grosses
groß
große
großen
großer
großes
gut
gute
guter
gutes
h
hab
habe
haben
habt
hast
hat
hatte
hatten
hattest
hattet
heisst
her
heute
hier
hin
hinter
hoch
hätte
hätten
i
ich
ihm
ihn
ihnen
ihr
ihre
ihrem
ihren
ihrer
ihres
im
immer
in
indem
infolgedessen
ins
irgend
ist
j
ja
jahr
jahre
jahren
je
jede
jedem
jeden
jeder
jedermann
jedermanns
jedes
jedoch
jemand
jemandem
jemanden
jene
jenem
jenen
jener
jenes
jetzt
k
kam
kann
kannst
kaum
kein
keine
keinem
keinen
keiner
keines
kleine
kleinen
kleiner
kleines
kommen
kommt
konnte
konnten
kurz
können
könnt
könnte
l
lang
lange
leicht
leide
lieber
los
m
machen
macht
machte
mag
magst
mahn
mal
man
manche
manchem
manchen
mancher
manches
mann
mehr
mein
meine
meinem
meinen
meiner
meines
mensch
menschen
mich
mir
mit
mittel
mochte
mochten
morgen
muss
musst
musste
mussten
muß
mußt
möchte
mögen
möglich
mögt
müssen
müsst
müßt
n
na
nach
nachdem
nahm
natürlich
neben
nein
neue
neuen
neun
neunte
neunten
neunter
neuntes
nicht
nichts
nie
niemand
niemandem
niemanden
noch
nun
nur
o
ob
oben
oder
offen
oft
ohne
ordnung
p
q
r
recht
rechte
rechten
rechter
rechtes
richtig
rund
s
sa
sache
sagt
sagte
sah
satt
schlecht
schluss
schon
sechs
sechste
sechsten
sechster
sechstes
sehr
sei
seid
seien
sein
seine
seinem
seinen
seiner
seines
seit
seitdem
selbst
sich
sie
sieben
siebente
siebenten
siebenter
siebentes
sind
so
solang
solche
solchem
solchen
solcher
solches
soll
sollen
sollst
sollt
sollte
sollten
sondern
sonst
soweit
sowie
später
startseite
statt
steht
suche
t
tag
tage
tagen
tat
teil
tel
tritt
trotzdem
tun
u
uhr
um
und
und?
uns
unse
unsem
unsen
unser
unsere
unserer
unses
unter
v
vergangenen
viel
viele
vielem
vielen
vielleicht
vier
vierte
vierten
vierter
viertes
vom
von
vor
w
wahr?
wann
war
waren
warst
wart
warum
was
weg
wegen
weil
weit
weiter
weitere
weiteren
weiteres
welche
welchem
welchen
welcher
welches
wem
wen
wenig
wenige
weniger
weniges
wenigstens
wenn
wer
werde
werden
werdet
weshalb
wessen
wie
wieder
wieso
will
willst
wir
wird
wirklich
wirst
wissen
wo
woher
wohin
wohl
wollen
wollt
wollte
wollten
worden
wurde
wurden
während
währenddem
währenddessen
wäre
würde
würden
x
y
z
z.b
zehn
zehnte
zehnten
zehnter
zehntes
zeit
zu
zuerst
zugleich
zum
zunächst
zur
zurück
zusammen
zwanzig
zwar
zwei
zweite
zweiten
zweiter
zweites
zwischen
zwölf
über
überhaupt
übrigens

View File

@ -0,0 +1,138 @@
a
about
above
after
again
against
all
an
and
any
are
aren
as
at
be
because
been
before
being
below
between
both
but
by
can
cannot
could
couldn
did
didn
do
does
doesn
doing
don
down
during
each
few
for
from
further
had
hadn
has
hasn
have
havent
having
he
hed
her
here
hers
herself
him
himself
his
how
if
in
into
is
isn
it
its
itself
let
more
most
mustn
myself
no
nor
not
of
off
on
once
only
other
ought
our
ours
ourselves
out
over
own
s
same
shan
she
should
shouldn
so
some
such
than
that
the
their
theirs
themselves
then
there
these
they
this
those
through
to
too
under
until
up
very
was
wasn
we
were
weren
what
when
where
which
while
who
whom
why
with
won
would
wouldn
you
your
yours
yourself
yourselves

View File

@ -0,0 +1,720 @@
a
actualmente
acuerdo
adelante
ademas
además
adrede
afirmó
agregó
ahi
ahora
ahí
al
algo
alguna
algunas
alguno
algunos
algún
alli
allí
alrededor
ambos
ampleamos
antano
antaño
ante
anterior
antes
apenas
aproximadamente
aquel
aquella
aquellas
aquello
aquellos
aqui
aquél
aquélla
aquéllas
aquéllos
aquí
arriba
arribaabajo
aseguró
asi
así
atras
aun
aunque
ayer
añadió
aún
b
bajo
bastante
bien
breve
buen
buena
buenas
bueno
buenos
c
cada
casi
cerca
cierta
ciertas
cierto
ciertos
cinco
claro
comentó
como
con
conmigo
conocer
conseguimos
conseguir
considera
consideró
consigo
consigue
consiguen
consigues
contigo
contra
cosas
creo
cual
cuales
cualquier
cuando
cuanta
cuantas
cuanto
cuantos
cuatro
cuenta
cuál
cuáles
cuándo
cuánta
cuántas
cuánto
cuántos
cómo
d
da
dado
dan
dar
de
debajo
debe
deben
debido
decir
dejó
del
delante
demasiado
demás
dentro
deprisa
desde
despacio
despues
después
detras
detrás
dia
dias
dice
dicen
dicho
dieron
diferente
diferentes
dijeron
dijo
dio
donde
dos
durante
día
días
dónde
e
ejemplo
el
ella
ellas
ello
ellos
embargo
empleais
emplean
emplear
empleas
empleo
en
encima
encuentra
enfrente
enseguida
entonces
entre
era
erais
eramos
eran
eras
eres
es
esa
esas
ese
eso
esos
esta
estaba
estabais
estaban
estabas
estad
estada
estadas
estado
estados
estais
estamos
estan
estando
estar
estaremos
estará
estarán
estarás
estaré
estaréis
estaría
estaríais
estaríamos
estarían
estarías
estas
este
estemos
esto
estos
estoy
estuve
estuviera
estuvierais
estuvieran
estuvieras
estuvieron
estuviese
estuvieseis
estuviesen
estuvieses
estuvimos
estuviste
estuvisteis
estuviéramos
estuviésemos
estuvo
está
estábamos
estáis
están
estás
esté
estéis
estén
estés
ex
excepto
existe
existen
explicó
expresó
f
fin
final
fue
fuera
fuerais
fueran
fueras
fueron
fuese
fueseis
fuesen
fueses
fui
fuimos
fuiste
fuisteis
fuéramos
fuésemos
g
general
gran
grandes
gueno
h
ha
haber
habia
habida
habidas
habido
habidos
habiendo
habla
hablan
habremos
habrá
habrán
habrás
habré
habréis
habría
habríais
habríamos
habrían
habrías
habéis
había
habíais
habíamos
habían
habías
hace
haceis
hacemos
hacen
hacer
hacerlo
haces
hacia
haciendo
hago
han
has
hasta
hay
haya
hayamos
hayan
hayas
hayáis
he
hecho
hemos
hicieron
hizo
horas
hoy
hube
hubiera
hubierais
hubieran
hubieras
hubieron
hubiese
hubieseis
hubiesen
hubieses
hubimos
hubiste
hubisteis
hubiéramos
hubiésemos
hubo
i
igual
incluso
indicó
informo
informó
intenta
intentais
intentamos
intentan
intentar
intentas
intento
ir
j
junto
k
l
la
lado
largo
las
le
lejos
les
llegó
lleva
llevar
lo
los
luego
lugar
m
mal
manera
manifestó
mas
mayor
me
mediante
medio
mejor
mencionó
menos
menudo
mi
mia
mias
mientras
mio
mios
mis
misma
mismas
mismo
mismos
modo
momento
mucha
muchas
mucho
muchos
muy
más
mía
mías
mío
míos
n
nada
nadie
ni
ninguna
ningunas
ninguno
ningunos
ningún
no
nos
nosotras
nosotros
nuestra
nuestras
nuestro
nuestros
nueva
nuevas
nuevo
nuevos
nunca
o
ocho
os
otra
otras
otro
otros
p
pais
para
parece
parte
partir
pasada
pasado
paìs
peor
pero
pesar
poca
pocas
poco
pocos
podeis
podemos
poder
podria
podriais
podriamos
podrian
podrias
podrá
podrán
podría
podrían
poner
por
por qué
porque
posible
primer
primera
primero
primeros
principalmente
pronto
propia
propias
propio
propios
proximo
próximo
próximos
pudo
pueda
puede
pueden
puedo
pues
q
qeu
que
quedó
queremos
quien
quienes
quiere
quiza
quizas
quizá
quizás
quién
quiénes
qué
r
raras
realizado
realizar
realizó
repente
respecto
s
sabe
sabeis
sabemos
saben
saber
sabes
sal
salvo
se
sea
seamos
sean
seas
segun
segunda
segundo
según
seis
ser
sera
seremos
será
serán
serás
seré
seréis
sería
seríais
seríamos
serían
serías
seáis
señaló
si
sido
siempre
siendo
siete
sigue
siguiente
sin
sino
sobre
sois
sola
solamente
solas
solo
solos
somos
son
soy
soyos
su
supuesto
sus
suya
suyas
suyo
suyos
sólo
t
tal
tambien
también
tampoco
tan
tanto
tarde
te
temprano
tendremos
tendrá
tendrán
tendrás
tendré
tendréis
tendría
tendríais
tendríamos
tendrían
tendrías
tened
teneis
tenemos
tener
tenga
tengamos
tengan
tengas
tengo
tengáis
tenida
tenidas
tenido
tenidos
teniendo
tenéis
tenía
teníais
teníamos
tenían
tenías
tercera
ti
tiempo
tiene
tienen
tienes
toda
todas
todavia
todavía
todo
todos
trabaja
trabajais
trabajamos
trabajan
trabajar
trabajas
trabajo
tras
trata
través
tres
tu
tus
tuve
tuviera
tuvierais
tuvieran
tuvieras
tuvieron
tuviese
tuvieseis
tuviesen
tuvieses
tuvimos
tuviste
tuvisteis
tuviéramos
tuviésemos
tuvo
tuya
tuyas
tuyo
tuyos
u
ultimo
un
una
unas
uno
unos
usa
usais
usamos
usan
usar
usas
uso
usted
ustedes
v
va
vais
valor
vamos
van
varias
varios
vaya
veces
ver
verdad
verdadera
verdadero
vez
vosotras
vosotros
voy
vuestra
vuestras
vuestro
vuestros
w
x
y
ya
yo
z
él
éramos
ésa
ésas
ése
ésos
ésta
éstas
éste
éstos
última
últimas
último
últimos

View File

@ -0,0 +1,688 @@
a
abord
absolument
afin
ah
ai
aie
aient
aies
ailleurs
ainsi
ait
allaient
allo
allons
allô
alors
anterieur
anterieure
anterieures
apres
après
as
assez
attendu
au
aucun
aucune
aucuns
aujourd
aujourd'hui
aupres
auquel
aura
aurai
auraient
aurais
aurait
auras
aurez
auriez
aurions
aurons
auront
aussi
autre
autrefois
autrement
autres
autrui
aux
auxquelles
auxquels
avaient
avais
avait
avant
avec
avez
aviez
avions
avoir
avons
ayant
ayez
ayons
b
bah
bas
basee
bat
beau
beaucoup
bien
bigre
bon
boum
bravo
brrr
c
car
ce
ceci
cela
celle
celle-ci
celle-là
celles
celles-ci
celles-là
celui
celui-ci
celui-là
celà
cent
cependant
certain
certaine
certaines
certains
certes
ces
cet
cette
ceux
ceux-ci
ceux-là
chacun
chacune
chaque
cher
chers
chez
chiche
chut
chère
chères
ci
cinq
cinquantaine
cinquante
cinquantième
cinquième
clac
clic
combien
comme
comment
comparable
comparables
compris
concernant
contre
couic
crac
d
da
dans
de
debout
dedans
dehors
deja
delà
depuis
dernier
derniere
derriere
derrière
des
desormais
desquelles
desquels
dessous
dessus
deux
deuxième
deuxièmement
devant
devers
devra
devrait
different
differentes
differents
différent
différente
différentes
différents
dire
directe
directement
dit
dite
dits
divers
diverse
diverses
dix
dix-huit
dix-neuf
dix-sept
dixième
doit
doivent
donc
dont
dos
douze
douzième
dring
droite
du
duquel
durant
dès
début
désormais
e
effet
egale
egalement
egales
eh
elle
elle-même
elles
elles-mêmes
en
encore
enfin
entre
envers
es
essai
est
et
etant
etc
etre
eu
eue
eues
euh
eurent
eus
eusse
eussent
eusses
eussiez
eussions
eut
eux
eux-mêmes
exactement
excepté
extenso
exterieur
eûmes
eût
eûtes
f
fais
faisaient
faisant
fait
faites
façon
feront
fi
flac
floc
fois
font
force
furent
fus
fusse
fussent
fusses
fussiez
fussions
fut
fûmes
fût
fûtes
g
gens
h
ha
haut
hein
hem
hep
hi
ho
holà
hop
hormis
hors
hou
houp
hue
hui
huit
huitième
hum
hurrah
hélas
i
ici
il
ils
importe
j
je
jusqu
jusque
juste
k
l
la
laisser
laquelle
las
le
lequel
les
lesquelles
lesquels
leur
leurs
longtemps
lors
lorsque
lui
lui-meme
lui-même
lès
m
ma
maint
maintenant
mais
malgre
malgré
maximale
me
meme
memes
merci
mes
mien
mienne
miennes
miens
mille
mince
mine
minimale
moi
moi-meme
moi-même
moindres
moins
mon
mot
moyennant
multiple
multiples
même
mêmes
n
na
naturel
naturelle
naturelles
ne
neanmoins
necessaire
necessairement
neuf
neuvième
ni
nombreuses
nombreux
nommés
non
nos
notamment
notre
nous
nous-mêmes
nouveau
nouveaux
nul
néanmoins
nôtre
nôtres
o
oh
ohé
ollé
olé
on
ont
onze
onzième
ore
ou
ouf
ouias
oust
ouste
outre
ouvert
ouverte
ouverts
o|
p
paf
pan
par
parce
parfois
parle
parlent
parler
parmi
parole
parseme
partant
particulier
particulière
particulièrement
pas
passé
pendant
pense
permet
personne
personnes
peu
peut
peuvent
peux
pff
pfft
pfut
pif
pire
pièce
plein
plouf
plupart
plus
plusieurs
plutôt
possessif
possessifs
possible
possibles
pouah
pour
pourquoi
pourrais
pourrait
pouvait
prealable
precisement
premier
première
premièrement
pres
probable
probante
procedant
proche
près
psitt
pu
puis
puisque
pur
pure
q
qu
quand
quant
quant-à-soi
quanta
quarante
quatorze
quatre
quatre-vingt
quatrième
quatrièmement
que
quel
quelconque
quelle
quelles
quelqu'un
quelque
quelques
quels
qui
quiconque
quinze
quoi
quoique
r
rare
rarement
rares
relative
relativement
remarquable
rend
rendre
restant
reste
restent
restrictif
retour
revoici
revoilà
rien
s
sa
sacrebleu
sait
sans
sapristi
sauf
se
sein
seize
selon
semblable
semblaient
semble
semblent
sent
sept
septième
sera
serai
seraient
serais
serait
seras
serez
seriez
serions
serons
seront
ses
seul
seule
seulement
si
sien
sienne
siennes
siens
sinon
six
sixième
soi
soi-même
soient
sois
soit
soixante
sommes
son
sont
sous
souvent
soyez
soyons
specifique
specifiques
speculatif
stop
strictement
subtiles
suffisant
suffisante
suffit
suis
suit
suivant
suivante
suivantes
suivants
suivre
sujet
superpose
sur
surtout
t
ta
tac
tandis
tant
tardive
te
tel
telle
tellement
telles
tels
tenant
tend
tenir
tente
tes
tic
tien
tienne
tiennes
tiens
toc
toi
toi-même
ton
touchant
toujours
tous
tout
toute
toutefois
toutes
treize
trente
tres
trois
troisième
troisièmement
trop
très
tsoin
tsouin
tu
u
un
une
unes
uniformement
unique
uniques
uns
v
va
vais
valeur
vas
vers
via
vif
vifs
vingt
vivat
vive
vives
vlan
voici
voie
voient
voilà
vont
vos
votre
vous
vous-mêmes
vu
vôtre
vôtres
w
x
y
z
zut
à
â
ça
ès
étaient
étais
était
étant
état
étiez
étions
été
étée
étées
étés
êtes
être
ô

View File

@ -0,0 +1,847 @@
ένα
έναν
ένας
αι
ακομα
ακομη
ακριβως
αληθεια
αληθινα
αλλα
αλλαχου
αλλες
αλλη
αλλην
αλλης
αλλιως
αλλιωτικα
αλλο
αλλοι
αλλοιως
αλλοιωτικα
αλλον
αλλος
αλλοτε
αλλου
αλλους
αλλων
αμα
αμεσα
αμεσως
αν
ανα
αναμεσα
αναμεταξυ
ανευ
αντι
αντιπερα
αντις
ανω
ανωτερω
αξαφνα
απ
απεναντι
απο
αποψε
από
αρα
αραγε
αργα
αργοτερο
αριστερα
αρκετα
αρχικα
ας
αυριο
αυτα
αυτες
αυτεσ
αυτη
αυτην
αυτης
αυτο
αυτοι
αυτον
αυτος
αυτοσ
αυτου
αυτους
αυτουσ
αυτων
αφοτου
αφου
αἱ
αἳ
αἵ
αὐτόσ
αὐτὸς
αὖ
α∆ιακοπα
βεβαια
βεβαιοτατα
γάρ
γα
γα^
γε
γι
για
γοῦν
γρηγορα
γυρω
γὰρ
δ'
δέ
δή
δαί
δαίσ
δαὶ
δαὶς
δε
δεν
δι
δι'
διά
δια
διὰ
δὲ
δὴ
δ’
εαν
εαυτο
εαυτον
εαυτου
εαυτους
εαυτων
εγκαιρα
εγκαιρως
εγω
ειθε
ειμαι
ειμαστε
ειναι
εις
εισαι
εισαστε
ειστε
ειτε
ειχα
ειχαμε
ειχαν
ειχατε
ειχε
ειχες
ει∆εμη
εκ
εκαστα
εκαστες
εκαστη
εκαστην
εκαστης
εκαστο
εκαστοι
εκαστον
εκαστος
εκαστου
εκαστους
εκαστων
εκει
εκεινα
εκεινες
εκεινεσ
εκεινη
εκεινην
εκεινης
εκεινο
εκεινοι
εκεινον
εκεινος
εκεινοσ
εκεινου
εκεινους
εκεινουσ
εκεινων
εκτος
εμας
εμεις
εμενα
εμπρος
εν
ενα
εναν
ενας
ενος
εντελως
εντος
εντωμεταξυ
ενω
ενός
εξ
εξαφνα
εξης
εξισου
εξω
επ
επί
επανω
επειτα
επει∆η
επι
επισης
επομενως
εσας
εσεις
εσενα
εστω
εσυ
ετερα
ετεραι
ετερας
ετερες
ετερη
ετερης
ετερο
ετεροι
ετερον
ετερος
ετερου
ετερους
ετερων
ετουτα
ετουτες
ετουτη
ετουτην
ετουτης
ετουτο
ετουτοι
ετουτον
ετουτος
ετουτου
ετουτους
ετουτων
ετσι
ευγε
ευθυς
ευτυχως
εφεξης
εχει
εχεις
εχετε
εχθες
εχομε
εχουμε
εχουν
εχτες
εχω
εως
εἰ
εἰμί
εἰμὶ
εἰς
εἰσ
εἴ
εἴμι
εἴτε
ε∆ω
η
ημασταν
ημαστε
ημουν
ησασταν
ησαστε
ησουν
ηταν
ητανε
ητοι
ηττον
η∆η
θα
ι
ιι
ιιι
ισαμε
ισια
ισως
ισωσ
ι∆ια
ι∆ιαν
ι∆ιας
ι∆ιες
ι∆ιο
ι∆ιοι
ι∆ιον
ι∆ιος
ι∆ιου
ι∆ιους
ι∆ιων
ι∆ιως
κ
καί
καίτοι
καθ
καθε
καθεμια
καθεμιας
καθενα
καθενας
καθενος
καθετι
καθολου
καθως
και
κακα
κακως
καλα
καλως
καμια
καμιαν
καμιας
καμποσα
καμποσες
καμποση
καμποσην
καμποσης
καμποσο
καμποσοι
καμποσον
καμποσος
καμποσου
καμποσους
καμποσων
κανεις
κανεν
κανενα
κανεναν
κανενας
κανενος
καποια
καποιαν
καποιας
καποιες
καποιο
καποιοι
καποιον
καποιος
καποιου
καποιους
καποιων
καποτε
καπου
καπως
κατ
κατά
κατα
κατι
κατιτι
κατοπιν
κατω
κατὰ
καὶ
κι
κιολας
κλπ
κοντα
κτλ
κυριως
κἀν
κἂν
λιγακι
λιγο
λιγωτερο
λογω
λοιπα
λοιπον
μέν
μέσα
μή
μήτε
μία
μα
μαζι
μακαρι
μακρυα
μαλιστα
μαλλον
μας
με
μεθ
μεθαυριο
μειον
μελει
μελλεται
μεμιας
μεν
μερικα
μερικες
μερικοι
μερικους
μερικων
μεσα
μετ
μετά
μετα
μεταξυ
μετὰ
μεχρι
μη
μην
μηπως
μητε
μη∆ε
μιά
μια
μιαν
μιας
μολις
μολονοτι
μοναχα
μονες
μονη
μονην
μονης
μονο
μονοι
μονομιας
μονος
μονου
μονους
μονων
μου
μπορει
μπορουν
μπραβο
μπρος
μἐν
μὲν
μὴ
μὴν
να
ναι
νωρις
ξανα
ξαφνικα
ο
οι
ολα
ολες
ολη
ολην
ολης
ολο
ολογυρα
ολοι
ολον
ολονεν
ολος
ολοτελα
ολου
ολους
ολων
ολως
ολως∆ιολου
ομως
ομωσ
οποια
οποιαν
οποιαν∆ηποτε
οποιας
οποιας∆ηποτε
οποια∆ηποτε
οποιες
οποιες∆ηποτε
οποιο
οποιοι
οποιον
οποιον∆ηποτε
οποιος
οποιος∆ηποτε
οποιου
οποιους
οποιους∆ηποτε
οποιου∆ηποτε
οποιο∆ηποτε
οποιων
οποιων∆ηποτε
οποι∆ηποτε
οποτε
οποτε∆ηποτε
οπου
οπου∆ηποτε
οπως
οπωσ
ορισμενα
ορισμενες
ορισμενων
ορισμενως
οσα
οσα∆ηποτε
οσες
οσες∆ηποτε
οση
οσην
οσην∆ηποτε
οσης
οσης∆ηποτε
οση∆ηποτε
οσο
οσοι
οσοι∆ηποτε
οσον
οσον∆ηποτε
οσος
οσος∆ηποτε
οσου
οσους
οσους∆ηποτε
οσου∆ηποτε
οσο∆ηποτε
οσων
οσων∆ηποτε
οταν
οτι
οτι∆ηποτε
οτου
ου
ουτε
ου∆ε
οχι
οἱ
οἳ
οἷς
οὐ
οὐδ
οὐδέ
οὐδείσ
οὐδεὶς
οὐδὲ
οὐδὲν
οὐκ
οὐχ
οὐχὶ
οὓς
οὔτε
οὕτω
οὕτως
οὕτωσ
οὖν
οὗ
οὗτος
οὗτοσ
παλι
παντοτε
παντου
παντως
παρ
παρά
παρα
παρὰ
περί
περα
περι
περιπου
περισσοτερο
περσι
περυσι
περὶ
πια
πιθανον
πιο
πισω
πλαι
πλεον
πλην
ποια
ποιαν
ποιας
ποιες
ποιεσ
ποιο
ποιοι
ποιον
ποιος
ποιοσ
ποιου
ποιους
ποιουσ
ποιων
πολυ
ποσες
ποση
ποσην
ποσης
ποσοι
ποσος
ποσους
ποτε
που
πουθε
πουθενα
ποῦ
πρεπει
πριν
προ
προκειμενου
προκειται
προπερσι
προς
προσ
προτου
προχθες
προχτες
πρωτυτερα
πρόσ
πρὸ
πρὸς
πως
πωσ
σαν
σας
σε
σεις
σημερα
σιγα
σου
στα
στη
στην
στης
στις
στο
στον
στου
στους
στων
συγχρονως
συν
συναμα
συνεπως
συνηθως
συχνα
συχνας
συχνες
συχνη
συχνην
συχνης
συχνο
συχνοι
συχνον
συχνος
συχνου
συχνους
συχνων
συχνως
σχε∆ον
σωστα
σόσ
σύ
σύν
σὸς
σὺ
σὺν
τά
τήν
τί
τίς
τίσ
τα
ταυτα
ταυτες
ταυτη
ταυτην
ταυτης
ταυτο,ταυτον
ταυτος
ταυτου
ταυτων
ταχα
ταχατε
ταῖς
τα∆ε
τε
τελικα
τελικως
τες
τετοια
τετοιαν
τετοιας
τετοιες
τετοιο
τετοιοι
τετοιον
τετοιος
τετοιου
τετοιους
τετοιων
τη
την
της
τησ
τι
τινα
τιποτα
τιποτε
τις
τισ
το
τοί
τοι
τοιοῦτος
τοιοῦτοσ
τον
τος
τοσα
τοσες
τοση
τοσην
τοσης
τοσο
τοσοι
τοσον
τοσος
τοσου
τοσους
τοσων
τοτε
του
τουλαχιστο
τουλαχιστον
τους
τουτα
τουτες
τουτη
τουτην
τουτης
τουτο
τουτοι
τουτοις
τουτον
τουτος
τουτου
τουτους
τουτων
τούσ
τοὺς
τοῖς
τοῦ
τυχον
των
τωρα
τό
τόν
τότε
τὰ
τὰς
τὴν
τὸ
τὸν
τῆς
τῆσ
τῇ
τῶν
τῷ
υπ
υπερ
υπο
υποψη
υποψιν
υπό
υστερα
φετος
χαμηλα
χθες
χτες
χωρις
χωριστα
ψηλα
ω
ωραια
ως
ωσ
ωσαν
ωσοτου
ωσπου
ωστε
ωστοσο
ωχ
ἀλλ'
ἀλλά
ἀλλὰ
ἀλλ’
ἀπ
ἀπό
ἀπὸ
ἀφ
ἂν
ἄλλος
ἄλλοσ
ἄν
ἄρα
ἅμα
ἐάν
ἐγώ
ἐγὼ
ἐκ
ἐμόσ
ἐμὸς
ἐν
ἐξ
ἐπί
ἐπεὶ
ἐπὶ
ἐστι
ἐφ
ἐὰν
ἑαυτοῦ
ἔτι
ἧς
ἵνα
ὃν
ὃς
ὅδε
ὅθεν
ὅπερ
ὅς
ὅσ
ὅστις
ὅστισ
ὅτε
ὅτι
ὑμόσ
ὑπ
ὑπέρ
ὑπό
ὑπὲρ
ὑπὸ
ὡς
ὡσ
ὥς
ὥστε
∆α
∆ε
∆εινα
∆εν
∆εξια
∆ηθεν
∆ηλα∆η
∆ι
∆ια
∆ιαρκως
∆ικα
∆ικο
∆ικοι
∆ικος
∆ικου
∆ικους
∆ιολου
∆ιπλα
∆ιχως

View File

@ -0,0 +1,655 @@
a
abbastanza
abbia
abbiamo
abbiano
abbiate
accidenti
ad
adesso
affinche
agl
agli
ahime
ahimã¨
ahimè
ai
al
alcuna
alcuni
alcuno
all
alla
alle
allo
allora
altre
altri
altrimenti
altro
altrove
altrui
anche
ancora
anni
anno
ansa
anticipo
assai
attesa
attraverso
avanti
avemmo
avendo
avente
aver
avere
averlo
avesse
avessero
avessi
avessimo
aveste
avesti
avete
aveva
avevamo
avevano
avevate
avevi
avevo
avrai
avranno
avrebbe
avrebbero
avrei
avremmo
avremo
avreste
avresti
avrete
avrà
avrò
avuta
avute
avuti
avuto
basta
ben
bene
benissimo
berlusconi
brava
bravo
buono
c
casa
caso
cento
certa
certe
certi
certo
che
chi
chicchessia
chiunque
ci
ciascuna
ciascuno
cima
cinque
cio
cioe
cioã¨
cioè
circa
citta
città
cittã
ciã²
ciò
co
codesta
codesti
codesto
cogli
coi
col
colei
coll
coloro
colui
come
cominci
comprare
comunque
con
concernente
conciliarsi
conclusione
consecutivi
consecutivo
consiglio
contro
cortesia
cos
cosa
cosi
cosã¬
così
cui
d
da
dagl
dagli
dai
dal
dall
dalla
dalle
dallo
dappertutto
davanti
degl
degli
dei
del
dell
della
delle
dello
dentro
detto
deve
devo
di
dice
dietro
dire
dirimpetto
diventa
diventare
diventato
dopo
doppio
dov
dove
dovra
dovrà
dovrã
dovunque
due
dunque
durante
e
ebbe
ebbero
ebbi
ecc
ecco
ed
effettivamente
egli
ella
entrambi
eppure
era
erano
eravamo
eravate
eri
ero
esempio
esse
essendo
esser
essere
essi
ex
fa
faccia
facciamo
facciano
facciate
faccio
facemmo
facendo
facesse
facessero
facessi
facessimo
faceste
facesti
faceva
facevamo
facevano
facevate
facevi
facevo
fai
fanno
farai
faranno
fare
farebbe
farebbero
farei
faremmo
faremo
fareste
faresti
farete
farà
farò
fatto
favore
fece
fecero
feci
fin
finalmente
finche
fine
fino
forse
forza
fosse
fossero
fossi
fossimo
foste
fosti
fra
frattempo
fu
fui
fummo
fuori
furono
futuro
generale
gente
gia
giacche
giorni
giorno
giu
già
giã
gli
gliela
gliele
glieli
glielo
gliene
governo
grande
grazie
gruppo
ha
haha
hai
hanno
ho
i
ie
ieri
il
improvviso
in
inc
indietro
infatti
inoltre
insieme
intanto
intorno
invece
io
l
la
lasciato
lato
lavoro
le
lei
li
lo
lontano
loro
lui
lungo
luogo
ma
macche
magari
maggior
mai
male
malgrado
malissimo
mancanza
me
medesimo
mediante
meglio
meno
mentre
mesi
mezzo
mi
mia
mie
miei
mila
miliardi
milioni
minimi
ministro
mio
modo
molta
molti
moltissimo
molto
momento
mondo
mosto
nazionale
ne
negl
negli
nei
nel
nell
nella
nelle
nello
nemmeno
neppure
nessun
nessuna
nessuno
niente
no
noi
nome
non
nondimeno
nonostante
nonsia
nostra
nostre
nostri
nostro
novanta
nove
nulla
nuovi
nuovo
o
od
oggi
ogni
ognuna
ognuno
oltre
oppure
ora
ore
osi
ossia
ottanta
otto
paese
parecchi
parecchie
parecchio
parte
partendo
peccato
peggio
per
perche
perchã¨
perchè
perché
percio
perciã²
perciò
perfino
pero
persino
persone
perã²
però
piedi
pieno
piglia
piu
piuttosto
piã¹
più
po
pochissimo
poco
poi
poiche
possa
possedere
posteriore
posto
potrebbe
preferibilmente
presa
prima
primo
principalmente
probabilmente
promesso
proprio
puo
pure
purtroppo
puã²
può
qua
qualche
qualcosa
qualcuna
qualcuno
quale
quali
qualunque
quando
quanta
quante
quanti
quanto
quantunque
quarto
quasi
quattro
quel
quella
quelle
quelli
quello
quest
questa
queste
questi
questo
qui
quindi
quinto
realmente
recente
recentemente
registrazione
relativo
riecco
rispetto
salvo
sara
sarai
saranno
sarebbe
sarebbero
sarei
saremmo
saremo
sareste
saresti
sarete
sarà
sarã
sarò
scola
scopo
scorso
se
secondo
seguente
seguito
sei
sembra
sembrare
sembrato
sembrava
sembri
sempre
senza
sette
si
sia
siamo
siano
siate
siete
sig
solito
solo
soltanto
sono
sopra
soprattutto
sotto
spesso
srl
sta
stai
stando
stanno
starai
staranno
starebbe
starebbero
starei
staremmo
staremo
stareste
staresti
starete
starà
starò
stata
stati
stava
stavamo
stavano
stavate
stavi
stavo
stemmo
stessa
stesse
stessero
stessi
stessimo
stesso
steste
stesti
stette
stettero
stetti
stia
stiamo
stiano
stiate
sto
su
sua
subito
successivamente
successivo
sue
sugl
sugli
sui
sul
sull
sulla
sulle
sullo
suo
suoi
tale
tali
talvolta
tanto
te
tempo
terzo
th
ti
titolo
tra
tranne
tre
trenta
triplo
troppo
trovato
tu
tua
tue
tuo
tuoi
tutta
tuttavia
tutte
tutti
tutto
uguali
ulteriore
ultimo
un
una
uno
uomo
va
vai
vale
vari
varia
varie
vario
verso
vi
via
vicino
visto
vita
voi
volta
volte
vostra
vostre
vostri
vostro
ã¨
è

Some files were not shown because too many files have changed in this diff Show More