diff --git a/dhp-pace-core/pom.xml b/dhp-pace-core/pom.xml index b66976ea6..cd6305051 100644 --- a/dhp-pace-core/pom.xml +++ b/dhp-pace-core/pom.xml @@ -81,9 +81,12 @@ org.apache.spark - spark-catalyst_2.11 - 2.4.0.cloudera2 - compile + spark-core_2.11 + + + + org.apache.spark + spark-sql_2.11 diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java index b7a70d607..3da8eb490 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java @@ -1,8 +1,5 @@ -package eu.dnetlib.pace.clustering; -import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.config.Config; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.clustering; import java.util.Collection; import java.util.HashSet; @@ -10,32 +7,39 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; + public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction { protected Map params; - + public AbstractClusteringFunction(final Map params) { this.params = params; } protected abstract Collection doApply(Config conf, String s); - + @Override public Collection apply(Config conf, List fields) { - return fields.stream().filter(f -> !f.isEmpty()) - .map(this::normalize) - .map(s -> filterAllStopWords(s)) - .map(s -> doApply(conf, s)) - .map(c -> filterBlacklisted(c, ngramBlacklist)) - .flatMap(c -> c.stream()) - .filter(StringUtils::isNotBlank) - .collect(Collectors.toCollection(HashSet::new)); + return fields + .stream() + .filter(f -> !f.isEmpty()) + .map(this::normalize) + .map(s -> filterAllStopWords(s)) + 
.map(s -> doApply(conf, s)) + .map(c -> filterBlacklisted(c, ngramBlacklist)) + .flatMap(c -> c.stream()) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toCollection(HashSet::new)); } public Map getParams() { return params; } - + protected Integer param(String name) { return params.get(name); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java index d3008332d..9072fbb4b 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; @@ -6,6 +7,7 @@ import java.util.Set; import java.util.StringTokenizer; import com.google.common.collect.Sets; + import eu.dnetlib.pace.config.Config; @ClusteringClass("acronyms") @@ -19,16 +21,16 @@ public class Acronyms extends AbstractClusteringFunction { protected Collection doApply(Config conf, String s) { return extractAcronyms(s, param("max"), param("minLen"), param("maxLen")); } - + private Set extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) { - + final Set acronyms = Sets.newLinkedHashSet(); - + for (int i = 0; i < maxAcronyms; i++) { - + final StringTokenizer st = new StringTokenizer(s); final StringBuilder sb = new StringBuilder(); - + while (st.hasMoreTokens()) { final String token = st.nextToken(); if (sb.length() > maxLen) { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java index e67767171..3bb845b15 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.lang.annotation.ElementType; @@ -9,5 +10,5 @@ import 
java.lang.annotation.Target; @Target(ElementType.TYPE) public @interface ClusteringClass { - public String value(); -} \ No newline at end of file + public String value(); +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java index 4660d2b6c..8b7852418 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java @@ -1,15 +1,16 @@ -package eu.dnetlib.pace.clustering; -import eu.dnetlib.pace.config.Config; +package eu.dnetlib.pace.clustering; import java.util.Collection; import java.util.List; import java.util.Map; +import eu.dnetlib.pace.config.Config; + public interface ClusteringFunction { - + public Collection apply(Config config, List fields); - + public Map getParams(); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java index 7f342f69c..bc8844aee 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; @@ -5,6 +6,7 @@ import java.util.List; import java.util.Map; import com.google.common.collect.Lists; + import eu.dnetlib.pace.config.Config; @ClusteringClass("immutablefieldvalue") diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java index 73ba221c3..38299adb4 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java @@ -1,50 +1,54 @@ -package eu.dnetlib.pace.clustering; 
-import eu.dnetlib.pace.config.Config; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.clustering; import java.util.*; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.pace.config.Config; + @ClusteringClass("keywordsclustering") public class KeywordsClustering extends AbstractClusteringFunction { - public KeywordsClustering(Map params) { - super(params); - } + public KeywordsClustering(Map params) { + super(params); + } - @Override - protected Collection doApply(final Config conf, String s) { + @Override + protected Collection doApply(final Config conf, String s) { - //takes city codes and keywords codes without duplicates - Set keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4)); - Set cities = getCities(s, params.getOrDefault("windowSize", 4)); + // takes city codes and keywords codes without duplicates + Set keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4)); + Set cities = getCities(s, params.getOrDefault("windowSize", 4)); - //list of combination to return as result - final Collection combinations = new LinkedHashSet(); + // list of combination to return as result + final Collection combinations = new LinkedHashSet(); - for (String keyword: keywordsToCodes(keywords, conf.translationMap())){ - for (String city: citiesToCodes(cities)) { - combinations.add(keyword+"-"+city); - if (combinations.size()>=params.getOrDefault("max", 2)) { - return combinations; - } - } - } + for (String keyword : keywordsToCodes(keywords, conf.translationMap())) { + for (String city : citiesToCodes(cities)) { + combinations.add(keyword + "-" + city); + if (combinations.size() >= params.getOrDefault("max", 2)) { + return combinations; + } + } + } - return combinations; - } + return combinations; + } - @Override - public Collection apply(final Config conf, List fields) { - return fields.stream().filter(f -> !f.isEmpty()) - 
.map(this::cleanup) - .map(this::normalize) - .map(s -> filterAllStopWords(s)) - .map(s -> doApply(conf, s)) - .map(c -> filterBlacklisted(c, ngramBlacklist)) - .flatMap(c -> c.stream()) - .filter(StringUtils::isNotBlank) - .collect(Collectors.toCollection(HashSet::new)); - } -} \ No newline at end of file + @Override + public Collection apply(final Config conf, List fields) { + return fields + .stream() + .filter(f -> !f.isEmpty()) + .map(this::cleanup) + .map(this::normalize) + .map(s -> filterAllStopWords(s)) + .map(s -> doApply(conf, s)) + .map(c -> filterBlacklisted(c, ngramBlacklist)) + .flatMap(c -> c.stream()) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toCollection(HashSet::new)); + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java index fa45ac909..5a385961a 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java @@ -1,75 +1,79 @@ -package eu.dnetlib.pace.clustering; -import com.google.common.collect.Lists; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Person; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.clustering; import java.util.*; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; + +import com.google.common.collect.Lists; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Person; + @ClusteringClass("lnfi") -public class LastNameFirstInitial extends AbstractClusteringFunction{ +public class LastNameFirstInitial extends AbstractClusteringFunction { - private boolean DEFAULT_AGGRESSIVE = true; + private boolean DEFAULT_AGGRESSIVE = true; - public LastNameFirstInitial(final Map params) { - super(params); - } + public LastNameFirstInitial(final Map params) { + super(params); + } - @Override - 
public Collection apply(Config conf, List fields) { - return fields.stream().filter(f -> !f.isEmpty()) - .map(this::normalize) - .map(s -> doApply(conf, s)) - .map(c -> filterBlacklisted(c, ngramBlacklist)) - .flatMap(c -> c.stream()) - .filter(StringUtils::isNotBlank) - .collect(Collectors.toCollection(HashSet::new)); - } + @Override + public Collection apply(Config conf, List fields) { + return fields + .stream() + .filter(f -> !f.isEmpty()) + .map(this::normalize) + .map(s -> doApply(conf, s)) + .map(c -> filterBlacklisted(c, ngramBlacklist)) + .flatMap(c -> c.stream()) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toCollection(HashSet::new)); + } - @Override - protected String normalize(final String s) { - return fixAliases(transliterate(nfd(unicodeNormalization(s)))) - // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings - .replaceAll("[^ \\w]+", "") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim(); - } + @Override + protected String normalize(final String s) { + return fixAliases(transliterate(nfd(unicodeNormalization(s)))) + // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input + // strings + .replaceAll("[^ \\w]+", "") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } - @Override - protected Collection doApply(final Config conf, final String s) { + @Override + protected Collection doApply(final Config conf, final String s) { - final List res = Lists.newArrayList(); + final List res = Lists.newArrayList(); - final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? 
getParams().get("aggressive") : DEFAULT_AGGRESSIVE); + final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") + : DEFAULT_AGGRESSIVE); - Person p = new Person(s, aggressive); + Person p = new Person(s, aggressive); - if (p.isAccurate()) { - String lastName = p.getNormalisedSurname().toLowerCase(); - String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1); + if (p.isAccurate()) { + String lastName = p.getNormalisedSurname().toLowerCase(); + String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0, 1); - res.add(firstInitial.concat(lastName)); - } - else { // is not accurate, meaning it has no defined name and surname - List fullname = Arrays.asList(p.getNormalisedFullname().split(" ")); - if (fullname.size() == 1) { - res.add(p.getNormalisedFullname().toLowerCase()); - } - else if (fullname.size() == 2) { - res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase()); - res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase()); - } - else { - res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase()); - res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase()); - } - } + res.add(firstInitial.concat(lastName)); + } else { // is not accurate, meaning it has no defined name and surname + List fullname = Arrays.asList(p.getNormalisedFullname().split(" ")); + if (fullname.size() == 1) { + res.add(p.getNormalisedFullname().toLowerCase()); + } else if (fullname.size() == 2) { + res.add(fullname.get(0).substring(0, 1).concat(fullname.get(1)).toLowerCase()); + res.add(fullname.get(1).substring(0, 1).concat(fullname.get(0)).toLowerCase()); + } else { + res.add(fullname.get(0).substring(0, 1).concat(fullname.get(fullname.size() - 1)).toLowerCase()); + res.add(fullname.get(fullname.size() - 1).substring(0, 1).concat(fullname.get(0)).toLowerCase()); + } + } - return res; - } -} \ 
No newline at end of file + return res; + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java index d50a95008..a3a6c4881 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java @@ -1,14 +1,17 @@ -package eu.dnetlib.pace.clustering; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.config.Config; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.clustering; import java.util.Collection; import java.util.List; import java.util.Map; +import org.apache.commons.lang3.StringUtils; + +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.config.Config; + @ClusteringClass("lowercase") public class LowercaseClustering extends AbstractClusteringFunction { @@ -19,7 +22,7 @@ public class LowercaseClustering extends AbstractClusteringFunction { @Override public Collection apply(Config conf, List fields) { Collection c = Sets.newLinkedHashSet(); - for(String f : fields) { + for (String f : fields) { c.addAll(doApply(conf, f)); } return c; @@ -27,7 +30,7 @@ public class LowercaseClustering extends AbstractClusteringFunction { @Override protected Collection doApply(final Config conf, final String s) { - if(StringUtils.isBlank(s)) { + if (StringUtils.isBlank(s)) { return Lists.newArrayList(); } return Lists.newArrayList(s.toLowerCase().trim()); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java index 30d33629c..4c81e9a48 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java @@ -1,3 +1,4 @@ + package 
eu.dnetlib.pace.clustering; import java.util.Set; @@ -11,7 +12,8 @@ public class NGramUtils extends AbstractPaceFunctions { private static final int SIZE = 100; - private static final Set stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); + private static final Set stopwords = AbstractPaceFunctions + .loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); public static String cleanupForOrdering(String s) { String result = NGRAMUTILS.filterStopWords(NGRAMUTILS.normalize(s), stopwords); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java index fd7c17ec3..e42cabd8d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; @@ -6,6 +7,7 @@ import java.util.List; import java.util.Map; import com.google.common.collect.Lists; + import eu.dnetlib.pace.config.Config; @ClusteringClass("ngrampairs") @@ -32,7 +34,7 @@ public class NgramPairs extends Ngrams { break; } res.add(ngrams.get(i) + ngrams.get(j)); - //System.out.println("-- " + concatNgrams); + // System.out.println("-- " + concatNgrams); } return res; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java index 3af7e98e8..96c305a16 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java @@ -1,9 +1,10 @@ + package eu.dnetlib.pace.clustering; -import eu.dnetlib.pace.config.Config; - import java.util.*; +import eu.dnetlib.pace.config.Config; + @ClusteringClass("ngrams") public class Ngrams extends AbstractClusteringFunction { @@ -44,7 +45,7 @@ public class Ngrams extends 
AbstractClusteringFunction { } } } - //System.out.println(ngrams + " n: " + ngrams.size()); + // System.out.println(ngrams + " n: " + ngrams.size()); return ngrams; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java index a5bad2075..b4a04ce65 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java @@ -1,16 +1,19 @@ -package eu.dnetlib.pace.clustering; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Person; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.clustering; import java.util.Collection; import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.commons.lang3.StringUtils; + +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Person; + @ClusteringClass("personClustering") public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction { @@ -30,7 +33,8 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin final Person person = new Person(f, false); - if (StringUtils.isNotBlank(person.getNormalisedFirstName()) && StringUtils.isNotBlank(person.getNormalisedSurname())) { + if (StringUtils.isNotBlank(person.getNormalisedFirstName()) + && StringUtils.isNotBlank(person.getNormalisedSurname())) { hashes.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase()); } else { for (final String token1 : tokens(f, MAX_TOKENS)) { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java 
index f6c4fe07f..a3d58a9be 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; @@ -22,7 +23,8 @@ public class PersonHash extends AbstractClusteringFunction { protected Collection doApply(final Config conf, final String s) { final List res = Lists.newArrayList(); - final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE); + final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") + : DEFAULT_AGGRESSIVE); res.add(new Person(s, aggressive).hash()); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java index 86a2e4e4f..2aab926da 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java @@ -1,10 +1,11 @@ -package eu.dnetlib.pace.clustering; -import eu.dnetlib.pace.config.Config; +package eu.dnetlib.pace.clustering; import java.util.Collection; import java.util.Map; +import eu.dnetlib.pace.config.Config; + public class RandomClusteringFunction extends AbstractClusteringFunction { public RandomClusteringFunction(Map params) { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java index 77c2c0155..5809d8216 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.*; @@ -5,6 +6,7 @@ import java.util.*; import 
com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Lists; + import eu.dnetlib.pace.config.Config; @ClusteringClass("sortedngrampairs") diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java index 50cea4db3..392aecc79 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java @@ -1,15 +1,17 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; import java.util.List; import java.util.Map; -import eu.dnetlib.pace.config.Config; import org.apache.commons.lang3.RandomStringUtils; import org.apache.commons.lang3.StringUtils; import com.google.common.collect.Lists; +import eu.dnetlib.pace.config.Config; + @ClusteringClass("spacetrimmingfieldvalue") public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { @@ -21,7 +23,10 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { protected Collection doApply(final Config conf, final String s) { final List res = Lists.newArrayList(); - res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", "")); + res + .add( + StringUtils.isBlank(s) ? 
RandomStringUtils.random(getParams().get("randomLength")) + : s.toLowerCase().replaceAll("\\s+", "")); return res; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java index fa1f64362..2a1c023a9 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; @@ -5,6 +6,7 @@ import java.util.Map; import java.util.Set; import com.google.common.collect.Sets; + import eu.dnetlib.pace.config.Config; @ClusteringClass("suffixprefix") @@ -18,7 +20,7 @@ public class SuffixPrefix extends AbstractClusteringFunction { protected Collection doApply(Config conf, String s) { return suffixPrefix(s, param("len"), param("max")); } - + private Collection suffixPrefix(String s, int len, int max) { final Set bigrams = Sets.newLinkedHashSet(); int i = 0; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java index 235cec101..5b267ad10 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java @@ -1,7 +1,5 @@ -package eu.dnetlib.pace.clustering; -import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.config.Config; +package eu.dnetlib.pace.clustering; import java.net.MalformedURLException; import java.net.URL; @@ -11,42 +9,44 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; + @ClusteringClass("urlclustering") public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction { - protected Map params; + protected Map 
params; - public UrlClustering(final Map params) { - this.params = params; - } + public UrlClustering(final Map params) { + this.params = params; + } - @Override - public Collection apply(final Config conf, List fields) { - try { - return fields.stream() - .filter(f -> !f.isEmpty()) - .map(this::asUrl) - .map(URL::getHost) - .collect(Collectors.toCollection(HashSet::new)); - } - catch (IllegalStateException e){ - return new HashSet<>(); - } - } + @Override + public Collection apply(final Config conf, List fields) { + try { + return fields + .stream() + .filter(f -> !f.isEmpty()) + .map(this::asUrl) + .map(URL::getHost) + .collect(Collectors.toCollection(HashSet::new)); + } catch (IllegalStateException e) { + return new HashSet<>(); + } + } - @Override - public Map getParams() { - return null; - } - - private URL asUrl(String value) { - try { - return new URL(value); - } catch (MalformedURLException e) { - // should not happen as checked by pace typing - throw new IllegalStateException("invalid URL: " + value); - } - } + @Override + public Map getParams() { + return null; + } + private URL asUrl(String value) { + try { + return new URL(value); + } catch (MalformedURLException e) { + // should not happen as checked by pace typing + throw new IllegalStateException("invalid URL: " + value); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java index 6fa2668fa..c8e02f8f0 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java @@ -1,90 +1,91 @@ -package eu.dnetlib.pace.clustering; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.config.Config; +package eu.dnetlib.pace.clustering; import java.util.*; import java.util.stream.Collectors; +import com.google.common.collect.Sets; + 
+import eu.dnetlib.pace.config.Config; + @ClusteringClass("wordsStatsSuffixPrefixChain") public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction { - public WordsStatsSuffixPrefixChain(Map params) { - super(params); - } + public WordsStatsSuffixPrefixChain(Map params) { + super(params); + } - @Override - protected Collection doApply(Config conf, String s) { - return suffixPrefixChain(s, param("mod")); - } + @Override + protected Collection doApply(Config conf, String s) { + return suffixPrefixChain(s, param("mod")); + } - private Collection suffixPrefixChain(String s, int mod) { + private Collection suffixPrefixChain(String s, int mod) { - //create the list of words from the string (remove short words) - List wordsList = - Arrays.stream(s.split(" ")) - .filter(si -> si.length() > 3) - .collect(Collectors.toList()); + // create the list of words from the string (remove short words) + List wordsList = Arrays + .stream(s.split(" ")) + .filter(si -> si.length() > 3) + .collect(Collectors.toList()); - final int words = wordsList.size(); - final int letters = s.length(); + final int words = wordsList.size(); + final int letters = s.length(); - //create the prefix: number of words + number of letters/mod - String prefix = words + "-" + letters/mod + "-"; + // create the prefix: number of words + number of letters/mod + String prefix = words + "-" + letters / mod + "-"; - return doSuffixPrefixChain(wordsList, prefix); + return doSuffixPrefixChain(wordsList, prefix); - } + } - private Collection doSuffixPrefixChain(List wordsList, String prefix) { + private Collection doSuffixPrefixChain(List wordsList, String prefix) { - Set set = Sets.newLinkedHashSet(); - switch(wordsList.size()){ - case 0: - case 1: - break; - case 2: - set.add( - prefix + - suffix(wordsList.get(0), 3) + - prefix(wordsList.get(1), 3) - ); + Set set = Sets.newLinkedHashSet(); + switch (wordsList.size()) { + case 0: + case 1: + break; + case 2: + set + .add( + prefix + + 
suffix(wordsList.get(0), 3) + + prefix(wordsList.get(1), 3)); - set.add( - prefix + - prefix(wordsList.get(0), 3) + - suffix(wordsList.get(1), 3) - ); + set + .add( + prefix + + prefix(wordsList.get(0), 3) + + suffix(wordsList.get(1), 3)); - break; - default: - set.add( - prefix + - suffix(wordsList.get(0), 3) + - prefix(wordsList.get(1), 3) + - suffix(wordsList.get(2), 3) - ); + break; + default: + set + .add( + prefix + + suffix(wordsList.get(0), 3) + + prefix(wordsList.get(1), 3) + + suffix(wordsList.get(2), 3)); - set.add( - prefix + - prefix(wordsList.get(0), 3) + - suffix(wordsList.get(1), 3) + - prefix(wordsList.get(2), 3) - ); - break; - } + set + .add( + prefix + + prefix(wordsList.get(0), 3) + + suffix(wordsList.get(1), 3) + + prefix(wordsList.get(2), 3)); + break; + } - return set; + return set; - } + } + private String suffix(String s, int len) { + return s.substring(s.length() - len); + } - private String suffix(String s, int len) { - return s.substring(s.length()-len); - } - - private String prefix(String s, int len) { - return s.substring(0, len); - } + private String prefix(String s, int len) { + return s.substring(0, len); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java index 1e94b34d2..e606590a5 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; @@ -5,53 +6,54 @@ import java.util.Map; import java.util.Set; import com.google.common.collect.Sets; + import eu.dnetlib.pace.config.Config; @ClusteringClass("wordssuffixprefix") public class WordsSuffixPrefix extends AbstractClusteringFunction { - public WordsSuffixPrefix(Map params) { - super(params); - } + public WordsSuffixPrefix(Map params) { + super(params); + } - 
@Override - protected Collection doApply(Config conf, String s) { - return suffixPrefix(s, param("len"), param("max")); - } + @Override + protected Collection doApply(Config conf, String s) { + return suffixPrefix(s, param("len"), param("max")); + } - private Collection suffixPrefix(String s, int len, int max) { + private Collection suffixPrefix(String s, int len, int max) { - final int words = s.split(" ").length; + final int words = s.split(" ").length; - // adjust the token length according to the number of words - switch (words) { - case 1: - return Sets.newLinkedHashSet(); - case 2: - return doSuffixPrefix(s, len+2, max, words); - case 3: - return doSuffixPrefix(s, len+1, max, words); - default: - return doSuffixPrefix(s, len, max, words); - } - } + // adjust the token length according to the number of words + switch (words) { + case 1: + return Sets.newLinkedHashSet(); + case 2: + return doSuffixPrefix(s, len + 2, max, words); + case 3: + return doSuffixPrefix(s, len + 1, max, words); + default: + return doSuffixPrefix(s, len, max, words); + } + } - private Collection doSuffixPrefix(String s, int len, int max, int words) { - final Set bigrams = Sets.newLinkedHashSet(); - int i = 0; - while (++i < s.length() && bigrams.size() < max) { - int j = s.indexOf(" ", i); + private Collection doSuffixPrefix(String s, int len, int max, int words) { + final Set bigrams = Sets.newLinkedHashSet(); + int i = 0; + while (++i < s.length() && bigrams.size() < max) { + int j = s.indexOf(" ", i); - int offset = j + len + 1 < s.length() ? j + len + 1 : s.length(); + int offset = j + len + 1 < s.length() ? 
j + len + 1 : s.length(); - if (j - len > 0) { - String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim(); - if (bigram.length() >= 4) { - bigrams.add(words+bigram); - } - } - } - return bigrams; - } + if (j - len > 0) { + String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim(); + if (bigram.length() >= 4) { + bigrams.add(words + bigram); + } + } + } + return bigrams; + } -} \ No newline at end of file +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 3b80bfcd1..06a955ba5 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -1,14 +1,5 @@ -package eu.dnetlib.pace.common; -import com.google.common.base.Joiner; -import com.google.common.base.Splitter; -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import com.ibm.icu.text.Transliterator; -import eu.dnetlib.pace.clustering.NGramUtils; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.common; import java.io.IOException; import java.io.StringWriter; @@ -19,6 +10,18 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; + +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import com.ibm.icu.text.Transliterator; + +import eu.dnetlib.pace.clustering.NGramUtils; + /** * Set of common functions for the framework * @@ -26,321 +29,325 @@ import java.util.stream.Collectors; */ public abstract class AbstractPaceFunctions { - 
//city map to be used when translating the city names into codes - private static Map cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); + // city map to be used when translating the city names into codes + private static Map cityMap = AbstractPaceFunctions + .loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); - //list of stopwords in different languages - protected static Set stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt"); - protected static Set stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); - protected static Set stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt"); - protected static Set stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt"); - protected static Set stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt"); - protected static Set stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt"); - protected static Set stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt"); + // list of stopwords in different languages + protected static Set stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt"); + protected static Set stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); + protected static Set stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt"); + protected static Set stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt"); + protected static Set stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt"); + protected static Set stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt"); + protected static Set stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt"); - //transliterator - protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng"); + // transliterator 
+ protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng"); - //blacklist of ngrams: to avoid generic keys - protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt"); + // blacklist of ngrams: to avoid generic keys + protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt"); - //html regex for normalization - public final String HTML_REGEX = "<[^>]*>"; + // html regex for normalization + public final String HTML_REGEX = "<[^>]*>"; - private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "; - private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń"; - private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn"; + private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "; + private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń"; + private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn"; - //doi prefix for normalization - public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; + // doi prefix for normalization + public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; - private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?"); + private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?"); - private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})"); + private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})"); - protected String concat(final List l) { - return Joiner.on(" ").skipNulls().join(l); - } + protected String concat(final List l) { + return Joiner.on(" ").skipNulls().join(l); + } - protected String 
cleanup(final String s) { + protected String cleanup(final String s) { - final String s1 = s.replaceAll(HTML_REGEX, ""); - final String s2 = unicodeNormalization(s1.toLowerCase()); - final String s3 = nfd(s2); - final String s4 = fixXML(s3); - final String s5 = s4.replaceAll("([0-9]+)", " $1 "); - final String s6 = transliterate(s5); - final String s7 = fixAliases(s6); - final String s8 = s7.replaceAll("[^\\p{ASCII}]", ""); - final String s9 = s8.replaceAll("[\\p{Punct}]", " "); - final String s10 = s9.replaceAll("\\n", " "); - final String s11 = s10.replaceAll("(?m)\\s+", " "); - final String s12 = s11.trim(); - return s12; - } + final String s1 = s.replaceAll(HTML_REGEX, ""); + final String s2 = unicodeNormalization(s1.toLowerCase()); + final String s3 = nfd(s2); + final String s4 = fixXML(s3); + final String s5 = s4.replaceAll("([0-9]+)", " $1 "); + final String s6 = transliterate(s5); + final String s7 = fixAliases(s6); + final String s8 = s7.replaceAll("[^\\p{ASCII}]", ""); + final String s9 = s8.replaceAll("[\\p{Punct}]", " "); + final String s10 = s9.replaceAll("\\n", " "); + final String s11 = s10.replaceAll("(?m)\\s+", " "); + final String s12 = s11.trim(); + return s12; + } - protected String fixXML(final String a){ + protected String fixXML(final String a) { - return a.replaceAll("–", " ") - .replaceAll("&", " ") - .replaceAll(""", " ") - .replaceAll("−", " "); - } + return a + .replaceAll("–", " ") + .replaceAll("&", " ") + .replaceAll(""", " ") + .replaceAll("−", " "); + } - protected boolean checkNumbers(final String a, final String b) { - final String numbersA = getNumbers(a); - final String numbersB = getNumbers(b); - final String romansA = getRomans(a); - final String romansB = getRomans(b); - return !numbersA.equals(numbersB) || !romansA.equals(romansB); - } + protected boolean checkNumbers(final String a, final String b) { + final String numbersA = getNumbers(a); + final String numbersB = getNumbers(b); + final String romansA = getRomans(a); + 
final String romansB = getRomans(b); + return !numbersA.equals(numbersB) || !romansA.equals(romansB); + } - protected String getRomans(final String s) { - final StringBuilder sb = new StringBuilder(); - for (final String t : s.split(" ")) { - sb.append(isRoman(t) ? t : ""); - } - return sb.toString(); - } + protected String getRomans(final String s) { + final StringBuilder sb = new StringBuilder(); + for (final String t : s.split(" ")) { + sb.append(isRoman(t) ? t : ""); + } + return sb.toString(); + } - protected boolean isRoman(final String s) { - return s.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop").equals("qwertyuiop"); - } + protected boolean isRoman(final String s) { + return s + .replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop") + .equals("qwertyuiop"); + } - protected String getNumbers(final String s) { - final StringBuilder sb = new StringBuilder(); - for (final String t : s.split(" ")) { - sb.append(isNumber(t) ? t : ""); - } - return sb.toString(); - } + protected String getNumbers(final String s) { + final StringBuilder sb = new StringBuilder(); + for (final String t : s.split(" ")) { + sb.append(isNumber(t) ? t : ""); + } + return sb.toString(); + } - public boolean isNumber(String strNum) { - if (strNum == null) { - return false; - } - return numberPattern.matcher(strNum).matches(); - } + public boolean isNumber(String strNum) { + if (strNum == null) { + return false; + } + return numberPattern.matcher(strNum).matches(); + } - protected static String fixAliases(final String s) { - final StringBuilder sb = new StringBuilder(); - for (final char ch : Lists.charactersOf(s)) { - final int i = StringUtils.indexOf(aliases_from, ch); - sb.append(i >= 0 ? 
aliases_to.charAt(i) : ch); - } - return sb.toString(); - } + protected static String fixAliases(final String s) { + final StringBuilder sb = new StringBuilder(); + for (final char ch : Lists.charactersOf(s)) { + final int i = StringUtils.indexOf(aliases_from, ch); + sb.append(i >= 0 ? aliases_to.charAt(i) : ch); + } + return sb.toString(); + } - protected static String transliterate(final String s) { - try { - return transliterator.transliterate(s); - } - catch(Exception e) { - return s; - } - } + protected static String transliterate(final String s) { + try { + return transliterator.transliterate(s); + } catch (Exception e) { + return s; + } + } - protected String removeSymbols(final String s) { - final StringBuilder sb = new StringBuilder(); + protected String removeSymbols(final String s) { + final StringBuilder sb = new StringBuilder(); - for (final char ch : Lists.charactersOf(s)) { - sb.append(StringUtils.contains(alpha, ch) ? ch : " "); - } - return sb.toString().replaceAll("\\s+", " "); - } - - protected boolean notNull(final String s) { - return s != null; - } + for (final char ch : Lists.charactersOf(s)) { + sb.append(StringUtils.contains(alpha, ch) ? 
ch : " "); + } + return sb.toString().replaceAll("\\s+", " "); + } - protected String normalize(final String s) { - return fixAliases(transliterate(nfd(unicodeNormalization(s)))) - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings - .replaceAll("[^ \\w]+", "") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim(); - } + protected boolean notNull(final String s) { + return s != null; + } - public String nfd(final String s) { - return Normalizer.normalize(s, Normalizer.Form.NFD); - } + protected String normalize(final String s) { + return fixAliases(transliterate(nfd(unicodeNormalization(s)))) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input + // strings + .replaceAll("[^ \\w]+", "") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } - public String utf8(final String s) { - byte[] bytes = s.getBytes(StandardCharsets.UTF_8); - return new String(bytes, StandardCharsets.UTF_8); - } + public String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } - public String unicodeNormalization(final String s) { + public String utf8(final String s) { + byte[] bytes = s.getBytes(StandardCharsets.UTF_8); + return new String(bytes, StandardCharsets.UTF_8); + } - Matcher m = hexUnicodePattern.matcher(s); - StringBuffer buf = new StringBuffer(s.length()); - while (m.find()) { - String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16)); - m.appendReplacement(buf, Matcher.quoteReplacement(ch)); - } - m.appendTail(buf); - return buf.toString(); - } + public String unicodeNormalization(final String s) { - protected String filterStopWords(final String s, final Set stopwords) 
{ - final StringTokenizer st = new StringTokenizer(s); - final StringBuilder sb = new StringBuilder(); - while (st.hasMoreTokens()) { - final String token = st.nextToken(); - if (!stopwords.contains(token)) { - sb.append(token); - sb.append(" "); - } - } - return sb.toString().trim(); - } + Matcher m = hexUnicodePattern.matcher(s); + StringBuffer buf = new StringBuffer(s.length()); + while (m.find()) { + String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16)); + m.appendReplacement(buf, Matcher.quoteReplacement(ch)); + } + m.appendTail(buf); + return buf.toString(); + } - public String filterAllStopWords(String s) { + protected String filterStopWords(final String s, final Set stopwords) { + final StringTokenizer st = new StringTokenizer(s); + final StringBuilder sb = new StringBuilder(); + while (st.hasMoreTokens()) { + final String token = st.nextToken(); + if (!stopwords.contains(token)) { + sb.append(token); + sb.append(" "); + } + } + return sb.toString().trim(); + } - s = filterStopWords(s, stopwords_en); - s = filterStopWords(s, stopwords_de); - s = filterStopWords(s, stopwords_it); - s = filterStopWords(s, stopwords_fr); - s = filterStopWords(s, stopwords_pt); - s = filterStopWords(s, stopwords_es); - s = filterStopWords(s, stopwords_gr); + public String filterAllStopWords(String s) { - return s; - } + s = filterStopWords(s, stopwords_en); + s = filterStopWords(s, stopwords_de); + s = filterStopWords(s, stopwords_it); + s = filterStopWords(s, stopwords_fr); + s = filterStopWords(s, stopwords_pt); + s = filterStopWords(s, stopwords_es); + s = filterStopWords(s, stopwords_gr); - protected Collection filterBlacklisted(final Collection set, final Set ngramBlacklist) { - final Set newset = Sets.newLinkedHashSet(); - for (final String s : set) { - if (!ngramBlacklist.contains(s)) { - newset.add(s); - } - } - return newset; - } + return s; + } - public static Set loadFromClasspath(final String classpath) { + protected Collection filterBlacklisted(final 
Collection set, final Set ngramBlacklist) { + final Set newset = Sets.newLinkedHashSet(); + for (final String s : set) { + if (!ngramBlacklist.contains(s)) { + newset.add(s); + } + } + return newset; + } - Transliterator transliterator = Transliterator.getInstance("Any-Eng"); + public static Set loadFromClasspath(final String classpath) { - final Set h = Sets.newHashSet(); - try { - for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) { - h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords - } - } catch (final Throwable e) { - return Sets.newHashSet(); - } - return h; - } + Transliterator transliterator = Transliterator.getInstance("Any-Eng"); - public static Map loadMapFromClasspath(final String classpath) { + final Set h = Sets.newHashSet(); + try { + for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) { + h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords + } + } catch (final Throwable e) { + return Sets.newHashSet(); + } + return h; + } - Transliterator transliterator = Transliterator.getInstance("Any-Eng"); + public static Map loadMapFromClasspath(final String classpath) { - final Map m = new HashMap<>(); - try { - for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) { - //string is like this: code;word1;word2;word3 - String[] line = s.split(";"); - String value = line[0]; - for (int i = 1; i < line.length; i++) { - m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value); - } - } - } catch (final Throwable e) { - return new HashMap<>(); - } - return m; - } + Transliterator transliterator = Transliterator.getInstance("Any-Eng"); - public String removeKeywords(String s, Set keywords) { + final Map m = new HashMap<>(); + try { + for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) { + // string is like 
this: code;word1;word2;word3 + String[] line = s.split(";"); + String value = line[0]; + for (int i = 1; i < line.length; i++) { + m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value); + } + } + } catch (final Throwable e) { + return new HashMap<>(); + } + return m; + } - s = " " + s + " "; - for (String k : keywords) { - s = s.replaceAll(k.toLowerCase(), ""); - } + public String removeKeywords(String s, Set keywords) { - return s.trim(); - } + s = " " + s + " "; + for (String k : keywords) { + s = s.replaceAll(k.toLowerCase(), ""); + } - public double commonElementsPercentage(Set s1, Set s2) { + return s.trim(); + } - double longer = Math.max(s1.size(), s2.size()); - return (double) s1.stream().filter(s2::contains).count() / longer; - } + public double commonElementsPercentage(Set s1, Set s2) { - //convert the set of keywords to codes - public Set toCodes(Set keywords, Map translationMap) { - return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet()); - } + double longer = Math.max(s1.size(), s2.size()); + return (double) s1.stream().filter(s2::contains).count() / longer; + } - public Set keywordsToCodes(Set keywords, Map translationMap) { - return toCodes(keywords, translationMap); - } + // convert the set of keywords to codes + public Set toCodes(Set keywords, Map translationMap) { + return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet()); + } - public Set citiesToCodes(Set keywords) { - return toCodes(keywords, cityMap); - } + public Set keywordsToCodes(Set keywords, Map translationMap) { + return toCodes(keywords, translationMap); + } - protected String firstLC(final String s) { - return StringUtils.substring(s, 0, 1).toLowerCase(); - } + public Set citiesToCodes(Set keywords) { + return toCodes(keywords, cityMap); + } - protected Iterable tokens(final String s, final int maxTokens) { - return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens); - 
} + protected String firstLC(final String s) { + return StringUtils.substring(s, 0, 1).toLowerCase(); + } - public String normalizePid(String pid) { - return pid.toLowerCase().replaceAll(DOI_PREFIX, ""); - } + protected Iterable tokens(final String s, final int maxTokens) { + return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens); + } - //get the list of keywords into the input string - public Set getKeywords(String s1, Map translationMap, int windowSize) { + public String normalizePid(String pid) { + return pid.toLowerCase().replaceAll(DOI_PREFIX, ""); + } - String s = s1; + // get the list of keywords into the input string + public Set getKeywords(String s1, Map translationMap, int windowSize) { - List tokens = Arrays.asList(s.toLowerCase().split(" ")); + String s = s1; - Set codes = new HashSet<>(); + List tokens = Arrays.asList(s.toLowerCase().split(" ")); - if (tokens.size() < windowSize) - windowSize = tokens.size(); + Set codes = new HashSet<>(); - int length = windowSize; + if (tokens.size() < windowSize) + windowSize = tokens.size(); - while (length != 0) { + int length = windowSize; - for (int i = 0; i <= tokens.size() - length; i++) { - String candidate = concat(tokens.subList(i, i + length)); - if (translationMap.containsKey(candidate)) { - codes.add(candidate); - s = s.replace(candidate, "").trim(); - } - } + while (length != 0) { - tokens = Arrays.asList(s.split(" ")); - length -= 1; - } + for (int i = 0; i <= tokens.size() - length; i++) { + String candidate = concat(tokens.subList(i, i + length)); + if (translationMap.containsKey(candidate)) { + codes.add(candidate); + s = s.replace(candidate, "").trim(); + } + } - return codes; - } + tokens = Arrays.asList(s.split(" ")); + length -= 1; + } - public Set getCities(String s1, int windowSize) { - return getKeywords(s1, cityMap, windowSize); - } + return codes; + } - public static String readFromClasspath(final String filename, final Class clazz) { - final 
StringWriter sw = new StringWriter(); - try { - IOUtils.copy(clazz.getResourceAsStream(filename), sw); - return sw.toString(); - } catch (final IOException e) { - throw new RuntimeException("cannot load resource from classpath: " + filename); - } - } + public Set getCities(String s1, int windowSize) { + return getKeywords(s1, cityMap, windowSize); + } + + public static String readFromClasspath(final String filename, final Class clazz) { + final StringWriter sw = new StringWriter(); + try { + IOUtils.copy(clazz.getResourceAsStream(filename), sw); + return sw.toString(); + } catch (final IOException e) { + throw new RuntimeException("cannot load resource from classpath: " + filename); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java index 0db0270fb..00faff0bd 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.config; import java.util.List; @@ -44,7 +45,6 @@ public interface Config { */ public Map> blacklists(); - /** * Translation map. 
* diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index 63fc96aef..eeec68ae6 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -1,16 +1,5 @@ -package eu.dnetlib.pace.config; -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Maps; -import eu.dnetlib.pace.model.ClusteringDef; -import eu.dnetlib.pace.model.FieldDef; -import eu.dnetlib.pace.util.PaceException; -import org.antlr.stringtemplate.StringTemplate; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +package eu.dnetlib.pace.config; import java.io.IOException; import java.io.Serializable; @@ -25,139 +14,167 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import java.util.stream.Collectors; +import org.antlr.stringtemplate.StringTemplate; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Maps; + +import eu.dnetlib.pace.model.ClusteringDef; +import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.tree.support.TreeNodeDef; - +import eu.dnetlib.pace.util.PaceException; public class DedupConfig implements Config, Serializable { - private static String CONFIG_TEMPLATE = "dedupConfig.st"; + private static String CONFIG_TEMPLATE = "dedupConfig.st"; - private PaceConfig pace; + private PaceConfig pace; - private WfConfig wf; + private WfConfig wf; - @JsonIgnore - private Map> blacklists; + @JsonIgnore + 
private Map> blacklists; - private static Map defaults = Maps.newHashMap(); + private static Map defaults = Maps.newHashMap(); - static { - defaults.put("dedupRun", "001"); - defaults.put("entityType", "result"); - defaults.put("subEntityType", "resulttype"); - defaults.put("subEntityValue", "publication"); - defaults.put("orderField", "title"); - defaults.put("queueMaxSize", "2000"); - defaults.put("groupMaxSize", "10"); - defaults.put("slidingWindowSize", "200"); - defaults.put("rootBuilder", "result"); - defaults.put("includeChildren", "true"); - defaults.put("maxIterations", "20"); - defaults.put("idPath", "$.id"); - } + static { + defaults.put("dedupRun", "001"); + defaults.put("entityType", "result"); + defaults.put("subEntityType", "resulttype"); + defaults.put("subEntityValue", "publication"); + defaults.put("orderField", "title"); + defaults.put("queueMaxSize", "2000"); + defaults.put("groupMaxSize", "10"); + defaults.put("slidingWindowSize", "200"); + defaults.put("rootBuilder", "result"); + defaults.put("includeChildren", "true"); + defaults.put("maxIterations", "20"); + defaults.put("idPath", "$.id"); + } - public DedupConfig() { - } + public DedupConfig() { + } - public static DedupConfig load(final String json) { + public static DedupConfig load(final String json) { - final DedupConfig config; - try { - config = new ObjectMapper().readValue(json, DedupConfig.class); - config.getPace().initModel(); - config.getPace().initTranslationMap(); + final DedupConfig config; + try { + config = new ObjectMapper().readValue(json, DedupConfig.class); + config.getPace().initModel(); + config.getPace().initTranslationMap(); - config.blacklists = config.getPace().getBlacklists().entrySet() - .stream() - .map(e -> new AbstractMap.SimpleEntry>(e.getKey(), e.getValue().stream().filter(s -> !StringUtils.isBlank(s)).map(Pattern::compile).collect(Collectors.toList()))) - .collect(Collectors.toMap(e -> e.getKey(), - e -> (Predicate & Serializable) s -> 
e.getValue().stream().filter(p -> p.matcher(s).matches()).findFirst().isPresent())) + config.blacklists = config + .getPace() + .getBlacklists() + .entrySet() + .stream() + .map( + e -> new AbstractMap.SimpleEntry>(e.getKey(), + e + .getValue() + .stream() + .filter(s -> !StringUtils.isBlank(s)) + .map(Pattern::compile) + .collect(Collectors.toList()))) + .collect( + Collectors + .toMap( + e -> e.getKey(), + e -> (Predicate & Serializable) s -> e + .getValue() + .stream() + .filter(p -> p.matcher(s).matches()) + .findFirst() + .isPresent())) - ; + ; - return config; - } catch (IOException | - PatternSyntaxException e) { - throw new PaceException("Error in parsing configuration json", e); - } + return config; + } catch (IOException | PatternSyntaxException e) { + throw new PaceException("Error in parsing configuration json", e); + } - } + } - public static DedupConfig loadDefault() throws IOException { - return loadDefault(new HashMap()); - } + public static DedupConfig loadDefault() throws IOException { + return loadDefault(new HashMap()); + } - public static DedupConfig loadDefault(final Map params) throws IOException { + public static DedupConfig loadDefault(final Map params) throws IOException { - final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE)); + final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE)); - for (final Entry e : defaults.entrySet()) { - template.setAttribute(e.getKey(), e.getValue()); - } - for (final Entry e : params.entrySet()) { - if (template.getAttribute(e.getKey()) != null) { - template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue()); - } else { - template.setAttribute(e.getKey(), e.getValue()); - } - } + for (final Entry e : defaults.entrySet()) { + template.setAttribute(e.getKey(), e.getValue()); + } + for (final Entry e : params.entrySet()) { + if (template.getAttribute(e.getKey()) != null) { + 
template.getAttributes().computeIfPresent(e.getKey(), (o, o2) -> e.getValue()); + } else { + template.setAttribute(e.getKey(), e.getValue()); + } + } - final String json = template.toString(); - return load(json); - } + final String json = template.toString(); + return load(json); + } - private String readFromClasspath(final String resource) throws IOException { - return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8); - } + private String readFromClasspath(final String resource) throws IOException { + return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8); + } - public PaceConfig getPace() { - return pace; - } + public PaceConfig getPace() { + return pace; + } - public void setPace(final PaceConfig pace) { - this.pace = pace; - } + public void setPace(final PaceConfig pace) { + this.pace = pace; + } - public WfConfig getWf() { - return wf; - } + public WfConfig getWf() { + return wf; + } - public void setWf(final WfConfig wf) { - this.wf = wf; - } + public void setWf(final WfConfig wf) { + this.wf = wf; + } - @Override - public String toString() { - try { - return new ObjectMapper().writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("unable to serialise configuration", e); - } - } + @Override + public String toString() { + try { + return new ObjectMapper().writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("unable to serialise configuration", e); + } + } - @Override - public Map decisionTree() { - return getPace().getDecisionTree(); - } + @Override + public Map decisionTree() { + return getPace().getDecisionTree(); + } - @Override - public List model() { - return getPace().getModel(); - } + @Override + public List model() { + return getPace().getModel(); + } - @Override - public List clusterings() { - return getPace().getClustering(); - } + @Override + public List clusterings() { + return getPace().getClustering(); + } - @Override - public Map> 
blacklists() { - return blacklists; - } + @Override + public Map> blacklists() { + return blacklists; + } - @Override - public Map translationMap() { - return getPace().translationMap(); - } + @Override + public Map translationMap() { + return getPace().translationMap(); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index dc87a1b06..f1bc49f4a 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -1,19 +1,20 @@ + package eu.dnetlib.pace.config; +import java.io.Serializable; +import java.util.List; +import java.util.Map; import com.fasterxml.jackson.annotation.JsonIgnore; import com.google.common.collect.Maps; import com.ibm.icu.text.Transliterator; + import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.tree.support.TreeNodeDef; import eu.dnetlib.pace.util.PaceResolver; -import java.io.Serializable; -import java.util.List; -import java.util.Map; - public class PaceConfig extends AbstractPaceFunctions implements Serializable { private List model; @@ -37,7 +38,8 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable { @JsonIgnore public static PaceResolver resolver = new PaceResolver(); - public PaceConfig() {} + public PaceConfig() { + } public void initModel() { modelMap = Maps.newHashMap(); @@ -46,20 +48,21 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable { } } - public void initTranslationMap(){ + public void initTranslationMap() { translationMap = Maps.newHashMap(); Transliterator transliterator = Transliterator.getInstance("Any-Eng"); for (String key : synonyms.keySet()) { - for (String term : synonyms.get(key)){ - translationMap.put( + for (String term : synonyms.get(key)) { + translationMap + 
.put( fixAliases(transliterator.transliterate(term.toLowerCase())), - key); + key); } } } - public Map translationMap(){ + public Map translationMap() { return translationMap; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java index 20981c427..9f3323edc 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.config; public enum Type { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java index 78fc18a13..8dea04232 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java @@ -1,10 +1,5 @@ -package eu.dnetlib.pace.config; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.util.PaceException; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.config; import java.io.IOException; import java.io.Serializable; @@ -12,6 +7,13 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import org.apache.commons.lang3.StringUtils; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.util.PaceException; public class WfConfig implements Serializable { @@ -76,7 +78,6 @@ public class WfConfig implements Serializable { /** Maximum number of allowed children. */ private int maxChildren = MAX_CHILDREN; - /** Default maximum number of iterations. 
*/ private final static int MAX_ITERATIONS = 20; @@ -84,9 +85,10 @@ public class WfConfig implements Serializable { private int maxIterations = MAX_ITERATIONS; /** The Jquery path to retrieve the identifier */ - private String idPath = "$.id"; + private String idPath = "$.id"; - public WfConfig() {} + public WfConfig() { + } /** * Instantiates a new dedup config. @@ -114,8 +116,10 @@ public class WfConfig implements Serializable { * @param idPath * the path for the id of the entity */ - public WfConfig(final String entityType, final String orderField, final List rootBuilder, final String dedupRun, - final Set skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations, final String idPath) { + public WfConfig(final String entityType, final String orderField, final List rootBuilder, + final String dedupRun, + final Set skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, + final boolean includeChildren, final int maxIterations, final String idPath) { super(); this.entityType = entityType; this.orderField = orderField; @@ -257,7 +261,6 @@ public class WfConfig implements Serializable { this.maxChildren = maxChildren; } - public int getMaxIterations() { return maxIterations; } @@ -277,7 +280,6 @@ public class WfConfig implements Serializable { /* * (non-Javadoc) - * * @see java.lang.Object#toString() */ @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java index c15885ecf..d9ad81d42 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java @@ -1,15 +1,16 @@ -package eu.dnetlib.pace.model; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.pace.clustering.ClusteringFunction; -import eu.dnetlib.pace.config.PaceConfig; -import 
eu.dnetlib.pace.util.PaceException; +package eu.dnetlib.pace.model; import java.io.IOException; import java.io.Serializable; import java.util.List; import java.util.Map; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.pace.clustering.ClusteringFunction; +import eu.dnetlib.pace.config.PaceConfig; +import eu.dnetlib.pace.util.PaceException; public class ClusteringDef implements Serializable { @@ -19,7 +20,8 @@ public class ClusteringDef implements Serializable { private Map params; - public ClusteringDef() {} + public ClusteringDef() { + } public String getName() { return name; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java index 196ac7248..f34545e6d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java @@ -1,13 +1,15 @@ + package eu.dnetlib.pace.model; +import java.io.Serializable; +import java.util.List; + import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Splitter; import com.google.common.collect.Lists; -import eu.dnetlib.pace.config.Type; -import java.io.Serializable; -import java.util.List; +import eu.dnetlib.pace.config.Type; /** * The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm. 
@@ -34,7 +36,8 @@ public class FieldDef implements Serializable { */ private int length = -1; - public FieldDef() {} + public FieldDef() { + } public String getName() { return name; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java index 543b1bdfe..96120cf4d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.model; import java.nio.charset.Charset; @@ -43,7 +44,7 @@ public class Person { // s = s.replaceAll("[\\W&&[^,-]]", ""); } - if (s.contains(",")) { //if the name contains a comma it is easy derivable the name and the surname + if (s.contains(",")) { // if the name contains a comma it is easy derivable the name and the surname final String[] arr = s.split(","); if (arr.length == 1) { fullname = splitTerms(arr[0]); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java index a900a6082..1f8aab4bf 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.model; import java.util.ArrayList; @@ -57,7 +58,7 @@ public class PersonComparatorUtils { private static boolean verifyNames(List list1, List list2) { return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2)) - && verifySimilarity(extractInitials(list1), extractInitials(list2)); + && verifySimilarity(extractInitials(list1), extractInitials(list2)); } private static boolean verifySurnames(List list1, List list2) { @@ -76,7 +77,7 @@ public class PersonComparatorUtils { Collections.sort(list1); Collections.sort(list2); return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2)) - && 
verifySimilarity(extractInitials(list1), extractInitials(list2)); + && verifySimilarity(extractInitials(list1), extractInitials(list2)); } private static List extractExtendedNames(List list) { @@ -107,7 +108,7 @@ public class PersonComparatorUtils { for (String s : list1) { int curr = list2.indexOf(s); if (curr > pos) { - list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm" + list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm" pos = curr; } else { return false; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java index 3926b2897..42c226f87 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/RowDataOrderingComparator.java @@ -1,9 +1,11 @@ + package eu.dnetlib.pace.model; -import eu.dnetlib.pace.clustering.NGramUtils; +import java.util.Comparator; + import org.apache.spark.sql.Row; -import java.util.Comparator; +import eu.dnetlib.pace.clustering.NGramUtils; /** * The Class MapDocumentComparator. @@ -25,13 +27,12 @@ public class RowDataOrderingComparator implements Comparator { /* * (non-Javadoc) - * * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) */ @Override public int compare(final Row d1, final Row d2) { if (d1 == null) - return d2==null ? 0: -1; + return d2 == null ? 0 : -1; else if (d2 == null) { return 1; } @@ -40,7 +41,7 @@ public class RowDataOrderingComparator implements Comparator { final String o2 = d2.getString(comparatorField); if (o1 == null) - return o2==null ? 0: -1; + return o2 == null ? 
0 : -1; else if (o2 == null) { return 1; } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/SparkDedupConfig.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDedupConfig.scala similarity index 82% rename from dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/SparkDedupConfig.scala rename to dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDedupConfig.scala index 4300e80c6..def5ebb84 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/SparkDedupConfig.scala +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDedupConfig.scala @@ -1,32 +1,30 @@ -package eu.dnetlib.dhp.oa.dedup.model +package eu.dnetlib.pace.model import com.jayway.jsonpath.{Configuration, JsonPath, Option} -import eu.dnetlib.dhp.oa.dedup.{DedupUtility, SparkReporter} import eu.dnetlib.pace.config.{DedupConfig, Type} -import eu.dnetlib.pace.model.{ClusteringDef, FieldDef} import eu.dnetlib.pace.tree.support.TreeProcessor import eu.dnetlib.pace.util.MapDocumentUtil.truncateValue -import eu.dnetlib.pace.util.{BlockProcessor, MapDocumentUtil} +import eu.dnetlib.pace.util.{BlockProcessor, MapDocumentUtil, SparkReporter} import org.apache.spark.SparkContext +import org.apache.spark.sql.{Column, Dataset, Row, functions} import org.apache.spark.sql.catalyst.expressions.{GenericRowWithSchema, Literal} import org.apache.spark.sql.expressions.{UserDefinedFunction, Window} -import org.apache.spark.sql.functions.{col, lit, udf} import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} -import org.apache.spark.sql.{Column, Dataset, Row, functions} import java.util import java.util.function.Predicate import java.util.regex.Pattern import scala.collection.JavaConverters._ import scala.collection.mutable +import org.apache.spark.sql.functions.{col, lit, udf} -class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializable { +case class 
SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializable { private val URL_REGEX: Pattern = Pattern.compile("^\\s*(http|https|ftp)\\://.*") private val CONCAT_REGEX: Pattern = Pattern.compile("\\|\\|\\|") - private var urlFilter = (s: String) => URL_REGEX.matcher(s).matches + private val urlFilter = (s: String) => URL_REGEX.matcher(s).matches val modelExtractor: (Dataset[String] => Dataset[Row]) = df => { df.withColumn("mapDocument", rowFromJsonUDF.apply(df.col(df.columns(0)))) @@ -226,60 +224,59 @@ class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializab val orderingFieldPosition: Int = rowDataType.fieldIndex(conf.getWf.getOrderField) - val rowFromJsonUDF = udf( - (json: String) => { - val documentContext = - JsonPath.using(Configuration.defaultConfiguration.addOptions(Option.SUPPRESS_EXCEPTIONS)).parse(json) - val values = new Array[Any](rowDataType.size) + val rowFromJson = (json: String) => { + val documentContext = + JsonPath.using(Configuration.defaultConfiguration.addOptions(Option.SUPPRESS_EXCEPTIONS)).parse(json) + val values = new Array[Any](rowDataType.size) - values(identityFieldPosition) = DFMapDocumentUtils.getJPathString(conf.getWf.getIdPath, documentContext) + values(identityFieldPosition) = MapDocumentUtil.getJPathString(conf.getWf.getIdPath, documentContext) - rowDataType.fieldNames.zipWithIndex.foldLeft(values) { - case ((res, (fname, index))) => { - val fdef = conf.getPace.getModelMap.get(fname) + rowDataType.fieldNames.zipWithIndex.foldLeft(values) { + case ((res, (fname, index))) => { + val fdef = conf.getPace.getModelMap.get(fname) - if (fdef != null) { - res(index) = fdef.getType match { - case Type.String | Type.Int => - MapDocumentUtil.truncateValue( - DFMapDocumentUtils.getJPathString(fdef.getPath, documentContext), - fdef.getLength - ) + if (fdef != null) { + res(index) = fdef.getType match { + case Type.String | Type.Int => + MapDocumentUtil.truncateValue( + 
MapDocumentUtil.getJPathString(fdef.getPath, documentContext), + fdef.getLength + ) - case Type.URL => - var uv = DFMapDocumentUtils.getJPathString(fdef.getPath, documentContext) - if (!urlFilter(uv)) uv = "" - uv + case Type.URL => + var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext) + if (!urlFilter(uv)) uv = "" + uv - case Type.List | Type.JSON => - MapDocumentUtil.truncateList( - DFMapDocumentUtils.getJPathList(fdef.getPath, documentContext, fdef.getType), - fdef.getSize - ) + case Type.List | Type.JSON => + MapDocumentUtil.truncateList( + MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType), + fdef.getSize + ) - case Type.StringConcat => - val jpaths = CONCAT_REGEX.split(fdef.getPath) + case Type.StringConcat => + val jpaths = CONCAT_REGEX.split(fdef.getPath) - truncateValue( - jpaths - .map(jpath => DFMapDocumentUtils.getJPathString(jpath, documentContext)) - .mkString(" "), - fdef.getLength - ) + truncateValue( + jpaths + .map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext)) + .mkString(" "), + fdef.getLength + ) - case Type.DoubleArray => - MapDocumentUtil.getJPathArray(fdef.getPath, json) - } + case Type.DoubleArray => + MapDocumentUtil.getJPathArray(fdef.getPath, json) } - - res } - } - new GenericRowWithSchema(values, rowDataType) - }, - rowDataType - ) + res + } + } + + new GenericRowWithSchema(values, rowDataType) + } + + val rowFromJsonUDF = udf(rowFromJson, rowDataType) def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = { @@ -310,7 +307,7 @@ class SparkDedupConfig(conf: DedupConfig, numPartitions: Int) extends Serializab } def processBlock(implicit sc: SparkContext) = { - val accumulators = DedupUtility.constructAccumulator(conf, sc) + val accumulators = SparkReporter.constructAccumulator(conf, sc) udf[Array[Tuple2[String, String]], mutable.WrappedArray[Row]](block => { val reporter = new SparkReporter(accumulators) diff --git 
a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java index aaac36ad7..4d31df5b3 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java @@ -1,41 +1,42 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("alwaysMatch") public class AlwaysMatch extends AbstractComparator { - public AlwaysMatch(final Map params){ - super(params, new com.wcohen.ss.JaroWinkler()); - } + public AlwaysMatch(final Map params) { + super(params, new com.wcohen.ss.JaroWinkler()); + } - public AlwaysMatch(final double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } + public AlwaysMatch(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } - protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } + protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } - @Override - public double compare(final Object a, final Object b, final Config conf) { - return 1.0; - } + @Override + public double compare(final Object a, final Object b, final Config conf) { + return 1.0; + } - @Override - public double getWeight() { - return super.weight; - } + @Override + public double getWeight() { + return super.weight; + } - @Override - protected double normalize(final double d) { - return d; - } + @Override + protected double normalize(final double d) { + return d; + } } - diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java index eedc7f562..5c6939e60 100644 --- 
a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java @@ -1,148 +1,157 @@ -package eu.dnetlib.pace.tree; -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Person; -import eu.dnetlib.pace.tree.support.AbstractListComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; +package eu.dnetlib.pace.tree; import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Person; +import eu.dnetlib.pace.tree.support.AbstractListComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("authorsMatch") public class AuthorsMatch extends AbstractListComparator { - Map params; + Map params; - private double SURNAME_THRESHOLD; - private double NAME_THRESHOLD; - private double FULLNAME_THRESHOLD; - private String MODE; //full or surname - private int SIZE_THRESHOLD; - private String TYPE; //count or percentage - private int common; + private double SURNAME_THRESHOLD; + private double NAME_THRESHOLD; + private double FULLNAME_THRESHOLD; + private String MODE; // full or surname + private int SIZE_THRESHOLD; + private String TYPE; // count or percentage + private int common; - public AuthorsMatch(Map params){ - super(params, new com.wcohen.ss.JaroWinkler()); - this.params = params; + public AuthorsMatch(Map params) { + super(params, new com.wcohen.ss.JaroWinkler()); + this.params = params; - MODE = params.getOrDefault("mode", "full"); - SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95")); - NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95")); - FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9")); - SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20")); - TYPE = 
params.getOrDefault("type", "percentage"); - common = 0; - } + MODE = params.getOrDefault("mode", "full"); + SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95")); + NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95")); + FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9")); + SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20")); + TYPE = params.getOrDefault("type", "percentage"); + common = 0; + } - protected AuthorsMatch(double w, AbstractStringDistance ssalgo) { - super(w, ssalgo); - } + protected AuthorsMatch(double w, AbstractStringDistance ssalgo) { + super(w, ssalgo); + } - @Override - public double compare(final List a, final List b, final Config conf) { + @Override + public double compare(final List a, final List b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) - return -1; + if (a.isEmpty() || b.isEmpty()) + return -1; - if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD) - return 1.0; + if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD) + return 1.0; - List aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList()); - List bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList()); + List aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList()); + List bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList()); - common = 0; - //compare each element of List1 with each element of List2 - for (Person p1 : aList) + common = 0; + // compare each element of List1 with each element of List2 + for (Person p1 : aList) - for (Person p2 : bList) { + for (Person p2 : bList) { - //both persons are inaccurate - if (!p1.isAccurate() && !p2.isAccurate()) { - //compare just normalized fullnames - String fullname1 = normalization(p1.getNormalisedFullname().isEmpty()? 
p1.getOriginal() : p1.getNormalisedFullname()); - String fullname2 = normalization(p2.getNormalisedFullname().isEmpty()? p2.getOriginal() : p2.getNormalisedFullname()); + // both persons are inaccurate + if (!p1.isAccurate() && !p2.isAccurate()) { + // compare just normalized fullnames + String fullname1 = normalization( + p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname()); + String fullname2 = normalization( + p2.getNormalisedFullname().isEmpty() ? p2.getOriginal() : p2.getNormalisedFullname()); - if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) { - common += 1; - break; - } - } + if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) { + common += 1; + break; + } + } - //one person is inaccurate - if (p1.isAccurate() ^ p2.isAccurate()) { - //prepare data - //data for the accurate person - String name = normalization(p1.isAccurate()? p1.getNormalisedFirstName() : p2.getNormalisedFirstName()); - String surname = normalization(p1.isAccurate()? p1.getNormalisedSurname() : p2.getNormalisedSurname()); + // one person is inaccurate + if (p1.isAccurate() ^ p2.isAccurate()) { + // prepare data + // data for the accurate person + String name = normalization( + p1.isAccurate() ? p1.getNormalisedFirstName() : p2.getNormalisedFirstName()); + String surname = normalization( + p1.isAccurate() ? p1.getNormalisedSurname() : p2.getNormalisedSurname()); - //data for the inaccurate person - String fullname = normalization( - p1.isAccurate() ? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname()) : (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname()) - ); + // data for the inaccurate person + String fullname = normalization( + p1.isAccurate() + ? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname()) + : (p1.getNormalisedFullname().isEmpty() ? 
p1.getOriginal() : p1.getNormalisedFullname())); - if (fullname.contains(surname)) { - if (MODE.equals("full")) { - if (fullname.contains(name)) { - common += 1; - break; - } - } - else { //MODE equals "surname" - common += 1; - break; - } - } - } + if (fullname.contains(surname)) { + if (MODE.equals("full")) { + if (fullname.contains(name)) { + common += 1; + break; + } + } else { // MODE equals "surname" + common += 1; + break; + } + } + } - //both persons are accurate - if (p1.isAccurate() && p2.isAccurate()) { + // both persons are accurate + if (p1.isAccurate() && p2.isAccurate()) { - if (compareSurname(p1, p2)) { - if (MODE.equals("full")) { - if(compareFirstname(p1, p2)) { - common += 1; - break; - } - } - else { //MODE equals "surname" - common += 1; - break; - } - } + if (compareSurname(p1, p2)) { + if (MODE.equals("full")) { + if (compareFirstname(p1, p2)) { + common += 1; + break; + } + } else { // MODE equals "surname" + common += 1; + break; + } + } - } + } - } + } - //normalization factor to compute the score - int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common); + // normalization factor to compute the score + int normFactor = aList.size() == bList.size() ? 
aList.size() : (aList.size() + bList.size() - common); - if(TYPE.equals("percentage")) { - return (double) common / normFactor; - } - else { - return (double) common; - } - } + if (TYPE.equals("percentage")) { + return (double) common / normFactor; + } else { + return (double) common; + } + } - public boolean compareSurname(Person p1, Person p2) { - return ssalgo.score(normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD; - } + public boolean compareSurname(Person p1, Person p2) { + return ssalgo + .score( + normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD; + } - public boolean compareFirstname(Person p1, Person p2) { + public boolean compareFirstname(Person p1, Person p2) { - if(p1.getNormalisedFirstName().length()<=2 || p2.getNormalisedFirstName().length()<=2) { - if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName()))) - return true; - } + if (p1.getNormalisedFirstName().length() <= 2 || p2.getNormalisedFirstName().length() <= 2) { + if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName()))) + return true; + } - return ssalgo.score(normalization(p1.getNormalisedFirstName()), normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD; - } + return ssalgo + .score( + normalization(p1.getNormalisedFirstName()), + normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD; + } - public String normalization(String s) { - return normalize(utf8(cleanup(s))); - } + public String normalization(String s) { + return normalize(utf8(cleanup(s))); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java index a51d07eb7..1d898ad83 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java @@ -1,47 +1,48 @@ + package 
eu.dnetlib.pace.tree; +import java.util.Map; +import java.util.Set; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; -import java.util.Set; - @ComparatorClass("cityMatch") public class CityMatch extends AbstractStringComparator { - private Map params; + private Map params; - public CityMatch(Map params) { - super(params); - this.params = params; - } + public CityMatch(Map params) { + super(params); + this.params = params; + } - @Override - public double distance(final String a, final String b, final Config conf) { + @Override + public double distance(final String a, final String b, final Config conf) { - String ca = cleanup(a); - String cb = cleanup(b); + String ca = cleanup(a); + String cb = cleanup(b); - ca = normalize(ca); - cb = normalize(cb); + ca = normalize(ca); + cb = normalize(cb); - ca = filterAllStopWords(ca); - cb = filterAllStopWords(cb); + ca = filterAllStopWords(ca); + cb = filterAllStopWords(cb); - Set cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4"))); - Set cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4"))); - Set codes1 = citiesToCodes(cities1); - Set codes2 = citiesToCodes(cities2); + Set codes1 = citiesToCodes(cities1); + Set codes2 = citiesToCodes(cities2); - //if no cities are detected, the comparator gives 1.0 - if (codes1.isEmpty() && codes2.isEmpty()) - return 1.0; - else { - if (codes1.isEmpty() ^ codes2.isEmpty()) - return -1; //undefined if one of the two has no cities - return commonElementsPercentage(codes1, codes2); - } - } + // if no cities are detected, the comparator gives 1.0 + if (codes1.isEmpty() && codes2.isEmpty()) + return 1.0; + else { + if (codes1.isEmpty() ^ 
codes2.isEmpty()) + return -1; // undefined if one of the two has no cities + return commonElementsPercentage(codes1, codes2); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java index 59e5dd346..d255612ba 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java @@ -1,47 +1,47 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("cosineSimilarity") public class CosineSimilarity extends AbstractComparator { - Map params; + Map params; - public CosineSimilarity(Map params) { - super(params); - } + public CosineSimilarity(Map params) { + super(params); + } - @Override - public double compare(Object a, Object b, Config config) { - return compare((double[])a, (double[])b, config); - } + @Override + public double compare(Object a, Object b, Config config) { + return compare((double[]) a, (double[]) b, config); + } - public double compare(final double[] a, final double[] b, final Config conf) { + public double compare(final double[] a, final double[] b, final Config conf) { - if (a.length == 0 || b.length == 0) - return -1; + if (a.length == 0 || b.length == 0) + return -1; - return cosineSimilarity(a, b); - } + return cosineSimilarity(a, b); + } - double cosineSimilarity(double[] a, double[] b) { - double dotProduct = 0; - double normASum = 0; - double normBSum = 0; + double cosineSimilarity(double[] a, double[] b) { + double dotProduct = 0; + double normASum = 0; + double normBSum = 0; - for(int i = 0; i < a.length; i ++) { - dotProduct += a[i] * b[i]; - normASum += a[i] * a[i]; - normBSum += b[i] * b[i]; - } - - double eucledianDist = Math.sqrt(normASum) * 
Math.sqrt(normBSum); - return dotProduct / eucledianDist; - } + for (int i = 0; i < a.length; i++) { + dotProduct += a[i] * b[i]; + normASum += a[i] * a[i]; + normBSum += b[i] * b[i]; + } + double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum); + return dotProduct / eucledianDist; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java index 429882450..d3c5bc10d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java @@ -1,9 +1,10 @@ + package eu.dnetlib.pace.tree; -import eu.dnetlib.pace.tree.support.ComparatorClass; - import java.util.Map; +import eu.dnetlib.pace.tree.support.ComparatorClass; + /** * The Class ExactMatch. * @@ -12,15 +13,15 @@ import java.util.Map; @ComparatorClass("doiExactMatch") public class DoiExactMatch extends ExactMatchIgnoreCase { - public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; + public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; - public DoiExactMatch(final Map params) { - super(params); - } + public DoiExactMatch(final Map params) { + super(params); + } - @Override - protected String toString(final Object f) { - return super.toString(f).replaceAll(PREFIX, ""); - } + @Override + protected String toString(final Object f) { + return super.toString(f).replaceAll(PREFIX, ""); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java index 2e99595e0..c28274652 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java @@ -1,29 +1,30 @@ -package eu.dnetlib.pace.tree; -import eu.dnetlib.pace.tree.support.ComparatorClass; +package eu.dnetlib.pace.tree; import java.net.MalformedURLException; 
import java.net.URL; import java.util.Map; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("domainExactMatch") public class DomainExactMatch extends ExactMatchIgnoreCase { - public DomainExactMatch(final Map params) { - super(params); - } + public DomainExactMatch(final Map params) { + super(params); + } - @Override - protected String toString(final Object f) { + @Override + protected String toString(final Object f) { - try { - return asUrl(super.toString(f)).getHost(); - } catch (MalformedURLException e) { - return ""; - } - } + try { + return asUrl(super.toString(f)).getHost(); + } catch (MalformedURLException e) { + return ""; + } + } - private URL asUrl(final String value) throws MalformedURLException { - return new URL(value); - } + private URL asUrl(final String value) throws MalformedURLException { + return new URL(value); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java index 08fca05c9..35357c553 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java @@ -1,42 +1,44 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("exactMatch") public class ExactMatch extends AbstractStringComparator { - public ExactMatch(Map params){ - super(params, new com.wcohen.ss.JaroWinkler()); - } + public ExactMatch(Map params) { + super(params, new com.wcohen.ss.JaroWinkler()); + } - public ExactMatch(final double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } + public ExactMatch(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } - protected ExactMatch(final double weight, 
final AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } + protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } - @Override - public double distance(final String a, final String b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) { - return -1.0; //return -1 if a field is missing - } - return a.equals(b) ? 1.0 : 0; - } + @Override + public double distance(final String a, final String b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) { + return -1.0; // return -1 if a field is missing + } + return a.equals(b) ? 1.0 : 0; + } - @Override - public double getWeight() { - return super.weight; - } + @Override + public double getWeight() { + return super.weight; + } - @Override - protected double normalize(final double d) { - return d; - } + @Override + protected double normalize(final double d) { + return d; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java index b6b4d1af4..220bfb7dd 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java @@ -1,30 +1,32 @@ -package eu.dnetlib.pace.tree; -import com.google.common.base.Joiner; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractStringComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; +package eu.dnetlib.pace.tree; import java.util.List; import java.util.Map; +import com.google.common.base.Joiner; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("exactMatchIgnoreCase") public class ExactMatchIgnoreCase extends AbstractStringComparator { - public ExactMatchIgnoreCase(Map params) { - super(params); - } + public ExactMatchIgnoreCase(Map params) { 
+ super(params); + } - @Override - public double compare(String a, String b, final Config conf) { + @Override + public double compare(String a, String b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) - return -1; + if (a.isEmpty() || b.isEmpty()) + return -1; - return a.equalsIgnoreCase(b) ? 1 : 0; - } + return a.equalsIgnoreCase(b) ? 1 : 0; + } - protected String toString(final Object object) { - return toFirstString(object); - } -} \ No newline at end of file + protected String toString(final Object object) { + return toFirstString(object); + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java index 074b82a19..238cb16ce 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java @@ -1,9 +1,5 @@ -package eu.dnetlib.pace.tree; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractListComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; +package eu.dnetlib.pace.tree; import java.util.HashMap; import java.util.List; @@ -11,70 +7,74 @@ import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractListComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("instanceTypeMatch") public class InstanceTypeMatch extends AbstractListComparator { - final Map translationMap = new HashMap<>(); + final Map translationMap = new HashMap<>(); - public InstanceTypeMatch(Map params){ - super(params); + public InstanceTypeMatch(Map params) { + super(params); - //jolly types - translationMap.put("Conference object", "*"); - translationMap.put("Other literature type", "*"); - translationMap.put("Unknown", "*"); + // 
jolly types + translationMap.put("Conference object", "*"); + translationMap.put("Other literature type", "*"); + translationMap.put("Unknown", "*"); - //article types - translationMap.put("Article", "Article"); - translationMap.put("Data Paper", "Article"); - translationMap.put("Software Paper", "Article"); - translationMap.put("Preprint", "Article"); + // article types + translationMap.put("Article", "Article"); + translationMap.put("Data Paper", "Article"); + translationMap.put("Software Paper", "Article"); + translationMap.put("Preprint", "Article"); - //thesis types - translationMap.put("Thesis", "Thesis"); - translationMap.put("Master thesis", "Thesis"); - translationMap.put("Bachelor thesis", "Thesis"); - translationMap.put("Doctoral thesis", "Thesis"); - } + // thesis types + translationMap.put("Thesis", "Thesis"); + translationMap.put("Master thesis", "Thesis"); + translationMap.put("Bachelor thesis", "Thesis"); + translationMap.put("Doctoral thesis", "Thesis"); + } + @Override + public double compare(final List a, final List b, final Config conf) { - @Override - public double compare(final List a, final List b, final Config conf) { + if (a == null || b == null) { + return -1; + } - if (a == null || b == null) { - return -1; - } + if (a.isEmpty() || b.isEmpty()) { + return -1; + } + final Set ca = a.stream().map(this::translate).collect(Collectors.toSet()); + final Set cb = b.stream().map(this::translate).collect(Collectors.toSet()); - if (a.isEmpty() || b.isEmpty()) { - return -1; - } + // if at least one is a jolly type, it must produce a match + if (ca.contains("*") || cb.contains("*")) + return 1.0; - final Set ca = a.stream().map(this::translate).collect(Collectors.toSet()); - final Set cb = b.stream().map(this::translate).collect(Collectors.toSet()); + int incommon = Sets.intersection(ca, cb).size(); - //if at least one is a jolly type, it must produce a match - if (ca.contains("*") || cb.contains("*")) - return 1.0; + // if at least one is in 
common, it must produce a match + return incommon >= 1 ? 1 : 0; + } - int incommon = Sets.intersection(ca, cb).size(); + public String translate(String term) { + return translationMap.getOrDefault(term, term); + } - //if at least one is in common, it must produce a match - return incommon >= 1 ? 1 : 0; - } + @Override + public double getWeight() { + return super.weight; + } - public String translate(String term){ - return translationMap.getOrDefault(term, term); - } - - @Override - public double getWeight() { - return super.weight; - } - - @Override - protected double normalize(final double d) { - return d; - } + @Override + protected double normalize(final double d) { + return d; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java index e151edaaf..2cb411d26 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java @@ -1,44 +1,46 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) @ComparatorClass("jaroWinkler") public class JaroWinkler extends AbstractStringComparator { - public JaroWinkler(Map params){ - super(params, new com.wcohen.ss.JaroWinkler()); - } + public JaroWinkler(Map params) { + super(params, new com.wcohen.ss.JaroWinkler()); + } - public JaroWinkler(double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } + public JaroWinkler(double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } - protected JaroWinkler(double weight, AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } + protected 
JaroWinkler(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } - @Override - public double distance(String a, String b, final Config conf) { - String ca = cleanup(a); - String cb = cleanup(b); + @Override + public double distance(String a, String b, final Config conf) { + String ca = cleanup(a); + String cb = cleanup(b); - return normalize(ssalgo.score(ca, cb)); - } + return normalize(ssalgo.score(ca, cb)); + } - @Override - public double getWeight() { - return super.weight; - } + @Override + public double getWeight() { + return super.weight; + } - @Override - protected double normalize(double d) { - return d; - } + @Override + protected double normalize(double d) { + return d; + } -} \ No newline at end of file +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java index 3f122cd62..576b9281d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java @@ -1,70 +1,74 @@ -package eu.dnetlib.pace.tree; -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractStringComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; +package eu.dnetlib.pace.tree; import java.util.Map; import java.util.Set; +import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("jaroWinklerNormalizedName") public class JaroWinklerNormalizedName extends AbstractStringComparator { - private Map params; + private Map params; - public JaroWinklerNormalizedName(Map params){ - super(params, new com.wcohen.ss.JaroWinkler()); - this.params = params; - } + public JaroWinklerNormalizedName(Map 
params) { + super(params, new com.wcohen.ss.JaroWinkler()); + this.params = params; + } - public JaroWinklerNormalizedName(double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } + public JaroWinklerNormalizedName(double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } - protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } + protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } - @Override - public double distance(String a, String b, final Config conf) { - String ca = cleanup(a); - String cb = cleanup(b); + @Override + public double distance(String a, String b, final Config conf) { + String ca = cleanup(a); + String cb = cleanup(b); - ca = normalize(ca); - cb = normalize(cb); + ca = normalize(ca); + cb = normalize(cb); - ca = filterAllStopWords(ca); - cb = filterAllStopWords(cb); + ca = filterAllStopWords(ca); + cb = filterAllStopWords(cb); - Set keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); - Set keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set keywords1 = getKeywords( + ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set keywords2 = getKeywords( + cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); - Set cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4"))); - Set cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4"))); - ca = removeKeywords(ca, keywords1); - ca = removeKeywords(ca, cities1); - cb = removeKeywords(cb, keywords2); - cb = removeKeywords(cb, cities2); + ca = 
removeKeywords(ca, keywords1); + ca = removeKeywords(ca, cities1); + cb = removeKeywords(cb, keywords2); + cb = removeKeywords(cb, cities2); - ca = ca.replaceAll("[ ]{2,}", " "); - cb = cb.replaceAll("[ ]{2,}", " "); + ca = ca.replaceAll("[ ]{2,}", " "); + cb = cb.replaceAll("[ ]{2,}", " "); - if (ca.isEmpty() && cb.isEmpty()) - return 1.0; - else - return normalize(ssalgo.score(ca,cb)); - } + if (ca.isEmpty() && cb.isEmpty()) + return 1.0; + else + return normalize(ssalgo.score(ca, cb)); + } - @Override - public double getWeight() { - return super.weight; - } + @Override + public double getWeight() { + return super.weight; + } - @Override - protected double normalize(double d) { - return d; - } + @Override + protected double normalize(double d) { + return d; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java index a98778ac9..6ba7dd2a4 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java @@ -1,17 +1,19 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) @ComparatorClass("jaroWinklerTitle") public class JaroWinklerTitle extends AbstractStringComparator { - public JaroWinklerTitle(Map params){ + public JaroWinklerTitle(Map params) { super(params, new com.wcohen.ss.JaroWinkler()); } @@ -22,7 +24,7 @@ public class JaroWinklerTitle extends AbstractStringComparator { protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) { super(weight, ssalgo); } - + @Override public double distance(String a, String b, 
final Config conf) { String ca = cleanup(a); @@ -30,7 +32,7 @@ public class JaroWinklerTitle extends AbstractStringComparator { boolean check = checkNumbers(ca, cb); return check ? 0.5 : normalize(ssalgo.score(ca, cb)); - } + } @Override public double getWeight() { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java index 16145b47a..43fbfb07d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java @@ -1,72 +1,76 @@ -package eu.dnetlib.pace.tree; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractListComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; -import eu.dnetlib.pace.util.MapDocumentUtil; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +package eu.dnetlib.pace.tree; import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractListComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; +import eu.dnetlib.pace.util.MapDocumentUtil; + @ComparatorClass("jsonListMatch") public class JsonListMatch extends AbstractListComparator { - private static final Log log = LogFactory.getLog(JsonListMatch.class); - private Map params; + private static final Log log = LogFactory.getLog(JsonListMatch.class); + private Map params; - private String MODE; //"percentage" or "count" + private String MODE; // "percentage" or "count" - public JsonListMatch(final Map params) { - super(params); - this.params = params; + public JsonListMatch(final Map params) { + super(params); + this.params = params; - MODE 
= params.getOrDefault("mode", "percentage"); - } + MODE = params.getOrDefault("mode", "percentage"); + } - @Override - public double compare(final List sa, final List sb, final Config conf) { - if (sa.isEmpty() || sb.isEmpty()) { - return -1; - } + @Override + public double compare(final List sa, final List sb, final Config conf) { + if (sa.isEmpty() || sb.isEmpty()) { + return -1; + } - final Set ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet()); - final Set cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet()); + final Set ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet()); + final Set cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet()); - int incommon = Sets.intersection(ca, cb).size(); - int simDiff = Sets.symmetricDifference(ca, cb).size(); + int incommon = Sets.intersection(ca, cb).size(); + int simDiff = Sets.symmetricDifference(ca, cb).size(); - if (incommon + simDiff == 0) { - return 0.0; - } + if (incommon + simDiff == 0) { + return 0.0; + } - if (MODE.equals("percentage")) - return (double)incommon / (incommon + simDiff); - else - return incommon; + if (MODE.equals("percentage")) + return (double) incommon / (incommon + simDiff); + else + return incommon; - } + } - //converts every json into a comparable string basing on parameters - private String toComparableString(String json){ + // converts every json into a comparable string basing on parameters + private String toComparableString(String json) { - StringBuilder st = new StringBuilder(); //to build the string used for comparisons basing on the jpath into parameters + StringBuilder st = new StringBuilder(); // to build the string used for comparisons basing on the jpath into + // parameters - //for each path in the param list - for (String key: params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) { - String path = params.get(key); - String value = 
MapDocumentUtil.getJPathString(path, json); - if (value == null || value.isEmpty()) - value = ""; - st.append(value); - st.append("::"); - } + // for each path in the param list + for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) { + String path = params.get(key); + String value = MapDocumentUtil.getJPathString(path, json); + if (value == null || value.isEmpty()) + value = ""; + st.append(value); + st.append("::"); + } - st.setLength(st.length()-2); - return st.toString(); - } + st.setLength(st.length() - 2); + return st.toString(); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java index 8c4e6e50f..53acb4dc8 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java @@ -1,47 +1,50 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; +import java.util.Set; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; -import java.util.Set; - @ComparatorClass("keywordMatch") public class KeywordMatch extends AbstractStringComparator { - Map params; + Map params; - public KeywordMatch(Map params) { - super(params); - this.params = params; - } + public KeywordMatch(Map params) { + super(params); + this.params = params; + } - @Override - public double distance(final String a, final String b, final Config conf) { + @Override + public double distance(final String a, final String b, final Config conf) { - String ca = cleanup(a); - String cb = cleanup(b); + String ca = cleanup(a); + String cb = cleanup(b); - ca = normalize(ca); - cb = normalize(cb); + ca = normalize(ca); + cb = normalize(cb); - ca = filterAllStopWords(ca); - cb = filterAllStopWords(cb); + ca = filterAllStopWords(ca); + cb = 
filterAllStopWords(cb); - Set keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); - Set keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set keywords1 = getKeywords( + ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set keywords2 = getKeywords( + cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); - Set codes1 = toCodes(keywords1, conf.translationMap()); - Set codes2 = toCodes(keywords2, conf.translationMap()); + Set codes1 = toCodes(keywords1, conf.translationMap()); + Set codes2 = toCodes(keywords2, conf.translationMap()); - //if no cities are detected, the comparator gives 1.0 - if (codes1.isEmpty() && codes2.isEmpty()) - return 1.0; - else { - if (codes1.isEmpty() ^ codes2.isEmpty()) - return -1.0; //undefined if one of the two has no keywords - return commonElementsPercentage(codes1, codes2); - } - } + // if no cities are detected, the comparator gives 1.0 + if (codes1.isEmpty() && codes2.isEmpty()) + return 1.0; + else { + if (codes1.isEmpty() ^ codes2.isEmpty()) + return -1.0; // undefined if one of the two has no keywords + return commonElementsPercentage(codes1, codes2); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java index 7e6e306e2..4de8934ae 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java @@ -1,16 +1,18 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - 
@ComparatorClass("level2JaroWinkler") public class Level2JaroWinkler extends AbstractStringComparator { - public Level2JaroWinkler(Map params){ + public Level2JaroWinkler(Map params) { super(params, new com.wcohen.ss.Level2JaroWinkler()); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java index 6430a8e92..e351058f9 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java @@ -1,16 +1,18 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("level2JaroWinklerTitle") public class Level2JaroWinklerTitle extends AbstractStringComparator { - public Level2JaroWinklerTitle(Map params){ + public Level2JaroWinklerTitle(Map params) { super(params, new com.wcohen.ss.Level2JaroWinkler()); } @@ -29,7 +31,8 @@ public class Level2JaroWinklerTitle extends AbstractStringComparator { final boolean check = checkNumbers(ca, cb); - if (check) return 0.5; + if (check) + return 0.5; return ssalgo.score(ca, cb); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java index 50bc51566..e66602e4f 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java @@ -1,15 +1,17 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - 
@ComparatorClass("level2Levenstein") public class Level2Levenstein extends AbstractStringComparator { - public Level2Levenstein(Map params){ + public Level2Levenstein(Map params) { super(params, new com.wcohen.ss.Level2Levenstein()); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java index 22e091839..0871f8176 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java @@ -1,15 +1,17 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("levenstein") public class Levenstein extends AbstractStringComparator { - public Levenstein(Map params){ + public Levenstein(Map params) { super(params, new com.wcohen.ss.Levenstein()); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java index 37ed9f9e9..877cb95ab 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java @@ -1,20 +1,23 @@ + package eu.dnetlib.pace.tree; -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractStringComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; +import java.util.Map; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import java.util.Map; +import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("levensteinTitle") public 
class LevensteinTitle extends AbstractStringComparator { private static final Log log = LogFactory.getLog(LevensteinTitle.class); - public LevensteinTitle(Map params){ + public LevensteinTitle(Map params) { super(params, new com.wcohen.ss.Levenstein()); } @@ -33,7 +36,8 @@ public class LevensteinTitle extends AbstractStringComparator { final boolean check = checkNumbers(ca, cb); - if (check) return 0.5; + if (check) + return 0.5; return normalize(ssalgo.score(ca, cb), ca.length(), cb.length()); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java index 05d4dc41c..341c0a62b 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java @@ -1,19 +1,21 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - /** * Compared compare between two titles, ignoring version numbers. Suitable for Software entities. 
*/ @ComparatorClass("levensteinTitleIgnoreVersion") public class LevensteinTitleIgnoreVersion extends AbstractStringComparator { - public LevensteinTitleIgnoreVersion(Map params){ + public LevensteinTitleIgnoreVersion(Map params) { super(params, new com.wcohen.ss.Levenstein()); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java index b2420e05f..059db8de5 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java @@ -1,13 +1,14 @@ -package eu.dnetlib.pace.tree; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractListComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; +package eu.dnetlib.pace.tree; import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractListComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + /** * The Class Contains match * @@ -16,51 +17,50 @@ import java.util.stream.Collectors; @ComparatorClass("listContainsMatch") public class ListContainsMatch extends AbstractListComparator { - private Map params; - private boolean CASE_SENSITIVE; - private String STRING; - private String AGGREGATOR; + private Map params; + private boolean CASE_SENSITIVE; + private String STRING; + private String AGGREGATOR; - public ListContainsMatch(Map params) { - super(params); - this.params = params; + public ListContainsMatch(Map params) { + super(params); + this.params = params; - //read parameters - CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false")); - STRING = params.get("string"); - AGGREGATOR = params.get("bool"); - } + // read parameters + CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false")); + STRING = params.get("string"); + 
AGGREGATOR = params.get("bool"); + } - @Override - public double compare(List sa, List sb, Config conf) { - if (sa.isEmpty() || sb.isEmpty()) { - return -1; - } + @Override + public double compare(List sa, List sb, Config conf) { + if (sa.isEmpty() || sb.isEmpty()) { + return -1; + } - if (!CASE_SENSITIVE) { - sa = sa.stream().map(String::toLowerCase).collect(Collectors.toList()); - sb = sb.stream().map(String::toLowerCase).collect(Collectors.toList()); - STRING = STRING.toLowerCase(); - } + if (!CASE_SENSITIVE) { + sa = sa.stream().map(String::toLowerCase).collect(Collectors.toList()); + sb = sb.stream().map(String::toLowerCase).collect(Collectors.toList()); + STRING = STRING.toLowerCase(); + } - switch(AGGREGATOR) { - case "AND": - if(sa.contains(STRING) && sb.contains(STRING)) - return 1.0; - break; - case "OR": - if(sa.contains(STRING) || sb.contains(STRING)) - return 1.0; - break; - case "XOR": - if(sa.contains(STRING) ^ sb.contains(STRING)) - return 1.0; - break; - default: - return 0.0; - } - return 0.0; + switch (AGGREGATOR) { + case "AND": + if (sa.contains(STRING) && sb.contains(STRING)) + return 1.0; + break; + case "OR": + if (sa.contains(STRING) || sb.contains(STRING)) + return 1.0; + break; + case "XOR": + if (sa.contains(STRING) ^ sb.contains(STRING)) + return 1.0; + break; + default: + return 0.0; + } + return 0.0; - } + } } - diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java index cdeeaa055..b9d62cf16 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java @@ -1,16 +1,18 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import 
java.util.Map; - @ComparatorClass("mustBeDifferent") public class MustBeDifferent extends AbstractStringComparator { - public MustBeDifferent(Map params){ + public MustBeDifferent(Map params) { super(params, new com.wcohen.ss.Levenstein()); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java index 91d82d21a..3ae1dcde0 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java @@ -1,11 +1,12 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.Comparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - /** * Not all fields of a document need to partecipate in the compare measure. We model those fields as having a * NullDistanceAlgo. @@ -13,7 +14,7 @@ import java.util.Map; @ComparatorClass("null") public class NullDistanceAlgo implements Comparator { - public NullDistanceAlgo(Map params){ + public NullDistanceAlgo(Map params) { } @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java index a8986f2e5..2c003a170 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java @@ -1,34 +1,35 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("numbersComparator") public class NumbersComparator extends AbstractStringComparator { - Map params; + Map params; - public NumbersComparator(Map params) { - super(params); - this.params = params; - } + public 
NumbersComparator(Map params) { + super(params); + this.params = params; + } - @Override - public double distance(String a, String b, Config conf) { + @Override + public double distance(String a, String b, Config conf) { - //extracts numbers from the field - String numbers1 = getNumbers(nfd(a)); - String numbers2 = getNumbers(nfd(b)); + // extracts numbers from the field + String numbers1 = getNumbers(nfd(a)); + String numbers2 = getNumbers(nfd(b)); - if (numbers1.isEmpty() || numbers2.isEmpty()) - return -1.0; + if (numbers1.isEmpty() || numbers2.isEmpty()) + return -1.0; - int n1 = Integer.parseInt(numbers1); - int n2 = Integer.parseInt(numbers2); + int n1 = Integer.parseInt(numbers1); + int n2 = Integer.parseInt(numbers2); - return Math.abs(n1 - n2); - } + return Math.abs(n1 - n2); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java index f750a350a..77f9f3c6e 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java @@ -1,36 +1,36 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("numbersMatch") public class NumbersMatch extends AbstractStringComparator { + public NumbersMatch(Map params) { + super(params); + } - public NumbersMatch(Map params) { - super(params); - } + @Override + public double distance(String a, String b, Config conf) { - @Override - public double distance(String a, String b, Config conf) { + // extracts numbers from the field + String numbers1 = getNumbers(nfd(a)); + String numbers2 = getNumbers(nfd(b)); - //extracts numbers from the field - String numbers1 = getNumbers(nfd(a)); - String 
numbers2 = getNumbers(nfd(b)); + if (numbers1.isEmpty() && numbers2.isEmpty()) + return 1.0; - if (numbers1.isEmpty() && numbers2.isEmpty()) - return 1.0; + if (numbers1.isEmpty() || numbers2.isEmpty()) + return -1.0; - if (numbers1.isEmpty() || numbers2.isEmpty()) - return -1.0; + if (numbers1.equals(numbers2)) + return 1.0; - if (numbers1.equals(numbers2)) - return 1.0; - - return 0.0; - } -} \ No newline at end of file + return 0.0; + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java index 5fb514b36..401328b53 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java @@ -1,36 +1,36 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("romansMatch") public class RomansMatch extends AbstractStringComparator { + public RomansMatch(Map params) { + super(params); + } - public RomansMatch(Map params) { - super(params); - } + @Override + public double distance(String a, String b, Config conf) { - @Override - public double distance(String a, String b, Config conf) { + // extracts romans from the field + String romans1 = getRomans(nfd(a)); + String romans2 = getRomans(nfd(b)); - //extracts romans from the field - String romans1 = getRomans(nfd(a)); - String romans2 = getRomans(nfd(b)); + if (romans1.isEmpty() && romans2.isEmpty()) + return 1.0; - if (romans1.isEmpty() && romans2.isEmpty()) - return 1.0; + if (romans1.isEmpty() || romans2.isEmpty()) + return -1.0; - if (romans1.isEmpty() || romans2.isEmpty()) - return -1.0; + if (romans1.equals(romans2)) + return 1.0; - if (romans1.equals(romans2)) - return 1.0; - - return 0.0; 
- } + return 0.0; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java index e4690381a..79cecf122 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java @@ -1,13 +1,15 @@ -package eu.dnetlib.pace.tree; -import com.google.common.collect.Lists; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractListComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; +package eu.dnetlib.pace.tree; import java.util.List; import java.util.Map; +import com.google.common.collect.Lists; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractListComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + /** * Returns true if the number of values in the fields is the same. * @@ -16,23 +18,23 @@ import java.util.Map; @ComparatorClass("sizeMatch") public class SizeMatch extends AbstractListComparator { - /** - * Instantiates a new size match. - * - * @param params - * the parameters - */ - public SizeMatch(final Map params) { - super(params); - } + /** + * Instantiates a new size match. + * + * @param params + * the parameters + */ + public SizeMatch(final Map params) { + super(params); + } - @Override - public double compare(final List a, final List b, final Config conf) { + @Override + public double compare(final List a, final List b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) - return -1.0; + if (a.isEmpty() || b.isEmpty()) + return -1.0; - return a.size() == b.size() ? 1.0 : 0.0; - } + return a.size() == b.size() ? 
1.0 : 0.0; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java index 79173ba66..6e5c36def 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java @@ -1,18 +1,20 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.tree.support.AbstractSortedComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - /** * The Class SortedJaroWinkler. */ @ComparatorClass("sortedJaroWinkler") public class SortedJaroWinkler extends AbstractSortedComparator { - public SortedJaroWinkler(Map params){ + public SortedJaroWinkler(Map params) { super(params, new com.wcohen.ss.Levenstein()); } @@ -40,7 +42,6 @@ public class SortedJaroWinkler extends AbstractSortedComparator { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() */ @Override @@ -50,7 +51,6 @@ public class SortedJaroWinkler extends AbstractSortedComparator { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) */ @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java index de8c669d7..3046fceae 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java @@ -1,11 +1,13 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.tree.support.AbstractSortedComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - /** * The Class SortedJaroWinkler. 
*/ @@ -22,7 +24,7 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator { super(weight, new com.wcohen.ss.Level2JaroWinkler()); } - public SortedLevel2JaroWinkler(final Map params){ + public SortedLevel2JaroWinkler(final Map params) { super(params, new com.wcohen.ss.Level2JaroWinkler()); } @@ -40,7 +42,6 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() */ @Override @@ -50,7 +51,6 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) */ @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java index 65cbaba24..6fa7bc980 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java @@ -1,12 +1,13 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - /** * The Class Contains match * @@ -15,50 +16,50 @@ import java.util.Map; @ComparatorClass("stringContainsMatch") public class StringContainsMatch extends AbstractStringComparator { - private Map params; + private Map params; - private boolean CASE_SENSITIVE; - private String STRING; - private String AGGREGATOR; + private boolean CASE_SENSITIVE; + private String STRING; + private String AGGREGATOR; - public StringContainsMatch(Map params) { - super(params); - this.params = params; + public StringContainsMatch(Map params) { + super(params); + this.params = params; - //read parameters - CASE_SENSITIVE = 
Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false")); - STRING = params.get("string"); - AGGREGATOR = params.get("aggregator"); + // read parameters + CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false")); + STRING = params.get("string"); + AGGREGATOR = params.get("aggregator"); - } + } - @Override - public double distance(final String a, final String b, final Config conf) { + @Override + public double distance(final String a, final String b, final Config conf) { - String ca = a; - String cb = b; - if (!CASE_SENSITIVE) { - ca = a.toLowerCase(); - cb = b.toLowerCase(); - STRING = STRING.toLowerCase(); - } + String ca = a; + String cb = b; + if (!CASE_SENSITIVE) { + ca = a.toLowerCase(); + cb = b.toLowerCase(); + STRING = STRING.toLowerCase(); + } - switch(AGGREGATOR) { - case "AND": - if(ca.contains(STRING) && cb.contains(STRING)) - return 1.0; - break; - case "OR": - if(ca.contains(STRING) || cb.contains(STRING)) - return 1.0; - break; - case "XOR": - if(ca.contains(STRING) ^ cb.contains(STRING)) - return 1.0; - break; - default: - return 0.0; - } - return 0.0; - } + switch (AGGREGATOR) { + case "AND": + if (ca.contains(STRING) && cb.contains(STRING)) + return 1.0; + break; + case "OR": + if (ca.contains(STRING) || cb.contains(STRING)) + return 1.0; + break; + case "XOR": + if (ca.contains(STRING) ^ cb.contains(STRING)) + return 1.0; + break; + default: + return 0.0; + } + return 0.0; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java index f49ff62a9..b4dbef3bb 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java @@ -1,53 +1,56 @@ -package eu.dnetlib.pace.tree; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractListComparator; -import 
eu.dnetlib.pace.tree.support.ComparatorClass; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +package eu.dnetlib.pace.tree; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractListComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("stringListMatch") public class StringListMatch extends AbstractListComparator { - private static final Log log = LogFactory.getLog(StringListMatch.class); - private Map params; + private static final Log log = LogFactory.getLog(StringListMatch.class); + private Map params; - final private String TYPE; //percentage or count + final private String TYPE; // percentage or count - public StringListMatch(final Map params) { - super(params); - this.params = params; + public StringListMatch(final Map params) { + super(params); + this.params = params; - TYPE = params.getOrDefault("type", "percentage"); - } + TYPE = params.getOrDefault("type", "percentage"); + } - @Override - public double compare(final List a, final List b, final Config conf) { + @Override + public double compare(final List a, final List b, final Config conf) { - final Set pa = new HashSet<>(a); - final Set pb = new HashSet<>(b); + final Set pa = new HashSet<>(a); + final Set pb = new HashSet<>(b); - if (pa.isEmpty() || pb.isEmpty()) { - return -1; //return undefined if one of the two lists is empty - } + if (pa.isEmpty() || pb.isEmpty()) { + return -1; // return undefined if one of the two lists is empty + } - int incommon = Sets.intersection(pa, pb).size(); - int simDiff = Sets.symmetricDifference(pa, pb).size(); + int incommon = Sets.intersection(pa, pb).size(); + int simDiff = Sets.symmetricDifference(pa, pb).size(); - if (incommon + simDiff == 0) { - 
return 0.0; - } + if (incommon + simDiff == 0) { + return 0.0; + } - if(TYPE.equals("percentage")) - return (double)incommon / (incommon + simDiff); - else - return incommon; + if (TYPE.equals("percentage")) + return (double) incommon / (incommon + simDiff); + else + return incommon; - } -} \ No newline at end of file + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java index 4f1d4370c..3f8c40599 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java @@ -1,12 +1,15 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import org.apache.commons.lang3.StringUtils; - -import java.util.Map; /** * The Class SubStringLevenstein. @@ -14,76 +17,74 @@ import java.util.Map; @ComparatorClass("subStringLevenstein") public class SubStringLevenstein extends AbstractStringComparator { - /** - * The limit. - */ - protected int limit; + /** + * The limit. + */ + protected int limit; - /** - * Instantiates a new sub string levenstein. - * - * @param w the w - */ - public SubStringLevenstein(final double w) { - super(w, new com.wcohen.ss.Levenstein()); - } + /** + * Instantiates a new sub string levenstein. 
+ * + * @param w the w + */ + public SubStringLevenstein(final double w) { + super(w, new com.wcohen.ss.Levenstein()); + } - public SubStringLevenstein(Map params) { - super(params, new com.wcohen.ss.Levenstein()); - this.limit = Integer.parseInt(params.getOrDefault("limit", "1")); - } + public SubStringLevenstein(Map params) { + super(params, new com.wcohen.ss.Levenstein()); + this.limit = Integer.parseInt(params.getOrDefault("limit", "1")); + } - /** - * Instantiates a new sub string levenstein. - * - * @param w the w - * @param limit the limit - */ - public SubStringLevenstein(final double w, final int limit) { - super(w, new com.wcohen.ss.Levenstein()); - this.limit = limit; - } + /** + * Instantiates a new sub string levenstein. + * + * @param w the w + * @param limit the limit + */ + public SubStringLevenstein(final double w, final int limit) { + super(w, new com.wcohen.ss.Levenstein()); + this.limit = limit; + } - /** - * Instantiates a new sub string levenstein. - * - * @param w the w - * @param limit the limit - * @param ssalgo the ssalgo - */ - protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) { - super(w, ssalgo); - this.limit = limit; - } + /** + * Instantiates a new sub string levenstein. 
+ * + * @param w the w + * @param limit the limit + * @param ssalgo the ssalgo + */ + protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) { + super(w, ssalgo); + this.limit = limit; + } - /* - * (non-Javadoc) - * - * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) - */ - @Override - public double distance(final String a, final String b, final Config conf) { - return distance(StringUtils.left(a, limit), StringUtils.left(b, limit), conf); - } + /* + * (non-Javadoc) + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, + * eu.dnetlib.pace.model.Field) + */ + @Override + public double distance(final String a, final String b, final Config conf) { + return distance(StringUtils.left(a, limit), StringUtils.left(b, limit), conf); + } - /* - * (non-Javadoc) - * - * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() - */ - @Override - public double getWeight() { - return super.weight; - } + /* + * (non-Javadoc) + * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() + */ + @Override + public double getWeight() { + return super.weight; + } - /* - * (non-Javadoc) - * - * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) - */ - @Override - protected double normalize(final double d) { - return 1 / Math.pow(Math.abs(d) + 1, 0.1); - } + /* + * (non-Javadoc) + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) + */ + @Override + protected double normalize(final double d) { + return 1 / Math.pow(Math.abs(d) + 1, 0.1); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java index d847c585d..8d99ac27f 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java @@ -1,11 
+1,12 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - /** * Returns true if the titles in the given documents contains the same numbers, false otherwise. * @@ -15,24 +16,24 @@ import java.util.Map; @ComparatorClass("titleVersionMatch") public class TitleVersionMatch extends AbstractStringComparator { - public TitleVersionMatch(final Map params) { - super(params); - } + public TitleVersionMatch(final Map params) { + super(params); + } - @Override - public double compare(final String valueA, final String valueB, final Config conf) { - if (valueA.isEmpty() || valueB.isEmpty()) - return -1; + @Override + public double compare(final String valueA, final String valueB, final Config conf) { + if (valueA.isEmpty() || valueB.isEmpty()) + return -1; - return notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 1 : 0; - } + return notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 
1 : 0; + } - @Override - public String toString() { - return getClass().getSimpleName() + ":" + super.toString(); - } + @Override + public String toString() { + return getClass().getSimpleName() + ":" + super.toString(); + } - protected String toString(final Object object) { - return toFirstString(object); - } -} \ No newline at end of file + protected String toString(final Object object) { + return toFirstString(object); + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java index 63c63fdbb..722236be6 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java @@ -1,61 +1,63 @@ -package eu.dnetlib.pace.tree; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.ComparatorClass; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.tree; import java.net.MalformedURLException; import java.net.URL; import java.util.Map; +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("urlMatcher") public class UrlMatcher extends Levenstein { - private Map params; + private Map params; - public UrlMatcher(Map params){ - super(params); - this.params = params; - } + public UrlMatcher(Map params) { + super(params); + this.params = params; + } - public UrlMatcher(double weight, Map params) { - super(weight); - this.params = params; - } + public UrlMatcher(double weight, Map params) { + super(weight); + this.params = params; + } - public void setParams(Map params) { - this.params = params; - } + public void setParams(Map params) { + this.params = params; + } - @Override - public double distance(String a, String b, final Config conf) { - final URL urlA = asUrl(a); - final URL urlB = asUrl(b); + @Override + public double distance(String a, String b, final 
Config conf) { + final URL urlA = asUrl(a); + final URL urlB = asUrl(b); - if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) { - return 0.0; - } + if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) { + return 0.0; + } - Double hostW = Double.parseDouble(params.getOrDefault("host", "0.5")); - Double pathW = Double.parseDouble(params.getOrDefault("path", "0.5")); + Double hostW = Double.parseDouble(params.getOrDefault("host", "0.5")); + Double pathW = Double.parseDouble(params.getOrDefault("path", "0.5")); - if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) { - return hostW * 0.5; - } + if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) { + return hostW * 0.5; + } - return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf); - } + return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf); + } - private URL asUrl(final String value) { - try { - return new URL(value); - } catch (MalformedURLException e) { - // should not happen as checked by pace typing - throw new IllegalStateException("invalid URL: " + value); - } - } + private URL asUrl(final String value) { + try { + return new URL(value); + } catch (MalformedURLException e) { + // should not happen as checked by pace typing + throw new IllegalStateException("invalid URL: " + value); + } + } - protected String toString(final Object object) { - return toFirstString(object); - } + protected String toString(final Object object) { + return toFirstString(object); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java index d2c2d2627..95f796f6a 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java @@ -1,11 +1,13 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; + import 
eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractStringComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import org.apache.commons.lang3.StringUtils; - -import java.util.Map; /** * Returns true if the year of the date field in the given documents are the same, false when any of the two is invalid or it's missing. @@ -15,36 +17,36 @@ import java.util.Map; @ComparatorClass("yearMatch") public class YearMatch extends AbstractStringComparator { - private int limit = 4; + private int limit = 4; - public YearMatch(final Map params) { - super(params); - } + public YearMatch(final Map params) { + super(params); + } - @Override - public double compare(final String a, final String b, final Config conf) { - final String valueA = getNumbers(getFirstValue(a)); - final String valueB = getNumbers(getFirstValue(b)); + @Override + public double compare(final String a, final String b, final Config conf) { + final String valueA = getNumbers(getFirstValue(a)); + final String valueB = getNumbers(getFirstValue(b)); - if (valueA.isEmpty() || valueB.isEmpty()) - return -1; + if (valueA.isEmpty() || valueB.isEmpty()) + return -1; - final boolean lengthMatch = checkLength(valueA) && checkLength(valueB); - final boolean onemissing = valueA.isEmpty() || valueB.isEmpty(); + final boolean lengthMatch = checkLength(valueA) && checkLength(valueB); + final boolean onemissing = valueA.isEmpty() || valueB.isEmpty(); - return lengthMatch && valueA.equals(valueB) || onemissing ? 1 : 0; - } + return lengthMatch && valueA.equals(valueB) || onemissing ? 1 : 0; + } - protected boolean checkLength(final String s) { - return s.length() == limit; - } + protected boolean checkLength(final String s) { + return s.length() == limit; + } - protected String getFirstValue(final String value) { - return (value != null) && !value.isEmpty() ? 
StringUtils.left(value, limit) : ""; - } + protected String getFirstValue(final String value) { + return (value != null) && !value.isEmpty() ? StringUtils.left(value, limit) : ""; + } - @Override - public String toString() { - return getClass().getSimpleName() + ":" + super.toString(); - } -} \ No newline at end of file + @Override + public String toString() { + return getClass().getSimpleName() + ":" + super.toString(); + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java index fa8853a63..46857e01b 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java @@ -1,130 +1,131 @@ -package eu.dnetlib.pace.tree.support; -import com.google.common.base.Joiner; -import com.google.common.collect.Lists; -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.config.Config; +package eu.dnetlib.pace.tree.support; import java.util.Collections; import java.util.List; import java.util.Map; +import com.google.common.base.Joiner; +import com.google.common.collect.Lists; +import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; + public abstract class AbstractComparator extends AbstractPaceFunctions implements Comparator { - /** The ssalgo. */ - protected AbstractStringDistance ssalgo; + /** The ssalgo. */ + protected AbstractStringDistance ssalgo; - /** The weight. */ - protected double weight = 0.0; + /** The weight. 
*/ + protected double weight = 0.0; - private Map params; + private Map params; - protected AbstractComparator(Map params) { - this.params = params; - } + protected AbstractComparator(Map params) { + this.params = params; + } - protected AbstractComparator(Map params, final AbstractStringDistance ssalgo){ - this.params = params; - this.weight = 1.0; - this.ssalgo = ssalgo; - } + protected AbstractComparator(Map params, final AbstractStringDistance ssalgo) { + this.params = params; + this.weight = 1.0; + this.ssalgo = ssalgo; + } - /** - * Instantiates a new second string compare algo. - * - * @param weight - * the weight - * @param ssalgo - * the ssalgo - */ - protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) { - this.ssalgo = ssalgo; - this.weight = weight; - } + /** + * Instantiates a new second string compare algo. + * + * @param weight + * the weight + * @param ssalgo + * the ssalgo + */ + protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) { + this.ssalgo = ssalgo; + this.weight = weight; + } - protected AbstractComparator(final AbstractStringDistance ssalgo){ - this.ssalgo = ssalgo; - } + protected AbstractComparator(final AbstractStringDistance ssalgo) { + this.ssalgo = ssalgo; + } - /** - * Normalize. - * - * @param d - * the d - * @return the double - */ - protected double normalize(double d) { - return d; - } + /** + * Normalize. + * + * @param d + * the d + * @return the double + */ + protected double normalize(double d) { + return d; + } - /** - * Distance. - * - * @param a - * the a - * @param b - * the b - * @return the double - */ + /** + * Distance. 
+ * + * @param a + * the a + * @param b + * the b + * @return the double + */ - protected double distance(final String a, final String b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) { - return -1; //return -1 if a field is missing - } - double score = ssalgo.score(a, b); - return normalize(score); - } + protected double distance(final String a, final String b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) { + return -1; // return -1 if a field is missing + } + double score = ssalgo.score(a, b); + return normalize(score); + } - protected double compare(final String a, final String b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) - return -1; - return distance(a, b, conf); - } + protected double compare(final String a, final String b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) + return -1; + return distance(a, b, conf); + } - /** - * Convert the given argument to a List of Strings - * - * @param object - * function argument - * @return the list - */ - protected List toList(final Object object) { - if (object instanceof List) { - return (List)object; - } + /** + * Convert the given argument to a List of Strings + * + * @param object + * function argument + * @return the list + */ + protected List toList(final Object object) { + if (object instanceof List) { + return (List) object; + } - return Lists.newArrayList(object.toString()); - } + return Lists.newArrayList(object.toString()); + } - /** - * Convert the given argument to a String - * - * @param object - * function argument - * @return the list - */ - protected String toString(final Object object) { - if (object instanceof List) { - List l = (List) object; - return Joiner.on(" ").join(l); - } + /** + * Convert the given argument to a String + * + * @param object + * function argument + * @return the list + */ + protected String toString(final Object object) { + if (object instanceof List) { + List l = (List) object; + return Joiner.on(" ").join(l); + } - return 
object.toString(); - } + return object.toString(); + } - protected String toFirstString(final Object object) { - if (object instanceof List) { - List l = (List) object; - return l.isEmpty() ? "" : l.get(0); - } + protected String toFirstString(final Object object) { + if (object instanceof List) { + List l = (List) object; + return l.isEmpty() ? "" : l.get(0); + } - return object.toString(); - } + return object.toString(); + } - - public double getWeight(){ - return this.weight; - } + public double getWeight() { + return this.weight; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractListComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractListComparator.java index e2ef057d2..f0b1d4cff 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractListComparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractListComparator.java @@ -1,39 +1,41 @@ -package eu.dnetlib.pace.tree.support; -import com.google.common.collect.Lists; -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.config.Type; +package eu.dnetlib.pace.tree.support; import java.util.List; import java.util.Map; -abstract public class AbstractListComparator extends AbstractComparator>{ - protected AbstractListComparator(Map params) { - super(params); - } +import com.google.common.collect.Lists; +import com.wcohen.ss.AbstractStringDistance; - protected AbstractListComparator(Map params, AbstractStringDistance ssalgo) { - super(params, ssalgo); - } +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.config.Type; - protected AbstractListComparator(double weight, AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } +abstract public class AbstractListComparator extends AbstractComparator> { + protected AbstractListComparator(Map params) { + super(params); + } - protected AbstractListComparator(AbstractStringDistance ssalgo) { - 
super(ssalgo); - } + protected AbstractListComparator(Map params, AbstractStringDistance ssalgo) { + super(params, ssalgo); + } - @Override - public double compare(Object a, Object b, Config conf) { - return compare(toList(a), toList(b), conf); - } + protected AbstractListComparator(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } - public double compare(final List a, final List b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) - return -1; + protected AbstractListComparator(AbstractStringDistance ssalgo) { + super(ssalgo); + } - return distance(concat(a), concat(b), conf); - } + @Override + public double compare(Object a, Object b, Config conf) { + return compare(toList(a), toList(b), conf); + } + + public double compare(final List a, final List b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) + return -1; + + return distance(concat(a), concat(b), conf); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java index dbcb26278..2da1de17a 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java @@ -1,40 +1,41 @@ -package eu.dnetlib.pace.tree.support; -import com.google.common.collect.Lists; -import com.wcohen.ss.AbstractStringDistance; +package eu.dnetlib.pace.tree.support; import java.util.AbstractList; import java.util.Collections; import java.util.List; import java.util.Map; +import com.google.common.collect.Lists; +import com.wcohen.ss.AbstractStringDistance; + public abstract class AbstractSortedComparator extends AbstractListComparator { - /** - * Instantiates a new sorted second string compare algo. 
- * - * @param weight - * the weight - * @param ssalgo - * the ssalgo - */ - protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } + /** + * Instantiates a new sorted second string compare algo. + * + * @param weight + * the weight + * @param ssalgo + * the ssalgo + */ + protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } - protected AbstractSortedComparator(final Map params, final AbstractStringDistance ssalgo){ - super(Double.parseDouble(params.get("weight")), ssalgo); - } + protected AbstractSortedComparator(final Map params, final AbstractStringDistance ssalgo) { + super(Double.parseDouble(params.get("weight")), ssalgo); + } - @Override - protected List toList(final Object object) { - if (object instanceof List) { - List fl = (List) object; - List values = Lists.newArrayList(fl); - Collections.sort(values); - return values; - } + @Override + protected List toList(final Object object) { + if (object instanceof List) { + List fl = (List) object; + List values = Lists.newArrayList(fl); + Collections.sort(values); + return values; + } - return Lists.newArrayList(object.toString()); - } + return Lists.newArrayList(object.toString()); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractStringComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractStringComparator.java index 2001b9b5d..037ff6634 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractStringComparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractStringComparator.java @@ -1,44 +1,46 @@ -package eu.dnetlib.pace.tree.support; -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.config.Config; +package eu.dnetlib.pace.tree.support; import java.util.Map; -public abstract class AbstractStringComparator extends AbstractComparator{ - protected 
AbstractStringComparator(Map params) { - super(params); - } +import com.wcohen.ss.AbstractStringDistance; - protected AbstractStringComparator(Map params, AbstractStringDistance ssalgo) { - super(params, ssalgo); - } +import eu.dnetlib.pace.config.Config; - protected AbstractStringComparator(double weight, AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } +public abstract class AbstractStringComparator extends AbstractComparator { + protected AbstractStringComparator(Map params) { + super(params); + } - protected AbstractStringComparator(AbstractStringDistance ssalgo) { - super(ssalgo); - } + protected AbstractStringComparator(Map params, AbstractStringDistance ssalgo) { + super(params, ssalgo); + } - public double distance(final String a, final String b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) { - return -1; //return -1 if a field is missing - } - double score = ssalgo.score(a, b); - return normalize(score); - } + protected AbstractStringComparator(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } - @Override - public double compare(Object a, Object b, Config conf) { - return compare(toString(a), toString(b), conf); - } + protected AbstractStringComparator(AbstractStringDistance ssalgo) { + super(ssalgo); + } - public double compare(final String a, final String b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) - return -1; - return distance(a, b, conf); - } + public double distance(final String a, final String b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) { + return -1; // return -1 if a field is missing + } + double score = ssalgo.score(a, b); + return normalize(score); + } + + @Override + public double compare(Object a, Object b, Config conf) { + return compare(toString(a), toString(b), conf); + } + + public double compare(final String a, final String b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) + return -1; + return distance(a, b, conf); + } } diff --git 
a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java index caf7cd4c8..7011f2bf3 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java @@ -1,24 +1,21 @@ + package eu.dnetlib.pace.tree.support; import eu.dnetlib.pace.util.PaceException; public enum AggType { - W_MEAN, //weighted mean - AVG, //average - SUM, - MAX, - MIN, - AND, //used for necessary conditions - OR; //used for sufficient conditions + W_MEAN, // weighted mean + AVG, // average + SUM, MAX, MIN, AND, // used for necessary conditions + OR; // used for sufficient conditions - public static AggType getEnum(String value) { + public static AggType getEnum(String value) { - try { - return AggType.valueOf(value); - } - catch (IllegalArgumentException e) { - throw new PaceException("Undefined aggregation type", e); - } - } + try { + return AggType.valueOf(value); + } catch (IllegalArgumentException e) { + throw new PaceException("Undefined aggregation type", e); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java index 033f18766..15a39921b 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java @@ -1,12 +1,12 @@ + package eu.dnetlib.pace.tree.support; import eu.dnetlib.pace.config.Config; public interface Comparator { - /* - * return : -1 -> can't decide (i.e. missing field) - * >0 -> similarity degree (depends on the algorithm) - * */ - public double compare(Object a, Object b, Config conf); + /* + * return : -1 -> can't decide (i.e. 
missing field) >0 -> similarity degree (depends on the algorithm) + */ + public double compare(Object a, Object b, Config conf); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/ComparatorClass.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/ComparatorClass.java index 8c3002eb6..5ef0932cf 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/ComparatorClass.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/ComparatorClass.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.tree.support; import java.lang.annotation.ElementType; @@ -9,5 +10,5 @@ import java.lang.annotation.Target; @Target(ElementType.TYPE) public @interface ComparatorClass { - public String value(); + public String value(); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java index 44971876f..d81c68e38 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java @@ -1,82 +1,84 @@ + package eu.dnetlib.pace.tree.support; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.pace.util.PaceException; - - import java.io.IOException; import java.io.Serializable; import java.util.Map; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.pace.util.PaceException; + /** * The class that defines the configuration of each field in the decision tree. 
* */ public class FieldConf implements Serializable { - private String field; //name of the field on which apply the comparator - private String comparator; //comparator name - private double weight = 1.0; //weight for the field (to be used in the aggregation) - private Map params; //parameters + private String field; // name of the field on which apply the comparator + private String comparator; // comparator name + private double weight = 1.0; // weight for the field (to be used in the aggregation) + private Map params; // parameters - private boolean countIfUndefined; + private boolean countIfUndefined; - public boolean isCountIfUndefined() { - return countIfUndefined; - } + public boolean isCountIfUndefined() { + return countIfUndefined; + } - public void setCountIfUndefined(boolean countIfUndefined) { - this.countIfUndefined = countIfUndefined; - } + public void setCountIfUndefined(boolean countIfUndefined) { + this.countIfUndefined = countIfUndefined; + } - public FieldConf() { - } + public FieldConf() { + } - public FieldConf(String field, String comparator, double weight, Map params, boolean countIfUndefined) { - this.field = field; - this.comparator = comparator; - this.weight = weight; - this.params = params; - this.countIfUndefined = countIfUndefined; - } + public FieldConf(String field, String comparator, double weight, Map params, + boolean countIfUndefined) { + this.field = field; + this.comparator = comparator; + this.weight = weight; + this.params = params; + this.countIfUndefined = countIfUndefined; + } - public String getField() { - return field; - } + public String getField() { + return field; + } - public void setField(String field) { - this.field = field; - } + public void setField(String field) { + this.field = field; + } - public String getComparator() { - return comparator; - } + public String getComparator() { + return comparator; + } - public void setComparator(String comparator) { - this.comparator = comparator; - } + public void 
setComparator(String comparator) { + this.comparator = comparator; + } - public double getWeight() { - return weight; - } + public double getWeight() { + return weight; + } - public void setWeight(double weight) { - this.weight = weight; - } + public void setWeight(double weight) { + this.weight = weight; + } - public Map getParams() { - return params; - } + public Map getParams() { + return params; + } - public void setParams(Map params) { - this.params = params; - } + public void setParams(Map params) { + this.params = params; + } - @Override - public String toString() { - try { - return new ObjectMapper().writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("Impossible to convert to JSON: ", e); - } - } -} \ No newline at end of file + @Override + public String toString() { + try { + return new ObjectMapper().writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Impossible to convert to JSON: ", e); + } + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java index c4633eac8..46e66378e 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java @@ -1,87 +1,89 @@ -package eu.dnetlib.pace.tree.support; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.pace.util.PaceException; +package eu.dnetlib.pace.tree.support; import java.io.IOException; import java.io.Serializable; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.pace.util.PaceException; + /** * The class that contains the result of each comparison in the decision tree * */ public class FieldStats implements Serializable { - private double weight; //weight for the field (to be used in the aggregation) - private double threshold; //threshold for the field (to be used in some kind of aggregations) - 
private double result; //the result of the comparison - private Object a; - private Object b; + private double weight; // weight for the field (to be used in the aggregation) + private double threshold; // threshold for the field (to be used in some kind of aggregations) + private double result; // the result of the comparison + private Object a; + private Object b; - private boolean countIfUndefined; + private boolean countIfUndefined; - public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Object a, Object b) { - this.weight = weight; - this.threshold = threshold; - this.result = result; - this.countIfUndefined = countIfUndefined; - this.a = a; - this.b = b; - } + public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Object a, Object b) { + this.weight = weight; + this.threshold = threshold; + this.result = result; + this.countIfUndefined = countIfUndefined; + this.a = a; + this.b = b; + } - public double getThreshold() { - return threshold; - } + public double getThreshold() { + return threshold; + } - public void setThreshold(double threshold) { - this.threshold = threshold; - } + public void setThreshold(double threshold) { + this.threshold = threshold; + } - public double getWeight() { - return weight; - } + public double getWeight() { + return weight; + } - public void setWeight(double weight) { - this.weight = weight; - } + public void setWeight(double weight) { + this.weight = weight; + } - public double getResult() { - return result; - } + public double getResult() { + return result; + } - public void setResult(double result) { - this.result = result; - } + public void setResult(double result) { + this.result = result; + } - public boolean isCountIfUndefined() { - return countIfUndefined; - } + public boolean isCountIfUndefined() { + return countIfUndefined; + } - public void setCountIfUndefined(boolean countIfUndefined) { - this.countIfUndefined = countIfUndefined; - } + public 
void setCountIfUndefined(boolean countIfUndefined) { + this.countIfUndefined = countIfUndefined; + } - public Object getA() { - return a; - } + public Object getA() { + return a; + } - public void setA(Object a) { - this.a = a; - } + public void setA(Object a) { + this.a = a; + } - public Object getB() { - return b; - } + public Object getB() { + return b; + } - public void setB(Object b) { - this.b = b; - } + public void setB(Object b) { + this.b = b; + } - @Override - public String toString(){ - try { - return new ObjectMapper().writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("Impossible to convert to JSON: ", e); - } - } + @Override + public String toString() { + try { + return new ObjectMapper().writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Impossible to convert to JSON: ", e); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java index da6b738d5..60559412d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java @@ -1,20 +1,19 @@ + package eu.dnetlib.pace.tree.support; public enum MatchType { - MATCH, - NO_MATCH, - UNDEFINED; + MATCH, NO_MATCH, UNDEFINED; - public static MatchType parse(String value) { + public static MatchType parse(String value) { - if (MATCH.name().equals(value)) { - return MATCH; - } else if (NO_MATCH.name().equals(value)) { - return NO_MATCH; - } else { - return UNDEFINED; - } + if (MATCH.name().equals(value)) { + return MATCH; + } else if (NO_MATCH.name().equals(value)) { + return NO_MATCH; + } else { + return UNDEFINED; + } // try { // return MatchType.valueOf(value); @@ -22,5 +21,5 @@ public enum MatchType { // catch (IllegalArgumentException e) { // return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable // } - } + } } diff --git 
a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index 92da055f4..0973fdf1e 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -1,166 +1,170 @@ -package eu.dnetlib.pace.tree.support; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.config.PaceConfig; -import eu.dnetlib.pace.util.PaceException; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.types.ArrayType; -import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.StringType; +package eu.dnetlib.pace.tree.support; import java.io.IOException; import java.io.Serializable; import java.util.List; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.types.ArrayType; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.StringType; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.config.PaceConfig; +import eu.dnetlib.pace.util.PaceException; + public class TreeNodeDef implements Serializable { - final static String CROSS_COMPARE = "crossCompare"; + final static String CROSS_COMPARE = "crossCompare"; - private List fields; - private AggType aggregation; + private List fields; + private AggType aggregation; - private double threshold; + private double threshold; - private String positive; - private String negative; - private String undefined; + private String positive; + private String negative; + private String undefined; - boolean ignoreUndefined; + boolean ignoreUndefined; - public TreeNodeDef(List fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreUndefined) { - this.fields = fields; - this.aggregation = aggregation; - this.threshold 
= threshold; - this.positive = positive; - this.negative = negative; - this.undefined = undefined; - this.ignoreUndefined = ignoreUndefined; - } + public TreeNodeDef(List fields, AggType aggregation, double threshold, String positive, String negative, + String undefined, boolean ignoreUndefined) { + this.fields = fields; + this.aggregation = aggregation; + this.threshold = threshold; + this.positive = positive; + this.negative = negative; + this.undefined = undefined; + this.ignoreUndefined = ignoreUndefined; + } - public TreeNodeDef() {} + public TreeNodeDef() { + } - //function for the evaluation of the node - public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) { + // function for the evaluation of the node + public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) { - TreeNodeStats stats = new TreeNodeStats(); + TreeNodeStats stats = new TreeNodeStats(); - //for each field in the node, it computes the - for (FieldConf fieldConf : fields) { - double weight = fieldConf.getWeight(); - double result; + // for each field in the node, it computes the + for (FieldConf fieldConf : fields) { + double weight = fieldConf.getWeight(); + double result; - Object value1 = getJavaValue(doc1,fieldConf.getField()); - Object value2 = getJavaValue(doc2,fieldConf.getField()); + Object value1 = getJavaValue(doc1, fieldConf.getField()); + Object value2 = getJavaValue(doc2, fieldConf.getField()); + // if the param specifies a cross comparison (i.e. 
compare elements from different fields), compute the + // result for both sides and return the maximum + String crossField = fieldConf.getParams().get(CROSS_COMPARE); + if (crossField != null) { + double result1 = comparator(fieldConf).compare(value1, getJavaValue(doc2, crossField), conf); + double result2 = comparator(fieldConf).compare(getJavaValue(doc1, crossField), value2, conf); + result = Math.max(result1, result2); + } else { + result = comparator(fieldConf).compare(value1, value2, conf); + } - //if the param specifies a cross comparison (i.e. compare elements from different fields), compute the result for both sides and return the maximum - String crossField = fieldConf.getParams().get(CROSS_COMPARE); - if (crossField != null) { - double result1 = comparator(fieldConf).compare(value1, getJavaValue(doc2,crossField), conf); - double result2 = comparator(fieldConf).compare(getJavaValue(doc1,crossField), value2, conf); - result = Math.max(result1,result2); - } - else { - result = comparator(fieldConf).compare(value1, value2, conf); - } + stats + .addFieldStats( + fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), + new FieldStats( + weight, + Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")), + result, + fieldConf.isCountIfUndefined(), + value1, + value2)); + } - stats.addFieldStats( - fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), - new FieldStats( - weight, - Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")), - result, - fieldConf.isCountIfUndefined(), - value1, - value2 - )); - } + return stats; + } - return stats; - } + public Object getJavaValue(Row row, String name) { + int pos = row.fieldIndex(name); + if (pos >= 0) { + DataType dt = row.schema().fields()[pos].dataType(); + if (dt instanceof StringType) { + return row.getString(pos); + } else if (dt instanceof ArrayType) { + return row.getList(pos); + } + } - public 
Object getJavaValue(Row row, String name) { - int pos = row.fieldIndex(name); - if (pos >= 0) { - DataType dt = row.schema().fields()[pos].dataType(); - if (dt instanceof StringType) { - return row.getString(pos); - } else if (dt instanceof ArrayType) { - return row.getList(pos); - } - } + return null; + } - return null; - } + private Comparator comparator(final FieldConf field) { - private Comparator comparator(final FieldConf field){ + return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams()); + } - return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams()); - } + public List getFields() { + return fields; + } - public List getFields() { - return fields; - } + public void setFields(List fields) { + this.fields = fields; + } - public void setFields(List fields) { - this.fields = fields; - } + public AggType getAggregation() { + return aggregation; + } - public AggType getAggregation() { - return aggregation; - } + public void setAggregation(AggType aggregation) { + this.aggregation = aggregation; + } - public void setAggregation(AggType aggregation) { - this.aggregation = aggregation; - } + public double getThreshold() { + return threshold; + } - public double getThreshold() { - return threshold; - } + public void setThreshold(double threshold) { + this.threshold = threshold; + } - public void setThreshold(double threshold) { - this.threshold = threshold; - } + public String getPositive() { + return positive; + } - public String getPositive() { - return positive; - } + public void setPositive(String positive) { + this.positive = positive; + } - public void setPositive(String positive) { - this.positive = positive; - } + public String getNegative() { + return negative; + } - public String getNegative() { - return negative; - } + public void setNegative(String negative) { + this.negative = negative; + } - public void setNegative(String negative) { - this.negative = negative; - } + public String getUndefined() { + 
return undefined; + } - public String getUndefined() { - return undefined; - } + public void setUndefined(String undefined) { + this.undefined = undefined; + } - public void setUndefined(String undefined) { - this.undefined = undefined; - } + public boolean isIgnoreUndefined() { + return ignoreUndefined; + } - public boolean isIgnoreUndefined() { - return ignoreUndefined; - } + public void setIgnoreUndefined(boolean ignoreUndefined) { + this.ignoreUndefined = ignoreUndefined; + } - public void setIgnoreUndefined(boolean ignoreUndefined) { - this.ignoreUndefined = ignoreUndefined; - } - - @Override - public String toString() { - try { - return new ObjectMapper().writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("Impossible to convert to JSON: ", e); - } - } + @Override + public String toString() { + try { + return new ObjectMapper().writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Impossible to convert to JSON: ", e); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java index f9612a41f..2b96048b4 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.tree.support; import java.io.Serializable; @@ -6,129 +7,128 @@ import java.util.Map; public class TreeNodeStats implements Serializable { - private Map results; //this is an accumulator for the results of the node + private Map results; // this is an accumulator for the results of the node - public TreeNodeStats(){ - this.results = new HashMap<>(); - } + public TreeNodeStats() { + this.results = new HashMap<>(); + } - public Map getResults() { - return results; - } + public Map getResults() { + return results; + } - public void addFieldStats(String id, FieldStats fieldStats){ - 
this.results.put(id, fieldStats); - } + public void addFieldStats(String id, FieldStats fieldStats) { + this.results.put(id, fieldStats); + } - public int fieldsCount(){ - return this.results.size(); - } + public int fieldsCount() { + return this.results.size(); + } - public int undefinedCount(){ - int undefinedCount = 0; - for(FieldStats fs: this.results.values()){ - if(fs.getResult() == -1) - undefinedCount ++; - } - return undefinedCount; - } + public int undefinedCount() { + int undefinedCount = 0; + for (FieldStats fs : this.results.values()) { + if (fs.getResult() == -1) + undefinedCount++; + } + return undefinedCount; + } - public double scoreSum(){ - double scoreSum = 0.0; - for(FieldStats fs: this.results.values()){ - if(fs.getResult()>=0.0) { - scoreSum += fs.getResult(); - } - } - return scoreSum; - } + public double scoreSum() { + double scoreSum = 0.0; + for (FieldStats fs : this.results.values()) { + if (fs.getResult() >= 0.0) { + scoreSum += fs.getResult(); + } + } + return scoreSum; + } - //return the sum of the weights without considering the fields with countIfMissing=false && result=-1 - public double weightSum(){ - double weightSum = 0.0; - for(FieldStats fs: this.results.values()){ - if(fs.getResult()>=0.0 || (fs.getResult()<0.0 && fs.isCountIfUndefined())) { - weightSum += fs.getWeight(); - } - } - return weightSum; - } + // return the sum of the weights without considering the fields with countIfMissing=false && result=-1 + public double weightSum() { + double weightSum = 0.0; + for (FieldStats fs : this.results.values()) { + if (fs.getResult() >= 0.0 || (fs.getResult() < 0.0 && fs.isCountIfUndefined())) { + weightSum += fs.getWeight(); + } + } + return weightSum; + } - public double weightedScoreSum(){ - double weightedScoreSum = 0.0; - for(FieldStats fs: this.results.values()){ - if(fs.getResult()>=0.0) { - weightedScoreSum += fs.getResult()*fs.getWeight(); - } - } - return weightedScoreSum; - } + public double weightedScoreSum() { + double 
weightedScoreSum = 0.0; + for (FieldStats fs : this.results.values()) { + if (fs.getResult() >= 0.0) { + weightedScoreSum += fs.getResult() * fs.getWeight(); + } + } + return weightedScoreSum; + } - public double max(){ - double max = -1.0; - for(FieldStats fs: this.results.values()){ - if(fs.getResult()>max) - max = fs.getResult(); - } - return max; - } + public double max() { + double max = -1.0; + for (FieldStats fs : this.results.values()) { + if (fs.getResult() > max) + max = fs.getResult(); + } + return max; + } - public double min(){ - double min = 100.0; //random high value - for(FieldStats fs: this.results.values()){ - if(fs.getResult()=0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined())) - min = fs.getResult(); - } - } - return min; - } + public double min() { + double min = 100.0; // random high value + for (FieldStats fs : this.results.values()) { + if (fs.getResult() < min) { + if (fs.getResult() >= 0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined())) + min = fs.getResult(); + } + } + return min; + } - //if at least one is true, return 1.0 - public double or(){ - for (FieldStats fieldStats : this.results.values()) { - if (fieldStats.getResult() >= fieldStats.getThreshold()) - return 1.0; - } - return 0.0; - } + // if at least one is true, return 1.0 + public double or() { + for (FieldStats fieldStats : this.results.values()) { + if (fieldStats.getResult() >= fieldStats.getThreshold()) + return 1.0; + } + return 0.0; + } - //if at least one is false, return 0.0 - public double and() { - for (FieldStats fieldStats : this.results.values()) { + // if at least one is false, return 0.0 + public double and() { + for (FieldStats fieldStats : this.results.values()) { - if (fieldStats.getResult() == -1) { - if (fieldStats.isCountIfUndefined()) - return 0.0; - } - else { - if (fieldStats.getResult() < fieldStats.getThreshold()) - return 0.0; - } + if (fieldStats.getResult() == -1) { + if (fieldStats.isCountIfUndefined()) + return 0.0; + } else { + if 
(fieldStats.getResult() < fieldStats.getThreshold()) + return 0.0; + } - } - return 1.0; - } + } + return 1.0; + } - public double getFinalScore(AggType aggregation){ + public double getFinalScore(AggType aggregation) { - switch (aggregation){ - case AVG: - return scoreSum()/fieldsCount(); - case SUM: - return scoreSum(); - case MAX: - return max(); - case MIN: - return min(); - case W_MEAN: - return weightedScoreSum()/weightSum(); - case OR: - return or(); - case AND: - return and(); - default: - return 0.0; - } - } + switch (aggregation) { + case AVG: + return scoreSum() / fieldsCount(); + case SUM: + return scoreSum(); + case MAX: + return max(); + case MIN: + return min(); + case W_MEAN: + return weightedScoreSum() / weightSum(); + case OR: + return or(); + case AND: + return and(); + default: + return 0.0; + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java index f769bd962..263504dbb 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java @@ -1,11 +1,12 @@ + package eu.dnetlib.pace.tree.support; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.util.PaceException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.spark.sql.Row; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.util.PaceException; /** * The compare between two documents is given by the weighted mean of the field distances @@ -23,11 +24,11 @@ public class TreeProcessor { // row based copies public boolean compare(final Row a, final Row b) { - //evaluate the decision tree + // evaluate the decision tree return evaluateTree(a, b).getResult() == MatchType.MATCH; } - public TreeStats evaluateTree(final Row doc1, final Row doc2){ + public TreeStats evaluateTree(final Row doc1, final Row 
doc2) { TreeStats treeStats = new TreeStats(); @@ -36,26 +37,25 @@ public class TreeProcessor { do { TreeNodeDef currentNode = config.decisionTree().get(nextNodeName); - //throw an exception if the node doesn't exist + // throw an exception if the node doesn't exist if (currentNode == null) throw new PaceException("Missing tree node: " + nextNodeName); TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config); treeStats.addNodeStats(nextNodeName, stats); - //if ignoreUndefined=false the miss is considered as undefined - if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) { + // if ignoreUndefined=false the miss is considered as undefined + if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) { nextNodeName = currentNode.getUndefined(); } - //if ignoreUndefined=true the miss is ignored and the score computed anyway + // if ignoreUndefined=true the miss is ignored and the score computed anyway else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) { nextNodeName = currentNode.getPositive(); - } - else { + } else { nextNodeName = currentNode.getNegative(); } - } while (MatchType.parse(nextNodeName)==MatchType.UNDEFINED); + } while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED); treeStats.setResult(MatchType.parse(nextNodeName)); return treeStats; @@ -68,25 +68,24 @@ public class TreeProcessor { do { TreeNodeDef currentNode = config.decisionTree().get(nextNodeName); - //throw an exception if the node doesn't exist + // throw an exception if the node doesn't exist if (currentNode == null) throw new PaceException("The Tree Node doesn't exist: " + nextNodeName); TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config); score = stats.getFinalScore(currentNode.getAggregation()); - //if ignoreUndefined=false the miss is considered as undefined - if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) { + // if ignoreUndefined=false the miss is considered as undefined + if 
(!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) { nextNodeName = currentNode.getUndefined(); } - //if ignoreUndefined=true the miss is ignored and the score computed anyway + // if ignoreUndefined=true the miss is ignored and the score computed anyway else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) { nextNodeName = currentNode.getPositive(); - } - else { + } else { nextNodeName = currentNode.getNegative(); } - } while (MatchType.parse(nextNodeName)==MatchType.UNDEFINED); + } while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED); return score; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java index 186e8d11e..9a7f38f47 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java @@ -1,51 +1,52 @@ -package eu.dnetlib.pace.tree.support; -import eu.dnetlib.pace.util.PaceException; -import com.fasterxml.jackson.databind.ObjectMapper; +package eu.dnetlib.pace.tree.support; import java.io.IOException; import java.util.HashMap; import java.util.Map; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.pace.util.PaceException; + public class TreeStats { - //> - Map stats; - MatchType result; + // > + Map stats; + MatchType result; - public TreeStats(){ - this.stats = new HashMap<>(); - this.result = MatchType.NO_MATCH; - } + public TreeStats() { + this.stats = new HashMap<>(); + this.result = MatchType.NO_MATCH; + } - public MatchType getResult(){ - return this.result; - } + public MatchType getResult() { + return this.result; + } - public void setResult(MatchType result){ - this.result = result; - } + public void setResult(MatchType result) { + this.result = result; + } - public Map getStats() { - return stats; - } + public Map getStats() { + return stats; + } - public void 
setStats(Map stats) { - this.stats = stats; - } + public void setStats(Map stats) { + this.stats = stats; + } - public void addNodeStats(String layerID, TreeNodeStats treeNodeStats){ - this.stats.put(layerID, treeNodeStats); - } - - @Override - public String toString(){ - try { - return new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("Impossible to convert to JSON: ", e); - } - } + public void addNodeStats(String layerID, TreeNodeStats treeNodeStats) { + this.stats.put(layerID, treeNodeStats); + } + @Override + public String toString() { + try { + return new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Impossible to convert to JSON: ", e); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index 3f2414e5c..feae7402c 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -1,8 +1,11 @@ + package eu.dnetlib.pace.util; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.config.WfConfig; -import eu.dnetlib.pace.tree.support.TreeProcessor; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; + import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -12,127 +15,137 @@ import org.apache.spark.sql.types.DataType; import org.apache.spark.sql.types.StringType; import org.apache.spark.sql.types.StructType; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Iterator; -import java.util.List; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.config.WfConfig; +import eu.dnetlib.pace.tree.support.TreeProcessor; 
public class BlockProcessor { - public static final List accumulators= new ArrayList<>(); + public static final List accumulators = new ArrayList<>(); - private static final Log log = LogFactory.getLog(BlockProcessor.class); + private static final Log log = LogFactory.getLog(BlockProcessor.class); - private DedupConfig dedupConf; + private DedupConfig dedupConf; - private final int identifierFieldPos; - private final int orderFieldPos; + private final int identifierFieldPos; + private final int orderFieldPos; - public static void constructAccumulator( final DedupConfig dedupConf) { - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1")); - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField())); - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()))); - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "skip list")); - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)")); - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold())); - } + public static void constructAccumulator(final DedupConfig dedupConf) { + accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1")); + accumulators + .add( + String + .format( + "%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField())); + accumulators + .add( + String + .format( + "%s::%s", dedupConf.getWf().getEntityType(), + String + .format( + "Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), + dedupConf.getWf().getGroupMaxSize()))); + accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list")); + 
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)")); + accumulators + .add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold())); + } - public BlockProcessor(DedupConfig dedupConf, int identifierFieldPos, int orderFieldPos) { - this.dedupConf = dedupConf; - this.identifierFieldPos = identifierFieldPos; - this.orderFieldPos = orderFieldPos; - } + public BlockProcessor(DedupConfig dedupConf, int identifierFieldPos, int orderFieldPos) { + this.dedupConf = dedupConf; + this.identifierFieldPos = identifierFieldPos; + this.orderFieldPos = orderFieldPos; + } - public void processSortedRows(final Collection documents, final Reporter context) { - if (documents.size() > 1) { + public void processSortedRows(final Collection documents, final Reporter context) { + if (documents.size() > 1) { // log.info("reducing key: '" + key + "' records: " + q.size()); - processRows(documents, context); + processRows(documents, context); - } else { - context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); - } - } + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); + } + } + private void processRows(final Collection queue, final Reporter context) { - private void processRows(final Collection queue, final Reporter context) { + Iterator it = queue.iterator(); + while (it.hasNext()) { - Iterator it = queue.iterator(); - while (it.hasNext()) { + final Row pivot = it.next(); + it.remove(); - final Row pivot = it.next(); - it.remove(); + final String idPivot = pivot.getString(identifierFieldPos); // identifier + final Object fieldsPivot = getJavaValue(pivot, orderFieldPos); + final String fieldPivot = (fieldsPivot == null) ? 
"" : fieldsPivot.toString(); + final WfConfig wf = dedupConf.getWf(); + if (fieldPivot != null) { + int i = 0; + for (final Row curr : queue) { + final String idCurr = curr.getString(identifierFieldPos); // identifier - final String idPivot = pivot.getString(identifierFieldPos); //identifier - final Object fieldsPivot = getJavaValue(pivot, orderFieldPos); - final String fieldPivot = (fieldsPivot == null) ? "" : fieldsPivot.toString(); - final WfConfig wf = dedupConf.getWf(); + if (mustSkip(idCurr)) { - if (fieldPivot != null) { - int i = 0; - for (final Row curr : queue) { - final String idCurr = curr.getString(identifierFieldPos); //identifier + context.incrementCounter(wf.getEntityType(), "skip list", 1); - if (mustSkip(idCurr)) { + break; + } - context.incrementCounter(wf.getEntityType(), "skip list", 1); + if (i > wf.getSlidingWindowSize()) { + break; + } - break; - } + final Object fieldsCurr = getJavaValue(curr, orderFieldPos); + final String fieldCurr = (fieldsCurr == null) ? null : fieldsCurr.toString(); - if (i > wf.getSlidingWindowSize()) { - break; - } + if (!idCurr.equals(idPivot) && (fieldCurr != null)) { - final Object fieldsCurr = getJavaValue(curr, orderFieldPos); - final String fieldCurr = (fieldsCurr == null) ? 
null : fieldsCurr.toString(); + final TreeProcessor treeProcessor = new TreeProcessor(dedupConf); - if (!idCurr.equals(idPivot) && (fieldCurr != null)) { + emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context); - final TreeProcessor treeProcessor = new TreeProcessor(dedupConf); + } + } + } + } + } - emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context); + public Object getJavaValue(Row row, int pos) { + DataType dt = row.schema().fields()[pos].dataType(); + if (dt instanceof StringType) { + return row.getString(pos); + } else if (dt instanceof ArrayType) { + return row.getList(pos); + } - } - } - } - } - } + return null; + } - public Object getJavaValue(Row row, int pos) { - DataType dt = row.schema().fields()[pos].dataType(); - if (dt instanceof StringType) { - return row.getString(pos); - } else if (dt instanceof ArrayType) { - return row.getList(pos); - } + private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { - return null; - } + if (result) { + writeSimilarity(context, idPivot, idCurr); + context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1); + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1); + } + } - private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { + private boolean mustSkip(final String idPivot) { + return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot)); + } - if (result) { - writeSimilarity(context, idPivot, idCurr); - context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1); - } else { - context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1); - } - } + private String getNsPrefix(final String id) { + return StringUtils.substringBetween(id, "|", "::"); + } - private boolean mustSkip(final String 
idPivot) { - return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot)); - } + private void writeSimilarity(final Reporter context, final String from, final String to) { + final String type = dedupConf.getWf().getEntityType(); - private String getNsPrefix(final String id) { - return StringUtils.substringBetween(id, "|", "::"); - } - - private void writeSimilarity(final Reporter context, final String from, final String to) { - final String type = dedupConf.getWf().getEntityType(); - - context.emit(type, from, to); - context.emit(type, to, from); - } + context.emit(type, from, to); + context.emit(type, to, from); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Capitalise.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Capitalise.java index 2de729045..403d91dd9 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Capitalise.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Capitalise.java @@ -1,15 +1,18 @@ + package eu.dnetlib.pace.util; +import org.apache.commons.lang3.text.WordUtils; import com.google.common.base.Function; -import org.apache.commons.lang3.text.WordUtils; public class Capitalise implements Function { - private final char[] DELIM = {' ', '-'}; + private final char[] DELIM = { + ' ', '-' + }; - @Override - public String apply(final String s) { - return WordUtils.capitalize(s.toLowerCase(), DELIM); - } + @Override + public String apply(final String s) { + return WordUtils.capitalize(s.toLowerCase(), DELIM); + } }; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java index 45e011fdd..cfd9acd70 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.util; /* @@ -17,7 +18,23 @@ package eu.dnetlib.pace.util; * See the License for the specific language governing 
permissions and * limitations under the License. */ - +/* + * Diff Match and Patch + * Copyright 2018 The diff-match-patch Authors. + * https://github.com/google/diff-match-patch + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.net.URLEncoder; @@ -39,2433 +56,2498 @@ import java.util.regex.Pattern; */ public class DiffPatchMatch { - // Defaults. - // Set these on your diff_match_patch instance to override the defaults. - - /** - * Number of seconds to map a diff before giving up (0 for infinity). - */ - public float Diff_Timeout = 1.0f; - /** - * Cost of an empty edit operation in terms of edit characters. - */ - public short Diff_EditCost = 4; - /** - * At what point is no match declared (0.0 = perfection, 1.0 = very loose). - */ - public float Match_Threshold = 0.5f; - /** - * How far to search for a match (0 = exact location, 1000+ = broad match). - * A match this many characters away from the expected location will add - * 1.0 to the score (0.0 is a perfect match). - */ - public int Match_Distance = 1000; - /** - * When deleting a large block of text (over ~64 characters), how close do - * the contents have to be to match the expected contents. (0.0 = perfection, - * 1.0 = very loose). Note that Match_Threshold controls how closely the - * end points of a delete need to match. - */ - public float Patch_DeleteThreshold = 0.5f; - /** - * Chunk size for context length. 
- */ - public short Patch_Margin = 4; - - /** - * The number of bits in an int. - */ - private short Match_MaxBits = 32; - - /** - * Internal class for returning results from diff_linesToChars(). - * Other less paranoid languages just use a three-element array. - */ - protected static class LinesToCharsResult { - protected String chars1; - protected String chars2; - protected List lineArray; - - protected LinesToCharsResult(String chars1, String chars2, - List lineArray) { - this.chars1 = chars1; - this.chars2 = chars2; - this.lineArray = lineArray; - } - } - - - // DIFF FUNCTIONS - - - /** - * The data structure representing a diff is a Linked list of Diff objects: - * {Diff(Operation.DELETE, "Hello"), Diff(Operation.INSERT, "Goodbye"), - * Diff(Operation.EQUAL, " world.")} - * which means: delete "Hello", add "Goodbye" and keep " world." - */ - public enum Operation { - DELETE, INSERT, EQUAL - } - - /** - * Find the differences between two texts. - * Run a faster, slightly less optimal diff. - * This method allows the 'checklines' of diff_main() to be optional. - * Most of the time checklines is wanted, so default to true. - * @param text1 Old string to be diffed. - * @param text2 New string to be diffed. - * @return Linked List of Diff objects. - */ - public LinkedList diff_main(String text1, String text2) { - return diff_main(text1, text2, true); - } - - /** - * Find the differences between two texts. - * @param text1 Old string to be diffed. - * @param text2 New string to be diffed. - * @param checklines Speedup flag. If false, then don't run a - * line-level diff first to identify the changed areas. - * If true, then run a faster slightly less optimal diff. - * @return Linked List of Diff objects. - */ - public LinkedList diff_main(String text1, String text2, - boolean checklines) { - // Set a deadline by which time the diff must be complete. 
- long deadline; - if (Diff_Timeout <= 0) { - deadline = Long.MAX_VALUE; - } else { - deadline = System.currentTimeMillis() + (long) (Diff_Timeout * 1000); - } - return diff_main(text1, text2, checklines, deadline); - } - - /** - * Find the differences between two texts. Simplifies the problem by - * stripping any common prefix or suffix off the texts before diffing. - * @param text1 Old string to be diffed. - * @param text2 New string to be diffed. - * @param checklines Speedup flag. If false, then don't run a - * line-level diff first to identify the changed areas. - * If true, then run a faster slightly less optimal diff. - * @param deadline Time when the diff should be complete by. Used - * internally for recursive calls. Users should set DiffTimeout instead. - * @return Linked List of Diff objects. - */ - private LinkedList diff_main(String text1, String text2, - boolean checklines, long deadline) { - // Check for null inputs. - if (text1 == null || text2 == null) { - throw new IllegalArgumentException("Null inputs. (diff_main)"); - } - - // Check for equality (speedup). - LinkedList diffs; - if (text1.equals(text2)) { - diffs = new LinkedList(); - if (text1.length() != 0) { - diffs.add(new Diff(Operation.EQUAL, text1)); - } - return diffs; - } - - // Trim off common prefix (speedup). - int commonlength = diff_commonPrefix(text1, text2); - String commonprefix = text1.substring(0, commonlength); - text1 = text1.substring(commonlength); - text2 = text2.substring(commonlength); - - // Trim off common suffix (speedup). - commonlength = diff_commonSuffix(text1, text2); - String commonsuffix = text1.substring(text1.length() - commonlength); - text1 = text1.substring(0, text1.length() - commonlength); - text2 = text2.substring(0, text2.length() - commonlength); - - // Compute the diff on the middle block. - diffs = diff_compute(text1, text2, checklines, deadline); - - // Restore the prefix and suffix. 
- if (commonprefix.length() != 0) { - diffs.addFirst(new Diff(Operation.EQUAL, commonprefix)); - } - if (commonsuffix.length() != 0) { - diffs.addLast(new Diff(Operation.EQUAL, commonsuffix)); - } - - diff_cleanupMerge(diffs); - return diffs; - } - - /** - * Find the differences between two texts. Assumes that the texts do not - * have any common prefix or suffix. - * @param text1 Old string to be diffed. - * @param text2 New string to be diffed. - * @param checklines Speedup flag. If false, then don't run a - * line-level diff first to identify the changed areas. - * If true, then run a faster slightly less optimal diff. - * @param deadline Time when the diff should be complete by. - * @return Linked List of Diff objects. - */ - private LinkedList diff_compute(String text1, String text2, - boolean checklines, long deadline) { - LinkedList diffs = new LinkedList(); - - if (text1.length() == 0) { - // Just add some text (speedup). - diffs.add(new Diff(Operation.INSERT, text2)); - return diffs; - } - - if (text2.length() == 0) { - // Just delete some text (speedup). - diffs.add(new Diff(Operation.DELETE, text1)); - return diffs; - } - - String longtext = text1.length() > text2.length() ? text1 : text2; - String shorttext = text1.length() > text2.length() ? text2 : text1; - int i = longtext.indexOf(shorttext); - if (i != -1) { - // Shorter text is inside the longer text (speedup). - Operation op = (text1.length() > text2.length()) ? - Operation.DELETE : Operation.INSERT; - diffs.add(new Diff(op, longtext.substring(0, i))); - diffs.add(new Diff(Operation.EQUAL, shorttext)); - diffs.add(new Diff(op, longtext.substring(i + shorttext.length()))); - return diffs; - } - - if (shorttext.length() == 1) { - // Single character string. - // After the previous speedup, the character can't be an equality. - diffs.add(new Diff(Operation.DELETE, text1)); - diffs.add(new Diff(Operation.INSERT, text2)); - return diffs; - } - - // Check to see if the problem can be split in two. 
- String[] hm = diff_halfMatch(text1, text2); - if (hm != null) { - // A half-match was found, sort out the return data. - String text1_a = hm[0]; - String text1_b = hm[1]; - String text2_a = hm[2]; - String text2_b = hm[3]; - String mid_common = hm[4]; - // Send both pairs off for separate processing. - LinkedList diffs_a = diff_main(text1_a, text2_a, - checklines, deadline); - LinkedList diffs_b = diff_main(text1_b, text2_b, - checklines, deadline); - // Merge the results. - diffs = diffs_a; - diffs.add(new Diff(Operation.EQUAL, mid_common)); - diffs.addAll(diffs_b); - return diffs; - } - - if (checklines && text1.length() > 100 && text2.length() > 100) { - return diff_lineMode(text1, text2, deadline); - } - - return diff_bisect(text1, text2, deadline); - } - - /** - * Do a quick line-level diff on both strings, then rediff the parts for - * greater accuracy. - * This speedup can produce non-minimal diffs. - * @param text1 Old string to be diffed. - * @param text2 New string to be diffed. - * @param deadline Time when the diff should be complete by. - * @return Linked List of Diff objects. - */ - private LinkedList diff_lineMode(String text1, String text2, - long deadline) { - // Scan the text on a line-by-line basis first. - LinesToCharsResult a = diff_linesToChars(text1, text2); - text1 = a.chars1; - text2 = a.chars2; - List linearray = a.lineArray; - - LinkedList diffs = diff_main(text1, text2, false, deadline); - - // Convert the diff back to original text. - diff_charsToLines(diffs, linearray); - // Eliminate freak matches (e.g. blank lines) - diff_cleanupSemantic(diffs); - - // Rediff any replacement blocks, this time character-by-character. - // Add a dummy entry at the end. 
- diffs.add(new Diff(Operation.EQUAL, "")); - int count_delete = 0; - int count_insert = 0; - String text_delete = ""; - String text_insert = ""; - ListIterator pointer = diffs.listIterator(); - Diff thisDiff = pointer.next(); - while (thisDiff != null) { - switch (thisDiff.operation) { - case INSERT: - count_insert++; - text_insert += thisDiff.text; - break; - case DELETE: - count_delete++; - text_delete += thisDiff.text; - break; - case EQUAL: - // Upon reaching an equality, check for prior redundancies. - if (count_delete >= 1 && count_insert >= 1) { - // Delete the offending records and add the merged ones. - pointer.previous(); - for (int j = 0; j < count_delete + count_insert; j++) { - pointer.previous(); - pointer.remove(); - } - for (Diff subDiff : diff_main(text_delete, text_insert, false, - deadline)) { - pointer.add(subDiff); - } - } - count_insert = 0; - count_delete = 0; - text_delete = ""; - text_insert = ""; - break; - } - thisDiff = pointer.hasNext() ? pointer.next() : null; - } - diffs.removeLast(); // Remove the dummy entry at the end. - - return diffs; - } - - /** - * Find the 'middle snake' of a diff, split the problem in two - * and return the recursively constructed diff. - * See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. - * @param text1 Old string to be diffed. - * @param text2 New string to be diffed. - * @param deadline Time at which to bail if not yet complete. - * @return LinkedList of Diff objects. - */ - protected LinkedList diff_bisect(String text1, String text2, - long deadline) { - // Cache the text lengths to prevent multiple calls. 
- int text1_length = text1.length(); - int text2_length = text2.length(); - int max_d = (text1_length + text2_length + 1) / 2; - int v_offset = max_d; - int v_length = 2 * max_d; - int[] v1 = new int[v_length]; - int[] v2 = new int[v_length]; - for (int x = 0; x < v_length; x++) { - v1[x] = -1; - v2[x] = -1; - } - v1[v_offset + 1] = 0; - v2[v_offset + 1] = 0; - int delta = text1_length - text2_length; - // If the total number of characters is odd, then the front path will - // collide with the reverse path. - boolean front = (delta % 2 != 0); - // Offsets for start and end of k loop. - // Prevents mapping of space beyond the grid. - int k1start = 0; - int k1end = 0; - int k2start = 0; - int k2end = 0; - for (int d = 0; d < max_d; d++) { - // Bail out if deadline is reached. - if (System.currentTimeMillis() > deadline) { - break; - } - - // Walk the front path one step. - for (int k1 = -d + k1start; k1 <= d - k1end; k1 += 2) { - int k1_offset = v_offset + k1; - int x1; - if (k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1])) { - x1 = v1[k1_offset + 1]; - } else { - x1 = v1[k1_offset - 1] + 1; - } - int y1 = x1 - k1; - while (x1 < text1_length && y1 < text2_length - && text1.charAt(x1) == text2.charAt(y1)) { - x1++; - y1++; - } - v1[k1_offset] = x1; - if (x1 > text1_length) { - // Ran off the right of the graph. - k1end += 2; - } else if (y1 > text2_length) { - // Ran off the bottom of the graph. - k1start += 2; - } else if (front) { - int k2_offset = v_offset + delta - k1; - if (k2_offset >= 0 && k2_offset < v_length && v2[k2_offset] != -1) { - // Mirror x2 onto top-left coordinate system. - int x2 = text1_length - v2[k2_offset]; - if (x1 >= x2) { - // Overlap detected. - return diff_bisectSplit(text1, text2, x1, y1, deadline); - } - } - } - } - - // Walk the reverse path one step. 
- for (int k2 = -d + k2start; k2 <= d - k2end; k2 += 2) { - int k2_offset = v_offset + k2; - int x2; - if (k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1])) { - x2 = v2[k2_offset + 1]; - } else { - x2 = v2[k2_offset - 1] + 1; - } - int y2 = x2 - k2; - while (x2 < text1_length && y2 < text2_length - && text1.charAt(text1_length - x2 - 1) - == text2.charAt(text2_length - y2 - 1)) { - x2++; - y2++; - } - v2[k2_offset] = x2; - if (x2 > text1_length) { - // Ran off the left of the graph. - k2end += 2; - } else if (y2 > text2_length) { - // Ran off the top of the graph. - k2start += 2; - } else if (!front) { - int k1_offset = v_offset + delta - k2; - if (k1_offset >= 0 && k1_offset < v_length && v1[k1_offset] != -1) { - int x1 = v1[k1_offset]; - int y1 = v_offset + x1 - k1_offset; - // Mirror x2 onto top-left coordinate system. - x2 = text1_length - x2; - if (x1 >= x2) { - // Overlap detected. - return diff_bisectSplit(text1, text2, x1, y1, deadline); - } - } - } - } - } - // Diff took too long and hit the deadline or - // number of diffs equals number of characters, no commonality at all. - LinkedList diffs = new LinkedList(); - diffs.add(new Diff(Operation.DELETE, text1)); - diffs.add(new Diff(Operation.INSERT, text2)); - return diffs; - } - - /** - * Given the location of the 'middle snake', split the diff in two parts - * and recurse. - * @param text1 Old string to be diffed. - * @param text2 New string to be diffed. - * @param x Index of split point in text1. - * @param y Index of split point in text2. - * @param deadline Time at which to bail if not yet complete. - * @return LinkedList of Diff objects. - */ - private LinkedList diff_bisectSplit(String text1, String text2, - int x, int y, long deadline) { - String text1a = text1.substring(0, x); - String text2a = text2.substring(0, y); - String text1b = text1.substring(x); - String text2b = text2.substring(y); - - // Compute both diffs serially. 
- LinkedList diffs = diff_main(text1a, text2a, false, deadline); - LinkedList diffsb = diff_main(text1b, text2b, false, deadline); - - diffs.addAll(diffsb); - return diffs; - } - - /** - * Split two texts into a list of strings. Reduce the texts to a string of - * hashes where each Unicode character represents one line. - * @param text1 First string. - * @param text2 Second string. - * @return An object containing the encoded text1, the encoded text2 and - * the List of unique strings. The zeroth element of the List of - * unique strings is intentionally blank. - */ - protected LinesToCharsResult diff_linesToChars(String text1, String text2) { - List lineArray = new ArrayList(); - Map lineHash = new HashMap(); - // e.g. linearray[4] == "Hello\n" - // e.g. linehash.get("Hello\n") == 4 - - // "\x00" is a valid character, but various debuggers don't like it. - // So we'll insert a junk entry to avoid generating a null character. - lineArray.add(""); - - // Allocate 2/3rds of the space for text1, the rest for text2. - String chars1 = diff_linesToCharsMunge(text1, lineArray, lineHash, 40000); - String chars2 = diff_linesToCharsMunge(text2, lineArray, lineHash, 65535); - return new LinesToCharsResult(chars1, chars2, lineArray); - } - - /** - * Split a text into a list of strings. Reduce the texts to a string of - * hashes where each Unicode character represents one line. - * @param text String to encode. - * @param lineArray List of unique strings. - * @param lineHash Map of strings to indices. - * @param maxLines Maximum length of lineArray. - * @return Encoded string. - */ - private String diff_linesToCharsMunge(String text, List lineArray, - Map lineHash, int maxLines) { - int lineStart = 0; - int lineEnd = -1; - String line; - StringBuilder chars = new StringBuilder(); - // Walk the text, pulling out a substring for each line. - // text.split('\n') would would temporarily double our memory footprint. 
- // Modifying text would create many large strings to garbage collect. - while (lineEnd < text.length() - 1) { - lineEnd = text.indexOf('\n', lineStart); - if (lineEnd == -1) { - lineEnd = text.length() - 1; - } - line = text.substring(lineStart, lineEnd + 1); - - if (lineHash.containsKey(line)) { - chars.append(String.valueOf((char) (int) lineHash.get(line))); - } else { - if (lineArray.size() == maxLines) { - // Bail out at 65535 because - // String.valueOf((char) 65536).equals(String.valueOf(((char) 0))) - line = text.substring(lineStart); - lineEnd = text.length(); - } - lineArray.add(line); - lineHash.put(line, lineArray.size() - 1); - chars.append(String.valueOf((char) (lineArray.size() - 1))); - } - lineStart = lineEnd + 1; - } - return chars.toString(); - } - - /** - * Rehydrate the text in a diff from a string of line hashes to real lines of - * text. - * @param diffs List of Diff objects. - * @param lineArray List of unique strings. - */ - protected void diff_charsToLines(List diffs, - List lineArray) { - StringBuilder text; - for (Diff diff : diffs) { - text = new StringBuilder(); - for (int j = 0; j < diff.text.length(); j++) { - text.append(lineArray.get(diff.text.charAt(j))); - } - diff.text = text.toString(); - } - } - - /** - * Determine the common prefix of two strings - * @param text1 First string. - * @param text2 Second string. - * @return The number of characters common to the start of each string. - */ - public int diff_commonPrefix(String text1, String text2) { - // Performance analysis: https://neil.fraser.name/news/2007/10/09/ - int n = Math.min(text1.length(), text2.length()); - for (int i = 0; i < n; i++) { - if (text1.charAt(i) != text2.charAt(i)) { - return i; - } - } - return n; - } - - /** - * Determine the common suffix of two strings - * @param text1 First string. - * @param text2 Second string. - * @return The number of characters common to the end of each string. 
- */ - public int diff_commonSuffix(String text1, String text2) { - // Performance analysis: https://neil.fraser.name/news/2007/10/09/ - int text1_length = text1.length(); - int text2_length = text2.length(); - int n = Math.min(text1_length, text2_length); - for (int i = 1; i <= n; i++) { - if (text1.charAt(text1_length - i) != text2.charAt(text2_length - i)) { - return i - 1; - } - } - return n; - } - - /** - * Determine if the suffix of one string is the prefix of another. - * @param text1 First string. - * @param text2 Second string. - * @return The number of characters common to the end of the first - * string and the start of the second string. - */ - protected int diff_commonOverlap(String text1, String text2) { - // Cache the text lengths to prevent multiple calls. - int text1_length = text1.length(); - int text2_length = text2.length(); - // Eliminate the null case. - if (text1_length == 0 || text2_length == 0) { - return 0; - } - // Truncate the longer string. - if (text1_length > text2_length) { - text1 = text1.substring(text1_length - text2_length); - } else if (text1_length < text2_length) { - text2 = text2.substring(0, text1_length); - } - int text_length = Math.min(text1_length, text2_length); - // Quick check for the worst case. - if (text1.equals(text2)) { - return text_length; - } - - // Start by looking for a single character match - // and increase length until no match is found. - // Performance analysis: https://neil.fraser.name/news/2010/11/04/ - int best = 0; - int length = 1; - while (true) { - String pattern = text1.substring(text_length - length); - int found = text2.indexOf(pattern); - if (found == -1) { - return best; - } - length += found; - if (found == 0 || text1.substring(text_length - length).equals( - text2.substring(0, length))) { - best = length; - length++; - } - } - } - - /** - * Do the two texts share a substring which is at least half the length of - * the longer text? - * This speedup can produce non-minimal diffs. 
- * @param text1 First string. - * @param text2 Second string. - * @return Five element String array, containing the prefix of text1, the - * suffix of text1, the prefix of text2, the suffix of text2 and the - * common middle. Or null if there was no match. - */ - protected String[] diff_halfMatch(String text1, String text2) { - if (Diff_Timeout <= 0) { - // Don't risk returning a non-optimal diff if we have unlimited time. - return null; - } - String longtext = text1.length() > text2.length() ? text1 : text2; - String shorttext = text1.length() > text2.length() ? text2 : text1; - if (longtext.length() < 4 || shorttext.length() * 2 < longtext.length()) { - return null; // Pointless. - } - - // First check if the second quarter is the seed for a half-match. - String[] hm1 = diff_halfMatchI(longtext, shorttext, - (longtext.length() + 3) / 4); - // Check again based on the third quarter. - String[] hm2 = diff_halfMatchI(longtext, shorttext, - (longtext.length() + 1) / 2); - String[] hm; - if (hm1 == null && hm2 == null) { - return null; - } else if (hm2 == null) { - hm = hm1; - } else if (hm1 == null) { - hm = hm2; - } else { - // Both matched. Select the longest. - hm = hm1[4].length() > hm2[4].length() ? hm1 : hm2; - } - - // A half-match was found, sort out the return data. - if (text1.length() > text2.length()) { - return hm; - //return new String[]{hm[0], hm[1], hm[2], hm[3], hm[4]}; - } else { - return new String[]{hm[2], hm[3], hm[0], hm[1], hm[4]}; - } - } - - /** - * Does a substring of shorttext exist within longtext such that the - * substring is at least half the length of longtext? - * @param longtext Longer string. - * @param shorttext Shorter string. - * @param i Start index of quarter length substring within longtext. - * @return Five element String array, containing the prefix of longtext, the - * suffix of longtext, the prefix of shorttext, the suffix of shorttext - * and the common middle. Or null if there was no match. 
- */ - private String[] diff_halfMatchI(String longtext, String shorttext, int i) { - // Start with a 1/4 length substring at position i as a seed. - String seed = longtext.substring(i, i + longtext.length() / 4); - int j = -1; - String best_common = ""; - String best_longtext_a = "", best_longtext_b = ""; - String best_shorttext_a = "", best_shorttext_b = ""; - while ((j = shorttext.indexOf(seed, j + 1)) != -1) { - int prefixLength = diff_commonPrefix(longtext.substring(i), - shorttext.substring(j)); - int suffixLength = diff_commonSuffix(longtext.substring(0, i), - shorttext.substring(0, j)); - if (best_common.length() < suffixLength + prefixLength) { - best_common = shorttext.substring(j - suffixLength, j) - + shorttext.substring(j, j + prefixLength); - best_longtext_a = longtext.substring(0, i - suffixLength); - best_longtext_b = longtext.substring(i + prefixLength); - best_shorttext_a = shorttext.substring(0, j - suffixLength); - best_shorttext_b = shorttext.substring(j + prefixLength); - } - } - if (best_common.length() * 2 >= longtext.length()) { - return new String[]{best_longtext_a, best_longtext_b, - best_shorttext_a, best_shorttext_b, best_common}; - } else { - return null; - } - } - - /** - * Reduce the number of edits by eliminating semantically trivial equalities. - * @param diffs LinkedList of Diff objects. - */ - public void diff_cleanupSemantic(LinkedList diffs) { - if (diffs.isEmpty()) { - return; - } - boolean changes = false; - Deque equalities = new ArrayDeque(); // Double-ended queue of qualities. - String lastEquality = null; // Always equal to equalities.peek().text - ListIterator pointer = diffs.listIterator(); - // Number of characters that changed prior to the equality. - int length_insertions1 = 0; - int length_deletions1 = 0; - // Number of characters that changed after the equality. 
- int length_insertions2 = 0; - int length_deletions2 = 0; - Diff thisDiff = pointer.next(); - while (thisDiff != null) { - if (thisDiff.operation == Operation.EQUAL) { - // Equality found. - equalities.push(thisDiff); - length_insertions1 = length_insertions2; - length_deletions1 = length_deletions2; - length_insertions2 = 0; - length_deletions2 = 0; - lastEquality = thisDiff.text; - } else { - // An insertion or deletion. - if (thisDiff.operation == Operation.INSERT) { - length_insertions2 += thisDiff.text.length(); - } else { - length_deletions2 += thisDiff.text.length(); - } - // Eliminate an equality that is smaller or equal to the edits on both - // sides of it. - if (lastEquality != null && (lastEquality.length() - <= Math.max(length_insertions1, length_deletions1)) - && (lastEquality.length() - <= Math.max(length_insertions2, length_deletions2))) { - //System.out.println("Splitting: '" + lastEquality + "'"); - // Walk back to offending equality. - while (thisDiff != equalities.peek()) { - thisDiff = pointer.previous(); - } - pointer.next(); - - // Replace equality with a delete. - pointer.set(new Diff(Operation.DELETE, lastEquality)); - // Insert a corresponding an insert. - pointer.add(new Diff(Operation.INSERT, lastEquality)); - - equalities.pop(); // Throw away the equality we just deleted. - if (!equalities.isEmpty()) { - // Throw away the previous equality (it needs to be reevaluated). - equalities.pop(); - } - if (equalities.isEmpty()) { - // There are no previous equalities, walk back to the start. - while (pointer.hasPrevious()) { - pointer.previous(); - } - } else { - // There is a safe equality we can fall back to. - thisDiff = equalities.peek(); - while (thisDiff != pointer.previous()) { - // Intentionally empty loop. - } - } - - length_insertions1 = 0; // Reset the counters. - length_insertions2 = 0; - length_deletions1 = 0; - length_deletions2 = 0; - lastEquality = null; - changes = true; - } - } - thisDiff = pointer.hasNext() ? 
pointer.next() : null; - } - - // Normalize the diff. - if (changes) { - diff_cleanupMerge(diffs); - } - diff_cleanupSemanticLossless(diffs); - - // Find any overlaps between deletions and insertions. - // e.g: abcxxxxxxdef - // -> abcxxxdef - // e.g: xxxabcdefxxx - // -> defxxxabc - // Only extract an overlap if it is as big as the edit ahead or behind it. - pointer = diffs.listIterator(); - Diff prevDiff = null; - thisDiff = null; - if (pointer.hasNext()) { - prevDiff = pointer.next(); - if (pointer.hasNext()) { - thisDiff = pointer.next(); - } - } - while (thisDiff != null) { - if (prevDiff.operation == Operation.DELETE && - thisDiff.operation == Operation.INSERT) { - String deletion = prevDiff.text; - String insertion = thisDiff.text; - int overlap_length1 = this.diff_commonOverlap(deletion, insertion); - int overlap_length2 = this.diff_commonOverlap(insertion, deletion); - if (overlap_length1 >= overlap_length2) { - if (overlap_length1 >= deletion.length() / 2.0 || - overlap_length1 >= insertion.length() / 2.0) { - // Overlap found. Insert an equality and trim the surrounding edits. - pointer.previous(); - pointer.add(new Diff(Operation.EQUAL, - insertion.substring(0, overlap_length1))); - prevDiff.text = - deletion.substring(0, deletion.length() - overlap_length1); - thisDiff.text = insertion.substring(overlap_length1); - // pointer.add inserts the element before the cursor, so there is - // no need to step past the new element. - } - } else { - if (overlap_length2 >= deletion.length() / 2.0 || - overlap_length2 >= insertion.length() / 2.0) { - // Reverse overlap found. - // Insert an equality and swap and trim the surrounding edits. 
- pointer.previous(); - pointer.add(new Diff(Operation.EQUAL, - deletion.substring(0, overlap_length2))); - prevDiff.operation = Operation.INSERT; - prevDiff.text = - insertion.substring(0, insertion.length() - overlap_length2); - thisDiff.operation = Operation.DELETE; - thisDiff.text = deletion.substring(overlap_length2); - // pointer.add inserts the element before the cursor, so there is - // no need to step past the new element. - } - } - thisDiff = pointer.hasNext() ? pointer.next() : null; - } - prevDiff = thisDiff; - thisDiff = pointer.hasNext() ? pointer.next() : null; - } - } - - /** - * Look for single edits surrounded on both sides by equalities - * which can be shifted sideways to align the edit to a word boundary. - * e.g: The cat came. -> The cat came. - * @param diffs LinkedList of Diff objects. - */ - public void diff_cleanupSemanticLossless(LinkedList diffs) { - String equality1, edit, equality2; - String commonString; - int commonOffset; - int score, bestScore; - String bestEquality1, bestEdit, bestEquality2; - // Create a new iterator at the start. - ListIterator pointer = diffs.listIterator(); - Diff prevDiff = pointer.hasNext() ? pointer.next() : null; - Diff thisDiff = pointer.hasNext() ? pointer.next() : null; - Diff nextDiff = pointer.hasNext() ? pointer.next() : null; - // Intentionally ignore the first and last element (don't need checking). - while (nextDiff != null) { - if (prevDiff.operation == Operation.EQUAL && - nextDiff.operation == Operation.EQUAL) { - // This is a single edit surrounded by equalities. - equality1 = prevDiff.text; - edit = thisDiff.text; - equality2 = nextDiff.text; - - // First, shift the edit as far left as possible. 
- commonOffset = diff_commonSuffix(equality1, edit); - if (commonOffset != 0) { - commonString = edit.substring(edit.length() - commonOffset); - equality1 = equality1.substring(0, equality1.length() - commonOffset); - edit = commonString + edit.substring(0, edit.length() - commonOffset); - equality2 = commonString + equality2; - } - - // Second, step character by character right, looking for the best fit. - bestEquality1 = equality1; - bestEdit = edit; - bestEquality2 = equality2; - bestScore = diff_cleanupSemanticScore(equality1, edit) - + diff_cleanupSemanticScore(edit, equality2); - while (edit.length() != 0 && equality2.length() != 0 - && edit.charAt(0) == equality2.charAt(0)) { - equality1 += edit.charAt(0); - edit = edit.substring(1) + equality2.charAt(0); - equality2 = equality2.substring(1); - score = diff_cleanupSemanticScore(equality1, edit) - + diff_cleanupSemanticScore(edit, equality2); - // The >= encourages trailing rather than leading whitespace on edits. - if (score >= bestScore) { - bestScore = score; - bestEquality1 = equality1; - bestEdit = edit; - bestEquality2 = equality2; - } - } - - if (!prevDiff.text.equals(bestEquality1)) { - // We have an improvement, save it back to the diff. - if (bestEquality1.length() != 0) { - prevDiff.text = bestEquality1; - } else { - pointer.previous(); // Walk past nextDiff. - pointer.previous(); // Walk past thisDiff. - pointer.previous(); // Walk past prevDiff. - pointer.remove(); // Delete prevDiff. - pointer.next(); // Walk past thisDiff. - pointer.next(); // Walk past nextDiff. - } - thisDiff.text = bestEdit; - if (bestEquality2.length() != 0) { - nextDiff.text = bestEquality2; - } else { - pointer.remove(); // Delete nextDiff. - nextDiff = thisDiff; - thisDiff = prevDiff; - } - } - } - prevDiff = thisDiff; - thisDiff = nextDiff; - nextDiff = pointer.hasNext() ? 
pointer.next() : null; - } - } - - /** - * Given two strings, compute a score representing whether the internal - * boundary falls on logical boundaries. - * Scores range from 6 (best) to 0 (worst). - * @param one First string. - * @param two Second string. - * @return The score. - */ - private int diff_cleanupSemanticScore(String one, String two) { - if (one.length() == 0 || two.length() == 0) { - // Edges are the best. - return 6; - } - - // Each port of this function behaves slightly differently due to - // subtle differences in each language's definition of things like - // 'whitespace'. Since this function's purpose is largely cosmetic, - // the choice has been made to use each language's native features - // rather than force total conformity. - char char1 = one.charAt(one.length() - 1); - char char2 = two.charAt(0); - boolean nonAlphaNumeric1 = !Character.isLetterOrDigit(char1); - boolean nonAlphaNumeric2 = !Character.isLetterOrDigit(char2); - boolean whitespace1 = nonAlphaNumeric1 && Character.isWhitespace(char1); - boolean whitespace2 = nonAlphaNumeric2 && Character.isWhitespace(char2); - boolean lineBreak1 = whitespace1 - && Character.getType(char1) == Character.CONTROL; - boolean lineBreak2 = whitespace2 - && Character.getType(char2) == Character.CONTROL; - boolean blankLine1 = lineBreak1 && BLANKLINEEND.matcher(one).find(); - boolean blankLine2 = lineBreak2 && BLANKLINESTART.matcher(two).find(); - - if (blankLine1 || blankLine2) { - // Five points for blank lines. - return 5; - } else if (lineBreak1 || lineBreak2) { - // Four points for line breaks. - return 4; - } else if (nonAlphaNumeric1 && !whitespace1 && whitespace2) { - // Three points for end of sentences. - return 3; - } else if (whitespace1 || whitespace2) { - // Two points for whitespace. - return 2; - } else if (nonAlphaNumeric1 || nonAlphaNumeric2) { - // One point for non-alphanumeric. - return 1; - } - return 0; - } - - // Define some regex patterns for matching boundaries. 
- private Pattern BLANKLINEEND - = Pattern.compile("\\n\\r?\\n\\Z", Pattern.DOTALL); - private Pattern BLANKLINESTART - = Pattern.compile("\\A\\r?\\n\\r?\\n", Pattern.DOTALL); - - /** - * Reduce the number of edits by eliminating operationally trivial equalities. - * @param diffs LinkedList of Diff objects. - */ - public void diff_cleanupEfficiency(LinkedList diffs) { - if (diffs.isEmpty()) { - return; - } - boolean changes = false; - Deque equalities = new ArrayDeque(); // Double-ended queue of equalities. - String lastEquality = null; // Always equal to equalities.peek().text - ListIterator pointer = diffs.listIterator(); - // Is there an insertion operation before the last equality. - boolean pre_ins = false; - // Is there a deletion operation before the last equality. - boolean pre_del = false; - // Is there an insertion operation after the last equality. - boolean post_ins = false; - // Is there a deletion operation after the last equality. - boolean post_del = false; - Diff thisDiff = pointer.next(); - Diff safeDiff = thisDiff; // The last Diff that is known to be unsplittable. - while (thisDiff != null) { - if (thisDiff.operation == Operation.EQUAL) { - // Equality found. - if (thisDiff.text.length() < Diff_EditCost && (post_ins || post_del)) { - // Candidate found. - equalities.push(thisDiff); - pre_ins = post_ins; - pre_del = post_del; - lastEquality = thisDiff.text; - } else { - // Not a candidate, and can never become one. - equalities.clear(); - lastEquality = null; - safeDiff = thisDiff; - } - post_ins = post_del = false; - } else { - // An insertion or deletion. - if (thisDiff.operation == Operation.DELETE) { - post_del = true; - } else { - post_ins = true; - } - /* - * Five types to be split: - * ABXYCD - * AXCD - * ABXC - * AXCD - * ABXC - */ - if (lastEquality != null - && ((pre_ins && pre_del && post_ins && post_del) - || ((lastEquality.length() < Diff_EditCost / 2) - && ((pre_ins ? 1 : 0) + (pre_del ? 1 : 0) - + (post_ins ? 1 : 0) + (post_del ? 
1 : 0)) == 3))) { - //System.out.println("Splitting: '" + lastEquality + "'"); - // Walk back to offending equality. - while (thisDiff != equalities.peek()) { - thisDiff = pointer.previous(); - } - pointer.next(); - - // Replace equality with a delete. - pointer.set(new Diff(Operation.DELETE, lastEquality)); - // Insert a corresponding an insert. - pointer.add(thisDiff = new Diff(Operation.INSERT, lastEquality)); - - equalities.pop(); // Throw away the equality we just deleted. - lastEquality = null; - if (pre_ins && pre_del) { - // No changes made which could affect previous entry, keep going. - post_ins = post_del = true; - equalities.clear(); - safeDiff = thisDiff; - } else { - if (!equalities.isEmpty()) { - // Throw away the previous equality (it needs to be reevaluated). - equalities.pop(); - } - if (equalities.isEmpty()) { - // There are no previous questionable equalities, - // walk back to the last known safe diff. - thisDiff = safeDiff; - } else { - // There is an equality we can fall back to. - thisDiff = equalities.peek(); - } - while (thisDiff != pointer.previous()) { - // Intentionally empty loop. - } - post_ins = post_del = false; - } - - changes = true; - } - } - thisDiff = pointer.hasNext() ? pointer.next() : null; - } - - if (changes) { - diff_cleanupMerge(diffs); - } - } - - /** - * Reorder and merge like edit sections. Merge equalities. - * Any edit section can move as long as it doesn't cross an equality. - * @param diffs LinkedList of Diff objects. - */ - public void diff_cleanupMerge(LinkedList diffs) { - diffs.add(new Diff(Operation.EQUAL, "")); // Add a dummy entry at the end. 
- ListIterator pointer = diffs.listIterator(); - int count_delete = 0; - int count_insert = 0; - String text_delete = ""; - String text_insert = ""; - Diff thisDiff = pointer.next(); - Diff prevEqual = null; - int commonlength; - while (thisDiff != null) { - switch (thisDiff.operation) { - case INSERT: - count_insert++; - text_insert += thisDiff.text; - prevEqual = null; - break; - case DELETE: - count_delete++; - text_delete += thisDiff.text; - prevEqual = null; - break; - case EQUAL: - if (count_delete + count_insert > 1) { - boolean both_types = count_delete != 0 && count_insert != 0; - // Delete the offending records. - pointer.previous(); // Reverse direction. - while (count_delete-- > 0) { - pointer.previous(); - pointer.remove(); - } - while (count_insert-- > 0) { - pointer.previous(); - pointer.remove(); - } - if (both_types) { - // Factor out any common prefixies. - commonlength = diff_commonPrefix(text_insert, text_delete); - if (commonlength != 0) { - if (pointer.hasPrevious()) { - thisDiff = pointer.previous(); - assert thisDiff.operation == Operation.EQUAL - : "Previous diff should have been an equality."; - thisDiff.text += text_insert.substring(0, commonlength); - pointer.next(); - } else { - pointer.add(new Diff(Operation.EQUAL, - text_insert.substring(0, commonlength))); - } - text_insert = text_insert.substring(commonlength); - text_delete = text_delete.substring(commonlength); - } - // Factor out any common suffixies. - commonlength = diff_commonSuffix(text_insert, text_delete); - if (commonlength != 0) { - thisDiff = pointer.next(); - thisDiff.text = text_insert.substring(text_insert.length() - - commonlength) + thisDiff.text; - text_insert = text_insert.substring(0, text_insert.length() - - commonlength); - text_delete = text_delete.substring(0, text_delete.length() - - commonlength); - pointer.previous(); - } - } - // Insert the merged records. 
- if (text_delete.length() != 0) { - pointer.add(new Diff(Operation.DELETE, text_delete)); - } - if (text_insert.length() != 0) { - pointer.add(new Diff(Operation.INSERT, text_insert)); - } - // Step forward to the equality. - thisDiff = pointer.hasNext() ? pointer.next() : null; - } else if (prevEqual != null) { - // Merge this equality with the previous one. - prevEqual.text += thisDiff.text; - pointer.remove(); - thisDiff = pointer.previous(); - pointer.next(); // Forward direction - } - count_insert = 0; - count_delete = 0; - text_delete = ""; - text_insert = ""; - prevEqual = thisDiff; - break; - } - thisDiff = pointer.hasNext() ? pointer.next() : null; - } - if (diffs.getLast().text.length() == 0) { - diffs.removeLast(); // Remove the dummy entry at the end. - } - - /* - * Second pass: look for single edits surrounded on both sides by equalities - * which can be shifted sideways to eliminate an equality. - * e.g: ABAC -> ABAC - */ - boolean changes = false; - // Create a new iterator at the start. - // (As opposed to walking the current one back.) - pointer = diffs.listIterator(); - Diff prevDiff = pointer.hasNext() ? pointer.next() : null; - thisDiff = pointer.hasNext() ? pointer.next() : null; - Diff nextDiff = pointer.hasNext() ? pointer.next() : null; - // Intentionally ignore the first and last element (don't need checking). - while (nextDiff != null) { - if (prevDiff.operation == Operation.EQUAL && - nextDiff.operation == Operation.EQUAL) { - // This is a single edit surrounded by equalities. - if (thisDiff.text.endsWith(prevDiff.text)) { - // Shift the edit over the previous equality. - thisDiff.text = prevDiff.text - + thisDiff.text.substring(0, thisDiff.text.length() - - prevDiff.text.length()); - nextDiff.text = prevDiff.text + nextDiff.text; - pointer.previous(); // Walk past nextDiff. - pointer.previous(); // Walk past thisDiff. - pointer.previous(); // Walk past prevDiff. - pointer.remove(); // Delete prevDiff. 
- pointer.next(); // Walk past thisDiff. - thisDiff = pointer.next(); // Walk past nextDiff. - nextDiff = pointer.hasNext() ? pointer.next() : null; - changes = true; - } else if (thisDiff.text.startsWith(nextDiff.text)) { - // Shift the edit over the next equality. - prevDiff.text += nextDiff.text; - thisDiff.text = thisDiff.text.substring(nextDiff.text.length()) - + nextDiff.text; - pointer.remove(); // Delete nextDiff. - nextDiff = pointer.hasNext() ? pointer.next() : null; - changes = true; - } - } - prevDiff = thisDiff; - thisDiff = nextDiff; - nextDiff = pointer.hasNext() ? pointer.next() : null; - } - // If shifts were made, the diff needs reordering and another shift sweep. - if (changes) { - diff_cleanupMerge(diffs); - } - } - - /** - * loc is a location in text1, compute and return the equivalent location in - * text2. - * e.g. "The cat" vs "The big cat", 1->1, 5->8 - * @param diffs List of Diff objects. - * @param loc Location within text1. - * @return Location within text2. - */ - public int diff_xIndex(List diffs, int loc) { - int chars1 = 0; - int chars2 = 0; - int last_chars1 = 0; - int last_chars2 = 0; - Diff lastDiff = null; - for (Diff aDiff : diffs) { - if (aDiff.operation != Operation.INSERT) { - // Equality or deletion. - chars1 += aDiff.text.length(); - } - if (aDiff.operation != Operation.DELETE) { - // Equality or insertion. - chars2 += aDiff.text.length(); - } - if (chars1 > loc) { - // Overshot the location. - lastDiff = aDiff; - break; - } - last_chars1 = chars1; - last_chars2 = chars2; - } - if (lastDiff != null && lastDiff.operation == Operation.DELETE) { - // The location was deleted. - return last_chars2; - } - // Add the remaining character length. - return last_chars2 + (loc - last_chars1); - } - - /** - * Convert a Diff list into a pretty HTML report. - * @param diffs List of Diff objects. - * @return HTML representation. 
-   */
-  public String diff_prettyHtml(List<Diff> diffs) {
-    StringBuilder html = new StringBuilder();
-    for (Diff aDiff : diffs) {
-      String text = aDiff.text.replace("&", "&amp;").replace("<", "&lt;")
-          .replace(">", "&gt;").replace("\n", "&para;<br>");
-      switch (aDiff.operation) {
-      case INSERT:
-        html.append("<ins style=\"background:#e6ffe6;\">").append(text)
-            .append("</ins>");
-        break;
-      case DELETE:
-        html.append("<del style=\"background:#ffe6e6;\">").append(text)
-            .append("</del>");
-        break;
-      case EQUAL:
-        html.append("<span>").append(text).append("</span>");
-        break;
-      }
-    }
-    return html.toString();
-  }
-
-  /**
-   * Compute and return the source text (all equalities and deletions).
-   * @param diffs List of Diff objects.
-   * @return Source text.
-   */
-  public String diff_text1(List<Diff> diffs) {
-    StringBuilder text = new StringBuilder();
-    for (Diff aDiff : diffs) {
-      if (aDiff.operation != Operation.INSERT) {
-        text.append(aDiff.text);
-      }
-    }
-    return text.toString();
-  }
-
-  /**
-   * Compute and return the destination text (all equalities and insertions).
-   * @param diffs List of Diff objects.
-   * @return Destination text.
-   */
-  public String diff_text2(List<Diff> diffs) {
-    StringBuilder text = new StringBuilder();
-    for (Diff aDiff : diffs) {
-      if (aDiff.operation != Operation.DELETE) {
-        text.append(aDiff.text);
-      }
-    }
-    return text.toString();
-  }
-
-  /**
-   * Compute the Levenshtein distance; the number of inserted, deleted or
-   * substituted characters.
-   * @param diffs List of Diff objects.
-   * @return Number of changes.
-   */
-  public int diff_levenshtein(List<Diff> diffs) {
-    int levenshtein = 0;
-    int insertions = 0;
-    int deletions = 0;
-    for (Diff aDiff : diffs) {
-      switch (aDiff.operation) {
-      case INSERT:
-        insertions += aDiff.text.length();
-        break;
-      case DELETE:
-        deletions += aDiff.text.length();
-        break;
-      case EQUAL:
-        // A deletion and an insertion is one substitution.
-        levenshtein += Math.max(insertions, deletions);
-        insertions = 0;
-        deletions = 0;
-        break;
-      }
-    }
-    levenshtein += Math.max(insertions, deletions);
-    return levenshtein;
-  }
-
-  /**
-   * Crush the diff into an encoded string which describes the operations
-   * required to transform text1 into text2.
-   * E.g. =3\t-2\t+ing  -> Keep 3 chars, delete 2 chars, insert 'ing'.
-   * Operations are tab-separated.
Inserted text is escaped using %xx notation. - * @param diffs List of Diff objects. - * @return Delta text. - */ - public String diff_toDelta(List diffs) { - StringBuilder text = new StringBuilder(); - for (Diff aDiff : diffs) { - switch (aDiff.operation) { - case INSERT: - try { - text.append("+").append(URLEncoder.encode(aDiff.text, "UTF-8") - .replace('+', ' ')).append("\t"); - } catch (UnsupportedEncodingException e) { - // Not likely on modern system. - throw new Error("This system does not support UTF-8.", e); - } - break; - case DELETE: - text.append("-").append(aDiff.text.length()).append("\t"); - break; - case EQUAL: - text.append("=").append(aDiff.text.length()).append("\t"); - break; - } - } - String delta = text.toString(); - if (delta.length() != 0) { - // Strip off trailing tab character. - delta = delta.substring(0, delta.length() - 1); - delta = unescapeForEncodeUriCompatability(delta); - } - return delta; - } - - /** - * Given the original text1, and an encoded string which describes the - * operations required to transform text1 into text2, compute the full diff. - * @param text1 Source string for the diff. - * @param delta Delta text. - * @return Array of Diff objects or null if invalid. - * @throws IllegalArgumentException If invalid input. - */ - public LinkedList diff_fromDelta(String text1, String delta) - throws IllegalArgumentException { - LinkedList diffs = new LinkedList(); - int pointer = 0; // Cursor in text1 - String[] tokens = delta.split("\t"); - for (String token : tokens) { - if (token.length() == 0) { - // Blank tokens are ok (from a trailing \t). - continue; - } - // Each token begins with a one character parameter which specifies the - // operation of this token (delete, insert, equality). 
- String param = token.substring(1); - switch (token.charAt(0)) { - case '+': - // decode would change all "+" to " " - param = param.replace("+", "%2B"); - try { - param = URLDecoder.decode(param, "UTF-8"); - } catch (UnsupportedEncodingException e) { - // Not likely on modern system. - throw new Error("This system does not support UTF-8.", e); - } catch (IllegalArgumentException e) { - // Malformed URI sequence. - throw new IllegalArgumentException( - "Illegal escape in diff_fromDelta: " + param, e); - } - diffs.add(new Diff(Operation.INSERT, param)); - break; - case '-': - // Fall through. - case '=': - int n; - try { - n = Integer.parseInt(param); - } catch (NumberFormatException e) { - throw new IllegalArgumentException( - "Invalid number in diff_fromDelta: " + param, e); - } - if (n < 0) { - throw new IllegalArgumentException( - "Negative number in diff_fromDelta: " + param); - } - String text; - try { - text = text1.substring(pointer, pointer += n); - } catch (StringIndexOutOfBoundsException e) { - throw new IllegalArgumentException("Delta length (" + pointer - + ") larger than source text length (" + text1.length() - + ").", e); - } - if (token.charAt(0) == '=') { - diffs.add(new Diff(Operation.EQUAL, text)); - } else { - diffs.add(new Diff(Operation.DELETE, text)); - } - break; - default: - // Anything else is an error. - throw new IllegalArgumentException( - "Invalid diff operation in diff_fromDelta: " + token.charAt(0)); - } - } - if (pointer != text1.length()) { - throw new IllegalArgumentException("Delta length (" + pointer - + ") smaller than source text length (" + text1.length() + ")."); - } - return diffs; - } - - - // MATCH FUNCTIONS - - - /** - * Locate the best instance of 'pattern' in 'text' near 'loc'. - * Returns -1 if no match found. - * @param text The text to search. - * @param pattern The pattern to search for. - * @param loc The location to search around. - * @return Best match index or -1. 
- */ - public int match_main(String text, String pattern, int loc) { - // Check for null inputs. - if (text == null || pattern == null) { - throw new IllegalArgumentException("Null inputs. (match_main)"); - } - - loc = Math.max(0, Math.min(loc, text.length())); - if (text.equals(pattern)) { - // Shortcut (potentially not guaranteed by the algorithm) - return 0; - } else if (text.length() == 0) { - // Nothing to match. - return -1; - } else if (loc + pattern.length() <= text.length() - && text.substring(loc, loc + pattern.length()).equals(pattern)) { - // Perfect match at the perfect spot! (Includes case of null pattern) - return loc; - } else { - // Do a fuzzy compare. - return match_bitap(text, pattern, loc); - } - } - - /** - * Locate the best instance of 'pattern' in 'text' near 'loc' using the - * Bitap algorithm. Returns -1 if no match found. - * @param text The text to search. - * @param pattern The pattern to search for. - * @param loc The location to search around. - * @return Best match index or -1. - */ - protected int match_bitap(String text, String pattern, int loc) { - assert (Match_MaxBits == 0 || pattern.length() <= Match_MaxBits) - : "Pattern too long for this application."; - - // Initialise the alphabet. - Map s = match_alphabet(pattern); - - // Highest score beyond which we give up. - double score_threshold = Match_Threshold; - // Is there a nearby exact match? (speedup) - int best_loc = text.indexOf(pattern, loc); - if (best_loc != -1) { - score_threshold = Math.min(match_bitapScore(0, best_loc, loc, pattern), - score_threshold); - // What about in the other direction? (speedup) - best_loc = text.lastIndexOf(pattern, loc + pattern.length()); - if (best_loc != -1) { - score_threshold = Math.min(match_bitapScore(0, best_loc, loc, pattern), - score_threshold); - } - } - - // Initialise the bit arrays. 
- int matchmask = 1 << (pattern.length() - 1); - best_loc = -1; - - int bin_min, bin_mid; - int bin_max = pattern.length() + text.length(); - // Empty initialization added to appease Java compiler. - int[] last_rd = new int[0]; - for (int d = 0; d < pattern.length(); d++) { - // Scan for the best match; each iteration allows for one more error. - // Run a binary search to determine how far from 'loc' we can stray at - // this error level. - bin_min = 0; - bin_mid = bin_max; - while (bin_min < bin_mid) { - if (match_bitapScore(d, loc + bin_mid, loc, pattern) - <= score_threshold) { - bin_min = bin_mid; - } else { - bin_max = bin_mid; - } - bin_mid = (bin_max - bin_min) / 2 + bin_min; - } - // Use the result from this iteration as the maximum for the next. - bin_max = bin_mid; - int start = Math.max(1, loc - bin_mid + 1); - int finish = Math.min(loc + bin_mid, text.length()) + pattern.length(); - - int[] rd = new int[finish + 2]; - rd[finish + 1] = (1 << d) - 1; - for (int j = finish; j >= start; j--) { - int charMatch; - if (text.length() <= j - 1 || !s.containsKey(text.charAt(j - 1))) { - // Out of range. - charMatch = 0; - } else { - charMatch = s.get(text.charAt(j - 1)); - } - if (d == 0) { - // First pass: exact match. - rd[j] = ((rd[j + 1] << 1) | 1) & charMatch; - } else { - // Subsequent passes: fuzzy match. - rd[j] = (((rd[j + 1] << 1) | 1) & charMatch) - | (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]; - } - if ((rd[j] & matchmask) != 0) { - double score = match_bitapScore(d, j - 1, loc, pattern); - // This match will almost certainly be better than any existing - // match. But check anyway. - if (score <= score_threshold) { - // Told you so. - score_threshold = score; - best_loc = j - 1; - if (best_loc > loc) { - // When passing loc, don't exceed our current compare from loc. - start = Math.max(1, 2 * loc - best_loc); - } else { - // Already passed loc, downhill from here on in. 
- break; - } - } - } - } - if (match_bitapScore(d + 1, loc, loc, pattern) > score_threshold) { - // No hope for a (better) match at greater error levels. - break; - } - last_rd = rd; - } - return best_loc; - } - - /** - * Compute and return the score for a match with e errors and x location. - * @param e Number of errors in match. - * @param x Location of match. - * @param loc Expected location of match. - * @param pattern Pattern being sought. - * @return Overall score for match (0.0 = good, 1.0 = bad). - */ - private double match_bitapScore(int e, int x, int loc, String pattern) { - float accuracy = (float) e / pattern.length(); - int proximity = Math.abs(loc - x); - if (Match_Distance == 0) { - // Dodge divide by zero error. - return proximity == 0 ? accuracy : 1.0; - } - return accuracy + (proximity / (float) Match_Distance); - } - - /** - * Initialise the alphabet for the Bitap algorithm. - * @param pattern The text to encode. - * @return Hash of character locations. - */ - protected Map match_alphabet(String pattern) { - Map s = new HashMap(); - char[] char_pattern = pattern.toCharArray(); - for (char c : char_pattern) { - s.put(c, 0); - } - int i = 0; - for (char c : char_pattern) { - s.put(c, s.get(c) | (1 << (pattern.length() - i - 1))); - i++; - } - return s; - } - - - // PATCH FUNCTIONS - - - /** - * Increase the context until it is unique, - * but don't let the pattern expand beyond Match_MaxBits. - * @param patch The patch to grow. - * @param text Source text. - */ - protected void patch_addContext(Patch patch, String text) { - if (text.length() == 0) { - return; - } - String pattern = text.substring(patch.start2, patch.start2 + patch.length1); - int padding = 0; - - // Look for the first and last matches of pattern in text. If two different - // matches are found, increase the pattern length. 
- while (text.indexOf(pattern) != text.lastIndexOf(pattern) - && pattern.length() < Match_MaxBits - Patch_Margin - Patch_Margin) { - padding += Patch_Margin; - pattern = text.substring(Math.max(0, patch.start2 - padding), - Math.min(text.length(), patch.start2 + patch.length1 + padding)); - } - // Add one chunk for good luck. - padding += Patch_Margin; - - // Add the prefix. - String prefix = text.substring(Math.max(0, patch.start2 - padding), - patch.start2); - if (prefix.length() != 0) { - patch.diffs.addFirst(new Diff(Operation.EQUAL, prefix)); - } - // Add the suffix. - String suffix = text.substring(patch.start2 + patch.length1, - Math.min(text.length(), patch.start2 + patch.length1 + padding)); - if (suffix.length() != 0) { - patch.diffs.addLast(new Diff(Operation.EQUAL, suffix)); - } - - // Roll back the start points. - patch.start1 -= prefix.length(); - patch.start2 -= prefix.length(); - // Extend the lengths. - patch.length1 += prefix.length() + suffix.length(); - patch.length2 += prefix.length() + suffix.length(); - } - - /** - * Compute a list of patches to turn text1 into text2. - * A set of diffs will be computed. - * @param text1 Old text. - * @param text2 New text. - * @return LinkedList of Patch objects. - */ - public LinkedList patch_make(String text1, String text2) { - if (text1 == null || text2 == null) { - throw new IllegalArgumentException("Null inputs. (patch_make)"); - } - // No diffs provided, compute our own. - LinkedList diffs = diff_main(text1, text2, true); - if (diffs.size() > 2) { - diff_cleanupSemantic(diffs); - diff_cleanupEfficiency(diffs); - } - return patch_make(text1, diffs); - } - - /** - * Compute a list of patches to turn text1 into text2. - * text1 will be derived from the provided diffs. - * @param diffs Array of Diff objects for text1 to text2. - * @return LinkedList of Patch objects. - */ - public LinkedList patch_make(LinkedList diffs) { - if (diffs == null) { - throw new IllegalArgumentException("Null inputs. 
(patch_make)"); - } - // No origin string provided, compute our own. - String text1 = diff_text1(diffs); - return patch_make(text1, diffs); - } - - /** - * Compute a list of patches to turn text1 into text2. - * text2 is ignored, diffs are the delta between text1 and text2. - * @param text1 Old text - * @param text2 Ignored. - * @param diffs Array of Diff objects for text1 to text2. - * @return LinkedList of Patch objects. - * @deprecated Prefer patch_make(String text1, LinkedList diffs). - */ - @Deprecated public LinkedList patch_make(String text1, String text2, - LinkedList diffs) { - return patch_make(text1, diffs); - } - - /** - * Compute a list of patches to turn text1 into text2. - * text2 is not provided, diffs are the delta between text1 and text2. - * @param text1 Old text. - * @param diffs Array of Diff objects for text1 to text2. - * @return LinkedList of Patch objects. - */ - public LinkedList patch_make(String text1, LinkedList diffs) { - if (text1 == null || diffs == null) { - throw new IllegalArgumentException("Null inputs. (patch_make)"); - } - - LinkedList patches = new LinkedList(); - if (diffs.isEmpty()) { - return patches; // Get rid of the null case. - } - Patch patch = new Patch(); - int char_count1 = 0; // Number of characters into the text1 string. - int char_count2 = 0; // Number of characters into the text2 string. - // Start with text1 (prepatch_text) and apply the diffs until we arrive at - // text2 (postpatch_text). We recreate the patches one by one to determine - // context info. - String prepatch_text = text1; - String postpatch_text = text1; - for (Diff aDiff : diffs) { - if (patch.diffs.isEmpty() && aDiff.operation != Operation.EQUAL) { - // A new patch starts here. 
- patch.start1 = char_count1; - patch.start2 = char_count2; - } - - switch (aDiff.operation) { - case INSERT: - patch.diffs.add(aDiff); - patch.length2 += aDiff.text.length(); - postpatch_text = postpatch_text.substring(0, char_count2) - + aDiff.text + postpatch_text.substring(char_count2); - break; - case DELETE: - patch.length1 += aDiff.text.length(); - patch.diffs.add(aDiff); - postpatch_text = postpatch_text.substring(0, char_count2) - + postpatch_text.substring(char_count2 + aDiff.text.length()); - break; - case EQUAL: - if (aDiff.text.length() <= 2 * Patch_Margin - && !patch.diffs.isEmpty() && aDiff != diffs.getLast()) { - // Small equality inside a patch. - patch.diffs.add(aDiff); - patch.length1 += aDiff.text.length(); - patch.length2 += aDiff.text.length(); - } - - if (aDiff.text.length() >= 2 * Patch_Margin && !patch.diffs.isEmpty()) { - // Time for a new patch. - if (!patch.diffs.isEmpty()) { - patch_addContext(patch, prepatch_text); - patches.add(patch); - patch = new Patch(); - // Unlike Unidiff, our patch lists have a rolling context. - // https://github.com/google/diff-match-patch/wiki/Unidiff - // Update prepatch text & pos to reflect the application of the - // just completed patch. - prepatch_text = postpatch_text; - char_count1 = char_count2; - } - } - break; - } - - // Update the current character count. - if (aDiff.operation != Operation.INSERT) { - char_count1 += aDiff.text.length(); - } - if (aDiff.operation != Operation.DELETE) { - char_count2 += aDiff.text.length(); - } - } - // Pick up the leftover patch if not empty. - if (!patch.diffs.isEmpty()) { - patch_addContext(patch, prepatch_text); - patches.add(patch); - } - - return patches; - } - - /** - * Given an array of patches, return another array that is identical. - * @param patches Array of Patch objects. - * @return Array of Patch objects. 
- */ - public LinkedList patch_deepCopy(LinkedList patches) { - LinkedList patchesCopy = new LinkedList(); - for (Patch aPatch : patches) { - Patch patchCopy = new Patch(); - for (Diff aDiff : aPatch.diffs) { - Diff diffCopy = new Diff(aDiff.operation, aDiff.text); - patchCopy.diffs.add(diffCopy); - } - patchCopy.start1 = aPatch.start1; - patchCopy.start2 = aPatch.start2; - patchCopy.length1 = aPatch.length1; - patchCopy.length2 = aPatch.length2; - patchesCopy.add(patchCopy); - } - return patchesCopy; - } - - /** - * Merge a set of patches onto the text. Return a patched text, as well - * as an array of true/false values indicating which patches were applied. - * @param patches Array of Patch objects - * @param text Old text. - * @return Two element Object array, containing the new text and an array of - * boolean values. - */ - public Object[] patch_apply(LinkedList patches, String text) { - if (patches.isEmpty()) { - return new Object[]{text, new boolean[0]}; - } - - // Deep copy the patches so that no changes are made to originals. - patches = patch_deepCopy(patches); - - String nullPadding = patch_addPadding(patches); - text = nullPadding + text + nullPadding; - patch_splitMax(patches); - - int x = 0; - // delta keeps track of the offset between the expected and actual location - // of the previous patch. If there are patches expected at positions 10 and - // 20, but the first patch was found at 12, delta is 2 and the second patch - // has an effective expected position of 22. - int delta = 0; - boolean[] results = new boolean[patches.size()]; - for (Patch aPatch : patches) { - int expected_loc = aPatch.start2 + delta; - String text1 = diff_text1(aPatch.diffs); - int start_loc; - int end_loc = -1; - if (text1.length() > this.Match_MaxBits) { - // patch_splitMax will only provide an oversized pattern in the case of - // a monster delete. 
- start_loc = match_main(text, - text1.substring(0, this.Match_MaxBits), expected_loc); - if (start_loc != -1) { - end_loc = match_main(text, - text1.substring(text1.length() - this.Match_MaxBits), - expected_loc + text1.length() - this.Match_MaxBits); - if (end_loc == -1 || start_loc >= end_loc) { - // Can't find valid trailing context. Drop this patch. - start_loc = -1; - } - } - } else { - start_loc = match_main(text, text1, expected_loc); - } - if (start_loc == -1) { - // No match found. :( - results[x] = false; - // Subtract the delta for this failed patch from subsequent patches. - delta -= aPatch.length2 - aPatch.length1; - } else { - // Found a match. :) - results[x] = true; - delta = start_loc - expected_loc; - String text2; - if (end_loc == -1) { - text2 = text.substring(start_loc, - Math.min(start_loc + text1.length(), text.length())); - } else { - text2 = text.substring(start_loc, - Math.min(end_loc + this.Match_MaxBits, text.length())); - } - if (text1.equals(text2)) { - // Perfect match, just shove the replacement text in. - text = text.substring(0, start_loc) + diff_text2(aPatch.diffs) - + text.substring(start_loc + text1.length()); - } else { - // Imperfect match. Run a diff to get a framework of equivalent - // indices. - LinkedList diffs = diff_main(text1, text2, false); - if (text1.length() > this.Match_MaxBits - && diff_levenshtein(diffs) / (float) text1.length() - > this.Patch_DeleteThreshold) { - // The end points match, but the content is unacceptably bad. 
- results[x] = false; - } else { - diff_cleanupSemanticLossless(diffs); - int index1 = 0; - for (Diff aDiff : aPatch.diffs) { - if (aDiff.operation != Operation.EQUAL) { - int index2 = diff_xIndex(diffs, index1); - if (aDiff.operation == Operation.INSERT) { - // Insertion - text = text.substring(0, start_loc + index2) + aDiff.text - + text.substring(start_loc + index2); - } else if (aDiff.operation == Operation.DELETE) { - // Deletion - text = text.substring(0, start_loc + index2) - + text.substring(start_loc + diff_xIndex(diffs, - index1 + aDiff.text.length())); - } - } - if (aDiff.operation != Operation.DELETE) { - index1 += aDiff.text.length(); - } - } - } - } - } - x++; - } - // Strip the padding off. - text = text.substring(nullPadding.length(), text.length() - - nullPadding.length()); - return new Object[]{text, results}; - } - - /** - * Add some padding on text start and end so that edges can match something. - * Intended to be called only from within patch_apply. - * @param patches Array of Patch objects. - * @return The padding string added to each side. - */ - public String patch_addPadding(LinkedList patches) { - short paddingLength = this.Patch_Margin; - String nullPadding = ""; - for (short x = 1; x <= paddingLength; x++) { - nullPadding += String.valueOf((char) x); - } - - // Bump all the patches forward. - for (Patch aPatch : patches) { - aPatch.start1 += paddingLength; - aPatch.start2 += paddingLength; - } - - // Add some padding on start of first diff. - Patch patch = patches.getFirst(); - LinkedList diffs = patch.diffs; - if (diffs.isEmpty() || diffs.getFirst().operation != Operation.EQUAL) { - // Add nullPadding equality. - diffs.addFirst(new Diff(Operation.EQUAL, nullPadding)); - patch.start1 -= paddingLength; // Should be 0. - patch.start2 -= paddingLength; // Should be 0. - patch.length1 += paddingLength; - patch.length2 += paddingLength; - } else if (paddingLength > diffs.getFirst().text.length()) { - // Grow first equality. 
- Diff firstDiff = diffs.getFirst(); - int extraLength = paddingLength - firstDiff.text.length(); - firstDiff.text = nullPadding.substring(firstDiff.text.length()) - + firstDiff.text; - patch.start1 -= extraLength; - patch.start2 -= extraLength; - patch.length1 += extraLength; - patch.length2 += extraLength; - } - - // Add some padding on end of last diff. - patch = patches.getLast(); - diffs = patch.diffs; - if (diffs.isEmpty() || diffs.getLast().operation != Operation.EQUAL) { - // Add nullPadding equality. - diffs.addLast(new Diff(Operation.EQUAL, nullPadding)); - patch.length1 += paddingLength; - patch.length2 += paddingLength; - } else if (paddingLength > diffs.getLast().text.length()) { - // Grow last equality. - Diff lastDiff = diffs.getLast(); - int extraLength = paddingLength - lastDiff.text.length(); - lastDiff.text += nullPadding.substring(0, extraLength); - patch.length1 += extraLength; - patch.length2 += extraLength; - } - - return nullPadding; - } - - /** - * Look through the patches and break up any which are longer than the - * maximum limit of the match algorithm. - * Intended to be called only from within patch_apply. - * @param patches LinkedList of Patch objects. - */ - public void patch_splitMax(LinkedList patches) { - short patch_size = Match_MaxBits; - String precontext, postcontext; - Patch patch; - int start1, start2; - boolean empty; - Operation diff_type; - String diff_text; - ListIterator pointer = patches.listIterator(); - Patch bigpatch = pointer.hasNext() ? pointer.next() : null; - while (bigpatch != null) { - if (bigpatch.length1 <= Match_MaxBits) { - bigpatch = pointer.hasNext() ? pointer.next() : null; - continue; - } - // Remove the big old patch. - pointer.remove(); - start1 = bigpatch.start1; - start2 = bigpatch.start2; - precontext = ""; - while (!bigpatch.diffs.isEmpty()) { - // Create one of several smaller patches. 
- patch = new Patch(); - empty = true; - patch.start1 = start1 - precontext.length(); - patch.start2 = start2 - precontext.length(); - if (precontext.length() != 0) { - patch.length1 = patch.length2 = precontext.length(); - patch.diffs.add(new Diff(Operation.EQUAL, precontext)); - } - while (!bigpatch.diffs.isEmpty() - && patch.length1 < patch_size - Patch_Margin) { - diff_type = bigpatch.diffs.getFirst().operation; - diff_text = bigpatch.diffs.getFirst().text; - if (diff_type == Operation.INSERT) { - // Insertions are harmless. - patch.length2 += diff_text.length(); - start2 += diff_text.length(); - patch.diffs.addLast(bigpatch.diffs.removeFirst()); - empty = false; - } else if (diff_type == Operation.DELETE && patch.diffs.size() == 1 - && patch.diffs.getFirst().operation == Operation.EQUAL - && diff_text.length() > 2 * patch_size) { - // This is a large deletion. Let it pass in one chunk. - patch.length1 += diff_text.length(); - start1 += diff_text.length(); - empty = false; - patch.diffs.add(new Diff(diff_type, diff_text)); - bigpatch.diffs.removeFirst(); - } else { - // Deletion or equality. Only take as much as we can stomach. - diff_text = diff_text.substring(0, Math.min(diff_text.length(), - patch_size - patch.length1 - Patch_Margin)); - patch.length1 += diff_text.length(); - start1 += diff_text.length(); - if (diff_type == Operation.EQUAL) { - patch.length2 += diff_text.length(); - start2 += diff_text.length(); - } else { - empty = false; - } - patch.diffs.add(new Diff(diff_type, diff_text)); - if (diff_text.equals(bigpatch.diffs.getFirst().text)) { - bigpatch.diffs.removeFirst(); - } else { - bigpatch.diffs.getFirst().text = bigpatch.diffs.getFirst().text - .substring(diff_text.length()); - } - } - } - // Compute the head context for the next patch. - precontext = diff_text2(patch.diffs); - precontext = precontext.substring(Math.max(0, precontext.length() - - Patch_Margin)); - // Append the end context for this patch. 
- if (diff_text1(bigpatch.diffs).length() > Patch_Margin) { - postcontext = diff_text1(bigpatch.diffs).substring(0, Patch_Margin); - } else { - postcontext = diff_text1(bigpatch.diffs); - } - if (postcontext.length() != 0) { - patch.length1 += postcontext.length(); - patch.length2 += postcontext.length(); - if (!patch.diffs.isEmpty() - && patch.diffs.getLast().operation == Operation.EQUAL) { - patch.diffs.getLast().text += postcontext; - } else { - patch.diffs.add(new Diff(Operation.EQUAL, postcontext)); - } - } - if (!empty) { - pointer.add(patch); - } - } - bigpatch = pointer.hasNext() ? pointer.next() : null; - } - } - - /** - * Take a list of patches and return a textual representation. - * @param patches List of Patch objects. - * @return Text representation of patches. - */ - public String patch_toText(List patches) { - StringBuilder text = new StringBuilder(); - for (Patch aPatch : patches) { - text.append(aPatch); - } - return text.toString(); - } - - /** - * Parse a textual representation of patches and return a List of Patch - * objects. - * @param textline Text representation of patches. - * @return List of Patch objects. - * @throws IllegalArgumentException If invalid input. 
- */ - public List patch_fromText(String textline) - throws IllegalArgumentException { - List patches = new LinkedList(); - if (textline.length() == 0) { - return patches; - } - List textList = Arrays.asList(textline.split("\n")); - LinkedList text = new LinkedList(textList); - Patch patch; - Pattern patchHeader - = Pattern.compile("^@@ -(\\d+),?(\\d*) \\+(\\d+),?(\\d*) @@$"); - Matcher m; - char sign; - String line; - while (!text.isEmpty()) { - m = patchHeader.matcher(text.getFirst()); - if (!m.matches()) { - throw new IllegalArgumentException( - "Invalid patch string: " + text.getFirst()); - } - patch = new Patch(); - patches.add(patch); - patch.start1 = Integer.parseInt(m.group(1)); - if (m.group(2).length() == 0) { - patch.start1--; - patch.length1 = 1; - } else if (m.group(2).equals("0")) { - patch.length1 = 0; - } else { - patch.start1--; - patch.length1 = Integer.parseInt(m.group(2)); - } - - patch.start2 = Integer.parseInt(m.group(3)); - if (m.group(4).length() == 0) { - patch.start2--; - patch.length2 = 1; - } else if (m.group(4).equals("0")) { - patch.length2 = 0; - } else { - patch.start2--; - patch.length2 = Integer.parseInt(m.group(4)); - } - text.removeFirst(); - - while (!text.isEmpty()) { - try { - sign = text.getFirst().charAt(0); - } catch (IndexOutOfBoundsException e) { - // Blank line? Whatever. - text.removeFirst(); - continue; - } - line = text.getFirst().substring(1); - line = line.replace("+", "%2B"); // decode would change all "+" to " " - try { - line = URLDecoder.decode(line, "UTF-8"); - } catch (UnsupportedEncodingException e) { - // Not likely on modern system. - throw new Error("This system does not support UTF-8.", e); - } catch (IllegalArgumentException e) { - // Malformed URI sequence. - throw new IllegalArgumentException( - "Illegal escape in patch_fromText: " + line, e); - } - if (sign == '-') { - // Deletion. - patch.diffs.add(new Diff(Operation.DELETE, line)); - } else if (sign == '+') { - // Insertion. 
- patch.diffs.add(new Diff(Operation.INSERT, line)); - } else if (sign == ' ') { - // Minor equality. - patch.diffs.add(new Diff(Operation.EQUAL, line)); - } else if (sign == '@') { - // Start of next patch. - break; - } else { - // WTF? - throw new IllegalArgumentException( - "Invalid patch mode '" + sign + "' in: " + line); - } - text.removeFirst(); - } - } - return patches; - } - - - /** - * Class representing one diff operation. - */ - public static class Diff { - /** - * One of: INSERT, DELETE or EQUAL. - */ - public Operation operation; - /** - * The text associated with this diff operation. - */ - public String text; - - /** - * Constructor. Initializes the diff with the provided values. - * @param operation One of INSERT, DELETE or EQUAL. - * @param text The text being applied. - */ - public Diff(Operation operation, String text) { - // Construct a diff with the specified operation and text. - this.operation = operation; - this.text = text; - } - - /** - * Display a human-readable version of this Diff. - * @return text version. - */ - public String toString() { - String prettyText = this.text.replace('\n', '\u00b6'); - return "Diff(" + this.operation + ",\"" + prettyText + "\")"; - } - - /** - * Create a numeric hash value for a Diff. - * This function is not used by DMP. - * @return Hash value. - */ - @Override - public int hashCode() { - final int prime = 31; - int result = (operation == null) ? 0 : operation.hashCode(); - result += prime * ((text == null) ? 0 : text.hashCode()); - return result; - } - - /** - * Is this Diff equivalent to another Diff? - * @param obj Another Diff to compare against. - * @return true or false. 
- */ - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - Diff other = (Diff) obj; - if (operation != other.operation) { - return false; - } - if (text == null) { - if (other.text != null) { - return false; - } - } else if (!text.equals(other.text)) { - return false; - } - return true; - } - } - - - /** - * Class representing one patch operation. - */ - public static class Patch { - public LinkedList diffs; - public int start1; - public int start2; - public int length1; - public int length2; - - /** - * Constructor. Initializes with an empty list of diffs. - */ - public Patch() { - this.diffs = new LinkedList(); - } - - /** - * Emulate GNU diff's format. - * Header: @@ -382,8 +481,9 @@ - * Indices are printed as 1-based, not 0-based. - * @return The GNU diff string. - */ - public String toString() { - String coords1, coords2; - if (this.length1 == 0) { - coords1 = this.start1 + ",0"; - } else if (this.length1 == 1) { - coords1 = Integer.toString(this.start1 + 1); - } else { - coords1 = (this.start1 + 1) + "," + this.length1; - } - if (this.length2 == 0) { - coords2 = this.start2 + ",0"; - } else if (this.length2 == 1) { - coords2 = Integer.toString(this.start2 + 1); - } else { - coords2 = (this.start2 + 1) + "," + this.length2; - } - StringBuilder text = new StringBuilder(); - text.append("@@ -").append(coords1).append(" +").append(coords2) - .append(" @@\n"); - // Escape the body of the patch with %xx notation. - for (Diff aDiff : this.diffs) { - switch (aDiff.operation) { - case INSERT: - text.append('+'); - break; - case DELETE: - text.append('-'); - break; - case EQUAL: - text.append(' '); - break; - } - try { - text.append(URLEncoder.encode(aDiff.text, "UTF-8").replace('+', ' ')) - .append("\n"); - } catch (UnsupportedEncodingException e) { - // Not likely on modern system. 
- throw new Error("This system does not support UTF-8.", e); - } - } - return unescapeForEncodeUriCompatability(text.toString()); - } - } - - /** - * Unescape selected chars for compatability with JavaScript's encodeURI. - * In speed critical applications this could be dropped since the - * receiving application will certainly decode these fine. - * Note that this function is case-sensitive. Thus "%3f" would not be - * unescaped. But this is ok because it is only called with the output of - * URLEncoder.encode which returns uppercase hex. - * - * Example: "%3F" -> "?", "%24" -> "$", etc. - * - * @param str The string to escape. - * @return The escaped string. - */ - private static String unescapeForEncodeUriCompatability(String str) { - return str.replace("%21", "!").replace("%7E", "~") - .replace("%27", "'").replace("%28", "(").replace("%29", ")") - .replace("%3B", ";").replace("%2F", "/").replace("%3F", "?") - .replace("%3A", ":").replace("%40", "@").replace("%26", "&") - .replace("%3D", "=").replace("%2B", "+").replace("%24", "$") - .replace("%2C", ",").replace("%23", "#"); - } + // Defaults. + // Set these on your diff_match_patch instance to override the defaults. + + /** + * Number of seconds to map a diff before giving up (0 for infinity). + */ + public float Diff_Timeout = 1.0f; + /** + * Cost of an empty edit operation in terms of edit characters. + */ + public short Diff_EditCost = 4; + /** + * At what point is no match declared (0.0 = perfection, 1.0 = very loose). + */ + public float Match_Threshold = 0.5f; + /** + * How far to search for a match (0 = exact location, 1000+ = broad match). + * A match this many characters away from the expected location will add + * 1.0 to the score (0.0 is a perfect match). + */ + public int Match_Distance = 1000; + /** + * When deleting a large block of text (over ~64 characters), how close do + * the contents have to be to match the expected contents. (0.0 = perfection, + * 1.0 = very loose). 
Note that Match_Threshold controls how closely the + * end points of a delete need to match. + */ + public float Patch_DeleteThreshold = 0.5f; + /** + * Chunk size for context length. + */ + public short Patch_Margin = 4; + + /** + * The number of bits in an int. + */ + private short Match_MaxBits = 32; + + /** + * Internal class for returning results from diff_linesToChars(). + * Other less paranoid languages just use a three-element array. + */ + protected static class LinesToCharsResult { + protected String chars1; + protected String chars2; + protected List lineArray; + + protected LinesToCharsResult(String chars1, String chars2, + List lineArray) { + this.chars1 = chars1; + this.chars2 = chars2; + this.lineArray = lineArray; + } + } + + // DIFF FUNCTIONS + + /** + * The data structure representing a diff is a Linked list of Diff objects: + * {Diff(Operation.DELETE, "Hello"), Diff(Operation.INSERT, "Goodbye"), + * Diff(Operation.EQUAL, " world.")} + * which means: delete "Hello", add "Goodbye" and keep " world." + */ + public enum Operation { + DELETE, INSERT, EQUAL + } + + /** + * Find the differences between two texts. + * Run a faster, slightly less optimal diff. + * This method allows the 'checklines' of diff_main() to be optional. + * Most of the time checklines is wanted, so default to true. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @return Linked List of Diff objects. + */ + public LinkedList diff_main(String text1, String text2) { + return diff_main(text1, text2, true); + } + + /** + * Find the differences between two texts. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param checklines Speedup flag. If false, then don't run a + * line-level diff first to identify the changed areas. + * If true, then run a faster slightly less optimal diff. + * @return Linked List of Diff objects. 
+ */ + public LinkedList diff_main(String text1, String text2, + boolean checklines) { + // Set a deadline by which time the diff must be complete. + long deadline; + if (Diff_Timeout <= 0) { + deadline = Long.MAX_VALUE; + } else { + deadline = System.currentTimeMillis() + (long) (Diff_Timeout * 1000); + } + return diff_main(text1, text2, checklines, deadline); + } + + /** + * Find the differences between two texts. Simplifies the problem by + * stripping any common prefix or suffix off the texts before diffing. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param checklines Speedup flag. If false, then don't run a + * line-level diff first to identify the changed areas. + * If true, then run a faster slightly less optimal diff. + * @param deadline Time when the diff should be complete by. Used + * internally for recursive calls. Users should set DiffTimeout instead. + * @return Linked List of Diff objects. + */ + private LinkedList diff_main(String text1, String text2, + boolean checklines, long deadline) { + // Check for null inputs. + if (text1 == null || text2 == null) { + throw new IllegalArgumentException("Null inputs. (diff_main)"); + } + + // Check for equality (speedup). + LinkedList diffs; + if (text1.equals(text2)) { + diffs = new LinkedList(); + if (text1.length() != 0) { + diffs.add(new Diff(Operation.EQUAL, text1)); + } + return diffs; + } + + // Trim off common prefix (speedup). + int commonlength = diff_commonPrefix(text1, text2); + String commonprefix = text1.substring(0, commonlength); + text1 = text1.substring(commonlength); + text2 = text2.substring(commonlength); + + // Trim off common suffix (speedup). + commonlength = diff_commonSuffix(text1, text2); + String commonsuffix = text1.substring(text1.length() - commonlength); + text1 = text1.substring(0, text1.length() - commonlength); + text2 = text2.substring(0, text2.length() - commonlength); + + // Compute the diff on the middle block. 
+ diffs = diff_compute(text1, text2, checklines, deadline); + + // Restore the prefix and suffix. + if (commonprefix.length() != 0) { + diffs.addFirst(new Diff(Operation.EQUAL, commonprefix)); + } + if (commonsuffix.length() != 0) { + diffs.addLast(new Diff(Operation.EQUAL, commonsuffix)); + } + + diff_cleanupMerge(diffs); + return diffs; + } + + /** + * Find the differences between two texts. Assumes that the texts do not + * have any common prefix or suffix. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param checklines Speedup flag. If false, then don't run a + * line-level diff first to identify the changed areas. + * If true, then run a faster slightly less optimal diff. + * @param deadline Time when the diff should be complete by. + * @return Linked List of Diff objects. + */ + private LinkedList diff_compute(String text1, String text2, + boolean checklines, long deadline) { + LinkedList diffs = new LinkedList(); + + if (text1.length() == 0) { + // Just add some text (speedup). + diffs.add(new Diff(Operation.INSERT, text2)); + return diffs; + } + + if (text2.length() == 0) { + // Just delete some text (speedup). + diffs.add(new Diff(Operation.DELETE, text1)); + return diffs; + } + + String longtext = text1.length() > text2.length() ? text1 : text2; + String shorttext = text1.length() > text2.length() ? text2 : text1; + int i = longtext.indexOf(shorttext); + if (i != -1) { + // Shorter text is inside the longer text (speedup). + Operation op = (text1.length() > text2.length()) ? Operation.DELETE : Operation.INSERT; + diffs.add(new Diff(op, longtext.substring(0, i))); + diffs.add(new Diff(Operation.EQUAL, shorttext)); + diffs.add(new Diff(op, longtext.substring(i + shorttext.length()))); + return diffs; + } + + if (shorttext.length() == 1) { + // Single character string. + // After the previous speedup, the character can't be an equality. 
+ diffs.add(new Diff(Operation.DELETE, text1)); + diffs.add(new Diff(Operation.INSERT, text2)); + return diffs; + } + + // Check to see if the problem can be split in two. + String[] hm = diff_halfMatch(text1, text2); + if (hm != null) { + // A half-match was found, sort out the return data. + String text1_a = hm[0]; + String text1_b = hm[1]; + String text2_a = hm[2]; + String text2_b = hm[3]; + String mid_common = hm[4]; + // Send both pairs off for separate processing. + LinkedList diffs_a = diff_main( + text1_a, text2_a, + checklines, deadline); + LinkedList diffs_b = diff_main( + text1_b, text2_b, + checklines, deadline); + // Merge the results. + diffs = diffs_a; + diffs.add(new Diff(Operation.EQUAL, mid_common)); + diffs.addAll(diffs_b); + return diffs; + } + + if (checklines && text1.length() > 100 && text2.length() > 100) { + return diff_lineMode(text1, text2, deadline); + } + + return diff_bisect(text1, text2, deadline); + } + + /** + * Do a quick line-level diff on both strings, then rediff the parts for + * greater accuracy. + * This speedup can produce non-minimal diffs. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param deadline Time when the diff should be complete by. + * @return Linked List of Diff objects. + */ + private LinkedList diff_lineMode(String text1, String text2, + long deadline) { + // Scan the text on a line-by-line basis first. + LinesToCharsResult a = diff_linesToChars(text1, text2); + text1 = a.chars1; + text2 = a.chars2; + List linearray = a.lineArray; + + LinkedList diffs = diff_main(text1, text2, false, deadline); + + // Convert the diff back to original text. + diff_charsToLines(diffs, linearray); + // Eliminate freak matches (e.g. blank lines) + diff_cleanupSemantic(diffs); + + // Rediff any replacement blocks, this time character-by-character. + // Add a dummy entry at the end. 
+ diffs.add(new Diff(Operation.EQUAL, "")); + int count_delete = 0; + int count_insert = 0; + String text_delete = ""; + String text_insert = ""; + ListIterator pointer = diffs.listIterator(); + Diff thisDiff = pointer.next(); + while (thisDiff != null) { + switch (thisDiff.operation) { + case INSERT: + count_insert++; + text_insert += thisDiff.text; + break; + case DELETE: + count_delete++; + text_delete += thisDiff.text; + break; + case EQUAL: + // Upon reaching an equality, check for prior redundancies. + if (count_delete >= 1 && count_insert >= 1) { + // Delete the offending records and add the merged ones. + pointer.previous(); + for (int j = 0; j < count_delete + count_insert; j++) { + pointer.previous(); + pointer.remove(); + } + for (Diff subDiff : diff_main( + text_delete, text_insert, false, + deadline)) { + pointer.add(subDiff); + } + } + count_insert = 0; + count_delete = 0; + text_delete = ""; + text_insert = ""; + break; + } + thisDiff = pointer.hasNext() ? pointer.next() : null; + } + diffs.removeLast(); // Remove the dummy entry at the end. + + return diffs; + } + + /** + * Find the 'middle snake' of a diff, split the problem in two + * and return the recursively constructed diff. + * See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param deadline Time at which to bail if not yet complete. + * @return LinkedList of Diff objects. + */ + protected LinkedList diff_bisect(String text1, String text2, + long deadline) { + // Cache the text lengths to prevent multiple calls. 
+ int text1_length = text1.length(); + int text2_length = text2.length(); + int max_d = (text1_length + text2_length + 1) / 2; + int v_offset = max_d; + int v_length = 2 * max_d; + int[] v1 = new int[v_length]; + int[] v2 = new int[v_length]; + for (int x = 0; x < v_length; x++) { + v1[x] = -1; + v2[x] = -1; + } + v1[v_offset + 1] = 0; + v2[v_offset + 1] = 0; + int delta = text1_length - text2_length; + // If the total number of characters is odd, then the front path will + // collide with the reverse path. + boolean front = (delta % 2 != 0); + // Offsets for start and end of k loop. + // Prevents mapping of space beyond the grid. + int k1start = 0; + int k1end = 0; + int k2start = 0; + int k2end = 0; + for (int d = 0; d < max_d; d++) { + // Bail out if deadline is reached. + if (System.currentTimeMillis() > deadline) { + break; + } + + // Walk the front path one step. + for (int k1 = -d + k1start; k1 <= d - k1end; k1 += 2) { + int k1_offset = v_offset + k1; + int x1; + if (k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1])) { + x1 = v1[k1_offset + 1]; + } else { + x1 = v1[k1_offset - 1] + 1; + } + int y1 = x1 - k1; + while (x1 < text1_length && y1 < text2_length + && text1.charAt(x1) == text2.charAt(y1)) { + x1++; + y1++; + } + v1[k1_offset] = x1; + if (x1 > text1_length) { + // Ran off the right of the graph. + k1end += 2; + } else if (y1 > text2_length) { + // Ran off the bottom of the graph. + k1start += 2; + } else if (front) { + int k2_offset = v_offset + delta - k1; + if (k2_offset >= 0 && k2_offset < v_length && v2[k2_offset] != -1) { + // Mirror x2 onto top-left coordinate system. + int x2 = text1_length - v2[k2_offset]; + if (x1 >= x2) { + // Overlap detected. + return diff_bisectSplit(text1, text2, x1, y1, deadline); + } + } + } + } + + // Walk the reverse path one step. 
+ for (int k2 = -d + k2start; k2 <= d - k2end; k2 += 2) { + int k2_offset = v_offset + k2; + int x2; + if (k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1])) { + x2 = v2[k2_offset + 1]; + } else { + x2 = v2[k2_offset - 1] + 1; + } + int y2 = x2 - k2; + while (x2 < text1_length && y2 < text2_length + && text1.charAt(text1_length - x2 - 1) == text2.charAt(text2_length - y2 - 1)) { + x2++; + y2++; + } + v2[k2_offset] = x2; + if (x2 > text1_length) { + // Ran off the left of the graph. + k2end += 2; + } else if (y2 > text2_length) { + // Ran off the top of the graph. + k2start += 2; + } else if (!front) { + int k1_offset = v_offset + delta - k2; + if (k1_offset >= 0 && k1_offset < v_length && v1[k1_offset] != -1) { + int x1 = v1[k1_offset]; + int y1 = v_offset + x1 - k1_offset; + // Mirror x2 onto top-left coordinate system. + x2 = text1_length - x2; + if (x1 >= x2) { + // Overlap detected. + return diff_bisectSplit(text1, text2, x1, y1, deadline); + } + } + } + } + } + // Diff took too long and hit the deadline or + // number of diffs equals number of characters, no commonality at all. + LinkedList diffs = new LinkedList(); + diffs.add(new Diff(Operation.DELETE, text1)); + diffs.add(new Diff(Operation.INSERT, text2)); + return diffs; + } + + /** + * Given the location of the 'middle snake', split the diff in two parts + * and recurse. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param x Index of split point in text1. + * @param y Index of split point in text2. + * @param deadline Time at which to bail if not yet complete. + * @return LinkedList of Diff objects. + */ + private LinkedList diff_bisectSplit(String text1, String text2, + int x, int y, long deadline) { + String text1a = text1.substring(0, x); + String text2a = text2.substring(0, y); + String text1b = text1.substring(x); + String text2b = text2.substring(y); + + // Compute both diffs serially. 
+ LinkedList diffs = diff_main(text1a, text2a, false, deadline); + LinkedList diffsb = diff_main(text1b, text2b, false, deadline); + + diffs.addAll(diffsb); + return diffs; + } + + /** + * Split two texts into a list of strings. Reduce the texts to a string of + * hashes where each Unicode character represents one line. + * @param text1 First string. + * @param text2 Second string. + * @return An object containing the encoded text1, the encoded text2 and + * the List of unique strings. The zeroth element of the List of + * unique strings is intentionally blank. + */ + protected LinesToCharsResult diff_linesToChars(String text1, String text2) { + List lineArray = new ArrayList(); + Map lineHash = new HashMap(); + // e.g. linearray[4] == "Hello\n" + // e.g. linehash.get("Hello\n") == 4 + + // "\x00" is a valid character, but various debuggers don't like it. + // So we'll insert a junk entry to avoid generating a null character. + lineArray.add(""); + + // Allocate 2/3rds of the space for text1, the rest for text2. + String chars1 = diff_linesToCharsMunge(text1, lineArray, lineHash, 40000); + String chars2 = diff_linesToCharsMunge(text2, lineArray, lineHash, 65535); + return new LinesToCharsResult(chars1, chars2, lineArray); + } + + /** + * Split a text into a list of strings. Reduce the texts to a string of + * hashes where each Unicode character represents one line. + * @param text String to encode. + * @param lineArray List of unique strings. + * @param lineHash Map of strings to indices. + * @param maxLines Maximum length of lineArray. + * @return Encoded string. + */ + private String diff_linesToCharsMunge(String text, List lineArray, + Map lineHash, int maxLines) { + int lineStart = 0; + int lineEnd = -1; + String line; + StringBuilder chars = new StringBuilder(); + // Walk the text, pulling out a substring for each line. + // text.split('\n') would would temporarily double our memory footprint. 
+ // Modifying text would create many large strings to garbage collect. + while (lineEnd < text.length() - 1) { + lineEnd = text.indexOf('\n', lineStart); + if (lineEnd == -1) { + lineEnd = text.length() - 1; + } + line = text.substring(lineStart, lineEnd + 1); + + if (lineHash.containsKey(line)) { + chars.append(String.valueOf((char) (int) lineHash.get(line))); + } else { + if (lineArray.size() == maxLines) { + // Bail out at 65535 because + // String.valueOf((char) 65536).equals(String.valueOf(((char) 0))) + line = text.substring(lineStart); + lineEnd = text.length(); + } + lineArray.add(line); + lineHash.put(line, lineArray.size() - 1); + chars.append(String.valueOf((char) (lineArray.size() - 1))); + } + lineStart = lineEnd + 1; + } + return chars.toString(); + } + + /** + * Rehydrate the text in a diff from a string of line hashes to real lines of + * text. + * @param diffs List of Diff objects. + * @param lineArray List of unique strings. + */ + protected void diff_charsToLines(List diffs, + List lineArray) { + StringBuilder text; + for (Diff diff : diffs) { + text = new StringBuilder(); + for (int j = 0; j < diff.text.length(); j++) { + text.append(lineArray.get(diff.text.charAt(j))); + } + diff.text = text.toString(); + } + } + + /** + * Determine the common prefix of two strings + * @param text1 First string. + * @param text2 Second string. + * @return The number of characters common to the start of each string. + */ + public int diff_commonPrefix(String text1, String text2) { + // Performance analysis: https://neil.fraser.name/news/2007/10/09/ + int n = Math.min(text1.length(), text2.length()); + for (int i = 0; i < n; i++) { + if (text1.charAt(i) != text2.charAt(i)) { + return i; + } + } + return n; + } + + /** + * Determine the common suffix of two strings + * @param text1 First string. + * @param text2 Second string. + * @return The number of characters common to the end of each string. 
+ */ + public int diff_commonSuffix(String text1, String text2) { + // Performance analysis: https://neil.fraser.name/news/2007/10/09/ + int text1_length = text1.length(); + int text2_length = text2.length(); + int n = Math.min(text1_length, text2_length); + for (int i = 1; i <= n; i++) { + if (text1.charAt(text1_length - i) != text2.charAt(text2_length - i)) { + return i - 1; + } + } + return n; + } + + /** + * Determine if the suffix of one string is the prefix of another. + * @param text1 First string. + * @param text2 Second string. + * @return The number of characters common to the end of the first + * string and the start of the second string. + */ + protected int diff_commonOverlap(String text1, String text2) { + // Cache the text lengths to prevent multiple calls. + int text1_length = text1.length(); + int text2_length = text2.length(); + // Eliminate the null case. + if (text1_length == 0 || text2_length == 0) { + return 0; + } + // Truncate the longer string. + if (text1_length > text2_length) { + text1 = text1.substring(text1_length - text2_length); + } else if (text1_length < text2_length) { + text2 = text2.substring(0, text1_length); + } + int text_length = Math.min(text1_length, text2_length); + // Quick check for the worst case. + if (text1.equals(text2)) { + return text_length; + } + + // Start by looking for a single character match + // and increase length until no match is found. + // Performance analysis: https://neil.fraser.name/news/2010/11/04/ + int best = 0; + int length = 1; + while (true) { + String pattern = text1.substring(text_length - length); + int found = text2.indexOf(pattern); + if (found == -1) { + return best; + } + length += found; + if (found == 0 || text1 + .substring(text_length - length) + .equals( + text2.substring(0, length))) { + best = length; + length++; + } + } + } + + /** + * Do the two texts share a substring which is at least half the length of + * the longer text? + * This speedup can produce non-minimal diffs. 
+ * @param text1 First string. + * @param text2 Second string. + * @return Five element String array, containing the prefix of text1, the + * suffix of text1, the prefix of text2, the suffix of text2 and the + * common middle. Or null if there was no match. + */ + protected String[] diff_halfMatch(String text1, String text2) { + if (Diff_Timeout <= 0) { + // Don't risk returning a non-optimal diff if we have unlimited time. + return null; + } + String longtext = text1.length() > text2.length() ? text1 : text2; + String shorttext = text1.length() > text2.length() ? text2 : text1; + if (longtext.length() < 4 || shorttext.length() * 2 < longtext.length()) { + return null; // Pointless. + } + + // First check if the second quarter is the seed for a half-match. + String[] hm1 = diff_halfMatchI( + longtext, shorttext, + (longtext.length() + 3) / 4); + // Check again based on the third quarter. + String[] hm2 = diff_halfMatchI( + longtext, shorttext, + (longtext.length() + 1) / 2); + String[] hm; + if (hm1 == null && hm2 == null) { + return null; + } else if (hm2 == null) { + hm = hm1; + } else if (hm1 == null) { + hm = hm2; + } else { + // Both matched. Select the longest. + hm = hm1[4].length() > hm2[4].length() ? hm1 : hm2; + } + + // A half-match was found, sort out the return data. + if (text1.length() > text2.length()) { + return hm; + // return new String[]{hm[0], hm[1], hm[2], hm[3], hm[4]}; + } else { + return new String[] { + hm[2], hm[3], hm[0], hm[1], hm[4] + }; + } + } + + /** + * Does a substring of shorttext exist within longtext such that the + * substring is at least half the length of longtext? + * @param longtext Longer string. + * @param shorttext Shorter string. + * @param i Start index of quarter length substring within longtext. + * @return Five element String array, containing the prefix of longtext, the + * suffix of longtext, the prefix of shorttext, the suffix of shorttext + * and the common middle. Or null if there was no match. 
+ */ + private String[] diff_halfMatchI(String longtext, String shorttext, int i) { + // Start with a 1/4 length substring at position i as a seed. + String seed = longtext.substring(i, i + longtext.length() / 4); + int j = -1; + String best_common = ""; + String best_longtext_a = "", best_longtext_b = ""; + String best_shorttext_a = "", best_shorttext_b = ""; + while ((j = shorttext.indexOf(seed, j + 1)) != -1) { + int prefixLength = diff_commonPrefix( + longtext.substring(i), + shorttext.substring(j)); + int suffixLength = diff_commonSuffix( + longtext.substring(0, i), + shorttext.substring(0, j)); + if (best_common.length() < suffixLength + prefixLength) { + best_common = shorttext.substring(j - suffixLength, j) + + shorttext.substring(j, j + prefixLength); + best_longtext_a = longtext.substring(0, i - suffixLength); + best_longtext_b = longtext.substring(i + prefixLength); + best_shorttext_a = shorttext.substring(0, j - suffixLength); + best_shorttext_b = shorttext.substring(j + prefixLength); + } + } + if (best_common.length() * 2 >= longtext.length()) { + return new String[] { + best_longtext_a, best_longtext_b, + best_shorttext_a, best_shorttext_b, best_common + }; + } else { + return null; + } + } + + /** + * Reduce the number of edits by eliminating semantically trivial equalities. + * @param diffs LinkedList of Diff objects. + */ + public void diff_cleanupSemantic(LinkedList diffs) { + if (diffs.isEmpty()) { + return; + } + boolean changes = false; + Deque equalities = new ArrayDeque(); // Double-ended queue of qualities. + String lastEquality = null; // Always equal to equalities.peek().text + ListIterator pointer = diffs.listIterator(); + // Number of characters that changed prior to the equality. + int length_insertions1 = 0; + int length_deletions1 = 0; + // Number of characters that changed after the equality. 
+ int length_insertions2 = 0; + int length_deletions2 = 0; + Diff thisDiff = pointer.next(); + while (thisDiff != null) { + if (thisDiff.operation == Operation.EQUAL) { + // Equality found. + equalities.push(thisDiff); + length_insertions1 = length_insertions2; + length_deletions1 = length_deletions2; + length_insertions2 = 0; + length_deletions2 = 0; + lastEquality = thisDiff.text; + } else { + // An insertion or deletion. + if (thisDiff.operation == Operation.INSERT) { + length_insertions2 += thisDiff.text.length(); + } else { + length_deletions2 += thisDiff.text.length(); + } + // Eliminate an equality that is smaller or equal to the edits on both + // sides of it. + if (lastEquality != null && (lastEquality.length() <= Math.max(length_insertions1, length_deletions1)) + && (lastEquality.length() <= Math.max(length_insertions2, length_deletions2))) { + // System.out.println("Splitting: '" + lastEquality + "'"); + // Walk back to offending equality. + while (thisDiff != equalities.peek()) { + thisDiff = pointer.previous(); + } + pointer.next(); + + // Replace equality with a delete. + pointer.set(new Diff(Operation.DELETE, lastEquality)); + // Insert a corresponding an insert. + pointer.add(new Diff(Operation.INSERT, lastEquality)); + + equalities.pop(); // Throw away the equality we just deleted. + if (!equalities.isEmpty()) { + // Throw away the previous equality (it needs to be reevaluated). + equalities.pop(); + } + if (equalities.isEmpty()) { + // There are no previous equalities, walk back to the start. + while (pointer.hasPrevious()) { + pointer.previous(); + } + } else { + // There is a safe equality we can fall back to. + thisDiff = equalities.peek(); + while (thisDiff != pointer.previous()) { + // Intentionally empty loop. + } + } + + length_insertions1 = 0; // Reset the counters. + length_insertions2 = 0; + length_deletions1 = 0; + length_deletions2 = 0; + lastEquality = null; + changes = true; + } + } + thisDiff = pointer.hasNext() ? 
pointer.next() : null; + } + + // Normalize the diff. + if (changes) { + diff_cleanupMerge(diffs); + } + diff_cleanupSemanticLossless(diffs); + + // Find any overlaps between deletions and insertions. + // e.g: abcxxxxxxdef + // -> abcxxxdef + // e.g: xxxabcdefxxx + // -> defxxxabc + // Only extract an overlap if it is as big as the edit ahead or behind it. + pointer = diffs.listIterator(); + Diff prevDiff = null; + thisDiff = null; + if (pointer.hasNext()) { + prevDiff = pointer.next(); + if (pointer.hasNext()) { + thisDiff = pointer.next(); + } + } + while (thisDiff != null) { + if (prevDiff.operation == Operation.DELETE && + thisDiff.operation == Operation.INSERT) { + String deletion = prevDiff.text; + String insertion = thisDiff.text; + int overlap_length1 = this.diff_commonOverlap(deletion, insertion); + int overlap_length2 = this.diff_commonOverlap(insertion, deletion); + if (overlap_length1 >= overlap_length2) { + if (overlap_length1 >= deletion.length() / 2.0 || + overlap_length1 >= insertion.length() / 2.0) { + // Overlap found. Insert an equality and trim the surrounding edits. + pointer.previous(); + pointer + .add( + new Diff(Operation.EQUAL, + insertion.substring(0, overlap_length1))); + prevDiff.text = deletion.substring(0, deletion.length() - overlap_length1); + thisDiff.text = insertion.substring(overlap_length1); + // pointer.add inserts the element before the cursor, so there is + // no need to step past the new element. + } + } else { + if (overlap_length2 >= deletion.length() / 2.0 || + overlap_length2 >= insertion.length() / 2.0) { + // Reverse overlap found. + // Insert an equality and swap and trim the surrounding edits. 
+ pointer.previous(); + pointer + .add( + new Diff(Operation.EQUAL, + deletion.substring(0, overlap_length2))); + prevDiff.operation = Operation.INSERT; + prevDiff.text = insertion.substring(0, insertion.length() - overlap_length2); + thisDiff.operation = Operation.DELETE; + thisDiff.text = deletion.substring(overlap_length2); + // pointer.add inserts the element before the cursor, so there is + // no need to step past the new element. + } + } + thisDiff = pointer.hasNext() ? pointer.next() : null; + } + prevDiff = thisDiff; + thisDiff = pointer.hasNext() ? pointer.next() : null; + } + } + + /** + * Look for single edits surrounded on both sides by equalities + * which can be shifted sideways to align the edit to a word boundary. + * e.g: The cat came. -> The cat came. + * @param diffs LinkedList of Diff objects. + */ + public void diff_cleanupSemanticLossless(LinkedList diffs) { + String equality1, edit, equality2; + String commonString; + int commonOffset; + int score, bestScore; + String bestEquality1, bestEdit, bestEquality2; + // Create a new iterator at the start. + ListIterator pointer = diffs.listIterator(); + Diff prevDiff = pointer.hasNext() ? pointer.next() : null; + Diff thisDiff = pointer.hasNext() ? pointer.next() : null; + Diff nextDiff = pointer.hasNext() ? pointer.next() : null; + // Intentionally ignore the first and last element (don't need checking). + while (nextDiff != null) { + if (prevDiff.operation == Operation.EQUAL && + nextDiff.operation == Operation.EQUAL) { + // This is a single edit surrounded by equalities. + equality1 = prevDiff.text; + edit = thisDiff.text; + equality2 = nextDiff.text; + + // First, shift the edit as far left as possible. 
+ commonOffset = diff_commonSuffix(equality1, edit); + if (commonOffset != 0) { + commonString = edit.substring(edit.length() - commonOffset); + equality1 = equality1.substring(0, equality1.length() - commonOffset); + edit = commonString + edit.substring(0, edit.length() - commonOffset); + equality2 = commonString + equality2; + } + + // Second, step character by character right, looking for the best fit. + bestEquality1 = equality1; + bestEdit = edit; + bestEquality2 = equality2; + bestScore = diff_cleanupSemanticScore(equality1, edit) + + diff_cleanupSemanticScore(edit, equality2); + while (edit.length() != 0 && equality2.length() != 0 + && edit.charAt(0) == equality2.charAt(0)) { + equality1 += edit.charAt(0); + edit = edit.substring(1) + equality2.charAt(0); + equality2 = equality2.substring(1); + score = diff_cleanupSemanticScore(equality1, edit) + + diff_cleanupSemanticScore(edit, equality2); + // The >= encourages trailing rather than leading whitespace on edits. + if (score >= bestScore) { + bestScore = score; + bestEquality1 = equality1; + bestEdit = edit; + bestEquality2 = equality2; + } + } + + if (!prevDiff.text.equals(bestEquality1)) { + // We have an improvement, save it back to the diff. + if (bestEquality1.length() != 0) { + prevDiff.text = bestEquality1; + } else { + pointer.previous(); // Walk past nextDiff. + pointer.previous(); // Walk past thisDiff. + pointer.previous(); // Walk past prevDiff. + pointer.remove(); // Delete prevDiff. + pointer.next(); // Walk past thisDiff. + pointer.next(); // Walk past nextDiff. + } + thisDiff.text = bestEdit; + if (bestEquality2.length() != 0) { + nextDiff.text = bestEquality2; + } else { + pointer.remove(); // Delete nextDiff. + nextDiff = thisDiff; + thisDiff = prevDiff; + } + } + } + prevDiff = thisDiff; + thisDiff = nextDiff; + nextDiff = pointer.hasNext() ? 
pointer.next() : null; + } + } + + /** + * Given two strings, compute a score representing whether the internal + * boundary falls on logical boundaries. + * Scores range from 6 (best) to 0 (worst). + * @param one First string. + * @param two Second string. + * @return The score. + */ + private int diff_cleanupSemanticScore(String one, String two) { + if (one.length() == 0 || two.length() == 0) { + // Edges are the best. + return 6; + } + + // Each port of this function behaves slightly differently due to + // subtle differences in each language's definition of things like + // 'whitespace'. Since this function's purpose is largely cosmetic, + // the choice has been made to use each language's native features + // rather than force total conformity. + char char1 = one.charAt(one.length() - 1); + char char2 = two.charAt(0); + boolean nonAlphaNumeric1 = !Character.isLetterOrDigit(char1); + boolean nonAlphaNumeric2 = !Character.isLetterOrDigit(char2); + boolean whitespace1 = nonAlphaNumeric1 && Character.isWhitespace(char1); + boolean whitespace2 = nonAlphaNumeric2 && Character.isWhitespace(char2); + boolean lineBreak1 = whitespace1 + && Character.getType(char1) == Character.CONTROL; + boolean lineBreak2 = whitespace2 + && Character.getType(char2) == Character.CONTROL; + boolean blankLine1 = lineBreak1 && BLANKLINEEND.matcher(one).find(); + boolean blankLine2 = lineBreak2 && BLANKLINESTART.matcher(two).find(); + + if (blankLine1 || blankLine2) { + // Five points for blank lines. + return 5; + } else if (lineBreak1 || lineBreak2) { + // Four points for line breaks. + return 4; + } else if (nonAlphaNumeric1 && !whitespace1 && whitespace2) { + // Three points for end of sentences. + return 3; + } else if (whitespace1 || whitespace2) { + // Two points for whitespace. + return 2; + } else if (nonAlphaNumeric1 || nonAlphaNumeric2) { + // One point for non-alphanumeric. + return 1; + } + return 0; + } + + // Define some regex patterns for matching boundaries. 
+ private Pattern BLANKLINEEND = Pattern.compile("\\n\\r?\\n\\Z", Pattern.DOTALL); + private Pattern BLANKLINESTART = Pattern.compile("\\A\\r?\\n\\r?\\n", Pattern.DOTALL); + + /** + * Reduce the number of edits by eliminating operationally trivial equalities. + * @param diffs LinkedList of Diff objects. + */ + public void diff_cleanupEfficiency(LinkedList diffs) { + if (diffs.isEmpty()) { + return; + } + boolean changes = false; + Deque equalities = new ArrayDeque(); // Double-ended queue of equalities. + String lastEquality = null; // Always equal to equalities.peek().text + ListIterator pointer = diffs.listIterator(); + // Is there an insertion operation before the last equality. + boolean pre_ins = false; + // Is there a deletion operation before the last equality. + boolean pre_del = false; + // Is there an insertion operation after the last equality. + boolean post_ins = false; + // Is there a deletion operation after the last equality. + boolean post_del = false; + Diff thisDiff = pointer.next(); + Diff safeDiff = thisDiff; // The last Diff that is known to be unsplittable. + while (thisDiff != null) { + if (thisDiff.operation == Operation.EQUAL) { + // Equality found. + if (thisDiff.text.length() < Diff_EditCost && (post_ins || post_del)) { + // Candidate found. + equalities.push(thisDiff); + pre_ins = post_ins; + pre_del = post_del; + lastEquality = thisDiff.text; + } else { + // Not a candidate, and can never become one. + equalities.clear(); + lastEquality = null; + safeDiff = thisDiff; + } + post_ins = post_del = false; + } else { + // An insertion or deletion. + if (thisDiff.operation == Operation.DELETE) { + post_del = true; + } else { + post_ins = true; + } + /* + * Five types to be split: ABXYCD + * AXCD ABXC + * AXCD ABXC + */ + if (lastEquality != null + && ((pre_ins && pre_del && post_ins && post_del) + || ((lastEquality.length() < Diff_EditCost / 2) + && ((pre_ins ? 1 : 0) + (pre_del ? 1 : 0) + + (post_ins ? 1 : 0) + (post_del ? 
1 : 0)) == 3))) { + // System.out.println("Splitting: '" + lastEquality + "'"); + // Walk back to offending equality. + while (thisDiff != equalities.peek()) { + thisDiff = pointer.previous(); + } + pointer.next(); + + // Replace equality with a delete. + pointer.set(new Diff(Operation.DELETE, lastEquality)); + // Insert a corresponding an insert. + pointer.add(thisDiff = new Diff(Operation.INSERT, lastEquality)); + + equalities.pop(); // Throw away the equality we just deleted. + lastEquality = null; + if (pre_ins && pre_del) { + // No changes made which could affect previous entry, keep going. + post_ins = post_del = true; + equalities.clear(); + safeDiff = thisDiff; + } else { + if (!equalities.isEmpty()) { + // Throw away the previous equality (it needs to be reevaluated). + equalities.pop(); + } + if (equalities.isEmpty()) { + // There are no previous questionable equalities, + // walk back to the last known safe diff. + thisDiff = safeDiff; + } else { + // There is an equality we can fall back to. + thisDiff = equalities.peek(); + } + while (thisDiff != pointer.previous()) { + // Intentionally empty loop. + } + post_ins = post_del = false; + } + + changes = true; + } + } + thisDiff = pointer.hasNext() ? pointer.next() : null; + } + + if (changes) { + diff_cleanupMerge(diffs); + } + } + + /** + * Reorder and merge like edit sections. Merge equalities. + * Any edit section can move as long as it doesn't cross an equality. + * @param diffs LinkedList of Diff objects. + */ + public void diff_cleanupMerge(LinkedList diffs) { + diffs.add(new Diff(Operation.EQUAL, "")); // Add a dummy entry at the end. 
+ ListIterator pointer = diffs.listIterator(); + int count_delete = 0; + int count_insert = 0; + String text_delete = ""; + String text_insert = ""; + Diff thisDiff = pointer.next(); + Diff prevEqual = null; + int commonlength; + while (thisDiff != null) { + switch (thisDiff.operation) { + case INSERT: + count_insert++; + text_insert += thisDiff.text; + prevEqual = null; + break; + case DELETE: + count_delete++; + text_delete += thisDiff.text; + prevEqual = null; + break; + case EQUAL: + if (count_delete + count_insert > 1) { + boolean both_types = count_delete != 0 && count_insert != 0; + // Delete the offending records. + pointer.previous(); // Reverse direction. + while (count_delete-- > 0) { + pointer.previous(); + pointer.remove(); + } + while (count_insert-- > 0) { + pointer.previous(); + pointer.remove(); + } + if (both_types) { + // Factor out any common prefixies. + commonlength = diff_commonPrefix(text_insert, text_delete); + if (commonlength != 0) { + if (pointer.hasPrevious()) { + thisDiff = pointer.previous(); + assert thisDiff.operation == Operation.EQUAL : "Previous diff should have been an equality."; + thisDiff.text += text_insert.substring(0, commonlength); + pointer.next(); + } else { + pointer + .add( + new Diff(Operation.EQUAL, + text_insert.substring(0, commonlength))); + } + text_insert = text_insert.substring(commonlength); + text_delete = text_delete.substring(commonlength); + } + // Factor out any common suffixies. + commonlength = diff_commonSuffix(text_insert, text_delete); + if (commonlength != 0) { + thisDiff = pointer.next(); + thisDiff.text = text_insert + .substring( + text_insert.length() + - commonlength) + + thisDiff.text; + text_insert = text_insert + .substring( + 0, text_insert.length() + - commonlength); + text_delete = text_delete + .substring( + 0, text_delete.length() + - commonlength); + pointer.previous(); + } + } + // Insert the merged records. 
+ if (text_delete.length() != 0) { + pointer.add(new Diff(Operation.DELETE, text_delete)); + } + if (text_insert.length() != 0) { + pointer.add(new Diff(Operation.INSERT, text_insert)); + } + // Step forward to the equality. + thisDiff = pointer.hasNext() ? pointer.next() : null; + } else if (prevEqual != null) { + // Merge this equality with the previous one. + prevEqual.text += thisDiff.text; + pointer.remove(); + thisDiff = pointer.previous(); + pointer.next(); // Forward direction + } + count_insert = 0; + count_delete = 0; + text_delete = ""; + text_insert = ""; + prevEqual = thisDiff; + break; + } + thisDiff = pointer.hasNext() ? pointer.next() : null; + } + if (diffs.getLast().text.length() == 0) { + diffs.removeLast(); // Remove the dummy entry at the end. + } + + /* + * Second pass: look for single edits surrounded on both sides by equalities which can be shifted sideways to + * eliminate an equality. e.g: ABAC -> ABAC + */ + boolean changes = false; + // Create a new iterator at the start. + // (As opposed to walking the current one back.) + pointer = diffs.listIterator(); + Diff prevDiff = pointer.hasNext() ? pointer.next() : null; + thisDiff = pointer.hasNext() ? pointer.next() : null; + Diff nextDiff = pointer.hasNext() ? pointer.next() : null; + // Intentionally ignore the first and last element (don't need checking). + while (nextDiff != null) { + if (prevDiff.operation == Operation.EQUAL && + nextDiff.operation == Operation.EQUAL) { + // This is a single edit surrounded by equalities. + if (thisDiff.text.endsWith(prevDiff.text)) { + // Shift the edit over the previous equality. + thisDiff.text = prevDiff.text + + thisDiff.text + .substring( + 0, thisDiff.text.length() + - prevDiff.text.length()); + nextDiff.text = prevDiff.text + nextDiff.text; + pointer.previous(); // Walk past nextDiff. + pointer.previous(); // Walk past thisDiff. + pointer.previous(); // Walk past prevDiff. + pointer.remove(); // Delete prevDiff. 
+ pointer.next(); // Walk past thisDiff. + thisDiff = pointer.next(); // Walk past nextDiff. + nextDiff = pointer.hasNext() ? pointer.next() : null; + changes = true; + } else if (thisDiff.text.startsWith(nextDiff.text)) { + // Shift the edit over the next equality. + prevDiff.text += nextDiff.text; + thisDiff.text = thisDiff.text.substring(nextDiff.text.length()) + + nextDiff.text; + pointer.remove(); // Delete nextDiff. + nextDiff = pointer.hasNext() ? pointer.next() : null; + changes = true; + } + } + prevDiff = thisDiff; + thisDiff = nextDiff; + nextDiff = pointer.hasNext() ? pointer.next() : null; + } + // If shifts were made, the diff needs reordering and another shift sweep. + if (changes) { + diff_cleanupMerge(diffs); + } + } + + /** + * loc is a location in text1, compute and return the equivalent location in + * text2. + * e.g. "The cat" vs "The big cat", 1->1, 5->8 + * @param diffs List of Diff objects. + * @param loc Location within text1. + * @return Location within text2. + */ + public int diff_xIndex(List diffs, int loc) { + int chars1 = 0; + int chars2 = 0; + int last_chars1 = 0; + int last_chars2 = 0; + Diff lastDiff = null; + for (Diff aDiff : diffs) { + if (aDiff.operation != Operation.INSERT) { + // Equality or deletion. + chars1 += aDiff.text.length(); + } + if (aDiff.operation != Operation.DELETE) { + // Equality or insertion. + chars2 += aDiff.text.length(); + } + if (chars1 > loc) { + // Overshot the location. + lastDiff = aDiff; + break; + } + last_chars1 = chars1; + last_chars2 = chars2; + } + if (lastDiff != null && lastDiff.operation == Operation.DELETE) { + // The location was deleted. + return last_chars2; + } + // Add the remaining character length. + return last_chars2 + (loc - last_chars1); + } + + /** + * Convert a Diff list into a pretty HTML report. + * @param diffs List of Diff objects. + * @return HTML representation. 
+ */ + public String diff_prettyHtml(List<Diff> diffs) { + StringBuilder html = new StringBuilder(); + for (Diff aDiff : diffs) { + String text = aDiff.text + .replace("&", "&amp;") + .replace("<", "&lt;") + .replace(">", "&gt;") + .replace("\n", "&para;<br>");
+ switch (aDiff.operation) { + case INSERT: + html + .append("<ins style=\"background:#e6ffe6;\">") + .append(text) + .append("</ins>"); + break; + case DELETE: + html + .append("<del style=\"background:#ffe6e6;\">") + .append(text) + .append("</del>"); + break; + case EQUAL: + html.append("<span>").append(text).append("</span>"); + break; + } + } + return html.toString(); + } + + /** + * Compute and return the source text (all equalities and deletions). + * @param diffs List of Diff objects. + * @return Source text. + */ + public String diff_text1(List<Diff> diffs) { + StringBuilder text = new StringBuilder(); + for (Diff aDiff : diffs) { + if (aDiff.operation != Operation.INSERT) { + text.append(aDiff.text); + } + } + return text.toString(); + } + + /** + * Compute and return the destination text (all equalities and insertions). + * @param diffs List of Diff objects. + * @return Destination text. + */ + public String diff_text2(List<Diff> diffs) { + StringBuilder text = new StringBuilder(); + for (Diff aDiff : diffs) { + if (aDiff.operation != Operation.DELETE) { + text.append(aDiff.text); + } + } + return text.toString(); + } + + /** + * Compute the Levenshtein distance; the number of inserted, deleted or + * substituted characters. + * @param diffs List of Diff objects. + * @return Number of changes. + */ + public int diff_levenshtein(List<Diff> diffs) { + int levenshtein = 0; + int insertions = 0; + int deletions = 0; + for (Diff aDiff : diffs) { + switch (aDiff.operation) { + case INSERT: + insertions += aDiff.text.length(); + break; + case DELETE: + deletions += aDiff.text.length(); + break; + case EQUAL: + // A deletion and an insertion is one substitution. + levenshtein += Math.max(insertions, deletions); + insertions = 0; + deletions = 0; + break; + } + } + levenshtein += Math.max(insertions, deletions); + return levenshtein; + } + + /** + * Crush the diff into an encoded string which describes the operations + * required to transform text1 into text2. + * E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. + * Operations are tab-separated.
Inserted text is escaped using %xx notation. + * @param diffs List of Diff objects. + * @return Delta text. + */ + public String diff_toDelta(List diffs) { + StringBuilder text = new StringBuilder(); + for (Diff aDiff : diffs) { + switch (aDiff.operation) { + case INSERT: + try { + text + .append("+") + .append( + URLEncoder + .encode(aDiff.text, "UTF-8") + .replace('+', ' ')) + .append("\t"); + } catch (UnsupportedEncodingException e) { + // Not likely on modern system. + throw new Error("This system does not support UTF-8.", e); + } + break; + case DELETE: + text.append("-").append(aDiff.text.length()).append("\t"); + break; + case EQUAL: + text.append("=").append(aDiff.text.length()).append("\t"); + break; + } + } + String delta = text.toString(); + if (delta.length() != 0) { + // Strip off trailing tab character. + delta = delta.substring(0, delta.length() - 1); + delta = unescapeForEncodeUriCompatability(delta); + } + return delta; + } + + /** + * Given the original text1, and an encoded string which describes the + * operations required to transform text1 into text2, compute the full diff. + * @param text1 Source string for the diff. + * @param delta Delta text. + * @return Array of Diff objects or null if invalid. + * @throws IllegalArgumentException If invalid input. + */ + public LinkedList diff_fromDelta(String text1, String delta) + throws IllegalArgumentException { + LinkedList diffs = new LinkedList(); + int pointer = 0; // Cursor in text1 + String[] tokens = delta.split("\t"); + for (String token : tokens) { + if (token.length() == 0) { + // Blank tokens are ok (from a trailing \t). + continue; + } + // Each token begins with a one character parameter which specifies the + // operation of this token (delete, insert, equality). 
+ String param = token.substring(1); + switch (token.charAt(0)) { + case '+': + // decode would change all "+" to " " + param = param.replace("+", "%2B"); + try { + param = URLDecoder.decode(param, "UTF-8"); + } catch (UnsupportedEncodingException e) { + // Not likely on modern system. + throw new Error("This system does not support UTF-8.", e); + } catch (IllegalArgumentException e) { + // Malformed URI sequence. + throw new IllegalArgumentException( + "Illegal escape in diff_fromDelta: " + param, e); + } + diffs.add(new Diff(Operation.INSERT, param)); + break; + case '-': + // Fall through. + case '=': + int n; + try { + n = Integer.parseInt(param); + } catch (NumberFormatException e) { + throw new IllegalArgumentException( + "Invalid number in diff_fromDelta: " + param, e); + } + if (n < 0) { + throw new IllegalArgumentException( + "Negative number in diff_fromDelta: " + param); + } + String text; + try { + text = text1.substring(pointer, pointer += n); + } catch (StringIndexOutOfBoundsException e) { + throw new IllegalArgumentException("Delta length (" + pointer + + ") larger than source text length (" + text1.length() + + ").", e); + } + if (token.charAt(0) == '=') { + diffs.add(new Diff(Operation.EQUAL, text)); + } else { + diffs.add(new Diff(Operation.DELETE, text)); + } + break; + default: + // Anything else is an error. + throw new IllegalArgumentException( + "Invalid diff operation in diff_fromDelta: " + token.charAt(0)); + } + } + if (pointer != text1.length()) { + throw new IllegalArgumentException("Delta length (" + pointer + + ") smaller than source text length (" + text1.length() + ")."); + } + return diffs; + } + + // MATCH FUNCTIONS + + /** + * Locate the best instance of 'pattern' in 'text' near 'loc'. + * Returns -1 if no match found. + * @param text The text to search. + * @param pattern The pattern to search for. + * @param loc The location to search around. + * @return Best match index or -1. 
+ */ + public int match_main(String text, String pattern, int loc) { + // Check for null inputs. + if (text == null || pattern == null) { + throw new IllegalArgumentException("Null inputs. (match_main)"); + } + + loc = Math.max(0, Math.min(loc, text.length())); + if (text.equals(pattern)) { + // Shortcut (potentially not guaranteed by the algorithm) + return 0; + } else if (text.length() == 0) { + // Nothing to match. + return -1; + } else if (loc + pattern.length() <= text.length() + && text.substring(loc, loc + pattern.length()).equals(pattern)) { + // Perfect match at the perfect spot! (Includes case of null pattern) + return loc; + } else { + // Do a fuzzy compare. + return match_bitap(text, pattern, loc); + } + } + + /** + * Locate the best instance of 'pattern' in 'text' near 'loc' using the + * Bitap algorithm. Returns -1 if no match found. + * @param text The text to search. + * @param pattern The pattern to search for. + * @param loc The location to search around. + * @return Best match index or -1. + */ + protected int match_bitap(String text, String pattern, int loc) { + assert (Match_MaxBits == 0 || pattern.length() <= Match_MaxBits) : "Pattern too long for this application."; + + // Initialise the alphabet. + Map s = match_alphabet(pattern); + + // Highest score beyond which we give up. + double score_threshold = Match_Threshold; + // Is there a nearby exact match? (speedup) + int best_loc = text.indexOf(pattern, loc); + if (best_loc != -1) { + score_threshold = Math + .min( + match_bitapScore(0, best_loc, loc, pattern), + score_threshold); + // What about in the other direction? (speedup) + best_loc = text.lastIndexOf(pattern, loc + pattern.length()); + if (best_loc != -1) { + score_threshold = Math + .min( + match_bitapScore(0, best_loc, loc, pattern), + score_threshold); + } + } + + // Initialise the bit arrays. 
+ int matchmask = 1 << (pattern.length() - 1); + best_loc = -1; + + int bin_min, bin_mid; + int bin_max = pattern.length() + text.length(); + // Empty initialization added to appease Java compiler. + int[] last_rd = new int[0]; + for (int d = 0; d < pattern.length(); d++) { + // Scan for the best match; each iteration allows for one more error. + // Run a binary search to determine how far from 'loc' we can stray at + // this error level. + bin_min = 0; + bin_mid = bin_max; + while (bin_min < bin_mid) { + if (match_bitapScore(d, loc + bin_mid, loc, pattern) <= score_threshold) { + bin_min = bin_mid; + } else { + bin_max = bin_mid; + } + bin_mid = (bin_max - bin_min) / 2 + bin_min; + } + // Use the result from this iteration as the maximum for the next. + bin_max = bin_mid; + int start = Math.max(1, loc - bin_mid + 1); + int finish = Math.min(loc + bin_mid, text.length()) + pattern.length(); + + int[] rd = new int[finish + 2]; + rd[finish + 1] = (1 << d) - 1; + for (int j = finish; j >= start; j--) { + int charMatch; + if (text.length() <= j - 1 || !s.containsKey(text.charAt(j - 1))) { + // Out of range. + charMatch = 0; + } else { + charMatch = s.get(text.charAt(j - 1)); + } + if (d == 0) { + // First pass: exact match. + rd[j] = ((rd[j + 1] << 1) | 1) & charMatch; + } else { + // Subsequent passes: fuzzy match. + rd[j] = (((rd[j + 1] << 1) | 1) & charMatch) + | (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]; + } + if ((rd[j] & matchmask) != 0) { + double score = match_bitapScore(d, j - 1, loc, pattern); + // This match will almost certainly be better than any existing + // match. But check anyway. + if (score <= score_threshold) { + // Told you so. + score_threshold = score; + best_loc = j - 1; + if (best_loc > loc) { + // When passing loc, don't exceed our current compare from loc. + start = Math.max(1, 2 * loc - best_loc); + } else { + // Already passed loc, downhill from here on in. 
+ break; + } + } + } + } + if (match_bitapScore(d + 1, loc, loc, pattern) > score_threshold) { + // No hope for a (better) match at greater error levels. + break; + } + last_rd = rd; + } + return best_loc; + } + + /** + * Compute and return the score for a match with e errors and x location. + * @param e Number of errors in match. + * @param x Location of match. + * @param loc Expected location of match. + * @param pattern Pattern being sought. + * @return Overall score for match (0.0 = good, 1.0 = bad). + */ + private double match_bitapScore(int e, int x, int loc, String pattern) { + float accuracy = (float) e / pattern.length(); + int proximity = Math.abs(loc - x); + if (Match_Distance == 0) { + // Dodge divide by zero error. + return proximity == 0 ? accuracy : 1.0; + } + return accuracy + (proximity / (float) Match_Distance); + } + + /** + * Initialise the alphabet for the Bitap algorithm. + * @param pattern The text to encode. + * @return Hash of character locations. + */ + protected Map match_alphabet(String pattern) { + Map s = new HashMap(); + char[] char_pattern = pattern.toCharArray(); + for (char c : char_pattern) { + s.put(c, 0); + } + int i = 0; + for (char c : char_pattern) { + s.put(c, s.get(c) | (1 << (pattern.length() - i - 1))); + i++; + } + return s; + } + + // PATCH FUNCTIONS + + /** + * Increase the context until it is unique, + * but don't let the pattern expand beyond Match_MaxBits. + * @param patch The patch to grow. + * @param text Source text. + */ + protected void patch_addContext(Patch patch, String text) { + if (text.length() == 0) { + return; + } + String pattern = text.substring(patch.start2, patch.start2 + patch.length1); + int padding = 0; + + // Look for the first and last matches of pattern in text. If two different + // matches are found, increase the pattern length. 
+ while (text.indexOf(pattern) != text.lastIndexOf(pattern) + && pattern.length() < Match_MaxBits - Patch_Margin - Patch_Margin) { + padding += Patch_Margin; + pattern = text + .substring( + Math.max(0, patch.start2 - padding), + Math.min(text.length(), patch.start2 + patch.length1 + padding)); + } + // Add one chunk for good luck. + padding += Patch_Margin; + + // Add the prefix. + String prefix = text + .substring( + Math.max(0, patch.start2 - padding), + patch.start2); + if (prefix.length() != 0) { + patch.diffs.addFirst(new Diff(Operation.EQUAL, prefix)); + } + // Add the suffix. + String suffix = text + .substring( + patch.start2 + patch.length1, + Math.min(text.length(), patch.start2 + patch.length1 + padding)); + if (suffix.length() != 0) { + patch.diffs.addLast(new Diff(Operation.EQUAL, suffix)); + } + + // Roll back the start points. + patch.start1 -= prefix.length(); + patch.start2 -= prefix.length(); + // Extend the lengths. + patch.length1 += prefix.length() + suffix.length(); + patch.length2 += prefix.length() + suffix.length(); + } + + /** + * Compute a list of patches to turn text1 into text2. + * A set of diffs will be computed. + * @param text1 Old text. + * @param text2 New text. + * @return LinkedList of Patch objects. + */ + public LinkedList patch_make(String text1, String text2) { + if (text1 == null || text2 == null) { + throw new IllegalArgumentException("Null inputs. (patch_make)"); + } + // No diffs provided, compute our own. + LinkedList diffs = diff_main(text1, text2, true); + if (diffs.size() > 2) { + diff_cleanupSemantic(diffs); + diff_cleanupEfficiency(diffs); + } + return patch_make(text1, diffs); + } + + /** + * Compute a list of patches to turn text1 into text2. + * text1 will be derived from the provided diffs. + * @param diffs Array of Diff objects for text1 to text2. + * @return LinkedList of Patch objects. 
+ */ + public LinkedList patch_make(LinkedList diffs) { + if (diffs == null) { + throw new IllegalArgumentException("Null inputs. (patch_make)"); + } + // No origin string provided, compute our own. + String text1 = diff_text1(diffs); + return patch_make(text1, diffs); + } + + /** + * Compute a list of patches to turn text1 into text2. + * text2 is ignored, diffs are the delta between text1 and text2. + * @param text1 Old text + * @param text2 Ignored. + * @param diffs Array of Diff objects for text1 to text2. + * @return LinkedList of Patch objects. + * @deprecated Prefer patch_make(String text1, LinkedList diffs). + */ + @Deprecated + public LinkedList patch_make(String text1, String text2, + LinkedList diffs) { + return patch_make(text1, diffs); + } + + /** + * Compute a list of patches to turn text1 into text2. + * text2 is not provided, diffs are the delta between text1 and text2. + * @param text1 Old text. + * @param diffs Array of Diff objects for text1 to text2. + * @return LinkedList of Patch objects. + */ + public LinkedList patch_make(String text1, LinkedList diffs) { + if (text1 == null || diffs == null) { + throw new IllegalArgumentException("Null inputs. (patch_make)"); + } + + LinkedList patches = new LinkedList(); + if (diffs.isEmpty()) { + return patches; // Get rid of the null case. + } + Patch patch = new Patch(); + int char_count1 = 0; // Number of characters into the text1 string. + int char_count2 = 0; // Number of characters into the text2 string. + // Start with text1 (prepatch_text) and apply the diffs until we arrive at + // text2 (postpatch_text). We recreate the patches one by one to determine + // context info. + String prepatch_text = text1; + String postpatch_text = text1; + for (Diff aDiff : diffs) { + if (patch.diffs.isEmpty() && aDiff.operation != Operation.EQUAL) { + // A new patch starts here. 
+ patch.start1 = char_count1; + patch.start2 = char_count2; + } + + switch (aDiff.operation) { + case INSERT: + patch.diffs.add(aDiff); + patch.length2 += aDiff.text.length(); + postpatch_text = postpatch_text.substring(0, char_count2) + + aDiff.text + postpatch_text.substring(char_count2); + break; + case DELETE: + patch.length1 += aDiff.text.length(); + patch.diffs.add(aDiff); + postpatch_text = postpatch_text.substring(0, char_count2) + + postpatch_text.substring(char_count2 + aDiff.text.length()); + break; + case EQUAL: + if (aDiff.text.length() <= 2 * Patch_Margin + && !patch.diffs.isEmpty() && aDiff != diffs.getLast()) { + // Small equality inside a patch. + patch.diffs.add(aDiff); + patch.length1 += aDiff.text.length(); + patch.length2 += aDiff.text.length(); + } + + if (aDiff.text.length() >= 2 * Patch_Margin && !patch.diffs.isEmpty()) { + // Time for a new patch. + if (!patch.diffs.isEmpty()) { + patch_addContext(patch, prepatch_text); + patches.add(patch); + patch = new Patch(); + // Unlike Unidiff, our patch lists have a rolling context. + // https://github.com/google/diff-match-patch/wiki/Unidiff + // Update prepatch text & pos to reflect the application of the + // just completed patch. + prepatch_text = postpatch_text; + char_count1 = char_count2; + } + } + break; + } + + // Update the current character count. + if (aDiff.operation != Operation.INSERT) { + char_count1 += aDiff.text.length(); + } + if (aDiff.operation != Operation.DELETE) { + char_count2 += aDiff.text.length(); + } + } + // Pick up the leftover patch if not empty. + if (!patch.diffs.isEmpty()) { + patch_addContext(patch, prepatch_text); + patches.add(patch); + } + + return patches; + } + + /** + * Given an array of patches, return another array that is identical. + * @param patches Array of Patch objects. + * @return Array of Patch objects. 
+ */ + public LinkedList patch_deepCopy(LinkedList patches) { + LinkedList patchesCopy = new LinkedList(); + for (Patch aPatch : patches) { + Patch patchCopy = new Patch(); + for (Diff aDiff : aPatch.diffs) { + Diff diffCopy = new Diff(aDiff.operation, aDiff.text); + patchCopy.diffs.add(diffCopy); + } + patchCopy.start1 = aPatch.start1; + patchCopy.start2 = aPatch.start2; + patchCopy.length1 = aPatch.length1; + patchCopy.length2 = aPatch.length2; + patchesCopy.add(patchCopy); + } + return patchesCopy; + } + + /** + * Merge a set of patches onto the text. Return a patched text, as well + * as an array of true/false values indicating which patches were applied. + * @param patches Array of Patch objects + * @param text Old text. + * @return Two element Object array, containing the new text and an array of + * boolean values. + */ + public Object[] patch_apply(LinkedList patches, String text) { + if (patches.isEmpty()) { + return new Object[] { + text, new boolean[0] + }; + } + + // Deep copy the patches so that no changes are made to originals. + patches = patch_deepCopy(patches); + + String nullPadding = patch_addPadding(patches); + text = nullPadding + text + nullPadding; + patch_splitMax(patches); + + int x = 0; + // delta keeps track of the offset between the expected and actual location + // of the previous patch. If there are patches expected at positions 10 and + // 20, but the first patch was found at 12, delta is 2 and the second patch + // has an effective expected position of 22. + int delta = 0; + boolean[] results = new boolean[patches.size()]; + for (Patch aPatch : patches) { + int expected_loc = aPatch.start2 + delta; + String text1 = diff_text1(aPatch.diffs); + int start_loc; + int end_loc = -1; + if (text1.length() > this.Match_MaxBits) { + // patch_splitMax will only provide an oversized pattern in the case of + // a monster delete. 
+ start_loc = match_main( + text, + text1.substring(0, this.Match_MaxBits), expected_loc); + if (start_loc != -1) { + end_loc = match_main( + text, + text1.substring(text1.length() - this.Match_MaxBits), + expected_loc + text1.length() - this.Match_MaxBits); + if (end_loc == -1 || start_loc >= end_loc) { + // Can't find valid trailing context. Drop this patch. + start_loc = -1; + } + } + } else { + start_loc = match_main(text, text1, expected_loc); + } + if (start_loc == -1) { + // No match found. :( + results[x] = false; + // Subtract the delta for this failed patch from subsequent patches. + delta -= aPatch.length2 - aPatch.length1; + } else { + // Found a match. :) + results[x] = true; + delta = start_loc - expected_loc; + String text2; + if (end_loc == -1) { + text2 = text + .substring( + start_loc, + Math.min(start_loc + text1.length(), text.length())); + } else { + text2 = text + .substring( + start_loc, + Math.min(end_loc + this.Match_MaxBits, text.length())); + } + if (text1.equals(text2)) { + // Perfect match, just shove the replacement text in. + text = text.substring(0, start_loc) + diff_text2(aPatch.diffs) + + text.substring(start_loc + text1.length()); + } else { + // Imperfect match. Run a diff to get a framework of equivalent + // indices. + LinkedList diffs = diff_main(text1, text2, false); + if (text1.length() > this.Match_MaxBits + && diff_levenshtein(diffs) / (float) text1.length() > this.Patch_DeleteThreshold) { + // The end points match, but the content is unacceptably bad. 
+ results[x] = false; + } else { + diff_cleanupSemanticLossless(diffs); + int index1 = 0; + for (Diff aDiff : aPatch.diffs) { + if (aDiff.operation != Operation.EQUAL) { + int index2 = diff_xIndex(diffs, index1); + if (aDiff.operation == Operation.INSERT) { + // Insertion + text = text.substring(0, start_loc + index2) + aDiff.text + + text.substring(start_loc + index2); + } else if (aDiff.operation == Operation.DELETE) { + // Deletion + text = text.substring(0, start_loc + index2) + + text + .substring( + start_loc + diff_xIndex( + diffs, + index1 + aDiff.text.length())); + } + } + if (aDiff.operation != Operation.DELETE) { + index1 += aDiff.text.length(); + } + } + } + } + } + x++; + } + // Strip the padding off. + text = text + .substring( + nullPadding.length(), text.length() + - nullPadding.length()); + return new Object[] { + text, results + }; + } + + /** + * Add some padding on text start and end so that edges can match something. + * Intended to be called only from within patch_apply. + * @param patches Array of Patch objects. + * @return The padding string added to each side. + */ + public String patch_addPadding(LinkedList patches) { + short paddingLength = this.Patch_Margin; + String nullPadding = ""; + for (short x = 1; x <= paddingLength; x++) { + nullPadding += String.valueOf((char) x); + } + + // Bump all the patches forward. + for (Patch aPatch : patches) { + aPatch.start1 += paddingLength; + aPatch.start2 += paddingLength; + } + + // Add some padding on start of first diff. + Patch patch = patches.getFirst(); + LinkedList diffs = patch.diffs; + if (diffs.isEmpty() || diffs.getFirst().operation != Operation.EQUAL) { + // Add nullPadding equality. + diffs.addFirst(new Diff(Operation.EQUAL, nullPadding)); + patch.start1 -= paddingLength; // Should be 0. + patch.start2 -= paddingLength; // Should be 0. 
+ patch.length1 += paddingLength; + patch.length2 += paddingLength; + } else if (paddingLength > diffs.getFirst().text.length()) { + // Grow first equality. + Diff firstDiff = diffs.getFirst(); + int extraLength = paddingLength - firstDiff.text.length(); + firstDiff.text = nullPadding.substring(firstDiff.text.length()) + + firstDiff.text; + patch.start1 -= extraLength; + patch.start2 -= extraLength; + patch.length1 += extraLength; + patch.length2 += extraLength; + } + + // Add some padding on end of last diff. + patch = patches.getLast(); + diffs = patch.diffs; + if (diffs.isEmpty() || diffs.getLast().operation != Operation.EQUAL) { + // Add nullPadding equality. + diffs.addLast(new Diff(Operation.EQUAL, nullPadding)); + patch.length1 += paddingLength; + patch.length2 += paddingLength; + } else if (paddingLength > diffs.getLast().text.length()) { + // Grow last equality. + Diff lastDiff = diffs.getLast(); + int extraLength = paddingLength - lastDiff.text.length(); + lastDiff.text += nullPadding.substring(0, extraLength); + patch.length1 += extraLength; + patch.length2 += extraLength; + } + + return nullPadding; + } + + /** + * Look through the patches and break up any which are longer than the + * maximum limit of the match algorithm. + * Intended to be called only from within patch_apply. + * @param patches LinkedList of Patch objects. + */ + public void patch_splitMax(LinkedList patches) { + short patch_size = Match_MaxBits; + String precontext, postcontext; + Patch patch; + int start1, start2; + boolean empty; + Operation diff_type; + String diff_text; + ListIterator pointer = patches.listIterator(); + Patch bigpatch = pointer.hasNext() ? pointer.next() : null; + while (bigpatch != null) { + if (bigpatch.length1 <= Match_MaxBits) { + bigpatch = pointer.hasNext() ? pointer.next() : null; + continue; + } + // Remove the big old patch. 
+ pointer.remove(); + start1 = bigpatch.start1; + start2 = bigpatch.start2; + precontext = ""; + while (!bigpatch.diffs.isEmpty()) { + // Create one of several smaller patches. + patch = new Patch(); + empty = true; + patch.start1 = start1 - precontext.length(); + patch.start2 = start2 - precontext.length(); + if (precontext.length() != 0) { + patch.length1 = patch.length2 = precontext.length(); + patch.diffs.add(new Diff(Operation.EQUAL, precontext)); + } + while (!bigpatch.diffs.isEmpty() + && patch.length1 < patch_size - Patch_Margin) { + diff_type = bigpatch.diffs.getFirst().operation; + diff_text = bigpatch.diffs.getFirst().text; + if (diff_type == Operation.INSERT) { + // Insertions are harmless. + patch.length2 += diff_text.length(); + start2 += diff_text.length(); + patch.diffs.addLast(bigpatch.diffs.removeFirst()); + empty = false; + } else if (diff_type == Operation.DELETE && patch.diffs.size() == 1 + && patch.diffs.getFirst().operation == Operation.EQUAL + && diff_text.length() > 2 * patch_size) { + // This is a large deletion. Let it pass in one chunk. + patch.length1 += diff_text.length(); + start1 += diff_text.length(); + empty = false; + patch.diffs.add(new Diff(diff_type, diff_text)); + bigpatch.diffs.removeFirst(); + } else { + // Deletion or equality. Only take as much as we can stomach. + diff_text = diff_text + .substring( + 0, Math + .min( + diff_text.length(), + patch_size - patch.length1 - Patch_Margin)); + patch.length1 += diff_text.length(); + start1 += diff_text.length(); + if (diff_type == Operation.EQUAL) { + patch.length2 += diff_text.length(); + start2 += diff_text.length(); + } else { + empty = false; + } + patch.diffs.add(new Diff(diff_type, diff_text)); + if (diff_text.equals(bigpatch.diffs.getFirst().text)) { + bigpatch.diffs.removeFirst(); + } else { + bigpatch.diffs.getFirst().text = bigpatch.diffs.getFirst().text + .substring(diff_text.length()); + } + } + } + // Compute the head context for the next patch. 
+ precontext = diff_text2(patch.diffs); + precontext = precontext + .substring( + Math + .max( + 0, precontext.length() + - Patch_Margin)); + // Append the end context for this patch. + if (diff_text1(bigpatch.diffs).length() > Patch_Margin) { + postcontext = diff_text1(bigpatch.diffs).substring(0, Patch_Margin); + } else { + postcontext = diff_text1(bigpatch.diffs); + } + if (postcontext.length() != 0) { + patch.length1 += postcontext.length(); + patch.length2 += postcontext.length(); + if (!patch.diffs.isEmpty() + && patch.diffs.getLast().operation == Operation.EQUAL) { + patch.diffs.getLast().text += postcontext; + } else { + patch.diffs.add(new Diff(Operation.EQUAL, postcontext)); + } + } + if (!empty) { + pointer.add(patch); + } + } + bigpatch = pointer.hasNext() ? pointer.next() : null; + } + } + + /** + * Take a list of patches and return a textual representation. + * @param patches List of Patch objects. + * @return Text representation of patches. + */ + public String patch_toText(List patches) { + StringBuilder text = new StringBuilder(); + for (Patch aPatch : patches) { + text.append(aPatch); + } + return text.toString(); + } + + /** + * Parse a textual representation of patches and return a List of Patch + * objects. + * @param textline Text representation of patches. + * @return List of Patch objects. + * @throws IllegalArgumentException If invalid input. 
+ */ + public List patch_fromText(String textline) + throws IllegalArgumentException { + List patches = new LinkedList(); + if (textline.length() == 0) { + return patches; + } + List textList = Arrays.asList(textline.split("\n")); + LinkedList text = new LinkedList(textList); + Patch patch; + Pattern patchHeader = Pattern.compile("^@@ -(\\d+),?(\\d*) \\+(\\d+),?(\\d*) @@$"); + Matcher m; + char sign; + String line; + while (!text.isEmpty()) { + m = patchHeader.matcher(text.getFirst()); + if (!m.matches()) { + throw new IllegalArgumentException( + "Invalid patch string: " + text.getFirst()); + } + patch = new Patch(); + patches.add(patch); + patch.start1 = Integer.parseInt(m.group(1)); + if (m.group(2).length() == 0) { + patch.start1--; + patch.length1 = 1; + } else if (m.group(2).equals("0")) { + patch.length1 = 0; + } else { + patch.start1--; + patch.length1 = Integer.parseInt(m.group(2)); + } + + patch.start2 = Integer.parseInt(m.group(3)); + if (m.group(4).length() == 0) { + patch.start2--; + patch.length2 = 1; + } else if (m.group(4).equals("0")) { + patch.length2 = 0; + } else { + patch.start2--; + patch.length2 = Integer.parseInt(m.group(4)); + } + text.removeFirst(); + + while (!text.isEmpty()) { + try { + sign = text.getFirst().charAt(0); + } catch (IndexOutOfBoundsException e) { + // Blank line? Whatever. + text.removeFirst(); + continue; + } + line = text.getFirst().substring(1); + line = line.replace("+", "%2B"); // decode would change all "+" to " " + try { + line = URLDecoder.decode(line, "UTF-8"); + } catch (UnsupportedEncodingException e) { + // Not likely on modern system. + throw new Error("This system does not support UTF-8.", e); + } catch (IllegalArgumentException e) { + // Malformed URI sequence. + throw new IllegalArgumentException( + "Illegal escape in patch_fromText: " + line, e); + } + if (sign == '-') { + // Deletion. + patch.diffs.add(new Diff(Operation.DELETE, line)); + } else if (sign == '+') { + // Insertion. 
+ patch.diffs.add(new Diff(Operation.INSERT, line)); + } else if (sign == ' ') { + // Minor equality. + patch.diffs.add(new Diff(Operation.EQUAL, line)); + } else if (sign == '@') { + // Start of next patch. + break; + } else { + // WTF? + throw new IllegalArgumentException( + "Invalid patch mode '" + sign + "' in: " + line); + } + text.removeFirst(); + } + } + return patches; + } + + /** + * Class representing one diff operation. + */ + public static class Diff { + /** + * One of: INSERT, DELETE or EQUAL. + */ + public Operation operation; + /** + * The text associated with this diff operation. + */ + public String text; + + /** + * Constructor. Initializes the diff with the provided values. + * @param operation One of INSERT, DELETE or EQUAL. + * @param text The text being applied. + */ + public Diff(Operation operation, String text) { + // Construct a diff with the specified operation and text. + this.operation = operation; + this.text = text; + } + + /** + * Display a human-readable version of this Diff. + * @return text version. + */ + public String toString() { + String prettyText = this.text.replace('\n', '\u00b6'); + return "Diff(" + this.operation + ",\"" + prettyText + "\")"; + } + + /** + * Create a numeric hash value for a Diff. + * This function is not used by DMP. + * @return Hash value. + */ + @Override + public int hashCode() { + final int prime = 31; + int result = (operation == null) ? 0 : operation.hashCode(); + result += prime * ((text == null) ? 0 : text.hashCode()); + return result; + } + + /** + * Is this Diff equivalent to another Diff? + * @param obj Another Diff to compare against. + * @return true or false. 
+ */ + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + Diff other = (Diff) obj; + if (operation != other.operation) { + return false; + } + if (text == null) { + if (other.text != null) { + return false; + } + } else if (!text.equals(other.text)) { + return false; + } + return true; + } + } + + /** + * Class representing one patch operation. + */ + public static class Patch { + public LinkedList diffs; + public int start1; + public int start2; + public int length1; + public int length2; + + /** + * Constructor. Initializes with an empty list of diffs. + */ + public Patch() { + this.diffs = new LinkedList(); + } + + /** + * Emulate GNU diff's format. + * Header: @@ -382,8 +481,9 @@ + * Indices are printed as 1-based, not 0-based. + * @return The GNU diff string. + */ + public String toString() { + String coords1, coords2; + if (this.length1 == 0) { + coords1 = this.start1 + ",0"; + } else if (this.length1 == 1) { + coords1 = Integer.toString(this.start1 + 1); + } else { + coords1 = (this.start1 + 1) + "," + this.length1; + } + if (this.length2 == 0) { + coords2 = this.start2 + ",0"; + } else if (this.length2 == 1) { + coords2 = Integer.toString(this.start2 + 1); + } else { + coords2 = (this.start2 + 1) + "," + this.length2; + } + StringBuilder text = new StringBuilder(); + text + .append("@@ -") + .append(coords1) + .append(" +") + .append(coords2) + .append(" @@\n"); + // Escape the body of the patch with %xx notation. + for (Diff aDiff : this.diffs) { + switch (aDiff.operation) { + case INSERT: + text.append('+'); + break; + case DELETE: + text.append('-'); + break; + case EQUAL: + text.append(' '); + break; + } + try { + text + .append(URLEncoder.encode(aDiff.text, "UTF-8").replace('+', ' ')) + .append("\n"); + } catch (UnsupportedEncodingException e) { + // Not likely on modern system. 
+ throw new Error("This system does not support UTF-8.", e); + } + } + return unescapeForEncodeUriCompatability(text.toString()); + } + } + + /** + * Unescape selected chars for compatability with JavaScript's encodeURI. + * In speed critical applications this could be dropped since the + * receiving application will certainly decode these fine. + * Note that this function is case-sensitive. Thus "%3f" would not be + * unescaped. But this is ok because it is only called with the output of + * URLEncoder.encode which returns uppercase hex. + * + * Example: "%3F" -> "?", "%24" -> "$", etc. + * + * @param str The string to escape. + * @return The escaped string. + */ + private static String unescapeForEncodeUriCompatability(String str) { + return str + .replace("%21", "!") + .replace("%7E", "~") + .replace("%27", "'") + .replace("%28", "(") + .replace("%29", ")") + .replace("%3B", ";") + .replace("%2F", "/") + .replace("%3F", "?") + .replace("%3A", ":") + .replace("%40", "@") + .replace("%26", "&") + .replace("%3D", "=") + .replace("%2B", "+") + .replace("%24", "$") + .replace("%2C", ",") + .replace("%23", "#"); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java index fdbd6e99d..33183b0f6 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.util; import com.google.common.base.Function; @@ -7,4 +8,4 @@ public class DotAbbreviations implements Function { public String apply(String s) { return s.length() == 1 ? s + "." 
: s; } -}; \ No newline at end of file +}; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java index 575055fb7..f1353f655 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java @@ -1,117 +1,172 @@ -package eu.dnetlib.pace.util; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.jayway.jsonpath.Configuration; -import com.jayway.jsonpath.JsonPath; -import com.jayway.jsonpath.Option; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.*; -import net.minidev.json.JSONArray; +package eu.dnetlib.pace.util; import java.math.BigDecimal; import java.util.*; import java.util.function.Predicate; import java.util.stream.Collectors; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.Configuration; +import com.jayway.jsonpath.DocumentContext; +import com.jayway.jsonpath.JsonPath; +import com.jayway.jsonpath.Option; + +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.config.Type; +import eu.dnetlib.pace.model.*; +import net.minidev.json.JSONArray; + public class MapDocumentUtil { - public static final String URL_REGEX = "^(http|https|ftp)\\://.*"; - public static Predicate urlFilter = s -> s.trim().matches(URL_REGEX); + public static final String URL_REGEX = "^(http|https|ftp)\\://.*"; + public static Predicate urlFilter = s -> s.trim().matches(URL_REGEX); - public static List getJPathList(String path, String json, Type type) { - if (type == Type.List) - return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path); - Object jresult; - List result = new 
ArrayList<>(); - try { - jresult = JsonPath.read(json, path); - } catch (Throwable e) { - return result; - } - if (jresult instanceof JSONArray) { + public static List getJPathList(String path, String json, Type type) { + if (type == Type.List) + return JsonPath + .using( + Configuration + .defaultConfiguration() + .addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS)) + .parse(json) + .read(path); + Object jresult; + List result = new ArrayList<>(); + try { + jresult = JsonPath.read(json, path); + } catch (Throwable e) { + return result; + } + if (jresult instanceof JSONArray) { - ((JSONArray) jresult).forEach(it -> { + ((JSONArray) jresult).forEach(it -> { - try { - result.add(new ObjectMapper().writeValueAsString(it)); - } catch (JsonProcessingException e) { + try { + result.add(new ObjectMapper().writeValueAsString(it)); + } catch (JsonProcessingException e) { - } - } - ); - return result; - } + } + }); + return result; + } - if (jresult instanceof LinkedHashMap) { - try { - result.add(new ObjectMapper().writeValueAsString(jresult)); - } catch (JsonProcessingException e) { + if (jresult instanceof LinkedHashMap) { + try { + result.add(new ObjectMapper().writeValueAsString(jresult)); + } catch (JsonProcessingException e) { - } - return result; - } - if (jresult instanceof String) { - result.add((String) jresult); - } - return result; - } + } + return result; + } + if (jresult instanceof String) { + result.add((String) jresult); + } + return result; + } + public static String getJPathString(final String jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof String) + return (String) o; + if (o instanceof JSONArray && ((JSONArray) o).size() > 0) + return (String) ((JSONArray) o).get(0); + return ""; + } catch (Exception e) { + return ""; + } + } - public static String getJPathString(final String jsonPath, final String json) { - try { - Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) - 
return (String)o; - if (o instanceof JSONArray && ((JSONArray)o).size()>0) - return (String)((JSONArray)o).get(0); - return ""; - } catch (Exception e) { - return ""; - } - } + public static double[] getJPathArray(final String jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof double[]) + return (double[]) o; + if (o instanceof JSONArray) { + Object[] objects = ((JSONArray) o).toArray(); + double[] array = new double[objects.length]; + for (int i = 0; i < objects.length; i++) { + if (objects[i] instanceof BigDecimal) + array[i] = ((BigDecimal) objects[i]).doubleValue(); + else + array[i] = (double) objects[i]; + } + return array; + } + return new double[0]; + } catch (Exception e) { + e.printStackTrace(); + return new double[0]; + } + } - public static double[] getJPathArray(final String jsonPath, final String json) { - try { - Object o = JsonPath.read(json, jsonPath); - if (o instanceof double[]) - return (double[]) o; - if (o instanceof JSONArray) { - Object[] objects = ((JSONArray) o).toArray(); - double[] array = new double[objects.length]; - for (int i = 0; i < objects.length; i++) { - if (objects[i] instanceof BigDecimal) - array[i] = ((BigDecimal)objects[i]).doubleValue(); - else - array[i] = (double) objects[i]; - } - return array; - } - return new double[0]; - } - catch (Exception e) { - e.printStackTrace(); - return new double[0]; - } - } + public static String truncateValue(String value, int length) { + if (value == null) + return ""; + if (length == -1 || length > value.length()) + return value; - public static String truncateValue(String value, int length) { - if (value == null) - return ""; + return value.substring(0, length); + } - if (length == -1 || length > value.length()) - return value; + public static List truncateList(List list, int size) { + if (size == -1 || size > list.size()) + return list; - return value.substring(0, length); - } + return list.subList(0, size); + } - public static List 
truncateList(List list, int size) { - if (size == -1 || size > list.size()) - return list; + public static String getJPathString(final String jsonPath, final DocumentContext json) { + try { + Object o = json.read(jsonPath); + if (o instanceof String) + return (String) o; + if (o instanceof JSONArray && ((JSONArray) o).size() > 0) + return (String) ((JSONArray) o).get(0); + return ""; + } catch (Exception e) { + return ""; + } + } - return list.subList(0, size); - } + public static List getJPathList(String path, DocumentContext json, Type type) { + // if (type == Type.List) + // return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST, + // Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path); + Object jresult; + List result = new ArrayList<>(); + try { + jresult = json.read(path); + } catch (Throwable e) { + return result; + } + + if (jresult instanceof JSONArray) { + ((JSONArray) jresult).forEach(it -> { + try { + result.add(new ObjectMapper().writeValueAsString(it)); + } catch (JsonProcessingException e) { + + } + }); + return result; + } + + if (jresult instanceof LinkedHashMap) { + try { + result.add(new ObjectMapper().writeValueAsString(jresult)); + } catch (JsonProcessingException e) { + + } + return result; + } + if (jresult instanceof String) { + result.add((String) jresult); + } + return result; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceException.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceException.java index 198861c53..077139482 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceException.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceException.java @@ -1,13 +1,14 @@ + package eu.dnetlib.pace.util; public class PaceException extends RuntimeException { - public PaceException(String s, Throwable e){ - super(s, e); - } + public PaceException(String s, Throwable e) { + super(s, e); + } - public PaceException(String s){ - super(s); - } + public 
PaceException(String s) { + super(s); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java index bf6feea1c..252205c79 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java @@ -1,49 +1,61 @@ -package eu.dnetlib.pace.util; -import eu.dnetlib.pace.clustering.ClusteringClass; -import eu.dnetlib.pace.clustering.ClusteringFunction; -import eu.dnetlib.pace.tree.support.Comparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; -import org.reflections.Reflections; +package eu.dnetlib.pace.util; import java.io.Serializable; import java.lang.reflect.InvocationTargetException; import java.util.Map; import java.util.stream.Collectors; +import org.reflections.Reflections; + +import eu.dnetlib.pace.clustering.ClusteringClass; +import eu.dnetlib.pace.clustering.ClusteringFunction; +import eu.dnetlib.pace.tree.support.Comparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + public class PaceResolver implements Serializable { - public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering"); - public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree"); + public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering"); + public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree"); - private final Map> clusteringFunctions; - private final Map> comparators; + private final Map> clusteringFunctions; + private final Map> comparators; - public PaceResolver() { + public PaceResolver() { - this.clusteringFunctions = CLUSTERING_RESOLVER.getTypesAnnotatedWith(ClusteringClass.class).stream() - .filter(ClusteringFunction.class::isAssignableFrom) - .collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> 
(Class)cl)); + this.clusteringFunctions = CLUSTERING_RESOLVER + .getTypesAnnotatedWith(ClusteringClass.class) + .stream() + .filter(ClusteringFunction.class::isAssignableFrom) + .collect( + Collectors + .toMap( + cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class) cl)); - this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream() - .filter(Comparator.class::isAssignableFrom) - .collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class)cl)); - } + this.comparators = COMPARATOR_RESOLVER + .getTypesAnnotatedWith(ComparatorClass.class) + .stream() + .filter(Comparator.class::isAssignableFrom) + .collect( + Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class) cl)); + } - public ClusteringFunction getClusteringFunction(String name, Map params) throws PaceException { - try { - return clusteringFunctions.get(name).getDeclaredConstructor(Map.class).newInstance(params); - } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) { - throw new PaceException(name + " not found ", e); - } - } + public ClusteringFunction getClusteringFunction(String name, Map params) throws PaceException { + try { + return clusteringFunctions.get(name).getDeclaredConstructor(Map.class).newInstance(params); + } catch (InstantiationException | IllegalAccessException | InvocationTargetException + | NoSuchMethodException e) { + throw new PaceException(name + " not found ", e); + } + } - public Comparator getComparator(String name, Map params) throws PaceException { - try { - return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params); - } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) { - throw new PaceException(name + " not found ", e); - } - } + public Comparator getComparator(String name, Map params) throws 
PaceException { + try { + return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params); + } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException + | NullPointerException e) { + throw new PaceException(name + " not found ", e); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Reporter.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Reporter.java index 10c886cb5..fd6761aa1 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Reporter.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Reporter.java @@ -1,11 +1,11 @@ -package eu.dnetlib.pace.util; +package eu.dnetlib.pace.util; import java.io.Serializable; public interface Reporter extends Serializable { - void incrementCounter(String counterGroup, String counterName, long delta); + void incrementCounter(String counterGroup, String counterName, long delta); - void emit(String type, String from, String to); + void emit(String type, String from, String to); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/SparkReporter.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/SparkReporter.java new file mode 100644 index 000000000..c2d1c8f59 --- /dev/null +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/SparkReporter.java @@ -0,0 +1,86 @@ + +package eu.dnetlib.pace.util; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.spark.SparkContext; +import org.apache.spark.util.LongAccumulator; + +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.util.Reporter; +import scala.Serializable; +import scala.Tuple2; + +public class SparkReporter implements Serializable, Reporter { + + private final List> relations = new ArrayList<>(); + + private final Map accumulators; + + public SparkReporter(Map accumulators) { + this.accumulators = accumulators; + } + + public void incrementCounter( + String 
counterGroup, + String counterName, + long delta, + Map accumulators) { + + final String accumulatorName = String.format("%s::%s", counterGroup, counterName); + if (accumulators.containsKey(accumulatorName)) { + accumulators.get(accumulatorName).add(delta); + } + } + + @Override + public void incrementCounter(String counterGroup, String counterName, long delta) { + + incrementCounter(counterGroup, counterName, delta, accumulators); + } + + @Override + public void emit(String type, String from, String to) { + relations.add(new Tuple2<>(from, to)); + } + + public List> getRelations() { + return relations; + } + + public static Map constructAccumulator( + final DedupConfig dedupConf, final SparkContext context) { + + Map accumulators = new HashMap<>(); + + String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); + accumulators.put(acc1, context.longAccumulator(acc1)); + String acc2 = String + .format( + "%s::%s", + dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); + accumulators.put(acc2, context.longAccumulator(acc2)); + String acc3 = String + .format( + "%s::%s", + dedupConf.getWf().getEntityType(), + String + .format( + "Skipped records for count(%s) >= %s", + dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); + accumulators.put(acc3, context.longAccumulator(acc3)); + String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); + accumulators.put(acc4, context.longAccumulator(acc4)); + String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); + accumulators.put(acc5, context.longAccumulator(acc5)); + String acc6 = String + .format( + "%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); + accumulators.put(acc6, context.longAccumulator(acc6)); + + return accumulators; + } +} diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java 
b/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java index 4cc8bd323..d3f502f35 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java @@ -1,12 +1,14 @@ -package eu.dnetlib.pace; -import eu.dnetlib.pace.common.AbstractPaceFunctions; -import org.apache.commons.io.IOUtils; +package eu.dnetlib.pace; import java.io.IOException; import java.io.StringWriter; import java.util.List; +import org.apache.commons.io.IOUtils; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; + public abstract class AbstractPaceTest extends AbstractPaceFunctions { protected String readFromClasspath(final String filename) { @@ -35,7 +37,7 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions { return a; } - protected List createFieldList(List strings, String fieldName){ + protected List createFieldList(List strings, String fieldName) { return strings; } diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index 32d2ab8d4..16849abb1 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -1,17 +1,20 @@ -package eu.dnetlib.pace.clustering; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.AbstractPaceTest; -import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.config.DedupConfig; -import org.junit.jupiter.api.*; +package eu.dnetlib.pace.clustering; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import org.junit.jupiter.api.*; + +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; + +import 
eu.dnetlib.pace.AbstractPaceTest; +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.DedupConfig; + public class ClusteringFunctionTest extends AbstractPaceTest { private static Map params; @@ -20,7 +23,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @BeforeAll public static void setUp() throws Exception { params = Maps.newHashMap(); - conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ClusteringFunctionTest.class)); + conf = DedupConfig + .load( + AbstractPaceFunctions + .readFromClasspath( + "/eu/dnetlib/pace/config/organization.current.conf.json", ClusteringFunctionTest.class)); } @Test @@ -210,7 +217,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { } @Test - public void testPersonClustering(){ + public void testPersonClustering() { final ClusteringFunction cf = new PersonClustering(params); final String s = "Abd-Alla, Abo-el-nour N."; @@ -224,7 +231,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { } @Test - public void testPersonHash(){ + public void testPersonHash() { final ClusteringFunction cf = new PersonHash(params); final String s = "Manghi, Paolo"; @@ -238,7 +245,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { } @Test - public void testLastNameFirstInitial(){ + public void testLastNameFirstInitial() { final ClusteringFunction cf = new LastNameFirstInitial(params); final String s = "LI Yonghong"; @@ -246,4 +253,4 @@ public class ClusteringFunctionTest extends AbstractPaceTest { System.out.println(cf.apply(conf, Lists.newArrayList(s))); } -} \ No newline at end of file +} diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java index ff1ca6721..7fd81d975 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java +++ 
b/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java @@ -1,56 +1,57 @@ + package eu.dnetlib.pace.common; -import org.junit.jupiter.api.*; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; +import org.junit.jupiter.api.*; + public class PaceFunctionTest extends AbstractPaceFunctions { - private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾."; + private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾."; - @Test - public void normalizePidTest(){ + @Test + public void normalizePidTest() { - assertEquals("identifier", normalizePid("IdentifIer")); - assertEquals("10.1109/tns.2015.2493347", normalizePid("10.1109/TNS.2015.2493347")); - assertEquals("10.0001/testdoi", normalizePid("http://dx.doi.org/10.0001/testDOI")); - assertEquals("10.0001/testdoi", normalizePid("https://dx.doi.org/10.0001/testDOI")); - } + assertEquals("identifier", normalizePid("IdentifIer")); + assertEquals("10.1109/tns.2015.2493347", normalizePid("10.1109/TNS.2015.2493347")); + assertEquals("10.0001/testdoi", normalizePid("http://dx.doi.org/10.0001/testDOI")); + assertEquals("10.0001/testdoi", normalizePid("https://dx.doi.org/10.0001/testDOI")); + } - @Test - public void filterAllStopwordsTest(){ + @Test + public void filterAllStopwordsTest() { - assertEquals("universita politecnica marche", filterAllStopWords("universita politecnica delle marche")); - } + assertEquals("universita politecnica marche", filterAllStopWords("universita politecnica delle marche")); + } - @Test - public void normalizeTest() { - assertEquals("universitat", normalize("Universität")); + @Test + public void normalizeTest() { + assertEquals("universitat", normalize("Universität")); - System.out.println(normalize("İstanbul Ticarət Universiteti")); - } + System.out.println(normalize("İstanbul Ticarət Universiteti")); + } - @Test - public void 
cleanupTest() { - assertEquals("istanbul ticaret universiteti", cleanup("İstanbul Ticarət Universiteti")); + @Test + public void cleanupTest() { + assertEquals("istanbul ticaret universiteti", cleanup("İstanbul Ticarət Universiteti")); + System.out.println("cleaned up : " + cleanup(TEST_STRING)); + } - System.out.println("cleaned up : " + cleanup(TEST_STRING)); - } + @Test + public void testGetNumbers() { + System.out.println("Numbers : " + getNumbers(TEST_STRING)); + } - @Test - public void testGetNumbers() { - System.out.println("Numbers : " + getNumbers(TEST_STRING)); - } + @Test + public void testRemoveSymbols() { + System.out.println("Without symbols: " + removeSymbols(TEST_STRING)); + } - @Test - public void testRemoveSymbols() { - System.out.println("Without symbols: " + removeSymbols(TEST_STRING)); - } - - @Test - public void testFixAliases() { - System.out.println("Fixed aliases : " + fixAliases(TEST_STRING)); - } + @Test + public void testFixAliases() { + System.out.println("Fixed aliases : " + fixAliases(TEST_STRING)); + } } diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 6d375e778..92e0529b3 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -1,16 +1,18 @@ + package eu.dnetlib.pace.comparators; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.*; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestInstance; + import eu.dnetlib.pace.AbstractPaceTest; import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.tree.*; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.TestInstance; - -import java.util.*; - -import 
static org.junit.jupiter.api.Assertions.assertEquals; @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class ComparatorTest extends AbstractPaceTest { @@ -26,7 +28,8 @@ public class ComparatorTest extends AbstractPaceTest { params.put("name_th", "0.95"); params.put("jpath_value", "$.value"); params.put("jpath_classid", "$.qualifier.classid"); - conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class)); + conf = DedupConfig + .load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class)); } @Test @@ -39,32 +42,38 @@ public class ComparatorTest extends AbstractPaceTest { public void cityMatchTest() { final CityMatch cityMatch = new CityMatch(params); - //both names with no cities + // both names with no cities assertEquals(1.0, cityMatch.distance("Università", "Centro di ricerca", conf)); - //one of the two names with no cities + // one of the two names with no cities assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf)); - //both names with cities (same) + // both names with cities (same) assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf)); - //both names with cities (different) + // both names with cities (different) assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf)); assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf)); - //particular cases + // particular cases assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf)); - assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf)); + assertEquals( + 1.0, + cityMatch + .distance( + "Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", + conf)); assertEquals(-1.0, 
cityMatch.distance("Allen (United States)", "United States Military Academy", conf)); } @Test - public void keywordMatchTest(){ + public void keywordMatchTest() { params.put("threshold", "0.5"); final KeywordMatch keywordMatch = new KeywordMatch(params); - assertEquals(0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf)); + assertEquals( + 0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf)); assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf)); assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf)); assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf)); @@ -77,7 +86,7 @@ public class ComparatorTest extends AbstractPaceTest { } @Test - public void listContainsMatchTest(){ + public void listContainsMatchTest() { List a = createFieldList(Arrays.asList("Article", "Publication", "ORP"), "instanceType"); List b = createFieldList(Arrays.asList("Publication", "Article", "ORP"), "instanceType"); @@ -100,7 +109,7 @@ public class ComparatorTest extends AbstractPaceTest { } @Test - public void stringContainsMatchTest(){ + public void stringContainsMatchTest() { params.put("string", "openorgs"); params.put("bool", "XOR"); @@ -120,7 +129,7 @@ public class ComparatorTest extends AbstractPaceTest { } @Test - public void numbersMatchTest(){ + public void numbersMatchTest() { final NumbersMatch numbersMatch = new NumbersMatch(params); assertEquals(0.0, numbersMatch.distance("University of Rennes 2", "Universita di Rennes 7", conf)); @@ -128,7 +137,7 @@ public class ComparatorTest extends AbstractPaceTest { } @Test - public void romansMatchTest(){ + public void romansMatchTest() { final RomansMatch romansMatch = new RomansMatch(params); @@ -142,8 +151,9 @@ public class ComparatorTest extends AbstractPaceTest { final 
JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf); - System.out.println("result = " + result); + double result = jaroWinklerNormalizedName + .distance("AT&T (United States)", "United States Military Academy", conf); + System.out.println("result = " + result); result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf); System.out.println("result = " + result); @@ -171,7 +181,11 @@ public class ComparatorTest extends AbstractPaceTest { final LevensteinTitle levensteinTitle = new LevensteinTitle(params); - double result = levensteinTitle.distance("Degradation of lignin β‐aryl ether units in Arabidopsis thaliana expressing LigD, LigF and LigG from Sphingomonas paucimobilis SYK‐6", "Degradation of lignin β-aryl ether units in Arabidopsis thaliana expressing LigD, LigF and LigG from Sphingomonas paucimobilis SYK-6", conf); + double result = levensteinTitle + .distance( + "Degradation of lignin β‐aryl ether units in Arabidopsis thaliana expressing LigD, LigF and LigG from Sphingomonas paucimobilis SYK‐6", + "Degradation of lignin β-aryl ether units in Arabidopsis thaliana expressing LigD, LigF and LigG from Sphingomonas paucimobilis SYK-6", + conf); System.out.println("result = " + result); } @@ -195,13 +209,16 @@ public class ComparatorTest extends AbstractPaceTest { assertEquals(1.0, result); - List c = createFieldList(Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType"); + List c = createFieldList( + Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType"); result = instanceTypeMatch.compare(c, b, conf); assertEquals(1.0, result); - List d = createFieldList(Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType"); - List e = 
createFieldList(Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType"); + List d = createFieldList( + Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType"); + List e = createFieldList( + Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType"); result = instanceTypeMatch.compare(d, e, conf); assertEquals(1.0, result); @@ -222,7 +239,8 @@ public class ComparatorTest extends AbstractPaceTest { AuthorsMatch authorsMatch = new AuthorsMatch(params); - List a = createFieldList(Arrays.asList("La Bruzzo, Sandro", "Atzori, Claudio", "De Bonis, Michele"), "authors"); + List a = createFieldList( + Arrays.asList("La Bruzzo, Sandro", "Atzori, Claudio", "De Bonis, Michele"), "authors"); List b = createFieldList(Arrays.asList("Atzori, C.", "La Bruzzo, S.", "De Bonis, M."), "authors"); double result = authorsMatch.compare(a, b, conf); @@ -232,7 +250,7 @@ public class ComparatorTest extends AbstractPaceTest { List d = createFieldList(Arrays.asList("Manghi, Pasquale"), "authors"); result = authorsMatch.compare(c, d, conf); - assertEquals(0.0, result) ; + assertEquals(0.0, result); params.put("mode", "surname"); authorsMatch = new AuthorsMatch(params); @@ -246,7 +264,7 @@ public class ComparatorTest extends AbstractPaceTest { assertEquals(0.25, result); List f = createFieldList(new ArrayList<>(), "authors"); - result = authorsMatch.compare(f,f, conf); + result = authorsMatch.compare(f, f, conf); System.out.println("result = " + result); } @@ -256,8 +274,19 @@ public class ComparatorTest extends AbstractPaceTest { JsonListMatch jsonListMatch = new JsonListMatch(params); - List a = 
createFieldList(Arrays.asList("{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"), "authors"); - List b = createFieldList(Arrays.asList("{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object 
Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"), "authors"); + List a = createFieldList( + Arrays + .asList( + "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"), + "authors"); + List b = createFieldList( + Arrays + .asList( + "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}", + 
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}", + "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}", + "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"), + "authors"); double result = jsonListMatch.compare(a, b, conf); @@ -287,13 +316,16 @@ public class ComparatorTest extends AbstractPaceTest { CosineSimilarity cosineSimilarity = new CosineSimilarity(params); - double[] a = new double[]{1,2,3}; - double[] b = new double[]{1,2,3}; + double[] a = new double[] { + 1, 2, 3 + }; + double[] b = new double[] { + 1, 2, 3 + }; double compare = cosineSimilarity.compare(a, b, conf); System.out.println("compare = " + compare); } - } diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java 
b/dhp-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index a6ce9e7eb..b46085bb4 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -1,17 +1,17 @@ + package eu.dnetlib.pace.config; - -import eu.dnetlib.pace.AbstractPaceTest; -import eu.dnetlib.pace.util.MapDocumentUtil; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.util.HashMap; -import java.util.Map; - import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; +import java.util.HashMap; +import java.util.Map; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import eu.dnetlib.pace.AbstractPaceTest; +import eu.dnetlib.pace.util.MapDocumentUtil; public class ConfigTest extends AbstractPaceTest { @@ -56,7 +56,7 @@ public class ConfigTest extends AbstractPaceTest { System.out.println("translationMap = " + translationMap.size()); - for (String key: translationMap.keySet()) { + for (String key : translationMap.keySet()) { if (translationMap.get(key).equals("key::1")) System.out.println("key = " + key); } @@ -70,13 +70,13 @@ public class ConfigTest extends AbstractPaceTest { assertEquals(0, load.getPace().translationMap().keySet().size()); } - @Test - public void testJPath() { - final String json = readFromClasspath("organization.json"); + @Test + public void testJPath() { + final String json = readFromClasspath("organization.json"); - final String jpath ="$.id"; + final String jpath = "$.id"; - System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json)); - } + System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json)); + } } diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java index 1e6053246..198b9f049 100644 --- 
a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java @@ -1,40 +1,43 @@ + package eu.dnetlib.pace.util; -import eu.dnetlib.pace.model.Person; -import jdk.nashorn.internal.ir.annotations.Ignore; -import org.junit.jupiter.api.*; +import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.HashMap; import java.util.Map; -import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.*; + +import eu.dnetlib.pace.model.Person; +import jdk.nashorn.internal.ir.annotations.Ignore; public class UtilTest { - static Map params; + static Map params; - @BeforeAll - public static void setUp(){ - params = new HashMap<>(); - } + @BeforeAll + public static void setUp() { + params = new HashMap<>(); + } - @Test - @Ignore - public void paceResolverTest() { - PaceResolver paceResolver = new PaceResolver(); - paceResolver.getComparator("keywordMatch", params); - } + @Test + @Ignore + public void paceResolverTest() { + PaceResolver paceResolver = new PaceResolver(); + paceResolver.getComparator("keywordMatch", params); + } - @Test - public void personTest() { - Person p = new Person("j. f. kennedy", false); + @Test + public void personTest() { + Person p = new Person("j. f. 
kennedy", false); - assertEquals("kennedy", p.getSurnameString()); - assertEquals("j f", p.getNameString()); + assertEquals("kennedy", p.getSurnameString()); + assertEquals("j f", p.getNameString()); - p = new Person("Guan-Hua Du", false); + p = new Person("Guan-Hua Du", false); - System.out.println("surname = " + p.getSurnameString()); - System.out.println("name = " + p.getNameString()); - } + System.out.println("surname = " + p.getSurnameString()); + System.out.println("name = " + p.getNameString()); + } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java index aaf53b669..becc71c92 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java @@ -1,16 +1,18 @@ package eu.dnetlib.dhp.broker.oa.util; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.broker.objects.OaBrokerMainEntity; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.tree.support.TreeProcessor; -import eu.dnetlib.pace.util.MapDocumentUtil; +import java.io.IOException; + import org.apache.spark.sql.Row; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.SparkDedupConfig; +import eu.dnetlib.pace.tree.support.TreeProcessor; public class TrustUtils { @@ -18,13 +20,18 @@ public class TrustUtils { private static DedupConfig dedupConfig; + private static SparkDedupConfig sparkDedupConfig; + + private static final ObjectMapper mapper; + static { - final ObjectMapper mapper = new ObjectMapper(); + mapper = new ObjectMapper(); try { dedupConfig = mapper .readValue( 
DedupConfig.class.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"), DedupConfig.class); + sparkDedupConfig = new SparkDedupConfig(dedupConfig, 1); } catch (final IOException e) { log.error("Error loading dedupConfig, e"); } @@ -40,11 +47,8 @@ public class TrustUtils { } try { - final ObjectMapper objectMapper = new ObjectMapper(); - final Row doc1 = MapDocumentUtil - .asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1)); - final Row doc2 = MapDocumentUtil - .asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2)); + final Row doc1 = sparkDedupConfig.rowFromJson().apply(mapper.writeValueAsString(r1)); + final Row doc2 = sparkDedupConfig.rowFromJson().apply(mapper.writeValueAsString(r2)); final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2); diff --git a/dhp-workflows/dhp-dedup-openaire/pom.xml b/dhp-workflows/dhp-dedup-openaire/pom.xml index b88e8e0df..a6e34bb96 100644 --- a/dhp-workflows/dhp-dedup-openaire/pom.xml +++ b/dhp-workflows/dhp-dedup-openaire/pom.xml @@ -53,13 +53,17 @@ + - io.opentelemetry - opentelemetry-api + eu.dnetlib.dhp + dhp-common + ${project.version} + - io.opentelemetry - opentelemetry-sdk + eu.dnetlib.dhp + dhp-pace-core + ${project.version} @@ -83,31 +87,21 @@ spark-sql_2.11 - - eu.dnetlib.dhp - dhp-common - ${project.version} - - com.arakelian java-jq + dom4j dom4j + jaxen jaxen - - - eu.dnetlib.dhp - dhp-pace-core - ${project.version} - org.apache.spark spark-graphx_2.11 @@ -141,12 +135,7 @@ 1.4.200 test - - org.apache.spark - spark-catalyst_2.11 - 2.4.0.cloudera2 - compile - + diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DSLExample.scala b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DSLExample.scala index 378753564..9a75b20dd 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DSLExample.scala +++ 
b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DSLExample.scala @@ -3,29 +3,20 @@ package eu.dnetlib.dhp.oa.dedup import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.oa.dedup.dsl.{Clustering, Deduper} import eu.dnetlib.dhp.oa.dedup.model.BlockStats -import eu.dnetlib.dhp.oa.dedup.model.SparkDedupConfig -import eu.dnetlib.dhp.schema.oaf.Relation import eu.dnetlib.dhp.utils.ISLookupClientFactory -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService -import eu.dnetlib.pace.config.DedupConfig -import eu.dnetlib.pace.model.RowDataOrderingComparator +import eu.dnetlib.enabling.is.lookup.rmi.{ISLookUpException, ISLookUpService} +import eu.dnetlib.pace.model.{RowDataOrderingComparator, SparkDedupConfig} import org.apache.commons.io.IOUtils import org.apache.spark.SparkConf -import org.apache.spark.api.java.JavaSparkContext import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.types.DataTypes import org.dom4j.DocumentException -import org.slf4j.Logger import org.slf4j.LoggerFactory import org.xml.sax.SAXException import java.io.IOException -import java.util -import java.util.Optional import java.util.stream.Collectors -import scala.collection.Seq object DSLExample { private val log = LoggerFactory.getLogger(classOf[DSLExample]) @@ -64,15 +55,15 @@ class DSLExample(parser: ArgumentApplicationParser, spark: SparkSession) extends DSLExample.log.info("isLookUpUrl: '{}'", isLookUpUrl) DSLExample.log.info("actionSetId: '{}'", actionSetId) DSLExample.log.info("workingPath: '{}'", workingPath) -// for each dedup configuration + // for each dedup configuration import scala.collection.JavaConversions._ for (dedupConf <- getConfigurations(isLookUpService, actionSetId).subList(0, 1)) { val subEntity = dedupConf.getWf.getSubEntityValue DSLExample.log.info("Creating blockstats for: '{}'", subEntity) val 
outputPath = DedupUtility.createBlockStatsPath(workingPath, actionSetId, subEntity) AbstractSparkAction.removeOutputDir(spark, outputPath) - val sc = JavaSparkContext.fromSparkContext(spark.sparkContext) - val sparkConfig = new SparkDedupConfig(dedupConf, numPartitions) + + val sparkConfig = SparkDedupConfig(dedupConf, numPartitions) val inputDF = spark.read .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) @@ -87,8 +78,7 @@ class DSLExample(parser: ArgumentApplicationParser, spark: SparkSession) extends Clustering("suffixprefix", Seq("legalname"), Map("max" -> 1, "len" -> 3)), Clustering("urlclustering", Seq("websiteurl")), Clustering("keywordsclustering", Seq("fields"), Map("max" -> 2, "windowSize" -> 4)) - ); - + ) simRels .map[BlockStats]( diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java index e470a59bf..3dece78b3 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java @@ -1,13 +1,12 @@ package eu.dnetlib.dhp.oa.dedup; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.config.DedupConfig; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + import org.apache.spark.SparkContext; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; @@ -16,11 +15,13 @@ import org.dom4j.Element; import org.dom4j.io.SAXReader; import org.xml.sax.SAXException; -import java.io.StringReader; -import 
java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.config.DedupConfig; public class DedupUtility { @@ -30,39 +31,6 @@ public class DedupUtility { private DedupUtility() { } - public static Map constructAccumulator( - final DedupConfig dedupConf, final SparkContext context) { - - Map accumulators = new HashMap<>(); - - String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); - accumulators.put(acc1, context.longAccumulator(acc1)); - String acc2 = String - .format( - "%s::%s", - dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); - accumulators.put(acc2, context.longAccumulator(acc2)); - String acc3 = String - .format( - "%s::%s", - dedupConf.getWf().getEntityType(), - String - .format( - "Skipped records for count(%s) >= %s", - dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); - accumulators.put(acc3, context.longAccumulator(acc3)); - String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); - accumulators.put(acc4, context.longAccumulator(acc4)); - String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); - accumulators.put(acc5, context.longAccumulator(acc5)); - String acc6 = String - .format( - "%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); - accumulators.put(acc6, context.longAccumulator(acc6)); - - return accumulators; - } - public static String createDedupRecordPath( final String basePath, final String actionSetId, final String entityType) { return 
String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java index af087c132..431e0d7b3 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java @@ -7,9 +7,6 @@ import java.util.List; import java.util.Optional; import java.util.stream.Collectors; -import eu.dnetlib.dhp.oa.dedup.dsl.Deduper; -import eu.dnetlib.dhp.oa.dedup.model.SparkDedupConfig; -import eu.dnetlib.pace.model.RowDataOrderingComparator; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; @@ -23,97 +20,103 @@ import org.xml.sax.SAXException; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.dedup.model.BlockStats; +import eu.dnetlib.pace.model.SparkDedupConfig; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.RowDataOrderingComparator; public class SparkBlockStats extends AbstractSparkAction { - private static final Logger log = LoggerFactory.getLogger(SparkBlockStats.class); + private static final Logger log = LoggerFactory.getLogger(SparkBlockStats.class); - public SparkBlockStats(ArgumentApplicationParser parser, SparkSession spark) { - super(parser, spark); - } + public SparkBlockStats(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - SparkBlockStats.class - 
.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkBlockStats.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createBlockStats_parameters.json"))); + parser.parseArgument(args); - SparkConf conf = new SparkConf(); + SparkConf conf = new SparkConf(); - new SparkBlockStats(parser, getSparkSession(conf)) - .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); - } + new SparkBlockStats(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); + } - public Long computeComparisons(Long blockSize, Long slidingWindowSize) { + public Long computeComparisons(Long blockSize, Long slidingWindowSize) { - if (slidingWindowSize >= blockSize) - return (slidingWindowSize * (slidingWindowSize - 1)) / 2; - else { - return (blockSize - slidingWindowSize + 1) * (slidingWindowSize * (slidingWindowSize - 1)) / 2; - } - } + if (slidingWindowSize >= blockSize) + return (slidingWindowSize * (slidingWindowSize - 1)) / 2; + else { + return (blockSize - slidingWindowSize + 1) * (slidingWindowSize * (slidingWindowSize - 1)) / 2; + } + } - @Override - public void run(ISLookUpService isLookUpService) - throws DocumentException, IOException, ISLookUpException, SAXException { + @Override + public void run(ISLookUpService isLookUpService) + throws DocumentException, IOException, ISLookUpException, SAXException { - // read oozie parameters - final String graphBasePath = parser.get("graphBasePath"); - final String isLookUpUrl = parser.get("isLookUpUrl"); - final String actionSetId = parser.get("actionSetId"); - final String workingPath = parser.get("workingPath"); - final int numPartitions = Optional - .ofNullable(parser.get("numPartitions")) - .map(Integer::valueOf) - .orElse(NUM_PARTITIONS); + // read 
oozie parameters + final String graphBasePath = parser.get("graphBasePath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + final String workingPath = parser.get("workingPath"); + final int numPartitions = Optional + .ofNullable(parser.get("numPartitions")) + .map(Integer::valueOf) + .orElse(NUM_PARTITIONS); - log.info("graphBasePath: '{}'", graphBasePath); - log.info("isLookUpUrl: '{}'", isLookUpUrl); - log.info("actionSetId: '{}'", actionSetId); - log.info("workingPath: '{}'", workingPath); + log.info("graphBasePath: '{}'", graphBasePath); + log.info("isLookUpUrl: '{}'", isLookUpUrl); + log.info("actionSetId: '{}'", actionSetId); + log.info("workingPath: '{}'", workingPath); - // for each dedup configuration - for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { + // for each dedup configuration + for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { - final String subEntity = dedupConf.getWf().getSubEntityValue(); - log.info("Creating blockstats for: '{}'", subEntity); + final String subEntity = dedupConf.getWf().getSubEntityValue(); + log.info("Creating blockstats for: '{}'", subEntity); - final String outputPath = DedupUtility.createBlockStatsPath(workingPath, actionSetId, subEntity); - removeOutputDir(spark, outputPath); + final String outputPath = DedupUtility.createBlockStatsPath(workingPath, actionSetId, subEntity); + removeOutputDir(spark, outputPath); - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - SparkDedupConfig sparkConfig = new SparkDedupConfig(dedupConf, numPartitions); + SparkDedupConfig sparkConfig = new SparkDedupConfig(dedupConf, numPartitions); - Dataset inputDF = spark.read().textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) - .transform(sparkConfig.modelExtractor()); - Dataset simRels = inputDF 
- .transform(sparkConfig.generateClusters()) - .filter(functions.size(new Column("block")).geq(new Literal(1, DataTypes.IntegerType))); + Dataset inputDF = spark + .read() + .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) + .transform(sparkConfig.modelExtractor()); + Dataset simRels = inputDF + .transform(sparkConfig.generateClusters()) + .filter(functions.size(new Column("block")).geq(new Literal(1, DataTypes.IntegerType))); - simRels.map(b -> { - Collection documents = b.getList(1); + simRels.map(b -> { + Collection documents = b.getList(1); - List mapDocuments = documents.stream() - .sorted(new RowDataOrderingComparator(sparkConfig.orderingFieldPosition())) - .limit(dedupConf.getWf().getQueueMaxSize()).collect(Collectors.toList()); + List mapDocuments = documents + .stream() + .sorted(new RowDataOrderingComparator(sparkConfig.orderingFieldPosition())) + .limit(dedupConf.getWf().getQueueMaxSize()) + .collect(Collectors.toList()); - return new BlockStats( - b.getString(0), - (long) mapDocuments.size(), - computeComparisons( - (long) mapDocuments.size(), (long) dedupConf.getWf().getSlidingWindowSize())); - }, Encoders.bean(BlockStats.class)) - .write() - .mode(SaveMode.Overwrite) - .save(outputPath); - } - } + return new BlockStats( + b.getString(0), + (long) mapDocuments.size(), + computeComparisons( + (long) mapDocuments.size(), (long) dedupConf.getWf().getSlidingWindowSize())); + }, Encoders.bean(BlockStats.class)) + .write() + .mode(SaveMode.Overwrite) + .save(outputPath); + } + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java deleted file mode 100644 index 005e65ddf..000000000 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java +++ /dev/null @@ -1,50 +0,0 @@ - -package eu.dnetlib.dhp.oa.dedup; - -import java.util.ArrayList; -import java.util.List; 
-import java.util.Map; - -import org.apache.spark.util.LongAccumulator; - -import eu.dnetlib.pace.util.Reporter; -import scala.Serializable; -import scala.Tuple2; - -public class SparkReporter implements Serializable, Reporter { - - private final List> relations = new ArrayList<>(); - - private final Map accumulators; - - public SparkReporter(Map accumulators) { - this.accumulators = accumulators; - } - - public void incrementCounter( - String counterGroup, - String counterName, - long delta, - Map accumulators) { - - final String accumulatorName = String.format("%s::%s", counterGroup, counterName); - if (accumulators.containsKey(accumulatorName)) { - accumulators.get(accumulatorName).add(delta); - } - } - - @Override - public void incrementCounter(String counterGroup, String counterName, long delta) { - - incrementCounter(counterGroup, counterName, delta, accumulators); - } - - @Override - public void emit(String type, String from, String to) { - relations.add(new Tuple2<>(from, to)); - } - - public List> getRelations() { - return relations; - } -} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java index da261779b..00c2d2d88 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java @@ -1,12 +1,9 @@ package eu.dnetlib.dhp.oa.dedup; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.config.DedupConfig; +import java.io.IOException; +import java.util.Optional; + import org.apache.commons.io.IOUtils; import 
org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; @@ -17,8 +14,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; -import java.io.IOException; -import java.util.Optional; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.config.DedupConfig; public class SparkWhitelistSimRels extends AbstractSparkAction { @@ -69,11 +70,14 @@ public class SparkWhitelistSimRels extends AbstractSparkAction { JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); // file format: source####target - Dataset whiteListRels = spark.read() - .textFile(whiteListPath) - .withColumn("pairs", functions.split(new Column("value"), WHITELIST_SEPARATOR)) - .filter(functions.size(new Column("pairs")).equalTo(2)) - .select(functions.element_at(new Column("pairs"), 1).as("from"), functions.element_at(new Column("pairs"), 2).as("to")); + Dataset whiteListRels = spark + .read() + .textFile(whiteListPath) + .withColumn("pairs", functions.split(new Column("value"), WHITELIST_SEPARATOR)) + .filter(functions.size(new Column("pairs")).equalTo(2)) + .select( + functions.element_at(new Column("pairs"), 1).as("from"), + functions.element_at(new Column("pairs"), 2).as("to")); // for each dedup configuration for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { @@ -84,15 +88,21 @@ public class SparkWhitelistSimRels extends AbstractSparkAction { final String outputPath = DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity); - //DFMapDocumentUtils.registerUDFs(spark, dedupConf); + // DFMapDocumentUtils.registerUDFs(spark, dedupConf); - Dataset entities = spark.read().textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) - 
.repartition(numPartitions) - .withColumn("id", functions.get_json_object(new Column("value"), dedupConf.getWf().getIdPath())); + Dataset entities = spark + .read() + .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) + .repartition(numPartitions) + .withColumn("id", functions.get_json_object(new Column("value"), dedupConf.getWf().getIdPath())); - Dataset whiteListRels1 = whiteListRels.join(entities, entities.col("id").equalTo(whiteListRels.col("from")), "inner").select("from", "to"); + Dataset whiteListRels1 = whiteListRels + .join(entities, entities.col("id").equalTo(whiteListRels.col("from")), "inner") + .select("from", "to"); - Dataset whiteListRels2 = whiteListRels1.join(entities, whiteListRels1.col("to").equalTo(entities.col("id")), "inner").select("from", "to"); + Dataset whiteListRels2 = whiteListRels1 + .join(entities, whiteListRels1.col("to").equalTo(entities.col("id")), "inner") + .select("from", "to"); // Dataset> whiteListRels1 = whiteListRels // .joinWith(entities, whiteListRels.col("_1").equalTo(entities.col("_1")), "inner") diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/DFMapDocumentUtils.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/DFMapDocumentUtils.java deleted file mode 100644 index b4ecaaea0..000000000 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/DFMapDocumentUtils.java +++ /dev/null @@ -1,88 +0,0 @@ -package eu.dnetlib.dhp.oa.dedup.model; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.jayway.jsonpath.Configuration; -import com.jayway.jsonpath.DocumentContext; -import com.jayway.jsonpath.JsonPath; -import com.jayway.jsonpath.Option; -import eu.dnetlib.dhp.oa.dedup.DedupUtility; -import eu.dnetlib.dhp.oa.dedup.SparkReporter; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.config.Type; -import 
eu.dnetlib.pace.model.*; -import eu.dnetlib.pace.util.BlockProcessor; -import eu.dnetlib.pace.util.MapDocumentUtil; -import net.minidev.json.JSONArray; -import org.apache.commons.compress.utils.Lists; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.apache.spark.sql.api.java.UDF1; -import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema; -import org.apache.spark.sql.types.*; -import org.apache.spark.util.LongAccumulator; -import scala.Tuple2; -import scala.collection.JavaConverters; -import scala.collection.mutable.WrappedArray; - -import java.util.*; -import java.util.function.Predicate; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -public class DFMapDocumentUtils extends MapDocumentUtil { - public static final Pattern URL_REGEX = Pattern.compile("^\\s*(http|https|ftp)\\://.*"); - - public static final Pattern CONCAT_REGEX = Pattern.compile("\\|\\|\\|"); - public static Predicate urlFilter = s -> URL_REGEX.matcher(s).matches(); - - public static String getJPathString(final String jsonPath, final DocumentContext json) { - try { - Object o = json.read(jsonPath); - if (o instanceof String) - return (String) o; - if (o instanceof JSONArray && ((JSONArray) o).size() > 0) - return (String) ((JSONArray) o).get(0); - return ""; - } catch (Exception e) { - return ""; - } - } - - public static List getJPathList(String path, DocumentContext json, Type type) { - // if (type == Type.List) - // return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path); - Object jresult; - List result = new ArrayList<>(); - try { - jresult = json.read(path); - } catch (Throwable e) { - return result; - } - - if (jresult instanceof JSONArray) { - ((JSONArray) jresult).forEach(it -> { - try { - result.add(new ObjectMapper().writeValueAsString(it)); - } catch (JsonProcessingException e) { - - } - } - ); - return result; - } - - 
if (jresult instanceof LinkedHashMap) { - try { - result.add(new ObjectMapper().writeValueAsString(jresult)); - } catch (JsonProcessingException e) { - - } - return result; - } - if (jresult instanceof String) { - result.add((String) jresult); - } - return result; - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/RandomIdGenerator.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/RandomIdGenerator.java deleted file mode 100644 index 6c3a09d13..000000000 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/RandomIdGenerator.java +++ /dev/null @@ -1,48 +0,0 @@ -// -// Source code recreated from a .class file by IntelliJ IDEA -// (powered by FernFlower decompiler) -// -import io.opentelemetry.api.trace.SpanId; -import io.opentelemetry.api.trace.TraceId; -import io.opentelemetry.sdk.internal.RandomSupplier; -import io.opentelemetry.sdk.trace.IdGenerator; - -import java.util.Random; -import java.util.function.Supplier; - -enum RandomIdGenerator implements IdGenerator { - INSTANCE; - - private static final long INVALID_ID = 0L; - private static final Supplier randomSupplier = RandomSupplier.platformDefault(); - - private RandomIdGenerator() { - } - - public String generateSpanId() { - Random random = (Random)randomSupplier.get(); - - long id; - do { - id = random.nextLong(); - } while(id == 0L); - - return SpanId.fromLong(id); - } - - public String generateTraceId() { - Random random = (Random)randomSupplier.get(); - long idHi = random.nextLong(); - - long idLo; - do { - idLo = random.nextLong(); - } while(idLo == 0L); - - return TraceId.fromLongs(idHi, idLo); - } - - public String toString() { - return "RandomIdGenerator{}"; - } -} diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDSLExampleTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDSLExampleTest.java index 028db8afe..199bde822 100644 --- 
a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDSLExampleTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDSLExampleTest.java @@ -1,9 +1,17 @@ package eu.dnetlib.dhp.oa.dedup; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import static java.nio.file.Files.createTempDirectory; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.lenient; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.net.URISyntaxException; +import java.nio.file.Paths; + import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; @@ -18,15 +26,9 @@ import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; -import java.io.File; -import java.io.IOException; -import java.io.Serializable; -import java.net.URISyntaxException; -import java.nio.file.Paths; - -import static java.nio.file.Files.createTempDirectory; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.mockito.Mockito.lenient; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @ExtendWith(MockitoExtension.class) public class SparkDSLExampleTest implements Serializable { diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index 2f293d70a..dc9f08c73 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ 
-133,11 +133,13 @@ public class SparkDedupTest implements Serializable { .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct"))) .thenReturn(classPathResourceAsString("/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json")); } + @Test @Order(1) void createSimRelsTestTwice() throws Exception { createSimRelsTest(); } + @Test @Order(1) void createSimRelsTest() throws Exception { @@ -182,7 +184,6 @@ public class SparkDedupTest implements Serializable { .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "otherresearchproduct")) .count(); - System.out.println("orgs_simrel = " + orgs_simrel); System.out.println("pubs_simrel = " + pubs_simrel); System.out.println("sw_simrel = " + sw_simrel);