From 801da2fd4ae834d5d16de567e8cb30c9f1798786 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Thu, 6 Jul 2023 10:28:53 +0200 Subject: [PATCH] New sources formatted by maven plugin --- .../AbstractClusteringFunction.java | 38 +- .../eu/dnetlib/pace/clustering/Acronyms.java | 12 +- .../BlacklistAwareClusteringCombiner.java | 79 +- .../pace/clustering/ClusteringClass.java | 5 +- .../pace/clustering/ClusteringCombiner.java | 40 +- .../pace/clustering/ClusteringFunction.java | 5 +- .../pace/clustering/ImmutableFieldValue.java | 2 + .../pace/clustering/KeywordsClustering.java | 82 +- .../pace/clustering/LastNameFirstInitial.java | 124 +- .../pace/clustering/LowercaseClustering.java | 9 +- .../dnetlib/pace/clustering/NGramUtils.java | 8 +- .../dnetlib/pace/clustering/NgramPairs.java | 6 +- .../eu/dnetlib/pace/clustering/Ngrams.java | 7 +- .../pace/clustering/PersonClustering.java | 20 +- .../dnetlib/pace/clustering/PersonHash.java | 4 +- .../clustering/RandomClusteringFunction.java | 5 +- .../pace/clustering/SortedNgramPairs.java | 6 +- .../clustering/SpaceTrimmingFieldValue.java | 9 +- .../dnetlib/pace/clustering/SuffixPrefix.java | 4 +- .../pace/clustering/UrlClustering.java | 70 +- .../WordsStatsSuffixPrefixChain.java | 129 +- .../pace/clustering/WordsSuffixPrefix.java | 76 +- .../pace/common/AbstractPaceFunctions.java | 684 +-- .../java/eu/dnetlib/pace/config/Config.java | 2 +- .../eu/dnetlib/pace/config/DedupConfig.java | 50 +- .../eu/dnetlib/pace/config/PaceConfig.java | 23 +- .../java/eu/dnetlib/pace/config/Type.java | 1 + .../java/eu/dnetlib/pace/config/WfConfig.java | 28 +- .../eu/dnetlib/pace/model/AbstractField.java | 8 +- .../eu/dnetlib/pace/model/ClusteringDef.java | 14 +- .../java/eu/dnetlib/pace/model/Document.java | 1 + .../java/eu/dnetlib/pace/model/Field.java | 5 +- .../java/eu/dnetlib/pace/model/FieldDef.java | 27 +- .../java/eu/dnetlib/pace/model/FieldList.java | 1 + .../eu/dnetlib/pace/model/FieldListImpl.java | 69 +- 
.../eu/dnetlib/pace/model/FieldValue.java | 1 + .../eu/dnetlib/pace/model/FieldValueImpl.java | 25 +- .../eu/dnetlib/pace/model/MapDocument.java | 6 +- .../pace/model/MapDocumentComparator.java | 8 +- .../pace/model/MapDocumentSerializer.java | 4 +- .../java/eu/dnetlib/pace/model/Person.java | 3 +- .../pace/model/PersonComparatorUtils.java | 7 +- .../eu/dnetlib/pace/tree/AlwaysMatch.java | 49 +- .../eu/dnetlib/pace/tree/AuthorsMatch.java | 245 +- .../java/eu/dnetlib/pace/tree/CityMatch.java | 59 +- .../dnetlib/pace/tree/CosineSimilarity.java | 60 +- .../eu/dnetlib/pace/tree/DoiExactMatch.java | 17 +- .../dnetlib/pace/tree/DomainExactMatch.java | 35 +- .../java/eu/dnetlib/pace/tree/ExactMatch.java | 54 +- .../pace/tree/ExactMatchIgnoreCase.java | 35 +- .../dnetlib/pace/tree/InstanceTypeMatch.java | 113 +- .../eu/dnetlib/pace/tree/JaroWinkler.java | 54 +- .../pace/tree/JaroWinklerNormalizedName.java | 100 +- .../dnetlib/pace/tree/JaroWinklerTitle.java | 15 +- .../eu/dnetlib/pace/tree/JsonListMatch.java | 106 +- .../eu/dnetlib/pace/tree/KeywordMatch.java | 61 +- .../dnetlib/pace/tree/Level2JaroWinkler.java | 8 +- .../pace/tree/Level2JaroWinklerTitle.java | 13 +- .../dnetlib/pace/tree/Level2Levenstein.java | 8 +- .../java/eu/dnetlib/pace/tree/Levenstein.java | 8 +- .../eu/dnetlib/pace/tree/LevensteinTitle.java | 17 +- .../tree/LevensteinTitleIgnoreVersion.java | 11 +- .../dnetlib/pace/tree/ListContainsMatch.java | 101 +- .../eu/dnetlib/pace/tree/MustBeDifferent.java | 10 +- .../dnetlib/pace/tree/NullDistanceAlgo.java | 7 +- .../dnetlib/pace/tree/NumbersComparator.java | 37 +- .../eu/dnetlib/pace/tree/NumbersMatch.java | 40 +- .../eu/dnetlib/pace/tree/RomansMatch.java | 38 +- .../java/eu/dnetlib/pace/tree/SizeMatch.java | 51 +- .../dnetlib/pace/tree/SortedJaroWinkler.java | 10 +- .../pace/tree/SortedLevel2JaroWinkler.java | 10 +- .../pace/tree/StringContainsMatch.java | 87 +- .../eu/dnetlib/pace/tree/StringListMatch.java | 73 +- .../pace/tree/SubStringLevenstein.java | 
17 +- .../dnetlib/pace/tree/TitleVersionMatch.java | 33 +- .../java/eu/dnetlib/pace/tree/UrlMatcher.java | 80 +- .../java/eu/dnetlib/pace/tree/YearMatch.java | 58 +- .../pace/tree/support/AbstractComparator.java | 196 +- .../support/AbstractSortedComparator.java | 54 +- .../eu/dnetlib/pace/tree/support/AggType.java | 27 +- .../dnetlib/pace/tree/support/Comparator.java | 10 +- .../pace/tree/support/ComparatorClass.java | 3 +- .../dnetlib/pace/tree/support/FieldConf.java | 116 +- .../dnetlib/pace/tree/support/FieldStats.java | 127 +- .../dnetlib/pace/tree/support/MatchType.java | 20 +- .../pace/tree/support/TreeNodeDef.java | 228 +- .../pace/tree/support/TreeNodeStats.java | 218 +- .../pace/tree/support/TreeProcessor.java | 39 +- .../dnetlib/pace/tree/support/TreeStats.java | 69 +- .../eu/dnetlib/pace/util/BlockProcessor.java | 299 +- .../pace/util/BlockProcessorForTesting.java | 386 +- .../java/eu/dnetlib/pace/util/Capitalise.java | 15 +- .../eu/dnetlib/pace/util/DiffPatchMatch.java | 4925 +++++++++-------- .../dnetlib/pace/util/DotAbbreviations.java | 3 +- .../eu/dnetlib/pace/util/MapDocumentUtil.java | 293 +- .../eu/dnetlib/pace/util/PaceException.java | 13 +- .../eu/dnetlib/pace/util/PaceResolver.java | 76 +- .../java/eu/dnetlib/pace/util/Reporter.java | 6 +- .../eu/dnetlib/pace/AbstractPaceTest.java | 23 +- .../clustering/ClusteringFunctionTest.java | 33 +- .../dnetlib/pace/common/PaceFunctionTest.java | 73 +- .../pace/comparators/ComparatorTest.java | 104 +- .../eu/dnetlib/pace/config/ConfigTest.java | 45 +- .../java/eu/dnetlib/pace/util/UtilTest.java | 47 +- .../doiboost/SparkGenerateDoiBoost.scala | 2 +- 105 files changed, 5610 insertions(+), 5267 deletions(-) diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java index 01f146120..e984f5d18 100644 --- 
a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java @@ -1,9 +1,5 @@ -package eu.dnetlib.pace.clustering; -import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.clustering; import java.util.Collection; import java.util.HashSet; @@ -11,33 +7,41 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; + public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction { protected Map params; - + public AbstractClusteringFunction(final Map params) { this.params = params; } protected abstract Collection doApply(Config conf, String s); - + @Override public Collection apply(Config conf, List fields) { - return fields.stream().filter(f -> !f.isEmpty()) - .map(Field::stringValue) - .map(this::normalize) - .map(s -> filterAllStopWords(s)) - .map(s -> doApply(conf, s)) - .map(c -> filterBlacklisted(c, ngramBlacklist)) - .flatMap(c -> c.stream()) - .filter(StringUtils::isNotBlank) - .collect(Collectors.toCollection(HashSet::new)); + return fields + .stream() + .filter(f -> !f.isEmpty()) + .map(Field::stringValue) + .map(this::normalize) + .map(s -> filterAllStopWords(s)) + .map(s -> doApply(conf, s)) + .map(c -> filterBlacklisted(c, ngramBlacklist)) + .flatMap(c -> c.stream()) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toCollection(HashSet::new)); } public Map getParams() { return params; } - + protected Integer param(String name) { return params.get(name); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java 
b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java index d3008332d..9072fbb4b 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; @@ -6,6 +7,7 @@ import java.util.Set; import java.util.StringTokenizer; import com.google.common.collect.Sets; + import eu.dnetlib.pace.config.Config; @ClusteringClass("acronyms") @@ -19,16 +21,16 @@ public class Acronyms extends AbstractClusteringFunction { protected Collection doApply(Config conf, String s) { return extractAcronyms(s, param("max"), param("minLen"), param("maxLen")); } - + private Set extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) { - + final Set acronyms = Sets.newLinkedHashSet(); - + for (int i = 0; i < maxAcronyms; i++) { - + final StringTokenizer st = new StringTokenizer(s); final StringBuilder sb = new StringBuilder(); - + while (st.hasMoreTokens()) { final String token = st.nextToken(); if (sb.length() > maxLen) { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java index 79a264a49..f0e93b8ba 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java @@ -1,11 +1,5 @@ -package eu.dnetlib.pace.clustering; -import com.google.common.collect.Maps; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Document; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldListImpl; -import eu.dnetlib.pace.model.MapDocument; +package eu.dnetlib.pace.clustering; import java.util.Collection; import java.util.List; @@ -13,47 +7,54 @@ import java.util.Map; import 
java.util.Map.Entry; import java.util.regex.Pattern; +import com.google.common.collect.Maps; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Document; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldListImpl; +import eu.dnetlib.pace.model.MapDocument; + public class BlacklistAwareClusteringCombiner extends ClusteringCombiner { - public static Collection filterAndCombine(final MapDocument a, final Config conf) { - Document filtered = filter(a, conf.blacklists()); - return combine(filtered, conf); - } + public static Collection filterAndCombine(final MapDocument a, final Config conf) { + Document filtered = filter(a, conf.blacklists()); + return combine(filtered, conf); + } - private static MapDocument filter(final MapDocument a, final Map> blacklists) { - if (blacklists == null || blacklists.isEmpty()) { - return a; - } + private static MapDocument filter(final MapDocument a, final Map> blacklists) { + if (blacklists == null || blacklists.isEmpty()) { + return a; + } - final Map filtered = Maps.newHashMap(a.getFieldMap()); + final Map filtered = Maps.newHashMap(a.getFieldMap()); - for (final Entry> e : blacklists.entrySet()) { - Field fields = a.getFieldMap().get(e.getKey()); - if (fields != null) { - final FieldListImpl fl = new FieldListImpl(); + for (final Entry> e : blacklists.entrySet()) { + Field fields = a.getFieldMap().get(e.getKey()); + if (fields != null) { + final FieldListImpl fl = new FieldListImpl(); - for (Field f : fields) { - if (!isBlackListed(f.stringValue(), e.getValue())) { - fl.add(f); - } - } + for (Field f : fields) { + if (!isBlackListed(f.stringValue(), e.getValue())) { + fl.add(f); + } + } - filtered.put(e.getKey(), fl); - } - } + filtered.put(e.getKey(), fl); + } + } - return new MapDocument(a.getIdentifier(), filtered); - } + return new MapDocument(a.getIdentifier(), filtered); + } - private static boolean isBlackListed(String value, List blacklist) { - for (Pattern pattern : blacklist) { - 
if (pattern.matcher(value).matches()) { - return true; - } - } + private static boolean isBlackListed(String value, List blacklist) { + for (Pattern pattern : blacklist) { + if (pattern.matcher(value).matches()) { + return true; + } + } - return false; - } + return false; + } } - diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java index e67767171..3bb845b15 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.lang.annotation.ElementType; @@ -9,5 +10,5 @@ import java.lang.annotation.Target; @Target(ElementType.TYPE) public @interface ClusteringClass { - public String value(); -} \ No newline at end of file + public String value(); +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java index 037476289..3a6f17e20 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.ArrayList; @@ -5,6 +6,8 @@ import java.util.Collection; import java.util.List; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; + import com.google.common.collect.Sets; import eu.dnetlib.pace.config.Config; @@ -12,12 +15,11 @@ import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.Document; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldValueImpl; -import org.apache.commons.lang3.StringUtils; public class ClusteringCombiner { private static String SEPARATOR = ":"; - private static String COLLAPSE_ON= "collapseOn"; + private static 
String COLLAPSE_ON = "collapseOn"; public static Collection combine(final Document a, final Config conf) { final Collection res = Sets.newLinkedHashSet(); @@ -30,31 +32,33 @@ public class ClusteringCombiner { if (values instanceof FieldValueImpl) { fields.add(values); - } - else { + } else { fields.addAll((List) values); } - res.addAll( - cd.clusteringFunction() - .apply(conf, fields) - .stream() - .map(k -> prefix + SEPARATOR +k) - .collect(Collectors.toList()) - ); + res + .addAll( + cd + .clusteringFunction() + .apply(conf, fields) + .stream() + .map(k -> prefix + SEPARATOR + k) + .collect(Collectors.toList())); } } return res; } private static String getPrefix(ClusteringDef cd, String fieldName) { - return cd.getName()+ SEPARATOR + - cd.getParams().keySet() - .stream() - .filter(k -> k.contains(COLLAPSE_ON)) - .findFirst() - .map(k -> StringUtils.substringAfter(k, SEPARATOR)) - .orElse(fieldName); + return cd.getName() + SEPARATOR + + cd + .getParams() + .keySet() + .stream() + .filter(k -> k.contains(COLLAPSE_ON)) + .findFirst() + .map(k -> StringUtils.substringAfter(k, SEPARATOR)) + .orElse(fieldName); } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java index 0554d27a1..e72535160 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; @@ -8,9 +9,9 @@ import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; public interface ClusteringFunction { - + public Collection apply(Config config, List fields); - + public Map getParams(); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java index 
7f342f69c..bc8844aee 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; @@ -5,6 +6,7 @@ import java.util.List; import java.util.Map; import com.google.common.collect.Lists; + import eu.dnetlib.pace.config.Config; @ClusteringClass("immutablefieldvalue") diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java index e67275c4e..60861aafd 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java @@ -1,53 +1,57 @@ -package eu.dnetlib.pace.clustering; -import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.clustering; import java.util.*; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; + @ClusteringClass("keywordsclustering") public class KeywordsClustering extends AbstractClusteringFunction { - public KeywordsClustering(Map params) { - super(params); - } + public KeywordsClustering(Map params) { + super(params); + } - @Override - protected Collection doApply(final Config conf, String s) { + @Override + protected Collection doApply(final Config conf, String s) { - //takes city codes and keywords codes without duplicates - Set keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4)); - Set cities = getCities(s, params.getOrDefault("windowSize", 4)); + // takes city codes and keywords codes without 
duplicates + Set keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4)); + Set cities = getCities(s, params.getOrDefault("windowSize", 4)); - //list of combination to return as result - final Collection combinations = new LinkedHashSet(); + // list of combination to return as result + final Collection combinations = new LinkedHashSet(); - for (String keyword: keywordsToCodes(keywords, conf.translationMap())){ - for (String city: citiesToCodes(cities)) { - combinations.add(keyword+"-"+city); - if (combinations.size()>=params.getOrDefault("max", 2)) { - return combinations; - } - } - } + for (String keyword : keywordsToCodes(keywords, conf.translationMap())) { + for (String city : citiesToCodes(cities)) { + combinations.add(keyword + "-" + city); + if (combinations.size() >= params.getOrDefault("max", 2)) { + return combinations; + } + } + } - return combinations; - } + return combinations; + } - @Override - public Collection apply(final Config conf, List fields) { - return fields.stream().filter(f -> !f.isEmpty()) - .map(Field::stringValue) - .map(this::cleanup) - .map(this::normalize) - .map(s -> filterAllStopWords(s)) - .map(s -> doApply(conf, s)) - .map(c -> filterBlacklisted(c, ngramBlacklist)) - .flatMap(c -> c.stream()) - .filter(StringUtils::isNotBlank) - .collect(Collectors.toCollection(HashSet::new)); - } -} \ No newline at end of file + @Override + public Collection apply(final Config conf, List fields) { + return fields + .stream() + .filter(f -> !f.isEmpty()) + .map(Field::stringValue) + .map(this::cleanup) + .map(this::normalize) + .map(s -> filterAllStopWords(s)) + .map(s -> doApply(conf, s)) + .map(c -> filterBlacklisted(c, ngramBlacklist)) + .flatMap(c -> c.stream()) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toCollection(HashSet::new)); + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java 
b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java index 7f86854c2..dc6f8f775 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java @@ -1,77 +1,81 @@ -package eu.dnetlib.pace.clustering; -import com.google.common.collect.Lists; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.Person; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.clustering; import java.util.*; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; + +import com.google.common.collect.Lists; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.Person; + @ClusteringClass("lnfi") -public class LastNameFirstInitial extends AbstractClusteringFunction{ +public class LastNameFirstInitial extends AbstractClusteringFunction { - private boolean DEFAULT_AGGRESSIVE = true; + private boolean DEFAULT_AGGRESSIVE = true; - public LastNameFirstInitial(final Map params) { - super(params); - } + public LastNameFirstInitial(final Map params) { + super(params); + } - @Override - public Collection apply(Config conf, List fields) { - return fields.stream().filter(f -> !f.isEmpty()) - .map(Field::stringValue) - .map(this::normalize) - .map(s -> doApply(conf, s)) - .map(c -> filterBlacklisted(c, ngramBlacklist)) - .flatMap(c -> c.stream()) - .filter(StringUtils::isNotBlank) - .collect(Collectors.toCollection(HashSet::new)); - } + @Override + public Collection apply(Config conf, List fields) { + return fields + .stream() + .filter(f -> !f.isEmpty()) + .map(Field::stringValue) + .map(this::normalize) + .map(s -> doApply(conf, s)) + .map(c -> filterBlacklisted(c, ngramBlacklist)) + .flatMap(c -> c.stream()) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toCollection(HashSet::new)); + } - @Override - 
protected String normalize(final String s) { - return fixAliases(transliterate(nfd(unicodeNormalization(s)))) - // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings - .replaceAll("[^ \\w]+", "") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim(); - } + @Override + protected String normalize(final String s) { + return fixAliases(transliterate(nfd(unicodeNormalization(s)))) + // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input + // strings + .replaceAll("[^ \\w]+", "") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } - @Override - protected Collection doApply(final Config conf, final String s) { + @Override + protected Collection doApply(final Config conf, final String s) { - final List res = Lists.newArrayList(); + final List res = Lists.newArrayList(); - final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE); + final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? 
getParams().get("aggressive") + : DEFAULT_AGGRESSIVE); - Person p = new Person(s, aggressive); + Person p = new Person(s, aggressive); - if (p.isAccurate()) { - String lastName = p.getNormalisedSurname().toLowerCase(); - String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1); + if (p.isAccurate()) { + String lastName = p.getNormalisedSurname().toLowerCase(); + String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0, 1); - res.add(firstInitial.concat(lastName)); - } - else { // is not accurate, meaning it has no defined name and surname - List fullname = Arrays.asList(p.getNormalisedFullname().split(" ")); - if (fullname.size() == 1) { - res.add(p.getNormalisedFullname().toLowerCase()); - } - else if (fullname.size() == 2) { - res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase()); - res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase()); - } - else { - res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase()); - res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase()); - } - } + res.add(firstInitial.concat(lastName)); + } else { // is not accurate, meaning it has no defined name and surname + List fullname = Arrays.asList(p.getNormalisedFullname().split(" ")); + if (fullname.size() == 1) { + res.add(p.getNormalisedFullname().toLowerCase()); + } else if (fullname.size() == 2) { + res.add(fullname.get(0).substring(0, 1).concat(fullname.get(1)).toLowerCase()); + res.add(fullname.get(1).substring(0, 1).concat(fullname.get(0)).toLowerCase()); + } else { + res.add(fullname.get(0).substring(0, 1).concat(fullname.get(fullname.size() - 1)).toLowerCase()); + res.add(fullname.get(fullname.size() - 1).substring(0, 1).concat(fullname.get(0)).toLowerCase()); + } + } - return res; - } -} \ No newline at end of file + return res; + } +} diff --git 
a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java index 309650f73..403d187fa 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java @@ -1,14 +1,17 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; import java.util.List; import java.util.Map; +import org.apache.commons.lang3.StringUtils; + import com.google.common.collect.Lists; import com.google.common.collect.Sets; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; -import org.apache.commons.lang3.StringUtils; @ClusteringClass("lowercase") public class LowercaseClustering extends AbstractClusteringFunction { @@ -20,7 +23,7 @@ public class LowercaseClustering extends AbstractClusteringFunction { @Override public Collection apply(Config conf, List fields) { Collection c = Sets.newLinkedHashSet(); - for(Field f : fields) { + for (Field f : fields) { c.addAll(doApply(conf, f.stringValue())); } return c; @@ -28,7 +31,7 @@ public class LowercaseClustering extends AbstractClusteringFunction { @Override protected Collection doApply(final Config conf, final String s) { - if(StringUtils.isBlank(s)) { + if (StringUtils.isBlank(s)) { return Lists.newArrayList(); } return Lists.newArrayList(s.toLowerCase().trim()); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java index 2391685b0..aa12f1279 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.Set; @@ -10,11 +11,14 @@ public class NGramUtils extends AbstractPaceFunctions { private static final int SIZE = 100; - private 
static Set stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); + private static Set stopwords = AbstractPaceFunctions + .loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); public static String cleanupForOrdering(String s) { NGramUtils utils = new NGramUtils(); - return (utils.filterStopWords(utils.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE)).substring(0, SIZE).replaceAll(" ", ""); + return (utils.filterStopWords(utils.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE)) + .substring(0, SIZE) + .replaceAll(" ", ""); } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java index baa30d747..0656312c7 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; @@ -6,6 +7,7 @@ import java.util.List; import java.util.Map; import com.google.common.collect.Lists; + import eu.dnetlib.pace.config.Config; @ClusteringClass("ngrampairs") @@ -14,7 +16,7 @@ public class NgramPairs extends Ngrams { public NgramPairs(Map params) { super(params); } - + @Override protected Collection doApply(Config conf, String s) { return ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max")); @@ -28,7 +30,7 @@ public class NgramPairs extends Ngrams { break; } res.add(ngrams.get(i) + ngrams.get(j)); - //System.out.println("-- " + concatNgrams); + // System.out.println("-- " + concatNgrams); } return res; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java index 214b1451f..bcc10a869 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java +++ 
b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java @@ -1,9 +1,10 @@ + package eu.dnetlib.pace.clustering; -import eu.dnetlib.pace.config.Config; - import java.util.*; +import eu.dnetlib.pace.config.Config; + @ClusteringClass("ngrams") public class Ngrams extends AbstractClusteringFunction { @@ -36,7 +37,7 @@ public class Ngrams extends AbstractClusteringFunction { } } } - //System.out.println(ngrams + " n: " + ngrams.size()); + // System.out.println(ngrams + " n: " + ngrams.size()); return ngrams; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java index db8d90bce..83b92f22c 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java @@ -1,17 +1,20 @@ -package eu.dnetlib.pace.clustering; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.Person; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.clustering; import java.util.Collection; import java.util.List; import java.util.Map; import java.util.Set; +import org.apache.commons.lang3.StringUtils; + +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.Person; + @ClusteringClass("personClustering") public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction { @@ -31,7 +34,8 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin final Person person = new Person(f.stringValue(), false); - if (StringUtils.isNotBlank(person.getNormalisedFirstName()) && StringUtils.isNotBlank(person.getNormalisedSurname())) 
{ + if (StringUtils.isNotBlank(person.getNormalisedFirstName()) + && StringUtils.isNotBlank(person.getNormalisedSurname())) { hashes.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase()); } else { for (final String token1 : tokens(f.stringValue(), MAX_TOKENS)) { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java index f6c4fe07f..a3d58a9be 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; @@ -22,7 +23,8 @@ public class PersonHash extends AbstractClusteringFunction { protected Collection doApply(final Config conf, final String s) { final List res = Lists.newArrayList(); - final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE); + final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? 
getParams().get("aggressive") + : DEFAULT_AGGRESSIVE); res.add(new Person(s, aggressive).hash()); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java index 86a2e4e4f..2aab926da 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java @@ -1,10 +1,11 @@ -package eu.dnetlib.pace.clustering; -import eu.dnetlib.pace.config.Config; +package eu.dnetlib.pace.clustering; import java.util.Collection; import java.util.Map; +import eu.dnetlib.pace.config.Config; + public class RandomClusteringFunction extends AbstractClusteringFunction { public RandomClusteringFunction(Map params) { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java index 55b203d7a..1fc9f1747 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.*; @@ -5,6 +6,7 @@ import java.util.*; import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Lists; + import eu.dnetlib.pace.config.Config; @ClusteringClass("sortedngrampairs") @@ -21,7 +23,9 @@ public class SortedNgramPairs extends NgramPairs { Collections.sort(tokens); - return ngramPairs(Lists.newArrayList(getNgrams(Joiner.on(" ").join(tokens), param("ngramLen"), param("max") * 2, 1, 2)), param("max")); + return ngramPairs( + Lists.newArrayList(getNgrams(Joiner.on(" ").join(tokens), param("ngramLen"), param("max") * 2, 1, 2)), + param("max")); } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java 
b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java index 50cea4db3..392aecc79 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java @@ -1,15 +1,17 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; import java.util.List; import java.util.Map; -import eu.dnetlib.pace.config.Config; import org.apache.commons.lang3.RandomStringUtils; import org.apache.commons.lang3.StringUtils; import com.google.common.collect.Lists; +import eu.dnetlib.pace.config.Config; + @ClusteringClass("spacetrimmingfieldvalue") public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { @@ -21,7 +23,10 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { protected Collection doApply(final Config conf, final String s) { final List res = Lists.newArrayList(); - res.add(StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", "")); + res + .add( + StringUtils.isBlank(s) ? 
RandomStringUtils.random(getParams().get("randomLength")) + : s.toLowerCase().replaceAll("\\s+", "")); return res; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java index fa1f64362..2a1c023a9 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; @@ -5,6 +6,7 @@ import java.util.Map; import java.util.Set; import com.google.common.collect.Sets; + import eu.dnetlib.pace.config.Config; @ClusteringClass("suffixprefix") @@ -18,7 +20,7 @@ public class SuffixPrefix extends AbstractClusteringFunction { protected Collection doApply(Config conf, String s) { return suffixPrefix(s, param("len"), param("max")); } - + private Collection suffixPrefix(String s, int len, int max) { final Set bigrams = Sets.newLinkedHashSet(); int i = 0; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java index feb60a221..122e01179 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java @@ -1,8 +1,5 @@ -package eu.dnetlib.pace.clustering; -import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; +package eu.dnetlib.pace.clustering; import java.net.MalformedURLException; import java.net.URL; @@ -12,43 +9,46 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; + @ClusteringClass("urlclustering") public class UrlClustering extends AbstractPaceFunctions 
implements ClusteringFunction { - protected Map params; + protected Map params; - public UrlClustering(final Map params) { - this.params = params; - } + public UrlClustering(final Map params) { + this.params = params; + } - @Override - public Collection apply(final Config conf, List fields) { - try { - return fields.stream() - .filter(f -> !f.isEmpty()) - .map(Field::stringValue) - .map(this::asUrl) - .map(URL::getHost) - .collect(Collectors.toCollection(HashSet::new)); - } - catch (IllegalStateException e){ - return new HashSet<>(); - } - } + @Override + public Collection apply(final Config conf, List fields) { + try { + return fields + .stream() + .filter(f -> !f.isEmpty()) + .map(Field::stringValue) + .map(this::asUrl) + .map(URL::getHost) + .collect(Collectors.toCollection(HashSet::new)); + } catch (IllegalStateException e) { + return new HashSet<>(); + } + } - @Override - public Map getParams() { - return null; - } - - private URL asUrl(String value) { - try { - return new URL(value); - } catch (MalformedURLException e) { - // should not happen as checked by pace typing - throw new IllegalStateException("invalid URL: " + value); - } - } + @Override + public Map getParams() { + return null; + } + private URL asUrl(String value) { + try { + return new URL(value); + } catch (MalformedURLException e) { + // should not happen as checked by pace typing + throw new IllegalStateException("invalid URL: " + value); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java index 6fa2668fa..c8e02f8f0 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java @@ -1,90 +1,91 @@ -package eu.dnetlib.pace.clustering; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.config.Config; +package 
eu.dnetlib.pace.clustering; import java.util.*; import java.util.stream.Collectors; +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.config.Config; + @ClusteringClass("wordsStatsSuffixPrefixChain") public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction { - public WordsStatsSuffixPrefixChain(Map params) { - super(params); - } + public WordsStatsSuffixPrefixChain(Map params) { + super(params); + } - @Override - protected Collection doApply(Config conf, String s) { - return suffixPrefixChain(s, param("mod")); - } + @Override + protected Collection doApply(Config conf, String s) { + return suffixPrefixChain(s, param("mod")); + } - private Collection suffixPrefixChain(String s, int mod) { + private Collection suffixPrefixChain(String s, int mod) { - //create the list of words from the string (remove short words) - List wordsList = - Arrays.stream(s.split(" ")) - .filter(si -> si.length() > 3) - .collect(Collectors.toList()); + // create the list of words from the string (remove short words) + List wordsList = Arrays + .stream(s.split(" ")) + .filter(si -> si.length() > 3) + .collect(Collectors.toList()); - final int words = wordsList.size(); - final int letters = s.length(); + final int words = wordsList.size(); + final int letters = s.length(); - //create the prefix: number of words + number of letters/mod - String prefix = words + "-" + letters/mod + "-"; + // create the prefix: number of words + number of letters/mod + String prefix = words + "-" + letters / mod + "-"; - return doSuffixPrefixChain(wordsList, prefix); + return doSuffixPrefixChain(wordsList, prefix); - } + } - private Collection doSuffixPrefixChain(List wordsList, String prefix) { + private Collection doSuffixPrefixChain(List wordsList, String prefix) { - Set set = Sets.newLinkedHashSet(); - switch(wordsList.size()){ - case 0: - case 1: - break; - case 2: - set.add( - prefix + - suffix(wordsList.get(0), 3) + - prefix(wordsList.get(1), 3) - ); + Set set = 
Sets.newLinkedHashSet(); + switch (wordsList.size()) { + case 0: + case 1: + break; + case 2: + set + .add( + prefix + + suffix(wordsList.get(0), 3) + + prefix(wordsList.get(1), 3)); - set.add( - prefix + - prefix(wordsList.get(0), 3) + - suffix(wordsList.get(1), 3) - ); + set + .add( + prefix + + prefix(wordsList.get(0), 3) + + suffix(wordsList.get(1), 3)); - break; - default: - set.add( - prefix + - suffix(wordsList.get(0), 3) + - prefix(wordsList.get(1), 3) + - suffix(wordsList.get(2), 3) - ); + break; + default: + set + .add( + prefix + + suffix(wordsList.get(0), 3) + + prefix(wordsList.get(1), 3) + + suffix(wordsList.get(2), 3)); - set.add( - prefix + - prefix(wordsList.get(0), 3) + - suffix(wordsList.get(1), 3) + - prefix(wordsList.get(2), 3) - ); - break; - } + set + .add( + prefix + + prefix(wordsList.get(0), 3) + + suffix(wordsList.get(1), 3) + + prefix(wordsList.get(2), 3)); + break; + } - return set; + return set; - } + } + private String suffix(String s, int len) { + return s.substring(s.length() - len); + } - private String suffix(String s, int len) { - return s.substring(s.length()-len); - } - - private String prefix(String s, int len) { - return s.substring(0, len); - } + private String prefix(String s, int len) { + return s.substring(0, len); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java index 1e94b34d2..e606590a5 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.clustering; import java.util.Collection; @@ -5,53 +6,54 @@ import java.util.Map; import java.util.Set; import com.google.common.collect.Sets; + import eu.dnetlib.pace.config.Config; @ClusteringClass("wordssuffixprefix") public class WordsSuffixPrefix extends AbstractClusteringFunction { - public 
WordsSuffixPrefix(Map params) { - super(params); - } + public WordsSuffixPrefix(Map params) { + super(params); + } - @Override - protected Collection doApply(Config conf, String s) { - return suffixPrefix(s, param("len"), param("max")); - } + @Override + protected Collection doApply(Config conf, String s) { + return suffixPrefix(s, param("len"), param("max")); + } - private Collection suffixPrefix(String s, int len, int max) { + private Collection suffixPrefix(String s, int len, int max) { - final int words = s.split(" ").length; + final int words = s.split(" ").length; - // adjust the token length according to the number of words - switch (words) { - case 1: - return Sets.newLinkedHashSet(); - case 2: - return doSuffixPrefix(s, len+2, max, words); - case 3: - return doSuffixPrefix(s, len+1, max, words); - default: - return doSuffixPrefix(s, len, max, words); - } - } + // adjust the token length according to the number of words + switch (words) { + case 1: + return Sets.newLinkedHashSet(); + case 2: + return doSuffixPrefix(s, len + 2, max, words); + case 3: + return doSuffixPrefix(s, len + 1, max, words); + default: + return doSuffixPrefix(s, len, max, words); + } + } - private Collection doSuffixPrefix(String s, int len, int max, int words) { - final Set bigrams = Sets.newLinkedHashSet(); - int i = 0; - while (++i < s.length() && bigrams.size() < max) { - int j = s.indexOf(" ", i); + private Collection doSuffixPrefix(String s, int len, int max, int words) { + final Set bigrams = Sets.newLinkedHashSet(); + int i = 0; + while (++i < s.length() && bigrams.size() < max) { + int j = s.indexOf(" ", i); - int offset = j + len + 1 < s.length() ? j + len + 1 : s.length(); + int offset = j + len + 1 < s.length() ? 
j + len + 1 : s.length(); - if (j - len > 0) { - String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim(); - if (bigram.length() >= 4) { - bigrams.add(words+bigram); - } - } - } - return bigrams; - } + if (j - len > 0) { + String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim(); + if (bigram.length() >= 4) { + bigrams.add(words + bigram); + } + } + } + return bigrams; + } -} \ No newline at end of file +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index bfe9f6220..9902508b8 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -1,16 +1,5 @@ -package eu.dnetlib.pace.common; -import com.google.common.base.Joiner; -import com.google.common.base.Splitter; -import com.google.common.collect.Iterables; -import com.google.common.collect.Sets; -import com.ibm.icu.text.Transliterator; -import eu.dnetlib.pace.clustering.NGramUtils; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; -import eu.dnetlib.pace.model.FieldListImpl; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.common; import java.io.IOException; import java.io.StringWriter; @@ -21,6 +10,20 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; + +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; +import com.ibm.icu.text.Transliterator; + +import eu.dnetlib.pace.clustering.NGramUtils; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.model.FieldListImpl; + 
/** * Set of common functions for the framework * @@ -28,330 +31,337 @@ import java.util.stream.Collectors; */ public abstract class AbstractPaceFunctions { - //city map to be used when translating the city names into codes - private static Map cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); - - //list of stopwords in different languages - protected static Set stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt"); - protected static Set stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); - protected static Set stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt"); - protected static Set stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt"); - protected static Set stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt"); - protected static Set stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt"); - protected static Set stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt"); - - //transliterator - protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng"); - - //blacklist of ngrams: to avoid generic keys - protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt"); - - //html regex for normalization - public final String HTML_REGEX = "<[^>]*>"; - - private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "; - private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń"; - private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn"; - - //doi prefix for normalization - public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; - - private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?"); - - private 
Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})"); - - protected final static FieldList EMPTY_FIELD = new FieldListImpl(); - - protected String concat(final List l) { - return Joiner.on(" ").skipNulls().join(l); - } - - protected String cleanup(final String s) { - - final String s1 = s.replaceAll(HTML_REGEX, ""); - final String s2 = unicodeNormalization(s1.toLowerCase()); - final String s3 = nfd(s2); - final String s4 = fixXML(s3); - final String s5 = s4.replaceAll("([0-9]+)", " $1 "); - final String s6 = transliterate(s5); - final String s7 = fixAliases(s6); - final String s8 = s7.replaceAll("[^\\p{ASCII}]", ""); - final String s9 = s8.replaceAll("[\\p{Punct}]", " "); - final String s10 = s9.replaceAll("\\n", " "); - final String s11 = s10.replaceAll("(?m)\\s+", " "); - final String s12 = s11.trim(); - return s12; - } - - protected String fixXML(final String a){ - - return a.replaceAll("&ndash;", " ") - .replaceAll("&amp;", " ") - .replaceAll("&quot;", " ") - .replaceAll("&minus;", " "); - } - - protected boolean checkNumbers(final String a, final String b) { - final String numbersA = getNumbers(a); - final String numbersB = getNumbers(b); - final String romansA = getRomans(a); - final String romansB = getRomans(b); - return !numbersA.equals(numbersB) || !romansA.equals(romansB); - } - - protected String getRomans(final String s) { - final StringBuilder sb = new StringBuilder(); - for (final String t : s.split(" ")) { - sb.append(isRoman(t) ? t : ""); - } - return sb.toString(); - } - - protected boolean isRoman(final String s) { - return s.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop").equals("qwertyuiop"); - } - - protected String getNumbers(final String s) { - final StringBuilder sb = new StringBuilder(); - for (final String t : s.split(" ")) { - sb.append(isNumber(t) ? 
t : ""); - } - return sb.toString(); - } - - public boolean isNumber(String strNum) { - if (strNum == null) { - return false; - } - return numberPattern.matcher(strNum).matches(); - } - - protected static String fixAliases(final String s) { - final StringBuilder sb = new StringBuilder(); - - s.chars().forEach(ch -> { - final int i = StringUtils.indexOf(aliases_from, ch); - sb.append(i >= 0 ? aliases_to.charAt(i) : (char)ch); - }); - - return sb.toString(); - } - - protected static String transliterate(final String s) { - try { - return transliterator.transliterate(s); - } - catch(Exception e) { - return s; - } - } - - protected String removeSymbols(final String s) { - final StringBuilder sb = new StringBuilder(); - - s.chars().forEach(ch -> { - sb.append(StringUtils.contains(alpha, ch) ? (char)ch : ' '); - }); - - return sb.toString().replaceAll("\\s+", " "); - } - - protected String getFirstValue(final Field values) { - return (values != null) && !Iterables.isEmpty(values) ? Iterables.getFirst(values, EMPTY_FIELD).stringValue() : ""; - } - - protected boolean notNull(final String s) { - return s != null; - } - - protected String normalize(final String s) { - return fixAliases(transliterate(nfd(unicodeNormalization(s)))) - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings - .replaceAll("[^ \\w]+", "") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim(); - } - - public String nfd(final String s) { - return Normalizer.normalize(s, Normalizer.Form.NFD); - } - - public String utf8(final String s) { - byte[] bytes = s.getBytes(StandardCharsets.UTF_8); - return new String(bytes, StandardCharsets.UTF_8); - } - - public String unicodeNormalization(final String s) { - - Matcher m = hexUnicodePattern.matcher(s); - StringBuffer buf = new StringBuffer(s.length()); - while (m.find()) { - 
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16)); - m.appendReplacement(buf, Matcher.quoteReplacement(ch)); - } - m.appendTail(buf); - return buf.toString(); - } - - protected String filterStopWords(final String s, final Set stopwords) { - final StringTokenizer st = new StringTokenizer(s); - final StringBuilder sb = new StringBuilder(); - while (st.hasMoreTokens()) { - final String token = st.nextToken(); - if (!stopwords.contains(token)) { - sb.append(token); - sb.append(" "); - } - } - return sb.toString().trim(); - } - - public String filterAllStopWords(String s) { - - s = filterStopWords(s, stopwords_en); - s = filterStopWords(s, stopwords_de); - s = filterStopWords(s, stopwords_it); - s = filterStopWords(s, stopwords_fr); - s = filterStopWords(s, stopwords_pt); - s = filterStopWords(s, stopwords_es); - s = filterStopWords(s, stopwords_gr); - - return s; - } - - protected Collection filterBlacklisted(final Collection set, final Set ngramBlacklist) { - final Set newset = Sets.newLinkedHashSet(); - for (final String s : set) { - if (!ngramBlacklist.contains(s)) { - newset.add(s); - } - } - return newset; - } - - public static Set loadFromClasspath(final String classpath) { - - Transliterator transliterator = Transliterator.getInstance("Any-Eng"); - - final Set h = Sets.newHashSet(); - try { - for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) { - h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords - } - } catch (final Throwable e) { - return Sets.newHashSet(); - } - return h; - } - - public static Map loadMapFromClasspath(final String classpath) { - - Transliterator transliterator = Transliterator.getInstance("Any-Eng"); - - final Map m = new HashMap<>(); - try { - for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) { - //string is like this: code;word1;word2;word3 - String[] 
line = s.split(";"); - String value = line[0]; - for (int i = 1; i < line.length; i++) { - m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value); - } - } - } catch (final Throwable e) { - return new HashMap<>(); - } - return m; - } - - public String removeKeywords(String s, Set keywords) { - - s = " " + s + " "; - for (String k : keywords) { - s = s.replaceAll(k.toLowerCase(), ""); - } - - return s.trim(); - } - - public double commonElementsPercentage(Set s1, Set s2) { - - double longer = Math.max(s1.size(), s2.size()); - return (double) s1.stream().filter(s2::contains).count() / longer; - } - - //convert the set of keywords to codes - public Set toCodes(Set keywords, Map translationMap) { - return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet()); - } - - public Set keywordsToCodes(Set keywords, Map translationMap) { - return toCodes(keywords, translationMap); - } - - public Set citiesToCodes(Set keywords) { - return toCodes(keywords, cityMap); - } - - protected String firstLC(final String s) { - return StringUtils.substring(s, 0, 1).toLowerCase(); - } - - protected Iterable tokens(final String s, final int maxTokens) { - return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens); - } - - public String normalizePid(String pid) { - return pid.toLowerCase().replaceAll(DOI_PREFIX, ""); - } - - //get the list of keywords into the input string - public Set getKeywords(String s1, Map translationMap, int windowSize) { - - String s = s1; - - List tokens = Arrays.asList(s.toLowerCase().split(" ")); - - Set codes = new HashSet<>(); - - if (tokens.size() < windowSize) - windowSize = tokens.size(); - - int length = windowSize; - - while (length != 0) { - - for (int i = 0; i <= tokens.size() - length; i++) { - String candidate = concat(tokens.subList(i, i + length)); - if (translationMap.containsKey(candidate)) { - codes.add(candidate); - s = s.replace(candidate, "").trim(); - } - } - - 
tokens = Arrays.asList(s.split(" ")); - length -= 1; - } - - return codes; - } - - public Set getCities(String s1, int windowSize) { - return getKeywords(s1, cityMap, windowSize); - } - - public static String readFromClasspath(final String filename, final Class clazz) { - final StringWriter sw = new StringWriter(); - try { - IOUtils.copy(clazz.getResourceAsStream(filename), sw, StandardCharsets.UTF_8); - return sw.toString(); - } catch (final IOException e) { - throw new RuntimeException("cannot load resource from classpath: " + filename); - } - } + // city map to be used when translating the city names into codes + private static Map cityMap = AbstractPaceFunctions + .loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); + + // list of stopwords in different languages + protected static Set stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt"); + protected static Set stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); + protected static Set stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt"); + protected static Set stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt"); + protected static Set stopwords_fr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_fr.txt"); + protected static Set stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt"); + protected static Set stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt"); + + // transliterator + protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng"); + + // blacklist of ngrams: to avoid generic keys + protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt"); + + // html regex for normalization + public final String HTML_REGEX = "<[^>]*>"; + + private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "; + private static final String aliases_from = 
"⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń"; + private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn"; + + // doi prefix for normalization + public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; + + private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?"); + + private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})"); + + protected final static FieldList EMPTY_FIELD = new FieldListImpl(); + + protected String concat(final List l) { + return Joiner.on(" ").skipNulls().join(l); + } + + protected String cleanup(final String s) { + + final String s1 = s.replaceAll(HTML_REGEX, ""); + final String s2 = unicodeNormalization(s1.toLowerCase()); + final String s3 = nfd(s2); + final String s4 = fixXML(s3); + final String s5 = s4.replaceAll("([0-9]+)", " $1 "); + final String s6 = transliterate(s5); + final String s7 = fixAliases(s6); + final String s8 = s7.replaceAll("[^\\p{ASCII}]", ""); + final String s9 = s8.replaceAll("[\\p{Punct}]", " "); + final String s10 = s9.replaceAll("\\n", " "); + final String s11 = s10.replaceAll("(?m)\\s+", " "); + final String s12 = s11.trim(); + return s12; + } + + protected String fixXML(final String a) { + + return a + .replaceAll("&ndash;", " ") + .replaceAll("&amp;", " ") + .replaceAll("&quot;", " ") + .replaceAll("&minus;", " "); + } + + protected boolean checkNumbers(final String a, final String b) { + final String numbersA = getNumbers(a); + final String numbersB = getNumbers(b); + final String romansA = getRomans(a); + final String romansB = getRomans(b); + return !numbersA.equals(numbersB) || !romansA.equals(romansB); + } + + protected String getRomans(final String s) { + final StringBuilder sb = new StringBuilder(); + for (final String t : s.split(" ")) { + sb.append(isRoman(t) ? 
t : ""); + } + return sb.toString(); + } + + protected boolean isRoman(final String s) { + return s + .replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop") + .equals("qwertyuiop"); + } + + protected String getNumbers(final String s) { + final StringBuilder sb = new StringBuilder(); + for (final String t : s.split(" ")) { + sb.append(isNumber(t) ? t : ""); + } + return sb.toString(); + } + + public boolean isNumber(String strNum) { + if (strNum == null) { + return false; + } + return numberPattern.matcher(strNum).matches(); + } + + protected static String fixAliases(final String s) { + final StringBuilder sb = new StringBuilder(); + + s.chars().forEach(ch -> { + final int i = StringUtils.indexOf(aliases_from, ch); + sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch); + }); + + return sb.toString(); + } + + protected static String transliterate(final String s) { + try { + return transliterator.transliterate(s); + } catch (Exception e) { + return s; + } + } + + protected String removeSymbols(final String s) { + final StringBuilder sb = new StringBuilder(); + + s.chars().forEach(ch -> { + sb.append(StringUtils.contains(alpha, ch) ? (char) ch : ' '); + }); + + return sb.toString().replaceAll("\\s+", " "); + } + + protected String getFirstValue(final Field values) { + return (values != null) && !Iterables.isEmpty(values) ? 
Iterables.getFirst(values, EMPTY_FIELD).stringValue() + : ""; + } + + protected boolean notNull(final String s) { + return s != null; + } + + protected String normalize(final String s) { + return fixAliases(transliterate(nfd(unicodeNormalization(s)))) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input + // strings + .replaceAll("[^ \\w]+", "") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", "") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } + + public String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } + + public String utf8(final String s) { + byte[] bytes = s.getBytes(StandardCharsets.UTF_8); + return new String(bytes, StandardCharsets.UTF_8); + } + + public String unicodeNormalization(final String s) { + + Matcher m = hexUnicodePattern.matcher(s); + StringBuffer buf = new StringBuffer(s.length()); + while (m.find()) { + String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16)); + m.appendReplacement(buf, Matcher.quoteReplacement(ch)); + } + m.appendTail(buf); + return buf.toString(); + } + + protected String filterStopWords(final String s, final Set stopwords) { + final StringTokenizer st = new StringTokenizer(s); + final StringBuilder sb = new StringBuilder(); + while (st.hasMoreTokens()) { + final String token = st.nextToken(); + if (!stopwords.contains(token)) { + sb.append(token); + sb.append(" "); + } + } + return sb.toString().trim(); + } + + public String filterAllStopWords(String s) { + + s = filterStopWords(s, stopwords_en); + s = filterStopWords(s, stopwords_de); + s = filterStopWords(s, stopwords_it); + s = filterStopWords(s, stopwords_fr); + s = filterStopWords(s, stopwords_pt); + s = filterStopWords(s, stopwords_es); + s = filterStopWords(s, stopwords_gr); + + return s; + } + + protected Collection filterBlacklisted(final Collection set, final Set ngramBlacklist) 
{ + final Set newset = Sets.newLinkedHashSet(); + for (final String s : set) { + if (!ngramBlacklist.contains(s)) { + newset.add(s); + } + } + return newset; + } + + public static Set loadFromClasspath(final String classpath) { + + Transliterator transliterator = Transliterator.getInstance("Any-Eng"); + + final Set h = Sets.newHashSet(); + try { + for (final String s : IOUtils + .readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) { + h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords + } + } catch (final Throwable e) { + return Sets.newHashSet(); + } + return h; + } + + public static Map loadMapFromClasspath(final String classpath) { + + Transliterator transliterator = Transliterator.getInstance("Any-Eng"); + + final Map m = new HashMap<>(); + try { + for (final String s : IOUtils + .readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) { + // string is like this: code;word1;word2;word3 + String[] line = s.split(";"); + String value = line[0]; + for (int i = 1; i < line.length; i++) { + m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value); + } + } + } catch (final Throwable e) { + return new HashMap<>(); + } + return m; + } + + public String removeKeywords(String s, Set keywords) { + + s = " " + s + " "; + for (String k : keywords) { + s = s.replaceAll(k.toLowerCase(), ""); + } + + return s.trim(); + } + + public double commonElementsPercentage(Set s1, Set s2) { + + double longer = Math.max(s1.size(), s2.size()); + return (double) s1.stream().filter(s2::contains).count() / longer; + } + + // convert the set of keywords to codes + public Set toCodes(Set keywords, Map translationMap) { + return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet()); + } + + public Set keywordsToCodes(Set keywords, Map translationMap) { + return toCodes(keywords, translationMap); + } + + public Set citiesToCodes(Set keywords) { 
+ return toCodes(keywords, cityMap); + } + + protected String firstLC(final String s) { + return StringUtils.substring(s, 0, 1).toLowerCase(); + } + + protected Iterable tokens(final String s, final int maxTokens) { + return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens); + } + + public String normalizePid(String pid) { + return pid.toLowerCase().replaceAll(DOI_PREFIX, ""); + } + + // get the list of keywords into the input string + public Set getKeywords(String s1, Map translationMap, int windowSize) { + + String s = s1; + + List tokens = Arrays.asList(s.toLowerCase().split(" ")); + + Set codes = new HashSet<>(); + + if (tokens.size() < windowSize) + windowSize = tokens.size(); + + int length = windowSize; + + while (length != 0) { + + for (int i = 0; i <= tokens.size() - length; i++) { + String candidate = concat(tokens.subList(i, i + length)); + if (translationMap.containsKey(candidate)) { + codes.add(candidate); + s = s.replace(candidate, "").trim(); + } + } + + tokens = Arrays.asList(s.split(" ")); + length -= 1; + } + + return codes; + } + + public Set getCities(String s1, int windowSize) { + return getKeywords(s1, cityMap, windowSize); + } + + public static String readFromClasspath(final String filename, final Class clazz) { + final StringWriter sw = new StringWriter(); + try { + IOUtils.copy(clazz.getResourceAsStream(filename), sw, StandardCharsets.UTF_8); + return sw.toString(); + } catch (final IOException e) { + throw new RuntimeException("cannot load resource from classpath: " + filename); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java index 6b44f4ebd..0623b468f 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.config; import java.util.List; @@ -50,7 +51,6 @@ public interface 
Config { */ public Map> blacklists(); - /** * Translation map. * diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index a377b087f..ee24ff476 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -1,16 +1,5 @@ -package eu.dnetlib.pace.config; -import com.fasterxml.jackson.annotation.JsonIgnore; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Maps; -import eu.dnetlib.pace.model.ClusteringDef; -import eu.dnetlib.pace.model.FieldDef; -import eu.dnetlib.pace.util.PaceException; -import org.antlr.stringtemplate.StringTemplate; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +package eu.dnetlib.pace.config; import java.io.IOException; import java.io.Serializable; @@ -22,9 +11,20 @@ import java.util.Map.Entry; import java.util.regex.Pattern; import java.util.stream.Collectors; +import org.antlr.stringtemplate.StringTemplate; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Maps; + +import eu.dnetlib.pace.model.ClusteringDef; +import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.tree.support.TreeNodeDef; - +import eu.dnetlib.pace.util.PaceException; public class DedupConfig implements Config, Serializable { @@ -56,7 +56,8 @@ public class DedupConfig implements Config, Serializable { defaults.put("idPath", "$.id"); } - public DedupConfig() {} + public DedupConfig() { + } public static DedupConfig load(final String json) { @@ -66,10 +67,21 @@ 
public class DedupConfig implements Config, Serializable { config.getPace().initModel(); config.getPace().initTranslationMap(); - config.blacklists = config.getPace().getBlacklists().entrySet() - .stream() - .collect(Collectors.toMap(e -> e.getKey(), - e ->e.getValue().stream().filter(s -> !StringUtils.isBlank(s)).map(Pattern::compile).collect(Collectors.toList()) )); + config.blacklists = config + .getPace() + .getBlacklists() + .entrySet() + .stream() + .collect( + Collectors + .toMap( + e -> e.getKey(), + e -> e + .getValue() + .stream() + .filter(s -> !StringUtils.isBlank(s)) + .map(Pattern::compile) + .collect(Collectors.toList()))); return config; } catch (IOException e) { @@ -131,7 +143,7 @@ public class DedupConfig implements Config, Serializable { } @Override - public Map decisionTree(){ + public Map decisionTree() { return getPace().getDecisionTree(); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java index 5ae2edfb0..b4afad9c8 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -1,19 +1,20 @@ + package eu.dnetlib.pace.config; +import java.io.Serializable; +import java.util.List; +import java.util.Map; import com.fasterxml.jackson.annotation.JsonIgnore; import com.google.common.collect.Maps; import com.ibm.icu.text.Transliterator; + import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.tree.support.TreeNodeDef; import eu.dnetlib.pace.util.PaceResolver; -import java.io.Serializable; -import java.util.List; -import java.util.Map; - public class PaceConfig extends AbstractPaceFunctions implements Serializable { private List model; @@ -33,7 +34,8 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable { @JsonIgnore public static 
PaceResolver resolver = new PaceResolver(); - public PaceConfig() {} + public PaceConfig() { + } public void initModel() { modelMap = Maps.newHashMap(); @@ -42,20 +44,21 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable { } } - public void initTranslationMap(){ + public void initTranslationMap() { translationMap = Maps.newHashMap(); Transliterator transliterator = Transliterator.getInstance("Any-Eng"); for (String key : synonyms.keySet()) { - for (String term : synonyms.get(key)){ - translationMap.put( + for (String term : synonyms.get(key)) { + translationMap + .put( fixAliases(transliterator.transliterate(term.toLowerCase())), - key); + key); } } } - public Map translationMap(){ + public Map translationMap() { return translationMap; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java index 20981c427..9f3323edc 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.config; public enum Type { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java index 78fc18a13..8dea04232 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java @@ -1,10 +1,5 @@ -package eu.dnetlib.pace.config; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.util.PaceException; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.config; import java.io.IOException; import java.io.Serializable; @@ -12,6 +7,13 @@ import java.util.HashSet; import java.util.List; import java.util.Set; +import org.apache.commons.lang3.StringUtils; + +import 
com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.util.PaceException; public class WfConfig implements Serializable { @@ -76,7 +78,6 @@ public class WfConfig implements Serializable { /** Maximum number of allowed children. */ private int maxChildren = MAX_CHILDREN; - /** Default maximum number of iterations. */ private final static int MAX_ITERATIONS = 20; @@ -84,9 +85,10 @@ public class WfConfig implements Serializable { private int maxIterations = MAX_ITERATIONS; /** The Jquery path to retrieve the identifier */ - private String idPath = "$.id"; + private String idPath = "$.id"; - public WfConfig() {} + public WfConfig() { + } /** * Instantiates a new dedup config. @@ -114,8 +116,10 @@ public class WfConfig implements Serializable { * @param idPath * the path for the id of the entity */ - public WfConfig(final String entityType, final String orderField, final List rootBuilder, final String dedupRun, - final Set skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations, final String idPath) { + public WfConfig(final String entityType, final String orderField, final List rootBuilder, + final String dedupRun, + final Set skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, + final boolean includeChildren, final int maxIterations, final String idPath) { super(); this.entityType = entityType; this.orderField = orderField; @@ -257,7 +261,6 @@ public class WfConfig implements Serializable { this.maxChildren = maxChildren; } - public int getMaxIterations() { return maxIterations; } @@ -277,7 +280,6 @@ public class WfConfig implements Serializable { /* * (non-Javadoc) - * * @see java.lang.Object#toString() */ @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java 
b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java index b418b75bb..c11d461ab 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.model; import eu.dnetlib.pace.config.Type; @@ -16,7 +17,8 @@ public abstract class AbstractField implements Field { /** * Instantiates a new abstract field. */ - protected AbstractField() {} + protected AbstractField() { + } /** * Instantiates a new abstract field. @@ -33,7 +35,6 @@ public abstract class AbstractField implements Field { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.model.Field#getName() */ @Override @@ -43,7 +44,6 @@ public abstract class AbstractField implements Field { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.model.Field#getType() */ @Override @@ -53,7 +53,6 @@ public abstract class AbstractField implements Field { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.model.Field#setName(java.lang.String) */ @Override @@ -63,7 +62,6 @@ public abstract class AbstractField implements Field { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.model.Field#setType(eu.dnetlib.pace.config.Type) */ @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java index c15885ecf..d9ad81d42 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java @@ -1,15 +1,16 @@ -package eu.dnetlib.pace.model; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.pace.clustering.ClusteringFunction; -import eu.dnetlib.pace.config.PaceConfig; -import eu.dnetlib.pace.util.PaceException; +package eu.dnetlib.pace.model; import java.io.IOException; import java.io.Serializable; import java.util.List; import java.util.Map; +import com.fasterxml.jackson.databind.ObjectMapper; + 
+import eu.dnetlib.pace.clustering.ClusteringFunction; +import eu.dnetlib.pace.config.PaceConfig; +import eu.dnetlib.pace.util.PaceException; public class ClusteringDef implements Serializable { @@ -19,7 +20,8 @@ public class ClusteringDef implements Serializable { private Map params; - public ClusteringDef() {} + public ClusteringDef() { + } public String getName() { return name; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java index fcacadd6f..d9c06d4e4 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.model; import java.util.Set; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java index 4b7a73e31..d5712cf2f 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java @@ -1,9 +1,10 @@ + package eu.dnetlib.pace.model; -import eu.dnetlib.pace.config.Type; - import java.io.Serializable; +import eu.dnetlib.pace.config.Type; + /** * The Interface Field. 
*/ diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java index f7831edaa..8b123f2d5 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java @@ -1,13 +1,15 @@ + package eu.dnetlib.pace.model; +import java.io.Serializable; +import java.util.List; + import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Splitter; import com.google.common.collect.Lists; -import eu.dnetlib.pace.config.Type; -import java.io.Serializable; -import java.util.List; +import eu.dnetlib.pace.config.Type; /** * The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated compare algorithm. @@ -34,19 +36,20 @@ public class FieldDef implements Serializable { */ private int length = -1; - public FieldDef() {} + public FieldDef() { + } // def apply(s: String): Field[A] public Field apply(final Type type, final String s) { switch (type) { - case Int: - return new FieldValueImpl(type, name, Integer.parseInt(s)); - case String: - return new FieldValueImpl(type, name, s); - case List: - return new FieldListImpl(name, type); - default: - throw new IllegalArgumentException("Casting not implemented for type " + type); + case Int: + return new FieldValueImpl(type, name, Integer.parseInt(s)); + case String: + return new FieldValueImpl(type, name, s); + case List: + return new FieldListImpl(name, type); + default: + throw new IllegalArgumentException("Casting not implemented for type " + type); } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java index b1f5422b7..b47795d8b 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java +++ 
b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.model; import java.util.List; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java index 635178b83..ca23a0bfc 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java @@ -1,17 +1,19 @@ + package eu.dnetlib.pace.model; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.ListIterator; + import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.base.Function; import com.google.common.base.Joiner; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; -import eu.dnetlib.pace.config.Type; -import java.util.Collection; -import java.util.Iterator; -import java.util.List; -import java.util.ListIterator; +import eu.dnetlib.pace.config.Type; /** * The Class FieldListImpl. 
@@ -41,7 +43,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#add(java.lang.Object) */ @Override @@ -51,7 +52,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#add(int, java.lang.Object) */ @Override @@ -61,7 +61,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#addAll(java.util.Collection) */ @Override @@ -71,7 +70,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#addAll(int, java.util.Collection) */ @Override @@ -81,7 +79,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#clear() */ @Override @@ -91,7 +88,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#contains(java.lang.Object) */ @Override @@ -101,7 +97,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#containsAll(java.util.Collection) */ @Override @@ -111,7 +106,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#get(int) */ @Override @@ -121,7 +115,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#indexOf(java.lang.Object) */ @Override @@ -131,7 +124,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.model.Field#isEmpty() */ @Override @@ -141,7 +133,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.lang.Iterable#iterator() */ @Override @@ -151,7 +142,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * 
(non-Javadoc) - * * @see java.util.List#lastIndexOf(java.lang.Object) */ @Override @@ -161,7 +151,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#listIterator() */ @Override @@ -171,7 +160,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#listIterator(int) */ @Override @@ -181,7 +169,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#remove(java.lang.Object) */ @Override @@ -191,7 +178,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#remove(int) */ @Override @@ -201,7 +187,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#removeAll(java.util.Collection) */ @Override @@ -211,7 +196,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#retainAll(java.util.Collection) */ @Override @@ -221,7 +205,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#set(int, java.lang.Object) */ @Override @@ -231,7 +214,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#size() */ @Override @@ -241,7 +223,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#subList(int, int) */ @Override @@ -251,7 +232,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#toArray() */ @Override @@ -261,7 +241,6 @@ public class FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see java.util.List#toArray(java.lang.Object[]) */ @Override @@ -271,33 +250,31 @@ public class 
FieldListImpl extends AbstractField implements FieldList { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.model.Field#stringValue() */ @Override public String stringValue() { switch (getType()) { - case List: - case Int: - case String: - return Joiner.on(" ").join(stringList()); - case JSON: - String json; - try { - json = new ObjectMapper().writeValueAsString(this); - } catch (JsonProcessingException e) { - json = null; - } - return json; - default: - throw new IllegalArgumentException("Unknown type: " + getType().toString()); + case List: + case Int: + case String: + return Joiner.on(" ").join(stringList()); + case JSON: + String json; + try { + json = new ObjectMapper().writeValueAsString(this); + } catch (JsonProcessingException e) { + json = null; + } + return json; + default: + throw new IllegalArgumentException("Unknown type: " + getType().toString()); } } /* * (non-Javadoc) - * * @see eu.dnetlib.pace.model.FieldList#stringList() */ @Override @@ -317,10 +294,10 @@ public class FieldListImpl extends AbstractField implements FieldList { @Override public double[] doubleArray() { - return Lists.newArrayList(Iterables.transform(fields, getDouble())).stream().mapToDouble(d-> d).toArray(); + return Lists.newArrayList(Iterables.transform(fields, getDouble())).stream().mapToDouble(d -> d).toArray(); } - private Function getDouble() { + private Function getDouble() { return new Function() { @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java index ebe474363..b20f21a5c 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.model; /** diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java index a235315d9..eff54abfb 100644 --- 
a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.model; import java.net.MalformedURLException; @@ -6,9 +7,10 @@ import java.util.Collections; import java.util.Iterator; import java.util.List; -import eu.dnetlib.pace.config.Type; import org.apache.commons.lang3.StringUtils; +import eu.dnetlib.pace.config.Type; + /** * The Class FieldValueImpl. */ @@ -20,7 +22,8 @@ public class FieldValueImpl extends AbstractField implements FieldValue { /** * Instantiates a new field value impl. */ - public FieldValueImpl() {} + public FieldValueImpl() { + } /** * Instantiates a new field value impl. @@ -39,17 +42,17 @@ public class FieldValueImpl extends AbstractField implements FieldValue { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.model.Field#isEmpty() */ @Override public boolean isEmpty() { - if (value == null) return false; + if (value == null) + return false; switch (type) { case String: case JSON: - return value.toString().isEmpty(); + return value.toString().isEmpty(); case List: try { List list = (List) value; @@ -61,9 +64,9 @@ public class FieldValueImpl extends AbstractField implements FieldValue { String str = value.toString(); return StringUtils.isBlank(str) || !isValidURL(str); case DoubleArray: - return doubleArrayValue().length==0; - default: - return true; + return doubleArrayValue().length == 0; + default: + return true; } } @@ -78,7 +81,6 @@ public class FieldValueImpl extends AbstractField implements FieldValue { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.model.FieldValue#getValue() */ @Override @@ -88,7 +90,6 @@ public class FieldValueImpl extends AbstractField implements FieldValue { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.model.FieldValue#setValue(java.lang.Object) */ @Override @@ -98,7 +99,6 @@ public class FieldValueImpl extends AbstractField implements FieldValue { /* * (non-Javadoc) - * * @see 
eu.dnetlib.pace.model.Field#stringValue() */ @Override @@ -119,12 +119,11 @@ public class FieldValueImpl extends AbstractField implements FieldValue { } public double[] doubleArrayValue() { - return (double[])getValue(); + return (double[]) getValue(); } /* * (non-Javadoc) - * * @see java.lang.Iterable#iterator() */ @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java index 77b7c120e..c2860ca3b 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.model; import java.io.Serializable; @@ -57,7 +58,6 @@ public class MapDocument implements Document, Serializable { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.model.document.Document#fields() */ @Override @@ -67,7 +67,6 @@ public class MapDocument implements Document, Serializable { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.model.document.Document#values(java.lang.String) */ @Override @@ -77,7 +76,6 @@ public class MapDocument implements Document, Serializable { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.model.document.Document#fieldNames() */ @Override @@ -87,7 +85,6 @@ public class MapDocument implements Document, Serializable { /* * (non-Javadoc) - * * @see java.lang.Object#toString() */ @Override @@ -107,7 +104,6 @@ public class MapDocument implements Document, Serializable { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.model.document.Document#getIdentifier() */ @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java index 7217b2b59..a77dcbc0c 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java @@ -1,3 +1,4 @@ + package 
eu.dnetlib.pace.model; import java.util.Comparator; @@ -28,18 +29,19 @@ public class MapDocumentComparator implements Comparator { /* * (non-Javadoc) - * * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) */ @Override public int compare(final Document d1, final Document d2) { - if (d1.values(comparatorField).isEmpty() || d2.values(comparatorField).isEmpty()) return 0; + if (d1.values(comparatorField).isEmpty() || d2.values(comparatorField).isEmpty()) + return 0; final String o1 = Iterables.getFirst(d1.values(comparatorField), emptyField).stringValue(); final String o2 = Iterables.getFirst(d2.values(comparatorField), emptyField).stringValue(); - if ((o1 == null) || (o2 == null)) return 0; + if ((o1 == null) || (o2 == null)) + return 0; final String to1 = NGramUtils.cleanupForOrdering(o1); final String to2 = NGramUtils.cleanupForOrdering(o2); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java index e5b3522df..d71f780ad 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.model; import java.lang.reflect.Type; @@ -33,7 +34,8 @@ public class MapDocumentSerializer implements InstanceCreator { gson.registerTypeAdapter(Field.class, new JsonDeserializer() { @Override - public Field deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException { + public Field deserialize(final JsonElement json, final Type typeOfT, + final JsonDeserializationContext context) throws JsonParseException { final FieldListImpl fl = new FieldListImpl(); if (json.isJsonObject()) { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java index 
543b1bdfe..96120cf4d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.model; import java.nio.charset.Charset; @@ -43,7 +44,7 @@ public class Person { // s = s.replaceAll("[\\W&&[^,-]]", ""); } - if (s.contains(",")) { //if the name contains a comma it is easy derivable the name and the surname + if (s.contains(",")) { // if the name contains a comma it is easy derivable the name and the surname final String[] arr = s.split(","); if (arr.length == 1) { fullname = splitTerms(arr[0]); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java index a900a6082..1f8aab4bf 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.model; import java.util.ArrayList; @@ -57,7 +58,7 @@ public class PersonComparatorUtils { private static boolean verifyNames(List list1, List list2) { return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2)) - && verifySimilarity(extractInitials(list1), extractInitials(list2)); + && verifySimilarity(extractInitials(list1), extractInitials(list2)); } private static boolean verifySurnames(List list1, List list2) { @@ -76,7 +77,7 @@ public class PersonComparatorUtils { Collections.sort(list1); Collections.sort(list2); return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2)) - && verifySimilarity(extractInitials(list1), extractInitials(list2)); + && verifySimilarity(extractInitials(list1), extractInitials(list2)); } private static List extractExtendedNames(List list) { @@ -107,7 +108,7 @@ public class PersonComparatorUtils { for (String s : list1) { int curr = list2.indexOf(s); if (curr > pos) { - 
list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm" + list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm" pos = curr; } else { return false; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java index 2fb8eb97c..f53655a8e 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AlwaysMatch.java @@ -1,42 +1,43 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("alwaysMatch") public class AlwaysMatch extends AbstractComparator { - public AlwaysMatch(final Map params){ - super(params, new com.wcohen.ss.JaroWinkler()); - } + public AlwaysMatch(final Map params) { + super(params, new com.wcohen.ss.JaroWinkler()); + } - public AlwaysMatch(final double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } + public AlwaysMatch(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } - protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } + protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } - @Override - public double compare(final Field a, final Field b, final Config conf) { - return 1.0; - } + @Override + public double compare(final Field a, final Field b, final Config conf) { + return 1.0; + } - @Override - public double getWeight() { - return super.weight; - } + @Override + public double getWeight() { + return super.weight; + } - @Override - protected double normalize(final double d) { - return d; - } + @Override + protected double 
normalize(final double d) { + return d; + } } - diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java index 33f86d85d..047e121e3 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java @@ -1,13 +1,5 @@ -package eu.dnetlib.pace.tree; -import com.google.common.collect.Iterables; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; -import eu.dnetlib.pace.model.Person; -import eu.dnetlib.pace.tree.support.AbstractComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; -import com.wcohen.ss.AbstractStringDistance; +package eu.dnetlib.pace.tree; import java.util.Comparator; import java.util.List; @@ -16,139 +8,164 @@ import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; +import com.google.common.collect.Iterables; +import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.model.Person; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("authorsMatch") public class AuthorsMatch extends AbstractComparator { - Map params; + Map params; - private double SURNAME_THRESHOLD; - private double NAME_THRESHOLD; - private double FULLNAME_THRESHOLD; - private String MODE; //full or surname - private int SIZE_THRESHOLD; - private String TYPE; //count or percentage - private int common; + private double SURNAME_THRESHOLD; + private double NAME_THRESHOLD; + private double FULLNAME_THRESHOLD; + private String MODE; // full or surname + private int SIZE_THRESHOLD; + private String TYPE; // count or percentage + private int common; - public AuthorsMatch(Map params){ - 
super(params, new com.wcohen.ss.JaroWinkler()); - this.params = params; + public AuthorsMatch(Map params) { + super(params, new com.wcohen.ss.JaroWinkler()); + this.params = params; - MODE = params.getOrDefault("mode", "full"); - SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95")); - NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95")); - FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9")); - SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20")); - TYPE = params.getOrDefault("type", "percentage"); - common = 0; - } + MODE = params.getOrDefault("mode", "full"); + SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95")); + NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95")); + FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9")); + SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20")); + TYPE = params.getOrDefault("type", "percentage"); + common = 0; + } - protected AuthorsMatch(double w, AbstractStringDistance ssalgo) { - super(w, ssalgo); - } + protected AuthorsMatch(double w, AbstractStringDistance ssalgo) { + super(w, ssalgo); + } - @Override - public double compare(final Field a, final Field b, final Config conf) { + @Override + public double compare(final Field a, final Field b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) - return -1; + if (a.isEmpty() || b.isEmpty()) + return -1; - if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD) - return 1.0; + if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD) + return 1.0; - List aList = ((FieldList) a).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList()); - List bList = ((FieldList) b).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList()); + List aList = ((FieldList) 
a) + .stringList() + .stream() + .map(author -> new Person(author, false)) + .collect(Collectors.toList()); + List bList = ((FieldList) b) + .stringList() + .stream() + .map(author -> new Person(author, false)) + .collect(Collectors.toList()); - common = 0; - //compare each element of List1 with each element of List2 - for (Person p1 : aList) + common = 0; + // compare each element of List1 with each element of List2 + for (Person p1 : aList) - for (Person p2 : bList) { + for (Person p2 : bList) { - //both persons are inaccurate - if (!p1.isAccurate() && !p2.isAccurate()) { - //compare just normalized fullnames - String fullname1 = normalization(p1.getNormalisedFullname().isEmpty()? p1.getOriginal() : p1.getNormalisedFullname()); - String fullname2 = normalization(p2.getNormalisedFullname().isEmpty()? p2.getOriginal() : p2.getNormalisedFullname()); + // both persons are inaccurate + if (!p1.isAccurate() && !p2.isAccurate()) { + // compare just normalized fullnames + String fullname1 = normalization( + p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname()); + String fullname2 = normalization( + p2.getNormalisedFullname().isEmpty() ? p2.getOriginal() : p2.getNormalisedFullname()); - if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) { - common += 1; - break; - } - } + if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) { + common += 1; + break; + } + } - //one person is inaccurate - if (p1.isAccurate() ^ p2.isAccurate()) { - //prepare data - //data for the accurate person - String name = normalization(p1.isAccurate()? p1.getNormalisedFirstName() : p2.getNormalisedFirstName()); - String surname = normalization(p1.isAccurate()? p1.getNormalisedSurname() : p2.getNormalisedSurname()); + // one person is inaccurate + if (p1.isAccurate() ^ p2.isAccurate()) { + // prepare data + // data for the accurate person + String name = normalization( + p1.isAccurate() ? 
p1.getNormalisedFirstName() : p2.getNormalisedFirstName()); + String surname = normalization( + p1.isAccurate() ? p1.getNormalisedSurname() : p2.getNormalisedSurname()); - //data for the inaccurate person - String fullname = normalization( - p1.isAccurate() ? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname()) : (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname()) - ); + // data for the inaccurate person + String fullname = normalization( + p1.isAccurate() + ? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname()) + : (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname())); - if (fullname.contains(surname)) { - if (MODE.equals("full")) { - if (fullname.contains(name)) { - common += 1; - break; - } - } - else { //MODE equals "surname" - common += 1; - break; - } - } - } + if (fullname.contains(surname)) { + if (MODE.equals("full")) { + if (fullname.contains(name)) { + common += 1; + break; + } + } else { // MODE equals "surname" + common += 1; + break; + } + } + } - //both persons are accurate - if (p1.isAccurate() && p2.isAccurate()) { + // both persons are accurate + if (p1.isAccurate() && p2.isAccurate()) { - if (compareSurname(p1, p2)) { - if (MODE.equals("full")) { - if(compareFirstname(p1, p2)) { - common += 1; - break; - } - } - else { //MODE equals "surname" - common += 1; - break; - } - } + if (compareSurname(p1, p2)) { + if (MODE.equals("full")) { + if (compareFirstname(p1, p2)) { + common += 1; + break; + } + } else { // MODE equals "surname" + common += 1; + break; + } + } - } + } - } + } - //normalization factor to compute the score - int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common); + // normalization factor to compute the score + int normFactor = aList.size() == bList.size() ? 
aList.size() : (aList.size() + bList.size() - common); - if(TYPE.equals("percentage")) { - return (double) common / normFactor; - } - else { - return (double) common; - } - } + if (TYPE.equals("percentage")) { + return (double) common / normFactor; + } else { + return (double) common; + } + } - public boolean compareSurname(Person p1, Person p2) { - return ssalgo.score(normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD; - } + public boolean compareSurname(Person p1, Person p2) { + return ssalgo + .score( + normalization(p1.getNormalisedSurname()), normalization(p2.getNormalisedSurname())) > SURNAME_THRESHOLD; + } - public boolean compareFirstname(Person p1, Person p2) { + public boolean compareFirstname(Person p1, Person p2) { - if(p1.getNormalisedFirstName().length()<=2 || p2.getNormalisedFirstName().length()<=2) { - if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName()))) - return true; - } + if (p1.getNormalisedFirstName().length() <= 2 || p2.getNormalisedFirstName().length() <= 2) { + if (firstLC(p1.getNormalisedFirstName()).equals(firstLC(p2.getNormalisedFirstName()))) + return true; + } - return ssalgo.score(normalization(p1.getNormalisedFirstName()), normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD; - } + return ssalgo + .score( + normalization(p1.getNormalisedFirstName()), + normalization(p2.getNormalisedFirstName())) > NAME_THRESHOLD; + } - public String normalization(String s) { - return normalize(utf8(cleanup(s))); - } + public String normalization(String s) { + return normalize(utf8(cleanup(s))); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java index 8ff818e07..f3da29e8e 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CityMatch.java @@ -1,47 +1,48 @@ + package 
eu.dnetlib.pace.tree; +import java.util.Map; +import java.util.Set; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; -import java.util.Set; - @ComparatorClass("cityMatch") public class CityMatch extends AbstractComparator { - private Map params; + private Map params; - public CityMatch(Map params) { - super(params); - this.params = params; - } + public CityMatch(Map params) { + super(params); + this.params = params; + } - @Override - public double distance(final String a, final String b, final Config conf) { + @Override + public double distance(final String a, final String b, final Config conf) { - String ca = cleanup(a); - String cb = cleanup(b); + String ca = cleanup(a); + String cb = cleanup(b); - ca = normalize(ca); - cb = normalize(cb); + ca = normalize(ca); + cb = normalize(cb); - ca = filterAllStopWords(ca); - cb = filterAllStopWords(cb); + ca = filterAllStopWords(ca); + cb = filterAllStopWords(cb); - Set cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4"))); - Set cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4"))); - Set codes1 = citiesToCodes(cities1); - Set codes2 = citiesToCodes(cities2); + Set codes1 = citiesToCodes(cities1); + Set codes2 = citiesToCodes(cities2); - //if no cities are detected, the comparator gives 1.0 - if (codes1.isEmpty() && codes2.isEmpty()) - return 1.0; - else { - if (codes1.isEmpty() ^ codes2.isEmpty()) - return -1; //undefined if one of the two has no cities - return commonElementsPercentage(codes1, codes2); - } - } + // if no cities are detected, the comparator gives 1.0 + if (codes1.isEmpty() && codes2.isEmpty()) + return 1.0; + else { + if (codes1.isEmpty() ^ 
codes2.isEmpty()) + return -1; // undefined if one of the two has no cities + return commonElementsPercentage(codes1, codes2); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java index 5d441771e..82d84794f 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CosineSimilarity.java @@ -1,5 +1,11 @@ + package eu.dnetlib.pace.tree; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; @@ -8,46 +14,40 @@ import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - @ComparatorClass("cosineSimilarity") public class CosineSimilarity extends AbstractComparator { - Map params; + Map params; - public CosineSimilarity(Map params) { - super(params); - } + public CosineSimilarity(Map params) { + super(params); + } - @Override - public double compare(final Field a, final Field b, final Config conf) { + @Override + public double compare(final Field a, final Field b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) - return -1; + if (a.isEmpty() || b.isEmpty()) + return -1; - double[] aVector = ((FieldValueImpl) a).doubleArrayValue(); - double[] bVector = ((FieldValueImpl) b).doubleArrayValue(); + double[] aVector = ((FieldValueImpl) a).doubleArrayValue(); + double[] bVector = ((FieldValueImpl) b).doubleArrayValue(); - return cosineSimilarity(aVector, bVector); - } + return cosineSimilarity(aVector, bVector); + } - double cosineSimilarity(double[] a, double[] b) { - double dotProduct = 0; - double normASum = 0; - 
double normBSum = 0; + double cosineSimilarity(double[] a, double[] b) { + double dotProduct = 0; + double normASum = 0; + double normBSum = 0; - for(int i = 0; i < a.length; i ++) { - dotProduct += a[i] * b[i]; - normASum += a[i] * a[i]; - normBSum += b[i] * b[i]; - } - - double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum); - return dotProduct / eucledianDist; - } + for (int i = 0; i < a.length; i++) { + dotProduct += a[i] * b[i]; + normASum += a[i] * a[i]; + normBSum += b[i] * b[i]; + } + double eucledianDist = Math.sqrt(normASum) * Math.sqrt(normBSum); + return dotProduct / eucledianDist; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java index 24a94c8cd..24f3dfc02 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DoiExactMatch.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.tree; import java.util.Map; @@ -13,15 +14,15 @@ import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("doiExactMatch") public class DoiExactMatch extends ExactMatchIgnoreCase { - public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; + public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; - public DoiExactMatch(final Map params) { - super(params); - } + public DoiExactMatch(final Map params) { + super(params); + } - @Override - protected String getValue(final Field f) { - return super.getValue(f).replaceAll(PREFIX, ""); - } + @Override + protected String getValue(final Field f) { + return super.getValue(f).replaceAll(PREFIX, ""); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java index 958028e8a..efafe6573 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java +++ 
b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DomainExactMatch.java @@ -1,30 +1,31 @@ -package eu.dnetlib.pace.tree; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.tree.support.ComparatorClass; +package eu.dnetlib.pace.tree; import java.net.MalformedURLException; import java.net.URL; import java.util.Map; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("domainExactMatch") public class DomainExactMatch extends ExactMatchIgnoreCase { - public DomainExactMatch(final Map params) { - super(params); - } + public DomainExactMatch(final Map params) { + super(params); + } - @Override - protected String getValue(final Field f) { + @Override + protected String getValue(final Field f) { - try { - return asUrl(super.getValue(f)).getHost(); - } catch (MalformedURLException e) { - return ""; - } - } + try { + return asUrl(super.getValue(f)).getHost(); + } catch (MalformedURLException e) { + return ""; + } + } - private URL asUrl(final String value) throws MalformedURLException { - return new URL(value); - } + private URL asUrl(final String value) throws MalformedURLException { + return new URL(value); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java index 21479cf3f..85ce6744d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatch.java @@ -1,42 +1,44 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("exactMatch") public class ExactMatch extends AbstractComparator { - public ExactMatch(Map params){ - super(params, new com.wcohen.ss.JaroWinkler()); - } + public 
ExactMatch(Map params) { + super(params, new com.wcohen.ss.JaroWinkler()); + } - public ExactMatch(final double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } + public ExactMatch(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } - protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } + protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } - @Override - public double distance(final String a, final String b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) { - return -1.0; //return -1 if a field is missing - } - return a.equals(b) ? 1.0 : 0; - } + @Override + public double distance(final String a, final String b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) { + return -1.0; // return -1 if a field is missing + } + return a.equals(b) ? 1.0 : 0; + } - @Override - public double getWeight() { - return super.weight; - } + @Override + public double getWeight() { + return super.weight; + } - @Override - protected double normalize(final double d) { - return d; - } + @Override + protected double normalize(final double d) { + return d; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java index 89cd2719a..307f02246 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ExactMatchIgnoreCase.java @@ -1,32 +1,33 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("exactMatchIgnoreCase") public class ExactMatchIgnoreCase extends AbstractComparator { - public ExactMatchIgnoreCase(Map 
params) { - super(params); - } + public ExactMatchIgnoreCase(Map params) { + super(params); + } - @Override - public double compare(Field a, Field b, final Config conf) { + @Override + public double compare(Field a, Field b, final Config conf) { - final String fa = getValue(a); - final String fb = getValue(b); + final String fa = getValue(a); + final String fb = getValue(b); - if (fa.isEmpty() || fb.isEmpty()) - return -1; + if (fa.isEmpty() || fb.isEmpty()) + return -1; - return fa.equalsIgnoreCase(fb) ? 1 : 0; - } + return fa.equalsIgnoreCase(fb) ? 1 : 0; + } - protected String getValue(final Field f) { - return getFirstValue(f); - } -} \ No newline at end of file + protected String getValue(final Field f) { + return getFirstValue(f); + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java index 661b17433..bdef1225d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java @@ -1,11 +1,5 @@ -package eu.dnetlib.pace.tree; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; -import eu.dnetlib.pace.tree.support.AbstractComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; +package eu.dnetlib.pace.tree; import java.util.HashMap; import java.util.List; @@ -13,72 +7,79 @@ import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("instanceTypeMatch") public class InstanceTypeMatch extends AbstractComparator { - final Map translationMap = 
new HashMap<>(); + final Map translationMap = new HashMap<>(); - public InstanceTypeMatch(Map params){ - super(params); + public InstanceTypeMatch(Map params) { + super(params); - //jolly types - translationMap.put("Conference object", "*"); - translationMap.put("Other literature type", "*"); - translationMap.put("Unknown", "*"); + // jolly types + translationMap.put("Conference object", "*"); + translationMap.put("Other literature type", "*"); + translationMap.put("Unknown", "*"); - //article types - translationMap.put("Article", "Article"); - translationMap.put("Data Paper", "Article"); - translationMap.put("Software Paper", "Article"); - translationMap.put("Preprint", "Article"); + // article types + translationMap.put("Article", "Article"); + translationMap.put("Data Paper", "Article"); + translationMap.put("Software Paper", "Article"); + translationMap.put("Preprint", "Article"); - //thesis types - translationMap.put("Thesis", "Thesis"); - translationMap.put("Master thesis", "Thesis"); - translationMap.put("Bachelor thesis", "Thesis"); - translationMap.put("Doctoral thesis", "Thesis"); - } + // thesis types + translationMap.put("Thesis", "Thesis"); + translationMap.put("Master thesis", "Thesis"); + translationMap.put("Bachelor thesis", "Thesis"); + translationMap.put("Doctoral thesis", "Thesis"); + } + @Override + public double compare(final Field a, final Field b, final Config conf) { - @Override - public double compare(final Field a, final Field b, final Config conf) { + if (a == null || b == null) { + return -1; + } - if (a == null || b == null) { - return -1; - } + final List sa = ((FieldList) a).stringList(); + final List sb = ((FieldList) b).stringList(); - final List sa = ((FieldList) a).stringList(); - final List sb = ((FieldList) b).stringList(); + if (sa.isEmpty() || sb.isEmpty()) { + return -1; + } - if (sa.isEmpty() || sb.isEmpty()) { - return -1; - } + final Set ca = sa.stream().map(this::translate).collect(Collectors.toSet()); + final Set cb = 
sb.stream().map(this::translate).collect(Collectors.toSet()); - final Set ca = sa.stream().map(this::translate).collect(Collectors.toSet()); - final Set cb = sb.stream().map(this::translate).collect(Collectors.toSet()); + // if at least one is a jolly type, it must produce a match + if (ca.contains("*") || cb.contains("*")) + return 1.0; - //if at least one is a jolly type, it must produce a match - if (ca.contains("*") || cb.contains("*")) - return 1.0; + int incommon = Sets.intersection(ca, cb).size(); - int incommon = Sets.intersection(ca, cb).size(); + // if at least one is in common, it must produce a match + return incommon >= 1 ? 1 : 0; + } - //if at least one is in common, it must produce a match - return incommon >= 1 ? 1 : 0; - } + public String translate(String term) { + return translationMap.getOrDefault(term, term); + } - public String translate(String term){ - return translationMap.getOrDefault(term, term); - } + @Override + public double getWeight() { + return super.weight; + } - @Override - public double getWeight() { - return super.weight; - } - - @Override - protected double normalize(final double d) { - return d; - } + @Override + protected double normalize(final double d) { + return d; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java index 9e214f6a4..7511e5ec9 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinkler.java @@ -1,44 +1,46 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) @ComparatorClass("jaroWinkler") public class 
JaroWinkler extends AbstractComparator { - public JaroWinkler(Map params){ - super(params, new com.wcohen.ss.JaroWinkler()); - } + public JaroWinkler(Map params) { + super(params, new com.wcohen.ss.JaroWinkler()); + } - public JaroWinkler(double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } + public JaroWinkler(double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } - protected JaroWinkler(double weight, AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } + protected JaroWinkler(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } - @Override - public double distance(String a, String b, final Config conf) { - String ca = cleanup(a); - String cb = cleanup(b); + @Override + public double distance(String a, String b, final Config conf) { + String ca = cleanup(a); + String cb = cleanup(b); - return normalize(ssalgo.score(ca, cb)); - } + return normalize(ssalgo.score(ca, cb)); + } - @Override - public double getWeight() { - return super.weight; - } + @Override + public double getWeight() { + return super.weight; + } - @Override - protected double normalize(double d) { - return d; - } + @Override + protected double normalize(double d) { + return d; + } -} \ No newline at end of file +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java index d8e416df9..4f4c68e47 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerNormalizedName.java @@ -1,72 +1,74 @@ + package eu.dnetlib.pace.tree; -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.tree.support.AbstractComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; - -import eu.dnetlib.pace.config.Config; - - import java.util.Map; import java.util.Set; +import com.wcohen.ss.AbstractStringDistance; + +import 
eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("jaroWinklerNormalizedName") public class JaroWinklerNormalizedName extends AbstractComparator { - private Map params; + private Map params; - public JaroWinklerNormalizedName(Map params){ - super(params, new com.wcohen.ss.JaroWinkler()); - this.params = params; - } + public JaroWinklerNormalizedName(Map params) { + super(params, new com.wcohen.ss.JaroWinkler()); + this.params = params; + } - public JaroWinklerNormalizedName(double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } + public JaroWinklerNormalizedName(double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } - protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } + protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } - @Override - public double distance(String a, String b, final Config conf) { - String ca = cleanup(a); - String cb = cleanup(b); + @Override + public double distance(String a, String b, final Config conf) { + String ca = cleanup(a); + String cb = cleanup(b); - ca = normalize(ca); - cb = normalize(cb); + ca = normalize(ca); + cb = normalize(cb); - ca = filterAllStopWords(ca); - cb = filterAllStopWords(cb); + ca = filterAllStopWords(ca); + cb = filterAllStopWords(cb); - Set keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); - Set keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set keywords1 = getKeywords( + ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set keywords2 = getKeywords( + cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); - Set cities1 = getCities(ca, 
Integer.parseInt(params.getOrDefault("windowSize", "4"))); - Set cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4"))); - ca = removeKeywords(ca, keywords1); - ca = removeKeywords(ca, cities1); - cb = removeKeywords(cb, keywords2); - cb = removeKeywords(cb, cities2); + ca = removeKeywords(ca, keywords1); + ca = removeKeywords(ca, cities1); + cb = removeKeywords(cb, keywords2); + cb = removeKeywords(cb, cities2); - ca = ca.replaceAll("[ ]{2,}", " "); - cb = cb.replaceAll("[ ]{2,}", " "); + ca = ca.replaceAll("[ ]{2,}", " "); + cb = cb.replaceAll("[ ]{2,}", " "); - if (ca.isEmpty() && cb.isEmpty()) - return 1.0; - else - return normalize(ssalgo.score(ca,cb)); - } + if (ca.isEmpty() && cb.isEmpty()) + return 1.0; + else + return normalize(ssalgo.score(ca, cb)); + } - @Override - public double getWeight() { - return super.weight; - } + @Override + public double getWeight() { + return super.weight; + } - @Override - protected double normalize(double d) { - return d; - } + @Override + protected double normalize(double d) { + return d; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java index 38ed437de..d97d8d061 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JaroWinklerTitle.java @@ -1,18 +1,19 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.tree.support.AbstractComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; import eu.dnetlib.pace.config.Config; - -import java.util.Map; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import 
eu.dnetlib.pace.tree.support.ComparatorClass; //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) @ComparatorClass("jaroWinklerTitle") public class JaroWinklerTitle extends AbstractComparator { - public JaroWinklerTitle(Map params){ + public JaroWinklerTitle(Map params) { super(params, new com.wcohen.ss.JaroWinkler()); } @@ -23,7 +24,7 @@ public class JaroWinklerTitle extends AbstractComparator { protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) { super(weight, ssalgo); } - + @Override public double distance(String a, String b, final Config conf) { String ca = cleanup(a); @@ -31,7 +32,7 @@ public class JaroWinklerTitle extends AbstractComparator { boolean check = checkNumbers(ca, cb); return check ? 0.5 : normalize(ssalgo.score(ca, cb)); - } + } @Override public double getWeight() { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java index eb831b094..e5c69a852 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java @@ -1,77 +1,81 @@ -package eu.dnetlib.pace.tree; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; -import eu.dnetlib.pace.tree.support.AbstractComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; -import eu.dnetlib.pace.util.MapDocumentUtil; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +package eu.dnetlib.pace.tree; import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; 
+import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; +import eu.dnetlib.pace.util.MapDocumentUtil; + @ComparatorClass("jsonListMatch") public class JsonListMatch extends AbstractComparator { - private static final Log log = LogFactory.getLog(JsonListMatch.class); - private Map params; + private static final Log log = LogFactory.getLog(JsonListMatch.class); + private Map params; - private String MODE; //"percentage" or "count" + private String MODE; // "percentage" or "count" - public JsonListMatch(final Map params) { - super(params); - this.params = params; + public JsonListMatch(final Map params) { + super(params); + this.params = params; - MODE = params.getOrDefault("mode", "percentage"); - } + MODE = params.getOrDefault("mode", "percentage"); + } - @Override - public double compare(final Field a, final Field b, final Config conf) { + @Override + public double compare(final Field a, final Field b, final Config conf) { - final List sa = ((FieldList) a).stringList(); - final List sb = ((FieldList) b).stringList(); + final List sa = ((FieldList) a).stringList(); + final List sb = ((FieldList) b).stringList(); - if (sa.isEmpty() || sb.isEmpty()) { - return -1; - } + if (sa.isEmpty() || sb.isEmpty()) { + return -1; + } - final Set ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet()); - final Set cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet()); + final Set ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet()); + final Set cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet()); - int incommon = Sets.intersection(ca, cb).size(); - int simDiff = Sets.symmetricDifference(ca, cb).size(); + int incommon = Sets.intersection(ca, cb).size(); + int simDiff = Sets.symmetricDifference(ca, cb).size(); - if (incommon + simDiff == 0) { - return 0.0; - } + if (incommon + simDiff == 0) { + return 
0.0; + } - if (MODE.equals("percentage")) - return (double)incommon / (incommon + simDiff); - else - return incommon; + if (MODE.equals("percentage")) + return (double) incommon / (incommon + simDiff); + else + return incommon; - } + } - //converts every json into a comparable string basing on parameters - private String toComparableString(String json){ + // converts every json into a comparable string basing on parameters + private String toComparableString(String json) { - StringBuilder st = new StringBuilder(); //to build the string used for comparisons basing on the jpath into parameters + StringBuilder st = new StringBuilder(); // to build the string used for comparisons basing on the jpath into + // parameters - //for each path in the param list - for (String key: params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) { - String path = params.get(key); - String value = MapDocumentUtil.getJPathString(path, json); - if (value == null || value.isEmpty()) - value = ""; - st.append( value + "::"); - } + // for each path in the param list + for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) { + String path = params.get(key); + String value = MapDocumentUtil.getJPathString(path, json); + if (value == null || value.isEmpty()) + value = ""; + st.append(value + "::"); + } - st.setLength(st.length()-2); - return st.toString(); - } + st.setLength(st.length() - 2); + return st.toString(); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java index 7d275425d..0d69e5177 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/KeywordMatch.java @@ -1,47 +1,50 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; +import java.util.Set; + import eu.dnetlib.pace.config.Config; import 
eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; -import java.util.Set; - @ComparatorClass("keywordMatch") public class KeywordMatch extends AbstractComparator { - Map params; + Map params; - public KeywordMatch(Map params) { - super(params); - this.params = params; - } + public KeywordMatch(Map params) { + super(params); + this.params = params; + } - @Override - public double distance(final String a, final String b, final Config conf) { + @Override + public double distance(final String a, final String b, final Config conf) { - String ca = cleanup(a); - String cb = cleanup(b); + String ca = cleanup(a); + String cb = cleanup(b); - ca = normalize(ca); - cb = normalize(cb); + ca = normalize(ca); + cb = normalize(cb); - ca = filterAllStopWords(ca); - cb = filterAllStopWords(cb); + ca = filterAllStopWords(ca); + cb = filterAllStopWords(cb); - Set keywords1 = getKeywords(ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); - Set keywords2 = getKeywords(cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set keywords1 = getKeywords( + ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); + Set keywords2 = getKeywords( + cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4"))); - Set codes1 = toCodes(keywords1, conf.translationMap()); - Set codes2 = toCodes(keywords2, conf.translationMap()); + Set codes1 = toCodes(keywords1, conf.translationMap()); + Set codes2 = toCodes(keywords2, conf.translationMap()); - //if no cities are detected, the comparator gives 1.0 - if (codes1.isEmpty() && codes2.isEmpty()) - return 1.0; - else { - if (codes1.isEmpty() ^ codes2.isEmpty()) - return -1.0; //undefined if one of the two has no keywords - return commonElementsPercentage(codes1, codes2); - } - } + // if no cities are detected, the comparator gives 1.0 + if (codes1.isEmpty() && 
codes2.isEmpty()) + return 1.0; + else { + if (codes1.isEmpty() ^ codes2.isEmpty()) + return -1.0; // undefined if one of the two has no keywords + return commonElementsPercentage(codes1, codes2); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java index f9e01356d..d483049d7 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinkler.java @@ -1,15 +1,17 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("level2JaroWinkler") public class Level2JaroWinkler extends AbstractComparator { - public Level2JaroWinkler(Map params){ + public Level2JaroWinkler(Map params) { super(params, new com.wcohen.ss.Level2JaroWinkler()); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java index 29f99e4ae..a87a6079a 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2JaroWinklerTitle.java @@ -1,16 +1,18 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import eu.dnetlib.pace.config.Config; - -import java.util.Map; @ComparatorClass("level2JaroWinklerTitle") public class Level2JaroWinklerTitle extends AbstractComparator { - public Level2JaroWinklerTitle(Map params){ + public Level2JaroWinklerTitle(Map params) { super(params, new 
com.wcohen.ss.Level2JaroWinkler()); } @@ -29,7 +31,8 @@ public class Level2JaroWinklerTitle extends AbstractComparator { final boolean check = checkNumbers(ca, cb); - if (check) return 0.5; + if (check) + return 0.5; return ssalgo.score(ca, cb); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java index 73fb13c13..5ac19ee2e 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Level2Levenstein.java @@ -1,15 +1,17 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("level2Levenstein") public class Level2Levenstein extends AbstractComparator { - public Level2Levenstein(Map params){ + public Level2Levenstein(Map params) { super(params, new com.wcohen.ss.Level2Levenstein()); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java index c146e5ab1..4072f52aa 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/Levenstein.java @@ -1,15 +1,17 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("levenstein") public class Levenstein extends AbstractComparator { - public Levenstein(Map params){ + public Levenstein(Map params) { super(params, new com.wcohen.ss.Levenstein()); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java 
b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java index fda5848df..896e93f09 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java @@ -1,21 +1,23 @@ + package eu.dnetlib.pace.tree; -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.tree.support.AbstractComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; -import eu.dnetlib.pace.config.Config; +import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import java.util.Map; +import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("levensteinTitle") public class LevensteinTitle extends AbstractComparator { private static final Log log = LogFactory.getLog(LevensteinTitle.class); - public LevensteinTitle(Map params){ + public LevensteinTitle(Map params) { super(params, new com.wcohen.ss.Levenstein()); } @@ -34,7 +36,8 @@ public class LevensteinTitle extends AbstractComparator { final boolean check = checkNumbers(ca, cb); - if (check) return 0.5; + if (check) + return 0.5; return normalize(ssalgo.score(ca, cb), ca.length(), cb.length()); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java index 506760fa0..796edf49e 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitleIgnoreVersion.java @@ -1,12 +1,13 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import 
eu.dnetlib.pace.tree.support.ComparatorClass; -import eu.dnetlib.pace.config.Config; - - -import java.util.Map; /** * Compared compare between two titles, ignoring version numbers. Suitable for Software entities. @@ -14,7 +15,7 @@ import java.util.Map; @ComparatorClass("levensteinTitleIgnoreVersion") public class LevensteinTitleIgnoreVersion extends AbstractComparator { - public LevensteinTitleIgnoreVersion(Map params){ + public LevensteinTitleIgnoreVersion(Map params) { super(params, new com.wcohen.ss.Levenstein()); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java index 40e041f6f..8abe37d96 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java @@ -1,17 +1,19 @@ -package eu.dnetlib.pace.tree; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; -import eu.dnetlib.pace.tree.support.AbstractComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; +package eu.dnetlib.pace.tree; import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + /** * The Class Contains match * @@ -20,55 +22,54 @@ import java.util.stream.Collectors; @ComparatorClass("listContainsMatch") public class ListContainsMatch extends AbstractComparator { - private Map params; - private boolean CASE_SENSITIVE; - private String STRING; - private String AGGREGATOR; + private Map params; + private boolean CASE_SENSITIVE; + private String STRING; + private String 
AGGREGATOR; - public ListContainsMatch(Map params) { - super(params); - this.params = params; + public ListContainsMatch(Map params) { + super(params); + this.params = params; - //read parameters - CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false")); - STRING = params.get("string"); - AGGREGATOR = params.get("bool"); - } + // read parameters + CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false")); + STRING = params.get("string"); + AGGREGATOR = params.get("bool"); + } - @Override - public double compare(final Field a, final Field b, final Config conf) { + @Override + public double compare(final Field a, final Field b, final Config conf) { - List sa = ((FieldList) a).stringList(); - List sb = ((FieldList) b).stringList(); + List sa = ((FieldList) a).stringList(); + List sb = ((FieldList) b).stringList(); - if (sa.isEmpty() || sb.isEmpty()) { - return -1; - } + if (sa.isEmpty() || sb.isEmpty()) { + return -1; + } - if (!CASE_SENSITIVE) { - sa = sa.stream().map(String::toLowerCase).collect(Collectors.toList()); - sb = sb.stream().map(String::toLowerCase).collect(Collectors.toList()); - STRING = STRING.toLowerCase(); - } + if (!CASE_SENSITIVE) { + sa = sa.stream().map(String::toLowerCase).collect(Collectors.toList()); + sb = sb.stream().map(String::toLowerCase).collect(Collectors.toList()); + STRING = STRING.toLowerCase(); + } - switch(AGGREGATOR) { - case "AND": - if(sa.contains(STRING) && sb.contains(STRING)) - return 1.0; - break; - case "OR": - if(sa.contains(STRING) || sb.contains(STRING)) - return 1.0; - break; - case "XOR": - if(sa.contains(STRING) ^ sb.contains(STRING)) - return 1.0; - break; - default: - return 0.0; - } - return 0.0; + switch (AGGREGATOR) { + case "AND": + if (sa.contains(STRING) && sb.contains(STRING)) + return 1.0; + break; + case "OR": + if (sa.contains(STRING) || sb.contains(STRING)) + return 1.0; + break; + case "XOR": + if (sa.contains(STRING) ^ sb.contains(STRING)) + return 
1.0; + break; + default: + return 0.0; + } + return 0.0; - } + } } - diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java index 64413dfff..ee4b58d9c 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MustBeDifferent.java @@ -1,16 +1,18 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import eu.dnetlib.pace.config.Config; - -import java.util.Map; @ComparatorClass("mustBeDifferent") public class MustBeDifferent extends AbstractComparator { - public MustBeDifferent(Map params){ + public MustBeDifferent(Map params) { super(params, new com.wcohen.ss.Levenstein()); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java index 98c2f4548..8b400122f 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NullDistanceAlgo.java @@ -1,12 +1,13 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.Comparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - /** * Not all fields of a document need to partecipate in the compare measure. We model those fields as having a * NullDistanceAlgo. 
@@ -14,7 +15,7 @@ import java.util.Map; @ComparatorClass("null") public class NullDistanceAlgo implements Comparator { - public NullDistanceAlgo(Map params){ + public NullDistanceAlgo(Map params) { } @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java index ac6d78403..ebe25bab4 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersComparator.java @@ -1,34 +1,35 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("numbersComparator") public class NumbersComparator extends AbstractComparator { - Map params; + Map params; - public NumbersComparator(Map params) { - super(params); - this.params = params; - } + public NumbersComparator(Map params) { + super(params); + this.params = params; + } - @Override - public double distance(String a, String b, Config conf) { + @Override + public double distance(String a, String b, Config conf) { - //extracts numbers from the field - String numbers1 = getNumbers(nfd(a)); - String numbers2 = getNumbers(nfd(b)); + // extracts numbers from the field + String numbers1 = getNumbers(nfd(a)); + String numbers2 = getNumbers(nfd(b)); - if (numbers1.isEmpty() || numbers2.isEmpty()) - return -1.0; + if (numbers1.isEmpty() || numbers2.isEmpty()) + return -1.0; - int n1 = Integer.parseInt(numbers1); - int n2 = Integer.parseInt(numbers2); + int n1 = Integer.parseInt(numbers1); + int n2 = Integer.parseInt(numbers2); - return Math.abs(n1 - n2); - } + return Math.abs(n1 - n2); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java index 
ce60a672a..52f99d018 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/NumbersMatch.java @@ -1,35 +1,35 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - @ComparatorClass("numbersMatch") public class NumbersMatch extends AbstractComparator { + public NumbersMatch(Map params) { + super(params); + } - public NumbersMatch(Map params) { - super(params); - } + @Override + public double distance(String a, String b, Config conf) { - @Override - public double distance(String a, String b, Config conf) { + // extracts numbers from the field + String numbers1 = getNumbers(nfd(a)); + String numbers2 = getNumbers(nfd(b)); - //extracts numbers from the field - String numbers1 = getNumbers(nfd(a)); - String numbers2 = getNumbers(nfd(b)); + if (numbers1.isEmpty() && numbers2.isEmpty()) + return 1.0; - if (numbers1.isEmpty() && numbers2.isEmpty()) - return 1.0; + if (numbers1.isEmpty() || numbers2.isEmpty()) + return -1.0; - if (numbers1.isEmpty() || numbers2.isEmpty()) - return -1.0; + if (numbers1.equals(numbers2)) + return 1.0; - if (numbers1.equals(numbers2)) - return 1.0; - - return 0.0; - } -} \ No newline at end of file + return 0.0; + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java index bdbde9610..08e4d5d84 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/RomansMatch.java @@ -1,35 +1,35 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - 
@ComparatorClass("romansMatch") public class RomansMatch extends AbstractComparator { + public RomansMatch(Map params) { + super(params); + } - public RomansMatch(Map params) { - super(params); - } + @Override + public double distance(String a, String b, Config conf) { - @Override - public double distance(String a, String b, Config conf) { + // extracts romans from the field + String romans1 = getRomans(nfd(a)); + String romans2 = getRomans(nfd(b)); - //extracts romans from the field - String romans1 = getRomans(nfd(a)); - String romans2 = getRomans(nfd(b)); + if (romans1.isEmpty() && romans2.isEmpty()) + return 1.0; - if (romans1.isEmpty() && romans2.isEmpty()) - return 1.0; + if (romans1.isEmpty() || romans2.isEmpty()) + return -1.0; - if (romans1.isEmpty() || romans2.isEmpty()) - return -1.0; + if (romans1.equals(romans2)) + return 1.0; - if (romans1.equals(romans2)) - return 1.0; - - return 0.0; - } + return 0.0; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java index 175b0666d..01cb3dd63 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SizeMatch.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.tree; import java.util.List; @@ -18,34 +19,34 @@ import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("sizeMatch") public class SizeMatch extends AbstractComparator { - /** - * Instantiates a new size match. - * - * @param params - * the parameters - */ - public SizeMatch(final Map params) { - super(params); - } + /** + * Instantiates a new size match. 
+ * + * @param params + * the parameters + */ + public SizeMatch(final Map params) { + super(params); + } - @Override - public double compare(final Field a, final Field b, final Config conf) { + @Override + public double compare(final Field a, final Field b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) - return -1; + if (a.isEmpty() || b.isEmpty()) + return -1; - return Iterables.size(a) == Iterables.size(b) ? 1 : 0; - } + return Iterables.size(a) == Iterables.size(b) ? 1 : 0; + } - /** - * Checks if is empty. - * - * @param a - * the a - * @return true, if is empty - */ - protected boolean isEmpty(final Iterable a) { - return (a == null) || Iterables.isEmpty(a); - } + /** + * Checks if is empty. + * + * @param a + * the a + * @return true, if is empty + */ + protected boolean isEmpty(final Iterable a) { + return (a == null) || Iterables.isEmpty(a); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java index 79173ba66..6e5c36def 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedJaroWinkler.java @@ -1,18 +1,20 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.tree.support.AbstractSortedComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - /** * The Class SortedJaroWinkler. 
*/ @ComparatorClass("sortedJaroWinkler") public class SortedJaroWinkler extends AbstractSortedComparator { - public SortedJaroWinkler(Map params){ + public SortedJaroWinkler(Map params) { super(params, new com.wcohen.ss.Levenstein()); } @@ -40,7 +42,6 @@ public class SortedJaroWinkler extends AbstractSortedComparator { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() */ @Override @@ -50,7 +51,6 @@ public class SortedJaroWinkler extends AbstractSortedComparator { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) */ @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java index de8c669d7..3046fceae 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SortedLevel2JaroWinkler.java @@ -1,11 +1,13 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.tree.support.AbstractSortedComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - /** * The Class SortedJaroWinkler. 
*/ @@ -22,7 +24,7 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator { super(weight, new com.wcohen.ss.Level2JaroWinkler()); } - public SortedLevel2JaroWinkler(final Map params){ + public SortedLevel2JaroWinkler(final Map params) { super(params, new com.wcohen.ss.Level2JaroWinkler()); } @@ -40,7 +42,6 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() */ @Override @@ -50,7 +51,6 @@ public class SortedLevel2JaroWinkler extends AbstractSortedComparator { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) */ @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java index 9c5a9fed5..cef6de504 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java @@ -1,11 +1,12 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import java.util.Map; - /** * The Class Contains match * @@ -14,53 +15,53 @@ import java.util.Map; @ComparatorClass("stringContainsMatch") public class StringContainsMatch extends AbstractComparator { - private Map params; + private Map params; - private boolean CASE_SENSITIVE; - private String STRING; - private String AGGREGATOR; + private boolean CASE_SENSITIVE; + private String STRING; + private String AGGREGATOR; - public StringContainsMatch(Map params) { - super(params); - this.params = params; + public StringContainsMatch(Map params) { + super(params); + this.params = params; - //read parameters - CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false")); - STRING = 
params.get("string"); - AGGREGATOR = params.get("aggregator"); + // read parameters + CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false")); + STRING = params.get("string"); + AGGREGATOR = params.get("aggregator"); - } + } - @Override - public double distance(final String a, final String b, final Config conf) { + @Override + public double distance(final String a, final String b, final Config conf) { - String ca = a; - String cb = b; - if (!CASE_SENSITIVE) { - ca = a.toLowerCase(); - cb = b.toLowerCase(); - STRING = STRING.toLowerCase(); - } + String ca = a; + String cb = b; + if (!CASE_SENSITIVE) { + ca = a.toLowerCase(); + cb = b.toLowerCase(); + STRING = STRING.toLowerCase(); + } - if (AGGREGATOR != null) { - switch (AGGREGATOR) { - case "AND": - if (ca.contains(STRING) && cb.contains(STRING)) - return 1.0; - break; - case "OR": - if (ca.contains(STRING) || cb.contains(STRING)) - return 1.0; - break; - case "XOR": - if (ca.contains(STRING) ^ cb.contains(STRING)) - return 1.0; - break; - default: - return 0.0; - } - } + if (AGGREGATOR != null) { + switch (AGGREGATOR) { + case "AND": + if (ca.contains(STRING) && cb.contains(STRING)) + return 1.0; + break; + case "OR": + if (ca.contains(STRING) || cb.contains(STRING)) + return 1.0; + break; + case "XOR": + if (ca.contains(STRING) ^ cb.contains(STRING)) + return 1.0; + break; + default: + return 0.0; + } + } - return 0.0; - } + return 0.0; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java index e67a7ea0b..c74deadc9 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java @@ -1,54 +1,57 @@ -package eu.dnetlib.pace.tree; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; 
-import eu.dnetlib.pace.tree.support.AbstractComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +package eu.dnetlib.pace.tree; import java.util.HashSet; import java.util.Map; import java.util.Set; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("stringListMatch") public class StringListMatch extends AbstractComparator { - private static final Log log = LogFactory.getLog(StringListMatch.class); - private Map params; + private static final Log log = LogFactory.getLog(StringListMatch.class); + private Map params; - final private String TYPE; //percentage or count + final private String TYPE; // percentage or count - public StringListMatch(final Map params) { - super(params); - this.params = params; + public StringListMatch(final Map params) { + super(params); + this.params = params; - TYPE = params.getOrDefault("type", "percentage"); - } + TYPE = params.getOrDefault("type", "percentage"); + } - @Override - public double compare(final Field a, final Field b, final Config conf) { + @Override + public double compare(final Field a, final Field b, final Config conf) { - final Set pa = new HashSet<>(((FieldList) a).stringList()); - final Set pb = new HashSet<>(((FieldList) b).stringList()); + final Set pa = new HashSet<>(((FieldList) a).stringList()); + final Set pb = new HashSet<>(((FieldList) b).stringList()); - if (pa.isEmpty() || pb.isEmpty()) { - return -1; //return undefined if one of the two lists is empty - } + if (pa.isEmpty() || pb.isEmpty()) { + return -1; // return undefined if one of the two lists is empty + } - int incommon = 
Sets.intersection(pa, pb).size(); - int simDiff = Sets.symmetricDifference(pa, pb).size(); + int incommon = Sets.intersection(pa, pb).size(); + int simDiff = Sets.symmetricDifference(pa, pb).size(); - if (incommon + simDiff == 0) { - return 0.0; - } + if (incommon + simDiff == 0) { + return 0.0; + } - if(TYPE.equals("percentage")) - return (double)incommon / (incommon + simDiff); - else - return incommon; + if (TYPE.equals("percentage")) + return (double) incommon / (incommon + simDiff); + else + return incommon; - } -} \ No newline at end of file + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java index 0c4165bf6..23be3f752 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/SubStringLevenstein.java @@ -1,17 +1,18 @@ + package eu.dnetlib.pace.tree; -import eu.dnetlib.pace.config.Config; +import java.util.Map; + import org.apache.commons.lang3.StringUtils; import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; - -import java.util.Map; - /** * The Class SubStringLevenstein. 
*/ @@ -31,7 +32,7 @@ public class SubStringLevenstein extends AbstractComparator { super(w, new com.wcohen.ss.Levenstein()); } - public SubStringLevenstein(Map params){ + public SubStringLevenstein(Map params) { super(params, new com.wcohen.ss.Levenstein()); this.limit = Integer.parseInt(params.getOrDefault("limit", "1")); } @@ -66,8 +67,8 @@ public class SubStringLevenstein extends AbstractComparator { /* * (non-Javadoc) - * - * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) + * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#compare(eu.dnetlib.pace.model.Field, + * eu.dnetlib.pace.model.Field) */ @Override public double distance(final Field a, final Field b, final Config conf) { @@ -79,7 +80,6 @@ public class SubStringLevenstein extends AbstractComparator { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.compare.DistanceAlgo#getWeight() */ @Override @@ -89,7 +89,6 @@ public class SubStringLevenstein extends AbstractComparator { /* * (non-Javadoc) - * * @see eu.dnetlib.pace.compare.SecondStringDistanceAlgo#normalize(double) */ @Override diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java index 873a0c100..db1faf9e2 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/TitleVersionMatch.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.tree; import java.util.List; @@ -17,24 +18,24 @@ import eu.dnetlib.pace.tree.support.ComparatorClass; @ComparatorClass("titleVersionMatch") public class TitleVersionMatch extends AbstractComparator { - public TitleVersionMatch(final Map params) { - super(params); - } + public TitleVersionMatch(final Map params) { + super(params); + } - @Override - public double compare(final Field a, final Field b, final Config conf) { - final String valueA = getFirstValue(a); - final 
String valueB = getFirstValue(b); + @Override + public double compare(final Field a, final Field b, final Config conf) { + final String valueA = getFirstValue(a); + final String valueB = getFirstValue(b); - if (valueA.isEmpty() || valueB.isEmpty()) - return -1; + if (valueA.isEmpty() || valueB.isEmpty()) + return -1; - return notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 1 : 0; - } + return notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 1 : 0; + } - @Override - public String toString() { - return getClass().getSimpleName() + ":" + super.toString(); - } + @Override + public String toString() { + return getClass().getSimpleName() + ":" + super.toString(); + } -} \ No newline at end of file +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java index fc98fc187..f4f00a908 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/UrlMatcher.java @@ -1,59 +1,61 @@ -package eu.dnetlib.pace.tree; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.tree.support.ComparatorClass; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.tree; import java.net.MalformedURLException; import java.net.URL; import java.util.Map; +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("urlMatcher") public class UrlMatcher extends Levenstein { - private Map params; + private Map params; - public UrlMatcher(Map params){ - super(params); - this.params = params; - } + public UrlMatcher(Map params) { + super(params); + this.params = params; + } - public UrlMatcher(double weight, Map params) { - super(weight); - this.params = params; - } + public UrlMatcher(double weight, Map 
params) { + super(weight); + this.params = params; + } - public void setParams(Map params) { - this.params = params; - } + public void setParams(Map params) { + this.params = params; + } - @Override - public double distance(Field a, Field b, final Config conf) { - final URL urlA = asUrl(getFirstValue(a)); - final URL urlB = asUrl(getFirstValue(b)); + @Override + public double distance(Field a, Field b, final Config conf) { + final URL urlA = asUrl(getFirstValue(a)); + final URL urlB = asUrl(getFirstValue(b)); - if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) { - return 0.0; - } + if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) { + return 0.0; + } - Double hostW = Double.parseDouble(params.getOrDefault("host", "0.5")); - Double pathW = Double.parseDouble(params.getOrDefault("path", "0.5")); + Double hostW = Double.parseDouble(params.getOrDefault("host", "0.5")); + Double pathW = Double.parseDouble(params.getOrDefault("path", "0.5")); - if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) { - return hostW * 0.5; - } + if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) { + return hostW * 0.5; + } - return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf); - } + return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath(), conf); + } - private URL asUrl(final String value) { - try { - return new URL(value); - } catch (MalformedURLException e) { - // should not happen as checked by pace typing - throw new IllegalStateException("invalid URL: " + value); - } - } + private URL asUrl(final String value) { + try { + return new URL(value); + } catch (MalformedURLException e) { + // should not happen as checked by pace typing + throw new IllegalStateException("invalid URL: " + value); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java index 5dda0e25f..7ee8c8bad 100644 --- 
a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/YearMatch.java @@ -1,12 +1,14 @@ + package eu.dnetlib.pace.tree; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.tree.support.AbstractComparator; import eu.dnetlib.pace.tree.support.ComparatorClass; -import org.apache.commons.lang3.StringUtils; - -import java.util.Map; /** * Returns true if the year of the date field in the given documents are the same, false when any of the two is invalid or it's missing. @@ -16,36 +18,36 @@ import java.util.Map; @ComparatorClass("yearMatch") public class YearMatch extends AbstractComparator { - private int limit = 4; + private int limit = 4; - public YearMatch(final Map params) { - super(params); - } + public YearMatch(final Map params) { + super(params); + } - @Override - public double compare(final Field a, final Field b, final Config conf) { - final String valueA = getNumbers(getFirstValue(a)); - final String valueB = getNumbers(getFirstValue(b)); + @Override + public double compare(final Field a, final Field b, final Config conf) { + final String valueA = getNumbers(getFirstValue(a)); + final String valueB = getNumbers(getFirstValue(b)); - if (valueA.isEmpty() || valueB.isEmpty()) - return -1; + if (valueA.isEmpty() || valueB.isEmpty()) + return -1; - final boolean lengthMatch = checkLength(valueA) && checkLength(valueB); - final boolean onemissing = valueA.isEmpty() || valueB.isEmpty(); + final boolean lengthMatch = checkLength(valueA) && checkLength(valueB); + final boolean onemissing = valueA.isEmpty() || valueB.isEmpty(); - return lengthMatch && valueA.equals(valueB) || onemissing ? 1 : 0; - } + return lengthMatch && valueA.equals(valueB) || onemissing ? 
1 : 0; + } - protected boolean checkLength(final String s) { - return s.length() == limit; - } + protected boolean checkLength(final String s) { + return s.length() == limit; + } - protected String getFirstValue(final Field value) { - return (value != null) && !value.isEmpty() ? StringUtils.left(value.stringValue(), limit) : ""; - } + protected String getFirstValue(final Field value) { + return (value != null) && !value.isEmpty() ? StringUtils.left(value.stringValue(), limit) : ""; + } - @Override - public String toString() { - return getClass().getSimpleName() + ":" + super.toString(); - } -} \ No newline at end of file + @Override + public String toString() { + return getClass().getSimpleName() + ":" + super.toString(); + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java index ddfcc5565..3ecffb289 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java @@ -1,124 +1,130 @@ + package eu.dnetlib.pace.tree.support; +import java.util.List; +import java.util.Map; + import com.wcohen.ss.AbstractStringDistance; + import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; -import java.util.List; -import java.util.Map; - public abstract class AbstractComparator extends AbstractPaceFunctions implements Comparator { - /** The ssalgo. */ - protected AbstractStringDistance ssalgo; + /** The ssalgo. */ + protected AbstractStringDistance ssalgo; - /** The weight. */ - protected double weight = 0.0; + /** The weight. 
*/ + protected double weight = 0.0; - private Map params; + private Map params; - protected AbstractComparator(Map params) { - this.params = params; - } + protected AbstractComparator(Map params) { + this.params = params; + } - protected AbstractComparator(Map params, final AbstractStringDistance ssalgo){ - this.params = params; - this.weight = 1.0; - this.ssalgo = ssalgo; - } + protected AbstractComparator(Map params, final AbstractStringDistance ssalgo) { + this.params = params; + this.weight = 1.0; + this.ssalgo = ssalgo; + } - /** - * Instantiates a new second string compare algo. - * - * @param weight - * the weight - * @param ssalgo - * the ssalgo - */ - protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) { - this.ssalgo = ssalgo; - this.weight = weight; - } + /** + * Instantiates a new second string compare algo. + * + * @param weight + * the weight + * @param ssalgo + * the ssalgo + */ + protected AbstractComparator(final double weight, final AbstractStringDistance ssalgo) { + this.ssalgo = ssalgo; + this.weight = weight; + } - protected AbstractComparator(final AbstractStringDistance ssalgo){ - this.ssalgo = ssalgo; - } + protected AbstractComparator(final AbstractStringDistance ssalgo) { + this.ssalgo = ssalgo; + } - /** - * Normalize. - * - * @param d - * the d - * @return the double - */ - protected double normalize(double d) { - return d; - } + /** + * Normalize. + * + * @param d + * the d + * @return the double + */ + protected double normalize(double d) { + return d; + } - /** - * Distance. - * - * @param a - * the a - * @param b - * the b - * @return the double - */ - public double distance(final String a, final String b, final Config conf) { + /** + * Distance. 
+ * + * @param a + * the a + * @param b + * the b + * @return the double + */ + public double distance(final String a, final String b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) { - return -1; //return -1 if a field is missing - } - double score = ssalgo.score(a, b); - return normalize(score); - } + if (a.isEmpty() || b.isEmpty()) { + return -1; // return -1 if a field is missing + } + double score = ssalgo.score(a, b); + return normalize(score); + } - /** - * Distance. - * - * @param a - * the a - * @param b - * the b - * @return the double - */ - protected double distance(final List a, final List b, final Config conf) { - return distance(concat(a), concat(b), conf); - } + /** + * Distance. + * + * @param a + * the a + * @param b + * the b + * @return the double + */ + protected double distance(final List a, final List b, final Config conf) { + return distance(concat(a), concat(b), conf); + } - public double distance(final Field a, final Field b, final Config conf) { - if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue(), conf); - if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf); + public double distance(final Field a, final Field b, final Config conf) { + if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) + return distance(a.stringValue(), b.stringValue(), conf); + if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) + return distance(toList(a), toList(b), conf); - throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); - } + throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); + } - @Override - public double compare(final Field a, final Field b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) - return -1; - if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return 
distance(a.stringValue(), b.stringValue(), conf); - if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b), conf); + @Override + public double compare(final Field a, final Field b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) + return -1; + if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) + return distance(a.stringValue(), b.stringValue(), conf); + if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) + return distance(toList(a), toList(b), conf); - throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); - } + throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); + } - /** - * To list. - * - * @param list - * the list - * @return the list - */ - protected List toList(final Field list) { - return ((FieldList) list).stringList(); - } + /** + * To list. + * + * @param list + * the list + * @return the list + */ + protected List toList(final Field list) { + return ((FieldList) list).stringList(); + } - public double getWeight(){ - return this.weight; - } + public double getWeight() { + return this.weight; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java index 557ad0c29..8927f2e14 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractSortedComparator.java @@ -1,38 +1,40 @@ -package eu.dnetlib.pace.tree.support; -import com.google.common.collect.Lists; -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; +package eu.dnetlib.pace.tree.support; import java.util.Collections; import java.util.List; import java.util.Map; +import com.google.common.collect.Lists; 
+import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; + public abstract class AbstractSortedComparator extends AbstractComparator { - /** - * Instantiates a new sorted second string compare algo. - * - * @param weight - * the weight - * @param ssalgo - * the ssalgo - */ - protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } + /** + * Instantiates a new sorted second string compare algo. + * + * @param weight + * the weight + * @param ssalgo + * the ssalgo + */ + protected AbstractSortedComparator(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } - protected AbstractSortedComparator(final Map params, final AbstractStringDistance ssalgo){ - super(Double.parseDouble(params.get("weight")), ssalgo); - } + protected AbstractSortedComparator(final Map params, final AbstractStringDistance ssalgo) { + super(Double.parseDouble(params.get("weight")), ssalgo); + } - @Override - protected List toList(final Field list) { - FieldList fl = (FieldList) list; - List values = Lists.newArrayList(fl.stringList()); - Collections.sort(values); - return values; - } + @Override + protected List toList(final Field list) { + FieldList fl = (FieldList) list; + List values = Lists.newArrayList(fl.stringList()); + Collections.sort(values); + return values; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java index caf7cd4c8..7011f2bf3 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AggType.java @@ -1,24 +1,21 @@ + package eu.dnetlib.pace.tree.support; import eu.dnetlib.pace.util.PaceException; public enum AggType { - W_MEAN, //weighted mean - AVG, //average - SUM, - MAX, - MIN, - AND, //used for necessary 
conditions - OR; //used for sufficient conditions + W_MEAN, // weighted mean + AVG, // average + SUM, MAX, MIN, AND, // used for necessary conditions + OR; // used for sufficient conditions - public static AggType getEnum(String value) { + public static AggType getEnum(String value) { - try { - return AggType.valueOf(value); - } - catch (IllegalArgumentException e) { - throw new PaceException("Undefined aggregation type", e); - } - } + try { + return AggType.valueOf(value); + } catch (IllegalArgumentException e) { + throw new PaceException("Undefined aggregation type", e); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java index 67c26e99a..b11ca5429 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/Comparator.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.tree.support; import eu.dnetlib.pace.config.Config; @@ -5,10 +6,9 @@ import eu.dnetlib.pace.model.Field; public interface Comparator { - /* - * return : -1 -> can't decide (i.e. missing field) - * >0 -> similarity degree (depends on the algorithm) - * */ - public double compare(Field a, Field b, Config conf); + /* + * return : -1 -> can't decide (i.e. 
missing field) >0 -> similarity degree (depends on the algorithm) + */ + public double compare(Field a, Field b, Config conf); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/ComparatorClass.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/ComparatorClass.java index 8c3002eb6..5ef0932cf 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/ComparatorClass.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/ComparatorClass.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.tree.support; import java.lang.annotation.ElementType; @@ -9,5 +10,5 @@ import java.lang.annotation.Target; @Target(ElementType.TYPE) public @interface ComparatorClass { - public String value(); + public String value(); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java index 44971876f..d81c68e38 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldConf.java @@ -1,82 +1,84 @@ + package eu.dnetlib.pace.tree.support; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.pace.util.PaceException; - - import java.io.IOException; import java.io.Serializable; import java.util.Map; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.pace.util.PaceException; + /** * The class that defines the configuration of each field in the decision tree. 
* */ public class FieldConf implements Serializable { - private String field; //name of the field on which apply the comparator - private String comparator; //comparator name - private double weight = 1.0; //weight for the field (to be used in the aggregation) - private Map params; //parameters + private String field; // name of the field on which apply the comparator + private String comparator; // comparator name + private double weight = 1.0; // weight for the field (to be used in the aggregation) + private Map params; // parameters - private boolean countIfUndefined; + private boolean countIfUndefined; - public boolean isCountIfUndefined() { - return countIfUndefined; - } + public boolean isCountIfUndefined() { + return countIfUndefined; + } - public void setCountIfUndefined(boolean countIfUndefined) { - this.countIfUndefined = countIfUndefined; - } + public void setCountIfUndefined(boolean countIfUndefined) { + this.countIfUndefined = countIfUndefined; + } - public FieldConf() { - } + public FieldConf() { + } - public FieldConf(String field, String comparator, double weight, Map params, boolean countIfUndefined) { - this.field = field; - this.comparator = comparator; - this.weight = weight; - this.params = params; - this.countIfUndefined = countIfUndefined; - } + public FieldConf(String field, String comparator, double weight, Map params, + boolean countIfUndefined) { + this.field = field; + this.comparator = comparator; + this.weight = weight; + this.params = params; + this.countIfUndefined = countIfUndefined; + } - public String getField() { - return field; - } + public String getField() { + return field; + } - public void setField(String field) { - this.field = field; - } + public void setField(String field) { + this.field = field; + } - public String getComparator() { - return comparator; - } + public String getComparator() { + return comparator; + } - public void setComparator(String comparator) { - this.comparator = comparator; - } + public void 
setComparator(String comparator) { + this.comparator = comparator; + } - public double getWeight() { - return weight; - } + public double getWeight() { + return weight; + } - public void setWeight(double weight) { - this.weight = weight; - } + public void setWeight(double weight) { + this.weight = weight; + } - public Map getParams() { - return params; - } + public Map getParams() { + return params; + } - public void setParams(Map params) { - this.params = params; - } + public void setParams(Map params) { + this.params = params; + } - @Override - public String toString() { - try { - return new ObjectMapper().writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("Impossible to convert to JSON: ", e); - } - } -} \ No newline at end of file + @Override + public String toString() { + try { + return new ObjectMapper().writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Impossible to convert to JSON: ", e); + } + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java index fb0b51b47..0d5c80f53 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/FieldStats.java @@ -1,89 +1,90 @@ + package eu.dnetlib.pace.tree.support; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.util.PaceException; - - import java.io.IOException; import java.io.Serializable; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.util.PaceException; + /** * The class that contains the result of each comparison in the decision tree * */ public class FieldStats implements Serializable { - private double weight; //weight for the field (to be used in the aggregation) - private double threshold; //threshold for the field (to 
be used in some kind of aggregations) - private double result; //the result of the comparison - private Field a; - private Field b; + private double weight; // weight for the field (to be used in the aggregation) + private double threshold; // threshold for the field (to be used in some kind of aggregations) + private double result; // the result of the comparison + private Field a; + private Field b; - private boolean countIfUndefined; + private boolean countIfUndefined; - public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Field a, Field b) { - this.weight = weight; - this.threshold = threshold; - this.result = result; - this.countIfUndefined = countIfUndefined; - this.a = a; - this.b = b; - } + public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Field a, Field b) { + this.weight = weight; + this.threshold = threshold; + this.result = result; + this.countIfUndefined = countIfUndefined; + this.a = a; + this.b = b; + } - public double getThreshold() { - return threshold; - } + public double getThreshold() { + return threshold; + } - public void setThreshold(double threshold) { - this.threshold = threshold; - } + public void setThreshold(double threshold) { + this.threshold = threshold; + } - public double getWeight() { - return weight; - } + public double getWeight() { + return weight; + } - public void setWeight(double weight) { - this.weight = weight; - } + public void setWeight(double weight) { + this.weight = weight; + } - public double getResult() { - return result; - } + public double getResult() { + return result; + } - public void setResult(double result) { - this.result = result; - } + public void setResult(double result) { + this.result = result; + } - public boolean isCountIfUndefined() { - return countIfUndefined; - } + public boolean isCountIfUndefined() { + return countIfUndefined; + } - public void setCountIfUndefined(boolean countIfUndefined) { - this.countIfUndefined = 
countIfUndefined; - } + public void setCountIfUndefined(boolean countIfUndefined) { + this.countIfUndefined = countIfUndefined; + } - public Field getA() { - return a; - } + public Field getA() { + return a; + } - public void setA(Field a) { - this.a = a; - } + public void setA(Field a) { + this.a = a; + } - public Field getB() { - return b; - } + public Field getB() { + return b; + } - public void setB(Field b) { - this.b = b; - } + public void setB(Field b) { + this.b = b; + } - @Override - public String toString(){ - try { - return new ObjectMapper().writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("Impossible to convert to JSON: ", e); - } - } + @Override + public String toString() { + try { + return new ObjectMapper().writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Impossible to convert to JSON: ", e); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java index c16039587..8dff818e8 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/MatchType.java @@ -1,18 +1,16 @@ + package eu.dnetlib.pace.tree.support; public enum MatchType { - MATCH, - NO_MATCH, - UNDEFINED; + MATCH, NO_MATCH, UNDEFINED; - public static MatchType parse(String value) { + public static MatchType parse(String value) { - try { - return MatchType.valueOf(value); - } - catch (IllegalArgumentException e) { - return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable - } - } + try { + return MatchType.valueOf(value); + } catch (IllegalArgumentException e) { + return MatchType.UNDEFINED; // return UNDEFINED if the enum is not parsable + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java index 
f7ebe96d1..a754f13cd 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeDef.java @@ -1,150 +1,160 @@ -package eu.dnetlib.pace.tree.support; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.config.PaceConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.util.PaceException; -import org.apache.commons.lang3.StringUtils; +package eu.dnetlib.pace.tree.support; import java.io.IOException; import java.io.Serializable; import java.io.StringWriter; import java.util.List; +import org.apache.commons.lang3.StringUtils; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.config.PaceConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.PaceException; + public class TreeNodeDef implements Serializable { - final static String CROSS_COMPARE = "crossCompare"; + final static String CROSS_COMPARE = "crossCompare"; - private List fields; - private AggType aggregation; + private List fields; + private AggType aggregation; - private double threshold; + private double threshold; - private String positive; - private String negative; - private String undefined; + private String positive; + private String negative; + private String undefined; - boolean ignoreUndefined; + boolean ignoreUndefined; - public TreeNodeDef(List fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreUndefined) { - this.fields = fields; - this.aggregation = aggregation; - this.threshold = threshold; - this.positive = positive; - this.negative = negative; - this.undefined = undefined; - this.ignoreUndefined = ignoreUndefined; - } + public TreeNodeDef(List 
fields, AggType aggregation, double threshold, String positive, String negative, + String undefined, boolean ignoreUndefined) { + this.fields = fields; + this.aggregation = aggregation; + this.threshold = threshold; + this.positive = positive; + this.negative = negative; + this.undefined = undefined; + this.ignoreUndefined = ignoreUndefined; + } - public TreeNodeDef() {} + public TreeNodeDef() { + } - //function for the evaluation of the node - public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) { + // function for the evaluation of the node + public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) { - TreeNodeStats stats = new TreeNodeStats(); + TreeNodeStats stats = new TreeNodeStats(); - //for each field in the node, it computes the - for (FieldConf fieldConf : fields) { + // for each field in the node, it computes the + for (FieldConf fieldConf : fields) { - double weight = fieldConf.getWeight(); + double weight = fieldConf.getWeight(); - double result; + double result; - //if the param specifies a cross comparison (i.e. compare elements from different fields), compute the result for both sides and return the maximum - if(fieldConf.getParams().keySet().stream().anyMatch(k -> k.contains(CROSS_COMPARE))) { - String crossField = fieldConf.getParams().get(CROSS_COMPARE); - double result1 = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(crossField), conf); - double result2 = comparator(fieldConf).compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf); - result = Math.max(result1,result2); - } - else { - result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf); - } + // if the param specifies a cross comparison (i.e. 
compare elements from different fields), compute the + // result for both sides and return the maximum + if (fieldConf.getParams().keySet().stream().anyMatch(k -> k.contains(CROSS_COMPARE))) { + String crossField = fieldConf.getParams().get(CROSS_COMPARE); + double result1 = comparator(fieldConf) + .compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(crossField), conf); + double result2 = comparator(fieldConf) + .compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf); + result = Math.max(result1, result2); + } else { + result = comparator(fieldConf) + .compare( + doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), + conf); + } - stats.addFieldStats( - fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), - new FieldStats( - weight, - Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")), - result, - fieldConf.isCountIfUndefined(), - doc1.getFieldMap().get(fieldConf.getField()), - doc2.getFieldMap().get(fieldConf.getField()) - )); - } + stats + .addFieldStats( + fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf), + new FieldStats( + weight, + Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")), + result, + fieldConf.isCountIfUndefined(), + doc1.getFieldMap().get(fieldConf.getField()), + doc2.getFieldMap().get(fieldConf.getField()))); + } - return stats; - } + return stats; + } - private Comparator comparator(final FieldConf field){ + private Comparator comparator(final FieldConf field) { - return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams()); - } + return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams()); + } - public List getFields() { - return fields; - } + public List getFields() { + return fields; + } - public void setFields(List fields) { - this.fields = fields; - } + public void 
setFields(List fields) { + this.fields = fields; + } - public AggType getAggregation() { - return aggregation; - } + public AggType getAggregation() { + return aggregation; + } - public void setAggregation(AggType aggregation) { - this.aggregation = aggregation; - } + public void setAggregation(AggType aggregation) { + this.aggregation = aggregation; + } - public double getThreshold() { - return threshold; - } + public double getThreshold() { + return threshold; + } - public void setThreshold(double threshold) { - this.threshold = threshold; - } + public void setThreshold(double threshold) { + this.threshold = threshold; + } - public String getPositive() { - return positive; - } + public String getPositive() { + return positive; + } - public void setPositive(String positive) { - this.positive = positive; - } + public void setPositive(String positive) { + this.positive = positive; + } - public String getNegative() { - return negative; - } + public String getNegative() { + return negative; + } - public void setNegative(String negative) { - this.negative = negative; - } + public void setNegative(String negative) { + this.negative = negative; + } - public String getUndefined() { - return undefined; - } + public String getUndefined() { + return undefined; + } - public void setUndefined(String undefined) { - this.undefined = undefined; - } + public void setUndefined(String undefined) { + this.undefined = undefined; + } - public boolean isIgnoreUndefined() { - return ignoreUndefined; - } + public boolean isIgnoreUndefined() { + return ignoreUndefined; + } - public void setIgnoreUndefined(boolean ignoreUndefined) { - this.ignoreUndefined = ignoreUndefined; - } + public void setIgnoreUndefined(boolean ignoreUndefined) { + this.ignoreUndefined = ignoreUndefined; + } - @Override - public String toString() { - try { - return new ObjectMapper().writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("Impossible to convert to JSON: ", e); - } - } + 
@Override + public String toString() { + try { + return new ObjectMapper().writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Impossible to convert to JSON: ", e); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java index f9612a41f..2b96048b4 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeNodeStats.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.tree.support; import java.io.Serializable; @@ -6,129 +7,128 @@ import java.util.Map; public class TreeNodeStats implements Serializable { - private Map results; //this is an accumulator for the results of the node + private Map results; // this is an accumulator for the results of the node - public TreeNodeStats(){ - this.results = new HashMap<>(); - } + public TreeNodeStats() { + this.results = new HashMap<>(); + } - public Map getResults() { - return results; - } + public Map getResults() { + return results; + } - public void addFieldStats(String id, FieldStats fieldStats){ - this.results.put(id, fieldStats); - } + public void addFieldStats(String id, FieldStats fieldStats) { + this.results.put(id, fieldStats); + } - public int fieldsCount(){ - return this.results.size(); - } + public int fieldsCount() { + return this.results.size(); + } - public int undefinedCount(){ - int undefinedCount = 0; - for(FieldStats fs: this.results.values()){ - if(fs.getResult() == -1) - undefinedCount ++; - } - return undefinedCount; - } + public int undefinedCount() { + int undefinedCount = 0; + for (FieldStats fs : this.results.values()) { + if (fs.getResult() == -1) + undefinedCount++; + } + return undefinedCount; + } - public double scoreSum(){ - double scoreSum = 0.0; - for(FieldStats fs: this.results.values()){ - if(fs.getResult()>=0.0) { - scoreSum += fs.getResult(); - } - } - 
return scoreSum; - } + public double scoreSum() { + double scoreSum = 0.0; + for (FieldStats fs : this.results.values()) { + if (fs.getResult() >= 0.0) { + scoreSum += fs.getResult(); + } + } + return scoreSum; + } - //return the sum of the weights without considering the fields with countIfMissing=false && result=-1 - public double weightSum(){ - double weightSum = 0.0; - for(FieldStats fs: this.results.values()){ - if(fs.getResult()>=0.0 || (fs.getResult()<0.0 && fs.isCountIfUndefined())) { - weightSum += fs.getWeight(); - } - } - return weightSum; - } + // return the sum of the weights without considering the fields with countIfMissing=false && result=-1 + public double weightSum() { + double weightSum = 0.0; + for (FieldStats fs : this.results.values()) { + if (fs.getResult() >= 0.0 || (fs.getResult() < 0.0 && fs.isCountIfUndefined())) { + weightSum += fs.getWeight(); + } + } + return weightSum; + } - public double weightedScoreSum(){ - double weightedScoreSum = 0.0; - for(FieldStats fs: this.results.values()){ - if(fs.getResult()>=0.0) { - weightedScoreSum += fs.getResult()*fs.getWeight(); - } - } - return weightedScoreSum; - } + public double weightedScoreSum() { + double weightedScoreSum = 0.0; + for (FieldStats fs : this.results.values()) { + if (fs.getResult() >= 0.0) { + weightedScoreSum += fs.getResult() * fs.getWeight(); + } + } + return weightedScoreSum; + } - public double max(){ - double max = -1.0; - for(FieldStats fs: this.results.values()){ - if(fs.getResult()>max) - max = fs.getResult(); - } - return max; - } + public double max() { + double max = -1.0; + for (FieldStats fs : this.results.values()) { + if (fs.getResult() > max) + max = fs.getResult(); + } + return max; + } - public double min(){ - double min = 100.0; //random high value - for(FieldStats fs: this.results.values()){ - if(fs.getResult()<min) { - if(fs.getResult()>=0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined())) - min = fs.getResult(); - } - } - return min; - } + public double min() { + double min =
100.0; // random high value + for (FieldStats fs : this.results.values()) { + if (fs.getResult() < min) { + if (fs.getResult() >= 0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined())) + min = fs.getResult(); + } + } + return min; + } - //if at least one is true, return 1.0 - public double or(){ - for (FieldStats fieldStats : this.results.values()) { - if (fieldStats.getResult() >= fieldStats.getThreshold()) - return 1.0; - } - return 0.0; - } + // if at least one is true, return 1.0 + public double or() { + for (FieldStats fieldStats : this.results.values()) { + if (fieldStats.getResult() >= fieldStats.getThreshold()) + return 1.0; + } + return 0.0; + } - //if at least one is false, return 0.0 - public double and() { - for (FieldStats fieldStats : this.results.values()) { + // if at least one is false, return 0.0 + public double and() { + for (FieldStats fieldStats : this.results.values()) { - if (fieldStats.getResult() == -1) { - if (fieldStats.isCountIfUndefined()) - return 0.0; - } - else { - if (fieldStats.getResult() < fieldStats.getThreshold()) - return 0.0; - } + if (fieldStats.getResult() == -1) { + if (fieldStats.isCountIfUndefined()) + return 0.0; + } else { + if (fieldStats.getResult() < fieldStats.getThreshold()) + return 0.0; + } - } - return 1.0; - } + } + return 1.0; + } - public double getFinalScore(AggType aggregation){ + public double getFinalScore(AggType aggregation) { - switch (aggregation){ - case AVG: - return scoreSum()/fieldsCount(); - case SUM: - return scoreSum(); - case MAX: - return max(); - case MIN: - return min(); - case W_MEAN: - return weightedScoreSum()/weightSum(); - case OR: - return or(); - case AND: - return and(); - default: - return 0.0; - } - } + switch (aggregation) { + case AVG: + return scoreSum() / fieldsCount(); + case SUM: + return scoreSum(); + case MAX: + return max(); + case MIN: + return min(); + case W_MEAN: + return weightedScoreSum() / weightSum(); + case OR: + return or(); + case AND: + return and(); + 
default: + return 0.0; + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java index 4828a5dd8..04e16be34 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeProcessor.java @@ -1,16 +1,17 @@ + package eu.dnetlib.pace.tree.support; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.PaceException; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - /** * The compare between two documents is given by the weighted mean of the field distances */ -public class TreeProcessor{ +public class TreeProcessor { private static final Log log = LogFactory.getLog(TreeProcessor.class); @@ -21,35 +22,34 @@ public class TreeProcessor{ } public boolean compare(final MapDocument a, final MapDocument b) { - //evaluate the decision tree + // evaluate the decision tree return evaluateTree(a, b).getResult() == MatchType.MATCH; } - public TreeStats evaluateTree(final MapDocument doc1, final MapDocument doc2){ + public TreeStats evaluateTree(final MapDocument doc1, final MapDocument doc2) { TreeStats treeStats = new TreeStats(); String current = "start"; - while (MatchType.parse(current)==MatchType.UNDEFINED) { + while (MatchType.parse(current) == MatchType.UNDEFINED) { TreeNodeDef currentNode = config.decisionTree().get(current); - //throw an exception if the node doesn't exist + // throw an exception if the node doesn't exist if (currentNode == null) throw new PaceException("Missing tree node: " + current); TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config); treeStats.addNodeStats(current, stats); - //if ignoreUndefined=false the miss is considered as undefined - if 
(!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) { + // if ignoreUndefined=false the miss is considered as undefined + if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) { current = currentNode.getUndefined(); } - //if ignoreUndefined=true the miss is ignored and the score computed anyway + // if ignoreUndefined=true the miss is ignored and the score computed anyway else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) { current = currentNode.getPositive(); - } - else { + } else { current = currentNode.getNegative(); } @@ -63,25 +63,24 @@ public class TreeProcessor{ String current = "start"; double score = 0.0; - while (MatchType.parse(current)==MatchType.UNDEFINED) { + while (MatchType.parse(current) == MatchType.UNDEFINED) { TreeNodeDef currentNode = config.decisionTree().get(current); - //throw an exception if the node doesn't exist + // throw an exception if the node doesn't exist if (currentNode == null) throw new PaceException("The Tree Node doesn't exist: " + current); TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config); score = stats.getFinalScore(currentNode.getAggregation()); - //if ignoreUndefined=false the miss is considered as undefined - if (!currentNode.isIgnoreUndefined() && stats.undefinedCount()>0) { + // if ignoreUndefined=false the miss is considered as undefined + if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) { current = currentNode.getUndefined(); } - //if ignoreUndefined=true the miss is ignored and the score computed anyway + // if ignoreUndefined=true the miss is ignored and the score computed anyway else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) { current = currentNode.getPositive(); - } - else { + } else { current = currentNode.getNegative(); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java 
b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java index 186e8d11e..9a7f38f47 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/TreeStats.java @@ -1,51 +1,52 @@ -package eu.dnetlib.pace.tree.support; -import eu.dnetlib.pace.util.PaceException; -import com.fasterxml.jackson.databind.ObjectMapper; +package eu.dnetlib.pace.tree.support; import java.io.IOException; import java.util.HashMap; import java.util.Map; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.pace.util.PaceException; + public class TreeStats { - //> - Map stats; - MatchType result; + // > + Map stats; + MatchType result; - public TreeStats(){ - this.stats = new HashMap<>(); - this.result = MatchType.NO_MATCH; - } + public TreeStats() { + this.stats = new HashMap<>(); + this.result = MatchType.NO_MATCH; + } - public MatchType getResult(){ - return this.result; - } + public MatchType getResult() { + return this.result; + } - public void setResult(MatchType result){ - this.result = result; - } + public void setResult(MatchType result) { + this.result = result; + } - public Map getStats() { - return stats; - } + public Map getStats() { + return stats; + } - public void setStats(Map stats) { - this.stats = stats; - } + public void setStats(Map stats) { + this.stats = stats; + } - public void addNodeStats(String layerID, TreeNodeStats treeNodeStats){ - this.stats.put(layerID, treeNodeStats); - } - - @Override - public String toString(){ - try { - return new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("Impossible to convert to JSON: ", e); - } - } + public void addNodeStats(String layerID, TreeNodeStats treeNodeStats) { + this.stats.put(layerID, treeNodeStats); + } + @Override + public String toString() { + try { + return new 
ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Impossible to convert to JSON: ", e); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index f1eefc3ea..4053a123c 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -1,192 +1,217 @@ + package eu.dnetlib.pace.util; -import com.google.common.collect.Lists; -import eu.dnetlib.pace.clustering.NGramUtils; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.config.WfConfig; -import eu.dnetlib.pace.tree.support.TreeProcessor; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.model.MapDocumentComparator; +import java.util.*; + import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import java.util.*; +import com.google.common.collect.Lists; + +import eu.dnetlib.pace.clustering.NGramUtils; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.config.WfConfig; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.model.MapDocumentComparator; +import eu.dnetlib.pace.tree.support.TreeProcessor; public class BlockProcessor { - public static final List accumulators= new ArrayList<>(); + public static final List accumulators = new ArrayList<>(); - private static final Log log = LogFactory.getLog(BlockProcessor.class); + private static final Log log = LogFactory.getLog(BlockProcessor.class); - private DedupConfig dedupConf; + private DedupConfig dedupConf; - public static void constructAccumulator( final DedupConfig dedupConf) { - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 
1")); - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField())); - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()))); - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "skip list")); - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)")); - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold())); - } + public static void constructAccumulator(final DedupConfig dedupConf) { + accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1")); + accumulators + .add( + String + .format( + "%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField())); + accumulators + .add( + String + .format( + "%s::%s", dedupConf.getWf().getEntityType(), + String + .format( + "Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), + dedupConf.getWf().getGroupMaxSize()))); + accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list")); + accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)")); + accumulators + .add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold())); + } - public BlockProcessor(DedupConfig dedupConf) { - this.dedupConf = dedupConf; - } + public BlockProcessor(DedupConfig dedupConf) { + this.dedupConf = dedupConf; + } - public void processSortedBlock(final String key, final List documents, final Reporter context) { - if (documents.size() > 1) { + public void processSortedBlock(final String key, final List documents, final Reporter context) { + if (documents.size() > 1) { // log.info("reducing key: '" + key + "' 
records: " + q.size()); - process(prepare(documents), context); + process(prepare(documents), context); - } else { - context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); - } - } + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); + } + } - public void process(final String key, final Iterable documents, final Reporter context) { + public void process(final String key, final Iterable documents, final Reporter context) { - final Queue q = prepare(documents); + final Queue q = prepare(documents); - if (q.size() > 1) { + if (q.size() > 1) { // log.info("reducing key: '" + key + "' records: " + q.size()); - process(simplifyQueue(q, key, context), context); + process(simplifyQueue(q, key, context), context); - } else { - context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); - } - } + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); + } + } - private Queue prepare(final Iterable documents) { - final Queue queue = new PriorityQueue<>(100, new MapDocumentComparator(dedupConf.getWf().getOrderField())); + private Queue prepare(final Iterable documents) { + final Queue queue = new PriorityQueue<>(100, + new MapDocumentComparator(dedupConf.getWf().getOrderField())); - final Set seen = new HashSet(); - final int queueMaxSize = dedupConf.getWf().getQueueMaxSize(); + final Set seen = new HashSet(); + final int queueMaxSize = dedupConf.getWf().getQueueMaxSize(); - documents.forEach(doc -> { - if (queue.size() <= queueMaxSize) { - final String id = doc.getIdentifier(); + documents.forEach(doc -> { + if (queue.size() <= queueMaxSize) { + final String id = doc.getIdentifier(); - if (!seen.contains(id)) { - seen.add(id); - queue.add(doc); - } - } - }); + if (!seen.contains(id)) { + seen.add(id); + queue.add(doc); + } + } + }); - return queue; - } + return queue; + } - private Queue simplifyQueue(final 
Queue queue, final String ngram, final Reporter context) { - final Queue q = new LinkedList<>(); + private Queue simplifyQueue(final Queue queue, final String ngram, + final Reporter context) { + final Queue q = new LinkedList<>(); - String fieldRef = ""; - final List tempResults = Lists.newArrayList(); + String fieldRef = ""; + final List tempResults = Lists.newArrayList(); - while (!queue.isEmpty()) { - final MapDocument result = queue.remove(); + while (!queue.isEmpty()) { + final MapDocument result = queue.remove(); - final String orderFieldName = dedupConf.getWf().getOrderField(); - final Field orderFieldValue = result.values(orderFieldName); - if (!orderFieldValue.isEmpty()) { - final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue()); - if (field.equals(fieldRef)) { - tempResults.add(result); - } else { - populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram); - tempResults.clear(); - tempResults.add(result); - fieldRef = field; - } - } else { - context.incrementCounter(dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1); - } - } - populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram); + final String orderFieldName = dedupConf.getWf().getOrderField(); + final Field orderFieldValue = result.values(orderFieldName); + if (!orderFieldValue.isEmpty()) { + final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue()); + if (field.equals(fieldRef)) { + tempResults.add(result); + } else { + populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram); + tempResults.clear(); + tempResults.add(result); + fieldRef = field; + } + } else { + context + .incrementCounter( + dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1); + } + } + populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram); - return q; - } + return q; + } - private void populateSimplifiedQueue(final Queue q, - final List tempResults, - final Reporter 
context, - final String fieldRef, - final String ngram) { - WfConfig wf = dedupConf.getWf(); - if (tempResults.size() < wf.getGroupMaxSize()) { - q.addAll(tempResults); - } else { - context.incrementCounter(wf.getEntityType(), String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), tempResults.size()); + private void populateSimplifiedQueue(final Queue q, + final List tempResults, + final Reporter context, + final String fieldRef, + final String ngram) { + WfConfig wf = dedupConf.getWf(); + if (tempResults.size() < wf.getGroupMaxSize()) { + q.addAll(tempResults); + } else { + context + .incrementCounter( + wf.getEntityType(), + String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), + tempResults.size()); // log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram); - } - } + } + } - private void process(final Queue queue, final Reporter context) { + private void process(final Queue queue, final Reporter context) { - while (!queue.isEmpty()) { + while (!queue.isEmpty()) { - final MapDocument pivot = queue.remove(); - final String idPivot = pivot.getIdentifier(); + final MapDocument pivot = queue.remove(); + final String idPivot = pivot.getIdentifier(); - WfConfig wf = dedupConf.getWf(); - final Field fieldsPivot = pivot.values(wf.getOrderField()); - final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? "" : fieldsPivot.stringValue(); + WfConfig wf = dedupConf.getWf(); + final Field fieldsPivot = pivot.values(wf.getOrderField()); + final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? 
"" : fieldsPivot.stringValue(); - if (fieldPivot != null) { - int i = 0; - for (final MapDocument curr : queue) { - final String idCurr = curr.getIdentifier(); + if (fieldPivot != null) { + int i = 0; + for (final MapDocument curr : queue) { + final String idCurr = curr.getIdentifier(); - if (mustSkip(idCurr)) { + if (mustSkip(idCurr)) { - context.incrementCounter(wf.getEntityType(), "skip list", 1); + context.incrementCounter(wf.getEntityType(), "skip list", 1); - break; - } + break; + } - if (i > wf.getSlidingWindowSize()) { - break; - } + if (i > wf.getSlidingWindowSize()) { + break; + } - final Field fieldsCurr = curr.values(wf.getOrderField()); - final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null : fieldsCurr.stringValue(); + final Field fieldsCurr = curr.values(wf.getOrderField()); + final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null + : fieldsCurr.stringValue(); - if (!idCurr.equals(idPivot) && (fieldCurr != null)) { + if (!idCurr.equals(idPivot) && (fieldCurr != null)) { - final TreeProcessor treeProcessor = new TreeProcessor(dedupConf); + final TreeProcessor treeProcessor = new TreeProcessor(dedupConf); - emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context); + emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context); - } - } - } - } - } + } + } + } + } + } - private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { + private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { - if (result) { - writeSimilarity(context, idPivot, idCurr); - context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1); - } else { - context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1); - } - } + if (result) { + writeSimilarity(context, idPivot, idCurr); + 
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1); + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1); + } + } - private boolean mustSkip(final String idPivot) { - return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot)); - } + private boolean mustSkip(final String idPivot) { + return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot)); + } - private String getNsPrefix(final String id) { - return StringUtils.substringBetween(id, "|", "::"); - } + private String getNsPrefix(final String id) { + return StringUtils.substringBetween(id, "|", "::"); + } - private void writeSimilarity(final Reporter context, final String from, final String to) { - final String type = dedupConf.getWf().getEntityType(); + private void writeSimilarity(final Reporter context, final String from, final String to) { + final String type = dedupConf.getWf().getEntityType(); - context.emit(type, from, to); - context.emit(type, to, from); - } + context.emit(type, from, to); + context.emit(type, to, from); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java index 24264c0bf..40f502e11 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java @@ -1,6 +1,14 @@ + package eu.dnetlib.pace.util; +import java.util.*; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + import com.google.common.collect.Lists; + import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.WfConfig; @@ -9,240 +17,260 @@ import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.MapDocumentComparator; import 
eu.dnetlib.pace.tree.*; import eu.dnetlib.pace.tree.support.TreeProcessor; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import java.util.*; public class BlockProcessorForTesting { - public static final List accumulators= new ArrayList<>(); + public static final List accumulators = new ArrayList<>(); - private static final Log log = LogFactory.getLog(eu.dnetlib.pace.util.BlockProcessorForTesting.class); + private static final Log log = LogFactory.getLog(eu.dnetlib.pace.util.BlockProcessorForTesting.class); - private DedupConfig dedupConf; + private DedupConfig dedupConf; - public static void constructAccumulator( final DedupConfig dedupConf) { - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1")); - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField())); - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), String.format("Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()))); - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "skip list")); - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)")); - accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold())); - } + public static void constructAccumulator(final DedupConfig dedupConf) { + accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1")); + accumulators + .add( + String + .format( + "%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField())); + accumulators + .add( + String + .format( + "%s::%s", dedupConf.getWf().getEntityType(), + String + .format( + "Skipped records for count(%s) >= %s", 
dedupConf.getWf().getOrderField(), + dedupConf.getWf().getGroupMaxSize()))); + accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list")); + accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)")); + accumulators + .add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold())); + } - public BlockProcessorForTesting(DedupConfig dedupConf) { - this.dedupConf = dedupConf; - } + public BlockProcessorForTesting(DedupConfig dedupConf) { + this.dedupConf = dedupConf; + } - public void processSortedBlock(final String key, final List documents, final Reporter context, boolean useTree, boolean noMatch) { - if (documents.size() > 1) { + public void processSortedBlock(final String key, final List documents, final Reporter context, + boolean useTree, boolean noMatch) { + if (documents.size() > 1) { // log.info("reducing key: '" + key + "' records: " + q.size()); - process(prepare(documents), context, useTree, noMatch); + process(prepare(documents), context, useTree, noMatch); - } else { - context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); - } - } + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); + } + } - public void process(final String key, final Iterable documents, final Reporter context, boolean useTree, boolean noMatch) { + public void process(final String key, final Iterable documents, final Reporter context, + boolean useTree, boolean noMatch) { - final Queue q = prepare(documents); + final Queue q = prepare(documents); - if (q.size() > 1) { + if (q.size() > 1) { // log.info("reducing key: '" + key + "' records: " + q.size()); - process(simplifyQueue(q, key, context), context, useTree, noMatch); + process(simplifyQueue(q, key, context), context, useTree, noMatch); - } else { - context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key 
= 1", 1); - } - } + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1); + } + } - private Queue prepare(final Iterable documents) { - final Queue queue = new PriorityQueue<>(100, new MapDocumentComparator(dedupConf.getWf().getOrderField())); + private Queue prepare(final Iterable documents) { + final Queue queue = new PriorityQueue<>(100, + new MapDocumentComparator(dedupConf.getWf().getOrderField())); - final Set seen = new HashSet(); - final int queueMaxSize = dedupConf.getWf().getQueueMaxSize(); + final Set seen = new HashSet(); + final int queueMaxSize = dedupConf.getWf().getQueueMaxSize(); - documents.forEach(doc -> { - if (queue.size() <= queueMaxSize) { - final String id = doc.getIdentifier(); + documents.forEach(doc -> { + if (queue.size() <= queueMaxSize) { + final String id = doc.getIdentifier(); - if (!seen.contains(id)) { - seen.add(id); - queue.add(doc); - } - } - }); + if (!seen.contains(id)) { + seen.add(id); + queue.add(doc); + } + } + }); - return queue; - } + return queue; + } - private Queue simplifyQueue(final Queue queue, final String ngram, final Reporter context) { - final Queue q = new LinkedList<>(); + private Queue simplifyQueue(final Queue queue, final String ngram, + final Reporter context) { + final Queue q = new LinkedList<>(); - String fieldRef = ""; - final List tempResults = Lists.newArrayList(); + String fieldRef = ""; + final List tempResults = Lists.newArrayList(); - while (!queue.isEmpty()) { - final MapDocument result = queue.remove(); + while (!queue.isEmpty()) { + final MapDocument result = queue.remove(); - final String orderFieldName = dedupConf.getWf().getOrderField(); - final Field orderFieldValue = result.values(orderFieldName); - if (!orderFieldValue.isEmpty()) { - final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue()); - if (field.equals(fieldRef)) { - tempResults.add(result); - } else { - populateSimplifiedQueue(q, tempResults, context, 
fieldRef, ngram); - tempResults.clear(); - tempResults.add(result); - fieldRef = field; - } - } else { - context.incrementCounter(dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1); - } - } - populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram); + final String orderFieldName = dedupConf.getWf().getOrderField(); + final Field orderFieldValue = result.values(orderFieldName); + if (!orderFieldValue.isEmpty()) { + final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue()); + if (field.equals(fieldRef)) { + tempResults.add(result); + } else { + populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram); + tempResults.clear(); + tempResults.add(result); + fieldRef = field; + } + } else { + context + .incrementCounter( + dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1); + } + } + populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram); - return q; - } + return q; + } - private void populateSimplifiedQueue(final Queue q, - final List tempResults, - final Reporter context, - final String fieldRef, - final String ngram) { - WfConfig wf = dedupConf.getWf(); - if (tempResults.size() < wf.getGroupMaxSize()) { - q.addAll(tempResults); - } else { - context.incrementCounter(wf.getEntityType(), String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), tempResults.size()); + private void populateSimplifiedQueue(final Queue q, + final List tempResults, + final Reporter context, + final String fieldRef, + final String ngram) { + WfConfig wf = dedupConf.getWf(); + if (tempResults.size() < wf.getGroupMaxSize()) { + q.addAll(tempResults); + } else { + context + .incrementCounter( + wf.getEntityType(), + String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), + tempResults.size()); // log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram); - } - } 
+ } + } - private void process(final Queue queue, final Reporter context, boolean useTree, boolean noMatch) { + private void process(final Queue queue, final Reporter context, boolean useTree, boolean noMatch) { - while (!queue.isEmpty()) { + while (!queue.isEmpty()) { - final MapDocument pivot = queue.remove(); - final String idPivot = pivot.getIdentifier(); + final MapDocument pivot = queue.remove(); + final String idPivot = pivot.getIdentifier(); - WfConfig wf = dedupConf.getWf(); - final Field fieldsPivot = pivot.values(wf.getOrderField()); - final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? "" : fieldsPivot.stringValue(); + WfConfig wf = dedupConf.getWf(); + final Field fieldsPivot = pivot.values(wf.getOrderField()); + final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? "" : fieldsPivot.stringValue(); - if (fieldPivot != null) { - int i = 0; - for (final MapDocument curr : queue) { - final String idCurr = curr.getIdentifier(); + if (fieldPivot != null) { + int i = 0; + for (final MapDocument curr : queue) { + final String idCurr = curr.getIdentifier(); - if (mustSkip(idCurr)) { + if (mustSkip(idCurr)) { - context.incrementCounter(wf.getEntityType(), "skip list", 1); + context.incrementCounter(wf.getEntityType(), "skip list", 1); - break; - } + break; + } - if (i > wf.getSlidingWindowSize()) { - break; - } + if (i > wf.getSlidingWindowSize()) { + break; + } - final Field fieldsCurr = curr.values(wf.getOrderField()); - final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null : fieldsCurr.stringValue(); + final Field fieldsCurr = curr.values(wf.getOrderField()); + final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? 
null + : fieldsCurr.stringValue(); - if (!idCurr.equals(idPivot) && (fieldCurr != null)) { + if (!idCurr.equals(idPivot) && (fieldCurr != null)) { - //draws no match relations (test purpose) - if (noMatch) { - emitOutput(!new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context); - } - else { - //use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications) - if (useTree) - emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context); - else - emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context); - } + // draws no match relations (test purpose) + if (noMatch) { + emitOutput(!new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context); + } else { + // use the decision tree implementation or the "normal" implementation of the similarity + // score (valid only for publications) + if (useTree) + emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context); + else + emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context); + } // if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) { // emitOutput(true, idPivot, idCurr, context); // } - } - } - } - } - } + } + } + } + } + } - protected static boolean compareInstanceType(MapDocument a, MapDocument b, DedupConfig conf) { - Map params = new HashMap<>(); - InstanceTypeMatch instanceTypeMatch = new InstanceTypeMatch(params); - double compare = instanceTypeMatch.compare(a.getFieldMap().get("instance"), b.getFieldMap().get("instance"), conf); - return compare>=1.0; - } + protected static boolean compareInstanceType(MapDocument a, MapDocument b, DedupConfig conf) { + Map params = new HashMap<>(); + InstanceTypeMatch instanceTypeMatch = new InstanceTypeMatch(params); + double compare = instanceTypeMatch + .compare(a.getFieldMap().get("instance"), b.getFieldMap().get("instance"), 
conf); + return compare >= 1.0; + } - private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) { - //if the score gives 1, the publications are equivalent - Map params = new HashMap<>(); - params.put("jpath_value", "$.value"); - params.put("jpath_classid", "$.qualifier.classid"); - params.put("mode", "count"); + private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) { + // if the score gives 1, the publications are equivalent + Map params = new HashMap<>(); + params.put("jpath_value", "$.value"); + params.put("jpath_classid", "$.qualifier.classid"); + params.put("mode", "count"); - double score = 0.0; + double score = 0.0; - //levenstein title - LevensteinTitle levensteinTitle = new LevensteinTitle(params); - if(levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) { - score += 0.2; - } + // levenstein title + LevensteinTitle levensteinTitle = new LevensteinTitle(params); + if (levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) { + score += 0.2; + } - //pid - JsonListMatch jsonListMatch = new JsonListMatch(params); - if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) { - score += 0.5; - } + // pid + JsonListMatch jsonListMatch = new JsonListMatch(params); + if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) { + score += 0.5; + } - //title version - TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params); - double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); - if(result1<0 || result1>=1.0) { - score += 0.1; - } + // title version + TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params); + double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config); + if (result1 < 0 || result1 >= 1.0) { + 
score += 0.1; + } - //authors match - params.remove("mode"); - AuthorsMatch authorsMatch = new AuthorsMatch(params); - double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config); - if(result2 <0|| result2>=0.6) { - score += 0.2; - } + // authors match + params.remove("mode"); + AuthorsMatch authorsMatch = new AuthorsMatch(params); + double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config); + if (result2 < 0 || result2 >= 0.6) { + score += 0.2; + } - return score>=0.5; - } + return score >= 0.5; + } - private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { + private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { - if (result) { - writeSimilarity(context, idPivot, idCurr); - context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1); - } else { - context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1); - } - } + if (result) { + writeSimilarity(context, idPivot, idCurr); + context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1); + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1); + } + } - private boolean mustSkip(final String idPivot) { - return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot)); - } + private boolean mustSkip(final String idPivot) { + return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot)); + } - private String getNsPrefix(final String id) { - return StringUtils.substringBetween(id, "|", "::"); - } + private String getNsPrefix(final String id) { + return StringUtils.substringBetween(id, "|", "::"); + } - private void writeSimilarity(final Reporter context, final String from, final String to) { - final String type = 
dedupConf.getWf().getEntityType(); + private void writeSimilarity(final Reporter context, final String from, final String to) { + final String type = dedupConf.getWf().getEntityType(); - context.emit(type, from, to); - } + context.emit(type, from, to); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Capitalise.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Capitalise.java index 2de729045..403d91dd9 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Capitalise.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Capitalise.java @@ -1,15 +1,18 @@ + package eu.dnetlib.pace.util; +import org.apache.commons.lang3.text.WordUtils; import com.google.common.base.Function; -import org.apache.commons.lang3.text.WordUtils; public class Capitalise implements Function { - private final char[] DELIM = {' ', '-'}; + private final char[] DELIM = { + ' ', '-' + }; - @Override - public String apply(final String s) { - return WordUtils.capitalize(s.toLowerCase(), DELIM); - } + @Override + public String apply(final String s) { + return WordUtils.capitalize(s.toLowerCase(), DELIM); + } }; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java index 45e011fdd..84d49bd5c 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DiffPatchMatch.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.util; /* @@ -17,7 +18,6 @@ package eu.dnetlib.pace.util; * See the License for the specific language governing permissions and * limitations under the License. */ - import java.io.UnsupportedEncodingException; import java.net.URLDecoder; import java.net.URLEncoder; @@ -39,2433 +39,2498 @@ import java.util.regex.Pattern; */ public class DiffPatchMatch { - // Defaults. - // Set these on your diff_match_patch instance to override the defaults. 
- - /** - * Number of seconds to map a diff before giving up (0 for infinity). - */ - public float Diff_Timeout = 1.0f; - /** - * Cost of an empty edit operation in terms of edit characters. - */ - public short Diff_EditCost = 4; - /** - * At what point is no match declared (0.0 = perfection, 1.0 = very loose). - */ - public float Match_Threshold = 0.5f; - /** - * How far to search for a match (0 = exact location, 1000+ = broad match). - * A match this many characters away from the expected location will add - * 1.0 to the score (0.0 is a perfect match). - */ - public int Match_Distance = 1000; - /** - * When deleting a large block of text (over ~64 characters), how close do - * the contents have to be to match the expected contents. (0.0 = perfection, - * 1.0 = very loose). Note that Match_Threshold controls how closely the - * end points of a delete need to match. - */ - public float Patch_DeleteThreshold = 0.5f; - /** - * Chunk size for context length. - */ - public short Patch_Margin = 4; - - /** - * The number of bits in an int. - */ - private short Match_MaxBits = 32; - - /** - * Internal class for returning results from diff_linesToChars(). - * Other less paranoid languages just use a three-element array. - */ - protected static class LinesToCharsResult { - protected String chars1; - protected String chars2; - protected List lineArray; - - protected LinesToCharsResult(String chars1, String chars2, - List lineArray) { - this.chars1 = chars1; - this.chars2 = chars2; - this.lineArray = lineArray; - } - } - - - // DIFF FUNCTIONS - - - /** - * The data structure representing a diff is a Linked list of Diff objects: - * {Diff(Operation.DELETE, "Hello"), Diff(Operation.INSERT, "Goodbye"), - * Diff(Operation.EQUAL, " world.")} - * which means: delete "Hello", add "Goodbye" and keep " world." - */ - public enum Operation { - DELETE, INSERT, EQUAL - } - - /** - * Find the differences between two texts. - * Run a faster, slightly less optimal diff. 
- * This method allows the 'checklines' of diff_main() to be optional. - * Most of the time checklines is wanted, so default to true. - * @param text1 Old string to be diffed. - * @param text2 New string to be diffed. - * @return Linked List of Diff objects. - */ - public LinkedList diff_main(String text1, String text2) { - return diff_main(text1, text2, true); - } - - /** - * Find the differences between two texts. - * @param text1 Old string to be diffed. - * @param text2 New string to be diffed. - * @param checklines Speedup flag. If false, then don't run a - * line-level diff first to identify the changed areas. - * If true, then run a faster slightly less optimal diff. - * @return Linked List of Diff objects. - */ - public LinkedList diff_main(String text1, String text2, - boolean checklines) { - // Set a deadline by which time the diff must be complete. - long deadline; - if (Diff_Timeout <= 0) { - deadline = Long.MAX_VALUE; - } else { - deadline = System.currentTimeMillis() + (long) (Diff_Timeout * 1000); - } - return diff_main(text1, text2, checklines, deadline); - } - - /** - * Find the differences between two texts. Simplifies the problem by - * stripping any common prefix or suffix off the texts before diffing. - * @param text1 Old string to be diffed. - * @param text2 New string to be diffed. - * @param checklines Speedup flag. If false, then don't run a - * line-level diff first to identify the changed areas. - * If true, then run a faster slightly less optimal diff. - * @param deadline Time when the diff should be complete by. Used - * internally for recursive calls. Users should set DiffTimeout instead. - * @return Linked List of Diff objects. - */ - private LinkedList diff_main(String text1, String text2, - boolean checklines, long deadline) { - // Check for null inputs. - if (text1 == null || text2 == null) { - throw new IllegalArgumentException("Null inputs. (diff_main)"); - } - - // Check for equality (speedup). 
- LinkedList diffs; - if (text1.equals(text2)) { - diffs = new LinkedList(); - if (text1.length() != 0) { - diffs.add(new Diff(Operation.EQUAL, text1)); - } - return diffs; - } - - // Trim off common prefix (speedup). - int commonlength = diff_commonPrefix(text1, text2); - String commonprefix = text1.substring(0, commonlength); - text1 = text1.substring(commonlength); - text2 = text2.substring(commonlength); - - // Trim off common suffix (speedup). - commonlength = diff_commonSuffix(text1, text2); - String commonsuffix = text1.substring(text1.length() - commonlength); - text1 = text1.substring(0, text1.length() - commonlength); - text2 = text2.substring(0, text2.length() - commonlength); - - // Compute the diff on the middle block. - diffs = diff_compute(text1, text2, checklines, deadline); - - // Restore the prefix and suffix. - if (commonprefix.length() != 0) { - diffs.addFirst(new Diff(Operation.EQUAL, commonprefix)); - } - if (commonsuffix.length() != 0) { - diffs.addLast(new Diff(Operation.EQUAL, commonsuffix)); - } - - diff_cleanupMerge(diffs); - return diffs; - } - - /** - * Find the differences between two texts. Assumes that the texts do not - * have any common prefix or suffix. - * @param text1 Old string to be diffed. - * @param text2 New string to be diffed. - * @param checklines Speedup flag. If false, then don't run a - * line-level diff first to identify the changed areas. - * If true, then run a faster slightly less optimal diff. - * @param deadline Time when the diff should be complete by. - * @return Linked List of Diff objects. - */ - private LinkedList diff_compute(String text1, String text2, - boolean checklines, long deadline) { - LinkedList diffs = new LinkedList(); - - if (text1.length() == 0) { - // Just add some text (speedup). - diffs.add(new Diff(Operation.INSERT, text2)); - return diffs; - } - - if (text2.length() == 0) { - // Just delete some text (speedup). 
- diffs.add(new Diff(Operation.DELETE, text1)); - return diffs; - } - - String longtext = text1.length() > text2.length() ? text1 : text2; - String shorttext = text1.length() > text2.length() ? text2 : text1; - int i = longtext.indexOf(shorttext); - if (i != -1) { - // Shorter text is inside the longer text (speedup). - Operation op = (text1.length() > text2.length()) ? - Operation.DELETE : Operation.INSERT; - diffs.add(new Diff(op, longtext.substring(0, i))); - diffs.add(new Diff(Operation.EQUAL, shorttext)); - diffs.add(new Diff(op, longtext.substring(i + shorttext.length()))); - return diffs; - } - - if (shorttext.length() == 1) { - // Single character string. - // After the previous speedup, the character can't be an equality. - diffs.add(new Diff(Operation.DELETE, text1)); - diffs.add(new Diff(Operation.INSERT, text2)); - return diffs; - } - - // Check to see if the problem can be split in two. - String[] hm = diff_halfMatch(text1, text2); - if (hm != null) { - // A half-match was found, sort out the return data. - String text1_a = hm[0]; - String text1_b = hm[1]; - String text2_a = hm[2]; - String text2_b = hm[3]; - String mid_common = hm[4]; - // Send both pairs off for separate processing. - LinkedList diffs_a = diff_main(text1_a, text2_a, - checklines, deadline); - LinkedList diffs_b = diff_main(text1_b, text2_b, - checklines, deadline); - // Merge the results. - diffs = diffs_a; - diffs.add(new Diff(Operation.EQUAL, mid_common)); - diffs.addAll(diffs_b); - return diffs; - } - - if (checklines && text1.length() > 100 && text2.length() > 100) { - return diff_lineMode(text1, text2, deadline); - } - - return diff_bisect(text1, text2, deadline); - } - - /** - * Do a quick line-level diff on both strings, then rediff the parts for - * greater accuracy. - * This speedup can produce non-minimal diffs. - * @param text1 Old string to be diffed. - * @param text2 New string to be diffed. - * @param deadline Time when the diff should be complete by. 
- * @return Linked List of Diff objects. - */ - private LinkedList diff_lineMode(String text1, String text2, - long deadline) { - // Scan the text on a line-by-line basis first. - LinesToCharsResult a = diff_linesToChars(text1, text2); - text1 = a.chars1; - text2 = a.chars2; - List linearray = a.lineArray; - - LinkedList diffs = diff_main(text1, text2, false, deadline); - - // Convert the diff back to original text. - diff_charsToLines(diffs, linearray); - // Eliminate freak matches (e.g. blank lines) - diff_cleanupSemantic(diffs); - - // Rediff any replacement blocks, this time character-by-character. - // Add a dummy entry at the end. - diffs.add(new Diff(Operation.EQUAL, "")); - int count_delete = 0; - int count_insert = 0; - String text_delete = ""; - String text_insert = ""; - ListIterator pointer = diffs.listIterator(); - Diff thisDiff = pointer.next(); - while (thisDiff != null) { - switch (thisDiff.operation) { - case INSERT: - count_insert++; - text_insert += thisDiff.text; - break; - case DELETE: - count_delete++; - text_delete += thisDiff.text; - break; - case EQUAL: - // Upon reaching an equality, check for prior redundancies. - if (count_delete >= 1 && count_insert >= 1) { - // Delete the offending records and add the merged ones. - pointer.previous(); - for (int j = 0; j < count_delete + count_insert; j++) { - pointer.previous(); - pointer.remove(); - } - for (Diff subDiff : diff_main(text_delete, text_insert, false, - deadline)) { - pointer.add(subDiff); - } - } - count_insert = 0; - count_delete = 0; - text_delete = ""; - text_insert = ""; - break; - } - thisDiff = pointer.hasNext() ? pointer.next() : null; - } - diffs.removeLast(); // Remove the dummy entry at the end. - - return diffs; - } - - /** - * Find the 'middle snake' of a diff, split the problem in two - * and return the recursively constructed diff. - * See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. - * @param text1 Old string to be diffed. 
- * @param text2 New string to be diffed. - * @param deadline Time at which to bail if not yet complete. - * @return LinkedList of Diff objects. - */ - protected LinkedList diff_bisect(String text1, String text2, - long deadline) { - // Cache the text lengths to prevent multiple calls. - int text1_length = text1.length(); - int text2_length = text2.length(); - int max_d = (text1_length + text2_length + 1) / 2; - int v_offset = max_d; - int v_length = 2 * max_d; - int[] v1 = new int[v_length]; - int[] v2 = new int[v_length]; - for (int x = 0; x < v_length; x++) { - v1[x] = -1; - v2[x] = -1; - } - v1[v_offset + 1] = 0; - v2[v_offset + 1] = 0; - int delta = text1_length - text2_length; - // If the total number of characters is odd, then the front path will - // collide with the reverse path. - boolean front = (delta % 2 != 0); - // Offsets for start and end of k loop. - // Prevents mapping of space beyond the grid. - int k1start = 0; - int k1end = 0; - int k2start = 0; - int k2end = 0; - for (int d = 0; d < max_d; d++) { - // Bail out if deadline is reached. - if (System.currentTimeMillis() > deadline) { - break; - } - - // Walk the front path one step. - for (int k1 = -d + k1start; k1 <= d - k1end; k1 += 2) { - int k1_offset = v_offset + k1; - int x1; - if (k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1])) { - x1 = v1[k1_offset + 1]; - } else { - x1 = v1[k1_offset - 1] + 1; - } - int y1 = x1 - k1; - while (x1 < text1_length && y1 < text2_length - && text1.charAt(x1) == text2.charAt(y1)) { - x1++; - y1++; - } - v1[k1_offset] = x1; - if (x1 > text1_length) { - // Ran off the right of the graph. - k1end += 2; - } else if (y1 > text2_length) { - // Ran off the bottom of the graph. - k1start += 2; - } else if (front) { - int k2_offset = v_offset + delta - k1; - if (k2_offset >= 0 && k2_offset < v_length && v2[k2_offset] != -1) { - // Mirror x2 onto top-left coordinate system. - int x2 = text1_length - v2[k2_offset]; - if (x1 >= x2) { - // Overlap detected. 
- return diff_bisectSplit(text1, text2, x1, y1, deadline); - } - } - } - } - - // Walk the reverse path one step. - for (int k2 = -d + k2start; k2 <= d - k2end; k2 += 2) { - int k2_offset = v_offset + k2; - int x2; - if (k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1])) { - x2 = v2[k2_offset + 1]; - } else { - x2 = v2[k2_offset - 1] + 1; - } - int y2 = x2 - k2; - while (x2 < text1_length && y2 < text2_length - && text1.charAt(text1_length - x2 - 1) - == text2.charAt(text2_length - y2 - 1)) { - x2++; - y2++; - } - v2[k2_offset] = x2; - if (x2 > text1_length) { - // Ran off the left of the graph. - k2end += 2; - } else if (y2 > text2_length) { - // Ran off the top of the graph. - k2start += 2; - } else if (!front) { - int k1_offset = v_offset + delta - k2; - if (k1_offset >= 0 && k1_offset < v_length && v1[k1_offset] != -1) { - int x1 = v1[k1_offset]; - int y1 = v_offset + x1 - k1_offset; - // Mirror x2 onto top-left coordinate system. - x2 = text1_length - x2; - if (x1 >= x2) { - // Overlap detected. - return diff_bisectSplit(text1, text2, x1, y1, deadline); - } - } - } - } - } - // Diff took too long and hit the deadline or - // number of diffs equals number of characters, no commonality at all. - LinkedList diffs = new LinkedList(); - diffs.add(new Diff(Operation.DELETE, text1)); - diffs.add(new Diff(Operation.INSERT, text2)); - return diffs; - } - - /** - * Given the location of the 'middle snake', split the diff in two parts - * and recurse. - * @param text1 Old string to be diffed. - * @param text2 New string to be diffed. - * @param x Index of split point in text1. - * @param y Index of split point in text2. - * @param deadline Time at which to bail if not yet complete. - * @return LinkedList of Diff objects. 
- */ - private LinkedList diff_bisectSplit(String text1, String text2, - int x, int y, long deadline) { - String text1a = text1.substring(0, x); - String text2a = text2.substring(0, y); - String text1b = text1.substring(x); - String text2b = text2.substring(y); - - // Compute both diffs serially. - LinkedList diffs = diff_main(text1a, text2a, false, deadline); - LinkedList diffsb = diff_main(text1b, text2b, false, deadline); - - diffs.addAll(diffsb); - return diffs; - } - - /** - * Split two texts into a list of strings. Reduce the texts to a string of - * hashes where each Unicode character represents one line. - * @param text1 First string. - * @param text2 Second string. - * @return An object containing the encoded text1, the encoded text2 and - * the List of unique strings. The zeroth element of the List of - * unique strings is intentionally blank. - */ - protected LinesToCharsResult diff_linesToChars(String text1, String text2) { - List lineArray = new ArrayList(); - Map lineHash = new HashMap(); - // e.g. linearray[4] == "Hello\n" - // e.g. linehash.get("Hello\n") == 4 - - // "\x00" is a valid character, but various debuggers don't like it. - // So we'll insert a junk entry to avoid generating a null character. - lineArray.add(""); - - // Allocate 2/3rds of the space for text1, the rest for text2. - String chars1 = diff_linesToCharsMunge(text1, lineArray, lineHash, 40000); - String chars2 = diff_linesToCharsMunge(text2, lineArray, lineHash, 65535); - return new LinesToCharsResult(chars1, chars2, lineArray); - } - - /** - * Split a text into a list of strings. Reduce the texts to a string of - * hashes where each Unicode character represents one line. - * @param text String to encode. - * @param lineArray List of unique strings. - * @param lineHash Map of strings to indices. - * @param maxLines Maximum length of lineArray. - * @return Encoded string. 
- */ - private String diff_linesToCharsMunge(String text, List lineArray, - Map lineHash, int maxLines) { - int lineStart = 0; - int lineEnd = -1; - String line; - StringBuilder chars = new StringBuilder(); - // Walk the text, pulling out a substring for each line. - // text.split('\n') would would temporarily double our memory footprint. - // Modifying text would create many large strings to garbage collect. - while (lineEnd < text.length() - 1) { - lineEnd = text.indexOf('\n', lineStart); - if (lineEnd == -1) { - lineEnd = text.length() - 1; - } - line = text.substring(lineStart, lineEnd + 1); - - if (lineHash.containsKey(line)) { - chars.append(String.valueOf((char) (int) lineHash.get(line))); - } else { - if (lineArray.size() == maxLines) { - // Bail out at 65535 because - // String.valueOf((char) 65536).equals(String.valueOf(((char) 0))) - line = text.substring(lineStart); - lineEnd = text.length(); - } - lineArray.add(line); - lineHash.put(line, lineArray.size() - 1); - chars.append(String.valueOf((char) (lineArray.size() - 1))); - } - lineStart = lineEnd + 1; - } - return chars.toString(); - } - - /** - * Rehydrate the text in a diff from a string of line hashes to real lines of - * text. - * @param diffs List of Diff objects. - * @param lineArray List of unique strings. - */ - protected void diff_charsToLines(List diffs, - List lineArray) { - StringBuilder text; - for (Diff diff : diffs) { - text = new StringBuilder(); - for (int j = 0; j < diff.text.length(); j++) { - text.append(lineArray.get(diff.text.charAt(j))); - } - diff.text = text.toString(); - } - } - - /** - * Determine the common prefix of two strings - * @param text1 First string. - * @param text2 Second string. - * @return The number of characters common to the start of each string. 
- */ - public int diff_commonPrefix(String text1, String text2) { - // Performance analysis: https://neil.fraser.name/news/2007/10/09/ - int n = Math.min(text1.length(), text2.length()); - for (int i = 0; i < n; i++) { - if (text1.charAt(i) != text2.charAt(i)) { - return i; - } - } - return n; - } - - /** - * Determine the common suffix of two strings - * @param text1 First string. - * @param text2 Second string. - * @return The number of characters common to the end of each string. - */ - public int diff_commonSuffix(String text1, String text2) { - // Performance analysis: https://neil.fraser.name/news/2007/10/09/ - int text1_length = text1.length(); - int text2_length = text2.length(); - int n = Math.min(text1_length, text2_length); - for (int i = 1; i <= n; i++) { - if (text1.charAt(text1_length - i) != text2.charAt(text2_length - i)) { - return i - 1; - } - } - return n; - } - - /** - * Determine if the suffix of one string is the prefix of another. - * @param text1 First string. - * @param text2 Second string. - * @return The number of characters common to the end of the first - * string and the start of the second string. - */ - protected int diff_commonOverlap(String text1, String text2) { - // Cache the text lengths to prevent multiple calls. - int text1_length = text1.length(); - int text2_length = text2.length(); - // Eliminate the null case. - if (text1_length == 0 || text2_length == 0) { - return 0; - } - // Truncate the longer string. - if (text1_length > text2_length) { - text1 = text1.substring(text1_length - text2_length); - } else if (text1_length < text2_length) { - text2 = text2.substring(0, text1_length); - } - int text_length = Math.min(text1_length, text2_length); - // Quick check for the worst case. - if (text1.equals(text2)) { - return text_length; - } - - // Start by looking for a single character match - // and increase length until no match is found. 
- // Performance analysis: https://neil.fraser.name/news/2010/11/04/ - int best = 0; - int length = 1; - while (true) { - String pattern = text1.substring(text_length - length); - int found = text2.indexOf(pattern); - if (found == -1) { - return best; - } - length += found; - if (found == 0 || text1.substring(text_length - length).equals( - text2.substring(0, length))) { - best = length; - length++; - } - } - } - - /** - * Do the two texts share a substring which is at least half the length of - * the longer text? - * This speedup can produce non-minimal diffs. - * @param text1 First string. - * @param text2 Second string. - * @return Five element String array, containing the prefix of text1, the - * suffix of text1, the prefix of text2, the suffix of text2 and the - * common middle. Or null if there was no match. - */ - protected String[] diff_halfMatch(String text1, String text2) { - if (Diff_Timeout <= 0) { - // Don't risk returning a non-optimal diff if we have unlimited time. - return null; - } - String longtext = text1.length() > text2.length() ? text1 : text2; - String shorttext = text1.length() > text2.length() ? text2 : text1; - if (longtext.length() < 4 || shorttext.length() * 2 < longtext.length()) { - return null; // Pointless. - } - - // First check if the second quarter is the seed for a half-match. - String[] hm1 = diff_halfMatchI(longtext, shorttext, - (longtext.length() + 3) / 4); - // Check again based on the third quarter. - String[] hm2 = diff_halfMatchI(longtext, shorttext, - (longtext.length() + 1) / 2); - String[] hm; - if (hm1 == null && hm2 == null) { - return null; - } else if (hm2 == null) { - hm = hm1; - } else if (hm1 == null) { - hm = hm2; - } else { - // Both matched. Select the longest. - hm = hm1[4].length() > hm2[4].length() ? hm1 : hm2; - } - - // A half-match was found, sort out the return data. 
- if (text1.length() > text2.length()) { - return hm; - //return new String[]{hm[0], hm[1], hm[2], hm[3], hm[4]}; - } else { - return new String[]{hm[2], hm[3], hm[0], hm[1], hm[4]}; - } - } - - /** - * Does a substring of shorttext exist within longtext such that the - * substring is at least half the length of longtext? - * @param longtext Longer string. - * @param shorttext Shorter string. - * @param i Start index of quarter length substring within longtext. - * @return Five element String array, containing the prefix of longtext, the - * suffix of longtext, the prefix of shorttext, the suffix of shorttext - * and the common middle. Or null if there was no match. - */ - private String[] diff_halfMatchI(String longtext, String shorttext, int i) { - // Start with a 1/4 length substring at position i as a seed. - String seed = longtext.substring(i, i + longtext.length() / 4); - int j = -1; - String best_common = ""; - String best_longtext_a = "", best_longtext_b = ""; - String best_shorttext_a = "", best_shorttext_b = ""; - while ((j = shorttext.indexOf(seed, j + 1)) != -1) { - int prefixLength = diff_commonPrefix(longtext.substring(i), - shorttext.substring(j)); - int suffixLength = diff_commonSuffix(longtext.substring(0, i), - shorttext.substring(0, j)); - if (best_common.length() < suffixLength + prefixLength) { - best_common = shorttext.substring(j - suffixLength, j) - + shorttext.substring(j, j + prefixLength); - best_longtext_a = longtext.substring(0, i - suffixLength); - best_longtext_b = longtext.substring(i + prefixLength); - best_shorttext_a = shorttext.substring(0, j - suffixLength); - best_shorttext_b = shorttext.substring(j + prefixLength); - } - } - if (best_common.length() * 2 >= longtext.length()) { - return new String[]{best_longtext_a, best_longtext_b, - best_shorttext_a, best_shorttext_b, best_common}; - } else { - return null; - } - } - - /** - * Reduce the number of edits by eliminating semantically trivial equalities. 
- * @param diffs LinkedList of Diff objects. - */ - public void diff_cleanupSemantic(LinkedList diffs) { - if (diffs.isEmpty()) { - return; - } - boolean changes = false; - Deque equalities = new ArrayDeque(); // Double-ended queue of qualities. - String lastEquality = null; // Always equal to equalities.peek().text - ListIterator pointer = diffs.listIterator(); - // Number of characters that changed prior to the equality. - int length_insertions1 = 0; - int length_deletions1 = 0; - // Number of characters that changed after the equality. - int length_insertions2 = 0; - int length_deletions2 = 0; - Diff thisDiff = pointer.next(); - while (thisDiff != null) { - if (thisDiff.operation == Operation.EQUAL) { - // Equality found. - equalities.push(thisDiff); - length_insertions1 = length_insertions2; - length_deletions1 = length_deletions2; - length_insertions2 = 0; - length_deletions2 = 0; - lastEquality = thisDiff.text; - } else { - // An insertion or deletion. - if (thisDiff.operation == Operation.INSERT) { - length_insertions2 += thisDiff.text.length(); - } else { - length_deletions2 += thisDiff.text.length(); - } - // Eliminate an equality that is smaller or equal to the edits on both - // sides of it. - if (lastEquality != null && (lastEquality.length() - <= Math.max(length_insertions1, length_deletions1)) - && (lastEquality.length() - <= Math.max(length_insertions2, length_deletions2))) { - //System.out.println("Splitting: '" + lastEquality + "'"); - // Walk back to offending equality. - while (thisDiff != equalities.peek()) { - thisDiff = pointer.previous(); - } - pointer.next(); - - // Replace equality with a delete. - pointer.set(new Diff(Operation.DELETE, lastEquality)); - // Insert a corresponding an insert. - pointer.add(new Diff(Operation.INSERT, lastEquality)); - - equalities.pop(); // Throw away the equality we just deleted. - if (!equalities.isEmpty()) { - // Throw away the previous equality (it needs to be reevaluated). 
- equalities.pop(); - } - if (equalities.isEmpty()) { - // There are no previous equalities, walk back to the start. - while (pointer.hasPrevious()) { - pointer.previous(); - } - } else { - // There is a safe equality we can fall back to. - thisDiff = equalities.peek(); - while (thisDiff != pointer.previous()) { - // Intentionally empty loop. - } - } - - length_insertions1 = 0; // Reset the counters. - length_insertions2 = 0; - length_deletions1 = 0; - length_deletions2 = 0; - lastEquality = null; - changes = true; - } - } - thisDiff = pointer.hasNext() ? pointer.next() : null; - } - - // Normalize the diff. - if (changes) { - diff_cleanupMerge(diffs); - } - diff_cleanupSemanticLossless(diffs); - - // Find any overlaps between deletions and insertions. - // e.g: abcxxxxxxdef - // -> abcxxxdef - // e.g: xxxabcdefxxx - // -> defxxxabc - // Only extract an overlap if it is as big as the edit ahead or behind it. - pointer = diffs.listIterator(); - Diff prevDiff = null; - thisDiff = null; - if (pointer.hasNext()) { - prevDiff = pointer.next(); - if (pointer.hasNext()) { - thisDiff = pointer.next(); - } - } - while (thisDiff != null) { - if (prevDiff.operation == Operation.DELETE && - thisDiff.operation == Operation.INSERT) { - String deletion = prevDiff.text; - String insertion = thisDiff.text; - int overlap_length1 = this.diff_commonOverlap(deletion, insertion); - int overlap_length2 = this.diff_commonOverlap(insertion, deletion); - if (overlap_length1 >= overlap_length2) { - if (overlap_length1 >= deletion.length() / 2.0 || - overlap_length1 >= insertion.length() / 2.0) { - // Overlap found. Insert an equality and trim the surrounding edits. 
- pointer.previous(); - pointer.add(new Diff(Operation.EQUAL, - insertion.substring(0, overlap_length1))); - prevDiff.text = - deletion.substring(0, deletion.length() - overlap_length1); - thisDiff.text = insertion.substring(overlap_length1); - // pointer.add inserts the element before the cursor, so there is - // no need to step past the new element. - } - } else { - if (overlap_length2 >= deletion.length() / 2.0 || - overlap_length2 >= insertion.length() / 2.0) { - // Reverse overlap found. - // Insert an equality and swap and trim the surrounding edits. - pointer.previous(); - pointer.add(new Diff(Operation.EQUAL, - deletion.substring(0, overlap_length2))); - prevDiff.operation = Operation.INSERT; - prevDiff.text = - insertion.substring(0, insertion.length() - overlap_length2); - thisDiff.operation = Operation.DELETE; - thisDiff.text = deletion.substring(overlap_length2); - // pointer.add inserts the element before the cursor, so there is - // no need to step past the new element. - } - } - thisDiff = pointer.hasNext() ? pointer.next() : null; - } - prevDiff = thisDiff; - thisDiff = pointer.hasNext() ? pointer.next() : null; - } - } - - /** - * Look for single edits surrounded on both sides by equalities - * which can be shifted sideways to align the edit to a word boundary. - * e.g: The cat came. -> The cat came. - * @param diffs LinkedList of Diff objects. - */ - public void diff_cleanupSemanticLossless(LinkedList diffs) { - String equality1, edit, equality2; - String commonString; - int commonOffset; - int score, bestScore; - String bestEquality1, bestEdit, bestEquality2; - // Create a new iterator at the start. - ListIterator pointer = diffs.listIterator(); - Diff prevDiff = pointer.hasNext() ? pointer.next() : null; - Diff thisDiff = pointer.hasNext() ? pointer.next() : null; - Diff nextDiff = pointer.hasNext() ? pointer.next() : null; - // Intentionally ignore the first and last element (don't need checking). 
- while (nextDiff != null) { - if (prevDiff.operation == Operation.EQUAL && - nextDiff.operation == Operation.EQUAL) { - // This is a single edit surrounded by equalities. - equality1 = prevDiff.text; - edit = thisDiff.text; - equality2 = nextDiff.text; - - // First, shift the edit as far left as possible. - commonOffset = diff_commonSuffix(equality1, edit); - if (commonOffset != 0) { - commonString = edit.substring(edit.length() - commonOffset); - equality1 = equality1.substring(0, equality1.length() - commonOffset); - edit = commonString + edit.substring(0, edit.length() - commonOffset); - equality2 = commonString + equality2; - } - - // Second, step character by character right, looking for the best fit. - bestEquality1 = equality1; - bestEdit = edit; - bestEquality2 = equality2; - bestScore = diff_cleanupSemanticScore(equality1, edit) - + diff_cleanupSemanticScore(edit, equality2); - while (edit.length() != 0 && equality2.length() != 0 - && edit.charAt(0) == equality2.charAt(0)) { - equality1 += edit.charAt(0); - edit = edit.substring(1) + equality2.charAt(0); - equality2 = equality2.substring(1); - score = diff_cleanupSemanticScore(equality1, edit) - + diff_cleanupSemanticScore(edit, equality2); - // The >= encourages trailing rather than leading whitespace on edits. - if (score >= bestScore) { - bestScore = score; - bestEquality1 = equality1; - bestEdit = edit; - bestEquality2 = equality2; - } - } - - if (!prevDiff.text.equals(bestEquality1)) { - // We have an improvement, save it back to the diff. - if (bestEquality1.length() != 0) { - prevDiff.text = bestEquality1; - } else { - pointer.previous(); // Walk past nextDiff. - pointer.previous(); // Walk past thisDiff. - pointer.previous(); // Walk past prevDiff. - pointer.remove(); // Delete prevDiff. - pointer.next(); // Walk past thisDiff. - pointer.next(); // Walk past nextDiff. 
- } - thisDiff.text = bestEdit; - if (bestEquality2.length() != 0) { - nextDiff.text = bestEquality2; - } else { - pointer.remove(); // Delete nextDiff. - nextDiff = thisDiff; - thisDiff = prevDiff; - } - } - } - prevDiff = thisDiff; - thisDiff = nextDiff; - nextDiff = pointer.hasNext() ? pointer.next() : null; - } - } - - /** - * Given two strings, compute a score representing whether the internal - * boundary falls on logical boundaries. - * Scores range from 6 (best) to 0 (worst). - * @param one First string. - * @param two Second string. - * @return The score. - */ - private int diff_cleanupSemanticScore(String one, String two) { - if (one.length() == 0 || two.length() == 0) { - // Edges are the best. - return 6; - } - - // Each port of this function behaves slightly differently due to - // subtle differences in each language's definition of things like - // 'whitespace'. Since this function's purpose is largely cosmetic, - // the choice has been made to use each language's native features - // rather than force total conformity. - char char1 = one.charAt(one.length() - 1); - char char2 = two.charAt(0); - boolean nonAlphaNumeric1 = !Character.isLetterOrDigit(char1); - boolean nonAlphaNumeric2 = !Character.isLetterOrDigit(char2); - boolean whitespace1 = nonAlphaNumeric1 && Character.isWhitespace(char1); - boolean whitespace2 = nonAlphaNumeric2 && Character.isWhitespace(char2); - boolean lineBreak1 = whitespace1 - && Character.getType(char1) == Character.CONTROL; - boolean lineBreak2 = whitespace2 - && Character.getType(char2) == Character.CONTROL; - boolean blankLine1 = lineBreak1 && BLANKLINEEND.matcher(one).find(); - boolean blankLine2 = lineBreak2 && BLANKLINESTART.matcher(two).find(); - - if (blankLine1 || blankLine2) { - // Five points for blank lines. - return 5; - } else if (lineBreak1 || lineBreak2) { - // Four points for line breaks. - return 4; - } else if (nonAlphaNumeric1 && !whitespace1 && whitespace2) { - // Three points for end of sentences. 
- return 3; - } else if (whitespace1 || whitespace2) { - // Two points for whitespace. - return 2; - } else if (nonAlphaNumeric1 || nonAlphaNumeric2) { - // One point for non-alphanumeric. - return 1; - } - return 0; - } - - // Define some regex patterns for matching boundaries. - private Pattern BLANKLINEEND - = Pattern.compile("\\n\\r?\\n\\Z", Pattern.DOTALL); - private Pattern BLANKLINESTART - = Pattern.compile("\\A\\r?\\n\\r?\\n", Pattern.DOTALL); - - /** - * Reduce the number of edits by eliminating operationally trivial equalities. - * @param diffs LinkedList of Diff objects. - */ - public void diff_cleanupEfficiency(LinkedList diffs) { - if (diffs.isEmpty()) { - return; - } - boolean changes = false; - Deque equalities = new ArrayDeque(); // Double-ended queue of equalities. - String lastEquality = null; // Always equal to equalities.peek().text - ListIterator pointer = diffs.listIterator(); - // Is there an insertion operation before the last equality. - boolean pre_ins = false; - // Is there a deletion operation before the last equality. - boolean pre_del = false; - // Is there an insertion operation after the last equality. - boolean post_ins = false; - // Is there a deletion operation after the last equality. - boolean post_del = false; - Diff thisDiff = pointer.next(); - Diff safeDiff = thisDiff; // The last Diff that is known to be unsplittable. - while (thisDiff != null) { - if (thisDiff.operation == Operation.EQUAL) { - // Equality found. - if (thisDiff.text.length() < Diff_EditCost && (post_ins || post_del)) { - // Candidate found. - equalities.push(thisDiff); - pre_ins = post_ins; - pre_del = post_del; - lastEquality = thisDiff.text; - } else { - // Not a candidate, and can never become one. - equalities.clear(); - lastEquality = null; - safeDiff = thisDiff; - } - post_ins = post_del = false; - } else { - // An insertion or deletion. 
- if (thisDiff.operation == Operation.DELETE) { - post_del = true; - } else { - post_ins = true; - } - /* - * Five types to be split: - * ABXYCD - * AXCD - * ABXC - * AXCD - * ABXC - */ - if (lastEquality != null - && ((pre_ins && pre_del && post_ins && post_del) - || ((lastEquality.length() < Diff_EditCost / 2) - && ((pre_ins ? 1 : 0) + (pre_del ? 1 : 0) - + (post_ins ? 1 : 0) + (post_del ? 1 : 0)) == 3))) { - //System.out.println("Splitting: '" + lastEquality + "'"); - // Walk back to offending equality. - while (thisDiff != equalities.peek()) { - thisDiff = pointer.previous(); - } - pointer.next(); - - // Replace equality with a delete. - pointer.set(new Diff(Operation.DELETE, lastEquality)); - // Insert a corresponding an insert. - pointer.add(thisDiff = new Diff(Operation.INSERT, lastEquality)); - - equalities.pop(); // Throw away the equality we just deleted. - lastEquality = null; - if (pre_ins && pre_del) { - // No changes made which could affect previous entry, keep going. - post_ins = post_del = true; - equalities.clear(); - safeDiff = thisDiff; - } else { - if (!equalities.isEmpty()) { - // Throw away the previous equality (it needs to be reevaluated). - equalities.pop(); - } - if (equalities.isEmpty()) { - // There are no previous questionable equalities, - // walk back to the last known safe diff. - thisDiff = safeDiff; - } else { - // There is an equality we can fall back to. - thisDiff = equalities.peek(); - } - while (thisDiff != pointer.previous()) { - // Intentionally empty loop. - } - post_ins = post_del = false; - } - - changes = true; - } - } - thisDiff = pointer.hasNext() ? pointer.next() : null; - } - - if (changes) { - diff_cleanupMerge(diffs); - } - } - - /** - * Reorder and merge like edit sections. Merge equalities. - * Any edit section can move as long as it doesn't cross an equality. - * @param diffs LinkedList of Diff objects. 
- */ - public void diff_cleanupMerge(LinkedList diffs) { - diffs.add(new Diff(Operation.EQUAL, "")); // Add a dummy entry at the end. - ListIterator pointer = diffs.listIterator(); - int count_delete = 0; - int count_insert = 0; - String text_delete = ""; - String text_insert = ""; - Diff thisDiff = pointer.next(); - Diff prevEqual = null; - int commonlength; - while (thisDiff != null) { - switch (thisDiff.operation) { - case INSERT: - count_insert++; - text_insert += thisDiff.text; - prevEqual = null; - break; - case DELETE: - count_delete++; - text_delete += thisDiff.text; - prevEqual = null; - break; - case EQUAL: - if (count_delete + count_insert > 1) { - boolean both_types = count_delete != 0 && count_insert != 0; - // Delete the offending records. - pointer.previous(); // Reverse direction. - while (count_delete-- > 0) { - pointer.previous(); - pointer.remove(); - } - while (count_insert-- > 0) { - pointer.previous(); - pointer.remove(); - } - if (both_types) { - // Factor out any common prefixies. - commonlength = diff_commonPrefix(text_insert, text_delete); - if (commonlength != 0) { - if (pointer.hasPrevious()) { - thisDiff = pointer.previous(); - assert thisDiff.operation == Operation.EQUAL - : "Previous diff should have been an equality."; - thisDiff.text += text_insert.substring(0, commonlength); - pointer.next(); - } else { - pointer.add(new Diff(Operation.EQUAL, - text_insert.substring(0, commonlength))); - } - text_insert = text_insert.substring(commonlength); - text_delete = text_delete.substring(commonlength); - } - // Factor out any common suffixies. 
- commonlength = diff_commonSuffix(text_insert, text_delete); - if (commonlength != 0) { - thisDiff = pointer.next(); - thisDiff.text = text_insert.substring(text_insert.length() - - commonlength) + thisDiff.text; - text_insert = text_insert.substring(0, text_insert.length() - - commonlength); - text_delete = text_delete.substring(0, text_delete.length() - - commonlength); - pointer.previous(); - } - } - // Insert the merged records. - if (text_delete.length() != 0) { - pointer.add(new Diff(Operation.DELETE, text_delete)); - } - if (text_insert.length() != 0) { - pointer.add(new Diff(Operation.INSERT, text_insert)); - } - // Step forward to the equality. - thisDiff = pointer.hasNext() ? pointer.next() : null; - } else if (prevEqual != null) { - // Merge this equality with the previous one. - prevEqual.text += thisDiff.text; - pointer.remove(); - thisDiff = pointer.previous(); - pointer.next(); // Forward direction - } - count_insert = 0; - count_delete = 0; - text_delete = ""; - text_insert = ""; - prevEqual = thisDiff; - break; - } - thisDiff = pointer.hasNext() ? pointer.next() : null; - } - if (diffs.getLast().text.length() == 0) { - diffs.removeLast(); // Remove the dummy entry at the end. - } - - /* - * Second pass: look for single edits surrounded on both sides by equalities - * which can be shifted sideways to eliminate an equality. - * e.g: ABAC -> ABAC - */ - boolean changes = false; - // Create a new iterator at the start. - // (As opposed to walking the current one back.) - pointer = diffs.listIterator(); - Diff prevDiff = pointer.hasNext() ? pointer.next() : null; - thisDiff = pointer.hasNext() ? pointer.next() : null; - Diff nextDiff = pointer.hasNext() ? pointer.next() : null; - // Intentionally ignore the first and last element (don't need checking). - while (nextDiff != null) { - if (prevDiff.operation == Operation.EQUAL && - nextDiff.operation == Operation.EQUAL) { - // This is a single edit surrounded by equalities. 
- if (thisDiff.text.endsWith(prevDiff.text)) { - // Shift the edit over the previous equality. - thisDiff.text = prevDiff.text - + thisDiff.text.substring(0, thisDiff.text.length() - - prevDiff.text.length()); - nextDiff.text = prevDiff.text + nextDiff.text; - pointer.previous(); // Walk past nextDiff. - pointer.previous(); // Walk past thisDiff. - pointer.previous(); // Walk past prevDiff. - pointer.remove(); // Delete prevDiff. - pointer.next(); // Walk past thisDiff. - thisDiff = pointer.next(); // Walk past nextDiff. - nextDiff = pointer.hasNext() ? pointer.next() : null; - changes = true; - } else if (thisDiff.text.startsWith(nextDiff.text)) { - // Shift the edit over the next equality. - prevDiff.text += nextDiff.text; - thisDiff.text = thisDiff.text.substring(nextDiff.text.length()) - + nextDiff.text; - pointer.remove(); // Delete nextDiff. - nextDiff = pointer.hasNext() ? pointer.next() : null; - changes = true; - } - } - prevDiff = thisDiff; - thisDiff = nextDiff; - nextDiff = pointer.hasNext() ? pointer.next() : null; - } - // If shifts were made, the diff needs reordering and another shift sweep. - if (changes) { - diff_cleanupMerge(diffs); - } - } - - /** - * loc is a location in text1, compute and return the equivalent location in - * text2. - * e.g. "The cat" vs "The big cat", 1->1, 5->8 - * @param diffs List of Diff objects. - * @param loc Location within text1. - * @return Location within text2. - */ - public int diff_xIndex(List diffs, int loc) { - int chars1 = 0; - int chars2 = 0; - int last_chars1 = 0; - int last_chars2 = 0; - Diff lastDiff = null; - for (Diff aDiff : diffs) { - if (aDiff.operation != Operation.INSERT) { - // Equality or deletion. - chars1 += aDiff.text.length(); - } - if (aDiff.operation != Operation.DELETE) { - // Equality or insertion. - chars2 += aDiff.text.length(); - } - if (chars1 > loc) { - // Overshot the location. 
- lastDiff = aDiff; - break; - } - last_chars1 = chars1; - last_chars2 = chars2; - } - if (lastDiff != null && lastDiff.operation == Operation.DELETE) { - // The location was deleted. - return last_chars2; - } - // Add the remaining character length. - return last_chars2 + (loc - last_chars1); - } - - /** - * Convert a Diff list into a pretty HTML report. - * @param diffs List of Diff objects. - * @return HTML representation. - */ - public String diff_prettyHtml(List diffs) { - StringBuilder html = new StringBuilder(); - for (Diff aDiff : diffs) { - String text = aDiff.text.replace("&", "&").replace("<", "<") - .replace(">", ">").replace("\n", "¶
"); - switch (aDiff.operation) { - case INSERT: - html.append("").append(text) - .append(""); - break; - case DELETE: - html.append("").append(text) - .append(""); - break; - case EQUAL: - html.append("").append(text).append(""); - break; - } - } - return html.toString(); - } - - /** - * Compute and return the source text (all equalities and deletions). - * @param diffs List of Diff objects. - * @return Source text. - */ - public String diff_text1(List diffs) { - StringBuilder text = new StringBuilder(); - for (Diff aDiff : diffs) { - if (aDiff.operation != Operation.INSERT) { - text.append(aDiff.text); - } - } - return text.toString(); - } - - /** - * Compute and return the destination text (all equalities and insertions). - * @param diffs List of Diff objects. - * @return Destination text. - */ - public String diff_text2(List diffs) { - StringBuilder text = new StringBuilder(); - for (Diff aDiff : diffs) { - if (aDiff.operation != Operation.DELETE) { - text.append(aDiff.text); - } - } - return text.toString(); - } - - /** - * Compute the Levenshtein compare; the number of inserted, deleted or - * substituted characters. - * @param diffs List of Diff objects. - * @return Number of changes. - */ - public int diff_levenshtein(List diffs) { - int levenshtein = 0; - int insertions = 0; - int deletions = 0; - for (Diff aDiff : diffs) { - switch (aDiff.operation) { - case INSERT: - insertions += aDiff.text.length(); - break; - case DELETE: - deletions += aDiff.text.length(); - break; - case EQUAL: - // A deletion and an insertion is one substitution. - levenshtein += Math.max(insertions, deletions); - insertions = 0; - deletions = 0; - break; - } - } - levenshtein += Math.max(insertions, deletions); - return levenshtein; - } - - /** - * Crush the diff into an encoded string which describes the operations - * required to transform text1 into text2. - * E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. - * Operations are tab-separated. 
Inserted text is escaped using %xx notation. - * @param diffs List of Diff objects. - * @return Delta text. - */ - public String diff_toDelta(List diffs) { - StringBuilder text = new StringBuilder(); - for (Diff aDiff : diffs) { - switch (aDiff.operation) { - case INSERT: - try { - text.append("+").append(URLEncoder.encode(aDiff.text, "UTF-8") - .replace('+', ' ')).append("\t"); - } catch (UnsupportedEncodingException e) { - // Not likely on modern system. - throw new Error("This system does not support UTF-8.", e); - } - break; - case DELETE: - text.append("-").append(aDiff.text.length()).append("\t"); - break; - case EQUAL: - text.append("=").append(aDiff.text.length()).append("\t"); - break; - } - } - String delta = text.toString(); - if (delta.length() != 0) { - // Strip off trailing tab character. - delta = delta.substring(0, delta.length() - 1); - delta = unescapeForEncodeUriCompatability(delta); - } - return delta; - } - - /** - * Given the original text1, and an encoded string which describes the - * operations required to transform text1 into text2, compute the full diff. - * @param text1 Source string for the diff. - * @param delta Delta text. - * @return Array of Diff objects or null if invalid. - * @throws IllegalArgumentException If invalid input. - */ - public LinkedList diff_fromDelta(String text1, String delta) - throws IllegalArgumentException { - LinkedList diffs = new LinkedList(); - int pointer = 0; // Cursor in text1 - String[] tokens = delta.split("\t"); - for (String token : tokens) { - if (token.length() == 0) { - // Blank tokens are ok (from a trailing \t). - continue; - } - // Each token begins with a one character parameter which specifies the - // operation of this token (delete, insert, equality). 
- String param = token.substring(1); - switch (token.charAt(0)) { - case '+': - // decode would change all "+" to " " - param = param.replace("+", "%2B"); - try { - param = URLDecoder.decode(param, "UTF-8"); - } catch (UnsupportedEncodingException e) { - // Not likely on modern system. - throw new Error("This system does not support UTF-8.", e); - } catch (IllegalArgumentException e) { - // Malformed URI sequence. - throw new IllegalArgumentException( - "Illegal escape in diff_fromDelta: " + param, e); - } - diffs.add(new Diff(Operation.INSERT, param)); - break; - case '-': - // Fall through. - case '=': - int n; - try { - n = Integer.parseInt(param); - } catch (NumberFormatException e) { - throw new IllegalArgumentException( - "Invalid number in diff_fromDelta: " + param, e); - } - if (n < 0) { - throw new IllegalArgumentException( - "Negative number in diff_fromDelta: " + param); - } - String text; - try { - text = text1.substring(pointer, pointer += n); - } catch (StringIndexOutOfBoundsException e) { - throw new IllegalArgumentException("Delta length (" + pointer - + ") larger than source text length (" + text1.length() - + ").", e); - } - if (token.charAt(0) == '=') { - diffs.add(new Diff(Operation.EQUAL, text)); - } else { - diffs.add(new Diff(Operation.DELETE, text)); - } - break; - default: - // Anything else is an error. - throw new IllegalArgumentException( - "Invalid diff operation in diff_fromDelta: " + token.charAt(0)); - } - } - if (pointer != text1.length()) { - throw new IllegalArgumentException("Delta length (" + pointer - + ") smaller than source text length (" + text1.length() + ")."); - } - return diffs; - } - - - // MATCH FUNCTIONS - - - /** - * Locate the best instance of 'pattern' in 'text' near 'loc'. - * Returns -1 if no match found. - * @param text The text to search. - * @param pattern The pattern to search for. - * @param loc The location to search around. - * @return Best match index or -1. 
- */ - public int match_main(String text, String pattern, int loc) { - // Check for null inputs. - if (text == null || pattern == null) { - throw new IllegalArgumentException("Null inputs. (match_main)"); - } - - loc = Math.max(0, Math.min(loc, text.length())); - if (text.equals(pattern)) { - // Shortcut (potentially not guaranteed by the algorithm) - return 0; - } else if (text.length() == 0) { - // Nothing to match. - return -1; - } else if (loc + pattern.length() <= text.length() - && text.substring(loc, loc + pattern.length()).equals(pattern)) { - // Perfect match at the perfect spot! (Includes case of null pattern) - return loc; - } else { - // Do a fuzzy compare. - return match_bitap(text, pattern, loc); - } - } - - /** - * Locate the best instance of 'pattern' in 'text' near 'loc' using the - * Bitap algorithm. Returns -1 if no match found. - * @param text The text to search. - * @param pattern The pattern to search for. - * @param loc The location to search around. - * @return Best match index or -1. - */ - protected int match_bitap(String text, String pattern, int loc) { - assert (Match_MaxBits == 0 || pattern.length() <= Match_MaxBits) - : "Pattern too long for this application."; - - // Initialise the alphabet. - Map s = match_alphabet(pattern); - - // Highest score beyond which we give up. - double score_threshold = Match_Threshold; - // Is there a nearby exact match? (speedup) - int best_loc = text.indexOf(pattern, loc); - if (best_loc != -1) { - score_threshold = Math.min(match_bitapScore(0, best_loc, loc, pattern), - score_threshold); - // What about in the other direction? (speedup) - best_loc = text.lastIndexOf(pattern, loc + pattern.length()); - if (best_loc != -1) { - score_threshold = Math.min(match_bitapScore(0, best_loc, loc, pattern), - score_threshold); - } - } - - // Initialise the bit arrays. 
- int matchmask = 1 << (pattern.length() - 1); - best_loc = -1; - - int bin_min, bin_mid; - int bin_max = pattern.length() + text.length(); - // Empty initialization added to appease Java compiler. - int[] last_rd = new int[0]; - for (int d = 0; d < pattern.length(); d++) { - // Scan for the best match; each iteration allows for one more error. - // Run a binary search to determine how far from 'loc' we can stray at - // this error level. - bin_min = 0; - bin_mid = bin_max; - while (bin_min < bin_mid) { - if (match_bitapScore(d, loc + bin_mid, loc, pattern) - <= score_threshold) { - bin_min = bin_mid; - } else { - bin_max = bin_mid; - } - bin_mid = (bin_max - bin_min) / 2 + bin_min; - } - // Use the result from this iteration as the maximum for the next. - bin_max = bin_mid; - int start = Math.max(1, loc - bin_mid + 1); - int finish = Math.min(loc + bin_mid, text.length()) + pattern.length(); - - int[] rd = new int[finish + 2]; - rd[finish + 1] = (1 << d) - 1; - for (int j = finish; j >= start; j--) { - int charMatch; - if (text.length() <= j - 1 || !s.containsKey(text.charAt(j - 1))) { - // Out of range. - charMatch = 0; - } else { - charMatch = s.get(text.charAt(j - 1)); - } - if (d == 0) { - // First pass: exact match. - rd[j] = ((rd[j + 1] << 1) | 1) & charMatch; - } else { - // Subsequent passes: fuzzy match. - rd[j] = (((rd[j + 1] << 1) | 1) & charMatch) - | (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]; - } - if ((rd[j] & matchmask) != 0) { - double score = match_bitapScore(d, j - 1, loc, pattern); - // This match will almost certainly be better than any existing - // match. But check anyway. - if (score <= score_threshold) { - // Told you so. - score_threshold = score; - best_loc = j - 1; - if (best_loc > loc) { - // When passing loc, don't exceed our current compare from loc. - start = Math.max(1, 2 * loc - best_loc); - } else { - // Already passed loc, downhill from here on in. 
- break; - } - } - } - } - if (match_bitapScore(d + 1, loc, loc, pattern) > score_threshold) { - // No hope for a (better) match at greater error levels. - break; - } - last_rd = rd; - } - return best_loc; - } - - /** - * Compute and return the score for a match with e errors and x location. - * @param e Number of errors in match. - * @param x Location of match. - * @param loc Expected location of match. - * @param pattern Pattern being sought. - * @return Overall score for match (0.0 = good, 1.0 = bad). - */ - private double match_bitapScore(int e, int x, int loc, String pattern) { - float accuracy = (float) e / pattern.length(); - int proximity = Math.abs(loc - x); - if (Match_Distance == 0) { - // Dodge divide by zero error. - return proximity == 0 ? accuracy : 1.0; - } - return accuracy + (proximity / (float) Match_Distance); - } - - /** - * Initialise the alphabet for the Bitap algorithm. - * @param pattern The text to encode. - * @return Hash of character locations. - */ - protected Map match_alphabet(String pattern) { - Map s = new HashMap(); - char[] char_pattern = pattern.toCharArray(); - for (char c : char_pattern) { - s.put(c, 0); - } - int i = 0; - for (char c : char_pattern) { - s.put(c, s.get(c) | (1 << (pattern.length() - i - 1))); - i++; - } - return s; - } - - - // PATCH FUNCTIONS - - - /** - * Increase the context until it is unique, - * but don't let the pattern expand beyond Match_MaxBits. - * @param patch The patch to grow. - * @param text Source text. - */ - protected void patch_addContext(Patch patch, String text) { - if (text.length() == 0) { - return; - } - String pattern = text.substring(patch.start2, patch.start2 + patch.length1); - int padding = 0; - - // Look for the first and last matches of pattern in text. If two different - // matches are found, increase the pattern length. 
- while (text.indexOf(pattern) != text.lastIndexOf(pattern) - && pattern.length() < Match_MaxBits - Patch_Margin - Patch_Margin) { - padding += Patch_Margin; - pattern = text.substring(Math.max(0, patch.start2 - padding), - Math.min(text.length(), patch.start2 + patch.length1 + padding)); - } - // Add one chunk for good luck. - padding += Patch_Margin; - - // Add the prefix. - String prefix = text.substring(Math.max(0, patch.start2 - padding), - patch.start2); - if (prefix.length() != 0) { - patch.diffs.addFirst(new Diff(Operation.EQUAL, prefix)); - } - // Add the suffix. - String suffix = text.substring(patch.start2 + patch.length1, - Math.min(text.length(), patch.start2 + patch.length1 + padding)); - if (suffix.length() != 0) { - patch.diffs.addLast(new Diff(Operation.EQUAL, suffix)); - } - - // Roll back the start points. - patch.start1 -= prefix.length(); - patch.start2 -= prefix.length(); - // Extend the lengths. - patch.length1 += prefix.length() + suffix.length(); - patch.length2 += prefix.length() + suffix.length(); - } - - /** - * Compute a list of patches to turn text1 into text2. - * A set of diffs will be computed. - * @param text1 Old text. - * @param text2 New text. - * @return LinkedList of Patch objects. - */ - public LinkedList patch_make(String text1, String text2) { - if (text1 == null || text2 == null) { - throw new IllegalArgumentException("Null inputs. (patch_make)"); - } - // No diffs provided, compute our own. - LinkedList diffs = diff_main(text1, text2, true); - if (diffs.size() > 2) { - diff_cleanupSemantic(diffs); - diff_cleanupEfficiency(diffs); - } - return patch_make(text1, diffs); - } - - /** - * Compute a list of patches to turn text1 into text2. - * text1 will be derived from the provided diffs. - * @param diffs Array of Diff objects for text1 to text2. - * @return LinkedList of Patch objects. - */ - public LinkedList patch_make(LinkedList diffs) { - if (diffs == null) { - throw new IllegalArgumentException("Null inputs. 
(patch_make)"); - } - // No origin string provided, compute our own. - String text1 = diff_text1(diffs); - return patch_make(text1, diffs); - } - - /** - * Compute a list of patches to turn text1 into text2. - * text2 is ignored, diffs are the delta between text1 and text2. - * @param text1 Old text - * @param text2 Ignored. - * @param diffs Array of Diff objects for text1 to text2. - * @return LinkedList of Patch objects. - * @deprecated Prefer patch_make(String text1, LinkedList diffs). - */ - @Deprecated public LinkedList patch_make(String text1, String text2, - LinkedList diffs) { - return patch_make(text1, diffs); - } - - /** - * Compute a list of patches to turn text1 into text2. - * text2 is not provided, diffs are the delta between text1 and text2. - * @param text1 Old text. - * @param diffs Array of Diff objects for text1 to text2. - * @return LinkedList of Patch objects. - */ - public LinkedList patch_make(String text1, LinkedList diffs) { - if (text1 == null || diffs == null) { - throw new IllegalArgumentException("Null inputs. (patch_make)"); - } - - LinkedList patches = new LinkedList(); - if (diffs.isEmpty()) { - return patches; // Get rid of the null case. - } - Patch patch = new Patch(); - int char_count1 = 0; // Number of characters into the text1 string. - int char_count2 = 0; // Number of characters into the text2 string. - // Start with text1 (prepatch_text) and apply the diffs until we arrive at - // text2 (postpatch_text). We recreate the patches one by one to determine - // context info. - String prepatch_text = text1; - String postpatch_text = text1; - for (Diff aDiff : diffs) { - if (patch.diffs.isEmpty() && aDiff.operation != Operation.EQUAL) { - // A new patch starts here. 
- patch.start1 = char_count1; - patch.start2 = char_count2; - } - - switch (aDiff.operation) { - case INSERT: - patch.diffs.add(aDiff); - patch.length2 += aDiff.text.length(); - postpatch_text = postpatch_text.substring(0, char_count2) - + aDiff.text + postpatch_text.substring(char_count2); - break; - case DELETE: - patch.length1 += aDiff.text.length(); - patch.diffs.add(aDiff); - postpatch_text = postpatch_text.substring(0, char_count2) - + postpatch_text.substring(char_count2 + aDiff.text.length()); - break; - case EQUAL: - if (aDiff.text.length() <= 2 * Patch_Margin - && !patch.diffs.isEmpty() && aDiff != diffs.getLast()) { - // Small equality inside a patch. - patch.diffs.add(aDiff); - patch.length1 += aDiff.text.length(); - patch.length2 += aDiff.text.length(); - } - - if (aDiff.text.length() >= 2 * Patch_Margin && !patch.diffs.isEmpty()) { - // Time for a new patch. - if (!patch.diffs.isEmpty()) { - patch_addContext(patch, prepatch_text); - patches.add(patch); - patch = new Patch(); - // Unlike Unidiff, our patch lists have a rolling context. - // https://github.com/google/diff-match-patch/wiki/Unidiff - // Update prepatch text & pos to reflect the application of the - // just completed patch. - prepatch_text = postpatch_text; - char_count1 = char_count2; - } - } - break; - } - - // Update the current character count. - if (aDiff.operation != Operation.INSERT) { - char_count1 += aDiff.text.length(); - } - if (aDiff.operation != Operation.DELETE) { - char_count2 += aDiff.text.length(); - } - } - // Pick up the leftover patch if not empty. - if (!patch.diffs.isEmpty()) { - patch_addContext(patch, prepatch_text); - patches.add(patch); - } - - return patches; - } - - /** - * Given an array of patches, return another array that is identical. - * @param patches Array of Patch objects. - * @return Array of Patch objects. 
- */ - public LinkedList patch_deepCopy(LinkedList patches) { - LinkedList patchesCopy = new LinkedList(); - for (Patch aPatch : patches) { - Patch patchCopy = new Patch(); - for (Diff aDiff : aPatch.diffs) { - Diff diffCopy = new Diff(aDiff.operation, aDiff.text); - patchCopy.diffs.add(diffCopy); - } - patchCopy.start1 = aPatch.start1; - patchCopy.start2 = aPatch.start2; - patchCopy.length1 = aPatch.length1; - patchCopy.length2 = aPatch.length2; - patchesCopy.add(patchCopy); - } - return patchesCopy; - } - - /** - * Merge a set of patches onto the text. Return a patched text, as well - * as an array of true/false values indicating which patches were applied. - * @param patches Array of Patch objects - * @param text Old text. - * @return Two element Object array, containing the new text and an array of - * boolean values. - */ - public Object[] patch_apply(LinkedList patches, String text) { - if (patches.isEmpty()) { - return new Object[]{text, new boolean[0]}; - } - - // Deep copy the patches so that no changes are made to originals. - patches = patch_deepCopy(patches); - - String nullPadding = patch_addPadding(patches); - text = nullPadding + text + nullPadding; - patch_splitMax(patches); - - int x = 0; - // delta keeps track of the offset between the expected and actual location - // of the previous patch. If there are patches expected at positions 10 and - // 20, but the first patch was found at 12, delta is 2 and the second patch - // has an effective expected position of 22. - int delta = 0; - boolean[] results = new boolean[patches.size()]; - for (Patch aPatch : patches) { - int expected_loc = aPatch.start2 + delta; - String text1 = diff_text1(aPatch.diffs); - int start_loc; - int end_loc = -1; - if (text1.length() > this.Match_MaxBits) { - // patch_splitMax will only provide an oversized pattern in the case of - // a monster delete. 
- start_loc = match_main(text, - text1.substring(0, this.Match_MaxBits), expected_loc); - if (start_loc != -1) { - end_loc = match_main(text, - text1.substring(text1.length() - this.Match_MaxBits), - expected_loc + text1.length() - this.Match_MaxBits); - if (end_loc == -1 || start_loc >= end_loc) { - // Can't find valid trailing context. Drop this patch. - start_loc = -1; - } - } - } else { - start_loc = match_main(text, text1, expected_loc); - } - if (start_loc == -1) { - // No match found. :( - results[x] = false; - // Subtract the delta for this failed patch from subsequent patches. - delta -= aPatch.length2 - aPatch.length1; - } else { - // Found a match. :) - results[x] = true; - delta = start_loc - expected_loc; - String text2; - if (end_loc == -1) { - text2 = text.substring(start_loc, - Math.min(start_loc + text1.length(), text.length())); - } else { - text2 = text.substring(start_loc, - Math.min(end_loc + this.Match_MaxBits, text.length())); - } - if (text1.equals(text2)) { - // Perfect match, just shove the replacement text in. - text = text.substring(0, start_loc) + diff_text2(aPatch.diffs) - + text.substring(start_loc + text1.length()); - } else { - // Imperfect match. Run a diff to get a framework of equivalent - // indices. - LinkedList diffs = diff_main(text1, text2, false); - if (text1.length() > this.Match_MaxBits - && diff_levenshtein(diffs) / (float) text1.length() - > this.Patch_DeleteThreshold) { - // The end points match, but the content is unacceptably bad. 
- results[x] = false; - } else { - diff_cleanupSemanticLossless(diffs); - int index1 = 0; - for (Diff aDiff : aPatch.diffs) { - if (aDiff.operation != Operation.EQUAL) { - int index2 = diff_xIndex(diffs, index1); - if (aDiff.operation == Operation.INSERT) { - // Insertion - text = text.substring(0, start_loc + index2) + aDiff.text - + text.substring(start_loc + index2); - } else if (aDiff.operation == Operation.DELETE) { - // Deletion - text = text.substring(0, start_loc + index2) - + text.substring(start_loc + diff_xIndex(diffs, - index1 + aDiff.text.length())); - } - } - if (aDiff.operation != Operation.DELETE) { - index1 += aDiff.text.length(); - } - } - } - } - } - x++; - } - // Strip the padding off. - text = text.substring(nullPadding.length(), text.length() - - nullPadding.length()); - return new Object[]{text, results}; - } - - /** - * Add some padding on text start and end so that edges can match something. - * Intended to be called only from within patch_apply. - * @param patches Array of Patch objects. - * @return The padding string added to each side. - */ - public String patch_addPadding(LinkedList patches) { - short paddingLength = this.Patch_Margin; - String nullPadding = ""; - for (short x = 1; x <= paddingLength; x++) { - nullPadding += String.valueOf((char) x); - } - - // Bump all the patches forward. - for (Patch aPatch : patches) { - aPatch.start1 += paddingLength; - aPatch.start2 += paddingLength; - } - - // Add some padding on start of first diff. - Patch patch = patches.getFirst(); - LinkedList diffs = patch.diffs; - if (diffs.isEmpty() || diffs.getFirst().operation != Operation.EQUAL) { - // Add nullPadding equality. - diffs.addFirst(new Diff(Operation.EQUAL, nullPadding)); - patch.start1 -= paddingLength; // Should be 0. - patch.start2 -= paddingLength; // Should be 0. - patch.length1 += paddingLength; - patch.length2 += paddingLength; - } else if (paddingLength > diffs.getFirst().text.length()) { - // Grow first equality. 
- Diff firstDiff = diffs.getFirst(); - int extraLength = paddingLength - firstDiff.text.length(); - firstDiff.text = nullPadding.substring(firstDiff.text.length()) - + firstDiff.text; - patch.start1 -= extraLength; - patch.start2 -= extraLength; - patch.length1 += extraLength; - patch.length2 += extraLength; - } - - // Add some padding on end of last diff. - patch = patches.getLast(); - diffs = patch.diffs; - if (diffs.isEmpty() || diffs.getLast().operation != Operation.EQUAL) { - // Add nullPadding equality. - diffs.addLast(new Diff(Operation.EQUAL, nullPadding)); - patch.length1 += paddingLength; - patch.length2 += paddingLength; - } else if (paddingLength > diffs.getLast().text.length()) { - // Grow last equality. - Diff lastDiff = diffs.getLast(); - int extraLength = paddingLength - lastDiff.text.length(); - lastDiff.text += nullPadding.substring(0, extraLength); - patch.length1 += extraLength; - patch.length2 += extraLength; - } - - return nullPadding; - } - - /** - * Look through the patches and break up any which are longer than the - * maximum limit of the match algorithm. - * Intended to be called only from within patch_apply. - * @param patches LinkedList of Patch objects. - */ - public void patch_splitMax(LinkedList patches) { - short patch_size = Match_MaxBits; - String precontext, postcontext; - Patch patch; - int start1, start2; - boolean empty; - Operation diff_type; - String diff_text; - ListIterator pointer = patches.listIterator(); - Patch bigpatch = pointer.hasNext() ? pointer.next() : null; - while (bigpatch != null) { - if (bigpatch.length1 <= Match_MaxBits) { - bigpatch = pointer.hasNext() ? pointer.next() : null; - continue; - } - // Remove the big old patch. - pointer.remove(); - start1 = bigpatch.start1; - start2 = bigpatch.start2; - precontext = ""; - while (!bigpatch.diffs.isEmpty()) { - // Create one of several smaller patches. 
- patch = new Patch(); - empty = true; - patch.start1 = start1 - precontext.length(); - patch.start2 = start2 - precontext.length(); - if (precontext.length() != 0) { - patch.length1 = patch.length2 = precontext.length(); - patch.diffs.add(new Diff(Operation.EQUAL, precontext)); - } - while (!bigpatch.diffs.isEmpty() - && patch.length1 < patch_size - Patch_Margin) { - diff_type = bigpatch.diffs.getFirst().operation; - diff_text = bigpatch.diffs.getFirst().text; - if (diff_type == Operation.INSERT) { - // Insertions are harmless. - patch.length2 += diff_text.length(); - start2 += diff_text.length(); - patch.diffs.addLast(bigpatch.diffs.removeFirst()); - empty = false; - } else if (diff_type == Operation.DELETE && patch.diffs.size() == 1 - && patch.diffs.getFirst().operation == Operation.EQUAL - && diff_text.length() > 2 * patch_size) { - // This is a large deletion. Let it pass in one chunk. - patch.length1 += diff_text.length(); - start1 += diff_text.length(); - empty = false; - patch.diffs.add(new Diff(diff_type, diff_text)); - bigpatch.diffs.removeFirst(); - } else { - // Deletion or equality. Only take as much as we can stomach. - diff_text = diff_text.substring(0, Math.min(diff_text.length(), - patch_size - patch.length1 - Patch_Margin)); - patch.length1 += diff_text.length(); - start1 += diff_text.length(); - if (diff_type == Operation.EQUAL) { - patch.length2 += diff_text.length(); - start2 += diff_text.length(); - } else { - empty = false; - } - patch.diffs.add(new Diff(diff_type, diff_text)); - if (diff_text.equals(bigpatch.diffs.getFirst().text)) { - bigpatch.diffs.removeFirst(); - } else { - bigpatch.diffs.getFirst().text = bigpatch.diffs.getFirst().text - .substring(diff_text.length()); - } - } - } - // Compute the head context for the next patch. - precontext = diff_text2(patch.diffs); - precontext = precontext.substring(Math.max(0, precontext.length() - - Patch_Margin)); - // Append the end context for this patch. 
- if (diff_text1(bigpatch.diffs).length() > Patch_Margin) { - postcontext = diff_text1(bigpatch.diffs).substring(0, Patch_Margin); - } else { - postcontext = diff_text1(bigpatch.diffs); - } - if (postcontext.length() != 0) { - patch.length1 += postcontext.length(); - patch.length2 += postcontext.length(); - if (!patch.diffs.isEmpty() - && patch.diffs.getLast().operation == Operation.EQUAL) { - patch.diffs.getLast().text += postcontext; - } else { - patch.diffs.add(new Diff(Operation.EQUAL, postcontext)); - } - } - if (!empty) { - pointer.add(patch); - } - } - bigpatch = pointer.hasNext() ? pointer.next() : null; - } - } - - /** - * Take a list of patches and return a textual representation. - * @param patches List of Patch objects. - * @return Text representation of patches. - */ - public String patch_toText(List patches) { - StringBuilder text = new StringBuilder(); - for (Patch aPatch : patches) { - text.append(aPatch); - } - return text.toString(); - } - - /** - * Parse a textual representation of patches and return a List of Patch - * objects. - * @param textline Text representation of patches. - * @return List of Patch objects. - * @throws IllegalArgumentException If invalid input. 
- */ - public List patch_fromText(String textline) - throws IllegalArgumentException { - List patches = new LinkedList(); - if (textline.length() == 0) { - return patches; - } - List textList = Arrays.asList(textline.split("\n")); - LinkedList text = new LinkedList(textList); - Patch patch; - Pattern patchHeader - = Pattern.compile("^@@ -(\\d+),?(\\d*) \\+(\\d+),?(\\d*) @@$"); - Matcher m; - char sign; - String line; - while (!text.isEmpty()) { - m = patchHeader.matcher(text.getFirst()); - if (!m.matches()) { - throw new IllegalArgumentException( - "Invalid patch string: " + text.getFirst()); - } - patch = new Patch(); - patches.add(patch); - patch.start1 = Integer.parseInt(m.group(1)); - if (m.group(2).length() == 0) { - patch.start1--; - patch.length1 = 1; - } else if (m.group(2).equals("0")) { - patch.length1 = 0; - } else { - patch.start1--; - patch.length1 = Integer.parseInt(m.group(2)); - } - - patch.start2 = Integer.parseInt(m.group(3)); - if (m.group(4).length() == 0) { - patch.start2--; - patch.length2 = 1; - } else if (m.group(4).equals("0")) { - patch.length2 = 0; - } else { - patch.start2--; - patch.length2 = Integer.parseInt(m.group(4)); - } - text.removeFirst(); - - while (!text.isEmpty()) { - try { - sign = text.getFirst().charAt(0); - } catch (IndexOutOfBoundsException e) { - // Blank line? Whatever. - text.removeFirst(); - continue; - } - line = text.getFirst().substring(1); - line = line.replace("+", "%2B"); // decode would change all "+" to " " - try { - line = URLDecoder.decode(line, "UTF-8"); - } catch (UnsupportedEncodingException e) { - // Not likely on modern system. - throw new Error("This system does not support UTF-8.", e); - } catch (IllegalArgumentException e) { - // Malformed URI sequence. - throw new IllegalArgumentException( - "Illegal escape in patch_fromText: " + line, e); - } - if (sign == '-') { - // Deletion. - patch.diffs.add(new Diff(Operation.DELETE, line)); - } else if (sign == '+') { - // Insertion. 
- patch.diffs.add(new Diff(Operation.INSERT, line)); - } else if (sign == ' ') { - // Minor equality. - patch.diffs.add(new Diff(Operation.EQUAL, line)); - } else if (sign == '@') { - // Start of next patch. - break; - } else { - // WTF? - throw new IllegalArgumentException( - "Invalid patch mode '" + sign + "' in: " + line); - } - text.removeFirst(); - } - } - return patches; - } - - - /** - * Class representing one diff operation. - */ - public static class Diff { - /** - * One of: INSERT, DELETE or EQUAL. - */ - public Operation operation; - /** - * The text associated with this diff operation. - */ - public String text; - - /** - * Constructor. Initializes the diff with the provided values. - * @param operation One of INSERT, DELETE or EQUAL. - * @param text The text being applied. - */ - public Diff(Operation operation, String text) { - // Construct a diff with the specified operation and text. - this.operation = operation; - this.text = text; - } - - /** - * Display a human-readable version of this Diff. - * @return text version. - */ - public String toString() { - String prettyText = this.text.replace('\n', '\u00b6'); - return "Diff(" + this.operation + ",\"" + prettyText + "\")"; - } - - /** - * Create a numeric hash value for a Diff. - * This function is not used by DMP. - * @return Hash value. - */ - @Override - public int hashCode() { - final int prime = 31; - int result = (operation == null) ? 0 : operation.hashCode(); - result += prime * ((text == null) ? 0 : text.hashCode()); - return result; - } - - /** - * Is this Diff equivalent to another Diff? - * @param obj Another Diff to compare against. - * @return true or false. 
- */ - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - Diff other = (Diff) obj; - if (operation != other.operation) { - return false; - } - if (text == null) { - if (other.text != null) { - return false; - } - } else if (!text.equals(other.text)) { - return false; - } - return true; - } - } - - - /** - * Class representing one patch operation. - */ - public static class Patch { - public LinkedList diffs; - public int start1; - public int start2; - public int length1; - public int length2; - - /** - * Constructor. Initializes with an empty list of diffs. - */ - public Patch() { - this.diffs = new LinkedList(); - } - - /** - * Emulate GNU diff's format. - * Header: @@ -382,8 +481,9 @@ - * Indices are printed as 1-based, not 0-based. - * @return The GNU diff string. - */ - public String toString() { - String coords1, coords2; - if (this.length1 == 0) { - coords1 = this.start1 + ",0"; - } else if (this.length1 == 1) { - coords1 = Integer.toString(this.start1 + 1); - } else { - coords1 = (this.start1 + 1) + "," + this.length1; - } - if (this.length2 == 0) { - coords2 = this.start2 + ",0"; - } else if (this.length2 == 1) { - coords2 = Integer.toString(this.start2 + 1); - } else { - coords2 = (this.start2 + 1) + "," + this.length2; - } - StringBuilder text = new StringBuilder(); - text.append("@@ -").append(coords1).append(" +").append(coords2) - .append(" @@\n"); - // Escape the body of the patch with %xx notation. - for (Diff aDiff : this.diffs) { - switch (aDiff.operation) { - case INSERT: - text.append('+'); - break; - case DELETE: - text.append('-'); - break; - case EQUAL: - text.append(' '); - break; - } - try { - text.append(URLEncoder.encode(aDiff.text, "UTF-8").replace('+', ' ')) - .append("\n"); - } catch (UnsupportedEncodingException e) { - // Not likely on modern system. 
- throw new Error("This system does not support UTF-8.", e); - } - } - return unescapeForEncodeUriCompatability(text.toString()); - } - } - - /** - * Unescape selected chars for compatability with JavaScript's encodeURI. - * In speed critical applications this could be dropped since the - * receiving application will certainly decode these fine. - * Note that this function is case-sensitive. Thus "%3f" would not be - * unescaped. But this is ok because it is only called with the output of - * URLEncoder.encode which returns uppercase hex. - * - * Example: "%3F" -> "?", "%24" -> "$", etc. - * - * @param str The string to escape. - * @return The escaped string. - */ - private static String unescapeForEncodeUriCompatability(String str) { - return str.replace("%21", "!").replace("%7E", "~") - .replace("%27", "'").replace("%28", "(").replace("%29", ")") - .replace("%3B", ";").replace("%2F", "/").replace("%3F", "?") - .replace("%3A", ":").replace("%40", "@").replace("%26", "&") - .replace("%3D", "=").replace("%2B", "+").replace("%24", "$") - .replace("%2C", ",").replace("%23", "#"); - } + // Defaults. + // Set these on your diff_match_patch instance to override the defaults. + + /** + * Number of seconds to map a diff before giving up (0 for infinity). + */ + public float Diff_Timeout = 1.0f; + /** + * Cost of an empty edit operation in terms of edit characters. + */ + public short Diff_EditCost = 4; + /** + * At what point is no match declared (0.0 = perfection, 1.0 = very loose). + */ + public float Match_Threshold = 0.5f; + /** + * How far to search for a match (0 = exact location, 1000+ = broad match). + * A match this many characters away from the expected location will add + * 1.0 to the score (0.0 is a perfect match). + */ + public int Match_Distance = 1000; + /** + * When deleting a large block of text (over ~64 characters), how close do + * the contents have to be to match the expected contents. (0.0 = perfection, + * 1.0 = very loose). 
Note that Match_Threshold controls how closely the + * end points of a delete need to match. + */ + public float Patch_DeleteThreshold = 0.5f; + /** + * Chunk size for context length. + */ + public short Patch_Margin = 4; + + /** + * The number of bits in an int. + */ + private short Match_MaxBits = 32; + + /** + * Internal class for returning results from diff_linesToChars(). + * Other less paranoid languages just use a three-element array. + */ + protected static class LinesToCharsResult { + protected String chars1; + protected String chars2; + protected List lineArray; + + protected LinesToCharsResult(String chars1, String chars2, + List lineArray) { + this.chars1 = chars1; + this.chars2 = chars2; + this.lineArray = lineArray; + } + } + + // DIFF FUNCTIONS + + /** + * The data structure representing a diff is a Linked list of Diff objects: + * {Diff(Operation.DELETE, "Hello"), Diff(Operation.INSERT, "Goodbye"), + * Diff(Operation.EQUAL, " world.")} + * which means: delete "Hello", add "Goodbye" and keep " world." + */ + public enum Operation { + DELETE, INSERT, EQUAL + } + + /** + * Find the differences between two texts. + * Run a faster, slightly less optimal diff. + * This method allows the 'checklines' of diff_main() to be optional. + * Most of the time checklines is wanted, so default to true. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @return Linked List of Diff objects. + */ + public LinkedList diff_main(String text1, String text2) { + return diff_main(text1, text2, true); + } + + /** + * Find the differences between two texts. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param checklines Speedup flag. If false, then don't run a + * line-level diff first to identify the changed areas. + * If true, then run a faster slightly less optimal diff. + * @return Linked List of Diff objects. 
+ */ + public LinkedList diff_main(String text1, String text2, + boolean checklines) { + // Set a deadline by which time the diff must be complete. + long deadline; + if (Diff_Timeout <= 0) { + deadline = Long.MAX_VALUE; + } else { + deadline = System.currentTimeMillis() + (long) (Diff_Timeout * 1000); + } + return diff_main(text1, text2, checklines, deadline); + } + + /** + * Find the differences between two texts. Simplifies the problem by + * stripping any common prefix or suffix off the texts before diffing. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param checklines Speedup flag. If false, then don't run a + * line-level diff first to identify the changed areas. + * If true, then run a faster slightly less optimal diff. + * @param deadline Time when the diff should be complete by. Used + * internally for recursive calls. Users should set DiffTimeout instead. + * @return Linked List of Diff objects. + */ + private LinkedList diff_main(String text1, String text2, + boolean checklines, long deadline) { + // Check for null inputs. + if (text1 == null || text2 == null) { + throw new IllegalArgumentException("Null inputs. (diff_main)"); + } + + // Check for equality (speedup). + LinkedList diffs; + if (text1.equals(text2)) { + diffs = new LinkedList(); + if (text1.length() != 0) { + diffs.add(new Diff(Operation.EQUAL, text1)); + } + return diffs; + } + + // Trim off common prefix (speedup). + int commonlength = diff_commonPrefix(text1, text2); + String commonprefix = text1.substring(0, commonlength); + text1 = text1.substring(commonlength); + text2 = text2.substring(commonlength); + + // Trim off common suffix (speedup). + commonlength = diff_commonSuffix(text1, text2); + String commonsuffix = text1.substring(text1.length() - commonlength); + text1 = text1.substring(0, text1.length() - commonlength); + text2 = text2.substring(0, text2.length() - commonlength); + + // Compute the diff on the middle block. 
+ diffs = diff_compute(text1, text2, checklines, deadline); + + // Restore the prefix and suffix. + if (commonprefix.length() != 0) { + diffs.addFirst(new Diff(Operation.EQUAL, commonprefix)); + } + if (commonsuffix.length() != 0) { + diffs.addLast(new Diff(Operation.EQUAL, commonsuffix)); + } + + diff_cleanupMerge(diffs); + return diffs; + } + + /** + * Find the differences between two texts. Assumes that the texts do not + * have any common prefix or suffix. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param checklines Speedup flag. If false, then don't run a + * line-level diff first to identify the changed areas. + * If true, then run a faster slightly less optimal diff. + * @param deadline Time when the diff should be complete by. + * @return Linked List of Diff objects. + */ + private LinkedList diff_compute(String text1, String text2, + boolean checklines, long deadline) { + LinkedList diffs = new LinkedList(); + + if (text1.length() == 0) { + // Just add some text (speedup). + diffs.add(new Diff(Operation.INSERT, text2)); + return diffs; + } + + if (text2.length() == 0) { + // Just delete some text (speedup). + diffs.add(new Diff(Operation.DELETE, text1)); + return diffs; + } + + String longtext = text1.length() > text2.length() ? text1 : text2; + String shorttext = text1.length() > text2.length() ? text2 : text1; + int i = longtext.indexOf(shorttext); + if (i != -1) { + // Shorter text is inside the longer text (speedup). + Operation op = (text1.length() > text2.length()) ? Operation.DELETE : Operation.INSERT; + diffs.add(new Diff(op, longtext.substring(0, i))); + diffs.add(new Diff(Operation.EQUAL, shorttext)); + diffs.add(new Diff(op, longtext.substring(i + shorttext.length()))); + return diffs; + } + + if (shorttext.length() == 1) { + // Single character string. + // After the previous speedup, the character can't be an equality. 
+ diffs.add(new Diff(Operation.DELETE, text1)); + diffs.add(new Diff(Operation.INSERT, text2)); + return diffs; + } + + // Check to see if the problem can be split in two. + String[] hm = diff_halfMatch(text1, text2); + if (hm != null) { + // A half-match was found, sort out the return data. + String text1_a = hm[0]; + String text1_b = hm[1]; + String text2_a = hm[2]; + String text2_b = hm[3]; + String mid_common = hm[4]; + // Send both pairs off for separate processing. + LinkedList diffs_a = diff_main( + text1_a, text2_a, + checklines, deadline); + LinkedList diffs_b = diff_main( + text1_b, text2_b, + checklines, deadline); + // Merge the results. + diffs = diffs_a; + diffs.add(new Diff(Operation.EQUAL, mid_common)); + diffs.addAll(diffs_b); + return diffs; + } + + if (checklines && text1.length() > 100 && text2.length() > 100) { + return diff_lineMode(text1, text2, deadline); + } + + return diff_bisect(text1, text2, deadline); + } + + /** + * Do a quick line-level diff on both strings, then rediff the parts for + * greater accuracy. + * This speedup can produce non-minimal diffs. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param deadline Time when the diff should be complete by. + * @return Linked List of Diff objects. + */ + private LinkedList diff_lineMode(String text1, String text2, + long deadline) { + // Scan the text on a line-by-line basis first. + LinesToCharsResult a = diff_linesToChars(text1, text2); + text1 = a.chars1; + text2 = a.chars2; + List linearray = a.lineArray; + + LinkedList diffs = diff_main(text1, text2, false, deadline); + + // Convert the diff back to original text. + diff_charsToLines(diffs, linearray); + // Eliminate freak matches (e.g. blank lines) + diff_cleanupSemantic(diffs); + + // Rediff any replacement blocks, this time character-by-character. + // Add a dummy entry at the end. 
+ diffs.add(new Diff(Operation.EQUAL, "")); + int count_delete = 0; + int count_insert = 0; + String text_delete = ""; + String text_insert = ""; + ListIterator pointer = diffs.listIterator(); + Diff thisDiff = pointer.next(); + while (thisDiff != null) { + switch (thisDiff.operation) { + case INSERT: + count_insert++; + text_insert += thisDiff.text; + break; + case DELETE: + count_delete++; + text_delete += thisDiff.text; + break; + case EQUAL: + // Upon reaching an equality, check for prior redundancies. + if (count_delete >= 1 && count_insert >= 1) { + // Delete the offending records and add the merged ones. + pointer.previous(); + for (int j = 0; j < count_delete + count_insert; j++) { + pointer.previous(); + pointer.remove(); + } + for (Diff subDiff : diff_main( + text_delete, text_insert, false, + deadline)) { + pointer.add(subDiff); + } + } + count_insert = 0; + count_delete = 0; + text_delete = ""; + text_insert = ""; + break; + } + thisDiff = pointer.hasNext() ? pointer.next() : null; + } + diffs.removeLast(); // Remove the dummy entry at the end. + + return diffs; + } + + /** + * Find the 'middle snake' of a diff, split the problem in two + * and return the recursively constructed diff. + * See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param deadline Time at which to bail if not yet complete. + * @return LinkedList of Diff objects. + */ + protected LinkedList diff_bisect(String text1, String text2, + long deadline) { + // Cache the text lengths to prevent multiple calls. 
+ int text1_length = text1.length(); + int text2_length = text2.length(); + int max_d = (text1_length + text2_length + 1) / 2; + int v_offset = max_d; + int v_length = 2 * max_d; + int[] v1 = new int[v_length]; + int[] v2 = new int[v_length]; + for (int x = 0; x < v_length; x++) { + v1[x] = -1; + v2[x] = -1; + } + v1[v_offset + 1] = 0; + v2[v_offset + 1] = 0; + int delta = text1_length - text2_length; + // If the total number of characters is odd, then the front path will + // collide with the reverse path. + boolean front = (delta % 2 != 0); + // Offsets for start and end of k loop. + // Prevents mapping of space beyond the grid. + int k1start = 0; + int k1end = 0; + int k2start = 0; + int k2end = 0; + for (int d = 0; d < max_d; d++) { + // Bail out if deadline is reached. + if (System.currentTimeMillis() > deadline) { + break; + } + + // Walk the front path one step. + for (int k1 = -d + k1start; k1 <= d - k1end; k1 += 2) { + int k1_offset = v_offset + k1; + int x1; + if (k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1])) { + x1 = v1[k1_offset + 1]; + } else { + x1 = v1[k1_offset - 1] + 1; + } + int y1 = x1 - k1; + while (x1 < text1_length && y1 < text2_length + && text1.charAt(x1) == text2.charAt(y1)) { + x1++; + y1++; + } + v1[k1_offset] = x1; + if (x1 > text1_length) { + // Ran off the right of the graph. + k1end += 2; + } else if (y1 > text2_length) { + // Ran off the bottom of the graph. + k1start += 2; + } else if (front) { + int k2_offset = v_offset + delta - k1; + if (k2_offset >= 0 && k2_offset < v_length && v2[k2_offset] != -1) { + // Mirror x2 onto top-left coordinate system. + int x2 = text1_length - v2[k2_offset]; + if (x1 >= x2) { + // Overlap detected. + return diff_bisectSplit(text1, text2, x1, y1, deadline); + } + } + } + } + + // Walk the reverse path one step. 
+ for (int k2 = -d + k2start; k2 <= d - k2end; k2 += 2) { + int k2_offset = v_offset + k2; + int x2; + if (k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1])) { + x2 = v2[k2_offset + 1]; + } else { + x2 = v2[k2_offset - 1] + 1; + } + int y2 = x2 - k2; + while (x2 < text1_length && y2 < text2_length + && text1.charAt(text1_length - x2 - 1) == text2.charAt(text2_length - y2 - 1)) { + x2++; + y2++; + } + v2[k2_offset] = x2; + if (x2 > text1_length) { + // Ran off the left of the graph. + k2end += 2; + } else if (y2 > text2_length) { + // Ran off the top of the graph. + k2start += 2; + } else if (!front) { + int k1_offset = v_offset + delta - k2; + if (k1_offset >= 0 && k1_offset < v_length && v1[k1_offset] != -1) { + int x1 = v1[k1_offset]; + int y1 = v_offset + x1 - k1_offset; + // Mirror x2 onto top-left coordinate system. + x2 = text1_length - x2; + if (x1 >= x2) { + // Overlap detected. + return diff_bisectSplit(text1, text2, x1, y1, deadline); + } + } + } + } + } + // Diff took too long and hit the deadline or + // number of diffs equals number of characters, no commonality at all. + LinkedList diffs = new LinkedList(); + diffs.add(new Diff(Operation.DELETE, text1)); + diffs.add(new Diff(Operation.INSERT, text2)); + return diffs; + } + + /** + * Given the location of the 'middle snake', split the diff in two parts + * and recurse. + * @param text1 Old string to be diffed. + * @param text2 New string to be diffed. + * @param x Index of split point in text1. + * @param y Index of split point in text2. + * @param deadline Time at which to bail if not yet complete. + * @return LinkedList of Diff objects. + */ + private LinkedList diff_bisectSplit(String text1, String text2, + int x, int y, long deadline) { + String text1a = text1.substring(0, x); + String text2a = text2.substring(0, y); + String text1b = text1.substring(x); + String text2b = text2.substring(y); + + // Compute both diffs serially. 
+ LinkedList diffs = diff_main(text1a, text2a, false, deadline); + LinkedList diffsb = diff_main(text1b, text2b, false, deadline); + + diffs.addAll(diffsb); + return diffs; + } + + /** + * Split two texts into a list of strings. Reduce the texts to a string of + * hashes where each Unicode character represents one line. + * @param text1 First string. + * @param text2 Second string. + * @return An object containing the encoded text1, the encoded text2 and + * the List of unique strings. The zeroth element of the List of + * unique strings is intentionally blank. + */ + protected LinesToCharsResult diff_linesToChars(String text1, String text2) { + List lineArray = new ArrayList(); + Map lineHash = new HashMap(); + // e.g. linearray[4] == "Hello\n" + // e.g. linehash.get("Hello\n") == 4 + + // "\x00" is a valid character, but various debuggers don't like it. + // So we'll insert a junk entry to avoid generating a null character. + lineArray.add(""); + + // Allocate 2/3rds of the space for text1, the rest for text2. + String chars1 = diff_linesToCharsMunge(text1, lineArray, lineHash, 40000); + String chars2 = diff_linesToCharsMunge(text2, lineArray, lineHash, 65535); + return new LinesToCharsResult(chars1, chars2, lineArray); + } + + /** + * Split a text into a list of strings. Reduce the texts to a string of + * hashes where each Unicode character represents one line. + * @param text String to encode. + * @param lineArray List of unique strings. + * @param lineHash Map of strings to indices. + * @param maxLines Maximum length of lineArray. + * @return Encoded string. + */ + private String diff_linesToCharsMunge(String text, List lineArray, + Map lineHash, int maxLines) { + int lineStart = 0; + int lineEnd = -1; + String line; + StringBuilder chars = new StringBuilder(); + // Walk the text, pulling out a substring for each line. + // text.split('\n') would would temporarily double our memory footprint. 
+ // Modifying text would create many large strings to garbage collect. + while (lineEnd < text.length() - 1) { + lineEnd = text.indexOf('\n', lineStart); + if (lineEnd == -1) { + lineEnd = text.length() - 1; + } + line = text.substring(lineStart, lineEnd + 1); + + if (lineHash.containsKey(line)) { + chars.append(String.valueOf((char) (int) lineHash.get(line))); + } else { + if (lineArray.size() == maxLines) { + // Bail out at 65535 because + // String.valueOf((char) 65536).equals(String.valueOf(((char) 0))) + line = text.substring(lineStart); + lineEnd = text.length(); + } + lineArray.add(line); + lineHash.put(line, lineArray.size() - 1); + chars.append(String.valueOf((char) (lineArray.size() - 1))); + } + lineStart = lineEnd + 1; + } + return chars.toString(); + } + + /** + * Rehydrate the text in a diff from a string of line hashes to real lines of + * text. + * @param diffs List of Diff objects. + * @param lineArray List of unique strings. + */ + protected void diff_charsToLines(List diffs, + List lineArray) { + StringBuilder text; + for (Diff diff : diffs) { + text = new StringBuilder(); + for (int j = 0; j < diff.text.length(); j++) { + text.append(lineArray.get(diff.text.charAt(j))); + } + diff.text = text.toString(); + } + } + + /** + * Determine the common prefix of two strings + * @param text1 First string. + * @param text2 Second string. + * @return The number of characters common to the start of each string. + */ + public int diff_commonPrefix(String text1, String text2) { + // Performance analysis: https://neil.fraser.name/news/2007/10/09/ + int n = Math.min(text1.length(), text2.length()); + for (int i = 0; i < n; i++) { + if (text1.charAt(i) != text2.charAt(i)) { + return i; + } + } + return n; + } + + /** + * Determine the common suffix of two strings + * @param text1 First string. + * @param text2 Second string. + * @return The number of characters common to the end of each string. 
+ */ + public int diff_commonSuffix(String text1, String text2) { + // Performance analysis: https://neil.fraser.name/news/2007/10/09/ + int text1_length = text1.length(); + int text2_length = text2.length(); + int n = Math.min(text1_length, text2_length); + for (int i = 1; i <= n; i++) { + if (text1.charAt(text1_length - i) != text2.charAt(text2_length - i)) { + return i - 1; + } + } + return n; + } + + /** + * Determine if the suffix of one string is the prefix of another. + * @param text1 First string. + * @param text2 Second string. + * @return The number of characters common to the end of the first + * string and the start of the second string. + */ + protected int diff_commonOverlap(String text1, String text2) { + // Cache the text lengths to prevent multiple calls. + int text1_length = text1.length(); + int text2_length = text2.length(); + // Eliminate the null case. + if (text1_length == 0 || text2_length == 0) { + return 0; + } + // Truncate the longer string. + if (text1_length > text2_length) { + text1 = text1.substring(text1_length - text2_length); + } else if (text1_length < text2_length) { + text2 = text2.substring(0, text1_length); + } + int text_length = Math.min(text1_length, text2_length); + // Quick check for the worst case. + if (text1.equals(text2)) { + return text_length; + } + + // Start by looking for a single character match + // and increase length until no match is found. + // Performance analysis: https://neil.fraser.name/news/2010/11/04/ + int best = 0; + int length = 1; + while (true) { + String pattern = text1.substring(text_length - length); + int found = text2.indexOf(pattern); + if (found == -1) { + return best; + } + length += found; + if (found == 0 || text1 + .substring(text_length - length) + .equals( + text2.substring(0, length))) { + best = length; + length++; + } + } + } + + /** + * Do the two texts share a substring which is at least half the length of + * the longer text? + * This speedup can produce non-minimal diffs. 
+ * @param text1 First string. + * @param text2 Second string. + * @return Five element String array, containing the prefix of text1, the + * suffix of text1, the prefix of text2, the suffix of text2 and the + * common middle. Or null if there was no match. + */ + protected String[] diff_halfMatch(String text1, String text2) { + if (Diff_Timeout <= 0) { + // Don't risk returning a non-optimal diff if we have unlimited time. + return null; + } + String longtext = text1.length() > text2.length() ? text1 : text2; + String shorttext = text1.length() > text2.length() ? text2 : text1; + if (longtext.length() < 4 || shorttext.length() * 2 < longtext.length()) { + return null; // Pointless. + } + + // First check if the second quarter is the seed for a half-match. + String[] hm1 = diff_halfMatchI( + longtext, shorttext, + (longtext.length() + 3) / 4); + // Check again based on the third quarter. + String[] hm2 = diff_halfMatchI( + longtext, shorttext, + (longtext.length() + 1) / 2); + String[] hm; + if (hm1 == null && hm2 == null) { + return null; + } else if (hm2 == null) { + hm = hm1; + } else if (hm1 == null) { + hm = hm2; + } else { + // Both matched. Select the longest. + hm = hm1[4].length() > hm2[4].length() ? hm1 : hm2; + } + + // A half-match was found, sort out the return data. + if (text1.length() > text2.length()) { + return hm; + // return new String[]{hm[0], hm[1], hm[2], hm[3], hm[4]}; + } else { + return new String[] { + hm[2], hm[3], hm[0], hm[1], hm[4] + }; + } + } + + /** + * Does a substring of shorttext exist within longtext such that the + * substring is at least half the length of longtext? + * @param longtext Longer string. + * @param shorttext Shorter string. + * @param i Start index of quarter length substring within longtext. + * @return Five element String array, containing the prefix of longtext, the + * suffix of longtext, the prefix of shorttext, the suffix of shorttext + * and the common middle. Or null if there was no match. 
+ */ + private String[] diff_halfMatchI(String longtext, String shorttext, int i) { + // Start with a 1/4 length substring at position i as a seed. + String seed = longtext.substring(i, i + longtext.length() / 4); + int j = -1; + String best_common = ""; + String best_longtext_a = "", best_longtext_b = ""; + String best_shorttext_a = "", best_shorttext_b = ""; + while ((j = shorttext.indexOf(seed, j + 1)) != -1) { + int prefixLength = diff_commonPrefix( + longtext.substring(i), + shorttext.substring(j)); + int suffixLength = diff_commonSuffix( + longtext.substring(0, i), + shorttext.substring(0, j)); + if (best_common.length() < suffixLength + prefixLength) { + best_common = shorttext.substring(j - suffixLength, j) + + shorttext.substring(j, j + prefixLength); + best_longtext_a = longtext.substring(0, i - suffixLength); + best_longtext_b = longtext.substring(i + prefixLength); + best_shorttext_a = shorttext.substring(0, j - suffixLength); + best_shorttext_b = shorttext.substring(j + prefixLength); + } + } + if (best_common.length() * 2 >= longtext.length()) { + return new String[] { + best_longtext_a, best_longtext_b, + best_shorttext_a, best_shorttext_b, best_common + }; + } else { + return null; + } + } + + /** + * Reduce the number of edits by eliminating semantically trivial equalities. + * @param diffs LinkedList of Diff objects. + */ + public void diff_cleanupSemantic(LinkedList diffs) { + if (diffs.isEmpty()) { + return; + } + boolean changes = false; + Deque equalities = new ArrayDeque(); // Double-ended queue of qualities. + String lastEquality = null; // Always equal to equalities.peek().text + ListIterator pointer = diffs.listIterator(); + // Number of characters that changed prior to the equality. + int length_insertions1 = 0; + int length_deletions1 = 0; + // Number of characters that changed after the equality. 
+ int length_insertions2 = 0; + int length_deletions2 = 0; + Diff thisDiff = pointer.next(); + while (thisDiff != null) { + if (thisDiff.operation == Operation.EQUAL) { + // Equality found. + equalities.push(thisDiff); + length_insertions1 = length_insertions2; + length_deletions1 = length_deletions2; + length_insertions2 = 0; + length_deletions2 = 0; + lastEquality = thisDiff.text; + } else { + // An insertion or deletion. + if (thisDiff.operation == Operation.INSERT) { + length_insertions2 += thisDiff.text.length(); + } else { + length_deletions2 += thisDiff.text.length(); + } + // Eliminate an equality that is smaller or equal to the edits on both + // sides of it. + if (lastEquality != null && (lastEquality.length() <= Math.max(length_insertions1, length_deletions1)) + && (lastEquality.length() <= Math.max(length_insertions2, length_deletions2))) { + // System.out.println("Splitting: '" + lastEquality + "'"); + // Walk back to offending equality. + while (thisDiff != equalities.peek()) { + thisDiff = pointer.previous(); + } + pointer.next(); + + // Replace equality with a delete. + pointer.set(new Diff(Operation.DELETE, lastEquality)); + // Insert a corresponding an insert. + pointer.add(new Diff(Operation.INSERT, lastEquality)); + + equalities.pop(); // Throw away the equality we just deleted. + if (!equalities.isEmpty()) { + // Throw away the previous equality (it needs to be reevaluated). + equalities.pop(); + } + if (equalities.isEmpty()) { + // There are no previous equalities, walk back to the start. + while (pointer.hasPrevious()) { + pointer.previous(); + } + } else { + // There is a safe equality we can fall back to. + thisDiff = equalities.peek(); + while (thisDiff != pointer.previous()) { + // Intentionally empty loop. + } + } + + length_insertions1 = 0; // Reset the counters. + length_insertions2 = 0; + length_deletions1 = 0; + length_deletions2 = 0; + lastEquality = null; + changes = true; + } + } + thisDiff = pointer.hasNext() ? 
pointer.next() : null; + } + + // Normalize the diff. + if (changes) { + diff_cleanupMerge(diffs); + } + diff_cleanupSemanticLossless(diffs); + + // Find any overlaps between deletions and insertions. + // e.g: abcxxxxxxdef + // -> abcxxxdef + // e.g: xxxabcdefxxx + // -> defxxxabc + // Only extract an overlap if it is as big as the edit ahead or behind it. + pointer = diffs.listIterator(); + Diff prevDiff = null; + thisDiff = null; + if (pointer.hasNext()) { + prevDiff = pointer.next(); + if (pointer.hasNext()) { + thisDiff = pointer.next(); + } + } + while (thisDiff != null) { + if (prevDiff.operation == Operation.DELETE && + thisDiff.operation == Operation.INSERT) { + String deletion = prevDiff.text; + String insertion = thisDiff.text; + int overlap_length1 = this.diff_commonOverlap(deletion, insertion); + int overlap_length2 = this.diff_commonOverlap(insertion, deletion); + if (overlap_length1 >= overlap_length2) { + if (overlap_length1 >= deletion.length() / 2.0 || + overlap_length1 >= insertion.length() / 2.0) { + // Overlap found. Insert an equality and trim the surrounding edits. + pointer.previous(); + pointer + .add( + new Diff(Operation.EQUAL, + insertion.substring(0, overlap_length1))); + prevDiff.text = deletion.substring(0, deletion.length() - overlap_length1); + thisDiff.text = insertion.substring(overlap_length1); + // pointer.add inserts the element before the cursor, so there is + // no need to step past the new element. + } + } else { + if (overlap_length2 >= deletion.length() / 2.0 || + overlap_length2 >= insertion.length() / 2.0) { + // Reverse overlap found. + // Insert an equality and swap and trim the surrounding edits. 
+ pointer.previous(); + pointer + .add( + new Diff(Operation.EQUAL, + deletion.substring(0, overlap_length2))); + prevDiff.operation = Operation.INSERT; + prevDiff.text = insertion.substring(0, insertion.length() - overlap_length2); + thisDiff.operation = Operation.DELETE; + thisDiff.text = deletion.substring(overlap_length2); + // pointer.add inserts the element before the cursor, so there is + // no need to step past the new element. + } + } + thisDiff = pointer.hasNext() ? pointer.next() : null; + } + prevDiff = thisDiff; + thisDiff = pointer.hasNext() ? pointer.next() : null; + } + } + + /** + * Look for single edits surrounded on both sides by equalities + * which can be shifted sideways to align the edit to a word boundary. + * e.g: The cat came. -> The cat came. + * @param diffs LinkedList of Diff objects. + */ + public void diff_cleanupSemanticLossless(LinkedList diffs) { + String equality1, edit, equality2; + String commonString; + int commonOffset; + int score, bestScore; + String bestEquality1, bestEdit, bestEquality2; + // Create a new iterator at the start. + ListIterator pointer = diffs.listIterator(); + Diff prevDiff = pointer.hasNext() ? pointer.next() : null; + Diff thisDiff = pointer.hasNext() ? pointer.next() : null; + Diff nextDiff = pointer.hasNext() ? pointer.next() : null; + // Intentionally ignore the first and last element (don't need checking). + while (nextDiff != null) { + if (prevDiff.operation == Operation.EQUAL && + nextDiff.operation == Operation.EQUAL) { + // This is a single edit surrounded by equalities. + equality1 = prevDiff.text; + edit = thisDiff.text; + equality2 = nextDiff.text; + + // First, shift the edit as far left as possible. 
+ commonOffset = diff_commonSuffix(equality1, edit); + if (commonOffset != 0) { + commonString = edit.substring(edit.length() - commonOffset); + equality1 = equality1.substring(0, equality1.length() - commonOffset); + edit = commonString + edit.substring(0, edit.length() - commonOffset); + equality2 = commonString + equality2; + } + + // Second, step character by character right, looking for the best fit. + bestEquality1 = equality1; + bestEdit = edit; + bestEquality2 = equality2; + bestScore = diff_cleanupSemanticScore(equality1, edit) + + diff_cleanupSemanticScore(edit, equality2); + while (edit.length() != 0 && equality2.length() != 0 + && edit.charAt(0) == equality2.charAt(0)) { + equality1 += edit.charAt(0); + edit = edit.substring(1) + equality2.charAt(0); + equality2 = equality2.substring(1); + score = diff_cleanupSemanticScore(equality1, edit) + + diff_cleanupSemanticScore(edit, equality2); + // The >= encourages trailing rather than leading whitespace on edits. + if (score >= bestScore) { + bestScore = score; + bestEquality1 = equality1; + bestEdit = edit; + bestEquality2 = equality2; + } + } + + if (!prevDiff.text.equals(bestEquality1)) { + // We have an improvement, save it back to the diff. + if (bestEquality1.length() != 0) { + prevDiff.text = bestEquality1; + } else { + pointer.previous(); // Walk past nextDiff. + pointer.previous(); // Walk past thisDiff. + pointer.previous(); // Walk past prevDiff. + pointer.remove(); // Delete prevDiff. + pointer.next(); // Walk past thisDiff. + pointer.next(); // Walk past nextDiff. + } + thisDiff.text = bestEdit; + if (bestEquality2.length() != 0) { + nextDiff.text = bestEquality2; + } else { + pointer.remove(); // Delete nextDiff. + nextDiff = thisDiff; + thisDiff = prevDiff; + } + } + } + prevDiff = thisDiff; + thisDiff = nextDiff; + nextDiff = pointer.hasNext() ? 
pointer.next() : null; + } + } + + /** + * Given two strings, compute a score representing whether the internal + * boundary falls on logical boundaries. + * Scores range from 6 (best) to 0 (worst). + * @param one First string. + * @param two Second string. + * @return The score. + */ + private int diff_cleanupSemanticScore(String one, String two) { + if (one.length() == 0 || two.length() == 0) { + // Edges are the best. + return 6; + } + + // Each port of this function behaves slightly differently due to + // subtle differences in each language's definition of things like + // 'whitespace'. Since this function's purpose is largely cosmetic, + // the choice has been made to use each language's native features + // rather than force total conformity. + char char1 = one.charAt(one.length() - 1); + char char2 = two.charAt(0); + boolean nonAlphaNumeric1 = !Character.isLetterOrDigit(char1); + boolean nonAlphaNumeric2 = !Character.isLetterOrDigit(char2); + boolean whitespace1 = nonAlphaNumeric1 && Character.isWhitespace(char1); + boolean whitespace2 = nonAlphaNumeric2 && Character.isWhitespace(char2); + boolean lineBreak1 = whitespace1 + && Character.getType(char1) == Character.CONTROL; + boolean lineBreak2 = whitespace2 + && Character.getType(char2) == Character.CONTROL; + boolean blankLine1 = lineBreak1 && BLANKLINEEND.matcher(one).find(); + boolean blankLine2 = lineBreak2 && BLANKLINESTART.matcher(two).find(); + + if (blankLine1 || blankLine2) { + // Five points for blank lines. + return 5; + } else if (lineBreak1 || lineBreak2) { + // Four points for line breaks. + return 4; + } else if (nonAlphaNumeric1 && !whitespace1 && whitespace2) { + // Three points for end of sentences. + return 3; + } else if (whitespace1 || whitespace2) { + // Two points for whitespace. + return 2; + } else if (nonAlphaNumeric1 || nonAlphaNumeric2) { + // One point for non-alphanumeric. + return 1; + } + return 0; + } + + // Define some regex patterns for matching boundaries. 
+ private Pattern BLANKLINEEND = Pattern.compile("\\n\\r?\\n\\Z", Pattern.DOTALL); + private Pattern BLANKLINESTART = Pattern.compile("\\A\\r?\\n\\r?\\n", Pattern.DOTALL); + + /** + * Reduce the number of edits by eliminating operationally trivial equalities. + * @param diffs LinkedList of Diff objects. + */ + public void diff_cleanupEfficiency(LinkedList diffs) { + if (diffs.isEmpty()) { + return; + } + boolean changes = false; + Deque equalities = new ArrayDeque(); // Double-ended queue of equalities. + String lastEquality = null; // Always equal to equalities.peek().text + ListIterator pointer = diffs.listIterator(); + // Is there an insertion operation before the last equality. + boolean pre_ins = false; + // Is there a deletion operation before the last equality. + boolean pre_del = false; + // Is there an insertion operation after the last equality. + boolean post_ins = false; + // Is there a deletion operation after the last equality. + boolean post_del = false; + Diff thisDiff = pointer.next(); + Diff safeDiff = thisDiff; // The last Diff that is known to be unsplittable. + while (thisDiff != null) { + if (thisDiff.operation == Operation.EQUAL) { + // Equality found. + if (thisDiff.text.length() < Diff_EditCost && (post_ins || post_del)) { + // Candidate found. + equalities.push(thisDiff); + pre_ins = post_ins; + pre_del = post_del; + lastEquality = thisDiff.text; + } else { + // Not a candidate, and can never become one. + equalities.clear(); + lastEquality = null; + safeDiff = thisDiff; + } + post_ins = post_del = false; + } else { + // An insertion or deletion. + if (thisDiff.operation == Operation.DELETE) { + post_del = true; + } else { + post_ins = true; + } + /* + * Five types to be split: ABXYCD + * AXCD ABXC + * AXCD ABXC + */ + if (lastEquality != null + && ((pre_ins && pre_del && post_ins && post_del) + || ((lastEquality.length() < Diff_EditCost / 2) + && ((pre_ins ? 1 : 0) + (pre_del ? 1 : 0) + + (post_ins ? 1 : 0) + (post_del ? 
1 : 0)) == 3))) { + // System.out.println("Splitting: '" + lastEquality + "'"); + // Walk back to offending equality. + while (thisDiff != equalities.peek()) { + thisDiff = pointer.previous(); + } + pointer.next(); + + // Replace equality with a delete. + pointer.set(new Diff(Operation.DELETE, lastEquality)); + // Insert a corresponding an insert. + pointer.add(thisDiff = new Diff(Operation.INSERT, lastEquality)); + + equalities.pop(); // Throw away the equality we just deleted. + lastEquality = null; + if (pre_ins && pre_del) { + // No changes made which could affect previous entry, keep going. + post_ins = post_del = true; + equalities.clear(); + safeDiff = thisDiff; + } else { + if (!equalities.isEmpty()) { + // Throw away the previous equality (it needs to be reevaluated). + equalities.pop(); + } + if (equalities.isEmpty()) { + // There are no previous questionable equalities, + // walk back to the last known safe diff. + thisDiff = safeDiff; + } else { + // There is an equality we can fall back to. + thisDiff = equalities.peek(); + } + while (thisDiff != pointer.previous()) { + // Intentionally empty loop. + } + post_ins = post_del = false; + } + + changes = true; + } + } + thisDiff = pointer.hasNext() ? pointer.next() : null; + } + + if (changes) { + diff_cleanupMerge(diffs); + } + } + + /** + * Reorder and merge like edit sections. Merge equalities. + * Any edit section can move as long as it doesn't cross an equality. + * @param diffs LinkedList of Diff objects. + */ + public void diff_cleanupMerge(LinkedList diffs) { + diffs.add(new Diff(Operation.EQUAL, "")); // Add a dummy entry at the end. 
+ ListIterator pointer = diffs.listIterator(); + int count_delete = 0; + int count_insert = 0; + String text_delete = ""; + String text_insert = ""; + Diff thisDiff = pointer.next(); + Diff prevEqual = null; + int commonlength; + while (thisDiff != null) { + switch (thisDiff.operation) { + case INSERT: + count_insert++; + text_insert += thisDiff.text; + prevEqual = null; + break; + case DELETE: + count_delete++; + text_delete += thisDiff.text; + prevEqual = null; + break; + case EQUAL: + if (count_delete + count_insert > 1) { + boolean both_types = count_delete != 0 && count_insert != 0; + // Delete the offending records. + pointer.previous(); // Reverse direction. + while (count_delete-- > 0) { + pointer.previous(); + pointer.remove(); + } + while (count_insert-- > 0) { + pointer.previous(); + pointer.remove(); + } + if (both_types) { + // Factor out any common prefixies. + commonlength = diff_commonPrefix(text_insert, text_delete); + if (commonlength != 0) { + if (pointer.hasPrevious()) { + thisDiff = pointer.previous(); + assert thisDiff.operation == Operation.EQUAL : "Previous diff should have been an equality."; + thisDiff.text += text_insert.substring(0, commonlength); + pointer.next(); + } else { + pointer + .add( + new Diff(Operation.EQUAL, + text_insert.substring(0, commonlength))); + } + text_insert = text_insert.substring(commonlength); + text_delete = text_delete.substring(commonlength); + } + // Factor out any common suffixies. + commonlength = diff_commonSuffix(text_insert, text_delete); + if (commonlength != 0) { + thisDiff = pointer.next(); + thisDiff.text = text_insert + .substring( + text_insert.length() + - commonlength) + + thisDiff.text; + text_insert = text_insert + .substring( + 0, text_insert.length() + - commonlength); + text_delete = text_delete + .substring( + 0, text_delete.length() + - commonlength); + pointer.previous(); + } + } + // Insert the merged records. 
+ if (text_delete.length() != 0) { + pointer.add(new Diff(Operation.DELETE, text_delete)); + } + if (text_insert.length() != 0) { + pointer.add(new Diff(Operation.INSERT, text_insert)); + } + // Step forward to the equality. + thisDiff = pointer.hasNext() ? pointer.next() : null; + } else if (prevEqual != null) { + // Merge this equality with the previous one. + prevEqual.text += thisDiff.text; + pointer.remove(); + thisDiff = pointer.previous(); + pointer.next(); // Forward direction + } + count_insert = 0; + count_delete = 0; + text_delete = ""; + text_insert = ""; + prevEqual = thisDiff; + break; + } + thisDiff = pointer.hasNext() ? pointer.next() : null; + } + if (diffs.getLast().text.length() == 0) { + diffs.removeLast(); // Remove the dummy entry at the end. + } + + /* + * Second pass: look for single edits surrounded on both sides by equalities which can be shifted sideways to + * eliminate an equality. e.g: ABAC -> ABAC + */ + boolean changes = false; + // Create a new iterator at the start. + // (As opposed to walking the current one back.) + pointer = diffs.listIterator(); + Diff prevDiff = pointer.hasNext() ? pointer.next() : null; + thisDiff = pointer.hasNext() ? pointer.next() : null; + Diff nextDiff = pointer.hasNext() ? pointer.next() : null; + // Intentionally ignore the first and last element (don't need checking). + while (nextDiff != null) { + if (prevDiff.operation == Operation.EQUAL && + nextDiff.operation == Operation.EQUAL) { + // This is a single edit surrounded by equalities. + if (thisDiff.text.endsWith(prevDiff.text)) { + // Shift the edit over the previous equality. + thisDiff.text = prevDiff.text + + thisDiff.text + .substring( + 0, thisDiff.text.length() + - prevDiff.text.length()); + nextDiff.text = prevDiff.text + nextDiff.text; + pointer.previous(); // Walk past nextDiff. + pointer.previous(); // Walk past thisDiff. + pointer.previous(); // Walk past prevDiff. + pointer.remove(); // Delete prevDiff. 
+ pointer.next(); // Walk past thisDiff. + thisDiff = pointer.next(); // Walk past nextDiff. + nextDiff = pointer.hasNext() ? pointer.next() : null; + changes = true; + } else if (thisDiff.text.startsWith(nextDiff.text)) { + // Shift the edit over the next equality. + prevDiff.text += nextDiff.text; + thisDiff.text = thisDiff.text.substring(nextDiff.text.length()) + + nextDiff.text; + pointer.remove(); // Delete nextDiff. + nextDiff = pointer.hasNext() ? pointer.next() : null; + changes = true; + } + } + prevDiff = thisDiff; + thisDiff = nextDiff; + nextDiff = pointer.hasNext() ? pointer.next() : null; + } + // If shifts were made, the diff needs reordering and another shift sweep. + if (changes) { + diff_cleanupMerge(diffs); + } + } + + /** + * loc is a location in text1, compute and return the equivalent location in + * text2. + * e.g. "The cat" vs "The big cat", 1->1, 5->8 + * @param diffs List of Diff objects. + * @param loc Location within text1. + * @return Location within text2. + */ + public int diff_xIndex(List diffs, int loc) { + int chars1 = 0; + int chars2 = 0; + int last_chars1 = 0; + int last_chars2 = 0; + Diff lastDiff = null; + for (Diff aDiff : diffs) { + if (aDiff.operation != Operation.INSERT) { + // Equality or deletion. + chars1 += aDiff.text.length(); + } + if (aDiff.operation != Operation.DELETE) { + // Equality or insertion. + chars2 += aDiff.text.length(); + } + if (chars1 > loc) { + // Overshot the location. + lastDiff = aDiff; + break; + } + last_chars1 = chars1; + last_chars2 = chars2; + } + if (lastDiff != null && lastDiff.operation == Operation.DELETE) { + // The location was deleted. + return last_chars2; + } + // Add the remaining character length. + return last_chars2 + (loc - last_chars1); + } + + /** + * Convert a Diff list into a pretty HTML report. + * @param diffs List of Diff objects. + * @return HTML representation. 
+ */ + public String diff_prettyHtml(List diffs) { + StringBuilder html = new StringBuilder(); + for (Diff aDiff : diffs) { + String text = aDiff.text + .replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace("\n", "¶
"); + switch (aDiff.operation) { + case INSERT: + html + .append("") + .append(text) + .append(""); + break; + case DELETE: + html + .append("") + .append(text) + .append(""); + break; + case EQUAL: + html.append("").append(text).append(""); + break; + } + } + return html.toString(); + } + + /** + * Compute and return the source text (all equalities and deletions). + * @param diffs List of Diff objects. + * @return Source text. + */ + public String diff_text1(List diffs) { + StringBuilder text = new StringBuilder(); + for (Diff aDiff : diffs) { + if (aDiff.operation != Operation.INSERT) { + text.append(aDiff.text); + } + } + return text.toString(); + } + + /** + * Compute and return the destination text (all equalities and insertions). + * @param diffs List of Diff objects. + * @return Destination text. + */ + public String diff_text2(List diffs) { + StringBuilder text = new StringBuilder(); + for (Diff aDiff : diffs) { + if (aDiff.operation != Operation.DELETE) { + text.append(aDiff.text); + } + } + return text.toString(); + } + + /** + * Compute the Levenshtein compare; the number of inserted, deleted or + * substituted characters. + * @param diffs List of Diff objects. + * @return Number of changes. + */ + public int diff_levenshtein(List diffs) { + int levenshtein = 0; + int insertions = 0; + int deletions = 0; + for (Diff aDiff : diffs) { + switch (aDiff.operation) { + case INSERT: + insertions += aDiff.text.length(); + break; + case DELETE: + deletions += aDiff.text.length(); + break; + case EQUAL: + // A deletion and an insertion is one substitution. + levenshtein += Math.max(insertions, deletions); + insertions = 0; + deletions = 0; + break; + } + } + levenshtein += Math.max(insertions, deletions); + return levenshtein; + } + + /** + * Crush the diff into an encoded string which describes the operations + * required to transform text1 into text2. + * E.g. =3\t-2\t+ing -> Keep 3 chars, delete 2 chars, insert 'ing'. + * Operations are tab-separated. 
Inserted text is escaped using %xx notation. + * @param diffs List of Diff objects. + * @return Delta text. + */ + public String diff_toDelta(List diffs) { + StringBuilder text = new StringBuilder(); + for (Diff aDiff : diffs) { + switch (aDiff.operation) { + case INSERT: + try { + text + .append("+") + .append( + URLEncoder + .encode(aDiff.text, "UTF-8") + .replace('+', ' ')) + .append("\t"); + } catch (UnsupportedEncodingException e) { + // Not likely on modern system. + throw new Error("This system does not support UTF-8.", e); + } + break; + case DELETE: + text.append("-").append(aDiff.text.length()).append("\t"); + break; + case EQUAL: + text.append("=").append(aDiff.text.length()).append("\t"); + break; + } + } + String delta = text.toString(); + if (delta.length() != 0) { + // Strip off trailing tab character. + delta = delta.substring(0, delta.length() - 1); + delta = unescapeForEncodeUriCompatability(delta); + } + return delta; + } + + /** + * Given the original text1, and an encoded string which describes the + * operations required to transform text1 into text2, compute the full diff. + * @param text1 Source string for the diff. + * @param delta Delta text. + * @return Array of Diff objects or null if invalid. + * @throws IllegalArgumentException If invalid input. + */ + public LinkedList diff_fromDelta(String text1, String delta) + throws IllegalArgumentException { + LinkedList diffs = new LinkedList(); + int pointer = 0; // Cursor in text1 + String[] tokens = delta.split("\t"); + for (String token : tokens) { + if (token.length() == 0) { + // Blank tokens are ok (from a trailing \t). + continue; + } + // Each token begins with a one character parameter which specifies the + // operation of this token (delete, insert, equality). 
+ String param = token.substring(1); + switch (token.charAt(0)) { + case '+': + // decode would change all "+" to " " + param = param.replace("+", "%2B"); + try { + param = URLDecoder.decode(param, "UTF-8"); + } catch (UnsupportedEncodingException e) { + // Not likely on modern system. + throw new Error("This system does not support UTF-8.", e); + } catch (IllegalArgumentException e) { + // Malformed URI sequence. + throw new IllegalArgumentException( + "Illegal escape in diff_fromDelta: " + param, e); + } + diffs.add(new Diff(Operation.INSERT, param)); + break; + case '-': + // Fall through. + case '=': + int n; + try { + n = Integer.parseInt(param); + } catch (NumberFormatException e) { + throw new IllegalArgumentException( + "Invalid number in diff_fromDelta: " + param, e); + } + if (n < 0) { + throw new IllegalArgumentException( + "Negative number in diff_fromDelta: " + param); + } + String text; + try { + text = text1.substring(pointer, pointer += n); + } catch (StringIndexOutOfBoundsException e) { + throw new IllegalArgumentException("Delta length (" + pointer + + ") larger than source text length (" + text1.length() + + ").", e); + } + if (token.charAt(0) == '=') { + diffs.add(new Diff(Operation.EQUAL, text)); + } else { + diffs.add(new Diff(Operation.DELETE, text)); + } + break; + default: + // Anything else is an error. + throw new IllegalArgumentException( + "Invalid diff operation in diff_fromDelta: " + token.charAt(0)); + } + } + if (pointer != text1.length()) { + throw new IllegalArgumentException("Delta length (" + pointer + + ") smaller than source text length (" + text1.length() + ")."); + } + return diffs; + } + + // MATCH FUNCTIONS + + /** + * Locate the best instance of 'pattern' in 'text' near 'loc'. + * Returns -1 if no match found. + * @param text The text to search. + * @param pattern The pattern to search for. + * @param loc The location to search around. + * @return Best match index or -1. 
+ */ + public int match_main(String text, String pattern, int loc) { + // Check for null inputs. + if (text == null || pattern == null) { + throw new IllegalArgumentException("Null inputs. (match_main)"); + } + + loc = Math.max(0, Math.min(loc, text.length())); + if (text.equals(pattern)) { + // Shortcut (potentially not guaranteed by the algorithm) + return 0; + } else if (text.length() == 0) { + // Nothing to match. + return -1; + } else if (loc + pattern.length() <= text.length() + && text.substring(loc, loc + pattern.length()).equals(pattern)) { + // Perfect match at the perfect spot! (Includes case of null pattern) + return loc; + } else { + // Do a fuzzy compare. + return match_bitap(text, pattern, loc); + } + } + + /** + * Locate the best instance of 'pattern' in 'text' near 'loc' using the + * Bitap algorithm. Returns -1 if no match found. + * @param text The text to search. + * @param pattern The pattern to search for. + * @param loc The location to search around. + * @return Best match index or -1. + */ + protected int match_bitap(String text, String pattern, int loc) { + assert (Match_MaxBits == 0 || pattern.length() <= Match_MaxBits) : "Pattern too long for this application."; + + // Initialise the alphabet. + Map s = match_alphabet(pattern); + + // Highest score beyond which we give up. + double score_threshold = Match_Threshold; + // Is there a nearby exact match? (speedup) + int best_loc = text.indexOf(pattern, loc); + if (best_loc != -1) { + score_threshold = Math + .min( + match_bitapScore(0, best_loc, loc, pattern), + score_threshold); + // What about in the other direction? (speedup) + best_loc = text.lastIndexOf(pattern, loc + pattern.length()); + if (best_loc != -1) { + score_threshold = Math + .min( + match_bitapScore(0, best_loc, loc, pattern), + score_threshold); + } + } + + // Initialise the bit arrays. 
+ int matchmask = 1 << (pattern.length() - 1); + best_loc = -1; + + int bin_min, bin_mid; + int bin_max = pattern.length() + text.length(); + // Empty initialization added to appease Java compiler. + int[] last_rd = new int[0]; + for (int d = 0; d < pattern.length(); d++) { + // Scan for the best match; each iteration allows for one more error. + // Run a binary search to determine how far from 'loc' we can stray at + // this error level. + bin_min = 0; + bin_mid = bin_max; + while (bin_min < bin_mid) { + if (match_bitapScore(d, loc + bin_mid, loc, pattern) <= score_threshold) { + bin_min = bin_mid; + } else { + bin_max = bin_mid; + } + bin_mid = (bin_max - bin_min) / 2 + bin_min; + } + // Use the result from this iteration as the maximum for the next. + bin_max = bin_mid; + int start = Math.max(1, loc - bin_mid + 1); + int finish = Math.min(loc + bin_mid, text.length()) + pattern.length(); + + int[] rd = new int[finish + 2]; + rd[finish + 1] = (1 << d) - 1; + for (int j = finish; j >= start; j--) { + int charMatch; + if (text.length() <= j - 1 || !s.containsKey(text.charAt(j - 1))) { + // Out of range. + charMatch = 0; + } else { + charMatch = s.get(text.charAt(j - 1)); + } + if (d == 0) { + // First pass: exact match. + rd[j] = ((rd[j + 1] << 1) | 1) & charMatch; + } else { + // Subsequent passes: fuzzy match. + rd[j] = (((rd[j + 1] << 1) | 1) & charMatch) + | (((last_rd[j + 1] | last_rd[j]) << 1) | 1) | last_rd[j + 1]; + } + if ((rd[j] & matchmask) != 0) { + double score = match_bitapScore(d, j - 1, loc, pattern); + // This match will almost certainly be better than any existing + // match. But check anyway. + if (score <= score_threshold) { + // Told you so. + score_threshold = score; + best_loc = j - 1; + if (best_loc > loc) { + // When passing loc, don't exceed our current compare from loc. + start = Math.max(1, 2 * loc - best_loc); + } else { + // Already passed loc, downhill from here on in. 
+ break; + } + } + } + } + if (match_bitapScore(d + 1, loc, loc, pattern) > score_threshold) { + // No hope for a (better) match at greater error levels. + break; + } + last_rd = rd; + } + return best_loc; + } + + /** + * Compute and return the score for a match with e errors and x location. + * @param e Number of errors in match. + * @param x Location of match. + * @param loc Expected location of match. + * @param pattern Pattern being sought. + * @return Overall score for match (0.0 = good, 1.0 = bad). + */ + private double match_bitapScore(int e, int x, int loc, String pattern) { + float accuracy = (float) e / pattern.length(); + int proximity = Math.abs(loc - x); + if (Match_Distance == 0) { + // Dodge divide by zero error. + return proximity == 0 ? accuracy : 1.0; + } + return accuracy + (proximity / (float) Match_Distance); + } + + /** + * Initialise the alphabet for the Bitap algorithm. + * @param pattern The text to encode. + * @return Hash of character locations. + */ + protected Map match_alphabet(String pattern) { + Map s = new HashMap(); + char[] char_pattern = pattern.toCharArray(); + for (char c : char_pattern) { + s.put(c, 0); + } + int i = 0; + for (char c : char_pattern) { + s.put(c, s.get(c) | (1 << (pattern.length() - i - 1))); + i++; + } + return s; + } + + // PATCH FUNCTIONS + + /** + * Increase the context until it is unique, + * but don't let the pattern expand beyond Match_MaxBits. + * @param patch The patch to grow. + * @param text Source text. + */ + protected void patch_addContext(Patch patch, String text) { + if (text.length() == 0) { + return; + } + String pattern = text.substring(patch.start2, patch.start2 + patch.length1); + int padding = 0; + + // Look for the first and last matches of pattern in text. If two different + // matches are found, increase the pattern length. 
+ while (text.indexOf(pattern) != text.lastIndexOf(pattern) + && pattern.length() < Match_MaxBits - Patch_Margin - Patch_Margin) { + padding += Patch_Margin; + pattern = text + .substring( + Math.max(0, patch.start2 - padding), + Math.min(text.length(), patch.start2 + patch.length1 + padding)); + } + // Add one chunk for good luck. + padding += Patch_Margin; + + // Add the prefix. + String prefix = text + .substring( + Math.max(0, patch.start2 - padding), + patch.start2); + if (prefix.length() != 0) { + patch.diffs.addFirst(new Diff(Operation.EQUAL, prefix)); + } + // Add the suffix. + String suffix = text + .substring( + patch.start2 + patch.length1, + Math.min(text.length(), patch.start2 + patch.length1 + padding)); + if (suffix.length() != 0) { + patch.diffs.addLast(new Diff(Operation.EQUAL, suffix)); + } + + // Roll back the start points. + patch.start1 -= prefix.length(); + patch.start2 -= prefix.length(); + // Extend the lengths. + patch.length1 += prefix.length() + suffix.length(); + patch.length2 += prefix.length() + suffix.length(); + } + + /** + * Compute a list of patches to turn text1 into text2. + * A set of diffs will be computed. + * @param text1 Old text. + * @param text2 New text. + * @return LinkedList of Patch objects. + */ + public LinkedList patch_make(String text1, String text2) { + if (text1 == null || text2 == null) { + throw new IllegalArgumentException("Null inputs. (patch_make)"); + } + // No diffs provided, compute our own. + LinkedList diffs = diff_main(text1, text2, true); + if (diffs.size() > 2) { + diff_cleanupSemantic(diffs); + diff_cleanupEfficiency(diffs); + } + return patch_make(text1, diffs); + } + + /** + * Compute a list of patches to turn text1 into text2. + * text1 will be derived from the provided diffs. + * @param diffs Array of Diff objects for text1 to text2. + * @return LinkedList of Patch objects. 
+ */ + public LinkedList patch_make(LinkedList diffs) { + if (diffs == null) { + throw new IllegalArgumentException("Null inputs. (patch_make)"); + } + // No origin string provided, compute our own. + String text1 = diff_text1(diffs); + return patch_make(text1, diffs); + } + + /** + * Compute a list of patches to turn text1 into text2. + * text2 is ignored, diffs are the delta between text1 and text2. + * @param text1 Old text + * @param text2 Ignored. + * @param diffs Array of Diff objects for text1 to text2. + * @return LinkedList of Patch objects. + * @deprecated Prefer patch_make(String text1, LinkedList diffs). + */ + @Deprecated + public LinkedList patch_make(String text1, String text2, + LinkedList diffs) { + return patch_make(text1, diffs); + } + + /** + * Compute a list of patches to turn text1 into text2. + * text2 is not provided, diffs are the delta between text1 and text2. + * @param text1 Old text. + * @param diffs Array of Diff objects for text1 to text2. + * @return LinkedList of Patch objects. + */ + public LinkedList patch_make(String text1, LinkedList diffs) { + if (text1 == null || diffs == null) { + throw new IllegalArgumentException("Null inputs. (patch_make)"); + } + + LinkedList patches = new LinkedList(); + if (diffs.isEmpty()) { + return patches; // Get rid of the null case. + } + Patch patch = new Patch(); + int char_count1 = 0; // Number of characters into the text1 string. + int char_count2 = 0; // Number of characters into the text2 string. + // Start with text1 (prepatch_text) and apply the diffs until we arrive at + // text2 (postpatch_text). We recreate the patches one by one to determine + // context info. + String prepatch_text = text1; + String postpatch_text = text1; + for (Diff aDiff : diffs) { + if (patch.diffs.isEmpty() && aDiff.operation != Operation.EQUAL) { + // A new patch starts here. 
+ patch.start1 = char_count1; + patch.start2 = char_count2; + } + + switch (aDiff.operation) { + case INSERT: + patch.diffs.add(aDiff); + patch.length2 += aDiff.text.length(); + postpatch_text = postpatch_text.substring(0, char_count2) + + aDiff.text + postpatch_text.substring(char_count2); + break; + case DELETE: + patch.length1 += aDiff.text.length(); + patch.diffs.add(aDiff); + postpatch_text = postpatch_text.substring(0, char_count2) + + postpatch_text.substring(char_count2 + aDiff.text.length()); + break; + case EQUAL: + if (aDiff.text.length() <= 2 * Patch_Margin + && !patch.diffs.isEmpty() && aDiff != diffs.getLast()) { + // Small equality inside a patch. + patch.diffs.add(aDiff); + patch.length1 += aDiff.text.length(); + patch.length2 += aDiff.text.length(); + } + + if (aDiff.text.length() >= 2 * Patch_Margin && !patch.diffs.isEmpty()) { + // Time for a new patch. + if (!patch.diffs.isEmpty()) { + patch_addContext(patch, prepatch_text); + patches.add(patch); + patch = new Patch(); + // Unlike Unidiff, our patch lists have a rolling context. + // https://github.com/google/diff-match-patch/wiki/Unidiff + // Update prepatch text & pos to reflect the application of the + // just completed patch. + prepatch_text = postpatch_text; + char_count1 = char_count2; + } + } + break; + } + + // Update the current character count. + if (aDiff.operation != Operation.INSERT) { + char_count1 += aDiff.text.length(); + } + if (aDiff.operation != Operation.DELETE) { + char_count2 += aDiff.text.length(); + } + } + // Pick up the leftover patch if not empty. + if (!patch.diffs.isEmpty()) { + patch_addContext(patch, prepatch_text); + patches.add(patch); + } + + return patches; + } + + /** + * Given an array of patches, return another array that is identical. + * @param patches Array of Patch objects. + * @return Array of Patch objects. 
+ */ + public LinkedList patch_deepCopy(LinkedList patches) { + LinkedList patchesCopy = new LinkedList(); + for (Patch aPatch : patches) { + Patch patchCopy = new Patch(); + for (Diff aDiff : aPatch.diffs) { + Diff diffCopy = new Diff(aDiff.operation, aDiff.text); + patchCopy.diffs.add(diffCopy); + } + patchCopy.start1 = aPatch.start1; + patchCopy.start2 = aPatch.start2; + patchCopy.length1 = aPatch.length1; + patchCopy.length2 = aPatch.length2; + patchesCopy.add(patchCopy); + } + return patchesCopy; + } + + /** + * Merge a set of patches onto the text. Return a patched text, as well + * as an array of true/false values indicating which patches were applied. + * @param patches Array of Patch objects + * @param text Old text. + * @return Two element Object array, containing the new text and an array of + * boolean values. + */ + public Object[] patch_apply(LinkedList patches, String text) { + if (patches.isEmpty()) { + return new Object[] { + text, new boolean[0] + }; + } + + // Deep copy the patches so that no changes are made to originals. + patches = patch_deepCopy(patches); + + String nullPadding = patch_addPadding(patches); + text = nullPadding + text + nullPadding; + patch_splitMax(patches); + + int x = 0; + // delta keeps track of the offset between the expected and actual location + // of the previous patch. If there are patches expected at positions 10 and + // 20, but the first patch was found at 12, delta is 2 and the second patch + // has an effective expected position of 22. + int delta = 0; + boolean[] results = new boolean[patches.size()]; + for (Patch aPatch : patches) { + int expected_loc = aPatch.start2 + delta; + String text1 = diff_text1(aPatch.diffs); + int start_loc; + int end_loc = -1; + if (text1.length() > this.Match_MaxBits) { + // patch_splitMax will only provide an oversized pattern in the case of + // a monster delete. 
+ start_loc = match_main( + text, + text1.substring(0, this.Match_MaxBits), expected_loc); + if (start_loc != -1) { + end_loc = match_main( + text, + text1.substring(text1.length() - this.Match_MaxBits), + expected_loc + text1.length() - this.Match_MaxBits); + if (end_loc == -1 || start_loc >= end_loc) { + // Can't find valid trailing context. Drop this patch. + start_loc = -1; + } + } + } else { + start_loc = match_main(text, text1, expected_loc); + } + if (start_loc == -1) { + // No match found. :( + results[x] = false; + // Subtract the delta for this failed patch from subsequent patches. + delta -= aPatch.length2 - aPatch.length1; + } else { + // Found a match. :) + results[x] = true; + delta = start_loc - expected_loc; + String text2; + if (end_loc == -1) { + text2 = text + .substring( + start_loc, + Math.min(start_loc + text1.length(), text.length())); + } else { + text2 = text + .substring( + start_loc, + Math.min(end_loc + this.Match_MaxBits, text.length())); + } + if (text1.equals(text2)) { + // Perfect match, just shove the replacement text in. + text = text.substring(0, start_loc) + diff_text2(aPatch.diffs) + + text.substring(start_loc + text1.length()); + } else { + // Imperfect match. Run a diff to get a framework of equivalent + // indices. + LinkedList diffs = diff_main(text1, text2, false); + if (text1.length() > this.Match_MaxBits + && diff_levenshtein(diffs) / (float) text1.length() > this.Patch_DeleteThreshold) { + // The end points match, but the content is unacceptably bad. 
+ results[x] = false; + } else { + diff_cleanupSemanticLossless(diffs); + int index1 = 0; + for (Diff aDiff : aPatch.diffs) { + if (aDiff.operation != Operation.EQUAL) { + int index2 = diff_xIndex(diffs, index1); + if (aDiff.operation == Operation.INSERT) { + // Insertion + text = text.substring(0, start_loc + index2) + aDiff.text + + text.substring(start_loc + index2); + } else if (aDiff.operation == Operation.DELETE) { + // Deletion + text = text.substring(0, start_loc + index2) + + text + .substring( + start_loc + diff_xIndex( + diffs, + index1 + aDiff.text.length())); + } + } + if (aDiff.operation != Operation.DELETE) { + index1 += aDiff.text.length(); + } + } + } + } + } + x++; + } + // Strip the padding off. + text = text + .substring( + nullPadding.length(), text.length() + - nullPadding.length()); + return new Object[] { + text, results + }; + } + + /** + * Add some padding on text start and end so that edges can match something. + * Intended to be called only from within patch_apply. + * @param patches Array of Patch objects. + * @return The padding string added to each side. + */ + public String patch_addPadding(LinkedList patches) { + short paddingLength = this.Patch_Margin; + String nullPadding = ""; + for (short x = 1; x <= paddingLength; x++) { + nullPadding += String.valueOf((char) x); + } + + // Bump all the patches forward. + for (Patch aPatch : patches) { + aPatch.start1 += paddingLength; + aPatch.start2 += paddingLength; + } + + // Add some padding on start of first diff. + Patch patch = patches.getFirst(); + LinkedList diffs = patch.diffs; + if (diffs.isEmpty() || diffs.getFirst().operation != Operation.EQUAL) { + // Add nullPadding equality. + diffs.addFirst(new Diff(Operation.EQUAL, nullPadding)); + patch.start1 -= paddingLength; // Should be 0. + patch.start2 -= paddingLength; // Should be 0. 
+ patch.length1 += paddingLength; + patch.length2 += paddingLength; + } else if (paddingLength > diffs.getFirst().text.length()) { + // Grow first equality. + Diff firstDiff = diffs.getFirst(); + int extraLength = paddingLength - firstDiff.text.length(); + firstDiff.text = nullPadding.substring(firstDiff.text.length()) + + firstDiff.text; + patch.start1 -= extraLength; + patch.start2 -= extraLength; + patch.length1 += extraLength; + patch.length2 += extraLength; + } + + // Add some padding on end of last diff. + patch = patches.getLast(); + diffs = patch.diffs; + if (diffs.isEmpty() || diffs.getLast().operation != Operation.EQUAL) { + // Add nullPadding equality. + diffs.addLast(new Diff(Operation.EQUAL, nullPadding)); + patch.length1 += paddingLength; + patch.length2 += paddingLength; + } else if (paddingLength > diffs.getLast().text.length()) { + // Grow last equality. + Diff lastDiff = diffs.getLast(); + int extraLength = paddingLength - lastDiff.text.length(); + lastDiff.text += nullPadding.substring(0, extraLength); + patch.length1 += extraLength; + patch.length2 += extraLength; + } + + return nullPadding; + } + + /** + * Look through the patches and break up any which are longer than the + * maximum limit of the match algorithm. + * Intended to be called only from within patch_apply. + * @param patches LinkedList of Patch objects. + */ + public void patch_splitMax(LinkedList patches) { + short patch_size = Match_MaxBits; + String precontext, postcontext; + Patch patch; + int start1, start2; + boolean empty; + Operation diff_type; + String diff_text; + ListIterator pointer = patches.listIterator(); + Patch bigpatch = pointer.hasNext() ? pointer.next() : null; + while (bigpatch != null) { + if (bigpatch.length1 <= Match_MaxBits) { + bigpatch = pointer.hasNext() ? pointer.next() : null; + continue; + } + // Remove the big old patch. 
+ pointer.remove(); + start1 = bigpatch.start1; + start2 = bigpatch.start2; + precontext = ""; + while (!bigpatch.diffs.isEmpty()) { + // Create one of several smaller patches. + patch = new Patch(); + empty = true; + patch.start1 = start1 - precontext.length(); + patch.start2 = start2 - precontext.length(); + if (precontext.length() != 0) { + patch.length1 = patch.length2 = precontext.length(); + patch.diffs.add(new Diff(Operation.EQUAL, precontext)); + } + while (!bigpatch.diffs.isEmpty() + && patch.length1 < patch_size - Patch_Margin) { + diff_type = bigpatch.diffs.getFirst().operation; + diff_text = bigpatch.diffs.getFirst().text; + if (diff_type == Operation.INSERT) { + // Insertions are harmless. + patch.length2 += diff_text.length(); + start2 += diff_text.length(); + patch.diffs.addLast(bigpatch.diffs.removeFirst()); + empty = false; + } else if (diff_type == Operation.DELETE && patch.diffs.size() == 1 + && patch.diffs.getFirst().operation == Operation.EQUAL + && diff_text.length() > 2 * patch_size) { + // This is a large deletion. Let it pass in one chunk. + patch.length1 += diff_text.length(); + start1 += diff_text.length(); + empty = false; + patch.diffs.add(new Diff(diff_type, diff_text)); + bigpatch.diffs.removeFirst(); + } else { + // Deletion or equality. Only take as much as we can stomach. + diff_text = diff_text + .substring( + 0, Math + .min( + diff_text.length(), + patch_size - patch.length1 - Patch_Margin)); + patch.length1 += diff_text.length(); + start1 += diff_text.length(); + if (diff_type == Operation.EQUAL) { + patch.length2 += diff_text.length(); + start2 += diff_text.length(); + } else { + empty = false; + } + patch.diffs.add(new Diff(diff_type, diff_text)); + if (diff_text.equals(bigpatch.diffs.getFirst().text)) { + bigpatch.diffs.removeFirst(); + } else { + bigpatch.diffs.getFirst().text = bigpatch.diffs.getFirst().text + .substring(diff_text.length()); + } + } + } + // Compute the head context for the next patch. 
+ precontext = diff_text2(patch.diffs); + precontext = precontext + .substring( + Math + .max( + 0, precontext.length() + - Patch_Margin)); + // Append the end context for this patch. + if (diff_text1(bigpatch.diffs).length() > Patch_Margin) { + postcontext = diff_text1(bigpatch.diffs).substring(0, Patch_Margin); + } else { + postcontext = diff_text1(bigpatch.diffs); + } + if (postcontext.length() != 0) { + patch.length1 += postcontext.length(); + patch.length2 += postcontext.length(); + if (!patch.diffs.isEmpty() + && patch.diffs.getLast().operation == Operation.EQUAL) { + patch.diffs.getLast().text += postcontext; + } else { + patch.diffs.add(new Diff(Operation.EQUAL, postcontext)); + } + } + if (!empty) { + pointer.add(patch); + } + } + bigpatch = pointer.hasNext() ? pointer.next() : null; + } + } + + /** + * Take a list of patches and return a textual representation. + * @param patches List of Patch objects. + * @return Text representation of patches. + */ + public String patch_toText(List patches) { + StringBuilder text = new StringBuilder(); + for (Patch aPatch : patches) { + text.append(aPatch); + } + return text.toString(); + } + + /** + * Parse a textual representation of patches and return a List of Patch + * objects. + * @param textline Text representation of patches. + * @return List of Patch objects. + * @throws IllegalArgumentException If invalid input. 
+ */ + public List patch_fromText(String textline) + throws IllegalArgumentException { + List patches = new LinkedList(); + if (textline.length() == 0) { + return patches; + } + List textList = Arrays.asList(textline.split("\n")); + LinkedList text = new LinkedList(textList); + Patch patch; + Pattern patchHeader = Pattern.compile("^@@ -(\\d+),?(\\d*) \\+(\\d+),?(\\d*) @@$"); + Matcher m; + char sign; + String line; + while (!text.isEmpty()) { + m = patchHeader.matcher(text.getFirst()); + if (!m.matches()) { + throw new IllegalArgumentException( + "Invalid patch string: " + text.getFirst()); + } + patch = new Patch(); + patches.add(patch); + patch.start1 = Integer.parseInt(m.group(1)); + if (m.group(2).length() == 0) { + patch.start1--; + patch.length1 = 1; + } else if (m.group(2).equals("0")) { + patch.length1 = 0; + } else { + patch.start1--; + patch.length1 = Integer.parseInt(m.group(2)); + } + + patch.start2 = Integer.parseInt(m.group(3)); + if (m.group(4).length() == 0) { + patch.start2--; + patch.length2 = 1; + } else if (m.group(4).equals("0")) { + patch.length2 = 0; + } else { + patch.start2--; + patch.length2 = Integer.parseInt(m.group(4)); + } + text.removeFirst(); + + while (!text.isEmpty()) { + try { + sign = text.getFirst().charAt(0); + } catch (IndexOutOfBoundsException e) { + // Blank line? Whatever. + text.removeFirst(); + continue; + } + line = text.getFirst().substring(1); + line = line.replace("+", "%2B"); // decode would change all "+" to " " + try { + line = URLDecoder.decode(line, "UTF-8"); + } catch (UnsupportedEncodingException e) { + // Not likely on modern system. + throw new Error("This system does not support UTF-8.", e); + } catch (IllegalArgumentException e) { + // Malformed URI sequence. + throw new IllegalArgumentException( + "Illegal escape in patch_fromText: " + line, e); + } + if (sign == '-') { + // Deletion. + patch.diffs.add(new Diff(Operation.DELETE, line)); + } else if (sign == '+') { + // Insertion. 
+ patch.diffs.add(new Diff(Operation.INSERT, line)); + } else if (sign == ' ') { + // Minor equality. + patch.diffs.add(new Diff(Operation.EQUAL, line)); + } else if (sign == '@') { + // Start of next patch. + break; + } else { + // WTF? + throw new IllegalArgumentException( + "Invalid patch mode '" + sign + "' in: " + line); + } + text.removeFirst(); + } + } + return patches; + } + + /** + * Class representing one diff operation. + */ + public static class Diff { + /** + * One of: INSERT, DELETE or EQUAL. + */ + public Operation operation; + /** + * The text associated with this diff operation. + */ + public String text; + + /** + * Constructor. Initializes the diff with the provided values. + * @param operation One of INSERT, DELETE or EQUAL. + * @param text The text being applied. + */ + public Diff(Operation operation, String text) { + // Construct a diff with the specified operation and text. + this.operation = operation; + this.text = text; + } + + /** + * Display a human-readable version of this Diff. + * @return text version. + */ + public String toString() { + String prettyText = this.text.replace('\n', '\u00b6'); + return "Diff(" + this.operation + ",\"" + prettyText + "\")"; + } + + /** + * Create a numeric hash value for a Diff. + * This function is not used by DMP. + * @return Hash value. + */ + @Override + public int hashCode() { + final int prime = 31; + int result = (operation == null) ? 0 : operation.hashCode(); + result += prime * ((text == null) ? 0 : text.hashCode()); + return result; + } + + /** + * Is this Diff equivalent to another Diff? + * @param obj Another Diff to compare against. + * @return true or false. 
+ */ + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + Diff other = (Diff) obj; + if (operation != other.operation) { + return false; + } + if (text == null) { + if (other.text != null) { + return false; + } + } else if (!text.equals(other.text)) { + return false; + } + return true; + } + } + + /** + * Class representing one patch operation. + */ + public static class Patch { + public LinkedList diffs; + public int start1; + public int start2; + public int length1; + public int length2; + + /** + * Constructor. Initializes with an empty list of diffs. + */ + public Patch() { + this.diffs = new LinkedList(); + } + + /** + * Emulate GNU diff's format. + * Header: @@ -382,8 +481,9 @@ + * Indices are printed as 1-based, not 0-based. + * @return The GNU diff string. + */ + public String toString() { + String coords1, coords2; + if (this.length1 == 0) { + coords1 = this.start1 + ",0"; + } else if (this.length1 == 1) { + coords1 = Integer.toString(this.start1 + 1); + } else { + coords1 = (this.start1 + 1) + "," + this.length1; + } + if (this.length2 == 0) { + coords2 = this.start2 + ",0"; + } else if (this.length2 == 1) { + coords2 = Integer.toString(this.start2 + 1); + } else { + coords2 = (this.start2 + 1) + "," + this.length2; + } + StringBuilder text = new StringBuilder(); + text + .append("@@ -") + .append(coords1) + .append(" +") + .append(coords2) + .append(" @@\n"); + // Escape the body of the patch with %xx notation. + for (Diff aDiff : this.diffs) { + switch (aDiff.operation) { + case INSERT: + text.append('+'); + break; + case DELETE: + text.append('-'); + break; + case EQUAL: + text.append(' '); + break; + } + try { + text + .append(URLEncoder.encode(aDiff.text, "UTF-8").replace('+', ' ')) + .append("\n"); + } catch (UnsupportedEncodingException e) { + // Not likely on modern system. 
+ throw new Error("This system does not support UTF-8.", e); + } + } + return unescapeForEncodeUriCompatability(text.toString()); + } + } + + /** + * Unescape selected chars for compatability with JavaScript's encodeURI. + * In speed critical applications this could be dropped since the + * receiving application will certainly decode these fine. + * Note that this function is case-sensitive. Thus "%3f" would not be + * unescaped. But this is ok because it is only called with the output of + * URLEncoder.encode which returns uppercase hex. + * + * Example: "%3F" -> "?", "%24" -> "$", etc. + * + * @param str The string to escape. + * @return The escaped string. + */ + private static String unescapeForEncodeUriCompatability(String str) { + return str + .replace("%21", "!") + .replace("%7E", "~") + .replace("%27", "'") + .replace("%28", "(") + .replace("%29", ")") + .replace("%3B", ";") + .replace("%2F", "/") + .replace("%3F", "?") + .replace("%3A", ":") + .replace("%40", "@") + .replace("%26", "&") + .replace("%3D", "=") + .replace("%2B", "+") + .replace("%24", "$") + .replace("%2C", ",") + .replace("%23", "#"); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java index fdbd6e99d..33183b0f6 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java @@ -1,3 +1,4 @@ + package eu.dnetlib.pace.util; import com.google.common.base.Function; @@ -7,4 +8,4 @@ public class DotAbbreviations implements Function { public String apply(String s) { return s.length() == 1 ? s + "." 
: s; } -}; \ No newline at end of file +}; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java index cc801068b..2c1a1700b 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java @@ -1,166 +1,177 @@ -package eu.dnetlib.pace.util; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.jayway.jsonpath.Configuration; -import com.jayway.jsonpath.JsonPath; -import com.jayway.jsonpath.Option; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.*; -import net.minidev.json.JSONArray; +package eu.dnetlib.pace.util; import java.math.BigDecimal; import java.util.*; import java.util.function.Predicate; import java.util.stream.Collectors; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.Configuration; +import com.jayway.jsonpath.JsonPath; +import com.jayway.jsonpath.Option; + +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.config.Type; +import eu.dnetlib.pace.model.*; +import net.minidev.json.JSONArray; + public class MapDocumentUtil { - public static final String URL_REGEX = "^(http|https|ftp)\\://.*"; - public static Predicate urlFilter = s -> s.trim().matches(URL_REGEX); + public static final String URL_REGEX = "^(http|https|ftp)\\://.*"; + public static Predicate urlFilter = s -> s.trim().matches(URL_REGEX); - public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) { - MapDocument m = new MapDocument(); - m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json)); - Map stringField = new HashMap<>(); - conf.getPace().getModel().forEach(fdef -> { - switch (fdef.getType()) { - case String: - case Int: - 
stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), truncateValue(getJPathString(fdef.getPath(), json), fdef.getLength()))); - break; - case URL: - String uv = getJPathString(fdef.getPath(), json); - if (!urlFilter.test(uv)) uv = ""; - stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), uv)); - break; - case List: - case JSON: - FieldListImpl fi = new FieldListImpl(fdef.getName(), fdef.getType()); - truncateList(getJPathList(fdef.getPath(), json, fdef.getType()), fdef.getSize()) - .stream() - .map(item -> new FieldValueImpl(Type.String, fdef.getName(), item)) - .forEach(fi::add); - stringField.put(fdef.getName(), fi); - break; - case DoubleArray: - stringField.put( - fdef.getName(), - new FieldValueImpl(Type.DoubleArray, - fdef.getName(), - getJPathArray(fdef.getPath(), json)) - ); - break; - case StringConcat: - String[] jpaths = fdef.getPath().split("\\|\\|\\|"); - stringField.put( - fdef.getName(), - new FieldValueImpl(Type.String, - fdef.getName(), - truncateValue(Arrays.stream(jpaths).map(jpath -> getJPathString(jpath, json)).collect(Collectors.joining(" ")), - fdef.getLength()) - ) - ); - break; - } - }); - m.setFieldMap(stringField); - return m; - } + public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) { + MapDocument m = new MapDocument(); + m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json)); + Map stringField = new HashMap<>(); + conf.getPace().getModel().forEach(fdef -> { + switch (fdef.getType()) { + case String: + case Int: + stringField + .put( + fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), + truncateValue(getJPathString(fdef.getPath(), json), fdef.getLength()))); + break; + case URL: + String uv = getJPathString(fdef.getPath(), json); + if (!urlFilter.test(uv)) + uv = ""; + stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), uv)); + break; + case List: + case JSON: + FieldListImpl fi = new 
FieldListImpl(fdef.getName(), fdef.getType()); + truncateList(getJPathList(fdef.getPath(), json, fdef.getType()), fdef.getSize()) + .stream() + .map(item -> new FieldValueImpl(Type.String, fdef.getName(), item)) + .forEach(fi::add); + stringField.put(fdef.getName(), fi); + break; + case DoubleArray: + stringField + .put( + fdef.getName(), + new FieldValueImpl(Type.DoubleArray, + fdef.getName(), + getJPathArray(fdef.getPath(), json))); + break; + case StringConcat: + String[] jpaths = fdef.getPath().split("\\|\\|\\|"); + stringField + .put( + fdef.getName(), + new FieldValueImpl(Type.String, + fdef.getName(), + truncateValue( + Arrays + .stream(jpaths) + .map(jpath -> getJPathString(jpath, json)) + .collect(Collectors.joining(" ")), + fdef.getLength()))); + break; + } + }); + m.setFieldMap(stringField); + return m; + } - public static List getJPathList(String path, String json, Type type) { - if (type == Type.List) - return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path); - Object jresult; - List result = new ArrayList<>(); - try { - jresult = JsonPath.read(json, path); - } catch (Throwable e) { - return result; - } - if (jresult instanceof JSONArray) { + public static List getJPathList(String path, String json, Type type) { + if (type == Type.List) + return JsonPath + .using( + Configuration + .defaultConfiguration() + .addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS)) + .parse(json) + .read(path); + Object jresult; + List result = new ArrayList<>(); + try { + jresult = JsonPath.read(json, path); + } catch (Throwable e) { + return result; + } + if (jresult instanceof JSONArray) { - ((JSONArray) jresult).forEach(it -> { + ((JSONArray) jresult).forEach(it -> { - try { - result.add(new ObjectMapper().writeValueAsString(it)); - } catch (JsonProcessingException e) { + try { + result.add(new ObjectMapper().writeValueAsString(it)); + } catch 
(JsonProcessingException e) { - } - } - ); - return result; - } + } + }); + return result; + } - if (jresult instanceof LinkedHashMap) { - try { - result.add(new ObjectMapper().writeValueAsString(jresult)); - } catch (JsonProcessingException e) { + if (jresult instanceof LinkedHashMap) { + try { + result.add(new ObjectMapper().writeValueAsString(jresult)); + } catch (JsonProcessingException e) { - } - return result; - } - if (jresult instanceof String) { - result.add((String) jresult); - } - return result; - } + } + return result; + } + if (jresult instanceof String) { + result.add((String) jresult); + } + return result; + } + public static String getJPathString(final String jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof String) + return (String) o; + if (o instanceof JSONArray && ((JSONArray) o).size() > 0) + return (String) ((JSONArray) o).get(0); + return ""; + } catch (Exception e) { + return ""; + } + } - public static String getJPathString(final String jsonPath, final String json) { - try { - Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) - return (String)o; - if (o instanceof JSONArray && ((JSONArray)o).size()>0) - return (String)((JSONArray)o).get(0); - return ""; - } catch (Exception e) { - return ""; - } - } + public static double[] getJPathArray(final String jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof double[]) + return (double[]) o; + if (o instanceof JSONArray) { + Object[] objects = ((JSONArray) o).toArray(); + double[] array = new double[objects.length]; + for (int i = 0; i < objects.length; i++) { + if (objects[i] instanceof BigDecimal) + array[i] = ((BigDecimal) objects[i]).doubleValue(); + else + array[i] = (double) objects[i]; + } + return array; + } + return new double[0]; + } catch (Exception e) { + e.printStackTrace(); + return new double[0]; + } + } - public static double[] getJPathArray(final String jsonPath, 
final String json) { - try { - Object o = JsonPath.read(json, jsonPath); - if (o instanceof double[]) - return (double[]) o; - if (o instanceof JSONArray) { - Object[] objects = ((JSONArray) o).toArray(); - double[] array = new double[objects.length]; - for (int i = 0; i < objects.length; i++) { - if (objects[i] instanceof BigDecimal) - array[i] = ((BigDecimal)objects[i]).doubleValue(); - else - array[i] = (double) objects[i]; - } - return array; - } - return new double[0]; - } - catch (Exception e) { - e.printStackTrace(); - return new double[0]; - } - } + public static String truncateValue(String value, int length) { + if (value == null) + return ""; + if (length == -1 || length > value.length()) + return value; - public static String truncateValue(String value, int length) { - if (value == null) - return ""; + return value.substring(0, length); + } - if (length == -1 || length > value.length()) - return value; + public static List truncateList(List list, int size) { + if (size == -1 || size > list.size()) + return list; - return value.substring(0, length); - } - - public static List truncateList(List list, int size) { - if (size == -1 || size > list.size()) - return list; - - return list.subList(0, size); - } + return list.subList(0, size); + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceException.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceException.java index 198861c53..077139482 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceException.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceException.java @@ -1,13 +1,14 @@ + package eu.dnetlib.pace.util; public class PaceException extends RuntimeException { - public PaceException(String s, Throwable e){ - super(s, e); - } + public PaceException(String s, Throwable e) { + super(s, e); + } - public PaceException(String s){ - super(s); - } + public PaceException(String s) { + super(s); + } } diff --git 
a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java index bf6feea1c..252205c79 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java @@ -1,49 +1,61 @@ -package eu.dnetlib.pace.util; -import eu.dnetlib.pace.clustering.ClusteringClass; -import eu.dnetlib.pace.clustering.ClusteringFunction; -import eu.dnetlib.pace.tree.support.Comparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; -import org.reflections.Reflections; +package eu.dnetlib.pace.util; import java.io.Serializable; import java.lang.reflect.InvocationTargetException; import java.util.Map; import java.util.stream.Collectors; +import org.reflections.Reflections; + +import eu.dnetlib.pace.clustering.ClusteringClass; +import eu.dnetlib.pace.clustering.ClusteringFunction; +import eu.dnetlib.pace.tree.support.Comparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + public class PaceResolver implements Serializable { - public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering"); - public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree"); + public static final Reflections CLUSTERING_RESOLVER = new Reflections("eu.dnetlib.pace.clustering"); + public static final Reflections COMPARATOR_RESOLVER = new Reflections("eu.dnetlib.pace.tree"); - private final Map> clusteringFunctions; - private final Map> comparators; + private final Map> clusteringFunctions; + private final Map> comparators; - public PaceResolver() { + public PaceResolver() { - this.clusteringFunctions = CLUSTERING_RESOLVER.getTypesAnnotatedWith(ClusteringClass.class).stream() - .filter(ClusteringFunction.class::isAssignableFrom) - .collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class)cl)); + this.clusteringFunctions = CLUSTERING_RESOLVER 
+ .getTypesAnnotatedWith(ClusteringClass.class) + .stream() + .filter(ClusteringFunction.class::isAssignableFrom) + .collect( + Collectors + .toMap( + cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class) cl)); - this.comparators = COMPARATOR_RESOLVER.getTypesAnnotatedWith(ComparatorClass.class).stream() - .filter(Comparator.class::isAssignableFrom) - .collect(Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class)cl)); - } + this.comparators = COMPARATOR_RESOLVER + .getTypesAnnotatedWith(ComparatorClass.class) + .stream() + .filter(Comparator.class::isAssignableFrom) + .collect( + Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class) cl)); + } - public ClusteringFunction getClusteringFunction(String name, Map params) throws PaceException { - try { - return clusteringFunctions.get(name).getDeclaredConstructor(Map.class).newInstance(params); - } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) { - throw new PaceException(name + " not found ", e); - } - } + public ClusteringFunction getClusteringFunction(String name, Map params) throws PaceException { + try { + return clusteringFunctions.get(name).getDeclaredConstructor(Map.class).newInstance(params); + } catch (InstantiationException | IllegalAccessException | InvocationTargetException + | NoSuchMethodException e) { + throw new PaceException(name + " not found ", e); + } + } - public Comparator getComparator(String name, Map params) throws PaceException { - try { - return comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params); - } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException | NullPointerException e) { - throw new PaceException(name + " not found ", e); - } - } + public Comparator getComparator(String name, Map params) throws PaceException { + try { + return 
comparators.get(name).getDeclaredConstructor(Map.class).newInstance(params); + } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException + | NullPointerException e) { + throw new PaceException(name + " not found ", e); + } + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Reporter.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Reporter.java index 10c886cb5..fd6761aa1 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Reporter.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/Reporter.java @@ -1,11 +1,11 @@ -package eu.dnetlib.pace.util; +package eu.dnetlib.pace.util; import java.io.Serializable; public interface Reporter extends Serializable { - void incrementCounter(String counterGroup, String counterName, long delta); + void incrementCounter(String counterGroup, String counterName, long delta); - void emit(String type, String from, String to); + void emit(String type, String from, String to); } diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java index 27c804ac7..2a37701aa 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java @@ -1,11 +1,5 @@ -package eu.dnetlib.pace; -import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldListImpl; -import eu.dnetlib.pace.model.FieldValueImpl; -import org.apache.commons.io.IOUtils; +package eu.dnetlib.pace; import java.io.IOException; import java.io.StringWriter; @@ -13,6 +7,14 @@ import java.nio.charset.StandardCharsets; import java.util.List; import java.util.stream.Collectors; +import org.apache.commons.io.IOUtils; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Type; +import 
eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldListImpl; +import eu.dnetlib.pace.model.FieldValueImpl; + public abstract class AbstractPaceTest extends AbstractPaceFunctions { protected String readFromClasspath(final String filename) { @@ -41,9 +43,12 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions { return new FieldValueImpl(Type.DoubleArray, "array", a); } - protected Field createFieldList(List strings, String fieldName){ + protected Field createFieldList(List strings, String fieldName) { - List fieldValueStream = strings.stream().map(s -> new FieldValueImpl(Type.String, fieldName, s)).collect(Collectors.toList()); + List fieldValueStream = strings + .stream() + .map(s -> new FieldValueImpl(Type.String, fieldName, s)) + .collect(Collectors.toList()); FieldListImpl a = new FieldListImpl(); a.addAll(fieldValueStream); diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index f57daaa32..9873278b9 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -1,17 +1,20 @@ -package eu.dnetlib.pace.clustering; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; -import eu.dnetlib.pace.AbstractPaceTest; -import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.config.DedupConfig; -import org.junit.jupiter.api.*; +package eu.dnetlib.pace.clustering; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; +import org.junit.jupiter.api.*; + +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.AbstractPaceTest; +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import 
eu.dnetlib.pace.config.DedupConfig; + public class ClusteringFunctionTest extends AbstractPaceTest { private static Map params; @@ -20,7 +23,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @BeforeAll public static void setUp() throws Exception { params = Maps.newHashMap(); - conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ClusteringFunctionTest.class)); + conf = DedupConfig + .load( + AbstractPaceFunctions + .readFromClasspath( + "/eu/dnetlib/pace/config/organization.current.conf.json", ClusteringFunctionTest.class)); } @Test @@ -210,7 +217,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { } @Test - public void testPersonClustering(){ + public void testPersonClustering() { final ClusteringFunction cf = new PersonClustering(params); final String s = "Abd-Alla, Abo-el-nour N."; @@ -224,7 +231,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { } @Test - public void testPersonHash(){ + public void testPersonHash() { final ClusteringFunction cf = new PersonHash(params); final String s = "Manghi, Paolo"; @@ -238,7 +245,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { } @Test - public void testLastNameFirstInitial(){ + public void testLastNameFirstInitial() { final ClusteringFunction cf = new LastNameFirstInitial(params); final String s = "LI Yonghong"; @@ -246,4 +253,4 @@ public class ClusteringFunctionTest extends AbstractPaceTest { System.out.println(cf.apply(conf, Lists.newArrayList(title(s)))); } -} \ No newline at end of file +} diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java index ff1ca6721..7fd81d975 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java @@ -1,56 +1,57 @@ + package 
eu.dnetlib.pace.common; -import org.junit.jupiter.api.*; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; +import org.junit.jupiter.api.*; + public class PaceFunctionTest extends AbstractPaceFunctions { - private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾."; + private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾."; - @Test - public void normalizePidTest(){ + @Test + public void normalizePidTest() { - assertEquals("identifier", normalizePid("IdentifIer")); - assertEquals("10.1109/tns.2015.2493347", normalizePid("10.1109/TNS.2015.2493347")); - assertEquals("10.0001/testdoi", normalizePid("http://dx.doi.org/10.0001/testDOI")); - assertEquals("10.0001/testdoi", normalizePid("https://dx.doi.org/10.0001/testDOI")); - } + assertEquals("identifier", normalizePid("IdentifIer")); + assertEquals("10.1109/tns.2015.2493347", normalizePid("10.1109/TNS.2015.2493347")); + assertEquals("10.0001/testdoi", normalizePid("http://dx.doi.org/10.0001/testDOI")); + assertEquals("10.0001/testdoi", normalizePid("https://dx.doi.org/10.0001/testDOI")); + } - @Test - public void filterAllStopwordsTest(){ + @Test + public void filterAllStopwordsTest() { - assertEquals("universita politecnica marche", filterAllStopWords("universita politecnica delle marche")); - } + assertEquals("universita politecnica marche", filterAllStopWords("universita politecnica delle marche")); + } - @Test - public void normalizeTest() { - assertEquals("universitat", normalize("Universität")); + @Test + public void normalizeTest() { + assertEquals("universitat", normalize("Universität")); - System.out.println(normalize("İstanbul Ticarət Universiteti")); - } + System.out.println(normalize("İstanbul Ticarət Universiteti")); + } - @Test - public void cleanupTest() { - assertEquals("istanbul ticaret universiteti", cleanup("İstanbul Ticarət Universiteti")); 
+ @Test + public void cleanupTest() { + assertEquals("istanbul ticaret universiteti", cleanup("İstanbul Ticarət Universiteti")); + System.out.println("cleaned up : " + cleanup(TEST_STRING)); + } - System.out.println("cleaned up : " + cleanup(TEST_STRING)); - } + @Test + public void testGetNumbers() { + System.out.println("Numbers : " + getNumbers(TEST_STRING)); + } - @Test - public void testGetNumbers() { - System.out.println("Numbers : " + getNumbers(TEST_STRING)); - } + @Test + public void testRemoveSymbols() { + System.out.println("Without symbols: " + removeSymbols(TEST_STRING)); + } - @Test - public void testRemoveSymbols() { - System.out.println("Without symbols: " + removeSymbols(TEST_STRING)); - } - - @Test - public void testFixAliases() { - System.out.println("Fixed aliases : " + fixAliases(TEST_STRING)); - } + @Test + public void testFixAliases() { + System.out.println("Fixed aliases : " + fixAliases(TEST_STRING)); + } } diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index f0333cbc8..5c846c058 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -1,14 +1,6 @@ + package eu.dnetlib.pace.comparators; -import eu.dnetlib.pace.AbstractPaceTest; -import eu.dnetlib.pace.clustering.NGramUtils; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldValueImpl; -import eu.dnetlib.pace.tree.*; -import eu.dnetlib.pace.config.DedupConfig; - -import org.junit.jupiter.api.*; import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.ArrayList; @@ -16,6 +8,16 @@ import java.util.Arrays; import java.util.HashMap; import java.util.Map; +import org.junit.jupiter.api.*; + +import eu.dnetlib.pace.AbstractPaceTest; +import eu.dnetlib.pace.clustering.NGramUtils; +import 
eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.config.Type; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldValueImpl; +import eu.dnetlib.pace.tree.*; + @TestInstance(TestInstance.Lifecycle.PER_CLASS) public class ComparatorTest extends AbstractPaceTest { @@ -24,7 +26,8 @@ public class ComparatorTest extends AbstractPaceTest { @BeforeAll public void setup() { - conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class)); + conf = DedupConfig + .load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class)); } @BeforeEach @@ -37,7 +40,6 @@ public class ComparatorTest extends AbstractPaceTest { params.put("jpath_classid", "$.qualifier.classid"); } - @Test public void testCleanForSorting() { NGramUtils utils = new NGramUtils(); @@ -48,22 +50,27 @@ public class ComparatorTest extends AbstractPaceTest { public void cityMatchTest() { final CityMatch cityMatch = new CityMatch(params); - //both names with no cities + // both names with no cities assertEquals(1.0, cityMatch.distance("Università", "Centro di ricerca", conf)); - //one of the two names with no cities + // one of the two names with no cities assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf)); - //both names with cities (same) + // both names with cities (same) assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf)); - //both names with cities (different) + // both names with cities (different) assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf)); assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf)); - //particular cases + // particular cases assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf)); - assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw 
University of Technology)", "Warsaw University of Technology", conf)); + assertEquals( + 1.0, + cityMatch + .distance( + "Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", + conf)); // failing becasuse 'Allen' is a transliterrated greek stopword // assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf)); @@ -71,17 +78,18 @@ public class ComparatorTest extends AbstractPaceTest { } @Test - public void keywordMatchTest(){ + public void keywordMatchTest() { params.put("threshold", "0.5"); final KeywordMatch keywordMatch = new KeywordMatch(params); - assertEquals(0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf)); + assertEquals( + 0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf)); assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf)); assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf)); assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf)); assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf)); - assertEquals(2.0/3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf)); + assertEquals(2.0 / 3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf)); assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf)); assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf)); assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf)); @@ -89,7 +97,7 @@ public class ComparatorTest extends AbstractPaceTest { } @Test - public void listContainsMatchTest(){ + public void listContainsMatchTest() { Field a = 
createFieldList(Arrays.asList("Article", "Publication", "ORP"), "instanceType"); Field b = createFieldList(Arrays.asList("Publication", "Article", "ORP"), "instanceType"); @@ -112,7 +120,7 @@ public class ComparatorTest extends AbstractPaceTest { } @Test - public void stringContainsMatchTest(){ + public void stringContainsMatchTest() { params.put("string", "openorgs"); params.put("aggregator", "XOR"); @@ -132,7 +140,7 @@ public class ComparatorTest extends AbstractPaceTest { } @Test - public void numbersMatchTest(){ + public void numbersMatchTest() { final NumbersMatch numbersMatch = new NumbersMatch(params); assertEquals(0.0, numbersMatch.distance("University of Rennes 2", "Universita di Rennes 7", conf)); @@ -140,7 +148,7 @@ public class ComparatorTest extends AbstractPaceTest { } @Test - public void romansMatchTest(){ + public void romansMatchTest() { final RomansMatch romansMatch = new RomansMatch(params); @@ -154,8 +162,9 @@ public class ComparatorTest extends AbstractPaceTest { final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); - double result = jaroWinklerNormalizedName.distance("AT&T (United States)", "United States Military Academy", conf); - System.out.println("result = " + result); + double result = jaroWinklerNormalizedName + .distance("AT&T (United States)", "United States Military Academy", conf); + System.out.println("result = " + result); result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf); System.out.println("result = " + result); @@ -183,7 +192,11 @@ public class ComparatorTest extends AbstractPaceTest { final LevensteinTitle levensteinTitle = new LevensteinTitle(params); - double result = levensteinTitle.distance("Degradation of lignin β‐aryl ether units in Arabidopsis thaliana expressing LigD, LigF and LigG from Sphingomonas paucimobilis SYK‐6", "Degradation of lignin β-aryl ether units in Arabidopsis thaliana expressing LigD, LigF and 
LigG from Sphingomonas paucimobilis SYK-6", conf); + double result = levensteinTitle + .distance( + "Degradation of lignin β‐aryl ether units in Arabidopsis thaliana expressing LigD, LigF and LigG from Sphingomonas paucimobilis SYK‐6", + "Degradation of lignin β-aryl ether units in Arabidopsis thaliana expressing LigD, LigF and LigG from Sphingomonas paucimobilis SYK-6", + conf); System.out.println("result = " + result); } @@ -207,13 +220,15 @@ public class ComparatorTest extends AbstractPaceTest { assertEquals(1.0, result); - Field c = createFieldList(Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType"); + Field c = createFieldList( + Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType"); result = instanceTypeMatch.compare(c, b, conf); assertEquals(1.0, result); Field d = createFieldList(Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType"); - Field e = createFieldList(Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType"); + Field e = createFieldList( + Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType"); result = instanceTypeMatch.compare(d, e, conf); assertEquals(1.0, result); @@ -234,7 +249,8 @@ public class ComparatorTest extends AbstractPaceTest { AuthorsMatch authorsMatch = new AuthorsMatch(params); - Field a = createFieldList(Arrays.asList("La Bruzzo, Sandro", "Atzori, Claudio", "De Bonis, Michele"), "authors"); + Field a = createFieldList( + Arrays.asList("La Bruzzo, Sandro", "Atzori, Claudio", "De Bonis, Michele"), "authors"); Field b = createFieldList(Arrays.asList("Atzori, C.", "La Bruzzo, S.", "De Bonis, M."), "authors"); double result = authorsMatch.compare(a, b, conf); @@ -244,7 +260,7 @@ public class ComparatorTest extends AbstractPaceTest { Field d = createFieldList(Arrays.asList("Manghi, Pasquale"), "authors"); result = authorsMatch.compare(c, d, conf); - 
assertEquals(0.0, result) ; + assertEquals(0.0, result); params.put("mode", "surname"); authorsMatch = new AuthorsMatch(params); @@ -258,7 +274,7 @@ public class ComparatorTest extends AbstractPaceTest { assertEquals(0.25, result); Field f = createFieldList(new ArrayList<>(), "authors"); - result = authorsMatch.compare(f,f, conf); + result = authorsMatch.compare(f, f, conf); System.out.println("result = " + result); } @@ -268,8 +284,19 @@ public class ComparatorTest extends AbstractPaceTest { JsonListMatch jsonListMatch = new JsonListMatch(params); - Field a = createFieldList(Arrays.asList("{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"), "authors"); - Field b = createFieldList(Arrays.asList("{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed 
ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"), "authors"); + Field a = createFieldList( + Arrays + .asList( + "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"), + "authors"); + Field b = createFieldList( + Arrays + .asList( + 
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}", + "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}", + "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}", + "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"), + "authors"); double result = jsonListMatch.compare(a, b, conf); @@ -299,13 +326,16 @@ 
public class ComparatorTest extends AbstractPaceTest { CosineSimilarity cosineSimilarity = new CosineSimilarity(params); - Field a = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1,2,3}); - Field b = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1,2,3}); + Field a = new FieldValueImpl(Type.DoubleArray, "array", new double[] { + 1, 2, 3 + }); + Field b = new FieldValueImpl(Type.DoubleArray, "array", new double[] { + 1, 2, 3 + }); double compare = cosineSimilarity.compare(a, b, conf); System.out.println("compare = " + compare); } - } diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index 4a2a062a1..02b59354a 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -1,5 +1,14 @@ + package eu.dnetlib.pace.config; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.*; +import java.util.stream.Collectors; + +import org.junit.jupiter.api.*; import eu.dnetlib.pace.AbstractPaceTest; import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; @@ -15,15 +24,6 @@ import eu.dnetlib.pace.tree.support.FieldConf; import eu.dnetlib.pace.tree.support.TreeNodeDef; import eu.dnetlib.pace.tree.support.TreeNodeStats; import eu.dnetlib.pace.util.MapDocumentUtil; -import org.junit.jupiter.api.*; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import java.util.*; -import java.util.stream.Collectors; - public class ConfigTest extends AbstractPaceTest { @@ -68,7 +68,7 @@ public class ConfigTest extends AbstractPaceTest { System.out.println("translationMap = " + 
translationMap.size()); - for (String key: translationMap.keySet()) { + for (String key : translationMap.keySet()) { if (translationMap.get(key).equals("key::1")) System.out.println("key = " + key); } @@ -99,7 +99,7 @@ public class ConfigTest extends AbstractPaceTest { System.out.println("mapDocument = " + mapDocument.getFieldMap().get("title").stringValue()); - } + } @Test public void authorAsMapDocument() { @@ -110,20 +110,23 @@ public class ConfigTest extends AbstractPaceTest { final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); - System.out.println("mapDocument = " + Arrays.toString(((FieldValue) mapDocument.getFieldMap().get("topics")).doubleArrayValue())); + System.out + .println( + "mapDocument = " + + Arrays.toString(((FieldValue) mapDocument.getFieldMap().get("topics")).doubleArrayValue())); } - @Test - public void testJPath() { - final String json = readFromClasspath("organization.json"); + @Test + public void testJPath() { + final String json = readFromClasspath("organization.json"); - final String jpath ="$.id"; + final String jpath = "$.id"; - System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json)); - } + System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json)); + } - @Test + @Test public void clusteringCombinerTest() { DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json")); @@ -150,7 +153,7 @@ public class ConfigTest extends AbstractPaceTest { Collection strings = BlacklistAwareClusteringCombiner.filterAndCombine(mapDocument, dedupConf); - for (String s: strings) { + for (String s : strings) { System.out.println("s = " + s); } @@ -169,7 +172,7 @@ public class ConfigTest extends AbstractPaceTest { TreeNodeStats nodeStats = treeNode.evaluate(doc, doc, dedupConf); - assertTrue(nodeStats.getFinalScore(AggType.MAX)>0.7); + assertTrue(nodeStats.getFinalScore(AggType.MAX) > 0.7); } } diff --git 
a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java index a5c6d2729..41b24d8be 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java @@ -1,38 +1,41 @@ + package eu.dnetlib.pace.util; -import eu.dnetlib.pace.model.Person; -import org.junit.jupiter.api.*; +import static org.junit.jupiter.api.Assertions.assertEquals; import java.util.HashMap; import java.util.Map; -import static org.junit.jupiter.api.Assertions.assertEquals; + +import org.junit.jupiter.api.*; + +import eu.dnetlib.pace.model.Person; public class UtilTest { - static Map params; + static Map params; - @BeforeAll - public static void setUp(){ - params = new HashMap<>(); - } + @BeforeAll + public static void setUp() { + params = new HashMap<>(); + } - @Test - public void paceResolverTest() { - PaceResolver paceResolver = new PaceResolver(); - paceResolver.getComparator("keywordMatch", params); - } + @Test + public void paceResolverTest() { + PaceResolver paceResolver = new PaceResolver(); + paceResolver.getComparator("keywordMatch", params); + } - @Test - public void personTest() { - Person p = new Person("j. f. kennedy", false); + @Test + public void personTest() { + Person p = new Person("j. f. 
kennedy", false); - assertEquals("kennedy", p.getSurnameString()); - assertEquals("j f", p.getNameString()); + assertEquals("kennedy", p.getSurnameString()); + assertEquals("j f", p.getNameString()); - p = new Person("Guan-Hua Du", false); + p = new Person("Guan-Hua Du", false); - System.out.println("surname = " + p.getSurnameString()); - System.out.println("name = " + p.getNameString()); - } + System.out.println("surname = " + p.getSurnameString()); + System.out.println("name = " + p.getNameString()); + } } diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala index 3315fc41d..2cbd53097 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/SparkGenerateDoiBoost.scala @@ -284,4 +284,4 @@ object SparkGenerateDoiBoost { .save(s"$workingDirPath/doiBoostOrganization") } -} \ No newline at end of file +}