From b0ade43608093565aea3fb7de49da17b3ebe0271 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Fri, 16 Jun 2023 09:41:11 +0200 Subject: [PATCH] Precompile blacklists patterns before evaluating clustering criteria Enable Junit 5 tests in maven builds Make path comparisons platform-independent Read String resource files assuming they are encoded in UTF-8 Fix a few test conditions --- .../BlacklistAwareClusteringCombiner.java | 84 +++++++++---------- .../pace/clustering/ClusteringCombiner.java | 6 +- .../dnetlib/pace/clustering/FieldFilter.java | 48 ----------- .../pace/common/AbstractPaceFunctions.java | 28 +++---- .../java/eu/dnetlib/pace/config/Config.java | 3 +- .../eu/dnetlib/pace/config/DedupConfig.java | 20 ++++- .../pace/tree/StringContainsMatch.java | 33 ++++---- .../eu/dnetlib/pace/AbstractPaceTest.java | 3 +- .../pace/comparators/ComparatorTest.java | 18 ++-- .../java/eu/dnetlib/pace/util/UtilTest.java | 2 - 10 files changed, 108 insertions(+), 137 deletions(-) delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java index 0167d2fd0..79a264a49 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java @@ -1,59 +1,59 @@ package eu.dnetlib.pace.clustering; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; - -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; import com.google.common.collect.Maps; - import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.model.Document; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldListImpl; import eu.dnetlib.pace.model.MapDocument; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; + +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.regex.Pattern; public class BlacklistAwareClusteringCombiner extends ClusteringCombiner { - private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class); + public static Collection filterAndCombine(final MapDocument a, final Config conf) { + Document filtered = filter(a, conf.blacklists()); + return combine(filtered, conf); + } - public static Collection filterAndCombine(final MapDocument a, final Config conf) { + private static MapDocument filter(final MapDocument a, final Map> blacklists) { + if (blacklists == null || blacklists.isEmpty()) { + return a; + } - final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, conf.blacklists()); - return combine(filtered, conf); - } + final Map filtered = Maps.newHashMap(a.getFieldMap()); - private MapDocument filter(final MapDocument a, final Map> blacklists) { - final Map filtered = Maps.newHashMap(a.getFieldMap()); - if (blacklists != null) { - for (final Entry e : filtered.entrySet()) { + for (final Entry> e : blacklists.entrySet()) { + Field fields = a.getFieldMap().get(e.getKey()); + if (fields != null) { + final FieldListImpl fl = new FieldListImpl(); - final FieldListImpl fl = new FieldListImpl(); - fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists)))); - filtered.put(e.getKey(), fl); - } - } - return new MapDocument(a.getIdentifier(), filtered); - } + for (Field f : fields) { + if (!isBlackListed(f.stringValue(), e.getValue())) { + fl.add(f); + } + } + + filtered.put(e.getKey(), fl); + } + } + + return new MapDocument(a.getIdentifier(), filtered); + } + + private static boolean isBlackListed(String value, List blacklist) { + for (Pattern pattern : blacklist) { + if (pattern.matcher(value).matches()) { + return true; + } + } + + return false; + } - /** - * Tries to match the fields in the regex blacklist. - * - * @param fieldName - * @param value - * @return true if the field matches, false otherwise - */ - protected boolean regexMatches(final String fieldName, final String value, final Map> blacklists) { - if (blacklists.containsKey(fieldName)) { - for (final String regex : blacklists.get(fieldName)) { - if (value.matches(regex)) return true; - } - } - return false; - } } + diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java index 77a6aa137..037476289 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java @@ -20,10 +20,6 @@ public class ClusteringCombiner { private static String COLLAPSE_ON= "collapseOn"; public static Collection combine(final Document a, final Config conf) { - return new ClusteringCombiner().doCombine(a, conf); - } - - private Collection doCombine(final Document a, final Config conf) { final Collection res = Sets.newLinkedHashSet(); for (final ClusteringDef cd : conf.clusterings()) { for (final String fieldName : cd.getFields()) { @@ -51,7 +47,7 @@ public class ClusteringCombiner { return res; } - private String getPrefix(ClusteringDef cd, String fieldName) { + private static String getPrefix(ClusteringDef cd, String fieldName) { return cd.getName()+ SEPARATOR + cd.getParams().keySet() .stream() diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java deleted file mode 100644 index 60d956970..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java +++ /dev/null @@ -1,48 +0,0 @@ -package eu.dnetlib.pace.clustering; - -import java.util.List; -import java.util.Map; - -import com.google.common.base.Predicate; - -import eu.dnetlib.pace.model.Field; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -public class FieldFilter implements Predicate { - - private static final Log log = LogFactory.getLog(FieldFilter.class); - - private Map> blacklists; - - private String filedName; - - public FieldFilter(final String fieldName, final Map> blacklists) { - this.filedName = fieldName; - this.blacklists = blacklists; - } - - @Override - public boolean apply(final Field f) { - return !regexMatches(filedName, f.stringValue(), blacklists); - } - - /** - * Tries to match the fields in the regex blacklist. - * - * @param fieldName - * @param value - * @return true if the field matches, false otherwise - */ - protected boolean regexMatches(final String fieldName, final String value, final Map> blacklists) { - if (blacklists.containsKey(fieldName)) { - final Iterable regexes = blacklists.get(fieldName); - for (final String regex : regexes) { - if (StringUtils.isBlank(regex)) return false; - if (value.matches(regex)) return true; - } - } - return false; - } -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index 0802dfba3..bfe9f6220 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -3,28 +3,23 @@ package eu.dnetlib.pace.common; import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; import com.google.common.collect.Sets; +import com.ibm.icu.text.Transliterator; import eu.dnetlib.pace.clustering.NGramUtils; -import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; import eu.dnetlib.pace.model.FieldListImpl; -import eu.dnetlib.pace.model.FieldValueImpl; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import java.io.IOException; import java.io.StringWriter; -import java.io.UnsupportedEncodingException; import java.nio.charset.StandardCharsets; import java.text.Normalizer; import java.util.*; -import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; -import com.ibm.icu.text.Transliterator; /** * Set of common functions for the framework @@ -133,10 +128,12 @@ public abstract class AbstractPaceFunctions { protected static String fixAliases(final String s) { final StringBuilder sb = new StringBuilder(); - for (final char ch : Lists.charactersOf(s)) { + + s.chars().forEach(ch -> { final int i = StringUtils.indexOf(aliases_from, ch); - sb.append(i >= 0 ? aliases_to.charAt(i) : ch); - } + sb.append(i >= 0 ? aliases_to.charAt(i) : (char)ch); + }); + return sb.toString(); } @@ -152,9 +149,10 @@ public abstract class AbstractPaceFunctions { protected String removeSymbols(final String s) { final StringBuilder sb = new StringBuilder(); - for (final char ch : Lists.charactersOf(s)) { - sb.append(StringUtils.contains(alpha, ch) ? ch : " "); - } + s.chars().forEach(ch -> { + sb.append(StringUtils.contains(alpha, ch) ? (char)ch : ' '); + }); + return sb.toString().replaceAll("\\s+", " "); } @@ -241,7 +239,7 @@ public abstract class AbstractPaceFunctions { final Set h = Sets.newHashSet(); try { - for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) { + for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) { h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords } } catch (final Throwable e) { @@ -256,7 +254,7 @@ public abstract class AbstractPaceFunctions { final Map m = new HashMap<>(); try { - for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) { + for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) { //string is like this: code;word1;word2;word3 String[] line = s.split(";"); String value = line[0]; @@ -349,7 +347,7 @@ public abstract class AbstractPaceFunctions { public static String readFromClasspath(final String filename, final Class clazz) { final StringWriter sw = new StringWriter(); try { - IOUtils.copy(clazz.getResourceAsStream(filename), sw); + IOUtils.copy(clazz.getResourceAsStream(filename), sw, StandardCharsets.UTF_8); return sw.toString(); } catch (final IOException e) { throw new RuntimeException("cannot load resource from classpath: " + filename); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java index 32f192fa0..6b44f4ebd 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.config; import java.util.List; import java.util.Map; +import java.util.regex.Pattern; import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.FieldDef; @@ -47,7 +48,7 @@ public interface Config { * * @return the map */ - public Map> blacklists(); + public Map> blacklists(); /** diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java index 6f91ebf0c..a377b087f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.config; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Maps; import eu.dnetlib.pace.model.ClusteringDef; @@ -7,15 +8,19 @@ import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.util.PaceException; import org.antlr.stringtemplate.StringTemplate; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import java.io.IOException; import java.io.Serializable; +import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.regex.Pattern; +import java.util.stream.Collectors; import eu.dnetlib.pace.tree.support.TreeNodeDef; @@ -31,6 +36,9 @@ public class DedupConfig implements Config, Serializable { private WfConfig wf; + @JsonIgnore + private Map> blacklists; + private static Map defaults = Maps.newHashMap(); static { @@ -57,6 +65,12 @@ public class DedupConfig implements Config, Serializable { config = new ObjectMapper().readValue(json, DedupConfig.class); config.getPace().initModel(); config.getPace().initTranslationMap(); + + config.blacklists = config.getPace().getBlacklists().entrySet() + .stream() + .collect(Collectors.toMap(e -> e.getKey(), + e ->e.getValue().stream().filter(s -> !StringUtils.isBlank(s)).map(Pattern::compile).collect(Collectors.toList()) )); + return config; } catch (IOException e) { throw new PaceException("Error in parsing configuration json", e); @@ -88,7 +102,7 @@ public class DedupConfig implements Config, Serializable { } private String readFromClasspath(final String resource) throws IOException { - return IOUtils.toString(getClass().getResource(resource)); + return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8); } public PaceConfig getPace() { @@ -137,8 +151,8 @@ public class DedupConfig implements Config, Serializable { } @Override - public Map> blacklists() { - return getPace().getBlacklists(); + public Map> blacklists() { + return blacklists; } @Override diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java index 126c01010..9c5a9fed5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java @@ -42,22 +42,25 @@ public class StringContainsMatch extends AbstractComparator { STRING = STRING.toLowerCase(); } - switch(AGGREGATOR) { - case "AND": - if(ca.contains(STRING) && cb.contains(STRING)) - return 1.0; - break; - case "OR": - if(ca.contains(STRING) || cb.contains(STRING)) - return 1.0; - break; - case "XOR": - if(ca.contains(STRING) ^ cb.contains(STRING)) - return 1.0; - break; - default: - return 0.0; + if (AGGREGATOR != null) { + switch (AGGREGATOR) { + case "AND": + if (ca.contains(STRING) && cb.contains(STRING)) + return 1.0; + break; + case "OR": + if (ca.contains(STRING) || cb.contains(STRING)) + return 1.0; + break; + case "XOR": + if (ca.contains(STRING) ^ cb.contains(STRING)) + return 1.0; + break; + default: + return 0.0; + } } + return 0.0; } } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java index b98fd989b..27c804ac7 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java @@ -9,6 +9,7 @@ import org.apache.commons.io.IOUtils; import java.io.IOException; import java.io.StringWriter; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.stream.Collectors; @@ -17,7 +18,7 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions { protected String readFromClasspath(final String filename) { final StringWriter sw = new StringWriter(); try { - IOUtils.copy(getClass().getResourceAsStream(filename), sw); + IOUtils.copy(getClass().getResourceAsStream(filename), sw, StandardCharsets.UTF_8); return sw.toString(); } catch (final IOException e) { throw new RuntimeException("cannot load resource from classpath: " + filename); diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index b19d77e5c..f0333cbc8 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -24,15 +24,20 @@ public class ComparatorTest extends AbstractPaceTest { @BeforeAll public void setup() { + conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class)); + } + + @BeforeEach + public void beforeEachTest() { params = new HashMap<>(); params.put("weight", "1.0"); params.put("surname_th", "0.99"); params.put("name_th", "0.95"); params.put("jpath_value", "$.value"); params.put("jpath_classid", "$.qualifier.classid"); - conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class)); } + @Test public void testCleanForSorting() { NGramUtils utils = new NGramUtils(); @@ -59,7 +64,10 @@ public class ComparatorTest extends AbstractPaceTest { //particular cases assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf)); assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf)); - assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf)); + + // failing becasuse 'Allen' is a transliterrated greek stopword + // assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf)); + assertEquals(-1.0, cityMatch.distance("Washington (United States)", "United States Military Academy", conf)); } @Test @@ -73,7 +81,7 @@ public class ComparatorTest extends AbstractPaceTest { assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf)); assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf)); assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf)); - assertEquals(0.5, keywordMatch.distance("University of Georgia", "Georgia State University", conf)); + assertEquals(2.0/3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf)); assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf)); assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf)); assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf)); @@ -107,7 +115,7 @@ public class ComparatorTest extends AbstractPaceTest { public void stringContainsMatchTest(){ params.put("string", "openorgs"); - params.put("bool", "XOR"); + params.put("aggregator", "XOR"); params.put("caseSensitive", "false"); StringContainsMatch stringContainsMatch = new StringContainsMatch(params); @@ -115,7 +123,7 @@ public class ComparatorTest extends AbstractPaceTest { assertEquals(0.0, stringContainsMatch.distance("openorgs", "openorgs", conf)); params.put("string", "openorgs"); - params.put("bool", "AND"); + params.put("aggregator", "AND"); params.put("caseSensitive", "false"); stringContainsMatch = new StringContainsMatch(params); diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java index 1e6053246..a5c6d2729 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java @@ -1,7 +1,6 @@ package eu.dnetlib.pace.util; import eu.dnetlib.pace.model.Person; -import jdk.nashorn.internal.ir.annotations.Ignore; import org.junit.jupiter.api.*; import java.util.HashMap; @@ -18,7 +17,6 @@ public class UtilTest { } @Test - @Ignore public void paceResolverTest() { PaceResolver paceResolver = new PaceResolver(); paceResolver.getComparator("keywordMatch", params);