From 84aaa655017149ea2bcbfd1330b2fe2a60ac46e9 Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 17 Dec 2019 09:16:26 +0100 Subject: [PATCH] implementation of new json comparator and update of the publication configuration --- .../eu/dnetlib/pace/model/adaptor/Pid.java | 57 --- .../pace/model/adaptor/PidOafSerialiser.java | 50 --- .../eu/dnetlib/pace/tree/JsonListMatch.java | 70 ++++ .../java/eu/dnetlib/pace/tree/PidMatch.java | 64 --- .../eu/dnetlib/pace/tree/StringListMatch.java | 47 +++ .../eu/dnetlib/pace/util/MapDocumentUtil.java | 4 +- .../clustering/ClusteringFunctionTest.java | 11 +- .../pace/comparators/ComparatorTest.java | 7 +- .../eu/dnetlib/pace/config/ConfigTest.java | 39 +- .../PersonComparatorUtilsNGramsTest.java | 126 ------ .../PersonComparatorUtilsSimilarityTest.java | 89 ---- .../eu/dnetlib/pace/model/PersonTest.java | 111 ----- .../eu/dnetlib/pace/clustering/gt.author.json | 1 - ...nt.conf => organization.current.conf.json} | 81 +++- ...onf => organization.no_synonyms.conf.json} | 10 +- .../pace/config/publication.current.conf.json | 387 ++++++++++++++++++ .../config/{result.json => publication.json} | 0 .../pace/config/result.pace.conf_jpath.json | 48 --- .../eu/dnetlib/pace/config/result.test.conf | 51 --- 19 files changed, 606 insertions(+), 647 deletions(-) delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/Pid.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/PidOafSerialiser.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java delete mode 100644 dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsNGramsTest.java delete mode 100644 dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsSimilarityTest.java delete mode 100644 dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonTest.java delete mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/clustering/gt.author.json rename dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/{organization.current.conf => organization.current.conf.json} (91%) rename dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/{organization.no_synonyms.conf => organization.no_synonyms.conf.json} (82%) create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json rename dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/{result.json => publication.json} (100%) delete mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json delete mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.test.conf diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/Pid.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/Pid.java deleted file mode 100644 index 3dd70f7a3..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/Pid.java +++ /dev/null @@ -1,57 +0,0 @@ -package eu.dnetlib.pace.model.adaptor; - -import java.util.List; - -import com.google.common.base.Function; -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; -import com.google.common.reflect.TypeToken; -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -/** - * Created by claudio on 01/03/16. - */ -public class Pid { - - private static final Log log = LogFactory.getLog(Pid.class); - - private String value; - - private String type; - - public static List fromOafJson(final List json) { - - log.debug(String.format("\nPid: %s", json)); - - final GsonBuilder gb = new GsonBuilder(); - gb.registerTypeAdapter(Pid.class, new PidOafSerialiser()); - final Gson gson = gb.create(); - - return Lists.newArrayList(Iterables.transform(json, new Function() { - @Override - public Pid apply(final String s) { - return gson.fromJson(s, Pid.class); - } - })); - } - - public String getType() { - return type; - } - - public void setType(final String type) { - this.type = type; - } - - public String getValue() { - return value; - } - - public void setValue(final String value) { - this.value = value; - } - -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/PidOafSerialiser.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/PidOafSerialiser.java deleted file mode 100644 index 8acaee673..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/PidOafSerialiser.java +++ /dev/null @@ -1,50 +0,0 @@ -package eu.dnetlib.pace.model.adaptor; - -import java.lang.reflect.Type; -import java.util.List; - -import com.google.common.collect.Lists; -import com.google.gson.*; -import eu.dnetlib.pace.model.gt.GTAuthor; - -/** - * Created by claudio on 01/03/16. - */ -public class PidOafSerialiser implements JsonDeserializer { - - private static final String VALUE = "value"; - - private static final String QUALIFIER = "qualifier"; - private static final String CLASSID = "classid"; - - @Override - public Pid deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException { - - final Pid pid = new Pid(); - - pid.setType(getType(json)); - pid.setValue(getValue(json)); - - return pid; - } - - private String getValue(final JsonElement json) { - final JsonObject obj =json.getAsJsonObject(); - return obj.get(VALUE).getAsString(); - - } - - private String getType(final JsonElement json) { - - final JsonObject obj =json.getAsJsonObject(); - - if (!obj.has(QUALIFIER)) - throw new IllegalArgumentException("pid does not contain any type: " + json.toString()); - - final JsonObject qualifier = obj.getAsJsonObject(QUALIFIER); - - final JsonElement classid = qualifier.get(CLASSID); - - return classid.getAsString(); - } -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java new file mode 100644 index 000000000..166151db0 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java @@ -0,0 +1,70 @@ +package eu.dnetlib.pace.tree; + +import com.google.common.collect.Sets; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +@ComparatorClass("jsonListMatch") +public class JsonListMatch extends AbstractComparator { + + private static final Log log = LogFactory.getLog(JsonListMatch.class); + private Map params; + + public JsonListMatch(final Map params) { + super(params); + this.params = params; + } + + @Override + public double compare(final Field a, final Field b, final Config conf) { + + final List sa = ((FieldList) a).stringList(); + final List sb = ((FieldList) b).stringList(); + + if (sa.isEmpty() || sb.isEmpty()) { + return -1; + } + + final Set ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet()); + final Set cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet()); + + int incommon = Sets.intersection(ca, cb).size(); + int simDiff = Sets.symmetricDifference(ca, cb).size(); + + if (incommon + simDiff == 0) { + return 0.0; + } + + return (double)incommon / (incommon + simDiff) > Double.parseDouble(params.getOrDefault("threshold", "0.5")) ? 1 : 0; + + } + + //converts every json into a comparable string basing on parameters + private String toComparableString(String json){ + + StringBuilder st = new StringBuilder(); //to build the string used for comparisons basing on the jpath into parameters + + //for each path in the param list + for (String key: params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) { + String path = params.get(key); + String value = MapDocumentUtil.getJPathString(path, json); + if (value == null || value.isEmpty()) + value = ""; + st.append( value + "::"); + } + + st.setLength(st.length()-2); + return st.toString(); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java deleted file mode 100644 index 0632e8bf9..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/PidMatch.java +++ /dev/null @@ -1,64 +0,0 @@ -package eu.dnetlib.pace.tree; - -import com.google.common.collect.Sets; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldList; -import eu.dnetlib.pace.model.adaptor.Pid; -import eu.dnetlib.pace.tree.support.AbstractComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -@ComparatorClass("pidMatch") -public class PidMatch extends AbstractComparator { - - private static final Log log = LogFactory.getLog(PidMatch.class); - private Map params; - - public PidMatch(final Map params) { - super(params); - this.params = params; - } - - @Override - public double compare(final Field a, final Field b, final Config conf) { - - final List sa = ((FieldList) a).stringList(); - final List sb = ((FieldList) b).stringList(); - - final List pal = Pid.fromOafJson(sa); - final List pbl = Pid.fromOafJson(sb); - - if (pal.isEmpty() || pbl.isEmpty()) { - return -1; - } - - final Set pidAset = toHashSet(pal); - final Set pidBset = toHashSet(pbl); - - int incommon = Sets.intersection(pidAset, pidBset).size(); - int simDiff = Sets.symmetricDifference(pidAset, pidBset).size(); - - if (incommon + simDiff == 0) { - return 0.0; - } - - return (double)incommon / (incommon + simDiff) > Double.parseDouble(params.getOrDefault("threshold", "0.5")) ? 1 : 0; - - } - - //lowercase + normalization of the pid before adding it to the set - private Set toHashSet(List pbl) { - - return pbl.stream() - .map(pid -> pid.getType() + normalizePid(pid.getValue())) - .collect(Collectors.toCollection(HashSet::new)); - } -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java new file mode 100644 index 000000000..f9b53d396 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java @@ -0,0 +1,47 @@ +package eu.dnetlib.pace.tree; + +import com.google.common.collect.Sets; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.tree.support.AbstractComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +@ComparatorClass("stringListMatch") +public class StringListMatch extends AbstractComparator { + + private static final Log log = LogFactory.getLog(StringListMatch.class); + private Map params; + + public StringListMatch(final Map params) { + super(params); + this.params = params; + } + + @Override + public double compare(final Field a, final Field b, final Config conf) { + + final Set pa = new HashSet<>(((FieldList) a).stringList()); + final Set pb = new HashSet<>(((FieldList) b).stringList()); + + if (pa.isEmpty() || pb.isEmpty()) { + return -1; //return undefined if one of the two lists of pids is empty + } + + int incommon = Sets.intersection(pa, pb).size(); + int simDiff = Sets.symmetricDifference(pa, pb).size(); + + if (incommon + simDiff == 0) { + return 0.0; + } + + return (double)incommon / (incommon + simDiff) > Double.parseDouble(params.getOrDefault("threshold", "0.5")) ? 1 : 0; + + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java index 20141738c..c55e13d36 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java @@ -2,7 +2,9 @@ package eu.dnetlib.pace.util; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.Configuration; import com.jayway.jsonpath.JsonPath; +import com.jayway.jsonpath.Option; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.model.Field; @@ -55,7 +57,7 @@ public class MapDocumentUtil { public static List getJPathList(String path, String json, Type type) { if (type == Type.List) - return JsonPath.read(json, path); + return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path); Object jresult; List result = new ArrayList<>(); try { diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index ff7a49fbe..2bccdd5cb 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -18,7 +18,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @Before public void setUp() throws Exception { params = Maps.newHashMap(); - conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ClusteringFunctionTest.class)); + conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ClusteringFunctionTest.class)); } @Test @@ -110,15 +110,6 @@ public class ClusteringFunctionTest extends AbstractPaceTest { System.out.println(sp.apply(conf, Lists.newArrayList(title(s)))); } - @Test - public void testPersonClustering2() { - final ClusteringFunction cf = new PersonClustering(params); - - final String s = readFromClasspath("gt.author.json"); - System.out.println(s); - System.out.println(cf.apply(conf, Lists.newArrayList(person(s)))); - } - @Test public void testKeywordsClustering() { diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 905fac4bd..9bd3a44a8 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -4,7 +4,6 @@ import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.tree.*; import eu.dnetlib.pace.config.DedupConfig; import org.junit.Before; -import org.junit.Ignore; import org.junit.Test; import eu.dnetlib.pace.common.AbstractPaceFunctions; @@ -24,7 +23,7 @@ public class ComparatorTest extends AbstractPaceFunctions { public void setup() { params = new HashMap<>(); params.put("weight", "1.0"); - conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ComparatorTest.class)); + conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class)); } @@ -115,5 +114,9 @@ public class ComparatorTest extends AbstractPaceFunctions { System.out.println("result = " + result); } + @Test + public void jsonListMatchTest() { + + } } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java index cd553baad..09d7c0b9b 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -2,11 +2,15 @@ package eu.dnetlib.pace.config; import eu.dnetlib.pace.AbstractPaceTest; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.tree.JsonListMatch; import eu.dnetlib.pace.util.MapDocumentUtil; -import org.apache.commons.io.IOUtils; import org.junit.Test; +import java.util.HashMap; +import java.util.List; import java.util.Map; import static org.junit.Assert.assertEquals; @@ -16,7 +20,7 @@ public class ConfigTest extends AbstractPaceTest { @Test public void dedupConfigSerializationTest() { - final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf")); + final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf.json")); final String conf = cfgFromClasspath.toString(); @@ -26,13 +30,12 @@ public class ConfigTest extends AbstractPaceTest { assertNotNull(cfgFromClasspath); assertNotNull(cfgFromSerialization); - } @Test public void dedupConfigTest() { - DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf")); + DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf.json")); System.out.println(load.toString()); } @@ -40,7 +43,7 @@ public class ConfigTest extends AbstractPaceTest { @Test public void initTranslationMapTest() { - DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf")); + DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf.json")); Map translationMap = load.translationMap(); @@ -50,38 +53,26 @@ public class ConfigTest extends AbstractPaceTest { if (translationMap.get(key).equals("key::1")) System.out.println("key = " + key); } - } @Test public void emptyTranslationMapTest() { - DedupConfig load = DedupConfig.load(readFromClasspath("organization.no_synonyms.conf")); + DedupConfig load = DedupConfig.load(readFromClasspath("organization.no_synonyms.conf.json")); assertEquals(0, load.getPace().translationMap().keySet().size()); } - - @Test - public void testAsMapDocumentJPath() throws Exception { - - DedupConfig load = DedupConfig.load(readFromClasspath("result.pace.conf_jpath.json")); - - - System.out.println(load.getWf().getIdPath()); - - final String result =IOUtils.toString(this.getClass().getResourceAsStream("result.json")); - - System.out.println(result); - final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(load, result); - - System.out.println(mapDocument.getFieldMap()); - - } + public void asMapDocumentTest() throws Exception { + DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json")); + final String json = readFromClasspath("publication.json"); + final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json); + System.out.println("mapDocument = " + mapDocument.getFieldMap()); + } } diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsNGramsTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsNGramsTest.java deleted file mode 100644 index b78866c6c..000000000 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsNGramsTest.java +++ /dev/null @@ -1,126 +0,0 @@ -package eu.dnetlib.pace.model; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.util.Set; - -import org.junit.Test; - -public class PersonComparatorUtilsNGramsTest { - - @Test - public void testNormaizePerson_1() { - verifyGetNgramsForPerson("Artini Michele", 2, "a_michele", "m_artini"); - } - - @Test - public void testNormaizePerson_2() { - verifyGetNgramsForPerson("Michele Artini", 2, "a_michele", "m_artini"); - } - - @Test - public void testNormaizePerson_3() { - verifyGetNgramsForPerson("Michele ARTINI", 1, "m_artini"); - } - - @Test - public void testNormaizePerson_4() { - verifyGetNgramsForPerson("ARTINI Michele", 1, "m_artini"); - } - - @Test - public void testNormaizePerson_5() { - verifyGetNgramsForPerson("Michele G. Artini", 2, "m_artini", "g_artini"); - } - - @Test - public void testNormaizePerson_6() { - verifyGetNgramsForPerson(" Artini, Michele ", 1, "m_artini"); - } - - @Test - public void testNormaizePerson_7() { - verifyGetNgramsForPerson("Artini, Michele (sig.)", 1, "m_artini"); - } - - @Test - public void testNormaizePerson_8() { - verifyGetNgramsForPerson("Artini Michele [sig.] ", 2, "a_michele", "m_artini"); - } - - @Test - public void testNormaizePerson_9() { - verifyGetNgramsForPerson("Artini, M", 1, "m_artini"); - } - - @Test - public void testNormaizePerson_10() { - verifyGetNgramsForPerson("Artini, M.", 1, "m_artini"); - } - - @Test - public void testNormaizePerson_11() { - verifyGetNgramsForPerson("Artini, M. (sig.)", 1, "m_artini"); - } - - @Test - public void testNormaizePerson_12() { - verifyGetNgramsForPerson("Artini, M[sig.] ", 1, "m_artini"); - } - - @Test - public void testNormaizePerson_13() { - verifyGetNgramsForPerson("Artini-SIG, Michele ", 1, "m_artini-sig"); - } - - @Test - public void testNormaizePerson_14() { - verifyGetNgramsForPerson("Artini - SIG, Michele ", 1, "m_artini-sig"); - } - - @Test - public void testNormaizePerson_15() { - verifyGetNgramsForPerson("Artini {sig.}, M", 1, "m_artini"); - } - - @Test - public void testNormaizePerson_16() { - verifyGetNgramsForPerson("Artini, M., sig.", 1, "m_artini"); - } - - @Test - public void testNormaizePerson_17() { - verifyGetNgramsForPerson("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA, BBBBBBBBBBBBBBBBBBBBBBBBBBBBB CCCCCCCCCCCCCCCCCCCC", 0); - } - - @Test - public void testNormaizePerson_18() { - verifyGetNgramsForPerson("Dell'amico, Andrea", 1, "a_amico"); - } - - @Test - public void testNormaizePerson_19() { - verifyGetNgramsForPerson("Smith, Paul van der", 1, "p_smith"); - } - - @Test - public void testNormaizePerson_20() { - verifyGetNgramsForPerson("AAAAAAA, BBBB, CCCC, DDDD, EEEE", 1, "b_aaaaaaa"); - } - - @Test - public void testNormaizePerson_21() { - verifyGetNgramsForPerson("Kompetenzzentrum Informelle Bildung (KIB),", 6); - } - - private void verifyGetNgramsForPerson(String name, int expectedSize, String... expectedTokens) { - Set list = PersonComparatorUtils.getNgramsForPerson(name); - System.out.println(list); - assertEquals(expectedSize, list.size()); - for (String s : expectedTokens) { - assertTrue(list.contains(s)); - } - } - -} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsSimilarityTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsSimilarityTest.java deleted file mode 100644 index 20da8db87..000000000 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsSimilarityTest.java +++ /dev/null @@ -1,89 +0,0 @@ -package eu.dnetlib.pace.model; - -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -import org.junit.Test; - -public class PersonComparatorUtilsSimilarityTest { - - @Test - public void testSimilarity_0() { - assertTrue(PersonComparatorUtils.areSimilar("Artini Michele", "Michele Artini")); - } - - @Test - public void testSimilarity_1() { - assertTrue(PersonComparatorUtils.areSimilar("ARTINI Michele", "Artini, Michele")); - } - - @Test - public void testSimilarity_2() { - assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Artini Michele")); - } - - @Test - public void testSimilarity_3() { - assertTrue(PersonComparatorUtils.areSimilar("Artini, M.G.", "Artini, Michele")); - } - - @Test - public void testSimilarity_4() { - assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, M.G.")); - } - - @Test - public void testSimilarity_5() { - assertTrue(PersonComparatorUtils.areSimilar("Artini, M. (sig.)", "Artini, Michele")); - } - - @Test - public void testSimilarity_6() { - assertFalse(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, G.")); - } - - @Test - public void testSimilarity_7() { - assertFalse(PersonComparatorUtils.areSimilar("Artini, M.G.", "Artini, M.A.")); - } - - @Test - public void testSimilarity_8() { - assertFalse(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, Giuseppe")); - } - - @Test - public void testSimilarity_9() { - assertFalse(PersonComparatorUtils.areSimilar("Manghi, Paolo", "Artini, Michele")); - } - - @Test - public void testSimilarity_10() { - assertTrue(PersonComparatorUtils.areSimilar("Artini, Michele", "Artini, Michele Giovanni")); - } - - @Test - public void testSimilarity_11() { - assertFalse(PersonComparatorUtils.areSimilar("Artini, M.A.G.", "Artini, M.B.G.")); - } - - @Test - public void testSimilarity_12() { - assertFalse(PersonComparatorUtils.areSimilar("Artini Manghi, M.", "Artini, Michele")); - } - - @Test - public void testSimilarity_13() { - assertTrue(PersonComparatorUtils.areSimilar("Artini Manghi, M.", "Artini Manghi Michele")); - } - - @Test - public void testSimilarity_14() { - assertFalse(PersonComparatorUtils.areSimilar("Artini, Michele", "Michele, Artini")); - } - - @Test - public void testSimilarity_15() { - assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Michele ARTINI")); - } -} \ No newline at end of file diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonTest.java deleted file mode 100644 index a457fd8de..000000000 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonTest.java +++ /dev/null @@ -1,111 +0,0 @@ -package eu.dnetlib.pace.model; - -import static org.junit.Assert.assertEquals; - -import java.text.Normalizer; -import java.util.Queue; - -import org.junit.Test; - -import com.google.common.collect.Lists; - -public class PersonTest { - - @Test - public void test_1() { - check("Atzori, Claudio", "Atzori, Claudio"); - } - - @Test - public void test_2() { - check("Atzori, Claudio A.", "Atzori, Claudio A."); - } - - @Test - public void test_3() { - check("Claudio ATZORI", "Atzori, Claudio"); - } - - @Test - public void test_4() { - check("ATZORI, Claudio", "Atzori, Claudio"); - } - - @Test - public void test_5() { - check("Claudio Atzori", "Claudio Atzori"); - } - - @Test - public void test_6() { - check(" Manghi , Paolo", "Manghi, Paolo"); - } - - @Test - public void test_7() { - check("ATZORI, CLAUDIO", "Atzori, Claudio"); - } - - @Test - public void test_8() { - check("ATZORI, CLAUDIO A", "Atzori, Claudio A."); - } - - @Test - public void test_9() { - check("Bølviken, B.", "Bølviken, B."); - } - - @Test - public void test_10() { - check("Bñlviken, B.", "B" + Normalizer.normalize("ñ", Normalizer.Form.NFD) + "lviken, B."); - } - - @Test - public void test_11() { - check("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰ ø", "Aaeeiioooouuuu, Aaeeiioooouuuu Ø.", true); - } - - @Test - public void test_12() { - check("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰz ø", Normalizer.normalize("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰz ø", Normalizer.Form.NFD), false); - } - - @Test - public void test_13() { - check("Tkačíková, Daniela", Normalizer.normalize("Tkačíková, Daniela", Normalizer.Form.NFD), false); - } - - @Test - public void test_hashes() { - checkHash(" Claudio ATZORI ", "ATZORI Claudio", "Atzori , Claudio", "ATZORI, Claudio"); - } - - private void checkHash(String... ss) { - Queue q = Lists.newLinkedList(Lists.newArrayList(ss)); - String h1 = new Person(q.remove(), false).hash(); - while (!q.isEmpty()) { - assertEquals(h1, new Person(q.remove(), false).hash()); - } - } - - private void check(String s, String expectedFullName) { - check(s, expectedFullName, false); - } - - private void check(String s, String expectedFullName, boolean aggressive) { - Person p = new Person(s, aggressive); - - System.out.println("original: " + p.getOriginal()); - System.out.println("accurate: " + p.isAccurate()); - System.out.println("normalised: '" + p.getNormalisedFullname() + "'"); - if (p.isAccurate()) { - System.out.println("name: " + p.getNormalisedFirstName()); - System.out.println("surname: " + p.getNormalisedSurname()); - } - System.out.println("hash: " + p.hash()); - System.out.println(""); - assertEquals(expectedFullName, p.getNormalisedFullname()); - } - -} diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/clustering/gt.author.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/clustering/gt.author.json deleted file mode 100644 index d7fbf2166..000000000 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/clustering/gt.author.json +++ /dev/null @@ -1 +0,0 @@ -{"metadata": {"firstname": {"value": "Margaret R."},"secondnames": [{"value": "Macdonald"}],"fullname": {"value": "Macdonald, Margaret R."}},"coauthor": [{"id": "30|od________88::1d22c2a22d7a1c7082006154ae6dd221","anchorId": "30|dedup_wf_001::7b1cfb3c4ec57d71cf331ba669a8e12c","metadata": {"firstname": {"value": "Maria Teresa"},"secondnames": [{"value": "Catanese"}],"fullname": {"value": "Catanese, Maria Teresa"}}},{"id": "30|od________88::2299c043fcaa751e266c82ec24b5a6cf","anchorId": "30|dedup_wf_001::ce73dc26c95e27d22f88e9ed9948b322","metadata": {"firstname": {"value": "Thomas S."},"secondnames": [{"value": "Oh"}],"fullname": {"value": "Oh, Thomas S."}}},{"id": "30|od_______908::52d670e6298c055c6c9c496aad4f2913","anchorId": "30|dedup_wf_001::8e1fafd9778a4cb5569830c299e5b52e","metadata": {"firstname": {"value": "Salman R."},"secondnames": [{"value": "Khetani"}],"fullname": {"value": "Khetani, Salman R."}}},{"id": "30|od________88::1458ae8d3663574e53dcd849ff8aa27d","anchorId": "30|dedup_wf_001::dd9f1dce92f402424de0d7d8afd7ca2d","metadata": {"firstname": {"value": "Sangeeta N."},"secondnames": [{"value": "Bhatia"}],"fullname": {"value": "Bhatia, Sangeeta N."}}},{"id": "30|od________88::837b992599e35b1a9baed833bf9a216e","anchorId": "30|dedup_wf_001::acb87ae171fd37f0ad65bcb728b11064","metadata": {"firstname": {"value": "Andrew J."},"secondnames": [{"value": "Syder"}],"fullname": {"value": "Syder, Andrew J."}}},{"id": "30|od_______908::2299c043fcaa751e266c82ec24b5a6cf","anchorId": "30|dedup_wf_001::ce73dc26c95e27d22f88e9ed9948b322","metadata": {"firstname": {"value": "Thomas S."},"secondnames": [{"value": "Oh"}],"fullname": {"value": "Oh, Thomas S."}}},{"id": "30|od_______908::97e1b5f96f76500dfd9e10ee0de5d380","anchorId": "30|dedup_wf_001::da35eb52feb1b1a789861976342b2570","metadata": {"firstname": {"value": "John W."},"secondnames": [{"value": "Schoggins"}],"fullname": {"value": "Schoggins, John W."}}},{"id": "30|od________88::97e1b5f96f76500dfd9e10ee0de5d380","anchorId": "30|dedup_wf_001::da35eb52feb1b1a789861976342b2570","metadata": {"firstname": {"value": "John W."},"secondnames": [{"value": "Schoggins"}],"fullname": {"value": "Schoggins, John W."}}},{"id": "30|od_______908::5bd4cd7e4065ffd73f39817e2a1bb1ae","anchorId": "30|dedup_wf_001::8ea4c1052c6a7aa1bb2b1097cb3893d2","metadata": {"firstname": {"value": "Lok Man J."},"secondnames": [{"value": "Law"}],"fullname": {"value": "Law, Lok Man J."}}},{"id": "30|od________88::845fd19e1e7201fcd1c492775f04a56b","anchorId": "30|dedup_wf_001::4e971919118e71ea2b2ac840ca319956","metadata": {"firstname": {"value": "Alexander"},"secondnames": [{"value": "Ploss"}],"fullname": {"value": "Ploss, Alexander"}}},{"id": "30|od_______908::7b6a37259ff32dba0e7ea884b8446228","anchorId": "30|dedup_wf_001::a600d9103b7947b1c52f823f8e4833cc","metadata": {"firstname": {"value": "Christopher T."},"secondnames": [{"value": "Jones"}],"fullname": {"value": "Jones, Christopher T."}}},{"id": "30|od________88::5bd4cd7e4065ffd73f39817e2a1bb1ae","anchorId": "30|dedup_wf_001::8ea4c1052c6a7aa1bb2b1097cb3893d2","metadata": {"firstname": {"value": "Lok Man J."},"secondnames": [{"value": "Law"}],"fullname": {"value": "Law, Lok Man J."}}},{"id": "30|od_______908::1d22c2a22d7a1c7082006154ae6dd221","anchorId": "30|dedup_wf_001::7b1cfb3c4ec57d71cf331ba669a8e12c","metadata": {"firstname": {"value": "Maria Teresa"},"secondnames": [{"value": "Catanese"}],"fullname": {"value": "Catanese, Maria Teresa"}}},{"id": "30|od________88::52d670e6298c055c6c9c496aad4f2913","anchorId": "30|dedup_wf_001::8e1fafd9778a4cb5569830c299e5b52e","metadata": {"firstname": {"value": "Salman R."},"secondnames": [{"value": "Khetani"}],"fullname": {"value": "Khetani, Salman R."}}},{"id": "30|od_______908::46acd9f206c2559f13b9119f8c5aef4c","anchorId": "30|dedup_wf_001::06a55cf2c97156d48ec49bcaf4bddcaf","metadata": {"firstname": {"value": "Stephen P."},"secondnames": [{"value": "Goff"}],"fullname": {"value": "Goff, Stephen P."}}},{"id": "30|od________88::7b6a37259ff32dba0e7ea884b8446228","anchorId": "30|dedup_wf_001::a600d9103b7947b1c52f823f8e4833cc","metadata": {"firstname": {"value": "Christopher T."},"secondnames": [{"value": "Jones"}],"fullname": {"value": "Jones, Christopher T."}}},{"id": "30|od_______908::1458ae8d3663574e53dcd849ff8aa27d","anchorId": "30|dedup_wf_001::dd9f1dce92f402424de0d7d8afd7ca2d","metadata": {"firstname": {"value": "Sangeeta N."},"secondnames": [{"value": "Bhatia"}],"fullname": {"value": "Bhatia, Sangeeta N."}}},{"id": "30|od_______908::845fd19e1e7201fcd1c492775f04a56b","anchorId": "30|dedup_wf_001::4e971919118e71ea2b2ac840ca319956","metadata": {"firstname": {"value": "Alexander"},"secondnames": [{"value": "Ploss"}],"fullname": {"value": "Ploss, Alexander"}}},{"id": "30|od_______908::837b992599e35b1a9baed833bf9a216e","anchorId": "30|dedup_wf_001::acb87ae171fd37f0ad65bcb728b11064","metadata": {"firstname": {"value": "Andrew J."},"secondnames": [{"value": "Syder"}],"fullname": {"value": "Syder, Andrew J."}}}],"mergedperson": [{"id": "30|od_______908::715fc4c41052a5b8ce881b23b826f648","metadata": {"firstname": {"value": "Margaret R."},"secondnames": [{"value": "Macdonald"}],"fullname": {"value": "Macdonald, Margaret R."}}},{"id": "30|od________88::715fc4c41052a5b8ce881b23b826f648","metadata": {"firstname": {"value": "Margaret R."},"secondnames": [{"value": "Macdonald"}],"fullname": {"value": "Macdonald, Margaret R."}}}],"anchor": true} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json similarity index 91% rename from dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf rename to dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json index f92655f3f..a16ade3d0 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.current.conf.json @@ -18,16 +18,81 @@ { "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } ], "decisionTree" : { - "start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"}, - "layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"}, - "layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"} + "start": { + "fields": [ + { + "field": "gridid", + "comparator": "exactMatch", + "weight": 1.0, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 1.0, + "aggregation": "MAX", + "positive": "MATCH", + "negative": "layer2", + "undefined": "layer2", + "ignoreUndefined": "true" + }, + "layer2": { + "fields": [ + { + "field": "websiteurl", + "comparator": "domainExactMatch", + "weight": 1.0, + "countIfUndefined": "true", + "params": {} + }, + { + "field": "country", + "comparator": "exactMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 1.0, + "aggregation": "MIN", + "positive": "layer3", + "negative": "NO_MATCH", + "undefined": "layer3", + "ignoreUndefined": "false" + }, + "layer3": { + "fields": [ + { + "field": "legalname", + "comparator": "jaroWinklerNormalizedName", + "weight": 0.9, + "countIfUndefined": "false", + "params": { + "windowSize": 4, + "threshold": 0.7 + } + }, + { + "field": "legalshortname", + "comparator": "jaroWinklerNormalizedName", + "weight": 0.1, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 0.9, + "aggregation": "W_MEAN", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + } }, "model" : [ - { "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"}, - { "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"}, - { "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" }, - { "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" }, - { "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"} + { "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"}, + { "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"}, + { "name" : "legalname", "type" : "String", "path" : "$organization.metadata.legalname.value" }, + { "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" }, + { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"} ], "blacklists" : { "legalname" : [] diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.no_synonyms.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.no_synonyms.conf.json similarity index 82% rename from dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.no_synonyms.conf rename to dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.no_synonyms.conf.json index d79b7758e..80a53c83f 100644 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.no_synonyms.conf +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/organization.no_synonyms.conf.json @@ -23,11 +23,11 @@ "layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"} }, "model" : [ - { "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"}, - { "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"}, - { "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" }, - { "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" }, - { "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"} + { "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"}, + { "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"}, + { "name" : "legalname", "type" : "String", "path" : "$organization.metadata.legalname.value" }, + { "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" }, + { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid ==\"grid\")].value"} ], "blacklists" : { "legalname" : [] diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json new file mode 100644 index 000000000..3dd1830af --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.current.conf.json @@ -0,0 +1,387 @@ +{ + "wf": { + "threshold": "0.99", + "dedupRun": "001", + "entityType": "result", + "subEntityType": "resulttype", + "subEntityValue": "publication", + "orderField": "title", + "queueMaxSize": "2000", + "groupMaxSize": "100", + "maxChildren": "100", + "slidingWindowSize": "200", + "rootBuilder": [ + "result", + "resultProject_outcome_isProducedBy", + "resultResult_publicationDataset_isRelatedTo", + "resultResult_similarity_isAmongTopNSimilarDocuments", + "resultResult_similarity_hasAmongTopNSimilarDocuments", + "resultOrganization_affiliation_isAffiliatedWith", + "resultResult_part_hasPart", + "resultResult_part_isPartOf", + "resultResult_supplement_isSupplementTo", + "resultResult_supplement_isSupplementedBy", + "resultResult_version_isVersionOf" + ], + "includeChildren": "true", + "maxIterations": 20, + "idPath": "$.entity.id" + }, + "pace": { + "clustering": [ + { + "name": "ngrampairs", + "fields": [ + "title" + ], + "params": { + "max": "1", + "ngramLen": "3" + } + }, + { + "name": "suffixprefix", + "fields": [ + "title" + ], + "params": { + "max": "1", + "len": "3" + } + }, + { + "name": "lowercase", + "fields": [ + "doi" + ], + "params": {} + } + ], + "decisionTree": { + "start": { + "fields": [ + { + "field": "pid", + "comparator": "jsonListMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "threshold": "0.5", + "jpath_value": "$.value", + "jpath_classid": "$.qualifier.classid" + } + } + ], + "threshold": 1.0, + "aggregation": "MAX", + "positive": "MATCH", + "negative": "layer2", + "undefined": "layer2", + "ignoreUndefined": "true" + }, + "layer2": { + "fields": [ + { + "field": "title", + "comparator": "titleVersionMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": {} + }, + { + "field": "authors", + "comparator": "sizeMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": {} + } + ], + "threshold": 1.0, + "aggregation": "NC", + "positive": "layer3", + "negative": "NO_MATCH", + "undefined": "layer3", + "ignoreUndefined": "false" + }, + "layer3": { + "fields": [ + { + "field": "title", + "comparator": "LevensteinTitle", + "weight": 1.0, + "countIfUndefined": "true", + "params": {} + } + ], + "threshold": 0.99, + "aggregation": "SUM", + "positive": "MATCH", + "negative": "NO_MATCH", + "undefined": "NO_MATCH", + "ignoreUndefined": "true" + } + }, + "model": [ + { + "name": "doi", + "type": "String", + "path": "$.pid[?(@.qualifier.classid == 'doi')].value" + }, + { + "name": "pid", + "type": "JSON", + "path": "$.pid[*]", + "overrideMatch": "true" + }, + { + "name": "title", + "type": "String", + "path": "$.title[?(@.qualifier.classid == 'main title')].value", + "length": 250, + "size": 5 + }, + { + "name": "authors", + "type": "List", + "path": "$.author[*].fullname", + "size": 200 + }, + { + "name": "resulttype", + "type": "String", + "path": "$.resulttype.classid" + } + ], + "blacklists": { + "title": [ + "^Inside Front Cover$", + "(?i)^Poster presentations$", + "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", + "^Problems with perinatal pathology\\.?$", + "(?i)^Cases? of Puerperal Convulsions$", + "(?i)^Operative Gyna?ecology$", + "(?i)^Mind the gap\\!?\\:?$", + "^Chronic fatigue syndrome\\.?$", + "^Cartas? ao editor Letters? to the Editor$", + "^Note from the Editor$", + "^Anesthesia Abstract$", + "^Annual report$", + "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$", + "(?i)^Graph and Table of Infectious Diseases?$", + "^Presentation$", + "(?i)^Reviews and Information on Publications$", + "(?i)^PUBLIC HEALTH SERVICES?$", + "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$", + "(?i)^Adrese autora$", + "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$", + "(?i)^Acknowledgement to Referees$", + "(?i)^Behçet's disease\\.?$", + "(?i)^Isolation and identification of restriction endonuclease.*$", + "(?i)^CEREBROVASCULAR DISEASES?.?$", + "(?i)^Screening for abdominal aortic aneurysms?\\.?$", + "^Event management$", + "(?i)^Breakfast and Crohn's disease.*\\.?$", + "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$", + "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$", + "^Gushi hakubutsugaku$", + "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$", + "^Intestinal spirocha?etosis$", + "^Treatment of Rodent Ulcer$", + "(?i)^\\W*Cloud Computing\\W*$", + "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", + "^Free Communications, Poster Presentations: Session [A-F]$", + "^“The Historical Aspects? of Quackery\\.?”$", + "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", + "^P(er|re)-Mile Premiums for Auto Insurance\\.?$", + "(?i)^Case Report$", + "^Boletín Informativo$", + "(?i)^Glioblastoma Multiforme$", + "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$", + "^Zaměstnanecké výhody$", + "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$", + "(?i)^Carotid body tumours?\\.?$", + "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$", + "^Avant-propos$", + "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$", + "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$", + "(?i)^PUBLIC HEALTH VERSUS THE STATE$", + "^Viñetas de Cortázar$", + "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$", + "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$", + "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", + "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", + "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", + "^Aus der AGMB$", + "^Znanstveno-stručni prilozi$", + "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$", + "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", + "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$", + "^Finanční analýza podniku$", + "^Financial analysis( of business)?$", + "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$", + "^Jikken nihon shūshinsho$", + "(?i)^CORONER('|s)(s|') INQUESTS$", + "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$", + "(?i)^Consultants' contract(s)?$", + "(?i)^Upute autorima$", + "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$", + "^Joshi shin kokubun$", + "^Kōtō shōgaku dokuhon nōson'yō$", + "^Jinjō shōgaku shōka$", + "^Shōgaku shūjichō$", + "^Nihon joshi dokuhon$", + "^Joshi shin dokuhon$", + "^Chūtō kanbun dokuhon$", + "^Wabun dokuhon$", + "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$", + "(?i)^cardiac rehabilitation$", + "(?i)^Analytical summary$", + "^Thesaurus resolutionum Sacrae Congregationis Concilii$", + "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$", + "^Prikazi i osvrti$", + "^Rodinný dům s provozovnou$", + "^Family house with an establishment$", + "^Shinsei chūtō shin kokugun$", + "^Pulmonary alveolar proteinosis(\\.?)$", + "^Shinshū kanbun$", + "^Viñeta(s?) de Rodríguez$", + "(?i)^RUBRIKA UREDNIKA$", + "^A Matching Model of the Academic Publication Market$", + "^Yōgaku kōyō$", + "^Internetový marketing$", + "^Internet marketing$", + "^Chūtō kokugo dokuhon$", + "^Kokugo dokuhon$", + "^Antibiotic Cover for Dental Extraction(s?)$", + "^Strategie podniku$", + "^Strategy of an Enterprise$", + "(?i)^respiratory disease(s?)(\\.?)$", + "^Award(s?) for Gallantry in Civil Defence$", + "^Podniková kultura$", + "^Corporate Culture$", + "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$", + "^Pracovní motivace$", + "^Work Motivation$", + "^Kaitei kōtō jogaku dokuhon$", + "^Konsolidovaná účetní závěrka$", + "^Consolidated Financial Statements$", + "(?i)^intracranial tumour(s?)$", + "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$", + "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$", + "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$", + "^\\[Funciones auxiliares de la música en Radio París,.*\\]$", + "^Úroveň motivačního procesu jako způsobu vedení lidí$", + "^The level of motivation process as a leadership$", + "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$", + "(?i)^news and events$", + "(?i)^NOVOSTI I DOGAĐAJI$", + "^Sansū no gakushū$", + "^Posouzení informačního systému firmy a návrh změn$", + "^Information System Assessment and Proposal for ICT Modification$", + "^Stresové zatížení pracovníků ve vybrané profesi$", + "^Stress load in a specific job$", + "^Sunday: Poster Sessions, Pt.*$", + "^Monday: Poster Sessions, Pt.*$", + "^Wednesday: Poster Sessions, Pt.*", + "^Tuesday: Poster Sessions, Pt.*$", + "^Analýza reklamy$", + "^Analysis of advertising$", + "^Shōgaku shūshinsho$", + "^Shōgaku sansū$", + "^Shintei joshi kokubun$", + "^Taishō joshi kokubun dokuhon$", + "^Joshi kokubun$", + "^Účetní uzávěrka a účetní závěrka v ČR$", + "(?i)^The \"?Causes\"? of Cancer$", + "^Normas para la publicación de artículos$", + "^Editor('|s)(s|') [Rr]eply$", + "^Editor(’|s)(s|’) letter$", + "^Redaktoriaus žodis$", + "^DISCUSSION ON THE PRECEDING PAPER$", + "^Kōtō shōgaku shūshinsho jidōyō$", + "^Shōgaku nihon rekishi$", + "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$", + "^Préface$", + "^Occupational [Hh]ealth [Ss]ervices.$", + "^In Memoriam Professor Toshiyuki TAKESHIMA$", + "^Účetní závěrka ve vybraném podniku.*$", + "^Financial statements in selected company$", + "^Abdominal [Aa]ortic [Aa]neurysms.*$", + "^Pseudomyxoma peritonei$", + "^Kazalo autora$", + "(?i)^uvodna riječ$", + "^Motivace jako způsob vedení lidí$", + "^Motivation as a leadership$", + "^Polyfunkční dům$", + "^Multi\\-funkcional building$", + "^Podnikatelský plán$", + "(?i)^Podnikatelský záměr$", + "(?i)^Business Plan$", + "^Oceňování nemovitostí$", + "^Marketingová komunikace$", + "^Marketing communication$", + "^Sumario Analítico$", + "^Riječ uredništva$", + "^Savjetovanja i priredbe$", + "^Índice$", + "^(Starobosanski nadpisi).*$", + "^Vzdělávání pracovníků v organizaci$", + "^Staff training in organization$", + "^(Life Histories of North American Geometridae).*$", + "^Strategická analýza podniku$", + "^Strategic Analysis of an Enterprise$", + "^Sadržaj$", + "^Upute suradnicima$", + "^Rodinný dům$", + "(?i)^Fami(l)?ly house$", + "^Upute autorima$", + "^Strategic Analysis$", + "^Finanční analýza vybraného podniku$", + "^Finanční analýza$", + "^Riječ urednika$", + "(?i)^Content(s?)$", + "(?i)^Inhalt$", + "^Jinjō shōgaku shūshinsho jidōyō$", + "(?i)^Index$", + "^Chūgaku kokubun kyōkasho$", + "^Retrato de una mujer$", + "^Retrato de un hombre$", + "^Kōtō shōgaku dokuhon$", + "^Shotōka kokugo$", + "^Shōgaku dokuhon$", + "^Jinjō shōgaku kokugo dokuhon$", + "^Shinsei kokugo dokuhon$", + "^Teikoku dokuhon$", + "^Instructions to Authors$", + "^KİTAP TAHLİLİ$", + "^PRZEGLĄD PIŚMIENNICTWA$", + "(?i)^Presentación$", + "^İçindekiler$", + "(?i)^Tabl?e of contents$", + "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$", + "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*", + "^Editorial( Board)?$", + "(?i)^Editorial \\(English\\)$", + "^Editörden$", + "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", + "^(Kiri Karl Morgensternile).*$", + "^(\\[Eksliibris Aleksandr).*\\]$", + "^(\\[Eksliibris Aleksandr).*$", + "^(Eksliibris Aleksandr).*$", + "^(Kiri A\\. de Vignolles).*$", + "^(2 kirja Karl Morgensternile).*$", + "^(Pirita kloostri idaosa arheoloogilised).*$", + "^(Kiri tundmatule).*$", + "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", + "^(Eksliibris Nikolai Birukovile).*$", + "^(Eksliibris Nikolai Issakovile).*$", + "^(WHP Cruise Summary Information of section).*$", + "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", + "^(Measurement of the spin\\-dependent structure function).*", + "(?i)^.*authors['’′]? reply\\.?$", + "(?i)^.*authors['’′]? response\\.?$" + ] + }, + "synonyms": {} + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.json similarity index 100% rename from dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.json rename to dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/publication.json diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json deleted file mode 100644 index 96094b814..000000000 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf_jpath.json +++ /dev/null @@ -1,48 +0,0 @@ -{ - "wf" : { - "threshold" : "0.99", - "dedupRun" : "001", - "entityType" : "result", - "orderField" : "title", - "queueMaxSize" : "2000", - "groupMaxSize" : "10", - "slidingWindowSize" : "200", - "idPath": "$.entity.id", - "rootBuilder" : [ "result" ], - "includeChildren" : "true" - }, - "pace" : { - "clustering" : [ - { "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} }, - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } - ], - "decisionTree": {}, - "model" : [ - { "name" : "pid", "type" : "JSON", "path" : "$.entity.pid"}, - { "name" : "dateofacceptance", "type" : "String", "path" : "$.entity.result.metadata.dateofacceptance.value"}, - { "name" : "title", "type" : "String","path" : "$.entity.result.metadata.title[?(@.qualifier.classid ==\"main title\")].value" }, - { "name" : "authors", "type" : "List", "path" : "$.entity.result.metadata.author[*].fullname" } - ], - "blacklists" : { - "title" : [ - "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", - "^(Kiri Karl Morgensternile).*$", - "^(\\[Eksliibris Aleksandr).*\\]$", - "^(\\[Eksliibris Aleksandr).*$", - "^(Eksliibris Aleksandr).*$", - "^(Kiri A\\. de Vignolles).*$", - "^(2 kirja Karl Morgensternile).*$", - "^(Pirita kloostri idaosa arheoloogilised).*$", - "^(Kiri tundmatule).*$", - "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", - "^(Eksliibris Nikolai Birukovile).*$", - "^(Eksliibris Nikolai Issakovile).*$", - "^(WHP Cruise Summary Information of section).*$", - "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", - "^(Measurement of the spin\\-dependent structure function).*" - ] } , - "synonyms": {} - } - -} diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.test.conf b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.test.conf deleted file mode 100644 index 115bb04f0..000000000 --- a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.test.conf +++ /dev/null @@ -1,51 +0,0 @@ -{ - "wf" : { - "threshold" : "0.99", - "dedupRun" : "001", - "entityType" : "result", - "orderField" : "title", - "queueMaxSize" : "2000", - "groupMaxSize" : "10", - "slidingWindowSize" : "200", - "rootBuilder" : [ "result" ], - "includeChildren" : "true" - }, - "pace" : { - "clustering" : [ - { "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} }, - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } - ], - "decisionTree": { - "start": {"fields": [{"field":"pid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"}, - "layer2": {"fields": [{"field":"dateofacceptance", "comparator":"yearMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"title", "comparator":"titleVersionMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}, {"field":"authors", "comparator":"sizeMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"}, - "layer3": {"fields": [{"field":"title", "comparator":"JaroWinkler", "weight":1.0, "countIfUndefined":"false", "params":{}}], "threshold": 0.99, "aggregation": "MAX", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "false"} - }, - "model" : [ - { "name" : "pid", "type" : "String", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" }, - { "name" : "title", "type" : "String", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, - { "name" : "dateofacceptance", "type" : "String", "path" : "result/metadata/dateofacceptance/value" } , - { "name" : "authors", "type" : "List", "path" : "result/author/metadata/fullname/value" } - ], - "blacklists" : { - "title" : [ - "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", - "^(Kiri Karl Morgensternile).*$", - "^(\\[Eksliibris Aleksandr).*\\]$", - "^(\\[Eksliibris Aleksandr).*$", - "^(Eksliibris Aleksandr).*$", - "^(Kiri A\\. de Vignolles).*$", - "^(2 kirja Karl Morgensternile).*$", - "^(Pirita kloostri idaosa arheoloogilised).*$", - "^(Kiri tundmatule).*$", - "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", - "^(Eksliibris Nikolai Birukovile).*$", - "^(Eksliibris Nikolai Issakovile).*$", - "^(WHP Cruise Summary Information of section).*$", - "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", - "^(Measurement of the spin\\-dependent structure function).*" - ] }, - "synonyms": {} - } - -}