diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml
index 756321097..88b611e0c 100644
--- a/dnet-pace-core/pom.xml
+++ b/dnet-pace-core/pom.xml
@@ -67,8 +67,6 @@
json-path
-
-
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
index 6c370732b..2f0fc4f45 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@@ -44,6 +44,9 @@ public abstract class AbstractPaceFunctions {
//blacklist of ngrams: to avoid generic keys
protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
+    // Regex for HTML/XML-like tags: cleanup() deletes every "<...>" span it matches before normalizing the text.
+    public static final String HTML_REGEX = "<[^>]*>";
+
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
@@ -62,7 +65,9 @@ public abstract class AbstractPaceFunctions {
}
protected String cleanup(final String s) {
- final String s0 = unicodeNormalization(s.toLowerCase());
+
+ final String s00 = s.replaceAll(HTML_REGEX, "");
+ final String s0 = unicodeNormalization(s00.toLowerCase());
final String s1 = fixAliases(s0);
final String s2 = nfd(s1);
final String s3 = s2.replaceAll("–", " ");
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
new file mode 100644
index 000000000..ff9d49794
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java
@@ -0,0 +1,135 @@
+package eu.dnetlib.pace.tree;
+
+import com.google.common.collect.Iterables;
+import com.wcohen.ss.JaroWinkler;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldList;
+import eu.dnetlib.pace.model.Person;
+import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+/**
+ * Comparator that scores how many authors two author lists have in common.
+ *
+ * Parameters (all optional): "mode" ("full", the default; any other value selects surname-only
+ * matching), "surname_th", "name_th" and "fullname_th" (similarity thresholds, defaulting to
+ * 0.95, 0.95 and 0.9).
+ */
+@ComparatorClass("authorsMatch")
+public class AuthorsMatch extends AbstractComparator {
+
+    Map<String, String> params;
+
+    private final double SURNAME_THRESHOLD;
+    private final double NAME_THRESHOLD;
+    private final double FULLNAME_THRESHOLD;
+    private final String MODE; // "full" or "surname"
+
+    /**
+     * Reads mode and thresholds from the configuration parameters.
+     *
+     * @param params comparator parameters; missing keys fall back to the defaults
+     */
+    public AuthorsMatch(Map<String, String> params) {
+        super(params, new com.wcohen.ss.JaroWinkler());
+        this.params = params;
+
+        MODE = params.getOrDefault("mode", "full");
+        SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
+        NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
+        FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
+    }
+
+    /**
+     * Scores the overlap of two author lists as common / (|a| + |b| - common).
+     *
+     * @param a    first author list (cast to FieldList)
+     * @param b    second author list (cast to FieldList)
+     * @param conf unused here
+     * @return -1 when either side is empty, otherwise the overlap ratio
+     */
+    @Override
+    public double compare(final Field a, final Field b, final Config conf) {
+
+        if (a.isEmpty() || b.isEmpty()) {
+            return -1;
+        }
+
+        final List<Person> aList = toPersons(a);
+        final List<Person> bList = toPersons(b);
+        if (aList.isEmpty() || bList.isEmpty()) {
+            return -1;
+        }
+
+        final boolean fullMode = "full".equals(MODE);
+
+        int common = 0;
+        for (Person p1 : aList) {
+            for (Person p2 : bList) {
+                final boolean same = fullMode ? personComparator(p1, p2) : surnameComparator(p1, p2);
+                if (same) {
+                    // Count each author of a at most once: counting every matching pair could
+                    // push common past the list sizes and make the denominator zero or negative.
+                    common += 1;
+                    break;
+                }
+            }
+        }
+
+        return (double) common / (aList.size() + bList.size() - common);
+    }
+
+    /** Turns every string of the given list field into a Person (second constructor argument false). */
+    private static List<Person> toPersons(final Field field) {
+        return ((FieldList) field).stringList().stream()
+                .map(author -> new Person(author, false))
+                .collect(Collectors.toList());
+    }
+
+    /**
+     * Full-name comparison of two persons.
+     *
+     * If either person is not flagged accurate, the original strings are scored against the
+     * full-name threshold. Otherwise the surnames must score above the surname threshold, and
+     * then: when either name string has at most two characters the firstLC values of the two
+     * names must be equal, else the names must score above the name threshold.
+     */
+    public boolean personComparator(Person p1, Person p2) {
+
+        if (!p1.isAccurate() || !p2.isAccurate()) {
+            return ssalgo.score(p1.getOriginal(), p2.getOriginal()) > FULLNAME_THRESHOLD;
+        }
+
+        final boolean surnamesMatch = ssalgo.score(p1.getSurnameString(), p2.getSurnameString()) > SURNAME_THRESHOLD;
+        if (!surnamesMatch) {
+            return false;
+        }
+
+        if (p1.getNameString().length() <= 2 || p2.getNameString().length() <= 2) {
+            return firstLC(p1.getNameString()).equals(firstLC(p2.getNameString()));
+        }
+        return ssalgo.score(p1.getNameString(), p2.getNameString()) > NAME_THRESHOLD;
+    }
+
+    /**
+     * Surname-only comparison: original strings against the full-name threshold when either
+     * person is not flagged accurate, surnames against the surname threshold otherwise.
+     */
+    public boolean surnameComparator(Person p1, Person p2) {
+
+        if (!p1.isAccurate() || !p2.isAccurate()) {
+            return ssalgo.score(p1.getOriginal(), p2.getOriginal()) > FULLNAME_THRESHOLD;
+        }
+
+        return ssalgo.score(p1.getSurnameString(), p2.getSurnameString()) > SURNAME_THRESHOLD;
+    }
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java
index 77262df8a..661b17433 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java
@@ -5,6 +5,7 @@ import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.HashMap;
import java.util.List;
@@ -12,6 +13,7 @@ import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
+@ComparatorClass("instanceTypeMatch")
public class InstanceTypeMatch extends AbstractComparator {
final Map translationMap = new HashMap<>();
@@ -41,6 +43,10 @@ public class InstanceTypeMatch extends AbstractComparator {
@Override
public double compare(final Field a, final Field b, final Config conf) {
+ if (a == null || b == null) {
+ return -1;
+ }
+
final List sa = ((FieldList) a).stringList();
final List sb = ((FieldList) b).stringList();
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java
index 4fea8d86c..eb831b094 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java
@@ -21,9 +21,13 @@ public class JsonListMatch extends AbstractComparator {
private static final Log log = LogFactory.getLog(JsonListMatch.class);
private Map params;
+ private String MODE; //"percentage" or "count"
+
public JsonListMatch(final Map params) {
super(params);
this.params = params;
+ // "mode" picks what compare() returns: the ratio incommon / (incommon + simDiff) for "percentage" (the default), the raw incommon count for any other value
+ MODE = params.getOrDefault("mode", "percentage");
}
@Override
@@ -46,7 +50,10 @@ public class JsonListMatch extends AbstractComparator {
return 0.0;
}
- return (double)incommon / (incommon + simDiff);
+ if (MODE.equals("percentage"))
+ return (double)incommon / (incommon + simDiff);
+ else
+ return incommon;
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java
new file mode 100644
index 000000000..40e041f6f
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ListContainsMatch.java
@@ -0,0 +1,90 @@
+package eu.dnetlib.pace.tree;
+
+import com.google.common.collect.Sets;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.model.Field;
+import eu.dnetlib.pace.model.FieldList;
+import eu.dnetlib.pace.tree.support.AbstractComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+/**
+ * Checks whether a configured string is an element of two string lists and combines the two
+ * answers with a boolean operator.
+ *
+ * Parameters: "string" (the element to look for), "bool" (AND, OR or XOR; "aggregator" is
+ * accepted as an alias) and "caseSensitive" (defaults to false).
+ *
+ * @author miconis
+ */
+@ComparatorClass("listContainsMatch")
+public class ListContainsMatch extends AbstractComparator {
+
+    private Map<String, String> params;
+    private boolean CASE_SENSITIVE;
+    private String STRING;
+    private String AGGREGATOR;
+
+    /**
+     * @param params comparator parameters, see the class description
+     */
+    public ListContainsMatch(Map<String, String> params) {
+        super(params);
+        this.params = params;
+
+        //read parameters
+        CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
+        final String wanted = params.get("string");
+        // Lower-case the search string once here instead of rewriting the field on every compare() call.
+        STRING = (wanted == null || CASE_SENSITIVE) ? wanted : wanted.toLowerCase();
+        AGGREGATOR = params.getOrDefault("bool", params.get("aggregator"));
+    }
+
+    /**
+     * @param a    first list field (cast to FieldList)
+     * @param b    second list field (cast to FieldList)
+     * @param conf unused here
+     * @return -1 when either list is empty; 1.0 when the configured operator holds for
+     *         "list a contains the string" and "list b contains the string"; 0.0 otherwise,
+     *         including when the string or the operator is missing or the operator is unknown
+     */
+    @Override
+    public double compare(final Field a, final Field b, final Config conf) {
+
+        List<String> sa = ((FieldList) a).stringList();
+        List<String> sb = ((FieldList) b).stringList();
+
+        if (sa.isEmpty() || sb.isEmpty()) {
+            return -1;
+        }
+
+        // A missing parameter could end in a NullPointerException; treat it like an unknown operator.
+        if (STRING == null || AGGREGATOR == null) {
+            return 0.0;
+        }
+
+        if (!CASE_SENSITIVE) {
+            sa = sa.stream().map(String::toLowerCase).collect(Collectors.toList());
+            sb = sb.stream().map(String::toLowerCase).collect(Collectors.toList());
+        }
+
+        final boolean inA = sa.contains(STRING);
+        final boolean inB = sb.contains(STRING);
+
+        switch (AGGREGATOR) {
+            case "AND":
+                return (inA && inB) ? 1.0 : 0.0;
+            case "OR":
+                return (inA || inB) ? 1.0 : 0.0;
+            case "XOR":
+                return (inA ^ inB) ? 1.0 : 0.0;
+            default:
+                return 0.0;
+        }
+    }
+}
+
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ContainsMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java
similarity index 56%
rename from dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ContainsMatch.java
rename to dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java
index 8b8a342cd..126c01010 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/ContainsMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringContainsMatch.java
@@ -11,42 +11,48 @@ import java.util.Map;
*
* @author miconis
* */
-@ComparatorClass("containsMatch")
-public class ContainsMatch extends AbstractComparator {
+@ComparatorClass("stringContainsMatch")
+public class StringContainsMatch extends AbstractComparator {
private Map params;
- public ContainsMatch(Map params) {
+ private boolean CASE_SENSITIVE;
+ private String STRING;
+ private String AGGREGATOR;
+
+    /** Reads the search string, the case flag and the boolean aggregator from the configuration parameters. */
+    public StringContainsMatch(Map<String, String> params) {
         super(params);
         this.params = params;
+
+        // "bool" is the key this comparator read before the rename; keep honouring it when "aggregator" is absent
+        CASE_SENSITIVE = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
+        STRING = params.get("string");
+        AGGREGATOR = params.getOrDefault("aggregator", params.get("bool"));
     }
@Override
public double distance(final String a, final String b, final Config conf) {
- //read parameters
- boolean caseSensitive = Boolean.parseBoolean(params.getOrDefault("caseSensitive", "false"));
- String string = params.get("string");
- String agg = params.get("bool");
-
String ca = a;
String cb = b;
- if (!caseSensitive) {
+ if (!CASE_SENSITIVE) {
ca = a.toLowerCase();
cb = b.toLowerCase();
+ STRING = STRING.toLowerCase();
}
- switch(agg) {
+ switch(AGGREGATOR) {
case "AND":
- if(ca.contains(string) && cb.contains(string))
+ if(ca.contains(STRING) && cb.contains(STRING))
return 1.0;
break;
case "OR":
- if(ca.contains(string) || cb.contains(string))
+ if(ca.contains(STRING) || cb.contains(STRING))
return 1.0;
break;
case "XOR":
- if(ca.contains(string) ^ cb.contains(string))
+ if(ca.contains(STRING) ^ cb.contains(STRING))
return 1.0;
break;
default:
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java
index 9bf05f37b..d5e785af6 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessorForTesting.java
@@ -7,11 +7,7 @@ import eu.dnetlib.pace.config.WfConfig;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.MapDocumentComparator;
-import eu.dnetlib.pace.tree.JsonListMatch;
-import eu.dnetlib.pace.tree.LevensteinTitle;
-import eu.dnetlib.pace.tree.SizeMatch;
-import eu.dnetlib.pace.tree.TitleVersionMatch;
-import eu.dnetlib.pace.tree.support.FieldStats;
+import eu.dnetlib.pace.tree.*;
import eu.dnetlib.pace.tree.support.TreeProcessor;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
@@ -159,13 +155,17 @@ public class BlockProcessorForTesting {
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
-// if (new TreeProcessor(dedupConf).compare(pivot, curr) == true && publicationCompare(pivot, curr, dedupConf) == false)
-// emitOutput(true, idPivot, idCurr, context);
-//
- if(useTree)
+ if(!compareInstanceType(pivot, curr, dedupConf)){
emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
- else
- emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
+ }
+ else {
+ emitOutput(false, idPivot, idCurr, context);
+ }
+
+// if(useTree)
+// emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
+// else
+// emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
}
}
@@ -173,6 +173,13 @@ public class BlockProcessorForTesting {
}
}
+    /** Tells whether the "instance" fields of the two documents score at least 1.0 under InstanceTypeMatch. */
+    protected static boolean compareInstanceType(MapDocument a, MapDocument b, DedupConfig conf) {
+        final Map<String, String> emptyParams = new HashMap<>();
+        final double score = new InstanceTypeMatch(emptyParams).compare(a.getFieldMap().get("instance"), b.getFieldMap().get("instance"), conf);
+        return score >= 1.0;
+    }
+
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
double score = 0.0;
diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
index 2c0424177..70a5b16b5 100644
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
@@ -3,19 +3,12 @@ package eu.dnetlib.pace.comparators;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.model.Field;
-import eu.dnetlib.pace.model.FieldListImpl;
-import eu.dnetlib.pace.model.FieldValueImpl;
-import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.*;
import eu.dnetlib.pace.config.DedupConfig;
-import eu.dnetlib.pace.util.MapDocumentUtil;
import org.junit.jupiter.api.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
-import eu.dnetlib.pace.common.AbstractPaceFunctions;
-
-import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
@@ -30,8 +23,11 @@ public class ComparatorTest extends AbstractPaceTest {
public void setup() {
params = new HashMap<>();
params.put("weight", "1.0");
+ params.put("surname_th", "0.99");
+ params.put("name_th", "0.95");
+ params.put("jpath_value", "$.value");
+ params.put("jpath_classid", "$.qualifier.classid");
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
-
}
@Test
@@ -82,15 +78,46 @@ public class ComparatorTest extends AbstractPaceTest {
}
@Test
- public void containsMatchTest(){
+ public void listContainsMatchTest(){
+ // both lists hold "Article", only in a different position
+ Field a = createFieldList(Arrays.asList("Article", "Publication", "ORP"), "instanceType");
+ Field b = createFieldList(Arrays.asList("Publication", "Article", "ORP"), "instanceType");
+
+ params.put("string", "Article");
+ params.put("bool", "XOR");
+ params.put("caseSensitive", "false");
+
+ ListContainsMatch listContainsMatch = new ListContainsMatch(params);
+ // XOR: the string is found in both lists, so "exactly one side" is false and 0.0 is expected
+ assertEquals(0.0, listContainsMatch.compare(a, b, conf));
+
+ params.put("string", "Article");
+ params.put("bool", "AND");
+ params.put("caseSensitive", "false");
+
+ listContainsMatch = new ListContainsMatch(params);
+ // AND: the string is found in both lists, so 1.0 is expected
+ assertEquals(1.0, listContainsMatch.compare(a, b, conf));
+ }
+
+ @Test
+ public void stringContainsMatchTest(){
params.put("string", "openorgs");
params.put("bool", "XOR");
params.put("caseSensitive", "false");
- final ContainsMatch containsMatch = new ContainsMatch(params);
+ StringContainsMatch stringContainsMatch = new StringContainsMatch(params);
- assertEquals(0.0, containsMatch.distance("openorgs", "openorgs", conf));
+ assertEquals(0.0, stringContainsMatch.distance("openorgs", "openorgs", conf));
+
+ params.put("string", "openorgs");
+ params.put("bool", "AND");
+ params.put("caseSensitive", "false");
+
+ stringContainsMatch = new StringContainsMatch(params);
+
+ assertEquals(1.0, stringContainsMatch.distance("openorgs", "openorgs", conf));
}
@Test
@@ -122,7 +149,6 @@ public class ComparatorTest extends AbstractPaceTest {
result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
System.out.println("result = " + result);
-
}
@Test
@@ -150,7 +176,8 @@ public class ComparatorTest extends AbstractPaceTest {
public void levensteinTitleTest() {
final LevensteinTitle levensteinTitle = new LevensteinTitle(params);
- double result = levensteinTitle.distance("JRC: Open Power Plants Database", "JRC Open Power Plants Database (JRC-PPDB-OPEN)", conf);
+
+ double result = levensteinTitle.distance("Degradation of lignin β‐aryl ether units in Arabidopsis thaliana expressing LigD, LigF and LigG from Sphingomonas paucimobilis SYK‐6", "Degradation of lignin β-aryl ether units in Arabidopsis thaliana expressing LigD, LigF and LigG from Sphingomonas paucimobilis SYK-6", conf);
System.out.println("result = " + result);
}
@@ -181,6 +208,55 @@ public class ComparatorTest extends AbstractPaceTest {
result = instanceTypeMatch.compare(e, g, conf);
assertEquals(0.0, result);
+
+ Field h = createFieldList(Arrays.asList("Other literature type", "Article"), "instanceType");
+ result = instanceTypeMatch.compare(a, h, conf);
+
+ assertEquals(1.0, result);
+ }
+
+ @Test
+ public void authorsMatchTest() {
+ // NOTE(review): the first two assertions need "mode" to be absent from params (AuthorsMatch then defaults to "full") - confirm setup() rebuilds params before every test
+ AuthorsMatch authorsMatch = new AuthorsMatch(params);
+ // same three authors, listed in a different order and with first names reduced to initials
+ Field a = createFieldList(Arrays.asList("La Bruzzo, Sandro", "Atzori, Claudio", "De Bonis, Michele"), "authors");
+ Field b = createFieldList(Arrays.asList("Atzori, C.", "La Bruzzo, S.", "De Bonis, M."), "authors");
+ double result = authorsMatch.compare(a, b, conf);
+
+ assertEquals(1.0, result);
+ // same surname but different first names: expected not to match in "full" mode
+ Field c = createFieldList(Arrays.asList("Manghi, Paolo"), "authors");
+ Field d = createFieldList(Arrays.asList("Manghi, Pasquale"), "authors");
+ result = authorsMatch.compare(c, d, conf);
+
+ assertEquals(0.0, result) ;
+ // with "mode" set to "surname" the same pair is expected to match
+ params.put("mode", "surname");
+ authorsMatch = new AuthorsMatch(params);
+ result = authorsMatch.compare(c, d, conf);
+
+ assertEquals(1.0, result);
+
+ }
+
+ @Test
+ public void jsonListMatch() {
+
+ JsonListMatch jsonListMatch = new JsonListMatch(params);
+
+ Field a = createFieldList(Arrays.asList("{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"), "authors");
+ Field b = createFieldList(Arrays.asList("{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}","{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"), "authors");
+
+ double result = jsonListMatch.compare(a, b, conf);
+
+ assertEquals(0.25, result);
+
+ params.put("mode", "count");
+ jsonListMatch = new JsonListMatch(params);
+ result = jsonListMatch.compare(a, b, conf);
+
+ assertEquals(1.0, result);
}
diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java
index dbf7f08a9..879e5724c 100644
--- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java
+++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java
@@ -101,7 +101,6 @@ public class ConfigTest extends AbstractPaceTest {
System.out.println("mapDocument = " + mapDocument.getFieldMap().get("title").stringValue());
-
}
@Test
diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/pub.instancetype.tree.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/pub.instancetype.tree.conf.json
new file mode 100644
index 000000000..8ebd2be33
--- /dev/null
+++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/pub.instancetype.tree.conf.json
@@ -0,0 +1,442 @@
+{
+ "wf": {
+ "threshold": "0.99",
+ "dedupRun": "001",
+ "entityType": "result",
+ "subEntityType": "resulttype",
+ "subEntityValue": "publication",
+ "orderField": "title",
+ "queueMaxSize": "200",
+ "groupMaxSize": "100",
+ "maxChildren": "100",
+ "slidingWindowSize": "50",
+ "rootBuilder": [
+ "result",
+ "resultProject_outcome_isProducedBy",
+ "resultResult_publicationDataset_isRelatedTo",
+ "resultResult_similarity_isAmongTopNSimilarDocuments",
+ "resultResult_similarity_hasAmongTopNSimilarDocuments",
+ "resultOrganization_affiliation_isAffiliatedWith",
+ "resultResult_part_hasPart",
+ "resultResult_part_isPartOf",
+ "resultResult_supplement_isSupplementTo",
+ "resultResult_supplement_isSupplementedBy",
+ "resultResult_version_isVersionOf"
+ ],
+ "includeChildren": "true",
+ "maxIterations": 20,
+ "idPath": "$.id"
+ },
+ "pace": {
+ "clustering": [
+ {
+ "name": "wordsStatsSuffixPrefixChain",
+ "fields": [
+ "title"
+ ],
+ "params": {
+ "mod": "10"
+ }
+ },
+ {
+ "name": "lowercase",
+ "fields": [
+ "doi",
+ "altdoi"
+ ],
+ "params": {
+ "collapseOn:pid": "0"
+ }
+ }
+ ],
+ "decisionTree": {
+ "start": {
+ "fields": [
+ {
+ "field": "instance",
+ "comparator": "instanceTypeMatch",
+ "weight": 1.0,
+ "countIfUndefined": "false",
+ "params": {}
+ }
+ ],
+ "threshold": 0.5,
+ "aggregation": "MAX",
+ "positive": "layer1",
+ "negative": "NO_MATCH",
+ "undefined": "layer1",
+ "ignoreUndefined": "true"
+ },
+ "layer1": {
+ "fields": [
+ {
+ "field": "pid",
+ "comparator": "jsonListMatch",
+ "weight": 1.0,
+ "countIfUndefined": "false",
+ "params": {
+ "jpath_value": "$.value",
+ "jpath_classid": "$.qualifier.classid"
+ }
+ },
+ {
+ "field": "pid",
+ "comparator": "jsonListMatch",
+ "weight": 1.0,
+ "countIfUndefined": "false",
+ "params": {
+ "jpath_value": "$.value",
+ "jpath_classid": "$.qualifier.classid",
+ "crossCompare": "alternateid"
+ }
+ }
+ ],
+ "threshold": 0.5,
+ "aggregation": "MAX",
+ "positive": "layer2",
+ "negative": "layer3",
+ "undefined": "layer3",
+ "ignoreUndefined": "true"
+ },
+ "layer2": {
+ "fields": [
+ {
+ "field": "title",
+ "comparator": "levensteinTitle",
+ "weight": 1.0,
+ "countIfUndefined": "true",
+ "params": {}
+ }
+ ],
+ "threshold": 0.9,
+ "aggregation": "AVG",
+ "positive": "MATCH",
+ "negative": "NO_MATCH",
+ "undefined": "NO_MATCH",
+ "ignoreUndefined": "true"
+ },
+ "layer3": {
+ "fields": [
+ {
+ "field": "title",
+ "comparator": "titleVersionMatch",
+ "weight": 1.0,
+ "countIfUndefined": "false",
+ "params": {}
+ },
+ {
+ "field": "authors",
+ "comparator": "sizeMatch",
+ "weight": 1.0,
+ "countIfUndefined": "false",
+ "params": {}
+ }
+ ],
+ "threshold": 1.0,
+ "aggregation": "AND",
+ "positive": "layer4",
+ "negative": "NO_MATCH",
+ "undefined": "layer4",
+ "ignoreUndefined": "false"
+ },
+ "layer4": {
+ "fields": [
+ {
+ "field": "title",
+ "comparator": "levensteinTitle",
+ "weight": 1.0,
+ "countIfUndefined": "true",
+ "params": {}
+ }
+ ],
+ "threshold": 0.99,
+ "aggregation": "AVG",
+ "positive": "MATCH",
+ "negative": "NO_MATCH",
+ "undefined": "NO_MATCH",
+ "ignoreUndefined": "true"
+ }
+ },
+ "model": [
+ {
+ "name": "doi",
+ "type": "String",
+ "path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
+ },
+ {
+ "name": "altdoi",
+ "type": "String",
+ "path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
+ },
+ {
+ "name": "pid",
+ "type": "JSON",
+ "path": "$.instance[*].pid[*]",
+ "overrideMatch": "true"
+ },
+ {
+ "name": "alternateid",
+ "type": "JSON",
+ "path": "$.instance[*].alternateIdentifier[*]",
+ "overrideMatch": "true"
+ },
+ {
+ "name": "title",
+ "type": "String",
+ "path": "$.title[?(@.qualifier.classid == 'main title')].value",
+ "length": 250,
+ "size": 5
+ },
+ {
+ "name": "authors",
+ "type": "List",
+ "path": "$.author[*].fullname",
+ "size": 200
+ },
+ {
+ "name": "resulttype",
+ "type": "String",
+ "path": "$.resulttype.classid"
+ },
+ {
+ "name": "instance",
+ "type": "List",
+ "path": "$.instance[*].instancetype.classname"
+ }
+ ],
+ "blacklists": {
+ "title": [
+ "(?i)^Data Management Plan",
+ "^Inside Front Cover$",
+ "(?i)^Poster presentations$",
+ "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
+ "^Problems with perinatal pathology\\.?$",
+ "(?i)^Cases? of Puerperal Convulsions$",
+ "(?i)^Operative Gyna?ecology$",
+ "(?i)^Mind the gap\\!?\\:?$",
+ "^Chronic fatigue syndrome\\.?$",
+ "^Cartas? ao editor Letters? to the Editor$",
+ "^Note from the Editor$",
+ "^Anesthesia Abstract$",
+ "^Annual report$",
+ "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
+ "(?i)^Graph and Table of Infectious Diseases?$",
+ "^Presentation$",
+ "(?i)^Reviews and Information on Publications$",
+ "(?i)^PUBLIC HEALTH SERVICES?$",
+ "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
+ "(?i)^Adrese autora$",
+ "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
+ "(?i)^Acknowledgement to Referees$",
+ "(?i)^Behçet's disease\\.?$",
+ "(?i)^Isolation and identification of restriction endonuclease.*$",
+ "(?i)^CEREBROVASCULAR DISEASES?.?$",
+ "(?i)^Screening for abdominal aortic aneurysms?\\.?$",
+ "^Event management$",
+ "(?i)^Breakfast and Crohn's disease.*\\.?$",
+ "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
+ "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
+ "^Gushi hakubutsugaku$",
+ "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
+ "^Intestinal spirocha?etosis$",
+ "^Treatment of Rodent Ulcer$",
+ "(?i)^\\W*Cloud Computing\\W*$",
+ "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
+ "^Free Communications, Poster Presentations: Session [A-F]$",
+ "^“The Historical Aspects? of Quackery\\.?”$",
+ "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
+ "^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
+ "(?i)^Case Report$",
+ "^Boletín Informativo$",
+ "(?i)^Glioblastoma Multiforme$",
+ "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
+ "^Zaměstnanecké výhody$",
+ "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
+ "(?i)^Carotid body tumours?\\.?$",
+ "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
+ "^Avant-propos$",
+ "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
+ "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
+ "(?i)^PUBLIC HEALTH VERSUS THE STATE$",
+ "^Viñetas de Cortázar$",
+ "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
+ "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
+ "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
+ "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
+ "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
+ "^Aus der AGMB$",
+ "^Znanstveno-stručni prilozi$",
+ "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
+ "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
+ "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
+ "^Finanční analýza podniku$",
+ "^Financial analysis( of business)?$",
+ "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
+ "^Jikken nihon shūshinsho$",
+ "(?i)^CORONER('|s)(s|') INQUESTS$",
+ "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
+ "(?i)^Consultants' contract(s)?$",
+ "(?i)^Upute autorima$",
+ "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
+ "^Joshi shin kokubun$",
+ "^Kōtō shōgaku dokuhon nōson'yō$",
+ "^Jinjō shōgaku shōka$",
+ "^Shōgaku shūjichō$",
+ "^Nihon joshi dokuhon$",
+ "^Joshi shin dokuhon$",
+ "^Chūtō kanbun dokuhon$",
+ "^Wabun dokuhon$",
+ "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
+ "(?i)^cardiac rehabilitation$",
+ "(?i)^Analytical summary$",
+ "^Thesaurus resolutionum Sacrae Congregationis Concilii$",
+ "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
+ "^Prikazi i osvrti$",
+ "^Rodinný dům s provozovnou$",
+ "^Family house with an establishment$",
+ "^Shinsei chūtō shin kokugun$",
+ "^Pulmonary alveolar proteinosis(\\.?)$",
+ "^Shinshū kanbun$",
+ "^Viñeta(s?) de Rodríguez$",
+ "(?i)^RUBRIKA UREDNIKA$",
+ "^A Matching Model of the Academic Publication Market$",
+ "^Yōgaku kōyō$",
+ "^Internetový marketing$",
+ "^Internet marketing$",
+ "^Chūtō kokugo dokuhon$",
+ "^Kokugo dokuhon$",
+ "^Antibiotic Cover for Dental Extraction(s?)$",
+ "^Strategie podniku$",
+ "^Strategy of an Enterprise$",
+ "(?i)^respiratory disease(s?)(\\.?)$",
+ "^Award(s?) for Gallantry in Civil Defence$",
+ "^Podniková kultura$",
+ "^Corporate Culture$",
+ "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
+ "^Pracovní motivace$",
+ "^Work Motivation$",
+ "^Kaitei kōtō jogaku dokuhon$",
+ "^Konsolidovaná účetní závěrka$",
+ "^Consolidated Financial Statements$",
+ "(?i)^intracranial tumour(s?)$",
+ "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
+ "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
+ "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
+ "^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
+ "^Úroveň motivačního procesu jako způsobu vedení lidí$",
+ "^The level of motivation process as a leadership$",
+ "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
+ "(?i)^news and events$",
+ "(?i)^NOVOSTI I DOGAĐAJI$",
+ "^Sansū no gakushū$",
+ "^Posouzení informačního systému firmy a návrh změn$",
+ "^Information System Assessment and Proposal for ICT Modification$",
+ "^Stresové zatížení pracovníků ve vybrané profesi$",
+ "^Stress load in a specific job$",
+ "^Sunday: Poster Sessions, Pt.*$",
+ "^Monday: Poster Sessions, Pt.*$",
+ "^Wednesday: Poster Sessions, Pt.*$",
+ "^Tuesday: Poster Sessions, Pt.*$",
+ "^Analýza reklamy$",
+ "^Analysis of advertising$",
+ "^Shōgaku shūshinsho$",
+ "^Shōgaku sansū$",
+ "^Shintei joshi kokubun$",
+ "^Taishō joshi kokubun dokuhon$",
+ "^Joshi kokubun$",
+ "^Účetní uzávěrka a účetní závěrka v ČR$",
+ "(?i)^The \"?Causes\"? of Cancer$",
+ "^Normas para la publicación de artículos$",
+ "^Editor('|s)(s|') [Rr]eply$",
+ "^Editor(’|s)(s|’) letter$",
+ "^Redaktoriaus žodis$",
+ "^DISCUSSION ON THE PRECEDING PAPER$",
+ "^Kōtō shōgaku shūshinsho jidōyō$",
+ "^Shōgaku nihon rekishi$",
+ "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
+ "^Préface$",
+ "^Occupational [Hh]ealth [Ss]ervices\\.$",
+ "^In Memoriam Professor Toshiyuki TAKESHIMA$",
+ "^Účetní závěrka ve vybraném podniku.*$",
+ "^Financial statements in selected company$",
+ "^Abdominal [Aa]ortic [Aa]neurysms.*$",
+ "^Pseudomyxoma peritonei$",
+ "^Kazalo autora$",
+ "(?i)^uvodna riječ$",
+ "^Motivace jako způsob vedení lidí$",
+ "^Motivation as a leadership$",
+ "^Polyfunkční dům$",
+ "^Multi\\-funkcional building$",
+ "^Podnikatelský plán$",
+ "(?i)^Podnikatelský záměr$",
+ "(?i)^Business Plan$",
+ "^Oceňování nemovitostí$",
+ "^Marketingová komunikace$",
+ "^Marketing communication$",
+ "^Sumario Analítico$",
+ "^Riječ uredništva$",
+ "^Savjetovanja i priredbe$",
+ "^Índice$",
+ "^(Starobosanski nadpisi).*$",
+ "^Vzdělávání pracovníků v organizaci$",
+ "^Staff training in organization$",
+ "^(Life Histories of North American Geometridae).*$",
+ "^Strategická analýza podniku$",
+ "^Strategic Analysis of an Enterprise$",
+ "^Sadržaj$",
+ "^Upute suradnicima$",
+ "^Rodinný dům$",
+ "(?i)^Fami(l)?ly house$",
+ "^Upute autorima$",
+ "^Strategic Analysis$",
+ "^Finanční analýza vybraného podniku$",
+ "^Finanční analýza$",
+ "^Riječ urednika$",
+ "(?i)^Content(s?)$",
+ "(?i)^Inhalt$",
+ "^Jinjō shōgaku shūshinsho jidōyō$",
+ "(?i)^Index$",
+ "^Chūgaku kokubun kyōkasho$",
+ "^Retrato de una mujer$",
+ "^Retrato de un hombre$",
+ "^Kōtō shōgaku dokuhon$",
+ "^Shotōka kokugo$",
+ "^Shōgaku dokuhon$",
+ "^Jinjō shōgaku kokugo dokuhon$",
+ "^Shinsei kokugo dokuhon$",
+ "^Teikoku dokuhon$",
+ "^Instructions to Authors$",
+ "^KİTAP TAHLİLİ$",
+ "^PRZEGLĄD PIŚMIENNICTWA$",
+ "(?i)^Presentación$",
+ "^İçindekiler$",
+ "(?i)^Tabl?e of contents$",
+ "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
+ "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
+ "^Editorial( Board)?$",
+ "(?i)^Editorial \\(English\\)$",
+ "^Editörden$",
+ "^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
+ "^(Kiri Karl Morgensternile).*$",
+ "^(\\[Eksliibris Aleksandr).*\\]$",
+ "^(\\[Eksliibris Aleksandr).*$",
+ "^(Eksliibris Aleksandr).*$",
+ "^(Kiri A\\. de Vignolles).*$",
+ "^(2 kirja Karl Morgensternile).*$",
+ "^(Pirita kloostri idaosa arheoloogilised).*$",
+ "^(Kiri tundmatule).*$",
+ "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
+ "^(Eksliibris Nikolai Birukovile).*$",
+ "^(Eksliibris Nikolai Issakovile).*$",
+ "^(WHP Cruise Summary Information of section).*$",
+ "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
+ "^(Measurement of the spin\\-dependent structure function).*",
+ "(?i)^.*authors['’′]? reply\\.?$",
+ "(?i)^.*authors['’′]? response\\.?$",
+ "^Data [mM]anagement [sS]ervices\\.$",
+ "Research and Advanced Technology for Digital Libraries"
+ ]
+ },
+ "synonyms": {}
+ }
+}
\ No newline at end of file
diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/pub.new.tree.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/pub.new.tree.conf.json
new file mode 100644
index 000000000..153e38ada
--- /dev/null
+++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/pub.new.tree.conf.json
@@ -0,0 +1,465 @@
+{
+ "wf": {
+ "threshold": "0.99",
+ "dedupRun": "001",
+ "entityType": "result",
+ "subEntityType": "resulttype",
+ "subEntityValue": "publication",
+ "orderField": "title",
+ "queueMaxSize": "200",
+ "groupMaxSize": "100",
+ "maxChildren": "100",
+ "slidingWindowSize": "50",
+ "rootBuilder": [
+ "result",
+ "resultProject_outcome_isProducedBy",
+ "resultResult_publicationDataset_isRelatedTo",
+ "resultResult_similarity_isAmongTopNSimilarDocuments",
+ "resultResult_similarity_hasAmongTopNSimilarDocuments",
+ "resultOrganization_affiliation_isAffiliatedWith",
+ "resultResult_part_hasPart",
+ "resultResult_part_isPartOf",
+ "resultResult_supplement_isSupplementTo",
+ "resultResult_supplement_isSupplementedBy",
+ "resultResult_version_isVersionOf"
+ ],
+ "includeChildren": "true",
+ "maxIterations": 20,
+ "idPath": "$.id"
+ },
+ "pace": {
+ "clustering": [
+ {
+ "name": "wordsStatsSuffixPrefixChain",
+ "fields": [
+ "title"
+ ],
+ "params": {
+ "mod": "10"
+ }
+ },
+ {
+ "name": "lowercase",
+ "fields": [
+ "doi",
+ "altdoi"
+ ],
+ "params": {
+ "collapseOn:pid": "0"
+ }
+ }
+ ],
+ "decisionTree": {
+ "start": {
+ "fields": [
+ {
+ "field": "pid",
+ "comparator": "jsonListMatch",
+ "weight": 1.0,
+ "countIfUndefined": "false",
+ "params": {
+ "jpath_value": "$.value",
+ "jpath_classid": "$.qualifier.classid",
+ "mode": "count"
+ }
+ }
+ ],
+ "threshold": 1.0,
+ "aggregation": "MAX",
+ "positive": "MATCH",
+ "negative": "NO_MATCH",
+ "undefined": "instanceTypeCheck",
+ "ignoreUndefined": "false"
+ },
+ "instanceTypeCheck": {
+ "fields": [
+ {
+ "field": "instance",
+ "comparator": "instanceTypeMatch",
+ "weight": 1.0,
+ "countIfUndefined": "false",
+ "params": {}
+ }
+ ],
+ "threshold": 0.5,
+ "aggregation": "MAX",
+ "positive": "pidVSaltid",
+ "negative": "NO_MATCH",
+ "undefined": "pidVSaltid",
+ "ignoreUndefined": "true"
+ },
+ "pidVSaltid": {
+ "fields": [
+ {
+ "field": "pid",
+ "comparator": "jsonListMatch",
+ "weight": 1.0,
+ "countIfUndefined": "false",
+ "params": {
+ "jpath_value": "$.value",
+ "jpath_classid": "$.qualifier.classid",
+ "crossCompare": "alternateid",
+ "mode": "count"
+ }
+ }
+ ],
+ "threshold": 1.0,
+ "aggregation": "MAX",
+ "positive": "softCheck",
+ "negative": "earlyExits",
+ "undefined": "earlyExits",
+ "ignoreUndefined": "true"
+ },
+ "softCheck": {
+ "fields": [
+ {
+ "field": "title",
+ "comparator": "levensteinTitle",
+ "weight": 1.0,
+ "countIfUndefined": "true",
+ "params": {}
+ }
+ ],
+ "threshold": 0.9,
+ "aggregation": "AVG",
+ "positive": "MATCH",
+ "negative": "NO_MATCH",
+ "undefined": "NO_MATCH",
+ "ignoreUndefined": "true"
+ },
+ "earlyExits": {
+ "fields": [
+ {
+ "field": "title",
+ "comparator": "titleVersionMatch",
+ "weight": 1.0,
+ "countIfUndefined": "false",
+ "params": {}
+ },
+ {
+ "field": "authors",
+ "comparator": "sizeMatch",
+ "weight": 1.0,
+ "countIfUndefined": "false",
+ "params": {}
+ },
+ {
+ "field": "authors",
+ "comparator": "authorsMatch",
+ "weight": 1.0,
+ "countIfUndefined": "false",
+ "params": {
+ "surname_th": 0.99,
+ "fullname_th": 0.95,
+ "mode": "surname"
+ }
+ }
+ ],
+ "threshold": 1.0,
+ "aggregation": "AND",
+ "positive": "strongCheck",
+ "negative": "NO_MATCH",
+ "undefined": "strongCheck",
+ "ignoreUndefined": "false"
+ },
+ "strongCheck": {
+ "fields": [
+ {
+ "field": "title",
+ "comparator": "levensteinTitle",
+ "weight": 1.0,
+ "countIfUndefined": "true",
+ "params": {}
+ }
+ ],
+ "threshold": 0.99,
+ "aggregation": "AVG",
+ "positive": "MATCH",
+ "negative": "NO_MATCH",
+ "undefined": "NO_MATCH",
+ "ignoreUndefined": "true"
+ }
+ },
+ "model": [
+ {
+ "name": "doi",
+ "type": "String",
+ "path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
+ },
+ {
+ "name": "altdoi",
+ "type": "String",
+ "path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
+ },
+ {
+ "name": "pid",
+ "type": "JSON",
+ "path": "$.instance[*].pid[*]",
+ "overrideMatch": "true"
+ },
+ {
+ "name": "alternateid",
+ "type": "JSON",
+ "path": "$.instance[*].alternateIdentifier[*]",
+ "overrideMatch": "true"
+ },
+ {
+ "name": "title",
+ "type": "String",
+ "path": "$.title[?(@.qualifier.classid == 'main title')].value",
+ "length": 250,
+ "size": 5
+ },
+ {
+ "name": "authors",
+ "type": "List",
+ "path": "$.author[*].fullname",
+ "size": 200
+ },
+ {
+ "name": "resulttype",
+ "type": "String",
+ "path": "$.resulttype.classid"
+ },
+ {
+ "name": "instance",
+ "type": "List",
+ "path": "$.instance[*].instancetype.classname"
+ }
+ ],
+ "blacklists": {
+ "title": [
+ "(?i)^Data Management Plan",
+ "^Inside Front Cover$",
+ "(?i)^Poster presentations$",
+ "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
+ "^Problems with perinatal pathology\\.?$",
+ "(?i)^Cases? of Puerperal Convulsions$",
+ "(?i)^Operative Gyna?ecology$",
+ "(?i)^Mind the gap\\!?\\:?$",
+ "^Chronic fatigue syndrome\\.?$",
+ "^Cartas? ao editor Letters? to the Editor$",
+ "^Note from the Editor$",
+ "^Anesthesia Abstract$",
+ "^Annual report$",
+ "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
+ "(?i)^Graph and Table of Infectious Diseases?$",
+ "^Presentation$",
+ "(?i)^Reviews and Information on Publications$",
+ "(?i)^PUBLIC HEALTH SERVICES?$",
+ "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
+ "(?i)^Adrese autora$",
+ "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
+ "(?i)^Acknowledgement to Referees$",
+ "(?i)^Behçet's disease\\.?$",
+ "(?i)^Isolation and identification of restriction endonuclease.*$",
+ "(?i)^CEREBROVASCULAR DISEASES?.?$",
+ "(?i)^Screening for abdominal aortic aneurysms?\\.?$",
+ "^Event management$",
+ "(?i)^Breakfast and Crohn's disease.*\\.?$",
+ "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
+ "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
+ "^Gushi hakubutsugaku$",
+ "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
+ "^Intestinal spirocha?etosis$",
+ "^Treatment of Rodent Ulcer$",
+ "(?i)^\\W*Cloud Computing\\W*$",
+ "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
+ "^Free Communications, Poster Presentations: Session [A-F]$",
+ "^“The Historical Aspects? of Quackery\\.?”$",
+ "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
+ "^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
+ "(?i)^Case Report$",
+ "^Boletín Informativo$",
+ "(?i)^Glioblastoma Multiforme$",
+ "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
+ "^Zaměstnanecké výhody$",
+ "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
+ "(?i)^Carotid body tumours?\\.?$",
+ "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
+ "^Avant-propos$",
+ "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
+ "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
+ "(?i)^PUBLIC HEALTH VERSUS THE STATE$",
+ "^Viñetas de Cortázar$",
+ "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
+ "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
+ "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
+ "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
+ "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
+ "^Aus der AGMB$",
+ "^Znanstveno-stručni prilozi$",
+ "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
+ "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
+ "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
+ "^Finanční analýza podniku$",
+ "^Financial analysis( of business)?$",
+ "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
+ "^Jikken nihon shūshinsho$",
+ "(?i)^CORONER('|s)(s|') INQUESTS$",
+ "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
+ "(?i)^Consultants' contract(s)?$",
+ "(?i)^Upute autorima$",
+ "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
+ "^Joshi shin kokubun$",
+ "^Kōtō shōgaku dokuhon nōson'yō$",
+ "^Jinjō shōgaku shōka$",
+ "^Shōgaku shūjichō$",
+ "^Nihon joshi dokuhon$",
+ "^Joshi shin dokuhon$",
+ "^Chūtō kanbun dokuhon$",
+ "^Wabun dokuhon$",
+ "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
+ "(?i)^cardiac rehabilitation$",
+ "(?i)^Analytical summary$",
+ "^Thesaurus resolutionum Sacrae Congregationis Concilii$",
+ "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
+ "^Prikazi i osvrti$",
+ "^Rodinný dům s provozovnou$",
+ "^Family house with an establishment$",
+ "^Shinsei chūtō shin kokugun$",
+ "^Pulmonary alveolar proteinosis(\\.?)$",
+ "^Shinshū kanbun$",
+ "^Viñeta(s?) de Rodríguez$",
+ "(?i)^RUBRIKA UREDNIKA$",
+ "^A Matching Model of the Academic Publication Market$",
+ "^Yōgaku kōyō$",
+ "^Internetový marketing$",
+ "^Internet marketing$",
+ "^Chūtō kokugo dokuhon$",
+ "^Kokugo dokuhon$",
+ "^Antibiotic Cover for Dental Extraction(s?)$",
+ "^Strategie podniku$",
+ "^Strategy of an Enterprise$",
+ "(?i)^respiratory disease(s?)(\\.?)$",
+ "^Award(s?) for Gallantry in Civil Defence$",
+ "^Podniková kultura$",
+ "^Corporate Culture$",
+ "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
+ "^Pracovní motivace$",
+ "^Work Motivation$",
+ "^Kaitei kōtō jogaku dokuhon$",
+ "^Konsolidovaná účetní závěrka$",
+ "^Consolidated Financial Statements$",
+ "(?i)^intracranial tumour(s?)$",
+ "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
+ "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
+ "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
+ "^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
+ "^Úroveň motivačního procesu jako způsobu vedení lidí$",
+ "^The level of motivation process as a leadership$",
+ "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
+ "(?i)^news and events$",
+ "(?i)^NOVOSTI I DOGAĐAJI$",
+ "^Sansū no gakushū$",
+ "^Posouzení informačního systému firmy a návrh změn$",
+ "^Information System Assessment and Proposal for ICT Modification$",
+ "^Stresové zatížení pracovníků ve vybrané profesi$",
+ "^Stress load in a specific job$",
+ "^Sunday: Poster Sessions, Pt.*$",
+ "^Monday: Poster Sessions, Pt.*$",
+ "^Wednesday: Poster Sessions, Pt.*$",
+ "^Tuesday: Poster Sessions, Pt.*$",
+ "^Analýza reklamy$",
+ "^Analysis of advertising$",
+ "^Shōgaku shūshinsho$",
+ "^Shōgaku sansū$",
+ "^Shintei joshi kokubun$",
+ "^Taishō joshi kokubun dokuhon$",
+ "^Joshi kokubun$",
+ "^Účetní uzávěrka a účetní závěrka v ČR$",
+ "(?i)^The \"?Causes\"? of Cancer$",
+ "^Normas para la publicación de artículos$",
+ "^Editor('|s)(s|') [Rr]eply$",
+ "^Editor(’|s)(s|’) letter$",
+ "^Redaktoriaus žodis$",
+ "^DISCUSSION ON THE PRECEDING PAPER$",
+ "^Kōtō shōgaku shūshinsho jidōyō$",
+ "^Shōgaku nihon rekishi$",
+ "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
+ "^Préface$",
+ "^Occupational [Hh]ealth [Ss]ervices\\.$",
+ "^In Memoriam Professor Toshiyuki TAKESHIMA$",
+ "^Účetní závěrka ve vybraném podniku.*$",
+ "^Financial statements in selected company$",
+ "^Abdominal [Aa]ortic [Aa]neurysms.*$",
+ "^Pseudomyxoma peritonei$",
+ "^Kazalo autora$",
+ "(?i)^uvodna riječ$",
+ "^Motivace jako způsob vedení lidí$",
+ "^Motivation as a leadership$",
+ "^Polyfunkční dům$",
+ "^Multi\\-funkcional building$",
+ "^Podnikatelský plán$",
+ "(?i)^Podnikatelský záměr$",
+ "(?i)^Business Plan$",
+ "^Oceňování nemovitostí$",
+ "^Marketingová komunikace$",
+ "^Marketing communication$",
+ "^Sumario Analítico$",
+ "^Riječ uredništva$",
+ "^Savjetovanja i priredbe$",
+ "^Índice$",
+ "^(Starobosanski nadpisi).*$",
+ "^Vzdělávání pracovníků v organizaci$",
+ "^Staff training in organization$",
+ "^(Life Histories of North American Geometridae).*$",
+ "^Strategická analýza podniku$",
+ "^Strategic Analysis of an Enterprise$",
+ "^Sadržaj$",
+ "^Upute suradnicima$",
+ "^Rodinný dům$",
+ "(?i)^Fami(l)?ly house$",
+ "^Upute autorima$",
+ "^Strategic Analysis$",
+ "^Finanční analýza vybraného podniku$",
+ "^Finanční analýza$",
+ "^Riječ urednika$",
+ "(?i)^Content(s?)$",
+ "(?i)^Inhalt$",
+ "^Jinjō shōgaku shūshinsho jidōyō$",
+ "(?i)^Index$",
+ "^Chūgaku kokubun kyōkasho$",
+ "^Retrato de una mujer$",
+ "^Retrato de un hombre$",
+ "^Kōtō shōgaku dokuhon$",
+ "^Shotōka kokugo$",
+ "^Shōgaku dokuhon$",
+ "^Jinjō shōgaku kokugo dokuhon$",
+ "^Shinsei kokugo dokuhon$",
+ "^Teikoku dokuhon$",
+ "^Instructions to Authors$",
+ "^KİTAP TAHLİLİ$",
+ "^PRZEGLĄD PIŚMIENNICTWA$",
+ "(?i)^Presentación$",
+ "^İçindekiler$",
+ "(?i)^Tabl?e of contents$",
+ "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
+ "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
+ "^Editorial( Board)?$",
+ "(?i)^Editorial \\(English\\)$",
+ "^Editörden$",
+ "^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
+ "^(Kiri Karl Morgensternile).*$",
+ "^(\\[Eksliibris Aleksandr).*\\]$",
+ "^(\\[Eksliibris Aleksandr).*$",
+ "^(Eksliibris Aleksandr).*$",
+ "^(Kiri A\\. de Vignolles).*$",
+ "^(2 kirja Karl Morgensternile).*$",
+ "^(Pirita kloostri idaosa arheoloogilised).*$",
+ "^(Kiri tundmatule).*$",
+ "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
+ "^(Eksliibris Nikolai Birukovile).*$",
+ "^(Eksliibris Nikolai Issakovile).*$",
+ "^(WHP Cruise Summary Information of section).*$",
+ "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
+ "^(Measurement of the spin\\-dependent structure function).*",
+ "(?i)^.*authors['’′]? reply\\.?$",
+ "(?i)^.*authors['’′]? response\\.?$",
+ "^Data [mM]anagement [sS]ervices\\.$",
+ "Research and Advanced Technology for Digital Libraries"
+ ]
+ },
+ "synonyms": {}
+ }
+}
\ No newline at end of file