implementation of new comparators for organization and dataset disambiguation
This commit is contained in:
parent
e5df68772d
commit
62c4c3ed29
|
@ -21,7 +21,7 @@ public class CodeMatch extends AbstractStringComparator {
|
||||||
// Builds a CodeMatch comparator from the supplied parameter map. The optional
// "codeRegex" entry overrides the default pattern that is compiled into CODE_REGEX.
public CodeMatch(Map<String, String> params) {
|
public CodeMatch(Map<String, String> params) {
|
||||||
super(params);
|
super(params);
|
||||||
this.params = params;
|
this.params = params;
|
||||||
// Old default (left side of the diff): exactly one ASCII letter, then "::", then one or more digits.
this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]::\\d+"));
|
// New default (right side of the diff): one or more ASCII letters, then "::", then one or more digits.
this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]+::\\d+"));
|
||||||
}
|
}
|
||||||
|
|
||||||
public Set<String> getRegexList(String input) {
|
public Set<String> getRegexList(String input) {
|
||||||
|
|
|
@ -41,21 +41,38 @@ public class JsonListMatch extends AbstractListComparator {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
|
Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||||
final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
|
Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
|
||||||
|
|
||||||
|
// Dispatch on the configured MODE; every case returns the comparison score directly.
switch (MODE) {
|
||||||
|
// "count": number of comparable strings that occur in both sets.
case "count":
|
||||||
|
return Sets.intersection(ca, cb).size();
|
||||||
|
|
||||||
|
// "percentage": shared strings divided by (shared + symmetric difference),
// i.e. shared / union of the two sets; 0.0 when both sets are empty.
case "percentage":
|
||||||
int incommon = Sets.intersection(ca, cb).size();
|
int incommon = Sets.intersection(ca, cb).size();
|
||||||
int simDiff = Sets.symmetricDifference(ca, cb).size();
|
int simDiff = Sets.symmetricDifference(ca, cb).size();
|
||||||
|
|
||||||
if (incommon + simDiff == 0) {
|
if (incommon + simDiff == 0) {
|
||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Old side only (removed by this commit): a single if/else that returned the
// ratio for "percentage" and the raw shared count for every other mode.
if (MODE.equals("percentage"))
|
|
||||||
return (double) incommon / (incommon + simDiff);
|
return (double) incommon / (incommon + simDiff);
|
||||||
else
|
|
||||||
return incommon;
|
|
||||||
|
|
||||||
|
// "type": restrict the comparison to values whose type prefix (the text before
// the first "::") occurs on both sides.
// NOTE(review): assumes every comparable string has the form "<type>::<value>" —
// TODO confirm against the method that builds the comparable strings.
case "type":
|
||||||
|
Set<String> typesA = ca.stream().map(s -> s.split("::")[0]).collect(Collectors.toSet());
|
||||||
|
Set<String> typesB = cb.stream().map(s -> s.split("::")[0]).collect(Collectors.toSet());
|
||||||
|
|
||||||
|
Set<String> types = Sets.intersection(typesA, typesB);
|
||||||
|
|
||||||
|
if (types.isEmpty()) // if no common type, it is impossible to compare
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
// keep only the entries whose type is shared by both sides
ca = ca.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet());
|
||||||
|
cb = cb.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet());
|
||||||
|
|
||||||
|
// shared values divided by the number of shared types
// NOTE(review): the numerator counts values while the denominator counts types, so
// the result can exceed 1.0 when one shared type carries several shared values —
// confirm this is intended.
return (double) Sets.intersection(ca, cb).size() / types.size();
|
||||||
|
|
||||||
|
// any other MODE value is not handled and yields -1
default:
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// converts every json into a comparable string based on the parameters
|
// converts every json into a comparable string based on the parameters
|
||||||
|
@ -69,7 +86,7 @@ public class JsonListMatch extends AbstractListComparator {
|
||||||
// for each path in the param list
|
// for each path in the param list
|
||||||
for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
|
for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
|
||||||
String path = params.get(key);
|
String path = params.get(key);
|
||||||
String value = MapDocumentUtil.getJPathString(path, documentContext);
|
// NOTE(review): toLowerCase() now runs before the null check below. If
// getJPathString can return null, this line throws a NullPointerException and the
// `value == null` test can never be true — confirm, or lower-case after the check.
// NOTE(review): toLowerCase() without an argument uses the default locale;
// consider toLowerCase(Locale.ROOT) if results must not depend on the JVM locale.
String value = MapDocumentUtil.getJPathString(path, documentContext).toLowerCase();
|
||||||
if (value == null || value.isEmpty())
|
if (value == null || value.isEmpty())
|
||||||
value = "";
|
value = "";
|
||||||
st.append(value);
|
st.append(value);
|
||||||
|
|
|
@ -65,6 +65,23 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Exercises CodeMatch with a custom "codeRegex" on dataset titles that are identical
// except for an optional trailing alphanumeric code.
@Test
|
||||||
|
public void datasetVersionCodeMatchTest() {
|
||||||
|
|
||||||
|
// a code is a run of word characters / hyphens that contains at least one ASCII
// letter and at least one digit (enforced by the two lookaheads)
params.put("codeRegex", "(?=[\\w-]*[a-zA-Z])(?=[\\w-]*\\d)[\\w-]+");
|
||||||
|
CodeMatch codeMatch = new CodeMatch(params);
|
||||||
|
|
||||||
|
// names have different codes
|
||||||
|
assertEquals(0.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ir02", conf));
|
||||||
|
|
||||||
|
// names have same code
|
||||||
|
assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ev02a", conf));
|
||||||
|
|
||||||
|
// only one of the two names contains a code
|
||||||
|
assertEquals(-1, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998 ev02a", conf));
|
||||||
|
// neither name contains a code ("1998" has no letter, so the regex does not match it)
assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998", conf));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void listContainsMatchTest() {
|
public void listContainsMatchTest() {
|
||||||
|
|
||||||
|
@ -257,15 +274,15 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
List<String> a = createFieldList(
|
List<String> a = createFieldList(
|
||||||
Arrays
|
Arrays
|
||||||
.asList(
|
.asList(
|
||||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"),
|
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}",
|
||||||
|
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_1\"}"),
|
||||||
"authors");
|
"authors");
|
||||||
List<String> b = createFieldList(
|
List<String> b = createFieldList(
|
||||||
Arrays
|
Arrays
|
||||||
.asList(
|
.asList(
|
||||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}",
|
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}",
|
||||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}",
|
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_2\"}",
|
||||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}",
|
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"isni\",\"classname\":\"ISNI Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"isni_1\"}"),
|
||||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"),
|
|
||||||
"authors");
|
"authors");
|
||||||
|
|
||||||
double result = jsonListMatch.compare(a, b, conf);
|
double result = jsonListMatch.compare(a, b, conf);
|
||||||
|
@ -277,6 +294,13 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
result = jsonListMatch.compare(a, b, conf);
|
result = jsonListMatch.compare(a, b, conf);
|
||||||
|
|
||||||
assertEquals(1.0, result);
|
assertEquals(1.0, result);
|
||||||
|
|
||||||
|
params.put("mode", "type");
|
||||||
|
jsonListMatch = new JsonListMatch(params);
|
||||||
|
result = jsonListMatch.compare(a, b, conf);
|
||||||
|
|
||||||
|
assertEquals(0.5, result);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
Loading…
Reference in New Issue