forked from D-Net/dnet-hadoop
Merge pull request 'dedup_new_comparators' (#509) from dedup_new_comparators into beta
Reviewed-on: D-Net/dnet-hadoop#509
This commit is contained in: commit ef51a60f19
CodeMatch.java
@@ -21,7 +21,7 @@ public class CodeMatch extends AbstractStringComparator {
     public CodeMatch(Map<String, String> params) {
         super(params);
         this.params = params;
-        this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]::\\d+"));
+        this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]+::\\d+"));
     }

     public Set<String> getRegexList(String input) {
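
The only functional change above is the default value of "codeRegex": the code prefix before "::" now matches one or more letters instead of a single letter, so the whole prefix is captured as part of the code. A minimal standalone sketch of the difference (the sample title below is made up for illustration and is not taken from the patch):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Illustration only: old vs. new default "codeRegex".
public class CodeRegexDemo {
    public static void main(String[] args) {
        String title = "some dataset title egi::1234"; // hypothetical title containing a code

        Matcher oldMatcher = Pattern.compile("[a-zA-Z]::\\d+").matcher(title);
        Matcher newMatcher = Pattern.compile("[a-zA-Z]+::\\d+").matcher(title);

        if (oldMatcher.find())
            System.out.println(oldMatcher.group()); // "i::1234" — only the last letter of the prefix is kept
        if (newMatcher.find())
            System.out.println(newMatcher.group()); // "egi::1234" — the whole prefix is kept
    }
}
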
DateRange.java (new file)
@@ -0,0 +1,67 @@
+package eu.dnetlib.pace.tree;
+
+import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.config.Config;
+import eu.dnetlib.pace.tree.support.AbstractStringComparator;
+import eu.dnetlib.pace.tree.support.ComparatorClass;
+import org.joda.time.DateTime;
+
+import java.time.DateTimeException;
+import java.time.LocalDate;
+import java.time.Period;
+import java.time.format.DateTimeFormatter;
+import java.util.Locale;
+import java.util.Map;
+
+@ComparatorClass("dateRange")
+public class DateRange extends AbstractStringComparator {
+
+    int YEAR_RANGE;
+
+    public DateRange(Map<String, String> params) {
+        super(params, new com.wcohen.ss.JaroWinkler());
+        YEAR_RANGE = Integer.parseInt(params.getOrDefault("year_range", "3"));
+    }
+
+    public DateRange(final double weight) {
+        super(weight, new com.wcohen.ss.JaroWinkler());
+    }
+
+    protected DateRange(final double weight, final AbstractStringDistance ssalgo) {
+        super(weight, ssalgo);
+    }
+
+    public static boolean isNumeric(String str) {
+        return str.matches("\\d+"); // matches digits only
+    }
+
+    @Override
+    public double distance(final String a, final String b, final Config conf) {
+        if (a.isEmpty() || b.isEmpty()) {
+            return -1.0; // return -1 if a field is missing
+        }
+
+        try {
+            DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.ENGLISH);
+            LocalDate d1 = LocalDate.parse(a, formatter);
+            LocalDate d2 = LocalDate.parse(b, formatter);
+            Period period = Period.between(d1, d2);
+
+            return period.getYears() <= YEAR_RANGE ? 1.0 : 0.0;
+        }
+        catch (DateTimeException e) {
+            return -1.0;
+        }
+
+    }
+
+    @Override
+    public double getWeight() {
+        return super.weight;
+    }
+
+    @Override
+    protected double normalize(final double d) {
+        return d;
+    }
+}
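
A minimal sketch of how the new comparator would be configured and called, based only on the constructor and distance() shown above. The parameter value and dates are illustrative, and the Config argument is passed as null purely for the sketch, since distance() above does not read it (the unit test further down uses its test configuration instead).

import java.util.HashMap;
import java.util.Map;

import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.DateRange;

// Illustration only (not part of the patch).
public class DateRangeDemo {
    public static void main(String[] args) {
        Map<String, String> params = new HashMap<>();
        params.put("year_range", "2"); // hypothetical two-year window instead of the default 3

        DateRange dateRange = new DateRange(params);
        Config conf = null; // ignored by distance(); the real tests pass their test configuration

        System.out.println(dateRange.distance("2020-01-01", "2021-06-30", conf)); // 1.0 — within the window
        System.out.println(dateRange.distance("2020-01-01", "2023-06-30", conf)); // 0.0 — more than two years apart
        System.out.println(dateRange.distance("not a date", "2020-01-01", conf)); // -1.0 — unparsable input
    }
}
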
JsonListMatch.java
@@ -41,21 +41,38 @@ public class JsonListMatch extends AbstractListComparator {
             return -1;
         }

-        final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
-        final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
+        Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
+        Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());

+        switch (MODE) {
+            case "count":
+                return Sets.intersection(ca, cb).size();
+
+            case "percentage":
                 int incommon = Sets.intersection(ca, cb).size();
                 int simDiff = Sets.symmetricDifference(ca, cb).size();

                 if (incommon + simDiff == 0) {
                     return 0.0;
                 }

-        if (MODE.equals("percentage"))
                 return (double) incommon / (incommon + simDiff);
-        else
-            return incommon;
+
+            case "type":
+                Set<String> typesA = ca.stream().map(s -> s.split("::")[0]).collect(Collectors.toSet());
+                Set<String> typesB = cb.stream().map(s -> s.split("::")[0]).collect(Collectors.toSet());
+
+                Set<String> types = Sets.intersection(typesA, typesB);
+
+                if (types.isEmpty()) // if no common type, it is impossible to compare
+                    return -1;
+
+                ca = ca.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet());
+                cb = cb.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet());
+
+                return (double) Sets.intersection(ca, cb).size() / types.size();
+
+            default:
+                return -1;
+        }
     }

     // converts every json into a comparable string basing on parameters
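
To make the three modes concrete, here is a small standalone sketch of the set arithmetic each branch performs, using Guava's Sets on made-up comparable strings of the form "<type>::<value>" (in the comparator, these strings are produced by toComparableString from the configured jpaths):

import java.util.Set;

import com.google.common.collect.Sets;

// Illustration only: the arithmetic behind the "count", "percentage" and "type" modes.
public class JsonListMatchModesDemo {
    public static void main(String[] args) {
        Set<String> ca = Sets.newHashSet("doi::10.1/x", "ror::ror_1");
        Set<String> cb = Sets.newHashSet("doi::10.1/x", "ror::ror_2", "isni::isni_1");

        // "count": number of identical entries.
        System.out.println(Sets.intersection(ca, cb).size()); // 1

        // "percentage": shared entries over all distinct entries.
        int incommon = Sets.intersection(ca, cb).size();        // 1
        int simDiff = Sets.symmetricDifference(ca, cb).size();  // 3 (ror::ror_1, ror::ror_2, isni::isni_1)
        System.out.println((double) incommon / (incommon + simDiff)); // 0.25

        // "type": keep only entries whose type occurs on both sides,
        // then count exact matches relative to the number of shared types.
        Set<String> typesA = Sets.newHashSet("doi", "ror");
        Set<String> typesB = Sets.newHashSet("doi", "ror", "isni");
        Set<String> types = Sets.intersection(typesA, typesB);  // {doi, ror}

        Set<String> caFiltered = Sets.newHashSet("doi::10.1/x", "ror::ror_1"); // entries of ca with a shared type
        Set<String> cbFiltered = Sets.newHashSet("doi::10.1/x", "ror::ror_2"); // entries of cb with a shared type
        System.out.println((double) Sets.intersection(caFiltered, cbFiltered).size() / types.size()); // 0.5
    }
}
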
JsonListMatch.java
@@ -69,7 +86,7 @@ public class JsonListMatch extends AbstractListComparator {
         // for each path in the param list
         for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
             String path = params.get(key);
-            String value = MapDocumentUtil.getJPathString(path, documentContext);
+            String value = MapDocumentUtil.getJPathString(path, documentContext).toLowerCase();
             if (value == null || value.isEmpty())
                 value = "";
             st.append(value);
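
A trivial sketch of what the added toLowerCase() buys: extracted values that differ only in casing now produce the same comparable string, so they are counted as a match (the sample pid value is made up).

// Illustration only.
public class LowercaseDemo {
    public static void main(String[] args) {
        String left = "PMC5399005";   // value as extracted from one record
        String right = "pmc5399005";  // same pid with different casing in another record

        System.out.println(left.equals(right));                             // false — distinct before this change
        System.out.println(left.toLowerCase().equals(right.toLowerCase())); // true  — equal after it
    }
}
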
ComparatorTest.java
@@ -65,6 +65,23 @@ public class ComparatorTest extends AbstractPaceTest {

     }

+    @Test
+    public void datasetVersionCodeMatchTest() {
+
+        params.put("codeRegex", "(?=[\\w-]*[a-zA-Z])(?=[\\w-]*\\d)[\\w-]+");
+        CodeMatch codeMatch = new CodeMatch(params);
+
+        // names have different codes
+        assertEquals(0.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ir02", conf));
+
+        // names have same code
+        assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ev02a", conf));
+
+        // code is not in both names
+        assertEquals(-1, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998 ev02a", conf));
+        assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998", conf));
+    }
+
     @Test
     public void listContainsMatchTest() {

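
For context, the codeRegex supplied in this test matches tokens of word characters and hyphens that contain at least one letter and at least one digit, so version-like codes such as "ev02a" or "ir02" are picked up while plain words and bare years are not. A standalone sketch (illustration only):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Illustration only: what the codeRegex used in datasetVersionCodeMatchTest extracts.
public class VersionCodeRegexDemo {
    public static void main(String[] args) {
        Pattern codes = Pattern.compile("(?=[\\w-]*[a-zA-Z])(?=[\\w-]*\\d)[\\w-]+");

        Matcher m = codes.matcher("physical oceanography at ctd station june 1998 ev02a");
        while (m.find()) {
            System.out.println(m.group()); // prints only "ev02a" — "1998" has no letter, the other tokens have no digit
        }
    }
}
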
ComparatorTest.java
@@ -257,15 +274,15 @@ public class ComparatorTest extends AbstractPaceTest {
         List<String> a = createFieldList(
             Arrays
                 .asList(
-                    "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"),
+                    "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}",
+                    "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_1\"}"),
             "authors");
         List<String> b = createFieldList(
             Arrays
                 .asList(
-                    "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}",
-                    "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}",
-                    "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}",
-                    "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"),
+                    "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}",
+                    "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_2\"}",
+                    "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"isni\",\"classname\":\"ISNI Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"isni_1\"}"),
             "authors");

         double result = jsonListMatch.compare(a, b, conf);
ComparatorTest.java
@@ -277,6 +294,13 @@ public class ComparatorTest extends AbstractPaceTest {
         result = jsonListMatch.compare(a, b, conf);

         assertEquals(1.0, result);

+        params.put("mode", "type");
+        jsonListMatch = new JsonListMatch(params);
+        result = jsonListMatch.compare(a, b, conf);
+
+        assertEquals(0.5, result);
+
     }

     @Test
ComparatorTest.java
@@ -327,6 +351,24 @@ public class ComparatorTest extends AbstractPaceTest {

     }

+    @Test
+    public void dateMatch() {
+
+        DateRange dateRange = new DateRange(params);
+
+        double result = dateRange.distance("2021-05-13", "2023-05-13", conf);
+        assertEquals(1.0, result);
+
+        result = dateRange.distance("2021-05-13", "2025-05-13", conf);
+        assertEquals(0.0, result);
+
+        result = dateRange.distance("", "2020-05-05", conf);
+        assertEquals(-1.0, result);
+
+        result = dateRange.distance("invalid date", "2021-05-02", conf);
+        assertEquals(-1.0, result);
+    }
+
     @Test
     public void titleVersionMatchTest() {
