1
0
Fork 0

Merge pull request 'dedup_new_comparators' (#509) from dedup_new_comparators into beta

Reviewed-on: D-Net/dnet-hadoop#509
This commit is contained in:
Claudio Atzori 2024-11-19 15:13:40 +01:00
commit ef51a60f19
4 changed files with 145 additions and 19 deletions

View File

@ -21,7 +21,7 @@ public class CodeMatch extends AbstractStringComparator {
public CodeMatch(Map<String, String> params) { public CodeMatch(Map<String, String> params) {
super(params); super(params);
this.params = params; this.params = params;
this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]::\\d+")); this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]+::\\d+"));
} }
public Set<String> getRegexList(String input) { public Set<String> getRegexList(String input) {

View File

@ -0,0 +1,67 @@
package eu.dnetlib.pace.tree;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.joda.time.DateTime;
import java.time.DateTimeException;
import java.time.LocalDate;
import java.time.Period;
import java.time.format.DateTimeFormatter;
import java.util.Locale;
import java.util.Map;
@ComparatorClass("dateRange")
public class DateRange extends AbstractStringComparator {
int YEAR_RANGE;
public DateRange(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler());
YEAR_RANGE = Integer.parseInt(params.getOrDefault("year_range", "3"));
}
public DateRange(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected DateRange(final double weight, final AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
public static boolean isNumeric(String str) {
return str.matches("\\d+"); //match a number with optional '-' and decimal.
}
@Override
public double distance(final String a, final String b, final Config conf) {
if (a.isEmpty() || b.isEmpty()) {
return -1.0; // return -1 if a field is missing
}
try {
DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.ENGLISH);
LocalDate d1 = LocalDate.parse(a, formatter);
LocalDate d2 = LocalDate.parse(b, formatter);
Period period = Period.between(d1, d2);
return period.getYears() <= YEAR_RANGE? 1.0 : 0.0;
}
catch (DateTimeException e) {
return -1.0;
}
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(final double d) {
return d;
}
}

View File

@ -41,21 +41,38 @@ public class JsonListMatch extends AbstractListComparator {
return -1; return -1;
} }
final Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet()); Set<String> ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet());
final Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet()); Set<String> cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet());
int incommon = Sets.intersection(ca, cb).size(); switch (MODE) {
int simDiff = Sets.symmetricDifference(ca, cb).size(); case "count":
return Sets.intersection(ca, cb).size();
if (incommon + simDiff == 0) { case "percentage":
return 0.0; int incommon = Sets.intersection(ca, cb).size();
int simDiff = Sets.symmetricDifference(ca, cb).size();
if (incommon + simDiff == 0) {
return 0.0;
}
return (double) incommon / (incommon + simDiff);
case "type":
Set<String> typesA = ca.stream().map(s -> s.split("::")[0]).collect(Collectors.toSet());
Set<String> typesB = cb.stream().map(s -> s.split("::")[0]).collect(Collectors.toSet());
Set<String> types = Sets.intersection(typesA, typesB);
if (types.isEmpty()) // if no common type, it is impossible to compare
return -1;
ca = ca.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet());
cb = cb.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet());
return (double) Sets.intersection(ca, cb).size() / types.size();
default:
return -1;
} }
if (MODE.equals("percentage"))
return (double) incommon / (incommon + simDiff);
else
return incommon;
} }
// converts every json into a comparable string basing on parameters // converts every json into a comparable string basing on parameters
@ -69,7 +86,7 @@ public class JsonListMatch extends AbstractListComparator {
// for each path in the param list // for each path in the param list
for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) { for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
String path = params.get(key); String path = params.get(key);
String value = MapDocumentUtil.getJPathString(path, documentContext); String value = MapDocumentUtil.getJPathString(path, documentContext).toLowerCase();
if (value == null || value.isEmpty()) if (value == null || value.isEmpty())
value = ""; value = "";
st.append(value); st.append(value);

View File

@ -65,6 +65,23 @@ public class ComparatorTest extends AbstractPaceTest {
} }
@Test
public void datasetVersionCodeMatchTest() {
params.put("codeRegex", "(?=[\\w-]*[a-zA-Z])(?=[\\w-]*\\d)[\\w-]+");
CodeMatch codeMatch = new CodeMatch(params);
// names have different codes
assertEquals(0.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ir02", conf));
// names have same code
assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ev02a", conf));
// code is not in both names
assertEquals(-1, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998 ev02a", conf));
assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998", conf));
}
@Test @Test
public void listContainsMatchTest() { public void listContainsMatchTest() {
@ -257,15 +274,15 @@ public class ComparatorTest extends AbstractPaceTest {
List<String> a = createFieldList( List<String> a = createFieldList(
Arrays Arrays
.asList( .asList(
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"), "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}",
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_1\"}"),
"authors"); "authors");
List<String> b = createFieldList( List<String> b = createFieldList(
Arrays Arrays
.asList( .asList(
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}", "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}",
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}", "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_2\"}",
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}", "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"isni\",\"classname\":\"ISNI Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"isni_1\"}"),
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"),
"authors"); "authors");
double result = jsonListMatch.compare(a, b, conf); double result = jsonListMatch.compare(a, b, conf);
@ -277,6 +294,13 @@ public class ComparatorTest extends AbstractPaceTest {
result = jsonListMatch.compare(a, b, conf); result = jsonListMatch.compare(a, b, conf);
assertEquals(1.0, result); assertEquals(1.0, result);
params.put("mode", "type");
jsonListMatch = new JsonListMatch(params);
result = jsonListMatch.compare(a, b, conf);
assertEquals(0.5, result);
} }
@Test @Test
@ -327,6 +351,24 @@ public class ComparatorTest extends AbstractPaceTest {
} }
@Test
public void dateMatch() {
DateRange dateRange = new DateRange(params);
double result = dateRange.distance("2021-05-13", "2023-05-13", conf);
assertEquals(1.0, result);
result = dateRange.distance("2021-05-13", "2025-05-13", conf);
assertEquals(0.0, result);
result = dateRange.distance("", "2020-05-05", conf);
assertEquals(-1.0, result);
result = dateRange.distance("invalid date", "2021-05-02", conf);
assertEquals(-1.0, result);
}
@Test @Test
public void titleVersionMatchTest() { public void titleVersionMatchTest() {