From 451114418d2d6ac8738ebe5ef7669546e55a7cf2 Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 4 Nov 2021 15:20:57 +0100 Subject: [PATCH] implementation of the instance type comparator and its tests --- .../pace/common/AbstractPaceFunctions.java | 4 + .../dnetlib/pace/tree/InstanceTypeMatch.java | 78 +++++++++++++++++++ .../pace/comparators/ComparatorTest.java | 40 ++++++++-- 3 files changed, 114 insertions(+), 8 deletions(-) create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index b980018..6c37073 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -6,9 +6,11 @@ import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import eu.dnetlib.pace.clustering.NGramUtils; +import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldList; import eu.dnetlib.pace.model.FieldListImpl; +import eu.dnetlib.pace.model.FieldValueImpl; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -16,6 +18,7 @@ import java.io.IOException; import java.io.StringWriter; import java.text.Normalizer; import java.util.*; +import java.util.function.Function; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -313,4 +316,5 @@ public abstract class AbstractPaceFunctions { throw new RuntimeException("cannot load resource from classpath: " + filename); } } + } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java new file mode 100644 index 0000000..77262df --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java @@ -0,0 +1,78 @@ +package eu.dnetlib.pace.tree; + +import com.google.common.collect.Sets; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.tree.support.AbstractComparator; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +public class InstanceTypeMatch extends AbstractComparator { + + final Map translationMap = new HashMap<>(); + + public InstanceTypeMatch(Map params){ + super(params); + + //jolly types + translationMap.put("Conference object", "*"); + translationMap.put("Other literature type", "*"); + translationMap.put("Unknown", "*"); + + //article types + translationMap.put("Article", "Article"); + translationMap.put("Data Paper", "Article"); + translationMap.put("Software Paper", "Article"); + translationMap.put("Preprint", "Article"); + + //thesis types + translationMap.put("Thesis", "Thesis"); + translationMap.put("Master thesis", "Thesis"); + translationMap.put("Bachelor thesis", "Thesis"); + translationMap.put("Doctoral thesis", "Thesis"); + } + + + @Override + public double compare(final Field a, final Field b, final Config conf) { + + final List sa = ((FieldList) a).stringList(); + final List sb = ((FieldList) b).stringList(); + + if (sa.isEmpty() || sb.isEmpty()) { + return -1; + } + + final Set ca = sa.stream().map(this::translate).collect(Collectors.toSet()); + final Set cb = sb.stream().map(this::translate).collect(Collectors.toSet()); + + //if at least one is a jolly type, it must produce a match + if (ca.contains("*") || cb.contains("*")) + return 1.0; + + int incommon = Sets.intersection(ca, cb).size(); + + //if at least one is in common, it must produce a match + return incommon >= 1 ? 1 : 0; + } + + public String translate(String term){ + return translationMap.getOrDefault(term, term); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 6bdd1ad..2c04241 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -1,6 +1,10 @@ package eu.dnetlib.pace.comparators; +import eu.dnetlib.pace.AbstractPaceTest; import eu.dnetlib.pace.clustering.NGramUtils; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldListImpl; +import eu.dnetlib.pace.model.FieldValueImpl; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.tree.*; import eu.dnetlib.pace.config.DedupConfig; @@ -11,11 +15,13 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import eu.dnetlib.pace.common.AbstractPaceFunctions; +import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.Map; @TestInstance(TestInstance.Lifecycle.PER_CLASS) -public class ComparatorTest extends AbstractPaceFunctions { +public class ComparatorTest extends AbstractPaceTest { private Map params; private DedupConfig conf; @@ -138,12 +144,6 @@ public class ComparatorTest extends AbstractPaceFunctions { result = levenstein.distance("Victoria", "Windsor", conf); System.out.println("result = " + result); - //University of Victoria Dataverse - //University of British Columbia Dataverse - //University of Windsor Dataverse - //University of Waterloo Dataverse - //University of Toronto Dataverse - //University of Ottawa Dataverse } @Test @@ -156,8 +156,32 @@ public class ComparatorTest extends AbstractPaceFunctions { } @Test - public void jsonListMatchTest(){ + public void instanceTypeMatchTest() { + final InstanceTypeMatch instanceTypeMatch = new InstanceTypeMatch(params); + + Field a = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType"); + Field b = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType"); + double result = instanceTypeMatch.compare(a, b, conf); + + assertEquals(1.0, result); + + Field c = createFieldList(Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType"); + result = instanceTypeMatch.compare(c, b, conf); + + assertEquals(1.0, result); + + Field d = createFieldList(Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType"); + Field e = createFieldList(Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType"); + result = instanceTypeMatch.compare(d, e, conf); + + assertEquals(1.0, result); + + Field g = createFieldList(Arrays.asList("Software Paper", "Software Paper"), "instanceType"); + result = instanceTypeMatch.compare(e, g, conf); + + assertEquals(0.0, result); } + }