forked from D-Net/dnet-hadoop
implementation of the instance type comparator and its tests
This commit is contained in:
parent
fbb1b66bfb
commit
8f1db32921
|
@ -6,9 +6,11 @@ import com.google.common.collect.Iterables;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||||
|
import eu.dnetlib.pace.config.Type;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.FieldList;
|
import eu.dnetlib.pace.model.FieldList;
|
||||||
import eu.dnetlib.pace.model.FieldListImpl;
|
import eu.dnetlib.pace.model.FieldListImpl;
|
||||||
|
import eu.dnetlib.pace.model.FieldValueImpl;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
@ -16,6 +18,7 @@ import java.io.IOException;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.function.Function;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
@ -313,4 +316,5 @@ public abstract class AbstractPaceFunctions {
|
||||||
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,78 @@
|
||||||
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
import eu.dnetlib.pace.model.FieldList;
|
||||||
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
public class InstanceTypeMatch extends AbstractComparator {
|
||||||
|
|
||||||
|
final Map<String, String> translationMap = new HashMap<>();
|
||||||
|
|
||||||
|
public InstanceTypeMatch(Map<String, String> params){
|
||||||
|
super(params);
|
||||||
|
|
||||||
|
//jolly types
|
||||||
|
translationMap.put("Conference object", "*");
|
||||||
|
translationMap.put("Other literature type", "*");
|
||||||
|
translationMap.put("Unknown", "*");
|
||||||
|
|
||||||
|
//article types
|
||||||
|
translationMap.put("Article", "Article");
|
||||||
|
translationMap.put("Data Paper", "Article");
|
||||||
|
translationMap.put("Software Paper", "Article");
|
||||||
|
translationMap.put("Preprint", "Article");
|
||||||
|
|
||||||
|
//thesis types
|
||||||
|
translationMap.put("Thesis", "Thesis");
|
||||||
|
translationMap.put("Master thesis", "Thesis");
|
||||||
|
translationMap.put("Bachelor thesis", "Thesis");
|
||||||
|
translationMap.put("Doctoral thesis", "Thesis");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double compare(final Field a, final Field b, final Config conf) {
|
||||||
|
|
||||||
|
final List<String> sa = ((FieldList) a).stringList();
|
||||||
|
final List<String> sb = ((FieldList) b).stringList();
|
||||||
|
|
||||||
|
if (sa.isEmpty() || sb.isEmpty()) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
final Set<String> ca = sa.stream().map(this::translate).collect(Collectors.toSet());
|
||||||
|
final Set<String> cb = sb.stream().map(this::translate).collect(Collectors.toSet());
|
||||||
|
|
||||||
|
//if at least one is a jolly type, it must produce a match
|
||||||
|
if (ca.contains("*") || cb.contains("*"))
|
||||||
|
return 1.0;
|
||||||
|
|
||||||
|
int incommon = Sets.intersection(ca, cb).size();
|
||||||
|
|
||||||
|
//if at least one is in common, it must produce a match
|
||||||
|
return incommon >= 1 ? 1 : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String translate(String term){
|
||||||
|
return translationMap.getOrDefault(term, term);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double getWeight() {
|
||||||
|
return super.weight;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected double normalize(final double d) {
|
||||||
|
return d;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,6 +1,10 @@
|
||||||
package eu.dnetlib.pace.comparators;
|
package eu.dnetlib.pace.comparators;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.AbstractPaceTest;
|
||||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||||
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
import eu.dnetlib.pace.model.FieldListImpl;
|
||||||
|
import eu.dnetlib.pace.model.FieldValueImpl;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
import eu.dnetlib.pace.model.MapDocument;
|
||||||
import eu.dnetlib.pace.tree.*;
|
import eu.dnetlib.pace.tree.*;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
@ -11,11 +15,13 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||||
public class ComparatorTest extends AbstractPaceFunctions {
|
public class ComparatorTest extends AbstractPaceTest {
|
||||||
|
|
||||||
private Map<String, String> params;
|
private Map<String, String> params;
|
||||||
private DedupConfig conf;
|
private DedupConfig conf;
|
||||||
|
@ -138,12 +144,6 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
||||||
result = levenstein.distance("Victoria", "Windsor", conf);
|
result = levenstein.distance("Victoria", "Windsor", conf);
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
|
|
||||||
//University of Victoria Dataverse
|
|
||||||
//University of British Columbia Dataverse
|
|
||||||
//University of Windsor Dataverse
|
|
||||||
//University of Waterloo Dataverse
|
|
||||||
//University of Toronto Dataverse
|
|
||||||
//University of Ottawa Dataverse
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -156,8 +156,32 @@ public class ComparatorTest extends AbstractPaceFunctions {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void jsonListMatchTest(){
|
public void instanceTypeMatchTest() {
|
||||||
|
|
||||||
|
final InstanceTypeMatch instanceTypeMatch = new InstanceTypeMatch(params);
|
||||||
|
|
||||||
|
Field a = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType");
|
||||||
|
Field b = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType");
|
||||||
|
double result = instanceTypeMatch.compare(a, b, conf);
|
||||||
|
|
||||||
|
assertEquals(1.0, result);
|
||||||
|
|
||||||
|
Field c = createFieldList(Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType");
|
||||||
|
result = instanceTypeMatch.compare(c, b, conf);
|
||||||
|
|
||||||
|
assertEquals(1.0, result);
|
||||||
|
|
||||||
|
Field d = createFieldList(Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType");
|
||||||
|
Field e = createFieldList(Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType");
|
||||||
|
result = instanceTypeMatch.compare(d, e, conf);
|
||||||
|
|
||||||
|
assertEquals(1.0, result);
|
||||||
|
|
||||||
|
Field g = createFieldList(Arrays.asList("Software Paper", "Software Paper"), "instanceType");
|
||||||
|
result = instanceTypeMatch.compare(e, g, conf);
|
||||||
|
|
||||||
|
assertEquals(0.0, result);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue