Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
3 changed files with 114 additions and 8 deletions
Showing only changes of commit 8f1db32921 - Show all commits

View File

@ -6,9 +6,11 @@ import com.google.common.collect.Iterables;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList; import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldListImpl; import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -16,6 +18,7 @@ import java.io.IOException;
import java.io.StringWriter; import java.io.StringWriter;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.function.Function;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -313,4 +316,5 @@ public abstract class AbstractPaceFunctions {
throw new RuntimeException("cannot load resource from classpath: " + filename); throw new RuntimeException("cannot load resource from classpath: " + filename);
} }
} }
} }

View File

@ -0,0 +1,78 @@
package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Comparator that checks whether two lists of instance type labels are compatible.
 *
 * <p>Each label is first translated into a category through {@link #translationMap};
 * labels with no entry in the map are kept as they are. Two lists match when either
 * of them contains a wildcard category or when they share at least one category.
 */
public class InstanceTypeMatch extends AbstractComparator {

    /** Category assigned to labels that are allowed to match any other label. */
    private static final String WILDCARD = "*";

    /** Lookup table from a raw instance type label to its category. */
    final Map<String, String> translationMap = new HashMap<>();

    /**
     * Builds the comparator and fills the label-to-category lookup table.
     *
     * @param params comparator parameters, handed over to the parent constructor
     */
    public InstanceTypeMatch(Map<String, String> params){
        super(params);

        // wildcard ("jolly") types
        register(WILDCARD, "Conference object", "Other literature type", "Unknown");

        // article types
        register("Article", "Article", "Data Paper", "Software Paper", "Preprint");

        // thesis types
        register("Thesis", "Thesis", "Master thesis", "Bachelor thesis", "Doctoral thesis");
    }

    /** Associates every given label with the same category in the lookup table. */
    private void register(final String category, final String... labels) {
        for (final String label : labels) {
            translationMap.put(label, category);
        }
    }

    /**
     * Compares the instance types carried by two list fields.
     *
     * @param a first field; it is cast to {@link FieldList}
     * @param b second field; it is cast to {@link FieldList}
     * @param conf configuration, not used by this comparator
     * @return {@code -1} when either list is empty, {@code 1} when a wildcard category
     *         is present on either side or at least one category is shared,
     *         {@code 0} otherwise
     */
    @Override
    public double compare(final Field a, final Field b, final Config conf) {

        final List<String> leftTypes = ((FieldList) a).stringList();
        final List<String> rightTypes = ((FieldList) b).stringList();

        // nothing to compare when one side carries no value at all
        final boolean bothPopulated = !leftTypes.isEmpty() && !rightTypes.isEmpty();
        if (!bothPopulated) {
            return -1;
        }

        final Set<String> leftCategories = leftTypes
            .stream()
            .map(this::translate)
            .collect(Collectors.toSet());
        final Set<String> rightCategories = rightTypes
            .stream()
            .map(this::translate)
            .collect(Collectors.toSet());

        // a wildcard category on either side always produces a match
        if (leftCategories.contains(WILDCARD)) {
            return 1.0;
        }
        if (rightCategories.contains(WILDCARD)) {
            return 1.0;
        }

        // otherwise a single shared category is enough to produce a match
        if (Sets.intersection(leftCategories, rightCategories).isEmpty()) {
            return 0;
        }
        return 1;
    }

    /**
     * Translates an instance type label into its category.
     *
     * @param term the raw label
     * @return the mapped category, or {@code term} itself when the table has no entry for it
     */
    public String translate(String term){
        return translationMap.getOrDefault(term, term);
    }

    /** Returns the weight stored in the parent comparator. */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /** Returns the score unchanged; this comparator applies no normalization. */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -1,6 +1,10 @@
package eu.dnetlib.pace.comparators; package eu.dnetlib.pace.comparators;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.*; import eu.dnetlib.pace.tree.*;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
@ -11,11 +15,13 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.common.AbstractPaceFunctions;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
@TestInstance(TestInstance.Lifecycle.PER_CLASS) @TestInstance(TestInstance.Lifecycle.PER_CLASS)
public class ComparatorTest extends AbstractPaceFunctions { public class ComparatorTest extends AbstractPaceTest {
private Map<String, String> params; private Map<String, String> params;
private DedupConfig conf; private DedupConfig conf;
@ -138,12 +144,6 @@ public class ComparatorTest extends AbstractPaceFunctions {
result = levenstein.distance("Victoria", "Windsor", conf); result = levenstein.distance("Victoria", "Windsor", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
//University of Victoria Dataverse
//University of British Columbia Dataverse
//University of Windsor Dataverse
//University of Waterloo Dataverse
//University of Toronto Dataverse
//University of Ottawa Dataverse
} }
@Test @Test
@ -156,8 +156,32 @@ public class ComparatorTest extends AbstractPaceFunctions {
} }
@Test @Test
public void jsonListMatchTest(){ public void instanceTypeMatchTest() {
final InstanceTypeMatch instanceTypeMatch = new InstanceTypeMatch(params);
Field a = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType");
Field b = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType");
double result = instanceTypeMatch.compare(a, b, conf);
assertEquals(1.0, result);
Field c = createFieldList(Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType");
result = instanceTypeMatch.compare(c, b, conf);
assertEquals(1.0, result);
Field d = createFieldList(Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType");
Field e = createFieldList(Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType");
result = instanceTypeMatch.compare(d, e, conf);
assertEquals(1.0, result);
Field g = createFieldList(Arrays.asList("Software Paper", "Software Paper"), "instanceType");
result = instanceTypeMatch.compare(e, g, conf);
assertEquals(0.0, result);
} }
} }