revised PidMatch implementation, cleanup

This commit is contained in:
Claudio Atzori 2018-10-20 08:38:19 +02:00
parent 0bab8cf704
commit bc4505e0e6
7 changed files with 74 additions and 78 deletions

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.gson.Gson;
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
@ -21,12 +22,14 @@ import eu.dnetlib.pace.model.gt.GTAuthor;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.RandomUtils;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@ -101,6 +104,13 @@ public abstract class AbstractProtoPaceTest extends OafTest {
return result(config, id, title, date, Lists.newArrayList(pid), authors);
}
static List<String> pidTypes = Lists.newArrayList();
static {
pidTypes.add("doi");
//pidTypes.add("oai");
//pidTypes.add("pmid");
}
protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid, final List<String> authors) {
final Result.Metadata.Builder metadata = Result.Metadata.newBuilder();
if (!StringUtils.isBlank(title)) {
@ -126,7 +136,7 @@ public abstract class AbstractProtoPaceTest extends OafTest {
if (pid != null) {
for(String p : pid) {
if (!StringUtils.isBlank(p)) {
entity.addPid(sp(p, "doi"));
// RandomUtils.nextInt(startInclusive, endExclusive) excludes its upper bound, so the bound must be
// pidTypes.size() (not size() - 1); otherwise the last configured pid type can never be selected.
entity.addPid(sp(p, pidTypes.get(RandomUtils.nextInt(0, pidTypes.size()))));
//entity.addPid(sp(RandomStringUtils.randomAlphabetic(10), "oai"));
}
}

View File

@ -108,8 +108,8 @@ public class DetectorTest extends AbstractProtoPaceTest {
public void testDistanceResultMissingTwoDate() {
final Config config = getResultConf();
final MapDocument resA = result(config, "A", "title title title 6BESR");
final MapDocument resB = result(config, "B", "title title title 6CLER");
final MapDocument resA = result(config, "A", "bellaciao");
final MapDocument resB = result(config, "B", "bellocioa");
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
double d = sr.getScore();
@ -326,6 +326,51 @@ public class DetectorTest extends AbstractProtoPaceTest {
// assertTrue(d.getScore() == 0.0);
}
/**
 * Computes the distance between two results that carry only an id, a title and a date
 * (the two titles differ by a trailing "X"), using the full result configuration.
 * The outcome is only logged; no assertion is made on the score yet.
 */
@Test
public void testDistanceResultNoPidsConf() {
    final Config fullConf = getResultFullConf();

    final MapDocument docA = result(
            fullConf,
            "A",
            "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports",
            "2010");
    final MapDocument docB = result(
            fullConf,
            "B",
            "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX",
            "2010");

    final ScoreResult scoreResult = new PaceDocumentDistance().between(docA, docB, fullConf);
    final double score = scoreResult.getScore();

    // Log the full evaluation first, then the bare score.
    log.info(scoreResult.toString());
    log.info(String.format(" s ---> %s", score));
}
/**
 * Computes the distance between two results whose titles differ by a trailing "X",
 * whose author names are given in different orders, and whose pid lists only partially
 * overlap, using the full result configuration.
 * The outcome is only logged; no assertion is made on the score yet.
 */
@Test
public void testDistanceResultPidsConf() {
    final Config fullConf = getResultFullConf();

    // Record A: three pids, authors written "first last".
    final List<String> firstAuthors = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
    final List<String> firstPids = Lists.newArrayList("10.1186/1752-1947-4-299", "a", "b");

    // Record B: four pids (two shared with A), authors written in the opposite order.
    final List<String> secondAuthors = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
    final List<String> secondPids = Lists.newArrayList("c", "a", "10.1186/1752-1947-4-299", "d");

    final MapDocument docA = result(
            fullConf,
            "A",
            "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports",
            "2010",
            firstPids,
            firstAuthors);
    final MapDocument docB = result(
            fullConf,
            "B",
            "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX",
            "2010",
            secondPids,
            secondAuthors);

    final ScoreResult scoreResult = new PaceDocumentDistance().between(docA, docB, fullConf);
    final double score = scoreResult.getScore();

    // Log the full evaluation first, then the bare score.
    log.info(scoreResult.toString());
    log.info(String.format(" s ---> %s", score));
}
@Test
public void testDistanceResultFullConf() {

View File

@ -20,7 +20,7 @@
],
"model" : [
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "title", "algo" : "Level2Levenstein", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" }
],
"blacklists" : { }

View File

@ -1,7 +1,11 @@
package eu.dnetlib.pace.condition;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field;
@ -33,20 +37,21 @@ public class PidMatch extends AbstractCondition {
final List<Pid> pal = Pid.fromOafJson(sa);
final List<Pid> pbl = Pid.fromOafJson(sb);
int result = 0;
for(Pid pa : pal) {
final String ta = pa.getType();
final Set<String> pidAset = toHashSet(pal);
final Set<String> pidBset = toHashSet(pbl);
for(Pid pb : pbl) {
final String tb = pb.getType();
int incommon = Sets.intersection(pidAset, pidBset).size();
int simDiff = Sets.symmetricDifference(pidAset, pidBset).size();
if (tb.equalsIgnoreCase(ta)) {
result += pa.getValue().equalsIgnoreCase(pb.getValue()) ? 1 : -1;
}
}
}
// Share of pids in common over all distinct pids of the two records (intersection / union).
// The comparison is done in integer arithmetic (2 * incommon > union <=> incommon / union > 0.5):
// the previous int division truncated the ratio to 0 or 1, so only identical sets could match,
// and it threw ArithmeticException when neither record had any pid.
// With no pids at all there is nothing to compare, so the result stays 0, the value the
// loop-based comparison left when it found no comparable pids.
final int pidUnion = incommon + simDiff;
int result = pidUnion == 0 ? 0 : (2 * incommon > pidUnion ? 1 : -1);
return new ConditionEval(cond, a, b, result);
}
/**
 * Turns a list of pids into a set of comparable keys, one per distinct (type, value) pair.
 *
 * <p>Type and value are joined with a "::" delimiter so that different pairs cannot collapse
 * into the same key (e.g. "doi" + "x1" vs. "doix" + "1"), and the key is lower-cased so the
 * comparison is case-insensitive, as the equalsIgnoreCase-based comparison it replaces was.
 * A null type or value is rendered as the text "null" by string concatenation, so it does
 * not raise a NullPointerException here.
 *
 * @param pbl the pids to convert
 * @return a new mutable HashSet with one normalized key per distinct pid
 */
private Set<String> toHashSet(List<Pid> pbl) {
    return pbl.stream()
            .map(pid -> (pid.getType() + "::" + pid.getValue()).toLowerCase(java.util.Locale.ROOT))
            .collect(Collectors.toCollection(HashSet::new));
}
}

View File

@ -1,25 +0,0 @@
package eu.dnetlib.pace.distance.algo;
/**
 * Levenstein variant that ignores its inputs: {@link #distance(String, String)} always
 * returns the same constant value, whatever the two strings are.
 */
public class LevensteinDate extends Levenstein {

    /**
     * @param w the weight, passed unchanged to the parent constructor
     */
    public LevensteinDate(final double w) {
        super(w);
    }

    /**
     * @param a first value (not inspected)
     * @param b second value (not inspected)
     * @return always 1.0
     */
    @Override
    public double distance(final String a, final String b) {
        return 1.0d;
    }

    /**
     * @return the weight field inherited from the parent class
     */
    @Override
    public double getWeight() {
        return super.weight;
    }
}

View File

@ -1,37 +0,0 @@
package eu.dnetlib.pace.distance.algo;
/**
 * SubStringLevenstein variant that compares two values only when both pass
 * {@link #checkLength(String)}; in every other case the values are treated as matching.
 */
public class YearLevenstein extends SubStringLevenstein {

    /**
     * @param w the weight, passed unchanged to the parent constructor
     */
    public YearLevenstein(final double w) {
        super(w);
    }

    /**
     * @param w     the weight, passed unchanged to the parent constructor
     * @param limit the limit, passed unchanged to the parent constructor
     */
    public YearLevenstein(final double w, final int limit) {
        super(w, limit);
    }

    /**
     * @return 1.0 when either value fails the length check or when the two values are
     *         equal; 0.5 when both pass the check but differ
     */
    @Override
    public double distance(String a, String b) {
        // Values that cannot be checked are not penalised (b is only checked when a passes).
        if (!checkLength(a) || !checkLength(b)) {
            return 1.0;
        }
        return a.equals(b) ? 1.0 : 0.5;
    }

    /**
     * Tells whether what getNumbers extracts from {@code s} is exactly {@code limit} long.
     * NOTE(review): getNumbers is inherited; presumably it keeps only the digit characters,
     * confirm in the parent class.
     */
    protected boolean checkLength(String s) {
        return getNumbers(s).length() == limit;
    }

    /**
     * @return the weight field inherited from the parent class
     */
    @Override
    public double getWeight() {
        return super.weight;
    }
}

View File

@ -83,8 +83,6 @@ public class FieldDef implements Serializable {
return new LevensteinTitle(getWeight());
case SubStringLevenstein:
return new SubStringLevenstein(getWeight(), getLimit());
case YearLevenstein:
return new YearLevenstein(getWeight(), getLimit());
case SortedJaroWinkler:
return new SortedJaroWinkler(getWeight());
case SortedLevel2JaroWinkler: