revised PidMatch implementation, cleanup
This commit is contained in:
parent
0bab8cf704
commit
bc4505e0e6
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.google.gson.Gson;
|
||||
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
|
||||
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
||||
|
@ -21,12 +22,14 @@ import eu.dnetlib.pace.model.gt.GTAuthor;
|
|||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang.RandomStringUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang3.RandomUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
|
@ -101,6 +104,13 @@ public abstract class AbstractProtoPaceTest extends OafTest {
|
|||
return result(config, id, title, date, Lists.newArrayList(pid), authors);
|
||||
}
|
||||
|
||||
static List<String> pidTypes = Lists.newArrayList();
|
||||
static {
|
||||
pidTypes.add("doi");
|
||||
//pidTypes.add("oai");
|
||||
//pidTypes.add("pmid");
|
||||
}
|
||||
|
||||
protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid, final List<String> authors) {
|
||||
final Result.Metadata.Builder metadata = Result.Metadata.newBuilder();
|
||||
if (!StringUtils.isBlank(title)) {
|
||||
|
@ -126,7 +136,7 @@ public abstract class AbstractProtoPaceTest extends OafTest {
|
|||
if (pid != null) {
|
||||
for(String p : pid) {
|
||||
if (!StringUtils.isBlank(p)) {
|
||||
entity.addPid(sp(p, "doi"));
|
||||
// RandomUtils.nextInt(start, end) excludes its upper bound, so pass size() itself:
// with size() - 1 the last configured pid type could never be selected.
entity.addPid(sp(p, pidTypes.get(RandomUtils.nextInt(0, pidTypes.size()))));
|
||||
//entity.addPid(sp(RandomStringUtils.randomAlphabetic(10), "oai"));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -108,8 +108,8 @@ public class DetectorTest extends AbstractProtoPaceTest {
|
|||
public void testDistanceResultMissingTwoDate() {
|
||||
final Config config = getResultConf();
|
||||
|
||||
final MapDocument resA = result(config, "A", "title title title 6BESR");
|
||||
final MapDocument resB = result(config, "B", "title title title 6CLER");
|
||||
final MapDocument resA = result(config, "A", "bellaciao");
|
||||
final MapDocument resB = result(config, "B", "bellocioa");
|
||||
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
double d = sr.getScore();
|
||||
|
@ -326,6 +326,51 @@ public class DetectorTest extends AbstractProtoPaceTest {
|
|||
// assertTrue(d.getScore() == 0.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultNoPidsConf() {
|
||||
|
||||
final Config config = getResultFullConf();
|
||||
|
||||
final MapDocument resA =
|
||||
result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010");
|
||||
|
||||
final MapDocument resB =
|
||||
result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010");
|
||||
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
final double s = sr.getScore();
|
||||
|
||||
log.info(sr.toString());
|
||||
log.info(String.format(" s ---> %s", s));
|
||||
// assertTrue(d.getScore() == 0.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultPidsConf() {
|
||||
|
||||
final Config config = getResultFullConf();
|
||||
|
||||
final List<String> authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
|
||||
final List<String> authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
|
||||
|
||||
final List<String> pidA = Lists.newArrayList("10.1186/1752-1947-4-299", "a", "b");
|
||||
final MapDocument resA =
|
||||
result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
|
||||
pidA, authorsA);
|
||||
|
||||
final List<String> pidB = Lists.newArrayList("c", "a", "10.1186/1752-1947-4-299", "d");
|
||||
final MapDocument resB =
|
||||
result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010",
|
||||
pidB, authorsB);
|
||||
|
||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||
final double s = sr.getScore();
|
||||
log.info(sr.toString());
|
||||
log.info(String.format(" s ---> %s", s));
|
||||
|
||||
// assertTrue(d.getScore() == 0.0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDistanceResultFullConf() {
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
],
|
||||
"model" : [
|
||||
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
|
||||
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
|
||||
{ "name" : "title", "algo" : "Level2Levenstein", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
|
||||
{ "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" }
|
||||
],
|
||||
"blacklists" : { }
|
||||
|
|
|
@ -1,7 +1,11 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Cond;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
@ -33,20 +37,21 @@ public class PidMatch extends AbstractCondition {
|
|||
final List<Pid> pal = Pid.fromOafJson(sa);
|
||||
final List<Pid> pbl = Pid.fromOafJson(sb);
|
||||
|
||||
int result = 0;
|
||||
for(Pid pa : pal) {
|
||||
final String ta = pa.getType();
|
||||
final Set<String> pidAset = toHashSet(pal);
|
||||
final Set<String> pidBset = toHashSet(pbl);
|
||||
|
||||
for(Pid pb : pbl) {
|
||||
final String tb = pb.getType();
|
||||
int incommon = Sets.intersection(pidAset, pidBset).size();
|
||||
int simDiff = Sets.symmetricDifference(pidAset, pidBset).size();
|
||||
|
||||
if (tb.equalsIgnoreCase(ta)) {
|
||||
result += pa.getValue().equalsIgnoreCase(pb.getValue()) ? 1 : -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Share of distinct pids common to both records: |A ∩ B| / |A ∪ B|, where the union size is
// intersection + symmetric difference. The division must happen in floating point: with int
// operands the quotient truncates to 0 unless the two sets are identical, which makes the
// 0.5 threshold meaningless.
// When neither record has any pid the ratio is undefined (and the int division would throw
// ArithmeticException), so report 0 instead.
// NOTE(review): confirm 0 is the intended neutral outcome for ConditionEval.
final int pidUnion = incommon + simDiff;
int result = pidUnion == 0 ? 0 : ((double) incommon / pidUnion > 0.5 ? 1 : -1);
|
||||
|
||||
return new ConditionEval(cond, a, b, result);
|
||||
}
|
||||
|
||||
private Set<String> toHashSet(List<Pid> pbl) {
|
||||
return pbl.stream()
|
||||
.map(pid -> pid.getType() + pid.getValue())
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,25 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
|
||||
public class LevensteinDate extends Levenstein {
|
||||
|
||||
|
||||
public LevensteinDate(double w) {
|
||||
super(w);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,37 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
|
||||
public class YearLevenstein extends SubStringLevenstein {
|
||||
|
||||
public YearLevenstein(double w) {
|
||||
super(w);
|
||||
}
|
||||
|
||||
public YearLevenstein(double w, int limit) {
|
||||
super(w, limit);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
boolean check = checkLength(a) && checkLength(b);
|
||||
if (check) {
|
||||
if (a.equals(b)) {
|
||||
return 1.0;
|
||||
} else {
|
||||
return 0.5;
|
||||
}
|
||||
} else {
|
||||
return 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
protected boolean checkLength(String s) {
|
||||
return getNumbers(s).length() == limit;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
}
|
|
@ -83,8 +83,6 @@ public class FieldDef implements Serializable {
|
|||
return new LevensteinTitle(getWeight());
|
||||
case SubStringLevenstein:
|
||||
return new SubStringLevenstein(getWeight(), getLimit());
|
||||
case YearLevenstein:
|
||||
return new YearLevenstein(getWeight(), getLimit());
|
||||
case SortedJaroWinkler:
|
||||
return new SortedJaroWinkler(getWeight());
|
||||
case SortedLevel2JaroWinkler:
|
||||
|
|
Loading…
Reference in New Issue