revised PidMatch implementation, cleanup

This commit is contained in:
Claudio Atzori 2018-10-20 08:38:19 +02:00
parent 3197f26691
commit 4d379c2227
4 changed files with 15 additions and 74 deletions

View File

@ -1,7 +1,11 @@
package eu.dnetlib.pace.condition; package eu.dnetlib.pace.condition;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
@ -33,20 +37,21 @@ public class PidMatch extends AbstractCondition {
final List<Pid> pal = Pid.fromOafJson(sa); final List<Pid> pal = Pid.fromOafJson(sa);
final List<Pid> pbl = Pid.fromOafJson(sb); final List<Pid> pbl = Pid.fromOafJson(sb);
int result = 0; final Set<String> pidAset = toHashSet(pal);
for(Pid pa : pal) { final Set<String> pidBset = toHashSet(pbl);
final String ta = pa.getType();
for(Pid pb : pbl) { int incommon = Sets.intersection(pidAset, pidBset).size();
final String tb = pb.getType(); int simDiff = Sets.symmetricDifference(pidAset, pidBset).size();
if (tb.equalsIgnoreCase(ta)) { int result = incommon / (incommon + simDiff) > 0.5 ? 1 : -1;
result += pa.getValue().equalsIgnoreCase(pb.getValue()) ? 1 : -1;
}
}
}
return new ConditionEval(cond, a, b, result); return new ConditionEval(cond, a, b, result);
} }
/**
 * Builds the set of comparison keys for a list of persistent identifiers.
 *
 * <p>Each key is the PID type immediately followed by the PID value, folded to lower case with
 * the root locale. Folding keeps the comparison case-insensitive, so the same identifier written
 * with different letter case yields a single key.
 *
 * @param pbl the identifiers to index
 * @return a mutable {@link HashSet} containing the distinct lower-cased keys
 */
private Set<String> toHashSet(List<Pid> pbl) {
    return pbl.stream()
            // Concatenate first and fold afterwards: a null type or value is still rendered
            // as "null" by the concatenation instead of raising a NullPointerException.
            .map(pid -> (pid.getType() + pid.getValue()).toLowerCase(java.util.Locale.ROOT))
            .collect(Collectors.toCollection(HashSet::new));
}
} }

View File

@ -1,25 +0,0 @@
package eu.dnetlib.pace.distance.algo;
/**
 * {@link Levenstein} subclass that performs no real comparison: every pair of values receives
 * the same constant score.
 *
 * <p>NOTE(review): judging by the name it is meant for date fields; nothing in this class
 * inspects the values, so confirm the intent with the callers.
 */
public class LevensteinDate extends Levenstein {

    /**
     * Creates the algorithm with the given weight.
     *
     * @param w weight forwarded to the parent constructor
     */
    public LevensteinDate(double w) {
        super(w);
    }

    /**
     * Returns the inherited weight field.
     *
     * @return the value of the inherited {@code weight} field
     */
    @Override
    public double getWeight() {
        return super.weight;
    }

    /**
     * Scores a pair of values; both arguments are ignored.
     *
     * @param a first value (unused)
     * @param b second value (unused)
     * @return always {@code 1.0}
     */
    @Override
    public double distance(String a, String b) {
        return 1.0;
    }
}

View File

@ -1,37 +0,0 @@
package eu.dnetlib.pace.distance.algo;
/**
 * {@link SubStringLevenstein} subclass that only distinguishes values whose numeric part has
 * exactly {@code limit} characters.
 *
 * <p>Such values score {@code 1.0} when the two strings are equal and {@code 0.5} otherwise;
 * whenever either value fails the length check the score is {@code 1.0}.
 */
public class YearLevenstein extends SubStringLevenstein {

    /**
     * Creates the algorithm with the given weight.
     *
     * @param w weight forwarded to the parent constructor
     */
    public YearLevenstein(double w) {
        super(w);
    }

    /**
     * Creates the algorithm with the given weight and limit.
     *
     * @param w     weight forwarded to the parent constructor
     * @param limit limit forwarded to the parent constructor; {@link #checkLength(String)}
     *              compares the numeric part of a value against it
     */
    public YearLevenstein(double w, int limit) {
        super(w, limit);
    }

    /**
     * Scores a pair of values.
     *
     * @param a first value
     * @param b second value
     * @return {@code 0.5} when both values pass {@link #checkLength(String)} and the strings
     *         differ, {@code 1.0} in every other case
     */
    @Override
    public double distance(String a, String b) {
        // b is only checked when a already passed, as with a short-circuit conjunction.
        final boolean bothWellFormed = checkLength(a) && checkLength(b);
        if (!bothWellFormed) {
            return 1.0;
        }
        return a.equals(b) ? 1.0 : 0.5;
    }

    /**
     * Tells whether the numeric part of a value has exactly {@code limit} characters.
     *
     * @param s value to inspect
     * @return {@code true} when {@code getNumbers(s)} has length {@code limit}
     */
    protected boolean checkLength(String s) {
        return limit == getNumbers(s).length();
    }

    /**
     * Returns the inherited weight field.
     *
     * @return the value of the inherited {@code weight} field
     */
    @Override
    public double getWeight() {
        return super.weight;
    }
}

View File

@ -83,8 +83,6 @@ public class FieldDef implements Serializable {
return new LevensteinTitle(getWeight()); return new LevensteinTitle(getWeight());
case SubStringLevenstein: case SubStringLevenstein:
return new SubStringLevenstein(getWeight(), getLimit()); return new SubStringLevenstein(getWeight(), getLimit());
case YearLevenstein:
return new YearLevenstein(getWeight(), getLimit());
case SortedJaroWinkler: case SortedJaroWinkler:
return new SortedJaroWinkler(getWeight()); return new SortedJaroWinkler(getWeight());
case SortedLevel2JaroWinkler: case SortedLevel2JaroWinkler: