forked from D-Net/dnet-hadoop
revised PidMatch implementation, cleanup
This commit is contained in:
parent
3197f26691
commit
4d379c2227
|
@ -1,7 +1,11 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Cond;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
@ -33,20 +37,21 @@ public class PidMatch extends AbstractCondition {
|
|||
final List<Pid> pal = Pid.fromOafJson(sa);
|
||||
final List<Pid> pbl = Pid.fromOafJson(sb);
|
||||
|
||||
int result = 0;
|
||||
for(Pid pa : pal) {
|
||||
final String ta = pa.getType();
|
||||
final Set<String> pidAset = toHashSet(pal);
|
||||
final Set<String> pidBset = toHashSet(pbl);
|
||||
|
||||
for(Pid pb : pbl) {
|
||||
final String tb = pb.getType();
|
||||
int incommon = Sets.intersection(pidAset, pidBset).size();
|
||||
int simDiff = Sets.symmetricDifference(pidAset, pidBset).size();
|
||||
|
||||
if (tb.equalsIgnoreCase(ta)) {
|
||||
result += pa.getValue().equalsIgnoreCase(pb.getValue()) ? 1 : -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Fraction of the distinct pid keys (common + symmetric difference) that the two
// records share. The ratio has to be computed in floating point: with two int
// operands the quotient truncates to 0 or 1, so the 0.5 threshold could only be
// passed when both sets were identical.
// The incommon > 0 guard also covers the case where both sets are empty, which
// would otherwise divide by zero.
// NOTE(review): an empty comparison yields -1 here to stay within the {1, -1}
// outcomes of this expression; confirm whether a neutral 0 is expected instead.
int result = incommon > 0 && (double) incommon / (incommon + simDiff) > 0.5 ? 1 : -1;
|
||||
|
||||
return new ConditionEval(cond, a, b, result);
|
||||
}
|
||||
|
||||
private Set<String> toHashSet(List<Pid> pbl) {
|
||||
return pbl.stream()
|
||||
.map(pid -> pid.getType() + pid.getValue())
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,25 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
|
||||
public class LevensteinDate extends Levenstein {
|
||||
|
||||
|
||||
public LevensteinDate(double w) {
|
||||
super(w);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,37 +0,0 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
|
||||
public class YearLevenstein extends SubStringLevenstein {
|
||||
|
||||
public YearLevenstein(double w) {
|
||||
super(w);
|
||||
}
|
||||
|
||||
public YearLevenstein(double w, int limit) {
|
||||
super(w, limit);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b) {
|
||||
boolean check = checkLength(a) && checkLength(b);
|
||||
if (check) {
|
||||
if (a.equals(b)) {
|
||||
return 1.0;
|
||||
} else {
|
||||
return 0.5;
|
||||
}
|
||||
} else {
|
||||
return 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
protected boolean checkLength(String s) {
|
||||
return getNumbers(s).length() == limit;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
}
|
|
@ -83,8 +83,6 @@ public class FieldDef implements Serializable {
|
|||
return new LevensteinTitle(getWeight());
|
||||
case SubStringLevenstein:
|
||||
return new SubStringLevenstein(getWeight(), getLimit());
|
||||
case YearLevenstein:
|
||||
return new YearLevenstein(getWeight(), getLimit());
|
||||
case SortedJaroWinkler:
|
||||
return new SortedJaroWinkler(getWeight());
|
||||
case SortedLevel2JaroWinkler:
|
||||
|
|
Loading…
Reference in New Issue