dnet-dedup/dnet-pace-core/src/main/java/eu/dnetlib/pace/tree/StringListMatch.java

54 lines
1.6 KiB
Java

package eu.dnetlib.pace.tree;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
@ComparatorClass("stringListMatch")
public class StringListMatch extends AbstractComparator {
private static final Log log = LogFactory.getLog(StringListMatch.class);
private Map<String, String> params;
final private String TYPE; //percentage or count
public StringListMatch(final Map<String, String> params) {
super(params);
this.params = params;
TYPE = params.getOrDefault("type", "percentage");
}
@Override
public double compare(final Field a, final Field b, final Config conf) {
final Set<String> pa = new HashSet<>(((FieldList) a).stringList());
final Set<String> pb = new HashSet<>(((FieldList) b).stringList());
if (pa.isEmpty() || pb.isEmpty()) {
return -1; //return undefined if one of the two lists is empty
}
int incommon = Sets.intersection(pa, pb).size();
int simDiff = Sets.symmetricDifference(pa, pb).size();
if (incommon + simDiff == 0) {
return 0.0;
}
if(TYPE.equals("percentage"))
return (double)incommon / (incommon + simDiff);
else
return incommon;
}
}