forked from D-Net/dnet-hadoop
update in the discovery of clustering, conditions and distance functions (annotated with custom annotations)
This commit is contained in:
parent
4d379c2227
commit
1cbbc3f15a
|
@ -55,6 +55,12 @@
|
|||
<artifactId>junit</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.reflections</groupId>
|
||||
<artifactId>reflections</artifactId>
|
||||
<version>0.9.10</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -19,6 +19,12 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
|
|||
this.params = params;
|
||||
}
|
||||
|
||||
public AbstractClusteringFunction(){}
|
||||
|
||||
public void setParams(Map<String, Integer> params){
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
protected abstract Collection<String> doApply(String s);
|
||||
|
||||
@Override
|
||||
|
|
|
@ -7,12 +7,17 @@ import java.util.StringTokenizer;
|
|||
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
@ClusteringClass("acronyms")
|
||||
public class Acronyms extends AbstractClusteringFunction {
|
||||
|
||||
public Acronyms(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
public Acronyms(){
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
|
||||
|
|
|
@ -22,9 +22,6 @@ public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
|
|||
|
||||
private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class);
|
||||
|
||||
|
||||
|
||||
|
||||
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {
|
||||
|
||||
final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, conf.blacklists());
|
||||
|
|
|
@ -1,5 +0,0 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
public enum Clustering {
|
||||
acronyms, ngrams, ngrampairs, sortedngrampairs, suffixprefix, spacetrimmingfieldvalue, immutablefieldvalue, personhash, personclustering, lowercase, urlclustering
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
 * Type-level marker carrying the name under which a clustering function class is registered.
 * The annotation is retained at runtime so that it can be read reflectively
 * (ClusteringResolver uses {@link #value()} as the lookup key).
 */
@Target(ElementType.TYPE)
@Retention(RetentionPolicy.RUNTIME)
public @interface ClusteringClass {

    /**
     * @return the name identifying the annotated class
     */
    String value();
}
|
|
@ -12,4 +12,5 @@ public interface ClusteringFunction {
|
|||
|
||||
public Map<String, Integer> getParams();
|
||||
|
||||
public void setParams(Map<String, Integer> params);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.reflections.Reflections;
|
||||
|
||||
public class ClusteringResolver implements Serializable {
|
||||
private final Map<String, Class<ClusteringFunction>> functionMap;
|
||||
|
||||
public ClusteringResolver() {
|
||||
|
||||
this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ClusteringClass.class).stream()
|
||||
.filter(ClusteringFunction.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>)cl));
|
||||
}
|
||||
|
||||
public ClusteringFunction resolve(String clusteringFunction) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
|
||||
|
||||
return functionMap.get(clusteringFunction).newInstance();
|
||||
}
|
||||
}
|
|
@ -6,12 +6,17 @@ import java.util.Map;
|
|||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
@ClusteringClass("immutablefieldvalue")
|
||||
public class ImmutableFieldValue extends AbstractClusteringFunction {
|
||||
|
||||
public ImmutableFieldValue(final Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
public ImmutableFieldValue() {
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final String s) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
|
|
@ -9,12 +9,17 @@ import com.google.common.collect.Sets;
|
|||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
@ClusteringClass("lowercase")
|
||||
public class LowercaseClustering extends AbstractClusteringFunction {
|
||||
|
||||
public LowercaseClustering(final Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
public LowercaseClustering(){
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(List<Field> fields) {
|
||||
Collection<String> c = Sets.newLinkedHashSet();
|
||||
|
|
|
@ -6,8 +6,13 @@ import java.util.Map;
|
|||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
@ClusteringClass("ngrampairs")
|
||||
public class NgramPairs extends Ngrams {
|
||||
|
||||
public NgramPairs() {
|
||||
super();
|
||||
}
|
||||
|
||||
public NgramPairs(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
|
|
@ -5,12 +5,17 @@ import java.util.LinkedHashSet;
|
|||
import java.util.Map;
|
||||
import java.util.StringTokenizer;
|
||||
|
||||
@ClusteringClass("ngrams")
|
||||
public class Ngrams extends AbstractClusteringFunction {
|
||||
|
||||
public Ngrams(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
public Ngrams() {
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -18,6 +19,7 @@ import eu.dnetlib.pace.model.Field;
|
|||
import eu.dnetlib.pace.model.gt.Author;
|
||||
import eu.dnetlib.pace.model.gt.GTAuthor;
|
||||
|
||||
@ClusteringClass("personclustering")
|
||||
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
||||
|
||||
private Map<String, Integer> params;
|
||||
|
@ -28,6 +30,10 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
|
|||
this.params = params;
|
||||
}
|
||||
|
||||
public void setParams(Map<String, Integer> params){
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(final List<Field> fields) {
|
||||
final Set<String> hashes = Sets.newHashSet();
|
||||
|
|
|
@ -8,6 +8,7 @@ import com.google.common.collect.Lists;
|
|||
|
||||
import eu.dnetlib.pace.model.Person;
|
||||
|
||||
@ClusteringClass("personhash")
|
||||
public class PersonHash extends AbstractClusteringFunction {
|
||||
|
||||
private boolean DEFAULT_AGGRESSIVE = false;
|
||||
|
@ -16,6 +17,10 @@ public class PersonHash extends AbstractClusteringFunction {
|
|||
super(params);
|
||||
}
|
||||
|
||||
public PersonHash(){
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final String s) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
|
|
@ -9,6 +9,10 @@ public class RandomClusteringFunction extends AbstractClusteringFunction {
|
|||
super(params);
|
||||
}
|
||||
|
||||
public RandomClusteringFunction(){
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
// TODO Auto-generated method stub
|
||||
|
|
|
@ -9,12 +9,17 @@ import com.google.common.base.Joiner;
|
|||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
@ClusteringClass("sortedngrampairs")
|
||||
public class SortedNgramPairs extends NgramPairs {
|
||||
|
||||
public SortedNgramPairs(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
public SortedNgramPairs(){
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
|
||||
|
|
|
@ -9,12 +9,17 @@ import org.apache.commons.lang.StringUtils;
|
|||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
@ClusteringClass("spacetrimmingfieldvalue")
|
||||
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
|
||||
|
||||
public SpaceTrimmingFieldValue(final Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
public SpaceTrimmingFieldValue(){
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final String s) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
|
|
@ -6,12 +6,17 @@ import java.util.Set;
|
|||
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
@ClusteringClass("suffixprefix")
|
||||
public class SuffixPrefix extends AbstractClusteringFunction {
|
||||
|
||||
public SuffixPrefix(Map<String, Integer> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
public SuffixPrefix(){
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(String s) {
|
||||
return suffixPrefix(s, param("len"), param("max"));
|
||||
|
|
|
@ -11,6 +11,7 @@ import java.util.List;
|
|||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@ClusteringClass("urlclustering")
|
||||
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
||||
|
||||
protected Map<String, Integer> params;
|
||||
|
@ -19,6 +20,14 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu
|
|||
this.params = params;
|
||||
}
|
||||
|
||||
public UrlClustering() {
|
||||
super();
|
||||
}
|
||||
|
||||
public void setParams(Map<String, Integer> params){
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<String> apply(List<Field> fields) {
|
||||
return fields.stream()
|
||||
|
|
|
@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
|
|||
|
||||
import java.util.List;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
import eu.dnetlib.pace.config.Cond;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
||||
import eu.dnetlib.pace.model.Document;
|
||||
|
@ -17,15 +16,25 @@ import eu.dnetlib.pace.model.FieldDef;
|
|||
*/
|
||||
public abstract class AbstractCondition extends AbstractPaceFunctions implements ConditionAlgo {
|
||||
|
||||
protected Cond cond;
|
||||
protected String cond;
|
||||
|
||||
protected List<FieldDef> fields;
|
||||
|
||||
public AbstractCondition(final Cond cond, final List<FieldDef> fields) {
|
||||
public AbstractCondition(final String cond, final List<FieldDef> fields) {
|
||||
this.cond = cond;
|
||||
this.fields = fields;
|
||||
}
|
||||
|
||||
public AbstractCondition(){}
|
||||
|
||||
public void setCond(String cond){
|
||||
this.cond = cond;
|
||||
}
|
||||
|
||||
public void setFields(List<FieldDef> fields){
|
||||
this.fields = fields;
|
||||
}
|
||||
|
||||
protected abstract ConditionEval verify(FieldDef fd, Field a, Field b);
|
||||
|
||||
@Override
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.List;
|
||||
import eu.dnetlib.pace.config.Cond;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
@ -11,12 +10,16 @@ import eu.dnetlib.pace.model.FieldDef;
|
|||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("alwaystruecondition")
|
||||
public class AlwaysTrueCondition extends AbstractCondition {
|
||||
|
||||
public AlwaysTrueCondition(final Cond cond, final List<FieldDef> fields) {
|
||||
public AlwaysTrueCondition(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
public AlwaysTrueCondition(){
|
||||
super();
|
||||
}
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
return new ConditionEval(cond, a, b, 1);
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import java.util.List;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
||||
import eu.dnetlib.pace.model.Document;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
/**
|
||||
* Allows expressing general conditions to be satisfied or not between two Documents.
|
||||
|
@ -24,4 +24,7 @@ public interface ConditionAlgo {
|
|||
*/
|
||||
public abstract ConditionEvalMap verify(Document a, Document b);
|
||||
|
||||
public void setFields(List<FieldDef> fields);
|
||||
public void setCond(String name);
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
 * Type-level marker carrying the name under which a condition class is registered.
 * The annotation is retained at runtime so that it can be read reflectively
 * (ConditionResolver uses {@link #value()} as the lookup key).
 */
@Target(ElementType.TYPE)
@Retention(RetentionPolicy.RUNTIME)
public @interface ConditionClass {

    /**
     * @return the name identifying the annotated class
     */
    String value();
}
|
|
@ -0,0 +1,22 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.reflections.Reflections;
|
||||
|
||||
public class ConditionResolver implements Serializable {
|
||||
private final Map<String, Class<ConditionAlgo>> functionMap;
|
||||
|
||||
public ConditionResolver() {
|
||||
|
||||
this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ConditionClass.class).stream()
|
||||
.filter(ConditionAlgo.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ConditionClass.class).value(), cl -> (Class<ConditionAlgo>)cl));
|
||||
}
|
||||
|
||||
public ConditionAlgo resolve(String name) throws IllegalAccessException, InstantiationException {
|
||||
return functionMap.get(name).newInstance();
|
||||
}
|
||||
}
|
|
@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
|
|||
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.config.Cond;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
|
@ -11,11 +10,12 @@ import eu.dnetlib.pace.model.FieldDef;
|
|||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("doiExactMatch")
|
||||
public class DoiExactMatch extends ExactMatchIgnoreCase {
|
||||
|
||||
public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||
|
||||
public DoiExactMatch(final Cond cond, final List<FieldDef> fields) {
|
||||
public DoiExactMatch(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
|
|
|
@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
|
|||
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.config.Cond;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
@ -13,12 +12,17 @@ import org.apache.commons.lang.StringUtils;
|
|||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("exactMatch")
|
||||
public class ExactMatch extends AbstractCondition {
|
||||
|
||||
public ExactMatch(final Cond cond, final List<FieldDef> fields) {
|
||||
public ExactMatch(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
public ExactMatch(){
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||
|
||||
|
|
|
@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
|
|||
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.config.Cond;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
@ -12,9 +11,10 @@ import eu.dnetlib.pace.model.FieldDef;
|
|||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("exactMatchIgnoreCase")
|
||||
public class ExactMatchIgnoreCase extends AbstractCondition {
|
||||
|
||||
public ExactMatchIgnoreCase(final Cond cond, final List<FieldDef> fields) {
|
||||
public ExactMatchIgnoreCase(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
|
|
|
@ -3,7 +3,6 @@ package eu.dnetlib.pace.condition;
|
|||
import java.util.List;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
import eu.dnetlib.pace.config.Cond;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
@ -13,6 +12,7 @@ import eu.dnetlib.pace.model.FieldDef;
|
|||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("mustBeDifferent")
|
||||
public class MustBeDifferent extends AbstractCondition {
|
||||
|
||||
/**
|
||||
|
@ -20,7 +20,7 @@ public class MustBeDifferent extends AbstractCondition {
|
|||
*
|
||||
* @param fields the fields
|
||||
*/
|
||||
public MustBeDifferent(final Cond cond, final List<FieldDef> fields) {
|
||||
public MustBeDifferent(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
|
|
|
@ -6,7 +6,6 @@ import java.util.Set;
|
|||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.pace.config.Cond;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
@ -20,11 +19,12 @@ import org.apache.commons.logging.LogFactory;
|
|||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("pidMatch")
|
||||
public class PidMatch extends AbstractCondition {
|
||||
|
||||
private static final Log log = LogFactory.getLog(PidMatch.class);
|
||||
|
||||
public PidMatch(final Cond cond, final List<FieldDef> fields) {
|
||||
public PidMatch(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
|
|
|
@ -4,7 +4,6 @@ import java.util.List;
|
|||
|
||||
import com.google.common.collect.Iterables;
|
||||
|
||||
import eu.dnetlib.pace.config.Cond;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
@ -14,6 +13,7 @@ import eu.dnetlib.pace.model.FieldDef;
|
|||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("sizeMatch")
|
||||
public class SizeMatch extends AbstractCondition {
|
||||
|
||||
/**
|
||||
|
@ -22,7 +22,7 @@ public class SizeMatch extends AbstractCondition {
|
|||
* @param fields
|
||||
* the fields
|
||||
*/
|
||||
public SizeMatch(final Cond cond, final List<FieldDef> fields) {
|
||||
public SizeMatch(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
|
|
|
@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
|
|||
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.config.Cond;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
@ -13,9 +12,10 @@ import eu.dnetlib.pace.model.FieldDef;
|
|||
* @author claudio
|
||||
*
|
||||
*/
|
||||
@ConditionClass("titleVersionMatch")
|
||||
public class TitleVersionMatch extends AbstractCondition {
|
||||
|
||||
public TitleVersionMatch(final Cond cond, final List<FieldDef> fields) {
|
||||
public TitleVersionMatch(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
package eu.dnetlib.pace.condition;
|
||||
|
||||
import java.time.Year;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.pace.config.Cond;
|
||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
@ -14,14 +14,17 @@ import eu.dnetlib.pace.model.FieldDef;
|
|||
*
|
||||
* @author claudio
|
||||
*/
|
||||
@ConditionClass("yearMatch")
|
||||
public class YearMatch extends AbstractCondition {
|
||||
|
||||
private int limit = 4;
|
||||
|
||||
public YearMatch(final Cond cond, final List<FieldDef> fields) {
|
||||
public YearMatch(final String cond, final List<FieldDef> fields) {
|
||||
super(cond, fields);
|
||||
}
|
||||
|
||||
public YearMatch(){}
|
||||
|
||||
// @Override
|
||||
// public boolean verify(final Document a, final Document b) {
|
||||
// boolean res = true;
|
||||
|
|
|
@ -1,46 +0,0 @@
|
|||
package eu.dnetlib.pace.config;
|
||||
|
||||
/**
|
||||
* Enumerates the distance Algos.
|
||||
*/
|
||||
public enum Algo {
|
||||
|
||||
/** The Jaro winkler. */
|
||||
JaroWinkler,
|
||||
/** The Jaro winkler title. */
|
||||
JaroWinklerTitle,
|
||||
/** The Levenstein. */
|
||||
Levenstein,
|
||||
/** The Levenstein distance for title matching */
|
||||
LevensteinTitle,
|
||||
/** The Level2 jaro winkler. */
|
||||
Level2JaroWinkler,
|
||||
/** The Level2 jaro winkler for title matching */
|
||||
Level2JaroWinklerTitle,
|
||||
/** The Level2 levenstein. */
|
||||
Level2Levenstein,
|
||||
/** The Sub string levenstein. */
|
||||
SubStringLevenstein,
|
||||
/** The Year levenstein. */
|
||||
YearLevenstein,
|
||||
/** The Sorted jaro winkler. */
|
||||
SortedJaroWinkler,
|
||||
/** The Sorted level2 jaro winkler. */
|
||||
SortedLevel2JaroWinkler,
|
||||
/** Compares two urls */
|
||||
urlMatcher,
|
||||
/** Exact match algo. */
|
||||
ExactMatch,
|
||||
/**
|
||||
* Returns 0 for equal strings, 1 for different strings.
|
||||
*/
|
||||
MustBeDifferent,
|
||||
/** Always return 1.0 as distance. */
|
||||
AlwaysMatch,
|
||||
/** Person distance */
|
||||
PersonCoAuthorSurnamesDistance,
|
||||
PersonCoAnchorsDistance,
|
||||
PersonDistance,
|
||||
/** The Null. */
|
||||
Null
|
||||
}
|
|
@ -1,28 +0,0 @@
|
|||
package eu.dnetlib.pace.config;
|
||||
|
||||
/**
|
||||
* The Enum Cond.
|
||||
*/
|
||||
public enum Cond {
|
||||
|
||||
/** The year match. */
|
||||
yearMatch,
|
||||
/** The title version match. */
|
||||
titleVersionMatch,
|
||||
/** The size match. */
|
||||
sizeMatch,
|
||||
/**
|
||||
* Returns true if the field values are different
|
||||
*/
|
||||
mustBeDifferent,
|
||||
/** The Exact match. */
|
||||
exactMatch,
|
||||
/**
|
||||
* The Exact match ignore case.
|
||||
*/
|
||||
exactMatchIgnoreCase,
|
||||
/** The Exact match specialized to recognize DOI values. */
|
||||
doiExactMatch,
|
||||
/** The Exact match that checks if pid type and value are the same */
|
||||
pidMatch
|
||||
}
|
|
@ -2,6 +2,8 @@ package eu.dnetlib.pace.distance;
|
|||
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two
|
||||
* objects.
|
||||
|
@ -11,5 +13,9 @@ public interface DistanceAlgo {
|
|||
public abstract double distance(Field a, Field b);
|
||||
|
||||
public double getWeight();
|
||||
public Map<String, Number> getParams();
|
||||
|
||||
public void setWeight(double w);
|
||||
public void setParams(Map<String, Number> params);
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import java.lang.annotation.ElementType;
|
||||
import java.lang.annotation.Retention;
|
||||
import java.lang.annotation.RetentionPolicy;
|
||||
import java.lang.annotation.Target;
|
||||
|
||||
/**
 * Type-level marker carrying the name under which a distance algorithm class is registered.
 * The annotation is retained at runtime so that it can be read reflectively
 * (DistanceResolver uses {@link #value()} as the lookup key).
 */
@Target(ElementType.TYPE)
@Retention(RetentionPolicy.RUNTIME)
public @interface DistanceClass {

    /**
     * @return the name identifying the annotated class
     */
    String value();
}
|
|
@ -0,0 +1,24 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.reflections.Reflections;
|
||||
|
||||
public class DistanceResolver implements Serializable {
|
||||
private final Map<String, Class<DistanceAlgo>> functionMap;
|
||||
|
||||
public DistanceResolver() {
|
||||
|
||||
this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(DistanceClass.class).stream()
|
||||
.filter(DistanceAlgo.class::isAssignableFrom)
|
||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
|
||||
}
|
||||
|
||||
public DistanceAlgo resolve(String algo) throws IllegalAccessException, InstantiationException {
|
||||
|
||||
return functionMap.get(algo).newInstance();
|
||||
}
|
||||
}
|
|
@ -25,7 +25,7 @@ public class DistanceScorer {
|
|||
}
|
||||
|
||||
public ScoreResult distance(final Document a, final Document b) {
|
||||
final ScoreResult sr = new ScoreResult();
|
||||
final ScoreResult sr = new ScoreResult(); //to keep track of the result of the comparison
|
||||
|
||||
sr.setStrictConditions(verify(a, b, config.strictConditions()));
|
||||
sr.setConditions(verify(a, b, config.conditions()));
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
package eu.dnetlib.pace.distance;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
|
@ -24,6 +26,27 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
|
|||
/** The weight. */
|
||||
protected double weight = 0.0;
|
||||
|
||||
private Map<String, Number> params;
|
||||
|
||||
protected SecondStringDistanceAlgo(){
|
||||
}
|
||||
|
||||
protected SecondStringDistanceAlgo(Map<String, Number> params){
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public void setWeight(double w){
|
||||
this.weight = w;
|
||||
}
|
||||
|
||||
public Map<String, Number> getParams(){
|
||||
return this.params;
|
||||
}
|
||||
|
||||
public void setParams(Map<String, Number> params){
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new second string distance algo.
|
||||
*
|
||||
|
@ -37,6 +60,10 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
|
|||
this.weight = weight;
|
||||
}
|
||||
|
||||
protected SecondStringDistanceAlgo(final AbstractStringDistance ssalgo){
|
||||
this.ssalgo = ssalgo;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize.
|
||||
*
|
||||
|
|
|
@ -1,10 +1,22 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@DistanceClass("AlwaysMatch")
|
||||
public class AlwaysMatch extends SecondStringDistanceAlgo {
|
||||
|
||||
public AlwaysMatch(){
|
||||
super();
|
||||
}
|
||||
|
||||
public AlwaysMatch(final Map<String, Number> params){
|
||||
super(params);
|
||||
}
|
||||
|
||||
public AlwaysMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
|
|
@ -1,10 +1,22 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
@DistanceClass("ExactMatch")
|
||||
public class ExactMatch extends SecondStringDistanceAlgo {
|
||||
|
||||
public ExactMatch(){
|
||||
super();
|
||||
}
|
||||
|
||||
public ExactMatch(Map<String, Number> params){
|
||||
super(params);
|
||||
}
|
||||
|
||||
public ExactMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
|
|
@ -1,11 +1,23 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||
@DistanceClass("JaroWinkler")
|
||||
public class JaroWinkler extends SecondStringDistanceAlgo {
|
||||
|
||||
public JaroWinkler(){
|
||||
super();
|
||||
}
|
||||
|
||||
public JaroWinkler(Map<String, Number> params){
|
||||
super(params);
|
||||
}
|
||||
|
||||
public JaroWinkler(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
|
|
@ -1,11 +1,23 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||
@DistanceClass("JaroWinklerTitle")
|
||||
public class JaroWinklerTitle extends SecondStringDistanceAlgo {
|
||||
|
||||
public JaroWinklerTitle(){
|
||||
super();
|
||||
}
|
||||
|
||||
public JaroWinklerTitle(Map<String, Number> params){
|
||||
super(params);
|
||||
}
|
||||
|
||||
public JaroWinklerTitle(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
@DistanceClass("Level2JaroWinkler")
|
||||
public class Level2JaroWinkler extends SecondStringDistanceAlgo {
|
||||
|
||||
public Level2JaroWinkler(double w) {
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
@DistanceClass("Level2JaroWinklerTitle")
|
||||
public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo {
|
||||
|
||||
public Level2JaroWinklerTitle(final double w) {
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
@DistanceClass("Level2Levenstein")
|
||||
public class Level2Levenstein extends SecondStringDistanceAlgo {
|
||||
|
||||
public Level2Levenstein(double w) {
|
||||
|
|
|
@ -1,10 +1,16 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
@DistanceClass("Levenstein")
|
||||
public class Levenstein extends SecondStringDistanceAlgo {
|
||||
|
||||
public Levenstein(){
|
||||
super(new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
public Levenstein(double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
|
|
@ -1,10 +1,16 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
@DistanceClass("LevensteinTitle")
|
||||
public class LevensteinTitle extends SecondStringDistanceAlgo {
|
||||
|
||||
/**
 * Creates the algorithm through the superclass constructor that takes only a string distance,
 * backed by a new {@code com.wcohen.ss.Levenstein} instance. No weight is supplied here.
 */
public LevensteinTitle(){
|
||||
super(new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
/**
 * Creates the algorithm with an explicit weight, backed by a new
 * {@code com.wcohen.ss.Levenstein} instance.
 *
 * @param w value forwarded as the first argument of the superclass constructor
 */
public LevensteinTitle(final double w) {
|
||||
super(w, new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
|
||||
@DistanceClass("MustBeDifferent")
|
||||
public class MustBeDifferent extends SecondStringDistanceAlgo {
|
||||
|
||||
public MustBeDifferent(final double weight) {
|
||||
|
|
|
@ -1,12 +1,16 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import eu.dnetlib.pace.distance.DistanceAlgo;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Not all fields of a document need to participate in the distance measure. We model those fields as having a
|
||||
* NullDistanceAlgo.
|
||||
*/
|
||||
@DistanceClass("Null")
|
||||
public class NullDistanceAlgo implements DistanceAlgo {
|
||||
|
||||
@Override
|
||||
|
@ -19,4 +23,16 @@ public class NullDistanceAlgo implements DistanceAlgo {
|
|||
return 0.0;
|
||||
}
|
||||
|
||||
/**
 * Does nothing: the supplied weight is ignored.
 *
 * @param w ignored
 */
@Override
|
||||
public void setWeight(double w){
|
||||
}
|
||||
|
||||
/**
 * Returns no parameters.
 *
 * @return always {@code null}, so callers have to null-check the result
 */
// NOTE(review): returning an empty map would spare callers the null check - confirm no caller relies on null
@Override
|
||||
public Map<String, Number> getParams() {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Does nothing: the supplied parameters are ignored and not stored.
 *
 * @param params ignored
 */
@Override
|
||||
public void setParams(Map<String, Number> params) {
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
|
||||
/**
|
||||
* The Class SortedJaroWinkler.
|
||||
*/
|
||||
@DistanceClass("SortedJaroWinkler")
|
||||
public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
|
||||
|
||||
/**
|
||||
|
|
|
@ -1,10 +1,12 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
|
||||
/**
|
||||
* The Class SortedLevel2JaroWinkler.
|
||||
*/
|
||||
@DistanceClass("Sorted2JaroWinkler")
|
||||
public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
|
||||
|
||||
/**
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
@ -8,14 +9,21 @@ import com.wcohen.ss.AbstractStringDistance;
|
|||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* The Class SubStringLevenstein.
|
||||
*/
|
||||
@DistanceClass("SubStringLevenstein")
|
||||
public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
||||
|
||||
/** The limit. */
|
||||
protected int limit;
|
||||
|
||||
/**
 * Creates the algorithm through the superclass constructor that takes only a string distance,
 * backed by a new {@code com.wcohen.ss.Levenstein} instance.
 *
 * The {@code limit} field is not assigned here, so it keeps its default of 0 until
 * {@code setParams} provides a value.
 */
public SubStringLevenstein() {
|
||||
super(new com.wcohen.ss.Levenstein());
|
||||
}
|
||||
|
||||
/**
|
||||
* Instantiates a new sub string levenstein.
|
||||
*
|
||||
|
@ -87,4 +95,9 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
|||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||
}
|
||||
|
||||
public void setParams(Map<String, Number> params){
|
||||
this.limit = params.get("limit").intValue(); //necessary because this class needs also the limit
|
||||
super.setParams(params);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
package eu.dnetlib.pace.distance.algo;
|
||||
|
||||
import eu.dnetlib.pace.distance.DistanceClass;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
|
@ -7,15 +8,24 @@ import java.net.MalformedURLException;
|
|||
import java.net.URL;
|
||||
import java.util.Map;
|
||||
|
||||
@DistanceClass("urlMatcher")
|
||||
public class UrlMatcher extends Levenstein {
|
||||
|
||||
private Map<String, Number> params;
|
||||
|
||||
/**
 * Creates the matcher through the superclass no-argument constructor.
 *
 * The {@code params} field of this class is not assigned here, so it stays {@code null}
 * until {@code setParams} is called.
 */
public UrlMatcher(){
|
||||
super();
|
||||
}
|
||||
|
||||
/**
 * Creates the matcher with an explicit weight and parameter map.
 *
 * @param weight value forwarded to the superclass constructor
 * @param params parameter map stored by reference (not copied) in this class's {@code params} field
 */
public UrlMatcher(double weight, Map<String, Number> params) {
|
||||
super(weight);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
/**
 * Replaces the parameter map held in this class's private {@code params} field.
 *
 * @param params parameter map stored by reference (not copied)
 */
// NOTE(review): unlike SubStringLevenstein.setParams this does not call super.setParams(params),
// so the superclass never sees the map - confirm that this is intended
public void setParams(Map<String, Number> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(Field a, Field b) {
|
||||
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
package eu.dnetlib.pace.distance.eval;
|
||||
|
||||
import eu.dnetlib.pace.config.Cond;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
|
||||
/**
|
||||
|
@ -8,7 +7,7 @@ import eu.dnetlib.pace.model.Field;
|
|||
*/
|
||||
public class ConditionEval {
|
||||
|
||||
private Cond cond;
|
||||
private String cond;
|
||||
|
||||
private Field a;
|
||||
|
||||
|
@ -16,7 +15,7 @@ public class ConditionEval {
|
|||
|
||||
private int result;
|
||||
|
||||
public ConditionEval(final Cond cond, final Field a, final Field b, final int result) {
|
||||
public ConditionEval(final String cond, final Field a, final Field b, final int result) {
|
||||
this.cond = cond;
|
||||
this.a = a;
|
||||
this.b = b;
|
||||
|
@ -47,11 +46,11 @@ public class ConditionEval {
|
|||
this.result = result;
|
||||
}
|
||||
|
||||
public Cond getCond() {
|
||||
public String getCond() {
|
||||
return cond;
|
||||
}
|
||||
|
||||
public void setCond(final Cond cond) {
|
||||
public void setCond(final String cond) {
|
||||
this.cond = cond;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
package eu.dnetlib.pace.distance.eval;
|
||||
|
||||
import eu.dnetlib.pace.config.Algo;
|
||||
import eu.dnetlib.pace.model.Field;
|
||||
import eu.dnetlib.pace.model.FieldDef;
|
||||
|
||||
|
|
|
@ -2,10 +2,12 @@ package eu.dnetlib.pace.distance.eval;
|
|||
|
||||
import com.google.gson.GsonBuilder;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
|
||||
* Created by claudio on 09/03/16.
|
||||
*/
|
||||
public class ScoreResult {
|
||||
public class ScoreResult implements Serializable {
|
||||
|
||||
private ConditionEvalMap strictConditions;
|
||||
|
||||
|
@ -49,8 +51,12 @@ public class ScoreResult {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
final GsonBuilder b = new GsonBuilder();
|
||||
b.serializeSpecialFloatingPointValues();
|
||||
return b.setPrettyPrinting().create().toJson(this);
|
||||
//TODO cannot print: why?
|
||||
// final GsonBuilder b = new GsonBuilder()
|
||||
// .serializeSpecialFloatingPointValues()
|
||||
// .serializeNulls();
|
||||
//
|
||||
// return b.setPrettyPrinting().create().toJson(this);
|
||||
return "{}";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -9,49 +10,36 @@ import eu.dnetlib.pace.clustering.*;
|
|||
|
||||
public class ClusteringDef implements Serializable {
|
||||
|
||||
private Clustering name;
|
||||
private String name;
|
||||
|
||||
private List<String> fields;
|
||||
|
||||
private Map<String, Integer> params;
|
||||
|
||||
private ClusteringResolver clusteringResolver = new ClusteringResolver();
|
||||
|
||||
public ClusteringDef() {}
|
||||
|
||||
public Clustering getName() {
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(final Clustering name) {
|
||||
public void setName(final String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public ClusteringFunction getClusteringFunction() {
|
||||
switch (getName()) {
|
||||
case acronyms:
|
||||
return new Acronyms(getParams());
|
||||
case ngrams:
|
||||
return new Ngrams(getParams());
|
||||
case ngrampairs:
|
||||
return new NgramPairs(getParams());
|
||||
case sortedngrampairs:
|
||||
return new SortedNgramPairs(getParams());
|
||||
case suffixprefix:
|
||||
return new SuffixPrefix(getParams());
|
||||
case spacetrimmingfieldvalue:
|
||||
return new SpaceTrimmingFieldValue(getParams());
|
||||
case immutablefieldvalue:
|
||||
return new ImmutableFieldValue(getParams());
|
||||
case personhash:
|
||||
return new PersonHash(getParams());
|
||||
case personclustering:
|
||||
return new PersonClustering(getParams());
|
||||
case lowercase:
|
||||
return new LowercaseClustering(getParams());
|
||||
case urlclustering:
|
||||
return new UrlClustering(getParams());
|
||||
default:
|
||||
|
||||
try {
|
||||
ClusteringFunction clusteringFunction = clusteringResolver.resolve(getName());
|
||||
clusteringFunction.setParams(params);
|
||||
return clusteringFunction;
|
||||
|
||||
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
|
||||
e.printStackTrace();
|
||||
return new RandomClusteringFunction(getParams());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public List<String> getFields() {
|
||||
|
|
|
@ -5,44 +5,36 @@ import java.util.List;
|
|||
|
||||
import com.google.gson.Gson;
|
||||
import eu.dnetlib.pace.condition.*;
|
||||
import eu.dnetlib.pace.config.Cond;
|
||||
|
||||
public class CondDef implements Serializable {
|
||||
|
||||
private Cond name;
|
||||
private String name;
|
||||
|
||||
private List<String> fields;
|
||||
|
||||
private ConditionResolver conditionResolver = new ConditionResolver();
|
||||
|
||||
public CondDef() {}
|
||||
|
||||
public ConditionAlgo getConditionAlgo(final List<FieldDef> fields) {
|
||||
switch (getName()) {
|
||||
case yearMatch:
|
||||
return new YearMatch(getName(), fields);
|
||||
case titleVersionMatch:
|
||||
return new TitleVersionMatch(getName(), fields);
|
||||
case sizeMatch:
|
||||
return new SizeMatch(getName(), fields);
|
||||
case exactMatch:
|
||||
return new ExactMatch(getName(), fields);
|
||||
case mustBeDifferent:
|
||||
return new MustBeDifferent(getName(), fields);
|
||||
case exactMatchIgnoreCase:
|
||||
return new ExactMatchIgnoreCase(getName(), fields);
|
||||
case doiExactMatch:
|
||||
return new DoiExactMatch(getName(), fields);
|
||||
case pidMatch:
|
||||
return new PidMatch(getName(), fields);
|
||||
default:
|
||||
|
||||
try {
|
||||
ConditionAlgo conditionAlgo = conditionResolver.resolve(getName());
|
||||
conditionAlgo.setFields(fields);
|
||||
conditionAlgo.setCond(getName());
|
||||
return conditionAlgo;
|
||||
} catch (IllegalAccessException | InstantiationException e) {
|
||||
e.printStackTrace();
|
||||
return new AlwaysTrueCondition(getName(), fields);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public Cond getName() {
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(final Cond name) {
|
||||
public void setName(final String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.gson.Gson;
|
||||
import eu.dnetlib.pace.config.Algo;
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
import eu.dnetlib.pace.distance.*;
|
||||
import eu.dnetlib.pace.distance.algo.*;
|
||||
|
@ -19,7 +19,7 @@ public class FieldDef implements Serializable {
|
|||
|
||||
public final static String PATH_SEPARATOR = "/";
|
||||
|
||||
private Algo algo;
|
||||
private String algo;
|
||||
|
||||
private String name;
|
||||
|
||||
|
@ -37,6 +37,8 @@ public class FieldDef implements Serializable {
|
|||
|
||||
private Map<String, Number> params;
|
||||
|
||||
private DistanceResolver distanceResolver = new DistanceResolver();
|
||||
|
||||
public FieldDef() {}
|
||||
|
||||
// def apply(s: String): Field[A]
|
||||
|
@ -66,40 +68,22 @@ public class FieldDef implements Serializable {
|
|||
}
|
||||
|
||||
public DistanceAlgo getDistanceAlgo() {
|
||||
switch (getAlgo()) {
|
||||
case JaroWinkler:
|
||||
return new JaroWinkler(getWeight());
|
||||
case JaroWinklerTitle:
|
||||
return new JaroWinklerTitle(getWeight());
|
||||
case Level2JaroWinkler:
|
||||
return new Level2JaroWinkler(getWeight());
|
||||
case Level2JaroWinklerTitle:
|
||||
return new Level2JaroWinklerTitle(getWeight());
|
||||
case Level2Levenstein:
|
||||
return new Level2Levenstein(getWeight());
|
||||
case Levenstein:
|
||||
return new Levenstein(getWeight());
|
||||
case LevensteinTitle:
|
||||
return new LevensteinTitle(getWeight());
|
||||
case SubStringLevenstein:
|
||||
return new SubStringLevenstein(getWeight(), getLimit());
|
||||
case SortedJaroWinkler:
|
||||
return new SortedJaroWinkler(getWeight());
|
||||
case SortedLevel2JaroWinkler:
|
||||
return new SortedLevel2JaroWinkler(getWeight());
|
||||
case urlMatcher:
|
||||
return new UrlMatcher(getWeight(), getParams());
|
||||
case ExactMatch:
|
||||
return new ExactMatch(getWeight());
|
||||
case MustBeDifferent:
|
||||
return new MustBeDifferent(getWeight());
|
||||
case AlwaysMatch:
|
||||
return new AlwaysMatch(getWeight());
|
||||
case Null:
|
||||
return new NullDistanceAlgo();
|
||||
default:
|
||||
|
||||
try {
|
||||
if (params == null) {
|
||||
params = new HashMap<>();
|
||||
}
|
||||
params.put("limit", getLimit());
|
||||
params.put("weight", getWeight());
|
||||
DistanceAlgo distanceAlgo = distanceResolver.resolve(getAlgo());
|
||||
distanceAlgo.setParams(params);
|
||||
distanceAlgo.setWeight(getWeight());
|
||||
return distanceAlgo;
|
||||
} catch (IllegalAccessException | InstantiationException e) {
|
||||
e.printStackTrace();
|
||||
return new NullDistanceAlgo();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public boolean isIgnoreMissing() {
|
||||
|
@ -135,11 +119,11 @@ public class FieldDef implements Serializable {
|
|||
this.weight = weight;
|
||||
}
|
||||
|
||||
public Algo getAlgo() {
|
||||
public String getAlgo() {
|
||||
return algo;
|
||||
}
|
||||
|
||||
public void setAlgo(final Algo algo) {
|
||||
public void setAlgo(final String algo) {
|
||||
this.algo = algo;
|
||||
}
|
||||
|
||||
|
|
|
@ -23,7 +23,6 @@ public class BlockProcessor {
|
|||
|
||||
private DedupConfig dedupConf;
|
||||
|
||||
|
||||
public static void constructAccumulator( final DedupConfig dedupConf) {
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1"));
|
||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
|
||||
|
|
Binary file not shown.
Loading…
Reference in New Issue