diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml
index fc4acf110..1a41a7416 100644
--- a/dnet-pace-core/pom.xml
+++ b/dnet-pace-core/pom.xml
@@ -55,6 +55,12 @@
junit
test
+		<dependency>
+			<groupId>org.reflections</groupId>
+			<artifactId>reflections</artifactId>
+			<version>0.9.10</version>
+		</dependency>
+
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
index 6f29f22ce..2885994d9 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java
@@ -18,7 +18,13 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
public AbstractClusteringFunction(final Map params) {
this.params = params;
}
-
+
+ public AbstractClusteringFunction(){}
+
+ public void setParams(Map params){
+ this.params = params;
+ }
+
protected abstract Collection doApply(String s);
@Override
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
index 1897e6a87..09d2ce0e6 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java
@@ -7,12 +7,17 @@ import java.util.StringTokenizer;
import com.google.common.collect.Sets;
+@ClusteringClass("acronyms")
public class Acronyms extends AbstractClusteringFunction {
public Acronyms(Map params) {
super(params);
}
+ public Acronyms(){
+ super();
+ }
+
@Override
protected Collection doApply(String s) {
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java
index 4ecedc4c0..0167d2fd0 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java
@@ -22,9 +22,6 @@ public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class);
-
-
-
public static Collection filterAndCombine(final MapDocument a, final Config conf) {
final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, conf.blacklists());
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java
deleted file mode 100644
index 72575409f..000000000
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java
+++ /dev/null
@@ -1,5 +0,0 @@
-package eu.dnetlib.pace.clustering;
-
-public enum Clustering {
- acronyms, ngrams, ngrampairs, sortedngrampairs, suffixprefix, spacetrimmingfieldvalue, immutablefieldvalue, personhash, personclustering, lowercase, urlclustering
-}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java
new file mode 100644
index 000000000..e67767171
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java
@@ -0,0 +1,20 @@
+package eu.dnetlib.pace.clustering;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+/**
+ * Marks a class as a clustering function and gives it the name it is looked up by.
+ *
+ * <p>{@link ClusteringResolver} indexes annotated {@link ClusteringFunction} implementations by
+ * {@link #value()}; the annotation is retained at runtime because it is read reflectively.
+ */
+@Target(ElementType.TYPE)
+@Retention(RetentionPolicy.RUNTIME)
+public @interface ClusteringClass {
+
+    /** Name under which the annotated class is registered, e.g. {@code "ngrams"}. */
+    String value();
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
index 4fe1b596e..040b92824 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java
@@ -12,4 +12,5 @@ public interface ClusteringFunction {
public Map getParams();
+ public void setParams(Map params);
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringResolver.java
new file mode 100644
index 000000000..06a364c22
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringResolver.java
@@ -0,0 +1,58 @@
+package eu.dnetlib.pace.clustering;
+
+import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.reflections.Reflections;
+
+/**
+ * Maps clustering function names to their implementation classes.
+ *
+ * <p>On construction the {@code eu.dnetlib} package is scanned for types annotated with
+ * {@link ClusteringClass} that implement {@link ClusteringFunction}; each is registered under
+ * the annotation's value.
+ */
+public class ClusteringResolver implements Serializable {
+
+    /** Implementation classes keyed by the value of their {@link ClusteringClass} annotation. */
+    private final Map<String, Class<? extends ClusteringFunction>> functionMap;
+
+    /**
+     * Scans the classpath and builds the name-to-class index.
+     *
+     * @throws IllegalStateException if two annotated classes declare the same name
+     */
+    public ClusteringResolver() {
+        this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ClusteringClass.class).stream()
+                // Defensive: skip any returned type that does not carry the annotation itself.
+                .filter(cl -> cl.isAnnotationPresent(ClusteringClass.class))
+                .filter(ClusteringFunction.class::isAssignableFrom)
+                .collect(Collectors.toMap(
+                        cl -> cl.getAnnotation(ClusteringClass.class).value(),
+                        cl -> cl.asSubclass(ClusteringFunction.class)));
+    }
+
+    /**
+     * Creates a new instance of the clustering function registered under the given name.
+     * The instance is built with the no-argument constructor, so its parameters still have
+     * to be supplied via {@link ClusteringFunction#setParams(Map)}.
+     *
+     * @param clusteringFunction the name declared in the {@link ClusteringClass} annotation
+     * @return a fresh instance of the matching implementation
+     * @throws IllegalArgumentException if no implementation is registered under that name
+     * @throws NoSuchMethodException if the implementation has no no-argument constructor
+     * @throws InvocationTargetException if the constructor itself throws
+     * @throws InstantiationException if the implementation is abstract
+     * @throws IllegalAccessException if the constructor is not accessible
+     */
+    public ClusteringFunction resolve(String clusteringFunction) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
+        final Class<? extends ClusteringFunction> clazz = functionMap.get(clusteringFunction);
+        if (clazz == null) {
+            throw new IllegalArgumentException("Unknown clustering function: " + clusteringFunction);
+        }
+        // Class.newInstance() rethrows checked constructor exceptions undeclared; go through the constructor instead.
+        return clazz.getDeclaredConstructor().newInstance();
+    }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
index 988476ddd..2d5b67ab5 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java
@@ -6,12 +6,17 @@ import java.util.Map;
import com.google.common.collect.Lists;
+@ClusteringClass("immutablefieldvalue")
public class ImmutableFieldValue extends AbstractClusteringFunction {
public ImmutableFieldValue(final Map params) {
super(params);
}
+ public ImmutableFieldValue() {
+ super();
+ }
+
@Override
protected Collection doApply(final String s) {
final List res = Lists.newArrayList();
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
index 6d00992bd..50d73cff9 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java
@@ -9,12 +9,17 @@ import com.google.common.collect.Sets;
import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang.StringUtils;
+@ClusteringClass("lowercase")
public class LowercaseClustering extends AbstractClusteringFunction {
public LowercaseClustering(final Map params) {
super(params);
}
+ public LowercaseClustering(){
+ super();
+ }
+
@Override
public Collection apply(List fields) {
Collection c = Sets.newLinkedHashSet();
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
index 3cffa4d54..6c96ca214 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java
@@ -6,8 +6,13 @@ import java.util.Map;
import com.google.common.collect.Lists;
+@ClusteringClass("ngrampairs")
public class NgramPairs extends Ngrams {
+ public NgramPairs() {
+ super();
+ }
+
public NgramPairs(Map params) {
super(params);
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
index aaba9afbf..49ce40495 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java
@@ -5,12 +5,17 @@ import java.util.LinkedHashSet;
import java.util.Map;
import java.util.StringTokenizer;
+@ClusteringClass("ngrams")
public class Ngrams extends AbstractClusteringFunction {
public Ngrams(Map params) {
super(params);
}
-
+
+ public Ngrams() {
+ super();
+ }
+
@Override
protected Collection doApply(String s) {
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
index d71707721..42300797e 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java
@@ -1,5 +1,6 @@
package eu.dnetlib.pace.clustering;
+import java.io.Serializable;
import java.util.Collection;
import java.util.List;
import java.util.Map;
@@ -18,6 +19,7 @@ import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.gt.Author;
import eu.dnetlib.pace.model.gt.GTAuthor;
+@ClusteringClass("personclustering")
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
private Map params;
@@ -28,6 +30,10 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
this.params = params;
}
+ public void setParams(Map params){
+ this.params = params;
+ }
+
@Override
public Collection apply(final List fields) {
final Set hashes = Sets.newHashSet();
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
index 42d9d5bab..b0e57e905 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java
@@ -8,6 +8,7 @@ import com.google.common.collect.Lists;
import eu.dnetlib.pace.model.Person;
+@ClusteringClass("personhash")
public class PersonHash extends AbstractClusteringFunction {
private boolean DEFAULT_AGGRESSIVE = false;
@@ -16,6 +17,10 @@ public class PersonHash extends AbstractClusteringFunction {
super(params);
}
+ public PersonHash(){
+ super();
+ }
+
@Override
protected Collection doApply(final String s) {
final List res = Lists.newArrayList();
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
index f012aacab..893abe8e0 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java
@@ -9,6 +9,10 @@ public class RandomClusteringFunction extends AbstractClusteringFunction {
super(params);
}
+ public RandomClusteringFunction(){
+ super();
+ }
+
@Override
protected Collection doApply(String s) {
// TODO Auto-generated method stub
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
index 56e660438..9ce12fc30 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java
@@ -9,12 +9,17 @@ import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
+@ClusteringClass("sortedngrampairs")
public class SortedNgramPairs extends NgramPairs {
public SortedNgramPairs(Map params) {
super(params);
}
+ public SortedNgramPairs(){
+ super();
+ }
+
@Override
protected Collection doApply(String s) {
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
index 19a51d4ca..8e1fdf3e7 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java
@@ -9,12 +9,17 @@ import org.apache.commons.lang.StringUtils;
import com.google.common.collect.Lists;
+@ClusteringClass("spacetrimmingfieldvalue")
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
public SpaceTrimmingFieldValue(final Map params) {
super(params);
}
+ public SpaceTrimmingFieldValue(){
+ super();
+ }
+
@Override
protected Collection doApply(final String s) {
final List res = Lists.newArrayList();
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
index 3ed336af4..25520d97c 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java
@@ -6,12 +6,17 @@ import java.util.Set;
import com.google.common.collect.Sets;
+@ClusteringClass("suffixprefix")
public class SuffixPrefix extends AbstractClusteringFunction {
public SuffixPrefix(Map params) {
super(params);
}
+ public SuffixPrefix(){
+ super();
+ }
+
@Override
protected Collection doApply(String s) {
return suffixPrefix(s, param("len"), param("max"));
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
index 196281444..4c0c33fd1 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java
@@ -11,6 +11,7 @@ import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
+@ClusteringClass("urlclustering")
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
protected Map params;
@@ -19,6 +20,14 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu
this.params = params;
}
+ public UrlClustering() {
+ super();
+ }
+
+ public void setParams(Map params){
+ this.params = params;
+ }
+
@Override
public Collection apply(List fields) {
return fields.stream()
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java
index bbfac97b9..adc68254e 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java
@@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
import java.util.List;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
-import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
import eu.dnetlib.pace.model.Document;
@@ -17,15 +16,25 @@ import eu.dnetlib.pace.model.FieldDef;
*/
public abstract class AbstractCondition extends AbstractPaceFunctions implements ConditionAlgo {
- protected Cond cond;
+ protected String cond;
protected List fields;
- public AbstractCondition(final Cond cond, final List fields) {
+ public AbstractCondition(final String cond, final List fields) {
this.cond = cond;
this.fields = fields;
}
+ public AbstractCondition(){}
+
+ public void setCond(String cond){
+ this.cond = cond;
+ }
+
+ public void setFields(List fields){
+ this.fields = fields;
+ }
+
protected abstract ConditionEval verify(FieldDef fd, Field a, Field b);
@Override
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java
index f9ff2b60b..a67567eeb 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java
@@ -1,7 +1,6 @@
package eu.dnetlib.pace.condition;
import java.util.List;
-import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
@@ -11,12 +10,16 @@ import eu.dnetlib.pace.model.FieldDef;
*
* @author claudio
*/
+@ConditionClass("alwaystruecondition")
public class AlwaysTrueCondition extends AbstractCondition {
- public AlwaysTrueCondition(final Cond cond, final List fields) {
+ public AlwaysTrueCondition(final String cond, final List fields) {
super(cond, fields);
}
+ public AlwaysTrueCondition(){
+ super();
+ }
@Override
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
return new ConditionEval(cond, a, b, 1);
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java
index ceb7c73cc..1293c7d95 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java
@@ -1,9 +1,9 @@
package eu.dnetlib.pace.condition;
-import java.util.Map;
-
+import java.util.List;
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
import eu.dnetlib.pace.model.Document;
+import eu.dnetlib.pace.model.FieldDef;
/**
* Allows to express general conditions to be satisfied or not between two Documents.
@@ -24,4 +24,7 @@ public interface ConditionAlgo {
*/
public abstract ConditionEvalMap verify(Document a, Document b);
+ public void setFields(List fields);
+ public void setCond(String name);
+
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionClass.java
new file mode 100644
index 000000000..155360c58
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionClass.java
@@ -0,0 +1,20 @@
+package eu.dnetlib.pace.condition;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+/**
+ * Marks a class as a condition and gives it the name it is looked up by.
+ *
+ * <p>{@link ConditionResolver} indexes annotated {@link ConditionAlgo} implementations by
+ * {@link #value()}; the annotation is retained at runtime because it is read reflectively.
+ */
+@Target(ElementType.TYPE)
+@Retention(RetentionPolicy.RUNTIME)
+public @interface ConditionClass {
+
+    /** Name under which the annotated class is registered, e.g. {@code "exactMatch"}. */
+    String value();
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionResolver.java
new file mode 100644
index 000000000..58a30ddda
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionResolver.java
@@ -0,0 +1,54 @@
+package eu.dnetlib.pace.condition;
+
+import java.io.Serializable;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.reflections.Reflections;
+
+/**
+ * Maps condition names to their implementation classes.
+ *
+ * <p>On construction the {@code eu.dnetlib} package is scanned for types annotated with
+ * {@link ConditionClass} that implement {@link ConditionAlgo}; each is registered under the
+ * annotation's value.
+ */
+public class ConditionResolver implements Serializable {
+
+    /** Implementation classes keyed by the value of their {@link ConditionClass} annotation. */
+    private final Map<String, Class<? extends ConditionAlgo>> functionMap;
+
+    /**
+     * Scans the classpath and builds the name-to-class index.
+     *
+     * @throws IllegalStateException if two annotated classes declare the same name
+     */
+    public ConditionResolver() {
+        this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ConditionClass.class).stream()
+                // Defensive: skip any returned type that does not carry the annotation itself.
+                .filter(cl -> cl.isAnnotationPresent(ConditionClass.class))
+                .filter(ConditionAlgo.class::isAssignableFrom)
+                .collect(Collectors.toMap(
+                        cl -> cl.getAnnotation(ConditionClass.class).value(),
+                        cl -> cl.asSubclass(ConditionAlgo.class)));
+    }
+
+    /**
+     * Creates a new instance of the condition registered under the given name.
+     * The instance is built with the no-argument constructor, so the condition name and the
+     * fields still have to be supplied via {@code setCond} and {@code setFields}.
+     *
+     * @param name the name declared in the {@link ConditionClass} annotation
+     * @return a fresh instance of the matching implementation
+     * @throws IllegalArgumentException if no implementation is registered under that name
+     * @throws InstantiationException if the implementation is abstract or lacks a no-argument constructor
+     * @throws IllegalAccessException if the class or its no-argument constructor is not accessible
+     */
+    public ConditionAlgo resolve(String name) throws IllegalAccessException, InstantiationException {
+        final Class<? extends ConditionAlgo> clazz = functionMap.get(name);
+        if (clazz == null) {
+            throw new IllegalArgumentException("Unknown condition: " + name);
+        }
+        return clazz.newInstance();
+    }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java
index 25b1a01cd..dfdc5cd23 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java
@@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
import java.util.List;
-import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
@@ -11,11 +10,12 @@ import eu.dnetlib.pace.model.FieldDef;
*
* @author claudio
*/
+@ConditionClass("doiExactMatch")
public class DoiExactMatch extends ExactMatchIgnoreCase {
public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
- public DoiExactMatch(final Cond cond, final List fields) {
+ public DoiExactMatch(final String cond, final List fields) {
super(cond, fields);
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java
index 4f0f37188..f4ba8de42 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java
@@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
import java.util.List;
-import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
@@ -13,12 +12,17 @@ import org.apache.commons.lang.StringUtils;
*
* @author claudio
*/
+@ConditionClass("exactMatch")
public class ExactMatch extends AbstractCondition {
- public ExactMatch(final Cond cond, final List fields) {
+ public ExactMatch(final String cond, final List fields) {
super(cond, fields);
}
+ public ExactMatch(){
+ super();
+ }
+
@Override
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java
index 8baad5b24..7741f3858 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java
@@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
import java.util.List;
-import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
@@ -12,9 +11,10 @@ import eu.dnetlib.pace.model.FieldDef;
*
* @author claudio
*/
+@ConditionClass("exactMatchIgnoreCase")
public class ExactMatchIgnoreCase extends AbstractCondition {
- public ExactMatchIgnoreCase(final Cond cond, final List fields) {
+ public ExactMatchIgnoreCase(final String cond, final List fields) {
super(cond, fields);
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java
index bc99a4cc5..f2b3bdba4 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java
@@ -3,7 +3,6 @@ package eu.dnetlib.pace.condition;
import java.util.List;
import com.google.common.collect.Iterables;
-import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
@@ -13,6 +12,7 @@ import eu.dnetlib.pace.model.FieldDef;
*
* @author claudio
*/
+@ConditionClass("mustBeDifferent")
public class MustBeDifferent extends AbstractCondition {
/**
@@ -20,7 +20,7 @@ public class MustBeDifferent extends AbstractCondition {
*
* @param fields the fields
*/
- public MustBeDifferent(final Cond cond, final List fields) {
+ public MustBeDifferent(final String cond, final List fields) {
super(cond, fields);
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java
index a20ab9528..53aa2deb9 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java
@@ -6,7 +6,6 @@ import java.util.Set;
import java.util.stream.Collectors;
import com.google.common.collect.Sets;
-import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
@@ -20,11 +19,12 @@ import org.apache.commons.logging.LogFactory;
*
* @author claudio
*/
+@ConditionClass("pidMatch")
public class PidMatch extends AbstractCondition {
private static final Log log = LogFactory.getLog(PidMatch.class);
- public PidMatch(final Cond cond, final List fields) {
+ public PidMatch(final String cond, final List fields) {
super(cond, fields);
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java
index ae6e94037..afd0a8eaa 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java
@@ -4,7 +4,6 @@ import java.util.List;
import com.google.common.collect.Iterables;
-import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
@@ -14,6 +13,7 @@ import eu.dnetlib.pace.model.FieldDef;
*
* @author claudio
*/
+@ConditionClass("sizeMatch")
public class SizeMatch extends AbstractCondition {
/**
@@ -22,7 +22,7 @@ public class SizeMatch extends AbstractCondition {
* @param fields
* the fields
*/
- public SizeMatch(final Cond cond, final List fields) {
+ public SizeMatch(final String cond, final List fields) {
super(cond, fields);
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java
index 41a617aa5..4b94a0459 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java
@@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
import java.util.List;
-import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
@@ -13,9 +12,10 @@ import eu.dnetlib.pace.model.FieldDef;
* @author claudio
*
*/
+@ConditionClass("titleVersionMatch")
public class TitleVersionMatch extends AbstractCondition {
- public TitleVersionMatch(final Cond cond, final List fields) {
+ public TitleVersionMatch(final String cond, final List fields) {
super(cond, fields);
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java
index 89718426c..54d0ba89f 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java
@@ -1,8 +1,8 @@
package eu.dnetlib.pace.condition;
+import java.time.Year;
import java.util.List;
-import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval;
import org.apache.commons.lang.StringUtils;
@@ -14,14 +14,17 @@ import eu.dnetlib.pace.model.FieldDef;
*
* @author claudio
*/
+@ConditionClass("yearMatch")
public class YearMatch extends AbstractCondition {
private int limit = 4;
- public YearMatch(final Cond cond, final List fields) {
+ public YearMatch(final String cond, final List fields) {
super(cond, fields);
}
+ public YearMatch(){}
+
// @Override
// public boolean verify(final Document a, final Document b) {
// boolean res = true;
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Algo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Algo.java
deleted file mode 100644
index cb2e434b6..000000000
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Algo.java
+++ /dev/null
@@ -1,46 +0,0 @@
-package eu.dnetlib.pace.config;
-
-/**
- * Enumerates the distance Algos.
- */
-public enum Algo {
-
- /** The Jaro winkler. */
- JaroWinkler,
- /** The Jaro winkler title. */
- JaroWinklerTitle,
- /** The Levenstein. */
- Levenstein,
- /** The Levenstein distance for title matching */
- LevensteinTitle,
- /** The Level2 jaro winkler. */
- Level2JaroWinkler,
- /** The Level2 jaro winkler for title matching */
- Level2JaroWinklerTitle,
- /** The Level2 levenstein. */
- Level2Levenstein,
- /** The Sub string levenstein. */
- SubStringLevenstein,
- /** The Year levenstein. */
- YearLevenstein,
- /** The Sorted jaro winkler. */
- SortedJaroWinkler,
- /** The Sorted level2 jaro winkler. */
- SortedLevel2JaroWinkler,
- /** Compares two urls */
- urlMatcher,
- /** Exact match algo. */
- ExactMatch,
- /**
- * Returns 0 for equal strings, 1 for different strings.
- */
- MustBeDifferent,
- /** Always return 1.0 as distance. */
- AlwaysMatch,
- /** Person distance */
- PersonCoAuthorSurnamesDistance,
- PersonCoAnchorsDistance,
- PersonDistance,
- /** The Null. */
- Null
-}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Cond.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Cond.java
deleted file mode 100644
index b287fdd76..000000000
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Cond.java
+++ /dev/null
@@ -1,28 +0,0 @@
-package eu.dnetlib.pace.config;
-
-/**
- * The Enum Cond.
- */
-public enum Cond {
-
- /** The year match. */
- yearMatch,
- /** The title version match. */
- titleVersionMatch,
- /** The size match. */
- sizeMatch,
- /**
- * Returns true if the field values are different
- */
- mustBeDifferent,
- /** The Exact match. */
- exactMatch,
- /**
- * The Exact match ignore case.
- */
- exactMatchIgnoreCase,
- /** The Exact match specialized to recognize DOI values. */
- doiExactMatch,
- /** The Exact match that checks if pid type and value are the same */
- pidMatch
-}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java
index e9d009548..c2749c503 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java
@@ -2,6 +2,8 @@ package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.model.Field;
+import java.util.Map;
+
/**
* Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two
* objects.
@@ -11,5 +13,9 @@ public interface DistanceAlgo {
public abstract double distance(Field a, Field b);
public double getWeight();
+ public Map getParams();
+
+ public void setWeight(double w);
+ public void setParams(Map params);
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceClass.java
new file mode 100644
index 000000000..9479fdb04
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceClass.java
@@ -0,0 +1,20 @@
+package eu.dnetlib.pace.distance;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+/**
+ * Marks a class as a distance algorithm and gives it the name it is looked up by.
+ *
+ * <p>{@link DistanceResolver} indexes annotated {@link DistanceAlgo} implementations by
+ * {@link #value()}; the annotation is retained at runtime because it is read reflectively.
+ */
+@Target(ElementType.TYPE)
+@Retention(RetentionPolicy.RUNTIME)
+public @interface DistanceClass {
+
+    /** Name under which the annotated class is registered. */
+    String value();
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceResolver.java
new file mode 100644
index 000000000..09377605e
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceResolver.java
@@ -0,0 +1,55 @@
+package eu.dnetlib.pace.distance;
+
+import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.reflections.Reflections;
+
+/**
+ * Maps distance algorithm names to their implementation classes.
+ *
+ * <p>On construction the {@code eu.dnetlib} package is scanned for types annotated with
+ * {@link DistanceClass} that implement {@link DistanceAlgo}; each is registered under the
+ * annotation's value.
+ */
+public class DistanceResolver implements Serializable {
+
+    /** Implementation classes keyed by the value of their {@link DistanceClass} annotation. */
+    private final Map<String, Class<? extends DistanceAlgo>> functionMap;
+
+    /**
+     * Scans the classpath and builds the name-to-class index.
+     *
+     * @throws IllegalStateException if two annotated classes declare the same name
+     */
+    public DistanceResolver() {
+        this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(DistanceClass.class).stream()
+                // Defensive: skip any returned type that does not carry the annotation itself.
+                .filter(cl -> cl.isAnnotationPresent(DistanceClass.class))
+                .filter(DistanceAlgo.class::isAssignableFrom)
+                .collect(Collectors.toMap(
+                        cl -> cl.getAnnotation(DistanceClass.class).value(),
+                        cl -> cl.asSubclass(DistanceAlgo.class)));
+    }
+
+    /**
+     * Creates a new instance of the distance algorithm registered under the given name.
+     * The instance is built with the no-argument constructor, so weight and parameters still have
+     * to be supplied via {@link DistanceAlgo#setWeight(double)} and {@link DistanceAlgo#setParams(Map)}.
+     *
+     * @param algo the name declared in the {@link DistanceClass} annotation
+     * @return a fresh instance of the matching implementation
+     * @throws IllegalArgumentException if no implementation is registered under that name
+     * @throws InstantiationException if the implementation is abstract or lacks a no-argument constructor
+     * @throws IllegalAccessException if the class or its no-argument constructor is not accessible
+     */
+    public DistanceAlgo resolve(String algo) throws IllegalAccessException, InstantiationException {
+        final Class<? extends DistanceAlgo> clazz = functionMap.get(algo);
+        if (clazz == null) {
+            throw new IllegalArgumentException("Unknown distance algorithm: " + algo);
+        }
+        return clazz.newInstance();
+    }
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java
index 0cbb6f4f6..467a19c86 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java
@@ -25,7 +25,7 @@ public class DistanceScorer {
}
public ScoreResult distance(final Document a, final Document b) {
- final ScoreResult sr = new ScoreResult();
+ final ScoreResult sr = new ScoreResult(); //to keep track of the result of the comparison
sr.setStrictConditions(verify(a, b, config.strictConditions()));
sr.setConditions(verify(a, b, config.conditions()));
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java
index 83296048d..785c00bc3 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java
@@ -1,6 +1,8 @@
package eu.dnetlib.pace.distance;
+import java.io.Serializable;
import java.util.List;
+import java.util.Map;
import com.wcohen.ss.AbstractStringDistance;
@@ -24,6 +26,27 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
/** The weight. */
protected double weight = 0.0;
+ private Map params;
+
+ protected SecondStringDistanceAlgo(){
+ }
+
+ protected SecondStringDistanceAlgo(Map params){
+ this.params = params;
+ }
+
+ public void setWeight(double w){
+ this.weight = w;
+ }
+
+ public Map getParams(){
+ return this.params;
+ }
+
+ public void setParams(Map params){
+ this.params = params;
+ }
+
/**
* Instantiates a new second string distance algo.
*
@@ -37,6 +60,10 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
this.weight = weight;
}
+ protected SecondStringDistanceAlgo(final AbstractStringDistance ssalgo){
+ this.ssalgo = ssalgo;
+ }
+
/**
* Normalize.
*
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java
index 904498202..7039f05a6 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java
@@ -1,10 +1,22 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+import java.util.Map;
+
+@DistanceClass("AlwaysMatch")
public class AlwaysMatch extends SecondStringDistanceAlgo {
+ public AlwaysMatch(){ // no-arg constructor: the one DistanceResolver invokes through newInstance()
+ super(new com.wcohen.ss.JaroWinkler()); // supply the same scorer as the weight constructor instead of leaving it unset
+ }
+
+ public AlwaysMatch(final Map params){
+ super(params);
+ }
+
public AlwaysMatch(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java
index ef95c024a..2e714c4af 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java
@@ -1,10 +1,22 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+import java.util.Map;
+
+@DistanceClass("ExactMatch")
public class ExactMatch extends SecondStringDistanceAlgo {
+ public ExactMatch(){ // no-arg constructor: the one DistanceResolver invokes through newInstance()
+ super(new com.wcohen.ss.JaroWinkler()); // supply the same scorer as the weight constructor instead of leaving it unset
+ }
+
+ public ExactMatch(Map params){
+ super(params);
+ }
+
public ExactMatch(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java
index 87f6c4e6a..ea1e0798e 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java
@@ -1,11 +1,23 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+import java.util.Map;
+
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
+@DistanceClass("JaroWinkler")
public class JaroWinkler extends SecondStringDistanceAlgo {
+ public JaroWinkler(){ // no-arg constructor: the one DistanceResolver invokes through newInstance()
+ super(new com.wcohen.ss.JaroWinkler()); // supply the same scorer as the weight constructor instead of leaving it unset
+ }
+
+ public JaroWinkler(Map params){
+ super(params);
+ }
+
public JaroWinkler(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java
index 1419a072b..b37c88d63 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java
@@ -1,11 +1,23 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+import java.util.Map;
+
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
+@DistanceClass("JaroWinklerTitle")
public class JaroWinklerTitle extends SecondStringDistanceAlgo {
+ public JaroWinklerTitle(){ // no-arg constructor: the one DistanceResolver invokes through newInstance()
+ super(new com.wcohen.ss.JaroWinkler()); // supply the same scorer as the weight constructor instead of leaving it unset
+ }
+
+ public JaroWinklerTitle(Map params){
+ super(params);
+ }
+
public JaroWinklerTitle(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java
index 3ad1cfaaf..a2afc3872 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java
@@ -1,8 +1,10 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+@DistanceClass("Level2JaroWinkler")
public class Level2JaroWinkler extends SecondStringDistanceAlgo {
public Level2JaroWinkler(double w) {
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java
index a1c347256..272e53035 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java
@@ -1,8 +1,10 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+@DistanceClass("Level2JaroWinklerTitle")
public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo {
public Level2JaroWinklerTitle(final double w) {
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java
index 7a2b0295f..1e955bd4a 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java
@@ -1,8 +1,10 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+@DistanceClass("Level2Levenstein")
public class Level2Levenstein extends SecondStringDistanceAlgo {
public Level2Levenstein(double w) {
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java
index 9dfce83e5..2e014b67e 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java
@@ -1,10 +1,16 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+@DistanceClass("Levenstein")
public class Levenstein extends SecondStringDistanceAlgo {
+ public Levenstein(){
+ super(new com.wcohen.ss.Levenstein());
+ }
+
public Levenstein(double w) {
super(w, new com.wcohen.ss.Levenstein());
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java
index 281de31c3..c66f972c3 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java
@@ -1,10 +1,16 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+@DistanceClass("LevensteinTitle")
public class LevensteinTitle extends SecondStringDistanceAlgo {
+ public LevensteinTitle(){
+ super(new com.wcohen.ss.Levenstein());
+ }
+
public LevensteinTitle(final double w) {
super(w, new com.wcohen.ss.Levenstein());
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java
index 1177ed528..0acb82ca4 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java
@@ -1,8 +1,10 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
+@DistanceClass("MustBeDifferent")
public class MustBeDifferent extends SecondStringDistanceAlgo {
public MustBeDifferent(final double weight) {
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java
index 8070a0010..ef798cbad 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java
@@ -1,12 +1,16 @@
package eu.dnetlib.pace.distance.algo;
import eu.dnetlib.pace.distance.DistanceAlgo;
+import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.Field;
+import java.util.Map;
+
/**
* Not all fields of a document need to partecipate in the distance measure. We model those fields as having a
* NullDistanceAlgo.
*/
+@DistanceClass("Null")
public class NullDistanceAlgo implements DistanceAlgo {
@Override
@@ -19,4 +23,16 @@ public class NullDistanceAlgo implements DistanceAlgo {
return 0.0;
}
+ @Override
+ public void setWeight(double w){ // no-op: the supplied weight is not stored
+ }
+
+ @Override
+ public Map getParams() { // callers must be prepared for a null result
+ return null; // always null rather than a parameter map
+ }
+
+ @Override
+ public void setParams(Map params) { // no-op: the supplied params are not stored
+ }
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java
index d83420750..5f716001d 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java
@@ -1,10 +1,12 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.DistanceClass;
/**
* The Class SortedJaroWinkler.
*/
+@DistanceClass("SortedJaroWinkler")
public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
/**
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java
index 43ac190e3..493bbef7c 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java
@@ -1,10 +1,12 @@
package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
+import eu.dnetlib.pace.distance.DistanceClass;
/**
* The Class SortedJaroWinkler.
*/
+@DistanceClass("Sorted2JaroWinkler")
public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
/**
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java
index 1fa358b0f..9fee7df5d 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java
@@ -1,5 +1,6 @@
package eu.dnetlib.pace.distance.algo;
+import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import org.apache.commons.lang.StringUtils;
@@ -8,14 +9,21 @@ import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
+import java.util.Map;
+
/**
* The Class SubStringLevenstein.
*/
+@DistanceClass("SubStringLevenstein")
public class SubStringLevenstein extends SecondStringDistanceAlgo {
/** The limit. */
protected int limit;
+ public SubStringLevenstein() {
+ super(new com.wcohen.ss.Levenstein());
+ }
+
/**
* Instantiates a new sub string levenstein.
*
@@ -87,4 +95,9 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
}
+ public void setParams(Map params){ // stores the params in the superclass and refreshes this class's own limit field
+ this.limit = params.get("limit").intValue(); // this class keeps a separate limit field, so copy the "limit" entry into it (assumes the entry is present — TODO confirm callers always set it)
+ super.setParams(params);
+ }
+
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java
index 46a438ebe..2aa7ca1ce 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java
@@ -1,5 +1,6 @@
package eu.dnetlib.pace.distance.algo;
+import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang.StringUtils;
@@ -7,15 +8,24 @@ import java.net.MalformedURLException;
import java.net.URL;
import java.util.Map;
+@DistanceClass("urlMatcher")
public class UrlMatcher extends Levenstein {
private Map params;
+ public UrlMatcher(){
+ super();
+ }
+
public UrlMatcher(double weight, Map params) {
super(weight);
this.params = params;
}
+ public void setParams(Map params) { // NOTE(review): assigns only this class's private params field and does not call super.setParams — confirm the inherited getParams() is not expected to return these values
+ this.params = params;
+ }
+
@Override
public double distance(Field a, Field b) {
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java
index 49e526f42..d3fcee59a 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java
@@ -1,6 +1,5 @@
package eu.dnetlib.pace.distance.eval;
-import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.model.Field;
/**
@@ -8,7 +7,7 @@ import eu.dnetlib.pace.model.Field;
*/
public class ConditionEval {
- private Cond cond;
+ private String cond;
private Field a;
@@ -16,7 +15,7 @@ public class ConditionEval {
private int result;
- public ConditionEval(final Cond cond, final Field a, final Field b, final int result) {
+ public ConditionEval(final String cond, final Field a, final Field b, final int result) {
this.cond = cond;
this.a = a;
this.b = b;
@@ -47,11 +46,11 @@ public class ConditionEval {
this.result = result;
}
- public Cond getCond() {
+ public String getCond() {
return cond;
}
- public void setCond(final Cond cond) {
+ public void setCond(final String cond) {
this.cond = cond;
}
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java
index a943d4cea..ef3c4da22 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java
@@ -1,6 +1,5 @@
package eu.dnetlib.pace.distance.eval;
-import eu.dnetlib.pace.config.Algo;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef;
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java
index b5cdad730..61d5c9327 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java
@@ -2,10 +2,12 @@ package eu.dnetlib.pace.distance.eval;
import com.google.gson.GsonBuilder;
+import java.io.Serializable;
+
/**
* Created by claudio on 09/03/16.
*/
-public class ScoreResult {
+public class ScoreResult implements Serializable {
private ConditionEvalMap strictConditions;
@@ -49,8 +51,12 @@ public class ScoreResult {
@Override
public String toString() {
- final GsonBuilder b = new GsonBuilder();
- b.serializeSpecialFloatingPointValues();
- return b.setPrettyPrinting().create().toJson(this);
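+ //TODO: the Gson-based rendering below is disabled because it could not print this object; find out why and restore it (an empty JSON object is returned meanwhile)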
+// final GsonBuilder b = new GsonBuilder()
+// .serializeSpecialFloatingPointValues()
+// .serializeNulls();
+//
+// return b.setPrettyPrinting().create().toJson(this);
+ return "{}";
}
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java
index db7092b0d..7e09d446e 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java
@@ -1,6 +1,7 @@
package eu.dnetlib.pace.model;
import java.io.Serializable;
+import java.lang.reflect.InvocationTargetException;
import java.util.List;
import java.util.Map;
@@ -9,49 +10,36 @@ import eu.dnetlib.pace.clustering.*;
public class ClusteringDef implements Serializable {
- private Clustering name;
+ private String name;
private List fields;
private Map params;
+ private ClusteringResolver clusteringResolver = new ClusteringResolver();
+
public ClusteringDef() {}
- public Clustering getName() {
+ public String getName() {
return name;
}
- public void setName(final Clustering name) {
+ public void setName(final String name) {
this.name = name;
}
public ClusteringFunction getClusteringFunction() {
- switch (getName()) {
- case acronyms:
- return new Acronyms(getParams());
- case ngrams:
- return new Ngrams(getParams());
- case ngrampairs:
- return new NgramPairs(getParams());
- case sortedngrampairs:
- return new SortedNgramPairs(getParams());
- case suffixprefix:
- return new SuffixPrefix(getParams());
- case spacetrimmingfieldvalue:
- return new SpaceTrimmingFieldValue(getParams());
- case immutablefieldvalue:
- return new ImmutableFieldValue(getParams());
- case personhash:
- return new PersonHash(getParams());
- case personclustering:
- return new PersonClustering(getParams());
- case lowercase:
- return new LowercaseClustering(getParams());
- case urlclustering:
- return new UrlClustering(getParams());
- default:
+ // Resolve the clustering function by its configured name and hand it this definition's params.
+ try {
+ ClusteringFunction clusteringFunction = clusteringResolver.resolve(getName());
+ clusteringFunction.setParams(params); // the resolver returns an unconfigured instance, so params are applied here
+ return clusteringFunction;
+
+ } catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) { // reflective instantiation failed
+ e.printStackTrace(); // trace the failure, then fall through to the RandomClusteringFunction fallback below
return new RandomClusteringFunction(getParams());
}
+
}
public List getFields() {
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java
index 747f6c103..14de69a37 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java
@@ -5,44 +5,36 @@ import java.util.List;
import com.google.gson.Gson;
import eu.dnetlib.pace.condition.*;
-import eu.dnetlib.pace.config.Cond;
public class CondDef implements Serializable {
- private Cond name;
+ private String name;
private List fields;
+ private ConditionResolver conditionResolver = new ConditionResolver();
+
public CondDef() {}
public ConditionAlgo getConditionAlgo(final List fields) {
- switch (getName()) {
- case yearMatch:
- return new YearMatch(getName(), fields);
- case titleVersionMatch:
- return new TitleVersionMatch(getName(), fields);
- case sizeMatch:
- return new SizeMatch(getName(), fields);
- case exactMatch:
- return new ExactMatch(getName(), fields);
- case mustBeDifferent:
- return new MustBeDifferent(getName(), fields);
- case exactMatchIgnoreCase:
- return new ExactMatchIgnoreCase(getName(), fields);
- case doiExactMatch:
- return new DoiExactMatch(getName(), fields);
- case pidMatch:
- return new PidMatch(getName(), fields);
- default:
+ // Resolve the condition by its configured name, then give it the fields to check and its own name.
+ try {
+ ConditionAlgo conditionAlgo = conditionResolver.resolve(getName());
+ conditionAlgo.setFields(fields); // the resolver returns an unconfigured instance, so fields and name are applied here
+ conditionAlgo.setCond(getName());
+ return conditionAlgo;
+ } catch (IllegalAccessException | InstantiationException e) { // reflective instantiation failed
+ e.printStackTrace(); // trace the failure, then fall through to the AlwaysTrueCondition fallback below
return new AlwaysTrueCondition(getName(), fields);
}
+
}
- public Cond getName() {
+ public String getName() {
return name;
}
- public void setName(final Cond name) {
+ public void setName(final String name) {
this.name = name;
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
index 5445053bd..3f4619dcf 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
@@ -1,13 +1,13 @@
package eu.dnetlib.pace.model;
import java.io.Serializable;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.gson.Gson;
-import eu.dnetlib.pace.config.Algo;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.distance.*;
import eu.dnetlib.pace.distance.algo.*;
@@ -19,7 +19,7 @@ public class FieldDef implements Serializable {
public final static String PATH_SEPARATOR = "/";
- private Algo algo;
+ private String algo;
private String name;
@@ -37,6 +37,8 @@ public class FieldDef implements Serializable {
private Map params;
+ private DistanceResolver distanceResolver = new DistanceResolver();
+
public FieldDef() {}
// def apply(s: String): Field[A]
@@ -66,40 +68,22 @@ public class FieldDef implements Serializable {
}
public DistanceAlgo getDistanceAlgo() {
- switch (getAlgo()) {
- case JaroWinkler:
- return new JaroWinkler(getWeight());
- case JaroWinklerTitle:
- return new JaroWinklerTitle(getWeight());
- case Level2JaroWinkler:
- return new Level2JaroWinkler(getWeight());
- case Level2JaroWinklerTitle:
- return new Level2JaroWinklerTitle(getWeight());
- case Level2Levenstein:
- return new Level2Levenstein(getWeight());
- case Levenstein:
- return new Levenstein(getWeight());
- case LevensteinTitle:
- return new LevensteinTitle(getWeight());
- case SubStringLevenstein:
- return new SubStringLevenstein(getWeight(), getLimit());
- case SortedJaroWinkler:
- return new SortedJaroWinkler(getWeight());
- case SortedLevel2JaroWinkler:
- return new SortedLevel2JaroWinkler(getWeight());
- case urlMatcher:
- return new UrlMatcher(getWeight(), getParams());
- case ExactMatch:
- return new ExactMatch(getWeight());
- case MustBeDifferent:
- return new MustBeDifferent(getWeight());
- case AlwaysMatch:
- return new AlwaysMatch(getWeight());
- case Null:
- return new NullDistanceAlgo();
- default:
+ // Resolve the algorithm by its configured name, then pass it the params (with limit and weight added) and the weight.
+ try {
+ if (params == null) { // create the map on first use so the puts below have a target
+ params = new HashMap<>();
+ }
+ params.put("limit", getLimit()); // NOTE(review): writes into this FieldDef's own params map on every call
+ params.put("weight", getWeight());
+ DistanceAlgo distanceAlgo = distanceResolver.resolve(getAlgo()); // NOTE(review): an unregistered name makes resolve() throw an unchecked exception, which the catch below does not handle
+ distanceAlgo.setParams(params);
+ distanceAlgo.setWeight(getWeight());
+ return distanceAlgo;
+ } catch (IllegalAccessException | InstantiationException e) { // reflective instantiation failed
+ e.printStackTrace(); // trace the failure, then fall through to the NullDistanceAlgo fallback below
return new NullDistanceAlgo();
}
+
}
public boolean isIgnoreMissing() {
@@ -135,11 +119,11 @@ public class FieldDef implements Serializable {
this.weight = weight;
}
- public Algo getAlgo() {
+ public String getAlgo() {
return algo;
}
- public void setAlgo(final Algo algo) {
+ public void setAlgo(final String algo) {
this.algo = algo;
}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java
index a9979f5ed..3e6cd6ea5 100644
--- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java
@@ -23,7 +23,6 @@ public class BlockProcessor {
private DedupConfig dedupConf;
-
public static void constructAccumulator( final DedupConfig dedupConf) {
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1"));
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/.DS_Store b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/.DS_Store
new file mode 100644
index 000000000..12db7cf64
Binary files /dev/null and b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/.DS_Store differ