From 1cbbc3f15ae81a87396f2e6807eb0274c0c82cf0 Mon Sep 17 00:00:00 2001 From: Michele De Bonis Date: Wed, 24 Oct 2018 12:09:41 +0200 Subject: [PATCH] update in the discovery of clustering, conditions and distance functions (annotated with custom annotations) --- dnet-pace-core/pom.xml | 6 ++ .../AbstractClusteringFunction.java | 8 ++- .../eu/dnetlib/pace/clustering/Acronyms.java | 5 ++ .../BlacklistAwareClusteringCombiner.java | 3 - .../dnetlib/pace/clustering/Clustering.java | 5 -- .../pace/clustering/ClusteringClass.java | 13 ++++ .../pace/clustering/ClusteringFunction.java | 1 + .../pace/clustering/ClusteringResolver.java | 24 ++++++++ .../pace/clustering/ImmutableFieldValue.java | 5 ++ .../pace/clustering/LowercaseClustering.java | 5 ++ .../dnetlib/pace/clustering/NgramPairs.java | 5 ++ .../eu/dnetlib/pace/clustering/Ngrams.java | 7 ++- .../pace/clustering/PersonClustering.java | 6 ++ .../dnetlib/pace/clustering/PersonHash.java | 5 ++ .../clustering/RandomClusteringFunction.java | 4 ++ .../pace/clustering/SortedNgramPairs.java | 5 ++ .../clustering/SpaceTrimmingFieldValue.java | 5 ++ .../dnetlib/pace/clustering/SuffixPrefix.java | 5 ++ .../pace/clustering/UrlClustering.java | 9 +++ .../pace/condition/AbstractCondition.java | 15 ++++- .../pace/condition/AlwaysTrueCondition.java | 7 ++- .../dnetlib/pace/condition/ConditionAlgo.java | 7 ++- .../pace/condition/ConditionClass.java | 13 ++++ .../pace/condition/ConditionResolver.java | 22 +++++++ .../dnetlib/pace/condition/DoiExactMatch.java | 4 +- .../eu/dnetlib/pace/condition/ExactMatch.java | 8 ++- .../pace/condition/ExactMatchIgnoreCase.java | 4 +- .../pace/condition/MustBeDifferent.java | 4 +- .../eu/dnetlib/pace/condition/PidMatch.java | 4 +- .../eu/dnetlib/pace/condition/SizeMatch.java | 4 +- .../pace/condition/TitleVersionMatch.java | 4 +- .../eu/dnetlib/pace/condition/YearMatch.java | 7 ++- .../java/eu/dnetlib/pace/config/Algo.java | 46 -------------- .../java/eu/dnetlib/pace/config/Cond.java | 28 --------- .../dnetlib/pace/distance/DistanceAlgo.java | 6 ++ .../dnetlib/pace/distance/DistanceClass.java | 13 ++++ .../pace/distance/DistanceResolver.java | 24 ++++++++ .../dnetlib/pace/distance/DistanceScorer.java | 2 +- .../distance/SecondStringDistanceAlgo.java | 27 +++++++++ .../pace/distance/algo/AlwaysMatch.java | 12 ++++ .../pace/distance/algo/ExactMatch.java | 12 ++++ .../pace/distance/algo/JaroWinkler.java | 12 ++++ .../pace/distance/algo/JaroWinklerTitle.java | 12 ++++ .../pace/distance/algo/Level2JaroWinkler.java | 2 + .../distance/algo/Level2JaroWinklerTitle.java | 2 + .../pace/distance/algo/Level2Levenstein.java | 2 + .../pace/distance/algo/Levenstein.java | 6 ++ .../pace/distance/algo/LevensteinTitle.java | 6 ++ .../pace/distance/algo/MustBeDifferent.java | 2 + .../pace/distance/algo/NullDistanceAlgo.java | 16 +++++ .../pace/distance/algo/SortedJaroWinkler.java | 2 + .../algo/SortedLevel2JaroWinkler.java | 2 + .../distance/algo/SubStringLevenstein.java | 13 ++++ .../pace/distance/algo/UrlMatcher.java | 10 ++++ .../pace/distance/eval/ConditionEval.java | 9 ++- .../pace/distance/eval/DistanceEval.java | 1 - .../pace/distance/eval/ScoreResult.java | 14 +++-- .../eu/dnetlib/pace/model/ClusteringDef.java | 42 +++++-------- .../java/eu/dnetlib/pace/model/CondDef.java | 36 +++++------ .../java/eu/dnetlib/pace/model/FieldDef.java | 56 +++++++----------- .../eu/dnetlib/pace/util/BlockProcessor.java | 1 - .../main/resources/eu/dnetlib/pace/.DS_Store | Bin 0 -> 6148 bytes 62 files changed, 441 insertions(+), 204 deletions(-) delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringResolver.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionClass.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionResolver.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Algo.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Cond.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceClass.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceResolver.java create mode 100644 dnet-pace-core/src/main/resources/eu/dnetlib/pace/.DS_Store diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index fc4acf110..1a41a7416 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -55,6 +55,12 @@ junit test + + org.reflections + reflections + 0.9.10 + + diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java index 6f29f22ce..2885994d9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java @@ -18,7 +18,13 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i public AbstractClusteringFunction(final Map params) { this.params = params; } - + + public AbstractClusteringFunction(){} + + public void setParams(Map params){ + this.params = params; + } + protected abstract Collection doApply(String s); @Override diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java index 1897e6a87..09d2ce0e6 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java @@ -7,12 +7,17 @@ import java.util.StringTokenizer; import com.google.common.collect.Sets; +@ClusteringClass("acronyms") public class Acronyms extends AbstractClusteringFunction { public Acronyms(Map params) { super(params); } + public Acronyms(){ + super(); + } + @Override protected Collection doApply(String s) { return extractAcronyms(s, param("max"), param("minLen"), param("maxLen")); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java index 4ecedc4c0..0167d2fd0 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java @@ -22,9 +22,6 @@ public class BlacklistAwareClusteringCombiner extends ClusteringCombiner { private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class); - - - public static Collection filterAndCombine(final MapDocument a, final Config conf) { final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, conf.blacklists()); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java deleted file mode 100644 index 72575409f..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java +++ /dev/null @@ -1,5 +0,0 @@ -package eu.dnetlib.pace.clustering; - -public enum Clustering { - acronyms, ngrams, ngrampairs, sortedngrampairs, suffixprefix, spacetrimmingfieldvalue, immutablefieldvalue, personhash, personclustering, lowercase, urlclustering -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java new file mode 100644 index 000000000..e67767171 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java @@ -0,0 +1,13 @@ +package eu.dnetlib.pace.clustering; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface ClusteringClass { + + public String value(); +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java index 4fe1b596e..040b92824 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java @@ -12,4 +12,5 @@ public interface ClusteringFunction { public Map getParams(); + public void setParams(Map params); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringResolver.java new file mode 100644 index 000000000..06a364c22 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringResolver.java @@ -0,0 +1,24 @@ +package eu.dnetlib.pace.clustering; + +import java.io.Serializable; +import java.lang.reflect.InvocationTargetException; +import java.util.Map; +import java.util.stream.Collectors; + +import org.reflections.Reflections; + +public class ClusteringResolver implements Serializable { + private final Map> functionMap; + + public ClusteringResolver() { + + this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ClusteringClass.class).stream() + .filter(ClusteringFunction.class::isAssignableFrom) + .collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class)cl)); + } + + public ClusteringFunction resolve(String clusteringFunction) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException { + + return functionMap.get(clusteringFunction).newInstance(); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java index 988476ddd..2d5b67ab5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java @@ -6,12 +6,17 @@ import java.util.Map; import com.google.common.collect.Lists; +@ClusteringClass("immutablefieldvalue") public class ImmutableFieldValue extends AbstractClusteringFunction { public ImmutableFieldValue(final Map params) { super(params); } + public ImmutableFieldValue() { + super(); + } + @Override protected Collection doApply(final String s) { final List res = Lists.newArrayList(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java index 6d00992bd..50d73cff9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java @@ -9,12 +9,17 @@ import com.google.common.collect.Sets; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; +@ClusteringClass("lowercase") public class LowercaseClustering extends AbstractClusteringFunction { public LowercaseClustering(final Map params) { super(params); } + public LowercaseClustering(){ + super(); + } + @Override public Collection apply(List fields) { Collection c = Sets.newLinkedHashSet(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java index 3cffa4d54..6c96ca214 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java @@ -6,8 +6,13 @@ import java.util.Map; import com.google.common.collect.Lists; +@ClusteringClass("ngrampairs") public class NgramPairs extends Ngrams { + public NgramPairs() { + super(); + } + public NgramPairs(Map params) { super(params); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java index aaba9afbf..49ce40495 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java @@ -5,12 +5,17 @@ import java.util.LinkedHashSet; import java.util.Map; import java.util.StringTokenizer; +@ClusteringClass("ngrams") public class Ngrams extends AbstractClusteringFunction { public Ngrams(Map params) { super(params); } - + + public Ngrams() { + super(); + } + @Override protected Collection doApply(String s) { return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen")); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java index d71707721..42300797e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.clustering; +import java.io.Serializable; import java.util.Collection; import java.util.List; import java.util.Map; @@ -18,6 +19,7 @@ import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.gt.Author; import eu.dnetlib.pace.model.gt.GTAuthor; +@ClusteringClass("personclustering") public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction { private Map params; @@ -28,6 +30,10 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin this.params = params; } + public void setParams(Map params){ + this.params = params; + } + @Override public Collection apply(final List fields) { final Set hashes = Sets.newHashSet(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java index 42d9d5bab..b0e57e905 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java @@ -8,6 +8,7 @@ import com.google.common.collect.Lists; import eu.dnetlib.pace.model.Person; +@ClusteringClass("personhash") public class PersonHash extends AbstractClusteringFunction { private boolean DEFAULT_AGGRESSIVE = false; @@ -16,6 +17,10 @@ public class PersonHash extends AbstractClusteringFunction { super(params); } + public PersonHash(){ + super(); + } + @Override protected Collection doApply(final String s) { final List res = Lists.newArrayList(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java index f012aacab..893abe8e0 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java @@ -9,6 +9,10 @@ public class RandomClusteringFunction extends AbstractClusteringFunction { super(params); } + public RandomClusteringFunction(){ + super(); + } + @Override protected Collection doApply(String s) { // TODO Auto-generated method stub diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java index 56e660438..9ce12fc30 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java @@ -9,12 +9,17 @@ import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Lists; +@ClusteringClass("sortedngrampairs") public class SortedNgramPairs extends NgramPairs { public SortedNgramPairs(Map params) { super(params); } + public SortedNgramPairs(){ + super(); + } + @Override protected Collection doApply(String s) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java index 19a51d4ca..8e1fdf3e7 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java @@ -9,12 +9,17 @@ import org.apache.commons.lang.StringUtils; import com.google.common.collect.Lists; +@ClusteringClass("spacetrimmingfieldvalue") public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { public SpaceTrimmingFieldValue(final Map params) { super(params); } + public SpaceTrimmingFieldValue(){ + super(); + } + @Override protected Collection doApply(final String s) { final List res = Lists.newArrayList(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java index 3ed336af4..25520d97c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java @@ -6,12 +6,17 @@ import java.util.Set; import com.google.common.collect.Sets; +@ClusteringClass("suffixprefix") public class SuffixPrefix extends AbstractClusteringFunction { public SuffixPrefix(Map params) { super(params); } + public SuffixPrefix(){ + super(); + } + @Override protected Collection doApply(String s) { return suffixPrefix(s, param("len"), param("max")); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java index 196281444..4c0c33fd1 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java @@ -11,6 +11,7 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; +@ClusteringClass("urlclustering") public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction { protected Map params; @@ -19,6 +20,14 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu this.params = params; } + public UrlClustering() { + super(); + } + + public void setParams(Map params){ + this.params = params; + } + @Override public Collection apply(List fields) { return fields.stream() diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java index bbfac97b9..adc68254e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java @@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition; import java.util.List; import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.distance.eval.ConditionEvalMap; import eu.dnetlib.pace.model.Document; @@ -17,15 +16,25 @@ import eu.dnetlib.pace.model.FieldDef; */ public abstract class AbstractCondition extends AbstractPaceFunctions implements ConditionAlgo { - protected Cond cond; + protected String cond; protected List fields; - public AbstractCondition(final Cond cond, final List fields) { + public AbstractCondition(final String cond, final List fields) { this.cond = cond; this.fields = fields; } + public AbstractCondition(){} + + public void setCond(String cond){ + this.cond = cond; + } + + public void setFields(List fields){ + this.fields = fields; + } + protected abstract ConditionEval verify(FieldDef fd, Field a, Field b); @Override diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java index f9ff2b60b..a67567eeb 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java @@ -1,7 +1,6 @@ package eu.dnetlib.pace.condition; import java.util.List; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -11,12 +10,16 @@ import eu.dnetlib.pace.model.FieldDef; * * @author claudio */ +@ConditionClass("alwaystruecondition") public class AlwaysTrueCondition extends AbstractCondition { - public AlwaysTrueCondition(final Cond cond, final List fields) { + public AlwaysTrueCondition(final String cond, final List fields) { super(cond, fields); } + public AlwaysTrueCondition(){ + super(); + } @Override protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { return new ConditionEval(cond, a, b, 1); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java index ceb7c73cc..1293c7d95 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java @@ -1,9 +1,9 @@ package eu.dnetlib.pace.condition; -import java.util.Map; - +import java.util.List; import eu.dnetlib.pace.distance.eval.ConditionEvalMap; import eu.dnetlib.pace.model.Document; +import eu.dnetlib.pace.model.FieldDef; /** * Allows to express general conditions to be satisfied or not between two Documents. @@ -24,4 +24,7 @@ public interface ConditionAlgo { */ public abstract ConditionEvalMap verify(Document a, Document b); + public void setFields(List fields); + public void setCond(String name); + } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionClass.java new file mode 100644 index 000000000..155360c58 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionClass.java @@ -0,0 +1,13 @@ +package eu.dnetlib.pace.condition; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface ConditionClass { + + public String value(); +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionResolver.java new file mode 100644 index 000000000..58a30ddda --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionResolver.java @@ -0,0 +1,22 @@ +package eu.dnetlib.pace.condition; + +import java.io.Serializable; +import java.util.Map; +import java.util.stream.Collectors; + +import org.reflections.Reflections; + +public class ConditionResolver implements Serializable { + private final Map> functionMap; + + public ConditionResolver() { + + this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ConditionClass.class).stream() + .filter(ConditionAlgo.class::isAssignableFrom) + .collect(Collectors.toMap(cl -> cl.getAnnotation(ConditionClass.class).value(), cl -> (Class)cl)); + } + + public ConditionAlgo resolve(String name) throws IllegalAccessException, InstantiationException { + return functionMap.get(name).newInstance(); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java index 25b1a01cd..dfdc5cd23 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java @@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition; import java.util.List; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -11,11 +10,12 @@ import eu.dnetlib.pace.model.FieldDef; * * @author claudio */ +@ConditionClass("doiExactMatch") public class DoiExactMatch extends ExactMatchIgnoreCase { public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; - public DoiExactMatch(final Cond cond, final List fields) { + public DoiExactMatch(final String cond, final List fields) { super(cond, fields); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java index 4f0f37188..f4ba8de42 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java @@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition; import java.util.List; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -13,12 +12,17 @@ import org.apache.commons.lang.StringUtils; * * @author claudio */ +@ConditionClass("exactMatch") public class ExactMatch extends AbstractCondition { - public ExactMatch(final Cond cond, final List fields) { + public ExactMatch(final String cond, final List fields) { super(cond, fields); } + public ExactMatch(){ + super(); + } + @Override protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java index 8baad5b24..7741f3858 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java @@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition; import java.util.List; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -12,9 +11,10 @@ import eu.dnetlib.pace.model.FieldDef; * * @author claudio */ +@ConditionClass("exactMatchIgnoreCase") public class ExactMatchIgnoreCase extends AbstractCondition { - public ExactMatchIgnoreCase(final Cond cond, final List fields) { + public ExactMatchIgnoreCase(final String cond, final List fields) { super(cond, fields); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java index bc99a4cc5..f2b3bdba4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java @@ -3,7 +3,6 @@ package eu.dnetlib.pace.condition; import java.util.List; import com.google.common.collect.Iterables; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -13,6 +12,7 @@ import eu.dnetlib.pace.model.FieldDef; * * @author claudio */ +@ConditionClass("mustBeDifferent") public class MustBeDifferent extends AbstractCondition { /** @@ -20,7 +20,7 @@ public class MustBeDifferent extends AbstractCondition { * * @param fields the fields */ - public MustBeDifferent(final Cond cond, final List fields) { + public MustBeDifferent(final String cond, final List fields) { super(cond, fields); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java index a20ab9528..53aa2deb9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java @@ -6,7 +6,6 @@ import java.util.Set; import java.util.stream.Collectors; import com.google.common.collect.Sets; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -20,11 +19,12 @@ import org.apache.commons.logging.LogFactory; * * @author claudio */ +@ConditionClass("pidMatch") public class PidMatch extends AbstractCondition { private static final Log log = LogFactory.getLog(PidMatch.class); - public PidMatch(final Cond cond, final List fields) { + public PidMatch(final String cond, final List fields) { super(cond, fields); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java index ae6e94037..afd0a8eaa 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java @@ -4,7 +4,6 @@ import java.util.List; import com.google.common.collect.Iterables; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -14,6 +13,7 @@ import eu.dnetlib.pace.model.FieldDef; * * @author claudio */ +@ConditionClass("sizeMatch") public class SizeMatch extends AbstractCondition { /** @@ -22,7 +22,7 @@ public class SizeMatch extends AbstractCondition { * @param fields * the fields */ - public SizeMatch(final Cond cond, final List fields) { + public SizeMatch(final String cond, final List fields) { super(cond, fields); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java index 41a617aa5..4b94a0459 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java @@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition; import java.util.List; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -13,9 +12,10 @@ import eu.dnetlib.pace.model.FieldDef; * @author claudio * */ +@ConditionClass("titleVersionMatch") public class TitleVersionMatch extends AbstractCondition { - public TitleVersionMatch(final Cond cond, final List fields) { + public TitleVersionMatch(final String cond, final List fields) { super(cond, fields); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java index 89718426c..54d0ba89f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java @@ -1,8 +1,8 @@ package eu.dnetlib.pace.condition; +import java.time.Year; import java.util.List; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import org.apache.commons.lang.StringUtils; @@ -14,14 +14,17 @@ import eu.dnetlib.pace.model.FieldDef; * * @author claudio */ +@ConditionClass("yearMatch") public class YearMatch extends AbstractCondition { private int limit = 4; - public YearMatch(final Cond cond, final List fields) { + public YearMatch(final String cond, final List fields) { super(cond, fields); } + public YearMatch(){} + // @Override // public boolean verify(final Document a, final Document b) { // boolean res = true; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Algo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Algo.java deleted file mode 100644 index cb2e434b6..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Algo.java +++ /dev/null @@ -1,46 +0,0 @@ -package eu.dnetlib.pace.config; - -/** - * Enumerates the distance Algos. - */ -public enum Algo { - - /** The Jaro winkler. */ - JaroWinkler, - /** The Jaro winkler title. */ - JaroWinklerTitle, - /** The Levenstein. */ - Levenstein, - /** The Levenstein distance for title matching */ - LevensteinTitle, - /** The Level2 jaro winkler. */ - Level2JaroWinkler, - /** The Level2 jaro winkler for title matching */ - Level2JaroWinklerTitle, - /** The Level2 levenstein. */ - Level2Levenstein, - /** The Sub string levenstein. */ - SubStringLevenstein, - /** The Year levenstein. */ - YearLevenstein, - /** The Sorted jaro winkler. */ - SortedJaroWinkler, - /** The Sorted level2 jaro winkler. */ - SortedLevel2JaroWinkler, - /** Compares two urls */ - urlMatcher, - /** Exact match algo. */ - ExactMatch, - /** - * Returns 0 for equal strings, 1 for different strings. - */ - MustBeDifferent, - /** Always return 1.0 as distance. */ - AlwaysMatch, - /** Person distance */ - PersonCoAuthorSurnamesDistance, - PersonCoAnchorsDistance, - PersonDistance, - /** The Null. */ - Null -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Cond.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Cond.java deleted file mode 100644 index b287fdd76..000000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Cond.java +++ /dev/null @@ -1,28 +0,0 @@ -package eu.dnetlib.pace.config; - -/** - * The Enum Cond. - */ -public enum Cond { - - /** The year match. */ - yearMatch, - /** The title version match. */ - titleVersionMatch, - /** The size match. */ - sizeMatch, - /** - * Returns true if the field values are different - */ - mustBeDifferent, - /** The Exact match. */ - exactMatch, - /** - * The Exact match ignore case. - */ - exactMatchIgnoreCase, - /** The Exact match specialized to recognize DOI values. */ - doiExactMatch, - /** The Exact match that checks if pid type and value are the same */ - pidMatch -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java index e9d009548..c2749c503 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java @@ -2,6 +2,8 @@ package eu.dnetlib.pace.distance; import eu.dnetlib.pace.model.Field; +import java.util.Map; + /** * Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two * objects. @@ -11,5 +13,9 @@ public interface DistanceAlgo { public abstract double distance(Field a, Field b); public double getWeight(); + public Map getParams(); + + public void setWeight(double w); + public void setParams(Map params); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceClass.java new file mode 100644 index 000000000..9479fdb04 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceClass.java @@ -0,0 +1,13 @@ +package eu.dnetlib.pace.distance; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface DistanceClass { + + public String value(); +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceResolver.java new file mode 100644 index 000000000..09377605e --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceResolver.java @@ -0,0 +1,24 @@ +package eu.dnetlib.pace.distance; + +import java.io.Serializable; +import java.lang.reflect.InvocationTargetException; +import java.util.Map; +import java.util.stream.Collectors; + +import org.reflections.Reflections; + +public class DistanceResolver implements Serializable { + private final Map> functionMap; + + public DistanceResolver() { + + this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(DistanceClass.class).stream() + .filter(DistanceAlgo.class::isAssignableFrom) + .collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class)cl)); + } + + public DistanceAlgo resolve(String algo) throws IllegalAccessException, InstantiationException { + + return functionMap.get(algo).newInstance(); + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java index 0cbb6f4f6..467a19c86 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java @@ -25,7 +25,7 @@ public class DistanceScorer { } public ScoreResult distance(final Document a, final Document b) { - final ScoreResult sr = new ScoreResult(); + final ScoreResult sr = new ScoreResult(); //to keep track of the result of the comparison sr.setStrictConditions(verify(a, b, config.strictConditions())); sr.setConditions(verify(a, b, config.conditions())); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java index 83296048d..785c00bc3 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java @@ -1,6 +1,8 @@ package eu.dnetlib.pace.distance; +import java.io.Serializable; import java.util.List; +import java.util.Map; import com.wcohen.ss.AbstractStringDistance; @@ -24,6 +26,27 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp /** The weight. */ protected double weight = 0.0; + private Map params; + + protected SecondStringDistanceAlgo(){ + } + + protected SecondStringDistanceAlgo(Map params){ + this.params = params; + } + + public void setWeight(double w){ + this.weight = w; + } + + public Map getParams(){ + return this.params; + } + + public void setParams(Map params){ + this.params = params; + } + /** * Instantiates a new second string distance algo. * @@ -37,6 +60,10 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp this.weight = weight; } + protected SecondStringDistanceAlgo(final AbstractStringDistance ssalgo){ + this.ssalgo = ssalgo; + } + /** * Normalize. * diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java index 904498202..7039f05a6 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java @@ -1,10 +1,22 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import java.util.Map; + +@DistanceClass("AlwaysMatch") public class AlwaysMatch extends SecondStringDistanceAlgo { + public AlwaysMatch(){ + super(); + } + + public AlwaysMatch(final Map params){ + super(params); + } + public AlwaysMatch(final double weight) { super(weight, new com.wcohen.ss.JaroWinkler()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java index ef95c024a..2e714c4af 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java @@ -1,10 +1,22 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import java.util.Map; + +@DistanceClass("ExactMatch") public class ExactMatch extends SecondStringDistanceAlgo { + public ExactMatch(){ + super(); + } + + public ExactMatch(Map params){ + super(params); + } + public ExactMatch(final double weight) { super(weight, new com.wcohen.ss.JaroWinkler()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java index 87f6c4e6a..ea1e0798e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java @@ -1,11 +1,23 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import java.util.Map; + //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) +@DistanceClass("JaroWinkler") public class JaroWinkler extends SecondStringDistanceAlgo { + public JaroWinkler(){ + super(); + } + + public JaroWinkler(Map params){ + super(params); + } + public JaroWinkler(double weight) { super(weight, new com.wcohen.ss.JaroWinkler()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java index 1419a072b..b37c88d63 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java @@ -1,11 +1,23 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import java.util.Map; + //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) +@DistanceClass("JaroWinklerTitle") public class JaroWinklerTitle extends SecondStringDistanceAlgo { + public JaroWinklerTitle(){ + super(); + } + + public JaroWinklerTitle(Map params){ + super(params); + } + public JaroWinklerTitle(double weight) { super(weight, new com.wcohen.ss.JaroWinkler()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java index 3ad1cfaaf..a2afc3872 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java @@ -1,8 +1,10 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +@DistanceClass("Level2JaroWinkler") public class Level2JaroWinkler extends SecondStringDistanceAlgo { public Level2JaroWinkler(double w) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java index a1c347256..272e53035 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java @@ -1,8 +1,10 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +@DistanceClass("Level2JaroWinklerTitle") public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo { public Level2JaroWinklerTitle(final double w) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java index 7a2b0295f..1e955bd4a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java @@ -1,8 +1,10 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +@DistanceClass("Level2Levenstein") public class Level2Levenstein extends SecondStringDistanceAlgo { public Level2Levenstein(double w) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java index 9dfce83e5..2e014b67e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java @@ -1,10 +1,16 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +@DistanceClass("Levenstein") public class Levenstein extends SecondStringDistanceAlgo { + public Levenstein(){ + super(new com.wcohen.ss.Levenstein()); + } + public Levenstein(double w) { super(w, new com.wcohen.ss.Levenstein()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java index 281de31c3..c66f972c3 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java @@ -1,10 +1,16 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +@DistanceClass("LevensteinTitle") public class LevensteinTitle extends SecondStringDistanceAlgo { + public LevensteinTitle(){ + super(new com.wcohen.ss.Levenstein()); + } + public LevensteinTitle(final double w) { super(w, new com.wcohen.ss.Levenstein()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java index 1177ed528..0acb82ca4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java @@ -1,8 +1,10 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +@DistanceClass("MustBeDifferent") public class MustBeDifferent extends SecondStringDistanceAlgo { public MustBeDifferent(final double weight) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java index 8070a0010..ef798cbad 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java @@ -1,12 +1,16 @@ package eu.dnetlib.pace.distance.algo; import eu.dnetlib.pace.distance.DistanceAlgo; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.model.Field; +import java.util.Map; + /** * Not all fields of a document need to partecipate in the distance measure. We model those fields as having a * NullDistanceAlgo. */ +@DistanceClass("Null") public class NullDistanceAlgo implements DistanceAlgo { @Override @@ -19,4 +23,16 @@ public class NullDistanceAlgo implements DistanceAlgo { return 0.0; } + @Override + public void setWeight(double w){ + } + + @Override + public Map getParams() { + return null; + } + + @Override + public void setParams(Map params) { + } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java index d83420750..5f716001d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java @@ -1,10 +1,12 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; /** * The Class SortedJaroWinkler. */ +@DistanceClass("SortedJaroWinkler") public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo { /** diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java index 43ac190e3..493bbef7c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java @@ -1,10 +1,12 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; /** * The Class SortedJaroWinkler. */ +@DistanceClass("Sorted2JaroWinkler") public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo { /** diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java index 1fa358b0f..9fee7df5d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.distance.algo; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import org.apache.commons.lang.StringUtils; @@ -8,14 +9,21 @@ import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.model.Field; +import java.util.Map; + /** * The Class SubStringLevenstein. */ +@DistanceClass("SubStringLevenstein") public class SubStringLevenstein extends SecondStringDistanceAlgo { /** The limit. */ protected int limit; + public SubStringLevenstein() { + super(new com.wcohen.ss.Levenstein()); + } + /** * Instantiates a new sub string levenstein. * @@ -87,4 +95,9 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo { return 1 / Math.pow(Math.abs(d) + 1, 0.1); } + public void setParams(Map params){ + this.limit = params.get("limit").intValue(); //necessary because this class needs also the limit + super.setParams(params); + } + } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java index 46a438ebe..2aa7ca1ce 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.distance.algo; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; @@ -7,15 +8,24 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.Map; +@DistanceClass("urlMatcher") public class UrlMatcher extends Levenstein { private Map params; + public UrlMatcher(){ + super(); + } + public UrlMatcher(double weight, Map params) { super(weight); this.params = params; } + public void setParams(Map params) { + this.params = params; + } + @Override public double distance(Field a, Field b) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java index 49e526f42..d3fcee59a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java @@ -1,6 +1,5 @@ package eu.dnetlib.pace.distance.eval; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.model.Field; /** @@ -8,7 +7,7 @@ import eu.dnetlib.pace.model.Field; */ public class ConditionEval { - private Cond cond; + private String cond; private Field a; @@ -16,7 +15,7 @@ public class ConditionEval { private int result; - public ConditionEval(final Cond cond, final Field a, final Field b, final int result) { + public ConditionEval(final String cond, final Field a, final Field b, final int result) { this.cond = cond; this.a = a; this.b = b; @@ -47,11 +46,11 @@ public class ConditionEval { this.result = result; } - public Cond getCond() { + public String getCond() { return cond; } - public void setCond(final Cond cond) { + public void setCond(final String cond) { this.cond = cond; } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java index a943d4cea..ef3c4da22 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java @@ -1,6 +1,5 @@ package eu.dnetlib.pace.distance.eval; -import eu.dnetlib.pace.config.Algo; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java index b5cdad730..61d5c9327 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java @@ -2,10 +2,12 @@ package eu.dnetlib.pace.distance.eval; import com.google.gson.GsonBuilder; +import java.io.Serializable; + /** * Created by claudio on 09/03/16. */ -public class ScoreResult { +public class ScoreResult implements Serializable { private ConditionEvalMap strictConditions; @@ -49,8 +51,12 @@ public class ScoreResult { @Override public String toString() { - final GsonBuilder b = new GsonBuilder(); - b.serializeSpecialFloatingPointValues(); - return b.setPrettyPrinting().create().toJson(this); + //TODO cannot print: why? +// final GsonBuilder b = new GsonBuilder() +// .serializeSpecialFloatingPointValues() +// .serializeNulls(); +// +// return b.setPrettyPrinting().create().toJson(this); + return "{}"; } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java index db7092b0d..7e09d446e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.model; import java.io.Serializable; +import java.lang.reflect.InvocationTargetException; import java.util.List; import java.util.Map; @@ -9,49 +10,36 @@ import eu.dnetlib.pace.clustering.*; public class ClusteringDef implements Serializable { - private Clustering name; + private String name; private List fields; private Map params; + private ClusteringResolver clusteringResolver = new ClusteringResolver(); + public ClusteringDef() {} - public Clustering getName() { + public String getName() { return name; } - public void setName(final Clustering name) { + public void setName(final String name) { this.name = name; } public ClusteringFunction getClusteringFunction() { - switch (getName()) { - case acronyms: - return new Acronyms(getParams()); - case ngrams: - return new Ngrams(getParams()); - case ngrampairs: - return new NgramPairs(getParams()); - case sortedngrampairs: - return new SortedNgramPairs(getParams()); - case suffixprefix: - return new SuffixPrefix(getParams()); - case spacetrimmingfieldvalue: - return new SpaceTrimmingFieldValue(getParams()); - case immutablefieldvalue: - return new ImmutableFieldValue(getParams()); - case personhash: - return new PersonHash(getParams()); - case personclustering: - return new PersonClustering(getParams()); - case lowercase: - return new LowercaseClustering(getParams()); - case urlclustering: - return new UrlClustering(getParams()); - default: + + try { + ClusteringFunction clusteringFunction = clusteringResolver.resolve(getName()); + clusteringFunction.setParams(params); + return clusteringFunction; + + } catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) { + e.printStackTrace(); return new RandomClusteringFunction(getParams()); } + } public List getFields() { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java index 747f6c103..14de69a37 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java @@ -5,44 +5,36 @@ import java.util.List; import com.google.gson.Gson; import eu.dnetlib.pace.condition.*; -import eu.dnetlib.pace.config.Cond; public class CondDef implements Serializable { - private Cond name; + private String name; private List fields; + private ConditionResolver conditionResolver = new ConditionResolver(); + public CondDef() {} public ConditionAlgo getConditionAlgo(final List fields) { - switch (getName()) { - case yearMatch: - return new YearMatch(getName(), fields); - case titleVersionMatch: - return new TitleVersionMatch(getName(), fields); - case sizeMatch: - return new SizeMatch(getName(), fields); - case exactMatch: - return new ExactMatch(getName(), fields); - case mustBeDifferent: - return new MustBeDifferent(getName(), fields); - case exactMatchIgnoreCase: - return new ExactMatchIgnoreCase(getName(), fields); - case doiExactMatch: - return new DoiExactMatch(getName(), fields); - case pidMatch: - return new PidMatch(getName(), fields); - default: + + try { + ConditionAlgo conditionAlgo = conditionResolver.resolve(getName()); + conditionAlgo.setFields(fields); + conditionAlgo.setCond(getName()); + return conditionAlgo; + } catch (IllegalAccessException | InstantiationException e) { + e.printStackTrace(); return new AlwaysTrueCondition(getName(), fields); } + } - public Cond getName() { + public String getName() { return name; } - public void setName(final Cond name) { + public void setName(final String name) { this.name = name; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java index 5445053bd..3f4619dcf 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java @@ -1,13 +1,13 @@ package eu.dnetlib.pace.model; import java.io.Serializable; +import java.util.HashMap; import java.util.List; import java.util.Map; import com.google.common.base.Splitter; import com.google.common.collect.Lists; import com.google.gson.Gson; -import eu.dnetlib.pace.config.Algo; import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.distance.*; import eu.dnetlib.pace.distance.algo.*; @@ -19,7 +19,7 @@ public class FieldDef implements Serializable { public final static String PATH_SEPARATOR = "/"; - private Algo algo; + private String algo; private String name; @@ -37,6 +37,8 @@ public class FieldDef implements Serializable { private Map params; + private DistanceResolver distanceResolver = new DistanceResolver(); + public FieldDef() {} // def apply(s: String): Field[A] @@ -66,40 +68,22 @@ public class FieldDef implements Serializable { } public DistanceAlgo getDistanceAlgo() { - switch (getAlgo()) { - case JaroWinkler: - return new JaroWinkler(getWeight()); - case JaroWinklerTitle: - return new JaroWinklerTitle(getWeight()); - case Level2JaroWinkler: - return new Level2JaroWinkler(getWeight()); - case Level2JaroWinklerTitle: - return new Level2JaroWinklerTitle(getWeight()); - case Level2Levenstein: - return new Level2Levenstein(getWeight()); - case Levenstein: - return new Levenstein(getWeight()); - case LevensteinTitle: - return new LevensteinTitle(getWeight()); - case SubStringLevenstein: - return new SubStringLevenstein(getWeight(), getLimit()); - case SortedJaroWinkler: - return new SortedJaroWinkler(getWeight()); - case SortedLevel2JaroWinkler: - return new SortedLevel2JaroWinkler(getWeight()); - case urlMatcher: - return new UrlMatcher(getWeight(), getParams()); - case ExactMatch: - return new ExactMatch(getWeight()); - case MustBeDifferent: - return new MustBeDifferent(getWeight()); - case AlwaysMatch: - return new AlwaysMatch(getWeight()); - case Null: - return new NullDistanceAlgo(); - default: + + try { + if (params == null) { + params = new HashMap<>(); + } + params.put("limit", getLimit()); + params.put("weight", getWeight()); + DistanceAlgo distanceAlgo = distanceResolver.resolve(getAlgo()); + distanceAlgo.setParams(params); + distanceAlgo.setWeight(getWeight()); + return distanceAlgo; + } catch (IllegalAccessException | InstantiationException e) { + e.printStackTrace(); return new NullDistanceAlgo(); } + } public boolean isIgnoreMissing() { @@ -135,11 +119,11 @@ public class FieldDef implements Serializable { this.weight = weight; } - public Algo getAlgo() { + public String getAlgo() { return algo; } - public void setAlgo(final Algo algo) { + public void setAlgo(final String algo) { this.algo = algo; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index a9979f5ed..3e6cd6ea5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -23,7 +23,6 @@ public class BlockProcessor { private DedupConfig dedupConf; - public static void constructAccumulator( final DedupConfig dedupConf) { accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1")); accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField())); diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/.DS_Store b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..12db7cf64da79dd932ccfccbbd901c4eda211125 GIT binary patch literal 6148 zcmeHKI|>3Z5S>vG!N$@uSMUZw^aNf&P!vH{5VYRPb9pr1d>UQtw2?P3dC6p6LSC`6 zBO*G#Y-b`95gEY^B@zm@)+3y3!9m+;d(#Yi0p zwti-h{ zq5pp-aYY5Fz+Wk#gT-nw$CI+Qb{=Q7w!qhL%elkNFn0c7o_bQ`6`Nzf VCbof2N8IT^{tTEdG%E0G1s?X~6^#G@ literal 0 HcmV?d00001