forked from antonis.lempesis/dnet-hadoop
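Modify the initialization of clustering functions, distance algorithms, and conditions: the resolvers now instantiate them reflectively through their parameterized constructors, replacing the no-argument constructors and setters.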
This commit is contained in:
parent
1cbbc3f15a
commit
3cf3dc1934
|
@ -58,9 +58,11 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.reflections</groupId>
|
<groupId>org.reflections</groupId>
|
||||||
<artifactId>reflections</artifactId>
|
<artifactId>reflections</artifactId>
|
||||||
<version>0.9.10</version>
|
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.spark</groupId>
|
||||||
|
<artifactId>spark-core_2.11</artifactId>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -19,12 +19,6 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
public AbstractClusteringFunction(){}
|
|
||||||
|
|
||||||
public void setParams(Map<String, Integer> params){
|
|
||||||
this.params = params;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected abstract Collection<String> doApply(String s);
|
protected abstract Collection<String> doApply(String s);
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -14,10 +14,6 @@ public class Acronyms extends AbstractClusteringFunction {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Acronyms(){
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(String s) {
|
||||||
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
|
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
|
||||||
|
|
|
@ -12,5 +12,4 @@ public interface ClusteringFunction {
|
||||||
|
|
||||||
public Map<String, Integer> getParams();
|
public Map<String, Integer> getParams();
|
||||||
|
|
||||||
public void setParams(Map<String, Integer> params);
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,8 +17,8 @@ public class ClusteringResolver implements Serializable {
|
||||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>)cl));
|
.collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>)cl));
|
||||||
}
|
}
|
||||||
|
|
||||||
public ClusteringFunction resolve(String clusteringFunction) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
|
public ClusteringFunction resolve(String clusteringFunction, Map<String, Integer> params) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
|
||||||
|
|
||||||
return functionMap.get(clusteringFunction).newInstance();
|
return functionMap.get(clusteringFunction).getDeclaredConstructor(Map.class).newInstance(params);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,10 +13,6 @@ public class ImmutableFieldValue extends AbstractClusteringFunction {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
public ImmutableFieldValue() {
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(final String s) {
|
protected Collection<String> doApply(final String s) {
|
||||||
final List<String> res = Lists.newArrayList();
|
final List<String> res = Lists.newArrayList();
|
||||||
|
|
|
@ -16,10 +16,6 @@ public class LowercaseClustering extends AbstractClusteringFunction {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
public LowercaseClustering(){
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(List<Field> fields) {
|
public Collection<String> apply(List<Field> fields) {
|
||||||
Collection<String> c = Sets.newLinkedHashSet();
|
Collection<String> c = Sets.newLinkedHashSet();
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
@ -9,10 +10,6 @@ import com.google.common.collect.Lists;
|
||||||
@ClusteringClass("ngrampairs")
|
@ClusteringClass("ngrampairs")
|
||||||
public class NgramPairs extends Ngrams {
|
public class NgramPairs extends Ngrams {
|
||||||
|
|
||||||
public NgramPairs() {
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
public NgramPairs(Map<String, Integer> params) {
|
public NgramPairs(Map<String, Integer> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,9 +1,6 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.*;
|
||||||
import java.util.LinkedHashSet;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.StringTokenizer;
|
|
||||||
|
|
||||||
@ClusteringClass("ngrams")
|
@ClusteringClass("ngrams")
|
||||||
public class Ngrams extends AbstractClusteringFunction {
|
public class Ngrams extends AbstractClusteringFunction {
|
||||||
|
@ -12,10 +9,6 @@ public class Ngrams extends AbstractClusteringFunction {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Ngrams() {
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(String s) {
|
||||||
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
|
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
|
||||||
|
|
|
@ -30,10 +30,6 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setParams(Map<String, Integer> params){
|
|
||||||
this.params = params;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(final List<Field> fields) {
|
public Collection<String> apply(final List<Field> fields) {
|
||||||
final Set<String> hashes = Sets.newHashSet();
|
final Set<String> hashes = Sets.newHashSet();
|
||||||
|
|
|
@ -17,10 +17,6 @@ public class PersonHash extends AbstractClusteringFunction {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
public PersonHash(){
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(final String s) {
|
protected Collection<String> doApply(final String s) {
|
||||||
final List<String> res = Lists.newArrayList();
|
final List<String> res = Lists.newArrayList();
|
||||||
|
|
|
@ -9,10 +9,6 @@ public class RandomClusteringFunction extends AbstractClusteringFunction {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
public RandomClusteringFunction(){
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(String s) {
|
||||||
// TODO Auto-generated method stub
|
// TODO Auto-generated method stub
|
||||||
|
|
|
@ -1,9 +1,6 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.*;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import com.google.common.base.Joiner;
|
import com.google.common.base.Joiner;
|
||||||
import com.google.common.base.Splitter;
|
import com.google.common.base.Splitter;
|
||||||
|
@ -16,10 +13,6 @@ public class SortedNgramPairs extends NgramPairs {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
public SortedNgramPairs(){
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(String s) {
|
||||||
|
|
||||||
|
|
|
@ -16,10 +16,6 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
public SpaceTrimmingFieldValue(){
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(final String s) {
|
protected Collection<String> doApply(final String s) {
|
||||||
final List<String> res = Lists.newArrayList();
|
final List<String> res = Lists.newArrayList();
|
||||||
|
|
|
@ -13,10 +13,6 @@ public class SuffixPrefix extends AbstractClusteringFunction {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
public SuffixPrefix(){
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(String s) {
|
||||||
return suffixPrefix(s, param("len"), param("max"));
|
return suffixPrefix(s, param("len"), param("max"));
|
||||||
|
|
|
@ -20,14 +20,6 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
public UrlClustering() {
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setParams(Map<String, Integer> params){
|
|
||||||
this.params = params;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(List<Field> fields) {
|
public Collection<String> apply(List<Field> fields) {
|
||||||
return fields.stream()
|
return fields.stream()
|
||||||
|
|
|
@ -25,16 +25,6 @@ public abstract class AbstractCondition extends AbstractPaceFunctions implements
|
||||||
this.fields = fields;
|
this.fields = fields;
|
||||||
}
|
}
|
||||||
|
|
||||||
public AbstractCondition(){}
|
|
||||||
|
|
||||||
public void setCond(String cond){
|
|
||||||
this.cond = cond;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setFields(List<FieldDef> fields){
|
|
||||||
this.fields = fields;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected abstract ConditionEval verify(FieldDef fd, Field a, Field b);
|
protected abstract ConditionEval verify(FieldDef fd, Field a, Field b);
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -17,9 +17,6 @@ public class AlwaysTrueCondition extends AbstractCondition {
|
||||||
super(cond, fields);
|
super(cond, fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
public AlwaysTrueCondition(){
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
@Override
|
@Override
|
||||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||||
return new ConditionEval(cond, a, b, 1);
|
return new ConditionEval(cond, a, b, 1);
|
||||||
|
|
|
@ -24,7 +24,4 @@ public interface ConditionAlgo {
|
||||||
*/
|
*/
|
||||||
public abstract ConditionEvalMap verify(Document a, Document b);
|
public abstract ConditionEvalMap verify(Document a, Document b);
|
||||||
|
|
||||||
public void setFields(List<FieldDef> fields);
|
|
||||||
public void setCond(String name);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,9 +1,12 @@
|
||||||
package eu.dnetlib.pace.condition;
|
package eu.dnetlib.pace.condition;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
import org.reflections.Reflections;
|
import org.reflections.Reflections;
|
||||||
|
|
||||||
public class ConditionResolver implements Serializable {
|
public class ConditionResolver implements Serializable {
|
||||||
|
@ -16,7 +19,7 @@ public class ConditionResolver implements Serializable {
|
||||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(ConditionClass.class).value(), cl -> (Class<ConditionAlgo>)cl));
|
.collect(Collectors.toMap(cl -> cl.getAnnotation(ConditionClass.class).value(), cl -> (Class<ConditionAlgo>)cl));
|
||||||
}
|
}
|
||||||
|
|
||||||
public ConditionAlgo resolve(String name) throws IllegalAccessException, InstantiationException {
|
public ConditionAlgo resolve(String name, List<FieldDef> fields) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
|
||||||
return functionMap.get(name).newInstance();
|
return functionMap.get(name).getDeclaredConstructor(String.class, List.class).newInstance(name, fields);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,10 +19,6 @@ public class ExactMatch extends AbstractCondition {
|
||||||
super(cond, fields);
|
super(cond, fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
public ExactMatch(){
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||||
|
|
||||||
|
|
|
@ -23,8 +23,6 @@ public class YearMatch extends AbstractCondition {
|
||||||
super(cond, fields);
|
super(cond, fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
public YearMatch(){}
|
|
||||||
|
|
||||||
// @Override
|
// @Override
|
||||||
// public boolean verify(final Document a, final Document b) {
|
// public boolean verify(final Document a, final Document b) {
|
||||||
// boolean res = true;
|
// boolean res = true;
|
||||||
|
|
|
@ -13,9 +13,5 @@ public interface DistanceAlgo {
|
||||||
public abstract double distance(Field a, Field b);
|
public abstract double distance(Field a, Field b);
|
||||||
|
|
||||||
public double getWeight();
|
public double getWeight();
|
||||||
public Map<String, Number> getParams();
|
|
||||||
|
|
||||||
public void setWeight(double w);
|
|
||||||
public void setParams(Map<String, Number> params);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,8 +17,8 @@ public class DistanceResolver implements Serializable {
|
||||||
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
|
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
|
||||||
}
|
}
|
||||||
|
|
||||||
public DistanceAlgo resolve(String algo) throws IllegalAccessException, InstantiationException {
|
public DistanceAlgo resolve(String algo, Map<String, Number> params) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
|
||||||
|
|
||||||
return functionMap.get(algo).newInstance();
|
return functionMap.get(algo).getDeclaredConstructor(Map.class).newInstance(params);
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -28,23 +28,10 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
|
||||||
|
|
||||||
private Map<String, Number> params;
|
private Map<String, Number> params;
|
||||||
|
|
||||||
protected SecondStringDistanceAlgo(){
|
protected SecondStringDistanceAlgo(Map<String, Number> params, final AbstractStringDistance ssalgo){
|
||||||
}
|
|
||||||
|
|
||||||
protected SecondStringDistanceAlgo(Map<String, Number> params){
|
|
||||||
this.params = params;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setWeight(double w){
|
|
||||||
this.weight = w;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Map<String, Number> getParams(){
|
|
||||||
return this.params;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setParams(Map<String, Number> params){
|
|
||||||
this.params = params;
|
this.params = params;
|
||||||
|
this.weight = params.get("weight").doubleValue();
|
||||||
|
this.ssalgo = ssalgo;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -9,12 +9,8 @@ import java.util.Map;
|
||||||
@DistanceClass("AlwaysMatch")
|
@DistanceClass("AlwaysMatch")
|
||||||
public class AlwaysMatch extends SecondStringDistanceAlgo {
|
public class AlwaysMatch extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
public AlwaysMatch(){
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
public AlwaysMatch(final Map<String, Number> params){
|
public AlwaysMatch(final Map<String, Number> params){
|
||||||
super(params);
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
||||||
public AlwaysMatch(final double weight) {
|
public AlwaysMatch(final double weight) {
|
||||||
|
|
|
@ -9,12 +9,8 @@ import java.util.Map;
|
||||||
@DistanceClass("ExactMatch")
|
@DistanceClass("ExactMatch")
|
||||||
public class ExactMatch extends SecondStringDistanceAlgo {
|
public class ExactMatch extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
public ExactMatch(){
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
public ExactMatch(Map<String, Number> params){
|
public ExactMatch(Map<String, Number> params){
|
||||||
super(params);
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
||||||
public ExactMatch(final double weight) {
|
public ExactMatch(final double weight) {
|
||||||
|
|
|
@ -4,18 +4,15 @@ import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.distance.DistanceClass;
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||||
@DistanceClass("JaroWinkler")
|
@DistanceClass("JaroWinkler")
|
||||||
public class JaroWinkler extends SecondStringDistanceAlgo {
|
public class JaroWinkler extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
public JaroWinkler(){
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
public JaroWinkler(Map<String, Number> params){
|
public JaroWinkler(Map<String, Number> params){
|
||||||
super(params);
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
||||||
public JaroWinkler(double weight) {
|
public JaroWinkler(double weight) {
|
||||||
|
|
|
@ -10,12 +10,8 @@ import java.util.Map;
|
||||||
@DistanceClass("JaroWinklerTitle")
|
@DistanceClass("JaroWinklerTitle")
|
||||||
public class JaroWinklerTitle extends SecondStringDistanceAlgo {
|
public class JaroWinklerTitle extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
public JaroWinklerTitle(){
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
public JaroWinklerTitle(Map<String, Number> params){
|
public JaroWinklerTitle(Map<String, Number> params){
|
||||||
super(params);
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
||||||
public JaroWinklerTitle(double weight) {
|
public JaroWinklerTitle(double weight) {
|
||||||
|
|
|
@ -4,9 +4,15 @@ import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.distance.DistanceClass;
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
@DistanceClass("Level2JaroWinkler")
|
@DistanceClass("Level2JaroWinkler")
|
||||||
public class Level2JaroWinkler extends SecondStringDistanceAlgo {
|
public class Level2JaroWinkler extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
|
public Level2JaroWinkler(Map<String, Number> params){
|
||||||
|
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||||
|
}
|
||||||
|
|
||||||
public Level2JaroWinkler(double w) {
|
public Level2JaroWinkler(double w) {
|
||||||
super(w, new com.wcohen.ss.Level2JaroWinkler());
|
super(w, new com.wcohen.ss.Level2JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,9 +4,15 @@ import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.distance.DistanceClass;
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
@DistanceClass("Level2JaroWinklerTitle")
|
@DistanceClass("Level2JaroWinklerTitle")
|
||||||
public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo {
|
public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
|
public Level2JaroWinklerTitle(Map<String,Number> params){
|
||||||
|
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||||
|
}
|
||||||
|
|
||||||
public Level2JaroWinklerTitle(final double w) {
|
public Level2JaroWinklerTitle(final double w) {
|
||||||
super(w, new com.wcohen.ss.Level2JaroWinkler());
|
super(w, new com.wcohen.ss.Level2JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,9 +4,15 @@ import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.distance.DistanceClass;
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
@DistanceClass("Level2Levenstein")
|
@DistanceClass("Level2Levenstein")
|
||||||
public class Level2Levenstein extends SecondStringDistanceAlgo {
|
public class Level2Levenstein extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
|
public Level2Levenstein(Map<String,Number> params){
|
||||||
|
super(params, new com.wcohen.ss.Level2Levenstein());
|
||||||
|
}
|
||||||
|
|
||||||
public Level2Levenstein(double w) {
|
public Level2Levenstein(double w) {
|
||||||
super(w, new com.wcohen.ss.Level2Levenstein());
|
super(w, new com.wcohen.ss.Level2Levenstein());
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,11 +4,13 @@ import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.distance.DistanceClass;
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
@DistanceClass("Levenstein")
|
@DistanceClass("Levenstein")
|
||||||
public class Levenstein extends SecondStringDistanceAlgo {
|
public class Levenstein extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
public Levenstein(){
|
public Levenstein(Map<String,Number> params){
|
||||||
super(new com.wcohen.ss.Levenstein());
|
super(params, new com.wcohen.ss.Levenstein());
|
||||||
}
|
}
|
||||||
|
|
||||||
public Levenstein(double w) {
|
public Levenstein(double w) {
|
||||||
|
|
|
@ -4,11 +4,13 @@ import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.distance.DistanceClass;
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
@DistanceClass("LevensteinTitle")
|
@DistanceClass("LevensteinTitle")
|
||||||
public class LevensteinTitle extends SecondStringDistanceAlgo {
|
public class LevensteinTitle extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
public LevensteinTitle(){
|
public LevensteinTitle(Map<String,Number> params){
|
||||||
super(new com.wcohen.ss.Levenstein());
|
super(params, new com.wcohen.ss.Levenstein());
|
||||||
}
|
}
|
||||||
|
|
||||||
public LevensteinTitle(final double w) {
|
public LevensteinTitle(final double w) {
|
||||||
|
|
|
@ -4,9 +4,15 @@ import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.distance.DistanceClass;
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
@DistanceClass("MustBeDifferent")
|
@DistanceClass("MustBeDifferent")
|
||||||
public class MustBeDifferent extends SecondStringDistanceAlgo {
|
public class MustBeDifferent extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
|
public MustBeDifferent(Map<String,Number> params){
|
||||||
|
super(params, new com.wcohen.ss.Levenstein());
|
||||||
|
}
|
||||||
|
|
||||||
public MustBeDifferent(final double weight) {
|
public MustBeDifferent(final double weight) {
|
||||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,6 +13,9 @@ import java.util.Map;
|
||||||
@DistanceClass("Null")
|
@DistanceClass("Null")
|
||||||
public class NullDistanceAlgo implements DistanceAlgo {
|
public class NullDistanceAlgo implements DistanceAlgo {
|
||||||
|
|
||||||
|
public NullDistanceAlgo(Map<String, Number> params){
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double distance(Field a, Field b) {
|
public double distance(Field a, Field b) {
|
||||||
return 0.0;
|
return 0.0;
|
||||||
|
@ -23,16 +26,4 @@ public class NullDistanceAlgo implements DistanceAlgo {
|
||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public void setWeight(double w){
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Map<String, Number> getParams() {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void setParams(Map<String, Number> params) {
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,12 +3,18 @@ package eu.dnetlib.pace.distance.algo;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.distance.DistanceClass;
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The Class SortedJaroWinkler.
|
* The Class SortedJaroWinkler.
|
||||||
*/
|
*/
|
||||||
@DistanceClass("SortedJaroWinkler")
|
@DistanceClass("SortedJaroWinkler")
|
||||||
public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
|
public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
|
||||||
|
|
||||||
|
public SortedJaroWinkler(Map<String,Number> params){
|
||||||
|
super(params, new com.wcohen.ss.Levenstein());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instantiates a new sorted jaro winkler.
|
* Instantiates a new sorted jaro winkler.
|
||||||
*
|
*
|
||||||
|
|
|
@ -3,6 +3,8 @@ package eu.dnetlib.pace.distance.algo;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.distance.DistanceClass;
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The Class SortedJaroWinkler.
|
* The Class SortedJaroWinkler.
|
||||||
*/
|
*/
|
||||||
|
@ -19,6 +21,10 @@ public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
|
||||||
super(weight, new com.wcohen.ss.Level2JaroWinkler());
|
super(weight, new com.wcohen.ss.Level2JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public SortedLevel2JaroWinkler(final Map<String, Number> params){
|
||||||
|
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instantiates a new sorted jaro winkler.
|
* Instantiates a new sorted jaro winkler.
|
||||||
*
|
*
|
||||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
@ -27,6 +28,10 @@ public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanc
|
||||||
super(weight, ssalgo);
|
super(weight, ssalgo);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected SortedSecondStringDistanceAlgo(final Map<String, Number> params, final AbstractStringDistance ssalgo){
|
||||||
|
super(params.get("weight").doubleValue(), ssalgo);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* (non-Javadoc)
|
* (non-Javadoc)
|
||||||
*
|
*
|
||||||
|
|
|
@ -20,10 +20,6 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
||||||
/** The limit. */
|
/** The limit. */
|
||||||
protected int limit;
|
protected int limit;
|
||||||
|
|
||||||
public SubStringLevenstein() {
|
|
||||||
super(new com.wcohen.ss.Levenstein());
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instantiates a new sub string levenstein.
|
* Instantiates a new sub string levenstein.
|
||||||
*
|
*
|
||||||
|
@ -34,6 +30,11 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
||||||
super(w, new com.wcohen.ss.Levenstein());
|
super(w, new com.wcohen.ss.Levenstein());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public SubStringLevenstein(Map<String, Number> params){
|
||||||
|
super(params, new com.wcohen.ss.Levenstein());
|
||||||
|
this.limit = params.get("limit").intValue();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instantiates a new sub string levenstein.
|
* Instantiates a new sub string levenstein.
|
||||||
*
|
*
|
||||||
|
@ -95,9 +96,4 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
||||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setParams(Map<String, Number> params){
|
|
||||||
this.limit = params.get("limit").intValue(); //necessary because this class needs also the limit
|
|
||||||
super.setParams(params);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,8 +13,9 @@ public class UrlMatcher extends Levenstein {
|
||||||
|
|
||||||
private Map<String, Number> params;
|
private Map<String, Number> params;
|
||||||
|
|
||||||
public UrlMatcher(){
|
public UrlMatcher(Map<String, Number> params){
|
||||||
super();
|
super(params);
|
||||||
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
public UrlMatcher(double weight, Map<String, Number> params) {
|
public UrlMatcher(double weight, Map<String, Number> params) {
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
package eu.dnetlib.pace.distance.eval;
|
package eu.dnetlib.pace.distance.eval;
|
||||||
|
|
||||||
import com.google.gson.GsonBuilder;
|
import com.google.gson.GsonBuilder;
|
||||||
|
import org.codehaus.jackson.map.ObjectMapper;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -51,12 +53,10 @@ public class ScoreResult implements Serializable {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
//TODO cannot print: why?
|
try {
|
||||||
// final GsonBuilder b = new GsonBuilder()
|
return new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
|
||||||
// .serializeSpecialFloatingPointValues()
|
} catch (IOException e) {
|
||||||
// .serializeNulls();
|
return e.getStackTrace().toString();
|
||||||
//
|
}
|
||||||
// return b.setPrettyPrinting().create().toJson(this);
|
|
||||||
return "{}";
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,15 +31,11 @@ public class ClusteringDef implements Serializable {
|
||||||
public ClusteringFunction getClusteringFunction() {
|
public ClusteringFunction getClusteringFunction() {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
ClusteringFunction clusteringFunction = clusteringResolver.resolve(getName());
|
return clusteringResolver.resolve(getName(), params);
|
||||||
clusteringFunction.setParams(params);
|
|
||||||
return clusteringFunction;
|
|
||||||
|
|
||||||
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
|
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
return new RandomClusteringFunction(getParams());
|
return new RandomClusteringFunction(getParams());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<String> getFields() {
|
public List<String> getFields() {
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package eu.dnetlib.pace.model;
|
package eu.dnetlib.pace.model;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
|
@ -19,11 +20,8 @@ public class CondDef implements Serializable {
|
||||||
public ConditionAlgo getConditionAlgo(final List<FieldDef> fields) {
|
public ConditionAlgo getConditionAlgo(final List<FieldDef> fields) {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
ConditionAlgo conditionAlgo = conditionResolver.resolve(getName());
|
return conditionResolver.resolve(getName(), fields);
|
||||||
conditionAlgo.setFields(fields);
|
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
|
||||||
conditionAlgo.setCond(getName());
|
|
||||||
return conditionAlgo;
|
|
||||||
} catch (IllegalAccessException | InstantiationException e) {
|
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
return new AlwaysTrueCondition(getName(), fields);
|
return new AlwaysTrueCondition(getName(), fields);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package eu.dnetlib.pace.model;
|
package eu.dnetlib.pace.model;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -75,13 +76,10 @@ public class FieldDef implements Serializable {
|
||||||
}
|
}
|
||||||
params.put("limit", getLimit());
|
params.put("limit", getLimit());
|
||||||
params.put("weight", getWeight());
|
params.put("weight", getWeight());
|
||||||
DistanceAlgo distanceAlgo = distanceResolver.resolve(getAlgo());
|
return distanceResolver.resolve(getAlgo(), params);
|
||||||
distanceAlgo.setParams(params);
|
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
|
||||||
distanceAlgo.setWeight(getWeight());
|
|
||||||
return distanceAlgo;
|
|
||||||
} catch (IllegalAccessException | InstantiationException e) {
|
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
return new NullDistanceAlgo();
|
return new NullDistanceAlgo(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
|
public class ClusteringResolverTest {
|
||||||
|
|
||||||
|
private ClusteringResolver clusteringResolver;
|
||||||
|
private Map<String,Integer> params = new HashMap<String, Integer>();
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setUp(){
|
||||||
|
clusteringResolver = new ClusteringResolver();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testResolve() throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException {
|
||||||
|
|
||||||
|
ClusteringFunction ngrams = clusteringResolver.resolve("ngrams", params);
|
||||||
|
|
||||||
|
assertEquals(ngrams.getClass(), Ngrams.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,35 @@
|
||||||
|
package eu.dnetlib.pace.condition;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.clustering.ClusteringFunction;
|
||||||
|
import eu.dnetlib.pace.clustering.ClusteringResolver;
|
||||||
|
import eu.dnetlib.pace.clustering.Ngrams;
|
||||||
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import static org.junit.Assert.assertEquals;
|
||||||
|
|
||||||
|
public class ConditionResolverTest {
|
||||||
|
|
||||||
|
private ConditionResolver conditionResolver;
|
||||||
|
private List<FieldDef> fields;
|
||||||
|
private String name;
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setUp(){
|
||||||
|
conditionResolver = new ConditionResolver();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testResolve() throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException {
|
||||||
|
|
||||||
|
ConditionAlgo sizeMatch = conditionResolver.resolve("sizeMatch", fields);
|
||||||
|
|
||||||
|
assertEquals(sizeMatch.getClass(), SizeMatch.class);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue