Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
47 changed files with 164 additions and 187 deletions
Showing only changes of commit 3cf3dc1934 - Show all commits

View File

@ -58,9 +58,11 @@
<dependency>
<groupId>org.reflections</groupId>
<artifactId>reflections</artifactId>
<version>0.9.10</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
</dependencies>
</project>

View File

@ -19,12 +19,6 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
this.params = params;
}
public AbstractClusteringFunction(){}
public void setParams(Map<String, Integer> params){
this.params = params;
}
protected abstract Collection<String> doApply(String s);
@Override

View File

@ -14,10 +14,6 @@ public class Acronyms extends AbstractClusteringFunction {
super(params);
}
public Acronyms(){
super();
}
@Override
protected Collection<String> doApply(String s) {
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));

View File

@ -12,5 +12,4 @@ public interface ClusteringFunction {
public Map<String, Integer> getParams();
public void setParams(Map<String, Integer> params);
}

View File

@ -17,8 +17,8 @@ public class ClusteringResolver implements Serializable {
.collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>)cl));
}
public ClusteringFunction resolve(String clusteringFunction) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
public ClusteringFunction resolve(String clusteringFunction, Map<String, Integer> params) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
return functionMap.get(clusteringFunction).newInstance();
return functionMap.get(clusteringFunction).getDeclaredConstructor(Map.class).newInstance(params);
}
}

View File

@ -13,10 +13,6 @@ public class ImmutableFieldValue extends AbstractClusteringFunction {
super(params);
}
public ImmutableFieldValue() {
super();
}
@Override
protected Collection<String> doApply(final String s) {
final List<String> res = Lists.newArrayList();

View File

@ -16,10 +16,6 @@ public class LowercaseClustering extends AbstractClusteringFunction {
super(params);
}
public LowercaseClustering(){
super();
}
@Override
public Collection<String> apply(List<Field> fields) {
Collection<String> c = Sets.newLinkedHashSet();

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -9,10 +10,6 @@ import com.google.common.collect.Lists;
@ClusteringClass("ngrampairs")
public class NgramPairs extends Ngrams {
public NgramPairs() {
super();
}
public NgramPairs(Map<String, Integer> params) {
super(params);
}

View File

@ -1,9 +1,6 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.*;
@ClusteringClass("ngrams")
public class Ngrams extends AbstractClusteringFunction {
@ -12,10 +9,6 @@ public class Ngrams extends AbstractClusteringFunction {
super(params);
}
public Ngrams() {
super();
}
@Override
protected Collection<String> doApply(String s) {
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));

View File

@ -30,10 +30,6 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
this.params = params;
}
public void setParams(Map<String, Integer> params){
this.params = params;
}
@Override
public Collection<String> apply(final List<Field> fields) {
final Set<String> hashes = Sets.newHashSet();

View File

@ -17,10 +17,6 @@ public class PersonHash extends AbstractClusteringFunction {
super(params);
}
public PersonHash(){
super();
}
@Override
protected Collection<String> doApply(final String s) {
final List<String> res = Lists.newArrayList();

View File

@ -9,10 +9,6 @@ public class RandomClusteringFunction extends AbstractClusteringFunction {
super(params);
}
public RandomClusteringFunction(){
super();
}
@Override
protected Collection<String> doApply(String s) {
// TODO Auto-generated method stub

View File

@ -1,9 +1,6 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.*;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
@ -16,10 +13,6 @@ public class SortedNgramPairs extends NgramPairs {
super(params);
}
public SortedNgramPairs(){
super();
}
@Override
protected Collection<String> doApply(String s) {

View File

@ -16,10 +16,6 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
super(params);
}
public SpaceTrimmingFieldValue(){
super();
}
@Override
protected Collection<String> doApply(final String s) {
final List<String> res = Lists.newArrayList();

View File

@ -13,10 +13,6 @@ public class SuffixPrefix extends AbstractClusteringFunction {
super(params);
}
public SuffixPrefix(){
super();
}
@Override
protected Collection<String> doApply(String s) {
return suffixPrefix(s, param("len"), param("max"));

View File

@ -20,14 +20,6 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu
this.params = params;
}
public UrlClustering() {
super();
}
public void setParams(Map<String, Integer> params){
this.params = params;
}
@Override
public Collection<String> apply(List<Field> fields) {
return fields.stream()

View File

@ -25,16 +25,6 @@ public abstract class AbstractCondition extends AbstractPaceFunctions implements
this.fields = fields;
}
public AbstractCondition(){}
public void setCond(String cond){
this.cond = cond;
}
public void setFields(List<FieldDef> fields){
this.fields = fields;
}
protected abstract ConditionEval verify(FieldDef fd, Field a, Field b);
@Override

View File

@ -17,9 +17,6 @@ public class AlwaysTrueCondition extends AbstractCondition {
super(cond, fields);
}
public AlwaysTrueCondition(){
super();
}
@Override
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
return new ConditionEval(cond, a, b, 1);

View File

@ -24,7 +24,4 @@ public interface ConditionAlgo {
*/
public abstract ConditionEvalMap verify(Document a, Document b);
public void setFields(List<FieldDef> fields);
public void setCond(String name);
}

View File

@ -1,9 +1,12 @@
package eu.dnetlib.pace.condition;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import eu.dnetlib.pace.model.FieldDef;
import org.reflections.Reflections;
public class ConditionResolver implements Serializable {
@ -16,7 +19,7 @@ public class ConditionResolver implements Serializable {
.collect(Collectors.toMap(cl -> cl.getAnnotation(ConditionClass.class).value(), cl -> (Class<ConditionAlgo>)cl));
}
public ConditionAlgo resolve(String name) throws IllegalAccessException, InstantiationException {
return functionMap.get(name).newInstance();
public ConditionAlgo resolve(String name, List<FieldDef> fields) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
return functionMap.get(name).getDeclaredConstructor(String.class, List.class).newInstance(name, fields);
}
}

View File

@ -19,10 +19,6 @@ public class ExactMatch extends AbstractCondition {
super(cond, fields);
}
public ExactMatch(){
super();
}
@Override
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {

View File

@ -23,8 +23,6 @@ public class YearMatch extends AbstractCondition {
super(cond, fields);
}
public YearMatch(){}
// @Override
// public boolean verify(final Document a, final Document b) {
// boolean res = true;

View File

@ -13,9 +13,5 @@ public interface DistanceAlgo {
public abstract double distance(Field a, Field b);
public double getWeight();
public Map<String, Number> getParams();
public void setWeight(double w);
public void setParams(Map<String, Number> params);
}

View File

@ -17,8 +17,8 @@ public class DistanceResolver implements Serializable {
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
}
public DistanceAlgo resolve(String algo) throws IllegalAccessException, InstantiationException {
public DistanceAlgo resolve(String algo, Map<String, Number> params) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
return functionMap.get(algo).newInstance();
return functionMap.get(algo).getDeclaredConstructor(Map.class).newInstance(params);
}
}

View File

@ -28,23 +28,10 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
private Map<String, Number> params;
protected SecondStringDistanceAlgo(){
}
protected SecondStringDistanceAlgo(Map<String, Number> params){
this.params = params;
}
public void setWeight(double w){
this.weight = w;
}
public Map<String, Number> getParams(){
return this.params;
}
public void setParams(Map<String, Number> params){
protected SecondStringDistanceAlgo(Map<String, Number> params, final AbstractStringDistance ssalgo){
this.params = params;
this.weight = params.get("weight").doubleValue();
this.ssalgo = ssalgo;
}
/**

View File

@ -9,12 +9,8 @@ import java.util.Map;
@DistanceClass("AlwaysMatch")
public class AlwaysMatch extends SecondStringDistanceAlgo {
public AlwaysMatch(){
super();
}
public AlwaysMatch(final Map<String, Number> params){
super(params);
super(params, new com.wcohen.ss.JaroWinkler());
}
public AlwaysMatch(final double weight) {

View File

@ -9,12 +9,8 @@ import java.util.Map;
@DistanceClass("ExactMatch")
public class ExactMatch extends SecondStringDistanceAlgo {
public ExactMatch(){
super();
}
public ExactMatch(Map<String, Number> params){
super(params);
super(params, new com.wcohen.ss.JaroWinkler());
}
public ExactMatch(final double weight) {

View File

@ -4,18 +4,15 @@ import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.io.Serializable;
import java.util.Map;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
@DistanceClass("JaroWinkler")
public class JaroWinkler extends SecondStringDistanceAlgo {
public JaroWinkler(){
super();
}
public JaroWinkler(Map<String, Number> params){
super(params);
super(params, new com.wcohen.ss.JaroWinkler());
}
public JaroWinkler(double weight) {

View File

@ -10,12 +10,8 @@ import java.util.Map;
@DistanceClass("JaroWinklerTitle")
public class JaroWinklerTitle extends SecondStringDistanceAlgo {
public JaroWinklerTitle(){
super();
}
public JaroWinklerTitle(Map<String, Number> params){
super(params);
super(params, new com.wcohen.ss.JaroWinkler());
}
public JaroWinklerTitle(double weight) {

View File

@ -4,9 +4,15 @@ import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
@DistanceClass("Level2JaroWinkler")
public class Level2JaroWinkler extends SecondStringDistanceAlgo {
/**
 * Builds the algorithm from a parameter map, handing the map and a
 * com.wcohen.ss.Level2JaroWinkler delegate to the parent constructor.
 *
 * @param params configuration values forwarded unchanged to the parent
 */
public Level2JaroWinkler(Map<String, Number> params){
super(params, new com.wcohen.ss.Level2JaroWinkler());
}
/**
 * Builds the algorithm from an explicit weight, using a
 * com.wcohen.ss.Level2JaroWinkler delegate.
 *
 * @param w weight forwarded to the parent constructor
 */
public Level2JaroWinkler(double w) {
super(w, new com.wcohen.ss.Level2JaroWinkler());
}

View File

@ -4,9 +4,15 @@ import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
@DistanceClass("Level2JaroWinklerTitle")
public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo {
/**
 * Builds the algorithm from a parameter map, handing the map and a
 * com.wcohen.ss.Level2JaroWinkler delegate to the parent constructor.
 *
 * @param params configuration values forwarded unchanged to the parent
 */
public Level2JaroWinklerTitle(Map<String,Number> params){
super(params, new com.wcohen.ss.Level2JaroWinkler());
}
/**
 * Builds the algorithm from an explicit weight, using a
 * com.wcohen.ss.Level2JaroWinkler delegate.
 *
 * @param w weight forwarded to the parent constructor
 */
public Level2JaroWinklerTitle(final double w) {
super(w, new com.wcohen.ss.Level2JaroWinkler());
}

View File

@ -4,9 +4,15 @@ import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
@DistanceClass("Level2Levenstein")
public class Level2Levenstein extends SecondStringDistanceAlgo {
/**
 * Builds the algorithm from a parameter map, handing the map and a
 * com.wcohen.ss.Level2Levenstein delegate to the parent constructor.
 *
 * @param params configuration values forwarded unchanged to the parent
 */
public Level2Levenstein(Map<String,Number> params){
super(params, new com.wcohen.ss.Level2Levenstein());
}
/**
 * Builds the algorithm from an explicit weight, using a
 * com.wcohen.ss.Level2Levenstein delegate.
 *
 * @param w weight forwarded to the parent constructor
 */
public Level2Levenstein(double w) {
super(w, new com.wcohen.ss.Level2Levenstein());
}

View File

@ -4,11 +4,13 @@ import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
@DistanceClass("Levenstein")
public class Levenstein extends SecondStringDistanceAlgo {
public Levenstein(){
super(new com.wcohen.ss.Levenstein());
public Levenstein(Map<String,Number> params){
super(params, new com.wcohen.ss.Levenstein());
}
public Levenstein(double w) {

View File

@ -4,11 +4,13 @@ import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
@DistanceClass("LevensteinTitle")
public class LevensteinTitle extends SecondStringDistanceAlgo {
public LevensteinTitle(){
super(new com.wcohen.ss.Levenstein());
public LevensteinTitle(Map<String,Number> params){
super(params, new com.wcohen.ss.Levenstein());
}
public LevensteinTitle(final double w) {

View File

@ -4,9 +4,15 @@ import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
@DistanceClass("MustBeDifferent")
public class MustBeDifferent extends SecondStringDistanceAlgo {
/**
 * Builds the algorithm from a parameter map.
 *
 * <p>The underlying string distance is com.wcohen.ss.JaroWinkler, the same
 * delegate the weight-based constructor of this class uses, so an instance
 * behaves the same regardless of which constructor created it.
 *
 * @param params configuration values forwarded unchanged to the parent constructor
 */
public MustBeDifferent(Map<String,Number> params){
    super(params, new com.wcohen.ss.JaroWinkler());
}
/**
 * Builds the algorithm from an explicit weight, using a
 * com.wcohen.ss.JaroWinkler delegate.
 *
 * @param weight weight forwarded to the parent constructor
 */
public MustBeDifferent(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}

View File

@ -13,6 +13,9 @@ import java.util.Map;
@DistanceClass("Null")
public class NullDistanceAlgo implements DistanceAlgo {
/**
 * Accepts a parameter map so the class offers the same single-Map
 * constructor shape as the other distance algorithms; the map is not
 * used (the body is intentionally empty).
 *
 * @param params ignored
 */
public NullDistanceAlgo(Map<String, Number> params){
}
@Override
public double distance(Field a, Field b) {
return 0.0;
@ -23,16 +26,4 @@ public class NullDistanceAlgo implements DistanceAlgo {
return 0.0;
}
@Override
public void setWeight(double w){
}
@Override
public Map<String, Number> getParams() {
return null;
}
@Override
public void setParams(Map<String, Number> params) {
}
}

View File

@ -3,12 +3,18 @@ package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import java.util.Map;
/**
* The Class SortedJaroWinkler.
*/
@DistanceClass("SortedJaroWinkler")
public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
/**
 * Builds the algorithm from a parameter map, delegating to the parent
 * constructor.
 *
 * NOTE(review): the delegate passed here is com.wcohen.ss.Levenstein even
 * though the class is named SortedJaroWinkler; confirm against the other
 * constructor of this class whether that is intentional.
 *
 * @param params configuration values forwarded unchanged to the parent
 */
public SortedJaroWinkler(Map<String,Number> params){
super(params, new com.wcohen.ss.Levenstein());
}
/**
* Instantiates a new sorted jaro winkler.
*

View File

@ -3,6 +3,8 @@ package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import java.util.Map;
/**
* The Class SortedJaroWinkler.
*/
@ -19,6 +21,10 @@ public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
super(weight, new com.wcohen.ss.Level2JaroWinkler());
}
/**
 * Builds the algorithm from a parameter map, handing the map and a
 * com.wcohen.ss.Level2JaroWinkler delegate to the parent constructor.
 *
 * @param params configuration values forwarded unchanged to the parent
 */
public SortedLevel2JaroWinkler(final Map<String, Number> params){
super(params, new com.wcohen.ss.Level2JaroWinkler());
}
/**
* Instantiates a new sorted jaro winkler.
*

View File

@ -2,6 +2,7 @@ package eu.dnetlib.pace.distance.algo;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import com.google.common.collect.Lists;
import com.wcohen.ss.AbstractStringDistance;
@ -27,6 +28,10 @@ public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanc
super(weight, ssalgo);
}
/**
 * Builds the algorithm from a parameter map and a string-distance delegate.
 *
 * <p>Delegates to the map-based parent constructor rather than unpacking
 * the weight here: that constructor reads the "weight" entry itself and
 * also keeps the map, so the parameters are not lost for map-built
 * instances.
 *
 * @param params configuration values; must contain a "weight" entry
 * @param ssalgo the underlying string-distance implementation
 */
protected SortedSecondStringDistanceAlgo(final Map<String, Number> params, final AbstractStringDistance ssalgo){
    super(params, ssalgo);
}
/*
* (non-Javadoc)
*

View File

@ -20,10 +20,6 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
/** The limit. */
protected int limit;
public SubStringLevenstein() {
super(new com.wcohen.ss.Levenstein());
}
/**
* Instantiates a new sub string levenstein.
*
@ -34,6 +30,11 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
super(w, new com.wcohen.ss.Levenstein());
}
/**
 * Builds the algorithm from a parameter map, using a
 * com.wcohen.ss.Levenstein delegate.
 *
 * @param params configuration values; besides what the parent constructor
 *               reads, a "limit" entry is required here and is stored as
 *               an int in {@code limit}
 */
public SubStringLevenstein(Map<String, Number> params){
super(params, new com.wcohen.ss.Levenstein());
// A missing "limit" entry makes params.get(...) return null and this line throw.
this.limit = params.get("limit").intValue();
}
/**
* Instantiates a new sub string levenstein.
*
@ -95,9 +96,4 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
}
public void setParams(Map<String, Number> params){
this.limit = params.get("limit").intValue(); //necessary because this class needs also the limit
super.setParams(params);
}
}

View File

@ -13,8 +13,9 @@ public class UrlMatcher extends Levenstein {
private Map<String, Number> params;
public UrlMatcher(){
super();
public UrlMatcher(Map<String, Number> params){
super(params);
this.params = params;
}
public UrlMatcher(double weight, Map<String, Number> params) {

View File

@ -1,7 +1,9 @@
package eu.dnetlib.pace.distance.eval;
import com.google.gson.GsonBuilder;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
import java.io.Serializable;
/**
@ -51,12 +53,10 @@ public class ScoreResult implements Serializable {
/**
 * Renders this result as pretty-printed JSON.
 *
 * @return the JSON form of this object, or, if serialization fails, the
 *         exception followed by its stack trace frames
 */
@Override
public String toString() {
    try {
        return new ObjectMapper().writerWithDefaultPrettyPrinter().writeValueAsString(this);
    } catch (IOException e) {
        // Calling toString() on the StackTraceElement[] itself would only yield the
        // array's type-and-hash string; Arrays.toString prints the actual frames.
        return e + " " + java.util.Arrays.toString(e.getStackTrace());
    }
}
}

View File

@ -31,15 +31,11 @@ public class ClusteringDef implements Serializable {
public ClusteringFunction getClusteringFunction() {
try {
ClusteringFunction clusteringFunction = clusteringResolver.resolve(getName());
clusteringFunction.setParams(params);
return clusteringFunction;
return clusteringResolver.resolve(getName(), params);
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
e.printStackTrace();
return new RandomClusteringFunction(getParams());
}
}
public List<String> getFields() {

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.model;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
import com.google.gson.Gson;
@ -19,11 +20,8 @@ public class CondDef implements Serializable {
public ConditionAlgo getConditionAlgo(final List<FieldDef> fields) {
try {
ConditionAlgo conditionAlgo = conditionResolver.resolve(getName());
conditionAlgo.setFields(fields);
conditionAlgo.setCond(getName());
return conditionAlgo;
} catch (IllegalAccessException | InstantiationException e) {
return conditionResolver.resolve(getName(), fields);
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
e.printStackTrace();
return new AlwaysTrueCondition(getName(), fields);
}

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.model;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -75,13 +76,10 @@ public class FieldDef implements Serializable {
}
params.put("limit", getLimit());
params.put("weight", getWeight());
DistanceAlgo distanceAlgo = distanceResolver.resolve(getAlgo());
distanceAlgo.setParams(params);
distanceAlgo.setWeight(getWeight());
return distanceAlgo;
} catch (IllegalAccessException | InstantiationException e) {
return distanceResolver.resolve(getAlgo(), params);
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
e.printStackTrace();
return new NullDistanceAlgo();
return new NullDistanceAlgo(params);
}
}

View File

@ -0,0 +1,30 @@
package eu.dnetlib.pace.clustering;
import org.junit.Before;
import org.junit.Test;
import java.lang.reflect.InvocationTargetException;
import java.util.HashMap;
import java.util.Map;
import static org.junit.Assert.assertEquals;
/**
 * Checks that {@link ClusteringResolver} turns a clustering-function name
 * into an instance of the matching class.
 */
public class ClusteringResolverTest {

    private ClusteringResolver clusteringResolver;

    /** Empty parameter map handed to the resolved function's constructor. */
    private final Map<String, Integer> params = new HashMap<>();

    @Before
    public void setUp() {
        clusteringResolver = new ClusteringResolver();
    }

    /** Resolving "ngrams" must produce an {@link Ngrams} instance. */
    @Test
    public void testResolve() throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException {
        ClusteringFunction ngrams = clusteringResolver.resolve("ngrams", params);

        // JUnit's signature is assertEquals(expected, actual); keeping that order
        // makes a failure message report the right side as "expected".
        assertEquals(Ngrams.class, ngrams.getClass());
    }
}

View File

@ -0,0 +1,35 @@
package eu.dnetlib.pace.condition;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.clustering.ClusteringResolver;
import eu.dnetlib.pace.clustering.Ngrams;
import eu.dnetlib.pace.model.FieldDef;
import org.junit.Before;
import org.junit.Test;
import java.lang.reflect.InvocationTargetException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.junit.Assert.assertEquals;
/**
 * Checks that {@link ConditionResolver} turns a condition name into an
 * instance of the matching class.
 */
public class ConditionResolverTest {

    private ConditionResolver conditionResolver;

    /** Empty field list handed to the resolved condition's constructor (previously left null). */
    private final List<FieldDef> fields = new java.util.ArrayList<>();

    @Before
    public void setUp() {
        conditionResolver = new ConditionResolver();
    }

    /** Resolving "sizeMatch" must produce a {@link SizeMatch} instance. */
    @Test
    public void testResolve() throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException {
        ConditionAlgo sizeMatch = conditionResolver.resolve("sizeMatch", fields);

        // JUnit's signature is assertEquals(expected, actual); keeping that order
        // makes a failure message report the right side as "expected".
        assertEquals(SizeMatch.class, sizeMatch.getClass());
    }
}