Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
16 changed files with 134 additions and 186 deletions
Showing only changes of commit ea36007d1f - Show all commits

View File

@ -1,24 +0,0 @@
package eu.dnetlib.pace.clustering;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.Map;
import java.util.stream.Collectors;
import org.reflections.Reflections;
/**
 * Registry of the {@link ClusteringFunction} implementations available on the classpath.
 *
 * <p>Implementations are discovered once, at construction time, by scanning the
 * {@code eu.dnetlib} package for types annotated with {@link ClusteringClass}; the
 * annotation value is the name under which the implementation can later be resolved.
 */
public class ClusteringResolver implements Serializable {

    /** Annotation value -> implementation class. */
    private final Map<String, Class<ClusteringFunction>> functionMap;

    /**
     * Builds the registry by scanning {@code eu.dnetlib} for {@link ClusteringClass}-annotated
     * types that are assignable to {@link ClusteringFunction}.
     *
     * @throws IllegalStateException if two implementations declare the same annotation value
     *         (raised by {@code Collectors.toMap} on duplicate keys)
     */
    @SuppressWarnings("unchecked") // safe: the filter keeps only ClusteringFunction subtypes
    public ClusteringResolver() {
        this.functionMap = new Reflections("eu.dnetlib")
            .getTypesAnnotatedWith(ClusteringClass.class)
            .stream()
            .filter(ClusteringFunction.class::isAssignableFrom)
            .collect(Collectors.toMap(
                cl -> cl.getAnnotation(ClusteringClass.class).value(),
                cl -> (Class<ClusteringFunction>) cl));
    }

    /**
     * Instantiates the clustering function registered under the given name.
     *
     * @param clusteringFunction the name declared in the implementation's {@link ClusteringClass} annotation
     * @param params the parameters handed to the implementation's {@code (Map)} constructor
     * @return a new instance of the matching implementation
     * @throws IllegalArgumentException if no implementation is registered under {@code clusteringFunction}
     * @throws NoSuchMethodException if the implementation does not declare a {@code (Map)} constructor
     * @throws InstantiationException if the implementation class cannot be instantiated
     * @throws IllegalAccessException if the constructor is not accessible
     * @throws InvocationTargetException if the constructor itself throws
     */
    public ClusteringFunction resolve(String clusteringFunction, Map<String, Integer> params)
            throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
        final Class<ClusteringFunction> type = functionMap.get(clusteringFunction);
        if (type == null) {
            // Fail with a descriptive message instead of an anonymous NullPointerException.
            throw new IllegalArgumentException("Unknown clustering function: " + clusteringFunction);
        }
        return type.getDeclaredConstructor(Map.class).newInstance(params);
    }
}

View File

@ -1,25 +0,0 @@
package eu.dnetlib.pace.condition;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import eu.dnetlib.pace.model.FieldDef;
import org.reflections.Reflections;
/**
 * Registry of the {@link ConditionAlgo} implementations available on the classpath.
 *
 * <p>Implementations are discovered once, at construction time, by scanning the
 * {@code eu.dnetlib} package for types annotated with {@link ConditionClass}; the
 * annotation value is the name under which the implementation can later be resolved.
 */
public class ConditionResolver implements Serializable {

    /** Annotation value -> implementation class. */
    private final Map<String, Class<ConditionAlgo>> functionMap;

    /**
     * Builds the registry by scanning {@code eu.dnetlib} for {@link ConditionClass}-annotated
     * types that are assignable to {@link ConditionAlgo}.
     *
     * @throws IllegalStateException if two implementations declare the same annotation value
     *         (raised by {@code Collectors.toMap} on duplicate keys)
     */
    @SuppressWarnings("unchecked") // safe: the filter keeps only ConditionAlgo subtypes
    public ConditionResolver() {
        this.functionMap = new Reflections("eu.dnetlib")
            .getTypesAnnotatedWith(ConditionClass.class)
            .stream()
            .filter(ConditionAlgo.class::isAssignableFrom)
            .collect(Collectors.toMap(
                cl -> cl.getAnnotation(ConditionClass.class).value(),
                cl -> (Class<ConditionAlgo>) cl));
    }

    /**
     * Instantiates the condition registered under the given name.
     *
     * @param name the name declared in the implementation's {@link ConditionClass} annotation;
     *        it is also passed as first argument to the implementation's constructor
     * @param fields the field definitions handed to the implementation's {@code (String, List)} constructor
     * @return a new instance of the matching implementation
     * @throws IllegalArgumentException if no implementation is registered under {@code name}
     * @throws NoSuchMethodException if the implementation does not declare a {@code (String, List)} constructor
     * @throws InstantiationException if the implementation class cannot be instantiated
     * @throws IllegalAccessException if the constructor is not accessible
     * @throws InvocationTargetException if the constructor itself throws
     */
    public ConditionAlgo resolve(String name, List<FieldDef> fields)
            throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
        final Class<ConditionAlgo> type = functionMap.get(name);
        if (type == null) {
            // Fail with a descriptive message instead of an anonymous NullPointerException.
            throw new IllegalArgumentException("Unknown condition: " + name);
        }
        return type.getDeclaredConstructor(String.class, List.class).newInstance(name, fields);
    }
}

View File

@ -7,12 +7,11 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry; import java.util.Map.Entry;
import eu.dnetlib.pace.util.PaceException;
import org.antlr.stringtemplate.StringTemplate; import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import eu.dnetlib.pace.condition.ConditionAlgo; import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.ClusteringDef;
@ -35,7 +34,7 @@ public class DedupConfig implements Config, Serializable {
static { static {
defaults.put("threshold", "0"); defaults.put("threshold", "0");
defaults.put("run", "001"); defaults.put("dedupRun", "001");
defaults.put("entityType", "result"); defaults.put("entityType", "result");
defaults.put("orderField", "title"); defaults.put("orderField", "title");
defaults.put("queueMaxSize", "2000"); defaults.put("queueMaxSize", "2000");
@ -49,11 +48,15 @@ public class DedupConfig implements Config, Serializable {
public static DedupConfig load(final String json) { public static DedupConfig load(final String json) {
final DedupConfig config = new Gson().fromJson(json, DedupConfig.class); final DedupConfig config;
try {
config = new ObjectMapper().readValue(json, DedupConfig.class);
config.getPace().initModel(); config.getPace().initModel();
return config; return config;
} catch (IOException e) {
throw new PaceException("Error in parsing configuration json", e);
}
} }
public static DedupConfig loadDefault() throws IOException { public static DedupConfig loadDefault() throws IOException {

View File

@ -11,6 +11,7 @@ import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.model.ClusteringDef; import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.CondDef; import eu.dnetlib.pace.model.CondDef;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.util.PaceResolver;
import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.CollectionUtils;
public class PaceConfig implements Serializable { public class PaceConfig implements Serializable {
@ -23,6 +24,8 @@ public class PaceConfig implements Serializable {
private Map<String, FieldDef> modelMap; private Map<String, FieldDef> modelMap;
public static PaceResolver paceResolver;
public PaceConfig() {} public PaceConfig() {}
public void initModel() { public void initModel() {
@ -30,6 +33,8 @@ public class PaceConfig implements Serializable {
for(FieldDef fd : getModel()) { for(FieldDef fd : getModel()) {
modelMap.put(fd.getName(), fd); modelMap.put(fd.getName(), fd);
} }
paceResolver = new PaceResolver();
} }
public List<FieldDef> getModel() { public List<FieldDef> getModel() {

View File

@ -1,24 +0,0 @@
package eu.dnetlib.pace.distance;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.Map;
import java.util.stream.Collectors;
import org.reflections.Reflections;
/**
 * Registry of the {@link DistanceAlgo} implementations available on the classpath.
 *
 * <p>Implementations are discovered once, at construction time, by scanning the
 * {@code eu.dnetlib} package for types annotated with {@link DistanceClass}; the
 * annotation value is the name under which the implementation can later be resolved.
 */
public class DistanceResolver implements Serializable {

    /** Annotation value -> implementation class. */
    private final Map<String, Class<DistanceAlgo>> functionMap;

    /**
     * Builds the registry by scanning {@code eu.dnetlib} for {@link DistanceClass}-annotated
     * types that are assignable to {@link DistanceAlgo}.
     *
     * @throws IllegalStateException if two implementations declare the same annotation value
     *         (raised by {@code Collectors.toMap} on duplicate keys)
     */
    @SuppressWarnings("unchecked") // safe: the filter keeps only DistanceAlgo subtypes
    public DistanceResolver() {
        this.functionMap = new Reflections("eu.dnetlib")
            .getTypesAnnotatedWith(DistanceClass.class)
            .stream()
            .filter(DistanceAlgo.class::isAssignableFrom)
            .collect(Collectors.toMap(
                cl -> cl.getAnnotation(DistanceClass.class).value(),
                cl -> (Class<DistanceAlgo>) cl));
    }

    /**
     * Instantiates the distance algorithm registered under the given name.
     *
     * @param algo the name declared in the implementation's {@link DistanceClass} annotation
     * @param params the parameters handed to the implementation's {@code (Map)} constructor
     * @return a new instance of the matching implementation
     * @throws IllegalArgumentException if no implementation is registered under {@code algo}
     * @throws NoSuchMethodException if the implementation does not declare a {@code (Map)} constructor
     * @throws InstantiationException if the implementation class cannot be instantiated
     * @throws IllegalAccessException if the constructor is not accessible
     * @throws InvocationTargetException if the constructor itself throws
     */
    public DistanceAlgo resolve(String algo, Map<String, Number> params)
            throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
        final Class<DistanceAlgo> type = functionMap.get(algo);
        if (type == null) {
            // Fail with a descriptive message instead of an anonymous NullPointerException.
            throw new IllegalArgumentException("Unknown distance algorithm: " + algo);
        }
        return type.getDeclaredConstructor(Map.class).newInstance(params);
    }
}

View File

@ -1,12 +1,14 @@
package eu.dnetlib.pace.model; package eu.dnetlib.pace.model;
import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.google.gson.Gson;
import eu.dnetlib.pace.clustering.*; import eu.dnetlib.pace.clustering.*;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
public class ClusteringDef implements Serializable { public class ClusteringDef implements Serializable {
@ -16,8 +18,6 @@ public class ClusteringDef implements Serializable {
private Map<String, Integer> params; private Map<String, Integer> params;
private ClusteringResolver clusteringResolver = new ClusteringResolver();
public ClusteringDef() {} public ClusteringDef() {}
public String getName() { public String getName() {
@ -29,12 +29,11 @@ public class ClusteringDef implements Serializable {
} }
public ClusteringFunction getClusteringFunction() { public ClusteringFunction getClusteringFunction() {
try { try {
return clusteringResolver.resolve(getName(), params); return PaceConfig.paceResolver.getClusteringFunction(getName(), params);
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) { } catch (PaceException e) {
e.printStackTrace(); e.printStackTrace();
return new RandomClusteringFunction(getParams()); return null;
} }
} }
@ -56,7 +55,11 @@ public class ClusteringDef implements Serializable {
@Override @Override
public String toString() { public String toString() {
return new Gson().toJson(this); try {
return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) {
return e.getStackTrace().toString();
}
} }
} }

View File

@ -1,11 +1,12 @@
package eu.dnetlib.pace.model; package eu.dnetlib.pace.model;
import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.List; import java.util.List;
import com.google.gson.Gson;
import eu.dnetlib.pace.condition.*; import eu.dnetlib.pace.condition.*;
import eu.dnetlib.pace.config.PaceConfig;
import org.codehaus.jackson.map.ObjectMapper;
public class CondDef implements Serializable { public class CondDef implements Serializable {
@ -13,19 +14,10 @@ public class CondDef implements Serializable {
private List<String> fields; private List<String> fields;
private ConditionResolver conditionResolver = new ConditionResolver();
public CondDef() {} public CondDef() {}
public ConditionAlgo getConditionAlgo(final List<FieldDef> fields) { public ConditionAlgo getConditionAlgo(final List<FieldDef> fields){
return PaceConfig.paceResolver.getConditionAlgo(getName(), fields);
try {
return conditionResolver.resolve(getName(), fields);
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
e.printStackTrace();
return new AlwaysTrueCondition(getName(), fields);
}
} }
public String getName() { public String getName() {
@ -46,7 +38,11 @@ public class CondDef implements Serializable {
@Override @Override
public String toString() { public String toString() {
return new Gson().toJson(this); try {
return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) {
return e.getStackTrace().toString();
}
} }
} }

View File

@ -9,9 +9,11 @@ import java.util.Map;
import com.google.common.base.Splitter; import com.google.common.base.Splitter;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.gson.Gson; import com.google.gson.Gson;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.distance.*; import eu.dnetlib.pace.distance.*;
import eu.dnetlib.pace.distance.algo.*; import eu.dnetlib.pace.distance.algo.*;
import eu.dnetlib.pace.util.PaceException;
/** /**
* The schema is composed of field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm. * The schema is composed of field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm.
@ -38,8 +40,6 @@ public class FieldDef implements Serializable {
private Map<String, Number> params; private Map<String, Number> params;
private DistanceResolver distanceResolver = new DistanceResolver();
public FieldDef() {} public FieldDef() {}
// def apply(s: String): Field[A] // def apply(s: String): Field[A]
@ -70,18 +70,12 @@ public class FieldDef implements Serializable {
public DistanceAlgo getDistanceAlgo() { public DistanceAlgo getDistanceAlgo() {
try {
if (params == null) { if (params == null) {
params = new HashMap<>(); params = new HashMap<>();
} }
params.put("limit", getLimit()); params.put("limit", getLimit());
params.put("weight", getWeight()); params.put("weight", getWeight());
return distanceResolver.resolve(getAlgo(), params); return PaceConfig.paceResolver.getDistanceAlgo(getAlgo(), params);
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
e.printStackTrace();
return new NullDistanceAlgo(params);
}
} }
public boolean isIgnoreMissing() { public boolean isIgnoreMissing() {

View File

@ -13,7 +13,6 @@ import com.google.common.collect.Maps;
import com.google.common.collect.Ordering; import com.google.common.collect.Ordering;
import com.google.gson.Gson; import com.google.gson.Gson;
import com.google.gson.GsonBuilder; import com.google.gson.GsonBuilder;
import eu.dnetlib.pace.model.adaptor.PidOafSerialiser;
public class GTAuthor implements Comparable<GTAuthor> { public class GTAuthor implements Comparable<GTAuthor> {

View File

@ -1,6 +1,8 @@
package eu.dnetlib.pace.model.gt; package eu.dnetlib.pace.model.gt;
import com.google.gson.Gson; import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
public class ScoredResult extends Result { public class ScoredResult extends Result {
@ -20,7 +22,11 @@ public class ScoredResult extends Result {
@Override @Override
public String toString() { public String toString() {
return new Gson().toJson(this); try {
return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) {
return e.getStackTrace().toString();
}
} }
} }

View File

@ -0,0 +1,9 @@
package eu.dnetlib.pace.util;
/**
 * Unchecked exception carrying a description of the failure together with the
 * underlying error that triggered it.
 */
public class PaceException extends RuntimeException {

    /**
     * Creates the exception.
     *
     * @param message human-readable description of what went wrong
     * @param cause the underlying error, kept so the original stack trace is not lost
     */
    public PaceException(final String message, final Throwable cause) {
        super(message, cause);
    }
}

View File

@ -0,0 +1,63 @@
package eu.dnetlib.pace.util;
import eu.dnetlib.pace.clustering.ClusteringClass;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.condition.ConditionClass;
import eu.dnetlib.pace.distance.DistanceAlgo;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.FieldDef;
import org.reflections.Reflections;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Single registry for the three families of pluggable implementations: clustering
 * functions, condition algorithms and distance algorithms.
 *
 * <p>Each family is discovered at construction time by scanning the {@code eu.dnetlib}
 * package for types carrying the family's annotation ({@link ClusteringClass},
 * {@link ConditionClass}, {@link DistanceClass}); the annotation value is the name
 * under which an implementation can later be instantiated.
 */
public class PaceResolver implements Serializable {

    /** {@link ClusteringClass} value -> implementation class. */
    private final Map<String, Class<ClusteringFunction>> clusteringFunctions;

    /** {@link ConditionClass} value -> implementation class. */
    private final Map<String, Class<ConditionAlgo>> conditionAlgos;

    /** {@link DistanceClass} value -> implementation class. */
    private final Map<String, Class<DistanceAlgo>> distanceAlgos;

    /**
     * Scans the {@code eu.dnetlib} package and fills the three registries.
     *
     * @throws IllegalStateException if two implementations of the same family declare the same
     *         annotation value (raised by {@code Collectors.toMap} on duplicate keys)
     */
    @SuppressWarnings("unchecked") // safe: each filter keeps only subtypes of the target interface
    public PaceResolver() {
        // One scan result serves all three lookups; no need to scan the package three times.
        final Reflections reflections = new Reflections("eu.dnetlib");

        this.clusteringFunctions = reflections.getTypesAnnotatedWith(ClusteringClass.class).stream()
            .filter(ClusteringFunction.class::isAssignableFrom)
            .collect(Collectors.toMap(
                cl -> cl.getAnnotation(ClusteringClass.class).value(),
                cl -> (Class<ClusteringFunction>) cl));

        this.conditionAlgos = reflections.getTypesAnnotatedWith(ConditionClass.class).stream()
            .filter(ConditionAlgo.class::isAssignableFrom)
            .collect(Collectors.toMap(
                cl -> cl.getAnnotation(ConditionClass.class).value(),
                cl -> (Class<ConditionAlgo>) cl));

        this.distanceAlgos = reflections.getTypesAnnotatedWith(DistanceClass.class).stream()
            .filter(DistanceAlgo.class::isAssignableFrom)
            .collect(Collectors.toMap(
                cl -> cl.getAnnotation(DistanceClass.class).value(),
                cl -> (Class<DistanceAlgo>) cl));
    }

    /**
     * Instantiates the clustering function registered under {@code name}.
     *
     * @param name the value of the implementation's {@link ClusteringClass} annotation
     * @param params the parameters handed to the implementation's {@code (Map)} constructor
     * @return a new instance of the matching implementation
     * @throws PaceException if no implementation is registered under {@code name} or it cannot be instantiated
     */
    public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
        try {
            return lookup(clusteringFunctions, name).getDeclaredConstructor(Map.class).newInstance(params);
        } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
            throw new PaceException(name + " not found", e);
        }
    }

    /**
     * Instantiates the distance algorithm registered under {@code name}.
     *
     * @param name the value of the implementation's {@link DistanceClass} annotation
     * @param params the parameters handed to the implementation's {@code (Map)} constructor
     * @return a new instance of the matching implementation
     * @throws PaceException if no implementation is registered under {@code name} or it cannot be instantiated
     */
    public DistanceAlgo getDistanceAlgo(String name, Map<String, Number> params) throws PaceException {
        try {
            return lookup(distanceAlgos, name).getDeclaredConstructor(Map.class).newInstance(params);
        } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
            throw new PaceException(name + " not found", e);
        }
    }

    /**
     * Instantiates the condition algorithm registered under {@code name}.
     *
     * @param name the value of the implementation's {@link ConditionClass} annotation; it is also
     *        passed as first argument to the implementation's constructor
     * @param fields the field definitions handed to the implementation's {@code (String, List)} constructor
     * @return a new instance of the matching implementation
     * @throws PaceException if no implementation is registered under {@code name} or it cannot be instantiated
     */
    public ConditionAlgo getConditionAlgo(String name, List<FieldDef> fields) throws PaceException {
        try {
            return lookup(conditionAlgos, name).getDeclaredConstructor(String.class, List.class).newInstance(name, fields);
        } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
            throw new PaceException(name + " not found", e);
        }
    }

    /**
     * Returns the class registered under {@code name}, turning a missing entry into the
     * documented {@link PaceException} rather than a NullPointerException at the call site.
     */
    private static <T> Class<T> lookup(Map<String, Class<T>> registry, String name) {
        final Class<T> type = registry.get(name);
        if (type == null) {
            throw new PaceException(name + " not found",
                new IllegalArgumentException("no implementation registered under name '" + name + "'"));
        }
        return type;
    }
}

View File

@ -1,30 +0,0 @@
package eu.dnetlib.pace.clustering;
import org.junit.Before;
import org.junit.Test;
import java.lang.reflect.InvocationTargetException;
import java.util.HashMap;
import java.util.Map;
import static org.junit.Assert.assertEquals;
/**
 * Checks that {@link ClusteringResolver} maps a registered name to the expected implementation.
 */
public class ClusteringResolverTest {

    private ClusteringResolver clusteringResolver;

    /** Empty parameter map handed to the resolved function's constructor. */
    private final Map<String, Integer> params = new HashMap<>();

    @Before
    public void setUp() {
        clusteringResolver = new ClusteringResolver();
    }

    /** Resolving {@code "ngrams"} must yield an {@link Ngrams} instance. */
    @Test
    public void testResolve() throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException {
        final ClusteringFunction ngrams = clusteringResolver.resolve("ngrams", params);

        // JUnit's contract is assertEquals(expected, actual); the reversed order
        // produces a misleading "expected ... but was ..." message on failure.
        assertEquals(Ngrams.class, ngrams.getClass());
    }
}

View File

@ -1,35 +0,0 @@
package eu.dnetlib.pace.condition;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.clustering.ClusteringResolver;
import eu.dnetlib.pace.clustering.Ngrams;
import eu.dnetlib.pace.model.FieldDef;
import org.junit.Before;
import org.junit.Test;
import java.lang.reflect.InvocationTargetException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.junit.Assert.assertEquals;
/**
 * Checks that {@link ConditionResolver} maps a registered name to the expected implementation.
 */
public class ConditionResolverTest {

    private ConditionResolver conditionResolver;

    // Never assigned, so null is handed to the resolved condition's constructor.
    // NOTE(review): assumes SizeMatch's constructor tolerates a null field list - confirm.
    private List<FieldDef> fields;

    @Before
    public void setUp() {
        conditionResolver = new ConditionResolver();
    }

    /** Resolving {@code "sizeMatch"} must yield a {@link SizeMatch} instance. */
    @Test
    public void testResolve() throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException {
        final ConditionAlgo sizeMatch = conditionResolver.resolve("sizeMatch", fields);

        // JUnit's contract is assertEquals(expected, actual); the reversed order
        // produces a misleading "expected ... but was ..." message on failure.
        assertEquals(SizeMatch.class, sizeMatch.getClass());
    }
}

View File

@ -76,8 +76,16 @@ public class ConfigTest extends AbstractPaceTest {
final DedupConfig cfgFromSerialization = DedupConfig.load(cfgFromClasspath.toString()); final DedupConfig cfgFromSerialization = DedupConfig.load(cfgFromClasspath.toString());
String params = "\"params\":{\"limit\":-1,\"weight\":0.0}"; String params = "\"params\":{\"limit\":-1,\"weight\":0.0}";
//verify if the serialization produces the same result of the input json //verify if the serialization produces the same result of the input json
// assertEquals(cfgFromSerialization.toString().replaceAll("[\n\t\r ]", "").replaceAll("\"params\":null", params), cfgFromClasspath.toString().replaceAll("[\n\t\r ]", "")); assertEquals(cfgFromSerialization.toString().replaceAll("[\n\t\r ]", "").replaceAll("\"params\":null", params), cfgFromClasspath.toString().replaceAll("[\n\t\r ]", ""));
} }
@Test
public void dedupConfigTest() {
DedupConfig load = DedupConfig.load(readFromClasspath("result.pace.conf.json"));
System.out.println(load.toString());
}
} }

View File

@ -1,7 +1,7 @@
{ {
"wf" : { "wf" : {
"threshold" : "0.99", "threshold" : "0.99",
"run" : "001", "dedupRun" : "001",
"entityType" : "result", "entityType" : "result",
"orderField" : "title", "orderField" : "title",
"queueMaxSize" : "2000", "queueMaxSize" : "2000",