Master branch updates from beta September 2023 #337

Manually merged
claudio.atzori merged 1271 commits from beta into master 2023-09-06 11:31:09 +02:00
16 changed files with 134 additions and 186 deletions
Showing only changes of commit ea36007d1f - Show all commits

View File

@ -1,24 +0,0 @@
package eu.dnetlib.pace.clustering;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.Map;
import java.util.stream.Collectors;
import org.reflections.Reflections;
/**
 * Looks up {@link ClusteringFunction} implementations by the name declared in their
 * {@link ClusteringClass} annotation and instantiates them reflectively.
 */
public class ClusteringResolver implements Serializable {

    /** Annotation value -> implementation class, built once at construction time. */
    private final Map<String, Class<ClusteringFunction>> functionMap;

    /**
     * Collects every type found under the "eu.dnetlib" prefix that carries
     * {@link ClusteringClass} and is assignable to {@link ClusteringFunction}.
     */
    @SuppressWarnings("unchecked") // safe: the filter keeps only ClusteringFunction subtypes
    public ClusteringResolver() {
        this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ClusteringClass.class).stream()
                .filter(ClusteringFunction.class::isAssignableFrom)
                .collect(Collectors.toMap(
                        cl -> cl.getAnnotation(ClusteringClass.class).value(),
                        cl -> (Class<ClusteringFunction>) cl));
    }

    /**
     * Instantiates the clustering function registered under the given name through its
     * {@code (Map)} constructor.
     *
     * @param clusteringFunction the name declared in the {@link ClusteringClass} annotation
     * @param params the parameters handed to the implementation's constructor
     * @return a new instance of the matching implementation
     * @throws IllegalArgumentException if no implementation is registered under that name
     * @throws NoSuchMethodException if the implementation has no {@code (Map)} constructor
     * @throws InstantiationException if the implementation cannot be instantiated
     * @throws IllegalAccessException if the constructor is not accessible
     * @throws InvocationTargetException if the constructor itself throws
     */
    public ClusteringFunction resolve(String clusteringFunction, Map<String, Integer> params) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
        final Class<ClusteringFunction> type = functionMap.get(clusteringFunction);
        if (type == null) {
            // Fail with a descriptive error instead of a bare NullPointerException.
            throw new IllegalArgumentException("Unknown clustering function: " + clusteringFunction);
        }
        return type.getDeclaredConstructor(Map.class).newInstance(params);
    }
}

View File

@ -1,25 +0,0 @@
package eu.dnetlib.pace.condition;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import eu.dnetlib.pace.model.FieldDef;
import org.reflections.Reflections;
/**
 * Looks up {@link ConditionAlgo} implementations by the name declared in their
 * {@link ConditionClass} annotation and instantiates them reflectively.
 */
public class ConditionResolver implements Serializable {

    /** Annotation value -> implementation class, built once at construction time. */
    private final Map<String, Class<ConditionAlgo>> functionMap;

    /**
     * Collects every type found under the "eu.dnetlib" prefix that carries
     * {@link ConditionClass} and is assignable to {@link ConditionAlgo}.
     */
    @SuppressWarnings("unchecked") // safe: the filter keeps only ConditionAlgo subtypes
    public ConditionResolver() {
        this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ConditionClass.class).stream()
                .filter(ConditionAlgo.class::isAssignableFrom)
                .collect(Collectors.toMap(
                        cl -> cl.getAnnotation(ConditionClass.class).value(),
                        cl -> (Class<ConditionAlgo>) cl));
    }

    /**
     * Instantiates the condition registered under the given name through its
     * {@code (String, List)} constructor.
     *
     * @param name the name declared in the {@link ConditionClass} annotation; also passed to the constructor
     * @param fields the field definitions handed to the implementation's constructor
     * @return a new instance of the matching implementation
     * @throws IllegalArgumentException if no implementation is registered under that name
     * @throws NoSuchMethodException if the implementation has no {@code (String, List)} constructor
     * @throws InstantiationException if the implementation cannot be instantiated
     * @throws IllegalAccessException if the constructor is not accessible
     * @throws InvocationTargetException if the constructor itself throws
     */
    public ConditionAlgo resolve(String name, List<FieldDef> fields) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
        final Class<ConditionAlgo> type = functionMap.get(name);
        if (type == null) {
            // Fail with a descriptive error instead of a bare NullPointerException.
            throw new IllegalArgumentException("Unknown condition: " + name);
        }
        return type.getDeclaredConstructor(String.class, List.class).newInstance(name, fields);
    }
}

View File

@ -7,12 +7,11 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import eu.dnetlib.pace.util.PaceException;
import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.model.ClusteringDef;
@ -35,7 +34,7 @@ public class DedupConfig implements Config, Serializable {
static {
defaults.put("threshold", "0");
defaults.put("run", "001");
defaults.put("dedupRun", "001");
defaults.put("entityType", "result");
defaults.put("orderField", "title");
defaults.put("queueMaxSize", "2000");
@ -49,11 +48,15 @@ public class DedupConfig implements Config, Serializable {
public static DedupConfig load(final String json) {
final DedupConfig config = new Gson().fromJson(json, DedupConfig.class);
final DedupConfig config;
try {
config = new ObjectMapper().readValue(json, DedupConfig.class);
config.getPace().initModel();
return config;
} catch (IOException e) {
throw new PaceException("Error in parsing configuration json", e);
}
}
public static DedupConfig loadDefault() throws IOException {

View File

@ -11,6 +11,7 @@ import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.CondDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.util.PaceResolver;
import org.apache.commons.collections.CollectionUtils;
public class PaceConfig implements Serializable {
@ -23,6 +24,8 @@ public class PaceConfig implements Serializable {
private Map<String, FieldDef> modelMap;
public static PaceResolver paceResolver;
public PaceConfig() {}
public void initModel() {
@ -30,6 +33,8 @@ public class PaceConfig implements Serializable {
for(FieldDef fd : getModel()) {
modelMap.put(fd.getName(), fd);
}
paceResolver = new PaceResolver();
}
public List<FieldDef> getModel() {

View File

@ -1,24 +0,0 @@
package eu.dnetlib.pace.distance;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.Map;
import java.util.stream.Collectors;
import org.reflections.Reflections;
/**
 * Looks up {@link DistanceAlgo} implementations by the name declared in their
 * {@link DistanceClass} annotation and instantiates them reflectively.
 */
public class DistanceResolver implements Serializable {

    /** Annotation value -> implementation class, built once at construction time. */
    private final Map<String, Class<DistanceAlgo>> functionMap;

    /**
     * Collects every type found under the "eu.dnetlib" prefix that carries
     * {@link DistanceClass} and is assignable to {@link DistanceAlgo}.
     */
    @SuppressWarnings("unchecked") // safe: the filter keeps only DistanceAlgo subtypes
    public DistanceResolver() {
        this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(DistanceClass.class).stream()
                .filter(DistanceAlgo.class::isAssignableFrom)
                .collect(Collectors.toMap(
                        cl -> cl.getAnnotation(DistanceClass.class).value(),
                        cl -> (Class<DistanceAlgo>) cl));
    }

    /**
     * Instantiates the distance algorithm registered under the given name through its
     * {@code (Map)} constructor.
     *
     * @param algo the name declared in the {@link DistanceClass} annotation
     * @param params the parameters handed to the implementation's constructor
     * @return a new instance of the matching implementation
     * @throws IllegalArgumentException if no implementation is registered under that name
     * @throws NoSuchMethodException if the implementation has no {@code (Map)} constructor
     * @throws InstantiationException if the implementation cannot be instantiated
     * @throws IllegalAccessException if the constructor is not accessible
     * @throws InvocationTargetException if the constructor itself throws
     */
    public DistanceAlgo resolve(String algo, Map<String, Number> params) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
        final Class<DistanceAlgo> type = functionMap.get(algo);
        if (type == null) {
            // Fail with a descriptive error instead of a bare NullPointerException.
            throw new IllegalArgumentException("Unknown distance algorithm: " + algo);
        }
        return type.getDeclaredConstructor(Map.class).newInstance(params);
    }
}

View File

@ -1,12 +1,14 @@
package eu.dnetlib.pace.model;
import java.io.IOException;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
import java.util.Map;
import com.google.gson.Gson;
import eu.dnetlib.pace.clustering.*;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
public class ClusteringDef implements Serializable {
@ -16,8 +18,6 @@ public class ClusteringDef implements Serializable {
private Map<String, Integer> params;
private ClusteringResolver clusteringResolver = new ClusteringResolver();
public ClusteringDef() {}
public String getName() {
@ -29,12 +29,11 @@ public class ClusteringDef implements Serializable {
}
public ClusteringFunction getClusteringFunction() {
try {
return clusteringResolver.resolve(getName(), params);
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
return PaceConfig.paceResolver.getClusteringFunction(getName(), params);
} catch (PaceException e) {
e.printStackTrace();
return new RandomClusteringFunction(getParams());
return null;
}
}
@ -56,7 +55,11 @@ public class ClusteringDef implements Serializable {
@Override
public String toString() {
return new Gson().toJson(this);
try {
return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) {
return e.getStackTrace().toString();
}
}
}

View File

@ -1,11 +1,12 @@
package eu.dnetlib.pace.model;
import java.io.IOException;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
import com.google.gson.Gson;
import eu.dnetlib.pace.condition.*;
import eu.dnetlib.pace.config.PaceConfig;
import org.codehaus.jackson.map.ObjectMapper;
public class CondDef implements Serializable {
@ -13,19 +14,10 @@ public class CondDef implements Serializable {
private List<String> fields;
private ConditionResolver conditionResolver = new ConditionResolver();
public CondDef() {}
public ConditionAlgo getConditionAlgo(final List<FieldDef> fields){
try {
return conditionResolver.resolve(getName(), fields);
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
e.printStackTrace();
return new AlwaysTrueCondition(getName(), fields);
}
return PaceConfig.paceResolver.getConditionAlgo(getName(), fields);
}
public String getName() {
@ -46,7 +38,11 @@ public class CondDef implements Serializable {
@Override
public String toString() {
return new Gson().toJson(this);
try {
return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) {
return e.getStackTrace().toString();
}
}
}

View File

@ -9,9 +9,11 @@ import java.util.Map;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.gson.Gson;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.distance.*;
import eu.dnetlib.pace.distance.algo.*;
import eu.dnetlib.pace.util.PaceException;
/**
* The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm.
@ -38,8 +40,6 @@ public class FieldDef implements Serializable {
private Map<String, Number> params;
private DistanceResolver distanceResolver = new DistanceResolver();
public FieldDef() {}
// def apply(s: String): Field[A]
@ -70,18 +70,12 @@ public class FieldDef implements Serializable {
public DistanceAlgo getDistanceAlgo() {
try {
if (params == null) {
params = new HashMap<>();
}
params.put("limit", getLimit());
params.put("weight", getWeight());
return distanceResolver.resolve(getAlgo(), params);
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
e.printStackTrace();
return new NullDistanceAlgo(params);
}
return PaceConfig.paceResolver.getDistanceAlgo(getAlgo(), params);
}
public boolean isIgnoreMissing() {

View File

@ -13,7 +13,6 @@ import com.google.common.collect.Maps;
import com.google.common.collect.Ordering;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import eu.dnetlib.pace.model.adaptor.PidOafSerialiser;
public class GTAuthor implements Comparable<GTAuthor> {

View File

@ -1,6 +1,8 @@
package eu.dnetlib.pace.model.gt;
import com.google.gson.Gson;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
public class ScoredResult extends Result {
@ -20,7 +22,11 @@ public class ScoredResult extends Result {
@Override
public String toString() {
return new Gson().toJson(this);
try {
return new ObjectMapper().writeValueAsString(this);
} catch (IOException e) {
return e.getStackTrace().toString();
}
}
}

View File

@ -0,0 +1,9 @@
package eu.dnetlib.pace.util;
/**
 * Unchecked exception that wraps an underlying failure together with a
 * descriptive message.
 */
public class PaceException extends RuntimeException {

    /**
     * Creates the exception from a message and the failure that triggered it.
     *
     * @param s the detail message
     * @param e the underlying cause, may be {@code null}
     */
    public PaceException(final String s, final Throwable e) {
        super(s, e);
    }
}

View File

@ -0,0 +1,63 @@
package eu.dnetlib.pace.util;
import eu.dnetlib.pace.clustering.ClusteringClass;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.condition.ConditionAlgo;
import eu.dnetlib.pace.condition.ConditionClass;
import eu.dnetlib.pace.distance.DistanceAlgo;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.FieldDef;
import org.reflections.Reflections;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Single registry for the three kinds of pluggable components: clustering functions,
 * condition algorithms and distance algorithms. Each kind is indexed by the name declared
 * in its annotation ({@link ClusteringClass}, {@link ConditionClass}, {@link DistanceClass})
 * and instantiated reflectively on request.
 */
public class PaceResolver implements Serializable {

    /** {@link ClusteringClass} value -> implementation class. */
    private final Map<String, Class<ClusteringFunction>> clusteringFunctions;

    /** {@link ConditionClass} value -> implementation class. */
    private final Map<String, Class<ConditionAlgo>> conditionAlgos;

    /** {@link DistanceClass} value -> implementation class. */
    private final Map<String, Class<DistanceAlgo>> distanceAlgos;

    /**
     * Builds the three registries from the annotated types found under the "eu.dnetlib" prefix.
     * A single {@link Reflections} instance is shared so the scan is performed once, not once per registry.
     */
    @SuppressWarnings("unchecked") // safe: each filter keeps only subtypes of the target interface
    public PaceResolver() {
        final Reflections reflections = new Reflections("eu.dnetlib");

        this.clusteringFunctions = reflections.getTypesAnnotatedWith(ClusteringClass.class).stream()
                .filter(ClusteringFunction.class::isAssignableFrom)
                .collect(Collectors.toMap(
                        cl -> cl.getAnnotation(ClusteringClass.class).value(),
                        cl -> (Class<ClusteringFunction>) cl));

        this.conditionAlgos = reflections.getTypesAnnotatedWith(ConditionClass.class).stream()
                .filter(ConditionAlgo.class::isAssignableFrom)
                .collect(Collectors.toMap(
                        cl -> cl.getAnnotation(ConditionClass.class).value(),
                        cl -> (Class<ConditionAlgo>) cl));

        this.distanceAlgos = reflections.getTypesAnnotatedWith(DistanceClass.class).stream()
                .filter(DistanceAlgo.class::isAssignableFrom)
                .collect(Collectors.toMap(
                        cl -> cl.getAnnotation(DistanceClass.class).value(),
                        cl -> (Class<DistanceAlgo>) cl));
    }

    /**
     * Instantiates the clustering function registered under {@code name} through its {@code (Map)} constructor.
     *
     * @param name the name declared in the {@link ClusteringClass} annotation
     * @param params the parameters handed to the implementation's constructor
     * @return a new instance of the matching implementation
     * @throws PaceException if the name is not registered or the implementation cannot be instantiated
     */
    public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
        final Class<ClusteringFunction> type = lookup(clusteringFunctions, "clustering function", name);
        try {
            return type.getDeclaredConstructor(Map.class).newInstance(params);
        } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
            throw new PaceException("unable to instantiate clustering function '" + name + "'", e);
        }
    }

    /**
     * Instantiates the distance algorithm registered under {@code name} through its {@code (Map)} constructor.
     *
     * @param name the name declared in the {@link DistanceClass} annotation
     * @param params the parameters handed to the implementation's constructor
     * @return a new instance of the matching implementation
     * @throws PaceException if the name is not registered or the implementation cannot be instantiated
     */
    public DistanceAlgo getDistanceAlgo(String name, Map<String, Number> params) throws PaceException {
        final Class<DistanceAlgo> type = lookup(distanceAlgos, "distance algorithm", name);
        try {
            return type.getDeclaredConstructor(Map.class).newInstance(params);
        } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
            throw new PaceException("unable to instantiate distance algorithm '" + name + "'", e);
        }
    }

    /**
     * Instantiates the condition registered under {@code name} through its {@code (String, List)} constructor.
     *
     * @param name the name declared in the {@link ConditionClass} annotation; also passed to the constructor
     * @param fields the field definitions handed to the implementation's constructor
     * @return a new instance of the matching implementation
     * @throws PaceException if the name is not registered or the implementation cannot be instantiated
     */
    public ConditionAlgo getConditionAlgo(String name, List<FieldDef> fields) throws PaceException {
        final Class<ConditionAlgo> type = lookup(conditionAlgos, "condition", name);
        try {
            return type.getDeclaredConstructor(String.class, List.class).newInstance(name, fields);
        } catch (InstantiationException | IllegalAccessException | InvocationTargetException | NoSuchMethodException e) {
            throw new PaceException("unable to instantiate condition '" + name + "'", e);
        }
    }

    /**
     * Fetches the class registered under {@code name}, turning a missing entry into a
     * {@link PaceException} rather than letting a NullPointerException escape later.
     */
    private static <T> Class<T> lookup(Map<String, Class<T>> registry, String kind, String name) {
        final Class<T> type = registry.get(name);
        if (type == null) {
            throw new PaceException(kind + " '" + name + "' not found", new IllegalArgumentException(name));
        }
        return type;
    }
}

View File

@ -1,30 +0,0 @@
package eu.dnetlib.pace.clustering;
import org.junit.Before;
import org.junit.Test;
import java.lang.reflect.InvocationTargetException;
import java.util.HashMap;
import java.util.Map;
import static org.junit.Assert.assertEquals;
/** Verifies that {@link ClusteringResolver} maps a registered name to its implementation class. */
public class ClusteringResolverTest {

    private ClusteringResolver clusteringResolver;

    /** Empty parameter map handed to the resolved function's constructor. */
    private final Map<String, Integer> params = new HashMap<>();

    @Before
    public void setUp() {
        clusteringResolver = new ClusteringResolver();
    }

    /** Resolving "ngrams" must produce an instance of {@link Ngrams}. */
    @Test
    public void testResolve() throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException {
        final ClusteringFunction ngrams = clusteringResolver.resolve("ngrams", params);

        // assertEquals takes (expected, actual); the original order gave misleading failure messages.
        assertEquals(Ngrams.class, ngrams.getClass());
    }
}

View File

@ -1,35 +0,0 @@
package eu.dnetlib.pace.condition;
import eu.dnetlib.pace.clustering.ClusteringFunction;
import eu.dnetlib.pace.clustering.ClusteringResolver;
import eu.dnetlib.pace.clustering.Ngrams;
import eu.dnetlib.pace.model.FieldDef;
import org.junit.Before;
import org.junit.Test;
import java.lang.reflect.InvocationTargetException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.junit.Assert.assertEquals;
/** Verifies that {@link ConditionResolver} maps a registered name to its implementation class. */
public class ConditionResolverTest {

    private ConditionResolver conditionResolver;

    // Never assigned: the resolver is exercised with a null field list, as in the original test.
    private List<FieldDef> fields;

    @Before
    public void setUp() {
        conditionResolver = new ConditionResolver();
    }

    /** Resolving "sizeMatch" must produce an instance of {@link SizeMatch}. */
    @Test
    public void testResolve() throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException {
        final ConditionAlgo sizeMatch = conditionResolver.resolve("sizeMatch", fields);

        // assertEquals takes (expected, actual); the original order gave misleading failure messages.
        assertEquals(SizeMatch.class, sizeMatch.getClass());
    }
}

View File

@ -76,8 +76,16 @@ public class ConfigTest extends AbstractPaceTest {
final DedupConfig cfgFromSerialization = DedupConfig.load(cfgFromClasspath.toString());
String params = "\"params\":{\"limit\":-1,\"weight\":0.0}";
//verify if the serialization produces the same result of the input json
// assertEquals(cfgFromSerialization.toString().replaceAll("[\n\t\r ]", "").replaceAll("\"params\":null", params), cfgFromClasspath.toString().replaceAll("[\n\t\r ]", ""));
assertEquals(cfgFromSerialization.toString().replaceAll("[\n\t\r ]", "").replaceAll("\"params\":null", params), cfgFromClasspath.toString().replaceAll("[\n\t\r ]", ""));
}
@Test
public void dedupConfigTest() {
DedupConfig load = DedupConfig.load(readFromClasspath("result.pace.conf.json"));
System.out.println(load.toString());
}
}

View File

@ -1,7 +1,7 @@
{
"wf" : {
"threshold" : "0.99",
"run" : "001",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",